[Pkg-opencl-devel] [beignet] 01/01: imported dfsg-cleaned upstream version 0.9.3~dfsg
Andreas Beckmann
anbe at moszumanska.debian.org
Fri Oct 31 21:38:21 UTC 2014
This is an automated email from the git hooks/post-receive script.
anbe pushed a commit to tag upstream/0.9.3_dfsg
in repository beignet.
commit 02e7a3971ff6e85c15e2075e4b296858a2c18cde
Author: Andreas Beckmann <anbe at debian.org>
Date: Fri Oct 31 20:34:28 2014 +0100
imported dfsg-cleaned upstream version 0.9.3~dfsg
some files were removed due dfsg violations, for details see
https://bugs.debian.org/767387
http://lists.freedesktop.org/archives/beignet/2014-October/004343.html
undistributable - derived from the Len(n)a standard test image
(https://bugs.debian.org/758442)
kernels/compiler_box_blur_float_ref.bmp
kernels/compiler_box_blur_ref.bmp
kernels/lenna128x128.bmp
unclear license - derived from Shadertoy shaders (stated in
utests/compiler_shader_toy.cpp) these particular shaders are no longer
on Shadertoy, but the default license there is CC-BY-NC-SA
(https://www.shadertoy.com/terms)
kernels/compiler_chocolux.cl
kernels/compiler_chocolux_ref.bmp
kernels/compiler_clod.cl
kernels/compiler_clod_function_call.cl
kernels/compiler_clod_ref.bmp
kernels/compiler_julia.cl
kernels/compiler_julia_function_call.cl
kernels/compiler_julia_no_break.cl
kernels/compiler_julia_no_break_ref.bmp
kernels/compiler_julia_ref.bmp
kernels/compiler_menger_sponge.cl
kernels/compiler_menger_sponge_no_shadow.cl
kernels/compiler_menger_sponge_no_shadow_ref.bmp
kernels/compiler_menger_sponge_ref.bmp
kernels/compiler_nautilus.cl
kernels/compiler_nautilus_ref.bmp
kernels/compiler_ribbon.cl
kernels/compiler_ribbon_ref.bmp
---
.gitignore | 5 +
CMake/CMakeConfigTemplate.hpp | 28 +
CMake/FindLLVM.cmake | 107 +
CMake/FindMesaSrc.cmake | 26 +
CMake/FindOCLIcd.cmake | 24 +
CMakeLists.txt | 171 +
COPYING | 502 +
NEWS.mdwn | 1 +
README.md | 1 +
backend/CMakeLists.txt | 108 +
backend/kernels/compile.sh | 6 +
backend/src/.gitignore | 7 +
backend/src/CMakeLists.txt | 236 +
backend/src/GBEConfig.h.in | 7 +
backend/src/backend/context.cpp | 585 +
backend/src/backend/context.hpp | 149 +
backend/src/backend/gen/gen_mesa_disasm.c | 1302 ++
backend/src/backend/gen/gen_mesa_disasm.h | 45 +
backend/src/backend/gen75_context.cpp | 112 +
backend/src/backend/gen75_context.hpp | 62 +
backend/src/backend/gen75_encoder.cpp | 269 +
backend/src/backend/gen75_encoder.hpp | 60 +
backend/src/backend/gen_context.cpp | 1911 ++
backend/src/backend/gen_context.hpp | 224 +
backend/src/backend/gen_defs.hpp | 974 ++
backend/src/backend/gen_encoder.cpp | 1311 ++
backend/src/backend/gen_encoder.hpp | 241 +
backend/src/backend/gen_insn_compact.cpp | 523 +
.../src/backend/gen_insn_gen7_schedule_info.hxx | 42 +
backend/src/backend/gen_insn_scheduling.cpp | 722 +
backend/src/backend/gen_insn_scheduling.hpp | 42 +
backend/src/backend/gen_insn_selection.cpp | 4032 +++++
backend/src/backend/gen_insn_selection.hpp | 290 +
backend/src/backend/gen_insn_selection.hxx | 86 +
backend/src/backend/gen_program.cpp | 444 +
backend/src/backend/gen_program.h | 38 +
backend/src/backend/gen_program.hpp | 86 +
backend/src/backend/gen_reg_allocation.cpp | 1218 ++
backend/src/backend/gen_reg_allocation.hpp | 73 +
backend/src/backend/gen_register.hpp | 1060 ++
backend/src/backend/program.cpp | 1317 ++
backend/src/backend/program.h | 358 +
backend/src/backend/program.hpp | 320 +
backend/src/builtin_vector_proto.def | 295 +
backend/src/gbe_bin_generater.cpp | 437 +
backend/src/gbe_bin_interpreter.cpp | 80 +
backend/src/gen_as.sh | 101 +
backend/src/gen_builtin_vector.py | 384 +
backend/src/gen_convert.sh | 553 +
backend/src/genconfig.sh | 11 +
backend/src/ir/constant.cpp | 141 +
backend/src/ir/constant.hpp | 134 +
backend/src/ir/context.cpp | 182 +
backend/src/ir/context.hpp | 252 +
backend/src/ir/function.cpp | 359 +
backend/src/ir/function.hpp | 400 +
backend/src/ir/image.cpp | 278 +
backend/src/ir/image.hpp | 102 +
backend/src/ir/immediate.cpp | 263 +
backend/src/ir/immediate.hpp | 264 +
backend/src/ir/instruction.cpp | 1684 ++
backend/src/ir/instruction.hpp | 687 +
backend/src/ir/instruction.hxx | 95 +
backend/src/ir/liveness.cpp | 240 +
backend/src/ir/liveness.hpp | 148 +
backend/src/ir/lowering.cpp | 396 +
backend/src/ir/lowering.hpp | 94 +
backend/src/ir/printf.cpp | 222 +
backend/src/ir/printf.hpp | 244 +
backend/src/ir/profile.cpp | 106 +
backend/src/ir/profile.hpp | 86 +
backend/src/ir/register.cpp | 67 +
backend/src/ir/register.hpp | 170 +
backend/src/ir/sampler.cpp | 139 +
backend/src/ir/sampler.hpp | 94 +
backend/src/ir/type.cpp | 51 +
backend/src/ir/type.hpp | 97 +
backend/src/ir/unit.cpp | 61 +
backend/src/ir/unit.hpp | 92 +
backend/src/ir/value.cpp | 607 +
backend/src/ir/value.hpp | 266 +
backend/src/llvm/llvm_barrier_nodup.cpp | 115 +
backend/src/llvm/llvm_gen_backend.cpp | 3628 ++++
backend/src/llvm/llvm_gen_backend.hpp | 105 +
backend/src/llvm/llvm_gen_ocl_function.hxx | 196 +
backend/src/llvm/llvm_intrinsic_lowering.cpp | 170 +
backend/src/llvm/llvm_loadstore_optimization.cpp | 272 +
backend/src/llvm/llvm_passes.cpp | 399 +
backend/src/llvm/llvm_printf_parser.cpp | 851 +
backend/src/llvm/llvm_scalarize.cpp | 878 +
backend/src/llvm/llvm_to_gen.cpp | 252 +
backend/src/llvm/llvm_to_gen.hpp | 40 +
backend/src/ocl_as.h | 3086 ++++
backend/src/ocl_barrier.ll | 39 +
backend/src/ocl_common_defines.h | 126 +
backend/src/ocl_convert.h | 17415 +++++++++++++++++++
backend/src/ocl_memcpy.ll | 336 +
backend/src/ocl_memset.ll | 127 +
backend/src/ocl_stdlib.tmpl.h | 5160 ++++++
backend/src/sys/alloc.cpp | 359 +
backend/src/sys/alloc.hpp | 342 +
backend/src/sys/assert.cpp | 81 +
backend/src/sys/assert.hpp | 35 +
backend/src/sys/atomic.hpp | 56 +
backend/src/sys/cvar.cpp | 65 +
backend/src/sys/cvar.hpp | 80 +
backend/src/sys/exception.hpp | 56 +
backend/src/sys/fixed_array.hpp | 84 +
backend/src/sys/hash_map.hpp | 82 +
backend/src/sys/intrinsics.hpp | 209 +
backend/src/sys/intrusive_list.cpp | 66 +
backend/src/sys/intrusive_list.hpp | 176 +
backend/src/sys/list.hpp | 65 +
backend/src/sys/map.hpp | 75 +
backend/src/sys/mutex.cpp | 48 +
backend/src/sys/mutex.hpp | 74 +
backend/src/sys/platform.cpp | 79 +
backend/src/sys/platform.hpp | 441 +
backend/src/sys/set.hpp | 70 +
backend/src/sys/vector.hpp | 79 +
backend/src/update.sh | 3 +
backend/src/update_as.sh | 11 +
backend/src/update_blob_ocl_header.py | 65 +
backend/src/update_convert.sh | 12 +
benchmark/CMakeLists.txt | 21 +
benchmark/benchmark_run.cpp | 117 +
benchmark/enqueue_copy_buf.cpp | 69 +
docs/Beignet.mdwn | 230 +
docs/Beignet/Backend.mdwn | 96 +
docs/Beignet/Backend/TODO.mdwn | 110 +
docs/Beignet/Backend/compiler_backend.mdwn | 118 +
docs/Beignet/Backend/gen_ir.mdwn | 254 +
docs/Beignet/Backend/mixed_buffer_pointer.mdwn | 46 +
docs/Beignet/Backend/unstructured_branches.mdwn | 271 +
docs/NEWS.mdwn | 16 +
docs/howto/cross-compiler-howto.mdwn | 60 +
docs/optimization-guide.mdwn | 28 +
include/CL/cl.h | 1214 ++
include/CL/cl.hpp | 12452 +++++++++++++
include/CL/cl_d3d10.h | 126 +
include/CL/cl_d3d11.h | 126 +
include/CL/cl_dx9_media_sharing.h | 127 +
include/CL/cl_egl.h | 133 +
include/CL/cl_ext.h | 316 +
include/CL/cl_gl.h | 162 +
include/CL/cl_gl_ext.h | 69 +
include/CL/cl_intel.h | 141 +
include/CL/cl_platform.h | 1278 ++
include/CL/opencl.h | 54 +
include/CMakeLists.txt | 5 +
intel-beignet.icd.in | 1 +
kernels/buildin_work_dim.cl | 3 +
kernels/builtin_acos_asin.cl | 10 +
kernels/builtin_atan2.cl | 4 +
kernels/builtin_bitselect.cl | 4 +
kernels/builtin_convert_sat.cl | 48 +
kernels/builtin_exp.cl | 10 +
kernels/builtin_frexp.cl | 4 +
kernels/builtin_global_id.cl | 4 +
kernels/builtin_global_size.cl | 3 +
kernels/builtin_lgamma.cl | 4 +
kernels/builtin_lgamma_r.cl | 4 +
kernels/builtin_local_id.cl | 6 +
kernels/builtin_local_size.cl | 3 +
kernels/builtin_mad_sat.cl | 4 +
kernels/builtin_modf.cl | 6 +
kernels/builtin_nextafter.cl | 4 +
kernels/builtin_num_groups.cl | 3 +
kernels/builtin_pow.cl | 7 +
kernels/builtin_remquo.cl | 6 +
kernels/builtin_shuffle.cl | 8 +
kernels/builtin_shuffle2.cl | 13 +
kernels/builtin_sign.cl | 4 +
kernels/builtin_sinpi.cl | 4 +
kernels/builtin_tgamma.cl | 4 +
kernels/compare_image_2d_and_1d_array.cl | 13 +
kernels/compiler_abs.cl | 28 +
kernels/compiler_abs_diff.cl | 30 +
kernels/compiler_address_space.cl | 9 +
kernels/compiler_argument_structure.cl | 9 +
kernels/compiler_argument_structure_indirect.cl | 9 +
kernels/compiler_arith_shift_right.cl | 4 +
kernels/compiler_array.cl | 14 +
kernels/compiler_array0.cl | 16 +
kernels/compiler_array1.cl | 15 +
kernels/compiler_array2.cl | 13 +
kernels/compiler_array3.cl | 14 +
kernels/compiler_async_copy.cl | 24 +
kernels/compiler_async_copy_and_prefetch.cl | 9 +
kernels/compiler_async_stride_copy.cl | 16 +
kernels/compiler_atomic_functions.cl | 50 +
kernels/compiler_basic_arithmetic.cl | 53 +
kernels/compiler_bool_cross_basic_block.cl | 21 +
kernels/compiler_box_blur.cl | 80 +
kernels/compiler_box_blur_float.cl | 48 +
kernels/compiler_box_blur_image.cl | 18 +
kernels/compiler_byte_scatter.cl | 7 +
kernels/compiler_ceil.cl | 4 +
kernels/compiler_clz_int.cl | 5 +
kernels/compiler_clz_short.cl | 5 +
kernels/compiler_constant_expr.cl | 23 +
kernels/compiler_convert_uchar_sat.cl | 4 +
kernels/compiler_data_types.cl | 80 +
kernels/compiler_degrees.cl | 4 +
kernels/compiler_displacement_map_element.cl | 11 +
kernels/compiler_double.cl | 9 +
kernels/compiler_double_2.cl | 9 +
kernels/compiler_double_3.cl | 6 +
kernels/compiler_double_4.cl | 5 +
kernels/compiler_event.cl | 6 +
kernels/compiler_fabs.cl | 5 +
kernels/compiler_function_argument.cl | 7 +
kernels/compiler_function_argument0.cl | 7 +
kernels/compiler_function_argument1.cl | 7 +
kernels/compiler_function_argument2.cl | 12 +
kernels/compiler_function_argument3.cl | 71 +
kernels/compiler_function_constant.cl | 6 +
kernels/compiler_function_constant0.cl | 6 +
kernels/compiler_function_qualifiers.cl | 9 +
kernels/compiler_gather_register_file.cl | 10 +
kernels/compiler_gather_register_file0.cl | 10 +
kernels/compiler_gather_register_file1.cl | 11 +
kernels/compiler_geometric_builtin.cl | 11 +
kernels/compiler_getelementptr_bitcast.cl | 18 +
kernels/compiler_global_constant.cl | 76 +
kernels/compiler_global_constant_2.cl | 20 +
kernels/compiler_global_memory_barrier.cl | 7 +
kernels/compiler_group_size.cl | 29 +
kernels/compiler_hadd.cl | 4 +
kernels/compiler_if_else.cl | 14 +
kernels/compiler_insert_to_constant.cl | 6 +
kernels/compiler_insert_vector.cl | 11 +
kernels/compiler_insn_selection_masked_min_max.cl | 11 +
kernels/compiler_insn_selection_max.cl | 7 +
kernels/compiler_insn_selection_min.cl | 7 +
kernels/compiler_integer_builtin.cl | 23 +
kernels/compiler_integer_division.cl | 6 +
kernels/compiler_integer_remainder.cl | 6 +
kernels/compiler_load_bool_imm.cl | 12 +
kernels/compiler_local_memory_barrier.cl | 6 +
kernels/compiler_local_memory_barrier_2.cl | 7 +
kernels/compiler_local_memory_barrier_wg64.cl | 6 +
kernels/compiler_local_memory_two_ptr.cl | 10 +
kernels/compiler_local_slm.cl | 24 +
kernels/compiler_long.cl | 8 +
kernels/compiler_long_2.cl | 20 +
kernels/compiler_long_asr.cl | 7 +
kernels/compiler_long_cmp.cl | 29 +
kernels/compiler_long_convert.cl | 19 +
kernels/compiler_long_mult.cl | 7 +
kernels/compiler_long_shl.cl | 7 +
kernels/compiler_long_shr.cl | 7 +
kernels/compiler_lower_return0.cl | 8 +
kernels/compiler_lower_return1.cl | 8 +
kernels/compiler_lower_return2.cl | 11 +
kernels/compiler_mad24.cl | 4 +
kernels/compiler_mad_hi.cl | 4 +
kernels/compiler_mandelbrot.cl | 47 +
kernels/compiler_mandelbrot_alternate.cl | 38 +
kernels/compiler_mandelbrot_alternate_ref.bmp | Bin 0 -> 196662 bytes
kernels/compiler_mandelbrot_ref.bmp | Bin 0 -> 196662 bytes
kernels/compiler_math.cl | 40 +
kernels/compiler_math_2op.cl | 19 +
kernels/compiler_math_3op.cl | 9 +
kernels/compiler_math_builtin.cl | 82 +
kernels/compiler_math_constants.cl | 23 +
kernels/compiler_mem_fence.cl | 10 +
kernels/compiler_mixed_pointer.cl | 23 +
kernels/compiler_mul24.cl | 4 +
kernels/compiler_mul_hi.cl | 4 +
kernels/compiler_multiple_kernels.cl | 7 +
kernels/compiler_obread.cl | 8 +
kernels/compiler_obwrite.cl | 8 +
kernels/compiler_preprocessor_macros.cl | 13 +
kernels/compiler_private_data_overflow.cl | 10 +
kernels/compiler_radians.cl | 4 +
kernels/compiler_region.cl | 10 +
kernels/compiler_region0.cl | 11 +
kernels/compiler_region1.cl | 9 +
kernels/compiler_relational_builtin.cl | 24 +
kernels/compiler_rhadd.cl | 4 +
kernels/compiler_rotate.cl | 5 +
kernels/compiler_sampler.cl | 25 +
kernels/compiler_saturate.cl | 16 +
kernels/compiler_saturate_sub.cl | 16 +
kernels/compiler_shift_right.cl | 4 +
kernels/compiler_short_scatter.cl | 7 +
kernels/compiler_simd_all.cl | 12 +
kernels/compiler_simd_any.cl | 15 +
kernels/compiler_smoothstep.cl | 4 +
kernels/compiler_step.cl | 38 +
kernels/compiler_structure_attributes.cl | 17 +
kernels/compiler_switch.cl | 14 +
kernels/compiler_type_casting.cl | 19 +
kernels/compiler_uint16_copy.cl | 8 +
kernels/compiler_uint2_copy.cl | 7 +
kernels/compiler_uint3_copy.cl | 7 +
kernels/compiler_uint3_unaligned_copy.cl | 8 +
kernels/compiler_uint8_copy.cl | 7 +
kernels/compiler_unstructured_branch0.cl | 14 +
kernels/compiler_unstructured_branch1.cl | 14 +
kernels/compiler_unstructured_branch2.cl | 18 +
kernels/compiler_unstructured_branch3.cl | 16 +
kernels/compiler_upsample_int.cl | 4 +
kernels/compiler_upsample_long.cl | 4 +
kernels/compiler_vect_compare.cl | 7 +
kernels/compiler_vector_inc.cl | 13 +
kernels/compiler_vector_load_store.cl | 40 +
kernels/compiler_volatile.cl | 4 +
kernels/compiler_vote_all.cl | 10 +
kernels/compiler_vote_any.cl | 10 +
kernels/compiler_workitem_builtin.cl | 12 +
kernels/compiler_write_only_bytes.cl | 6 +
kernels/compiler_write_only_shorts.cl | 6 +
kernels/double_precision_check.cl | 11 +
kernels/empty.cl | 1 +
kernels/image_1D_buffer.cl | 13 +
kernels/include/runtime_compile_link_inc.h | 4 +
kernels/my_test.cl | 26 +
kernels/null_kernel_arg.cl | 9 +
kernels/runtime_compile_link.h | 1 +
kernels/runtime_compile_link_a.cl | 13 +
kernels/runtime_compile_link_b.cl | 9 +
kernels/test_cl_finish.cl | 12 +
kernels/test_copy_buffer.cl | 6 +
kernels/test_copy_buffer_row.cl | 8 +
kernels/test_copy_image.cl | 10 +
kernels/test_copy_image1.cl | 33 +
kernels/test_copy_image_1d.cl | 9 +
kernels/test_copy_image_3d.cl | 28 +
kernels/test_fill_gl_image.cl | 11 +
kernels/test_fill_image.cl | 13 +
kernels/test_fill_image0.cl | 9 +
kernels/test_fill_image_1d.cl | 8 +
kernels/test_fill_image_3d.cl | 14 +
kernels/test_fill_image_3d_2.cl | 10 +
kernels/test_get_arg_info.cl | 8 +
kernels/test_get_image_info.cl | 13 +
kernels/test_get_image_info_array.cl | 25 +
kernels/test_movforphi_undef.cl | 18 +
kernels/test_printf.cl | 38 +
kernels/test_write_only.cl | 6 +
setup_fulsim_hsw.sh | 5 +
setup_fulsim_ivb.sh | 5 +
setup_perfsim_ivb.sh | 4 +
src/.gitignore | 2 +
src/CMakeLists.txt | 126 +
src/OCLConfig.h.in | 6 +
src/cl_alloc.c | 88 +
src/cl_alloc.h | 47 +
src/cl_api.c | 3341 ++++
src/cl_command_queue.c | 622 +
src/cl_command_queue.h | 109 +
src/cl_command_queue_gen7.c | 394 +
src/cl_context.c | 372 +
src/cl_context.h | 166 +
src/cl_device_data.h | 194 +
src/cl_device_id.c | 617 +
src/cl_device_id.h | 145 +
src/cl_driver.cpp | 40 +
src/cl_driver.h | 383 +
src/cl_driver_defs.c | 95 +
src/cl_driver_type.h | 24 +
src/cl_enqueue.c | 472 +
src/cl_enqueue.h | 73 +
src/cl_event.c | 650 +
src/cl_event.h | 106 +
src/cl_extensions.c | 107 +
src/cl_extensions.h | 99 +
src/cl_gbe_loader.cpp | 328 +
src/cl_gbe_loader.h | 80 +
src/cl_gen75_device.h | 30 +
src/cl_gen7_device.h | 29 +
src/cl_gl_api.c | 153 +
src/cl_gt_device.h | 124 +
src/cl_image.c | 229 +
src/cl_image.h | 44 +
src/cl_internals.h | 36 +
src/cl_kernel.c | 431 +
src/cl_kernel.h | 116 +
src/cl_khr_icd.c | 174 +
src/cl_khr_icd.h | 34 +
src/cl_mem.c | 1903 ++
src/cl_mem.h | 290 +
src/cl_mem_gl.c | 97 +
src/cl_mem_gl.h | 17 +
src/cl_platform_id.c | 112 +
src/cl_platform_id.h | 72 +
src/cl_program.c | 851 +
src/cl_program.h | 136 +
src/cl_sampler.c | 142 +
src/cl_sampler.h | 57 +
src/cl_thread.c | 265 +
src/cl_thread.h | 47 +
src/cl_utils.h | 316 +
src/intel/intel_batchbuffer.c | 191 +
src/intel/intel_batchbuffer.h | 152 +
src/intel/intel_defines.h | 339 +
src/intel/intel_dri_resource_sharing.c | 208 +
src/intel/intel_dri_resource_sharing.h | 39 +
src/intel/intel_dri_resource_sharing_int.h | 143 +
src/intel/intel_driver.c | 744 +
src/intel/intel_driver.h | 125 +
src/intel/intel_gpgpu.c | 1513 ++
src/intel/intel_gpgpu.h | 34 +
src/intel/intel_structs.h | 461 +
src/kernels/cl_internal_copy_buf_align16.cl | 12 +
src/kernels/cl_internal_copy_buf_align4.cl | 8 +
src/kernels/cl_internal_copy_buf_rect.cl | 15 +
.../cl_internal_copy_buf_unalign_dst_offset.cl | 28 +
.../cl_internal_copy_buf_unalign_same_offset.cl | 19 +
.../cl_internal_copy_buf_unalign_src_offset.cl | 29 +
src/kernels/cl_internal_copy_buffer_to_image_2d.cl | 18 +
src/kernels/cl_internal_copy_buffer_to_image_3d.cl | 19 +
src/kernels/cl_internal_copy_image_1d_to_1d.cl | 19 +
src/kernels/cl_internal_copy_image_2d_to_2d.cl | 21 +
src/kernels/cl_internal_copy_image_2d_to_3d.cl | 22 +
src/kernels/cl_internal_copy_image_2d_to_buffer.cl | 19 +
src/kernels/cl_internal_copy_image_3d_to_2d.cl | 22 +
src/kernels/cl_internal_copy_image_3d_to_3d.cl | 23 +
src/kernels/cl_internal_copy_image_3d_to_buffer.cl | 22 +
src/kernels/cl_internal_fill_buf_align128.cl | 9 +
src/kernels/cl_internal_fill_buf_align2.cl | 8 +
src/kernels/cl_internal_fill_buf_align4.cl | 8 +
src/kernels/cl_internal_fill_buf_align8.cl | 14 +
src/kernels/cl_internal_fill_buf_unalign.cl | 8 +
src/kernels/cl_internal_fill_image_1d.cl | 14 +
src/kernels/cl_internal_fill_image_1d_array.cl | 15 +
src/kernels/cl_internal_fill_image_2d.cl | 15 +
src/kernels/cl_internal_fill_image_2d_array.cl | 16 +
src/kernels/cl_internal_fill_image_3d.cl | 16 +
src/performance.c | 324 +
src/performance.h | 12 +
src/x11/dricommon.c | 330 +
src/x11/dricommon.h | 99 +
src/x11/mesa_egl_extension.c | 307 +
src/x11/mesa_egl_extension.h | 20 +
src/x11/mesa_egl_res_share.c | 135 +
src/x11/mesa_egl_res_share.h | 44 +
src/x11/va_dri2.c | 327 +
src/x11/va_dri2.h | 89 +
src/x11/va_dri2str.h | 211 +
src/x11/va_dri2tokens.h | 66 +
utests/.gitignore | 15 +
utests/CMakeLists.txt | 241 +
utests/buildin_work_dim.cpp | 37 +
utests/builtin_acos_asin.cpp | 87 +
utests/builtin_atan2.cpp | 43 +
utests/builtin_bitselect.cpp | 50 +
utests/builtin_convert_sat.cpp | 80 +
utests/builtin_exp.cpp | 102 +
utests/builtin_frexp.cpp | 50 +
utests/builtin_global_id.cpp | 77 +
utests/builtin_global_size.cpp | 108 +
utests/builtin_kernel_max_global_size.cpp | 30 +
utests/builtin_lgamma.cpp | 40 +
utests/builtin_lgamma_r.cpp | 46 +
utests/builtin_local_id.cpp | 81 +
utests/builtin_local_size.cpp | 88 +
utests/builtin_mad_sat.cpp | 44 +
utests/builtin_modf.cpp | 56 +
utests/builtin_nextafter.cpp | 60 +
utests/builtin_num_groups.cpp | 85 +
utests/builtin_pow.cpp | 92 +
utests/builtin_remquo.cpp | 65 +
utests/builtin_shuffle.cpp | 45 +
utests/builtin_shuffle2.cpp | 45 +
utests/builtin_sign.cpp | 47 +
utests/builtin_sinpi.cpp | 104 +
utests/builtin_tgamma.cpp | 42 +
utests/cl_create_kernel.cpp | 16 +
utests/compare_image_2d_and_1d_array.cpp | 79 +
utests/compiler_abs.cpp | 254 +
utests/compiler_abs_diff.cpp | 295 +
utests/compiler_address_space.cpp | 10 +
utests/compiler_argument_structure.cpp | 28 +
utests/compiler_argument_structure_indirect.cpp | 29 +
utests/compiler_arith_shift_right.cpp | 43 +
utests/compiler_array.cpp | 28 +
utests/compiler_array0.cpp | 54 +
utests/compiler_array1.cpp | 52 +
utests/compiler_array2.cpp | 50 +
utests/compiler_array3.cpp | 51 +
utests/compiler_async_copy.cpp | 55 +
utests/compiler_async_copy_and_prefetch.cpp | 10 +
utests/compiler_async_stride_copy.cpp | 45 +
utests/compiler_atomic_functions.cpp | 97 +
utests/compiler_basic_arithmetic.cpp | 115 +
utests/compiler_bool_cross_basic_block.cpp | 55 +
utests/compiler_box_blur.cpp | 43 +
utests/compiler_box_blur_float.cpp | 65 +
utests/compiler_box_blur_image.cpp | 52 +
utests/compiler_byte_scatter.cpp | 24 +
utests/compiler_ceil.cpp | 43 +
utests/compiler_cl_finish.cpp | 50 +
utests/compiler_clz_int.cpp | 31 +
utests/compiler_clz_short.cpp | 31 +
utests/compiler_constant_expr.cpp | 35 +
utests/compiler_convert_uchar_sat.cpp | 44 +
utests/compiler_copy_buffer.cpp | 32 +
utests/compiler_copy_buffer_row.cpp | 40 +
utests/compiler_copy_image.cpp | 58 +
utests/compiler_copy_image1.cpp | 83 +
utests/compiler_copy_image_1d.cpp | 52 +
utests/compiler_copy_image_3d.cpp | 77 +
utests/compiler_data_types.cpp | 9 +
utests/compiler_degrees.cpp | 32 +
utests/compiler_displacement_map_element.cpp | 64 +
utests/compiler_double.cpp | 46 +
utests/compiler_double_2.cpp | 47 +
utests/compiler_double_3.cpp | 46 +
utests/compiler_double_4.cpp | 40 +
utests/compiler_double_precision.cpp | 43 +
utests/compiler_fabs.cpp | 44 +
utests/compiler_fill_gl_image.cpp | 76 +
utests/compiler_fill_image.cpp | 44 +
utests/compiler_fill_image0.cpp | 42 +
utests/compiler_fill_image_1d.cpp | 50 +
utests/compiler_fill_image_3d.cpp | 50 +
utests/compiler_fill_image_3d_2.cpp | 48 +
utests/compiler_function_argument.cpp | 27 +
utests/compiler_function_argument0.cpp | 26 +
utests/compiler_function_argument1.cpp | 31 +
utests/compiler_function_argument2.cpp | 57 +
utests/compiler_function_argument3.cpp | 45 +
utests/compiler_function_constant.cpp | 34 +
utests/compiler_function_constant0.cpp | 40 +
utests/compiler_function_constant1.cpp | 47 +
utests/compiler_function_qualifiers.cpp | 20 +
utests/compiler_geometric_builtin.cpp | 9 +
utests/compiler_get_image_info.cpp | 50 +
utests/compiler_get_image_info_array.cpp | 64 +
utests/compiler_getelementptr_bitcast.cpp | 45 +
utests/compiler_global_constant.cpp | 104 +
utests/compiler_global_constant_2.cpp | 59 +
utests/compiler_global_memory_barrier.cpp | 28 +
utests/compiler_group_size.cpp | 141 +
utests/compiler_hadd.cpp | 40 +
utests/compiler_if_else.cpp | 64 +
utests/compiler_insert_to_constant.cpp | 30 +
utests/compiler_insert_vector.cpp | 18 +
utests/compiler_insn_selection_masked_min_max.cpp | 42 +
utests/compiler_insn_selection_max.cpp | 37 +
utests/compiler_insn_selection_min.cpp | 36 +
utests/compiler_integer_builtin.cpp | 9 +
utests/compiler_integer_division.cpp | 44 +
utests/compiler_integer_remainder.cpp | 44 +
utests/compiler_load_bool_imm.cpp | 29 +
utests/compiler_local_memory_barrier.cpp | 46 +
utests/compiler_local_memory_barrier_2.cpp | 29 +
utests/compiler_local_memory_barrier_wg64.cpp | 46 +
utests/compiler_local_memory_two_ptr.cpp | 50 +
utests/compiler_local_slm.cpp | 33 +
utests/compiler_long.cpp | 60 +
utests/compiler_long_2.cpp | 51 +
utests/compiler_long_asr.cpp | 41 +
utests/compiler_long_cmp.cpp | 122 +
utests/compiler_long_convert.cpp | 158 +
utests/compiler_long_mult.cpp | 49 +
utests/compiler_long_shl.cpp | 41 +
utests/compiler_long_shr.cpp | 41 +
utests/compiler_lower_return0.cpp | 54 +
utests/compiler_lower_return1.cpp | 47 +
utests/compiler_lower_return2.cpp | 48 +
utests/compiler_mad24.cpp | 41 +
utests/compiler_mad_hi.cpp | 46 +
utests/compiler_mandelbrot.cpp | 48 +
utests/compiler_mandelbrot_alternate.cpp | 54 +
utests/compiler_math.cpp | 89 +
utests/compiler_math_2op.cpp | 80 +
utests/compiler_math_3op.cpp | 64 +
utests/compiler_math_builtin.cpp | 9 +
utests/compiler_math_constants.cpp | 9 +
utests/compiler_mem_fence.cpp | 9 +
utests/compiler_mixed_pointer.cpp | 119 +
utests/compiler_movforphi_undef.cpp | 61 +
utests/compiler_mul24.cpp | 36 +
utests/compiler_mul_hi.cpp | 40 +
utests/compiler_multiple_kernels.cpp | 8 +
utests/compiler_preprocessor_macros.cpp | 9 +
utests/compiler_private_data_overflow.cpp | 15 +
utests/compiler_program_objects.cpp | 64 +
utests/compiler_radians.cpp | 32 +
utests/compiler_relational_builtin.cpp | 9 +
utests/compiler_rhadd.cpp | 41 +
utests/compiler_rotate.cpp | 40 +
utests/compiler_sampler.cpp | 41 +
utests/compiler_saturate.cpp | 114 +
utests/compiler_saturate_sub.cpp | 114 +
utests/compiler_shader_toy.cpp | 87 +
utests/compiler_shift_right.cpp | 45 +
utests/compiler_short_scatter.cpp | 25 +
utests/compiler_simd_all.cpp | 43 +
utests/compiler_simd_any.cpp | 43 +
utests/compiler_smoothstep.cpp | 58 +
utests/compiler_step.cpp | 342 +
utests/compiler_structure_attributes.cpp | 9 +
utests/compiler_switch.cpp | 48 +
utests/compiler_type_casting.cpp | 10 +
utests/compiler_uint16_copy.cpp | 35 +
utests/compiler_uint2_copy.cpp | 31 +
utests/compiler_uint3_copy.cpp | 40 +
utests/compiler_uint3_unaligned_copy.cpp | 42 +
utests/compiler_uint8_copy.cpp | 35 +
utests/compiler_unstructured_branch0.cpp | 55 +
utests/compiler_unstructured_branch1.cpp | 54 +
utests/compiler_unstructured_branch2.cpp | 68 +
utests/compiler_unstructured_branch3.cpp | 58 +
utests/compiler_upsample_int.cpp | 37 +
utests/compiler_upsample_long.cpp | 38 +
utests/compiler_vect_compare.cpp | 44 +
utests/compiler_vector_inc.cpp | 46 +
utests/compiler_vector_load_store.cpp | 63 +
utests/compiler_volatile.cpp | 9 +
utests/compiler_workitem_builtin.cpp | 9 +
utests/compiler_write_only.cpp | 43 +
utests/compiler_write_only_bytes.cpp | 23 +
utests/compiler_write_only_shorts.cpp | 24 +
utests/enqueue_built_in_kernels.cpp | 19 +
utests/enqueue_copy_buf.cpp | 66 +
utests/enqueue_copy_buf_unaligned.cpp | 118 +
utests/enqueue_fill_buf.cpp | 90 +
utests/get_arg_info.cpp | 85 +
utests/get_cl_info.cpp | 641 +
utests/image_1D_buffer.cpp | 80 +
utests/load_program_from_bin_file.cpp | 77 +
utests/load_program_from_gen_bin.cpp | 93 +
utests/my_test.cpp | 99 +
utests/new_data.txt | 256 +
utests/profiling_exec.cpp | 102 +
utests/runtime_barrier_list.cpp | 75 +
utests/runtime_compile_link.cpp | 162 +
utests/runtime_createcontext.cpp | 14 +
utests/runtime_event.cpp | 60 +
utests/runtime_flat_address_space.cpp | 75 +
utests/runtime_marker_list.cpp | 75 +
utests/runtime_null_kernel_arg.cpp | 27 +
utests/setenv.sh.in | 7 +
utests/sub_buffer.cpp | 135 +
utests/test_printf.cpp | 18 +
utests/utest.cpp | 183 +
utests/utest.hpp | 139 +
utests/utest_assert.cpp | 41 +
utests/utest_assert.hpp | 44 +
utests/utest_error.c | 76 +
utests/utest_error.h | 26 +
utests/utest_exception.hpp | 48 +
utests/utest_file_map.cpp | 117 +
utests/utest_file_map.hpp | 84 +
utests/utest_generator.py | 387 +
utests/utest_helper.cpp | 674 +
utests/utest_helper.hpp | 234 +
utests/utest_math_gen.py | 577 +
utests/utest_run.cpp | 118 +
654 files changed, 120515 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..90fd161
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+*.o
+CMakeCache.txt
+CMakeFiles/
+Makefile
+cmake_install.cmake
diff --git a/CMake/CMakeConfigTemplate.hpp b/CMake/CMakeConfigTemplate.hpp
new file mode 100644
index 0000000..7702c54
--- /dev/null
+++ b/CMake/CMakeConfigTemplate.hpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef CMAKE_CONFIG_HPP
+#define CMAKE_CONFIG_HPP
+
+#define ON true
+#define OFF false
+#define GEN_INSTALLATION_PATH "${CMAKE_INSTALL_PREFIX}/lib/i965/"
+
+#endif /* CMAKE_CONFIG_HPP */
+
diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
new file mode 100644
index 0000000..556b3a9
--- /dev/null
+++ b/CMake/FindLLVM.cmake
@@ -0,0 +1,107 @@
+# Find the native LLVM includes and library
+#
+# LLVM_INCLUDE_DIR - where to find llvm include files
+# LLVM_LIBRARY_DIR - where to find llvm libs
+# LLVM_CFLAGS - llvm compiler flags
+# LLVM_LFLAGS - llvm linker flags
+# LLVM_MODULE_LIBS - list of llvm libs for working with modules.
+# LLVM_FOUND - True if llvm found.
+if (LLVM_INSTALL_DIR)
+ find_program(LLVM_CONFIG_EXECUTABLE
+ NAMES llvm-config-33 llvm-config-3.3 llvm-config-35 llvm-config-3.5 llvm-config-34 llvm-config-3.4 llvm-config
+ DOC "llvm-config executable"
+ PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
+else (LLVM_INSTALL_DIR)
+ find_program(LLVM_CONFIG_EXECUTABLE
+ NAMES llvm-config-33 llvm-config-3.3 llvm-config-35 llvm-config-3.5 llvm-config-34 llvm-config-3.4 llvm-config
+ DOC "llvm-config executable")
+endif (LLVM_INSTALL_DIR)
+
+if (LLVM_CONFIG_EXECUTABLE)
+ message(STATUS "LLVM llvm-config found at: ${LLVM_CONFIG_EXECUTABLE}")
+else (LLVM_CONFIG_EXECUTABLE)
+ message(FATAL_ERROR "Could NOT find LLVM executable, please add -DLLVM_INSTALL_DIR=/path/to/llvm-config/ in cmake command")
+endif (LLVM_CONFIG_EXECUTABLE)
+
+if (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
+ SET(LLVM_FIND_VERSION_NODOT "${LLVM_FIND_VERSION_MAJOR}${LLVM_FIND_VERSION_MINOR}")
+ execute_process(
+ COMMAND ${LLVM_CONFIG_EXECUTABLE} --version
+ OUTPUT_VARIABLE LLVM_VERSION
+ )
+ string(REGEX REPLACE "([0-9])\\.([0-9]*).*" "\\1\\2 " LLVM_VERSION_NODOT ${LLVM_VERSION})
+ if (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
+ message(FATAL_ERROR "imcompatible LLVM version ${LLVM_VERSION} required ${LLVM_FIND_VERSION}")
+ else (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
+ if (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_FIND_VERSION_NODOT)
+ message(STATUS "find stable LLVM version ${LLVM_VERSION}")
+ else (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_FIND_VERSION_NODOT)
+ message(STATUS "find unstable LLVM version ${LLVM_VERSION}")
+ endif (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_FIND_VERSION_NODOT)
+ add_definitions("-DLLVM_${LLVM_VERSION_NODOT}")
+ endif (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
+endif (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
+
+execute_process(
+ COMMAND ${LLVM_CONFIG_EXECUTABLE} --includedir
+ OUTPUT_VARIABLE LLVM_INCLUDE_DIR
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+execute_process(
+ COMMAND ${LLVM_CONFIG_EXECUTABLE} --libdir
+ OUTPUT_VARIABLE LLVM_LIBRARY_DIR
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+execute_process(
+ COMMAND ${LLVM_CONFIG_EXECUTABLE} --cppflags
+ OUTPUT_VARIABLE LLVM_CFLAGS
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+execute_process(
+ COMMAND ${LLVM_CONFIG_EXECUTABLE} --ldflags
+ OUTPUT_VARIABLE LLVM_LFLAGS
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+execute_process(
+ COMMAND ${LLVM_CONFIG_EXECUTABLE} --libs
+ OUTPUT_VARIABLE LLVM_MODULE_LIBS
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+if (LLVM_VERSION_NODOT VERSION_GREATER 34)
+execute_process(
+ COMMAND ${LLVM_CONFIG_EXECUTABLE} --system-libs
+ OUTPUT_VARIABLE LLVM_SYSTEM_LIBS_ORIG
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+string(REGEX REPLACE " *\n" "" LLVM_SYSTEM_LIBS ${LLVM_SYSTEM_LIBS_ORIG})
+endif (LLVM_VERSION_NODOT VERSION_GREATER 34)
+
+macro(add_one_lib name)
+ FIND_LIBRARY(CLANG_LIB
+ NAMES ${name}
+ PATHS ${LLVM_LIBRARY_DIR} NO_DEFAULT_PATH)
+ set(CLANG_LIBRARIES ${CLANG_LIBRARIES} ${CLANG_LIB})
+ unset(CLANG_LIB CACHE)
+endmacro()
+
+#Assume clang lib path same as llvm lib path
+add_one_lib("clangFrontend")
+add_one_lib("clangSerialization")
+add_one_lib("clangDriver")
+add_one_lib("clangCodeGen")
+add_one_lib("clangSema")
+add_one_lib("clangStaticAnalyzerFrontend")
+add_one_lib("clangStaticAnalyzerCheckers")
+add_one_lib("clangStaticAnalyzerCore")
+add_one_lib("clangAnalysis")
+add_one_lib("clangEdit")
+add_one_lib("clangAST")
+add_one_lib("clangParse")
+add_one_lib("clangSema")
+add_one_lib("clangLex")
+add_one_lib("clangBasic")
diff --git a/CMake/FindMesaSrc.cmake b/CMake/FindMesaSrc.cmake
new file mode 100644
index 0000000..978cb4e
--- /dev/null
+++ b/CMake/FindMesaSrc.cmake
@@ -0,0 +1,26 @@
+#
+# Try to find mesa source code
+# Once done this will define
+#
+# MESA_SOURCE_FOUND
+# MESA_SOURCE_INCLUDES
+#
+
+# Find mesa source code.
+FIND_PATH(MESA_SOURCE_PREFIX src/mesa/main/texobj.c
+ $ENV{MESA_SOURCE_DIR}
+ ${MAKE_CURRENT_SOURCE_DIR}/../mesa
+ ~/mesa
+ DOC "The mesa source directory which is needed for cl_khr_gl_sharing.")
+
+IF(MESA_SOURCE_PREFIX)
+SET(MESA_SOURCE_INCLUDES ${MESA_SOURCE_PREFIX}/src/mesa
+ ${MESA_SOURCE_PREFIX}/include
+ ${MESA_SOURCE_PREFIX}/src/mapi
+ ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i965/
+ ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i915/
+ ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/common/)
+SET(MESA_SOURCE_FOUND 1 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
+ELSE(MESA_SOURCE_PREFIX)
+SET(MESA_SOURCE_FOUND 0 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
+ENDIF(MESA_SOURCE_PREFIX)
diff --git a/CMake/FindOCLIcd.cmake b/CMake/FindOCLIcd.cmake
new file mode 100644
index 0000000..b0a8ad7
--- /dev/null
+++ b/CMake/FindOCLIcd.cmake
@@ -0,0 +1,24 @@
+#
+# Try to find ocl_icd library and include path.
+# Once done this will define
+#
+# OCLIcd_FOUND
+# OCLIcd_INCLUDE_PATH
+#
+
+FIND_PATH(OCLIcd_INCLUDE_PATH ocl_icd.h
+ ~/include/
+ /usr/include/
+ /usr/local/include/
+ /sw/include/
+ /opt/local/include/
+ DOC "The directory where ocl_icd.h resides")
+
+IF(OCLIcd_INCLUDE_PATH)
+ INCLUDE_DIRECTORIES(${OCLIcd_INCLUDE_PATH})
+ SET(OCLIcd_FOUND 1 CACHE STRING "Set to 1 if OCLIcd is found, 0 otherwise")
+ELSE(OCLIcd_INCLUDE_PATH)
+ SET(OCLIcd_FOUND 0 CACHE STRING "Set to 1 if OCLIcd is found, 0 otherwise")
+ENDIF(OCLIcd_INCLUDE_PATH)
+
+MARK_AS_ADVANCED(OCLIcd_FOUND)
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..ac59859
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,171 @@
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
+PROJECT(OCL)
+set (LIBCL_DRIVER_VERSION_MAJOR 0)
+set (LIBCL_DRIVER_VERSION_MINOR 9)
+set (LIBCL_DRIVER_VERSION_PATCH 3)
+set (LIBCL_C_VERSION_MAJOR 1)
+set (LIBCL_C_VERSION_MINOR 2)
+
+configure_file (
+ "src/OCLConfig.h.in"
+ "src/OCLConfig.h"
+)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+
+INCLUDE (FindPkgConfig)
+
+SET(CMAKE_VERBOSE_MAKEFILE "false")
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMake/")
+if (NOT LIB_INSTALL_DIR)
+ set (LIB_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/lib")
+endif (NOT LIB_INSTALL_DIR)
+if (NOT BEIGNET_INSTALL_DIR)
+ set (BEIGNET_INSTALL_DIR "${LIB_INSTALL_DIR}/beignet/")
+endif (NOT BEIGNET_INSTALL_DIR)
+SET(EMULATE_IVB false CACHE BOOL "To emulate IVB")
+SET(EMULATE_SNB false CACHE BOOL "To emulate SNB")
+SET(EMULATE_HSW false CACHE BOOL "To emulate HSW")
+ADD_DEFINITIONS(-D__$(USER)__)
+
+# Force Release with debug info
+if (NOT CMAKE_BUILD_TYPE)
+ set (CMAKE_BUILD_TYPE RelWithDebInfo)
+endif (NOT CMAKE_BUILD_TYPE)
+set (CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "assure config" FORCE)
+message(STATUS "Building mode: " ${CMAKE_BUILD_TYPE})
+
+SET(CMAKE_CXX_FLAGS_DEBUGO0 "-O0 -g")
+SET(CMAKE_C_FLAGS_DEBUGO0 "-O0 -g")
+
+IF (EMULATE_HSW)
+ SET (USE_FULSIM "true")
+ ADD_DEFINITIONS(-DEMULATE_GEN=75)
+ELSEIF (EMULATE_IVB)
+ SET (USE_FULSIM "true")
+ ADD_DEFINITIONS(-DEMULATE_GEN=7)
+ELSEIF (EMULATE_SNB)
+ SET (USE_FULSIM "true")
+ ADD_DEFINITIONS(-DEMULATE_GEN=6)
+ELSE (EMULATE_IVB)
+ SET (USE_FULSIM "false")
+ ADD_DEFINITIONS(-DEMULATE_GEN=0)
+ENDIF (EMULATE_HSW)
+
+# XXX now hard coded to enable the clamp to border workaround for IVB.
+ADD_DEFINITIONS(-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
+
+IF (USE_FULSIM)
+ ADD_DEFINITIONS(-DUSE_FULSIM=1)
+ELSE (USE_FULSIM)
+ ADD_DEFINITIONS(-DUSE_FULSIM=0)
+ENDIF (USE_FULSIM)
+
+SET(CMAKE_CXX_FLAGS "-Wall -Wno-invalid-offsetof -mfpmath=sse -fno-rtti -Wcast-align -std=c++0x -msse2 -msse3 -mssse3 -msse4.1 ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-Wall -mfpmath=sse -msse2 -Wcast-align -msse2 -msse3 -mssse3 -msse4.1 ${CMAKE_C_FLAGS}")
+
+# Front end stuff we need
+#INCLUDE(CMake/FindLLVM.cmake)
+Find_Package(LLVM 3.3)
+
+# XLib
+Find_Package(X11)
+IF(X11_FOUND)
+ MESSAGE(STATUS "Looking for XLib - found")
+ELSE(X11_FOUND)
+ MESSAGE(STATUS "Looking for XLib - not found")
+ENDIF(X11_FOUND)
+
+# DRM
+pkg_check_modules(DRM REQUIRED libdrm)
+IF(DRM_FOUND)
+ MESSAGE(STATUS "Looking for DRM - found at ${DRM_PREFIX}")
+ INCLUDE_DIRECTORIES(${DRM_INCLUDE_DIRS})
+ELSE(DRM_FOUND)
+ MESSAGE(STATUS "Looking for DRM - not found")
+ENDIF(DRM_FOUND)
+
+# DRM Intel
+pkg_check_modules(DRM_INTEL REQUIRED libdrm_intel)
+IF(DRM_INTEL_FOUND)
+ INCLUDE_DIRECTORIES(${DRM_INTEL_INCLUDE_DIRS})
+ MESSAGE(STATUS "Looking for DRM Intel - found at ${DRM_INTEL_PREFIX}")
+ELSE(DRM_INTEL_FOUND)
+ MESSAGE(STATUS "Looking for DRM Intel - not found")
+ENDIF(DRM_INTEL_FOUND)
+
+# Threads
+Find_Package(Threads)
+
+IF(X11_FOUND)
+# OpenGL (not use cmake helper)
+pkg_check_modules(OPENGL gl)
+IF(OPENGL_FOUND)
+ INCLUDE_DIRECTORIES(${OPENGL_INCLUDE_DIRS})
+ MESSAGE(STATUS "Looking for OpenGL - found at ${OPENGL_PREFIX}")
+ELSE(OPENGL_FOUND)
+ MESSAGE(STATUS "Looking for OpenGL - not found")
+ENDIF(OPENGL_FOUND)
+
+# Xext
+pkg_check_modules(XEXT REQUIRED xext)
+IF(XEXT_FOUND)
+ INCLUDE_DIRECTORIES(${XEXT_INCLUDE_DIRS})
+ MESSAGE(STATUS "Looking for Xext - found at ${XEXT_PREFIX}")
+ELSE(XEXT_FOUND)
+ MESSAGE(STATUS "Looking for Xext - not found")
+ENDIF(XEXT_FOUND)
+
+# Xfixes
+pkg_check_modules(XFIXES REQUIRED xfixes)
+IF(XFIXES_FOUND)
+ INCLUDE_DIRECTORIES(${XFIXES_INCLUDE_DIRS})
+ MESSAGE(STATUS "Looking for Xfixes - found at ${XFIXES_PREFIX}")
+ELSE(XFIXES_FOUND)
+ MESSAGE(STATUS "Looking for Xfixes - not found")
+ENDIF(XFIXES_FOUND)
+ENDIF(X11_FOUND)
+
+pkg_check_modules(EGL egl)
+IF(EGL_FOUND)
+ MESSAGE(STATUS "Looking for EGL - found at ${EGL_PREFIX}")
+ELSE(EGL_FOUND)
+ MESSAGE(STATUS "Looking for EGL - not found")
+ENDIF(EGL_FOUND)
+
+# cl_khr_gl_sharing requires to build with mesa source
+Find_Package(MesaSrc)
+IF(MESA_SOURCE_FOUND)
+ MESSAGE(STATUS "Looking for mesa source code - found at ${MESA_SOURCE_PREFIX}")
+ELSE(MESA_SOURCE_FOUND)
+ MESSAGE(STATUS "Looking for mesa source code - not found, cl_khr_gl_sharing will be disabled.")
+ENDIF(MESA_SOURCE_FOUND)
+
+Find_Package(OCLIcd)
+IF(OCLIcd_FOUND)
+ MESSAGE(STATUS "Looking for OCL ICD header file - found")
+ configure_file (
+ "intel-beignet.icd.in"
+ "intel-beignet.icd"
+ )
+ install (FILES ${CMAKE_CURRENT_BINARY_DIR}/intel-beignet.icd DESTINATION /etc/OpenCL/vendors)
+ELSE(OCLIcd_FOUND)
+ MESSAGE(STATUS "Looking for OCL ICD header file - not found")
+ENDIF(OCLIcd_FOUND)
+
+Find_Package(PythonInterp)
+
+ADD_SUBDIRECTORY(include)
+ADD_SUBDIRECTORY(backend)
+ADD_SUBDIRECTORY(src)
+ADD_SUBDIRECTORY(utests)
+ADD_SUBDIRECTORY(benchmark)
+
+SET(CPACK_PACKAGE_VERSION_MAJOR "${LIBCL_DRIVER_VERSION_MAJOR}")
+SET(CPACK_PACKAGE_VERSION_MINOR "${LIBCL_DRIVER_VERSION_MINOR}")
+SET(CPACK_PACKAGE_VERSION_PATCH "${LIBCL_DRIVER_VERSION_PATCH}")
+SET(CPACK_SOURCE_GENERATOR "TGZ;TZ")
+SET(CPACK_PACKAGE_NAME "Beignet")
+SET(CPACK_PACKAGE_VENDOR "Intel Open Source Technology Center")
+INCLUDE(CPack)
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..4362b49
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,502 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL. It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+ This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it. You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+ When we speak of free software, we are referring to freedom of use,
+not price. Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+ To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights. These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+ For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you. You must make sure that they, too, receive or can get the source
+code. If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it. And you must show them these terms so they know their rights.
+
+ We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+ To protect each distributor, we want to make it very clear that
+there is no warranty for the free library. Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+ Finally, software patents pose a constant threat to the existence of
+any free program. We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder. Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+ Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License. This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License. We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+ When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library. The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom. The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+ We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License. It also provides other free software developers Less
+of an advantage over competing non-free programs. These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries. However, the Lesser license provides advantages in certain
+special circumstances.
+
+ For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard. To achieve this, non-free programs must be
+allowed to use the library. A more frequent case is that a free
+library does the same job as widely used non-free libraries. In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+ In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software. For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+ Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+ The precise terms and conditions for copying, distribution and
+modification follow. Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library". The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+ GNU LESSER GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+ A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+ The "Library", below, refers to any such software library or work
+which has been distributed under these terms. A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language. (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+ "Source code" for a work means the preferred form of the work for
+making modifications to it. For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+ Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it). Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+ 1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+ You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+ 2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) The modified work must itself be a software library.
+
+ b) You must cause the files modified to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ c) You must cause the whole of the work to be licensed at no
+ charge to all third parties under the terms of this License.
+
+ d) If a facility in the modified Library refers to a function or a
+ table of data to be supplied by an application program that uses
+ the facility, other than as an argument passed when the facility
+ is invoked, then you must make a good faith effort to ensure that,
+ in the event an application does not supply such function or
+ table, the facility still operates, and performs whatever part of
+ its purpose remains meaningful.
+
+ (For example, a function in a library to compute square roots has
+ a purpose that is entirely well-defined independent of the
+ application. Therefore, Subsection 2d requires that any
+ application-supplied function or table used by this function must
+ be optional: if the application does not supply it, the square
+ root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library. To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License. (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.) Do not make any other change in
+these notices.
+
+ Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+ This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+ 4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+ If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library". Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+ However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library". The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+ When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library. The
+threshold for this to be true is not precisely defined by law.
+
+ If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work. (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+ Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+ 6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+ You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License. You must supply a copy of this License. If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License. Also, you must do one
+of these things:
+
+ a) Accompany the work with the complete corresponding
+ machine-readable source code for the Library including whatever
+ changes were used in the work (which must be distributed under
+ Sections 1 and 2 above); and, if the work is an executable linked
+ with the Library, with the complete machine-readable "work that
+ uses the Library", as object code and/or source code, so that the
+ user can modify the Library and then relink to produce a modified
+ executable containing the modified Library. (It is understood
+ that the user who changes the contents of definitions files in the
+ Library will not necessarily be able to recompile the application
+ to use the modified definitions.)
+
+ b) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (1) uses at run time a
+ copy of the library already present on the user's computer system,
+ rather than copying library functions into the executable, and (2)
+ will operate properly with a modified version of the library, if
+ the user installs one, as long as the modified version is
+ interface-compatible with the version that the work was made with.
+
+ c) Accompany the work with a written offer, valid for at
+ least three years, to give the same user the materials
+ specified in Subsection 6a, above, for a charge no more
+ than the cost of performing this distribution.
+
+ d) If distribution of the work is made by offering access to copy
+ from a designated place, offer equivalent access to copy the above
+ specified materials from the same place.
+
+ e) Verify that the user has already received a copy of these
+ materials or that you have already sent this user a copy.
+
+ For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it. However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+ It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system. Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+ 7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+ a) Accompany the combined library with a copy of the same work
+ based on the Library, uncombined with any other library
+ facilities. This must be distributed under the terms of the
+ Sections above.
+
+ b) Give prominent notice with the combined library of the fact
+ that part of it is a work based on the Library, and explaining
+ where to find the accompanying uncombined form of the same work.
+
+ 8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License. Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License. However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+ 9. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Library or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+ 10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+ 11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all. For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded. In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+ 13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation. If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+ 14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission. For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this. Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+ NO WARRANTY
+
+ 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Libraries
+
+ If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change. You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+ To apply these terms, attach the following notices to the library. It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the library's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the
+ library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+ <signature of Ty Coon>, 1 April 1990
+ Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/NEWS.mdwn b/NEWS.mdwn
new file mode 120000
index 0000000..dc4cb4b
--- /dev/null
+++ b/NEWS.mdwn
@@ -0,0 +1 @@
+docs/NEWS.mdwn
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 120000
index 0000000..b9f23a8
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+docs/Beignet.mdwn
\ No newline at end of file
diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
new file mode 100644
index 0000000..6a31c68
--- /dev/null
+++ b/backend/CMakeLists.txt
@@ -0,0 +1,108 @@
+project (GBE)
+set (LIBGBE_VERSION_MAJOR 0)
+set (LIBGBE_VERSION_MINOR 2)
+cmake_minimum_required (VERSION 2.6.0)
+
+set (GBE_CMAKE_DIR "${GBE_SOURCE_DIR}/cmake")
+set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${GBE_CMAKE_DIR}")
+
+##############################################################
+# Compilation directives
+##############################################################
+
+set (GBE_DEBUG_MEMORY false CACHE bool "Activate the memory debugger")
+set (GBE_USE_BLOB false CACHE bool "Compile everything from one big file")
+
+##############################################################
+# Compiler
+##############################################################
+if (UNIX)
+ set (COMPILER "GCC" CACHE INT "Compiler to choose on Linux (GCC,ICC,CLANG)")
+endif (UNIX)
+
+# Force Release with debug info
+if (NOT CMAKE_BUILD_TYPE)
+ set (CMAKE_BUILD_TYPE RelWithDebInfo)
+endif (NOT CMAKE_BUILD_TYPE)
+set (CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "assure config" FORCE)
+message(STATUS "Building mode: " ${CMAKE_BUILD_TYPE})
+
+if (GBE_DEBUG_MEMORY)
+ set (GBE_DEBUG_MEMORY_FLAG "-DGBE_DEBUG_MEMORY=1")
+else (GBE_DEBUG_MEMORY)
+ set (GBE_DEBUG_MEMORY_FLAG "-DGBE_DEBUG_MEMORY=0")
+endif (GBE_DEBUG_MEMORY)
+
+# Hide all symbols and allows the symbols declared as visible to be exported
+set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden -DGBE_COMPILER_AVAILABLE=1 ${CMAKE_C_CXX_FLAGS}")
+
+if (COMPILER STREQUAL "GCC")
+ set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -Wstrict-aliasing=2 -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall")
+ set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} ${LLVM_CFLAGS}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -Wno-invalid-offsetof -fno-rtti -std=c++0x")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-E")
+ set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-undefined ${LLVM_LFLAGS}")
+ set (CMAKE_CXX_FLAGS_DEBUG "-g -DGBE_DEBUG=1")
+ set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
+ set (CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG -DGBE_DEBUG=0")
+ set (CMAKE_CXX_FLAGS_RELEASE "-O2 -DNDEBUG -DGBE_DEBUG=0")
+ set (CMAKE_C_FLAGS "${CMAKE_C_CXX_FLAGS}")
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wl,-E")
+ set (CMAKE_C_FLAGS_DEBUG "-g -DGBE_DEBUG=1")
+ set (CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
+ set (CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG -DGBE_DEBUG=0")
+ set (CMAKE_C_FLAGS_RELEASE "-O2 -DNDEBUG -DGBE_DEBUG=0")
+elseif (COMPILER STREQUAL "CLANG")
+ set (CMAKE_C_COMPILER "clang")
+ set (CMAKE_C_FLAGS "-Wall -std=c99")
+ set (CMAKE_C_FLAGS_DEBUG "-g -DGBE_DEBUG=1")
+ set (CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
+ set (CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG -DGBE_DEBUG=0")
+ set (CMAKE_C_FLAGS_RELEASE "-O2 -DNDEBUG -DGBE_DEBUG=0")
+ set (CMAKE_CXX_COMPILER "clang++")
+ set (CMAKE_CXX_FLAGS "-fstrict-aliasing -msse2 -fPIC -Wall -Wno-format-security -Wno-invalid-offsetof -std=c++0x")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${VISIBILITY_FLAG}")
+ set (CMAKE_CXX_FLAGS_DEBUG "-g -DGBE_DEBUG=1")
+ set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
+ set (CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG -DGBE_DEBUG=0")
+ set (CMAKE_CXX_FLAGS_RELEASE "-O2 -DNDEBUG -DGBE_DEBUG=0")
+ set (CMAKE_AR "/usr/bin/llvm-ar")
+ set (CMAKE_LINKER "/usr/bin/llvm-ld")
+ set (CMAKE_NM "/usr/bin/llvm-nm")
+ set (CMAKE_OBJDUMP "/usr/bin/llvm-objdump")
+ set (CMAKE_RANLIB "ranlib")
+elseif (COMPILER STREQUAL "ICC")
+ set (CMAKE_CXX_COMPILER "icpc")
+ set (CMAKE_C_COMPILER "icc")
+ set (CMAKE_CXX_FLAGS "-std=c++0x -wd2928 -Wall -fPIC -fstrict-aliasing -fp-model fast -xSSE2")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${VISIBILITY_FLAG} -Wl,-E")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MODE_FLAG}")
+ set (CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DGBE_DEBUG=1")
+ set (CCMAKE_CXX_FLAGS_RELWITHDEBINFO "-g -O2 -DGBE_DEBUG=1")
+ set (CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -O2 -DGBE_DEBUG=0")
+ set (CCMAKE_CXX_FLAGS_MINSIZEREL "-Os -DGBE_DEBUG=0")
+ set (CMAKE_EXE_LINKER_FLAGS "")
+endif ()
+
+include_directories (${CMAKE_CURRENT_BINARY_DIR})
+##############################################################
+# Project source code
+##############################################################
+add_subdirectory (src)
+set(LOCAL_PCH_OBJECT_DIR ${LOCAL_PCH_OBJECT_DIR} PARENT_SCOPE)
+set(LOCAL_PCM_OBJECT_DIR ${LOCAL_PCM_OBJECT_DIR} PARENT_SCOPE)
+set(LOCAL_GBE_OBJECT_DIR ${LOCAL_GBE_OBJECT_DIR} PARENT_SCOPE)
+set(LOCAL_INTERP_OBJECT_DIR ${LOCAL_INTERP_OBJECT_DIR} PARENT_SCOPE)
+
+set (GBE_BIN_GENERATER
+ OCL_PCM_PATH=${LOCAL_PCM_OBJECT_DIR} OCL_PCH_PATH=${LOCAL_PCH_OBJECT_DIR} LD_LIBRARY_PATH=${CMAKE_CURRENT_BINARY_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src/gbe_bin_generater
+ PARENT_SCOPE)
+
diff --git a/backend/kernels/compile.sh b/backend/kernels/compile.sh
new file mode 100755
index 0000000..f6bb834
--- /dev/null
+++ b/backend/kernels/compile.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+clang -emit-llvm -O3 -target nvptx -c $1 -o $1.o
+llvm-dis $1.o
+rm $1.o
+mv $1.o.ll $1.ll
+
diff --git a/backend/src/.gitignore b/backend/src/.gitignore
new file mode 100644
index 0000000..d0ee832
--- /dev/null
+++ b/backend/src/.gitignore
@@ -0,0 +1,7 @@
+GBEConfig.h
+libgbe.so
+ocl_common_defines_str.cpp
+ocl_stdlib.h
+ocl_stdlib.h.pch
+ocl_stdlib_str.cpp
+ocl_vector.h
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
new file mode 100644
index 0000000..a3818ab
--- /dev/null
+++ b/backend/src/CMakeLists.txt
@@ -0,0 +1,236 @@
+set (ocl_vector_spec_file ${GBE_SOURCE_DIR}/src/builtin_vector_proto.def)
+set (ocl_vector_file ${GBE_SOURCE_DIR}/src/ocl_vector.h)
+set (ocl_as_file ${GBE_SOURCE_DIR}/src/ocl_as.h)
+set (ocl_convert_file ${GBE_SOURCE_DIR}/src/ocl_convert.h)
+set (ocl_stdlib_tmpl_file ${GBE_SOURCE_DIR}/src/ocl_stdlib.tmpl.h)
+set (ocl_common_header_file ${GBE_SOURCE_DIR}/src/ocl_common_defines.h)
+set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}${BEIGNET_INSTALL_DIR}ocl_stdlib.h)
+set (ocl_blob_cpp_file ${GBE_SOURCE_DIR}/src/ocl_stdlib_str.cpp)
+set (ocl_gen_blob_cmd ${GBE_SOURCE_DIR}/src/update_blob_ocl_header.py)
+set (ocl_gen_vector_cmd ${GBE_SOURCE_DIR}/src/gen_builtin_vector.py)
+
+set (string_header "\\\"string\\\"")
+add_custom_command(
+ OUTPUT ${ocl_blob_cpp_file}
+ COMMAND rm -rf ${ocl_blob_cpp_file}
+ COMMAND echo "\\\#include ${string_header}" >> ${ocl_blob_cpp_file}
+ COMMAND echo "namespace gbe {" >> ${ocl_blob_cpp_file}
+ COMMAND echo "std::string ocl_stdlib_str = " >> ${ocl_blob_cpp_file}
+ # Yeah!!! welcome to back slash hell
+ COMMAND cat ${ocl_blob_file} |sed 's/\\\\/\\\\\\\\/g' | sed 's/\\\"/\\\\\\\"/g' | awk '{ printf \(\"\\"%s\\\\n\\"\\n\", $$0\) }' >> ${ocl_blob_cpp_file}
+ COMMAND echo "\;" >> ${ocl_blob_cpp_file}
+ COMMAND echo "}" >> ${ocl_blob_cpp_file}
+ COMMAND echo "" >> ${ocl_blob_cpp_file}
+ DEPENDS ${ocl_blob_file})
+
+set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "ocl_vector.h;${ocl_blob_file}")
+
+add_custom_command(
+ OUTPUT ${ocl_vector_file}
+ COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_vector_cmd} ${ocl_vector_spec_file} ${ocl_vector_file}
+ DEPENDS ${ocl_gen_vector_cmd} ${ocl_vector_spec_file}
+ )
+
+add_custom_command(
+ OUTPUT ${ocl_blob_file}
+ COMMAND mkdir -p ${CMAKE_CURRENT_BINARY_DIR}/${BEIGNET_INSTALL_DIR}
+ COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
+ DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_common_header_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file}
+ )
+
+set (pch_object ${ocl_blob_file}.pch)
+set (local_pch_object ${ocl_blob_file}.local.pch)
+# generate pch object
+if (LLVM_VERSION_NODOT VERSION_GREATER 32)
+ set (clang_cmd -cc1 -x cl -triple spir -ffp-contract=off -cl-kernel-arg-info)
+else (LLVM_VERSION_NODOT VERSION_GREATER 32)
+ if (LLVM_VERSION_NODOT VERSION_GREATER 31)
+ set (clang_cmd -cc1 -x cl -triple nvptx -ffp-contract=off)
+ else (LLVM_VERSION_NODOT VERSION_GREATER 31)
+ set (clang_cmd -cc1 -x cl -triple ptx32)
+ endif (LLVM_VERSION_NODOT VERSION_GREATER 31)
+endif (LLVM_VERSION_NODOT VERSION_GREATER 32)
+set (clang_cmd ${clang_cmd} -cl-std=CL1.2 -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
+
+add_custom_command(
+ OUTPUT ${pch_object}
+ COMMAND rm -f ${pch_object}
+ COMMAND ${LLVM_INSTALL_DIR}clang ${clang_cmd} --relocatable-pch -emit-pch -isysroot ${CMAKE_CURRENT_BINARY_DIR} ${ocl_blob_file} -o ${pch_object}
+ COMMAND ${LLVM_INSTALL_DIR}clang ${clang_cmd} -emit-pch ${ocl_blob_file} -o ${local_pch_object}
+ DEPENDS ${ocl_blob_file}
+ )
+
+add_custom_target(pch_object
+ DEPENDS ${pch_object})
+
+macro(ll_add_library ll_lib ll_sources)
+ foreach (ll ${${ll_sources}})
+ add_custom_command(
+ OUTPUT ${ll}.bc
+ COMMAND rm -f ${ll}.bc
+ COMMAND ${LLVM_INSTALL_DIR}llvm-as -o ${ll}.bc ${GBE_SOURCE_DIR}/src/${ll}
+ DEPENDS ${ll}
+ )
+ set (ll_objects ${ll_objects} ${ll}.bc)
+ endforeach (ll ${ll_sources})
+ add_custom_command(
+ OUTPUT ${ll_lib}
+ COMMAND ${LLVM_INSTALL_DIR}llvm-link -o ${ll_lib} ${ll_objects}
+ DEPENDS ${ll_objects}
+ )
+ add_custom_target(${ll_lib}
+ DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${ll_lib})
+endmacro(ll_add_library)
+
+if (GBE_USE_BLOB)
+ set (GBE_SRC
+ blob.cpp
+ backend/gen/gen_mesa_disasm.c)
+else (GBE_USE_BLOB)
+ set (GBE_SRC
+ ${ocl_blob_file}
+ ocl_stdlib_str.cpp # this file is auto-generated.
+ sys/vector.hpp
+ sys/hash_map.hpp
+ sys/map.hpp
+ sys/set.hpp
+ sys/intrusive_list.hpp
+ sys/intrusive_list.cpp
+ sys/exception.hpp
+ sys/assert.cpp
+ sys/assert.hpp
+ sys/alloc.cpp
+ sys/alloc.hpp
+ sys/mutex.cpp
+ sys/mutex.hpp
+ sys/platform.cpp
+ sys/platform.hpp
+ sys/cvar.cpp
+ sys/cvar.hpp
+ ir/context.cpp
+ ir/context.hpp
+ ir/profile.cpp
+ ir/profile.hpp
+ ir/type.cpp
+ ir/type.hpp
+ ir/unit.cpp
+ ir/unit.hpp
+ ir/constant.cpp
+ ir/constant.hpp
+ ir/sampler.cpp
+ ir/sampler.hpp
+ ir/image.cpp
+ ir/image.hpp
+ ir/instruction.cpp
+ ir/instruction.hpp
+ ir/liveness.cpp
+ ir/register.cpp
+ ir/register.hpp
+ ir/function.cpp
+ ir/function.hpp
+ ir/value.cpp
+ ir/value.hpp
+ ir/lowering.cpp
+ ir/lowering.hpp
+ ir/printf.cpp
+ ir/printf.hpp
+ ir/immediate.hpp
+ ir/immediate.cpp
+ backend/context.cpp
+ backend/context.hpp
+ backend/program.cpp
+ backend/program.hpp
+ backend/program.h
+ llvm/llvm_gen_backend.cpp
+ llvm/llvm_passes.cpp
+ llvm/llvm_scalarize.cpp
+ llvm/llvm_intrinsic_lowering.cpp
+ llvm/llvm_barrier_nodup.cpp
+ llvm/llvm_printf_parser.cpp
+ llvm/llvm_to_gen.cpp
+ llvm/llvm_loadstore_optimization.cpp
+ llvm/llvm_gen_backend.hpp
+ llvm/llvm_gen_ocl_function.hxx
+ llvm/llvm_to_gen.hpp
+ backend/gen/gen_mesa_disasm.c
+ backend/gen_insn_selection.cpp
+ backend/gen_insn_selection.hpp
+ backend/gen_insn_scheduling.cpp
+ backend/gen_insn_scheduling.hpp
+ backend/gen_reg_allocation.cpp
+ backend/gen_reg_allocation.hpp
+ backend/gen_context.cpp
+ backend/gen_context.cpp
+ backend/gen75_context.hpp
+ backend/gen75_context.cpp
+ backend/gen_program.cpp
+ backend/gen_program.hpp
+ backend/gen_program.h
+ backend/gen_defs.hpp
+ backend/gen_insn_compact.cpp
+ backend/gen_encoder.hpp
+ backend/gen_encoder.cpp
+ backend/gen75_encoder.hpp
+ backend/gen75_encoder.cpp
+ )
+
+endif (GBE_USE_BLOB)
+
+include_directories (.)
+link_directories (${LLVM_LIBRARY_DIRS} ${DRM_LIBDIR})
+include_directories(${LLVM_INCLUDE_DIRS})
+add_library (gbe SHARED ${GBE_SRC})
+
+# for pre compiled module library.
+set (pcm_lib "beignet.bc")
+set (pcm_sources ocl_barrier.ll ocl_memset.ll ocl_memcpy.ll)
+ll_add_library (${pcm_lib} pcm_sources)
+
+ADD_DEPENDENCIES (gbe pch_object ${pcm_lib})
+target_link_libraries(
+ gbe
+ ${DRM_INTEL_LIBRARIES}
+ ${DRM_LIBRARIES}
+ ${CLANG_LIBRARIES}
+ ${LLVM_MODULE_LIBS}
+ ${LLVM_SYSTEM_LIBS}
+ ${CMAKE_THREAD_LIBS_INIT}
+ ${CMAKE_DL_LIBS})
+
+add_library(gbeinterp SHARED gbe_bin_interpreter.cpp)
+
+if (LLVM_VERSION_NODOT VERSION_EQUAL 34)
+ find_library(TERMINFO NAMES tinfo ncurses)
+ if (${TERMINFO} STREQUAL TERMINFO-NOTFOUND)
+ message(FATAL_ERROR "no libtinfo or libncurses is found in system")
+ else (${TERMINFO} STREQUAL TERMINFO-NOTFOUND)
+ target_link_libraries(gbe ${TERMINFO})
+ message(STATUS "use ${TERMINFO} as terminal control library")
+ endif (${TERMINFO} STREQUAL TERMINFO-NOTFOUND)
+endif(LLVM_VERSION_NODOT VERSION_EQUAL 34)
+
+link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
+ADD_EXECUTABLE(gbe_bin_generater gbe_bin_generater.cpp)
+TARGET_LINK_LIBRARIES(gbe_bin_generater gbe)
+
+install (TARGETS gbe LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
+install (TARGETS gbeinterp LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
+#install (FILES backend/program.h DESTINATION include/gen)
+install (FILES ${ocl_blob_file} DESTINATION ${BEIGNET_INSTALL_DIR})
+install (FILES ${pch_object} DESTINATION ${BEIGNET_INSTALL_DIR})
+install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${pcm_lib} DESTINATION ${BEIGNET_INSTALL_DIR})
+# When build beignet itself, we need to export the local precompiled header file and precompiled module
+# file to libcl and utests.
+set (LOCAL_PCH_OBJECT_DIR "${local_pch_object}:${BEIGNET_INSTALL_DIR}/ocl_stdlib.h.pch" PARENT_SCOPE)
+set (LOCAL_PCM_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/${pcm_lib}:${BEIGNET_INSTALL_DIR}/${pcm_lib}" PARENT_SCOPE)
+set (LOCAL_GBE_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbe.so" PARENT_SCOPE)
+set (LOCAL_INTERP_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbeinterp.so" PARENT_SCOPE)
+
+set (PCH_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/ocl_stdlib.h.pch")
+set (PCM_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/${pcm_lib}")
+set (GBE_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbe.so")
+set (INTERP_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbeinterp.so")
+configure_file (
+ "GBEConfig.h.in"
+ "GBEConfig.h"
+)
diff --git a/backend/src/GBEConfig.h.in b/backend/src/GBEConfig.h.in
new file mode 100644
index 0000000..f5c69c6
--- /dev/null
+++ b/backend/src/GBEConfig.h.in
@@ -0,0 +1,7 @@
+// the configured options and settings for LIBGBE
+#define LIBGBE_VERSION_MAJOR @LIBGBE_VERSION_MAJOR@
+#define LIBGBE_VERSION_MINOR @LIBGBE_VERSION_MINOR@
+#define PCH_OBJECT_DIR "@PCH_OBJECT_DIR@"
+#define PCM_OBJECT_DIR "@PCM_OBJECT_DIR@"
+#define GBE_OBJECT_DIR "@GBE_OBJECT_DIR@"
+#define INTERP_OBJECT_DIR "@INTERP_OBJECT_DIR@"
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
new file mode 100644
index 0000000..e09a309
--- /dev/null
+++ b/backend/src/backend/context.cpp
@@ -0,0 +1,585 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file context.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "backend/context.hpp"
+#include "backend/program.hpp"
+#include "backend/gen_encoder.hpp"
+#include "ir/unit.hpp"
+#include "ir/function.hpp"
+#include "ir/profile.hpp"
+#include "ir/liveness.hpp"
+#include "ir/value.hpp"
+#include "ir/image.hpp"
+#include "sys/cvar.hpp"
+#include <algorithm>
+
+namespace gbe
+{
+ class SimpleAllocator
+ {
+ public:
+ SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
+ ~SimpleAllocator(void);
+
+ /*! Allocate some memory from the pool.
+ */
+ int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
+
+ /*! Free the given register file piece */
+ void deallocate(int16_t offset);
+
+ /*! Spilt a block into 2 blocks */
+ void splitBlock(int16_t offset, int16_t subOffset);
+
+ protected:
+ /*! Double chained list of free spaces */
+ struct Block {
+ Block(int16_t offset, int16_t size) :
+ prev(NULL), next(NULL), offset(offset), size(size) {}
+ Block *prev, *next; //!< Previous and next free blocks
+ int16_t offset; //!< Where the free block starts
+ int16_t size; //!< Size of the free block
+ };
+
+ /*! Try to coalesce two blocks (left and right). They must be in that order.
+ * If the colascing was done, the left block is deleted
+ */
+ void coalesce(Block *left, Block *right);
+ /*! the maximum offset */
+ int16_t maxOffset;
+ /*! whether trigger an assertion on allocation failure */
+ bool assertFail;
+ /*! Head and tail of the free list */
+ Block *head;
+ Block *tail;
+ /*! Handle free list element allocation */
+ DECL_POOL(Block, blockPool);
+ /*! Track allocated memory blocks <offset, size> */
+ map<int16_t, int16_t> allocatedBlocks;
+ /*! Use custom allocators */
+ GBE_CLASS(SimpleAllocator);
+ };
+
+ /*! Structure that keeps track of allocation in the register file. This is
+ * actually needed by Context (and not only by GenContext) because both
+ * simulator and hardware have to deal with constant pushing which uses the
+ * register file
+ *
+ * Since Gen is pretty flexible, we just reuse the Simpleallocator
+ */
+
+ class RegisterAllocator: public SimpleAllocator {
+ public:
+ RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size, false) {}
+
+ GBE_CLASS(RegisterAllocator);
+ };
+
+ /*!
+ * an allocator for scratch memory allocation. Scratch memory are used for register spilling.
+ * You can query how much scratch memory needed through getMaxScatchMemUsed().
+ */
+
+ class ScratchAllocator: public SimpleAllocator {
+ public:
+ ScratchAllocator(int16_t size): SimpleAllocator(0, size, true) {}
+ int16_t getMaxScatchMemUsed() { return maxOffset; }
+
+ GBE_CLASS(ScratchAllocator);
+ };
+
+ SimpleAllocator::SimpleAllocator(int16_t startOffset,
+ int16_t size,
+ bool _assertFail)
+ : maxOffset(0),
+ assertFail(_assertFail){
+ tail = head = this->newBlock(startOffset, size);
+ }
+
+ SimpleAllocator::~SimpleAllocator(void) {
+ while (this->head) {
+ Block *next = this->head->next;
+ this->deleteBlock(this->head);
+ this->head = next;
+ }
+ }
+
+ int16_t SimpleAllocator::allocate(int16_t size, int16_t alignment, bool bFwd)
+ {
+ // Make it simple and just use the first block we find
+ Block *list = bFwd ? head : tail;
+ while (list) {
+ int16_t aligned;
+ int16_t spaceOnLeft;
+ int16_t spaceOnRight;
+ if(bFwd) {
+ aligned = ALIGN(list->offset, alignment);
+ spaceOnLeft = aligned - list->offset;
+ spaceOnRight = list->size - size - spaceOnLeft;
+
+ // Not enough space in this block
+ if (spaceOnRight < 0) {
+ list = list->next;
+ continue;
+ }
+ } else {
+ int16_t unaligned = list->offset + list->size - size - (alignment-1);
+ if(unaligned < 0) {
+ list = list->prev;
+ continue;
+ }
+ aligned = ALIGN(unaligned, alignment); //alloc from block's tail
+ spaceOnLeft = aligned - list->offset;
+ spaceOnRight = list->size - size - spaceOnLeft;
+
+ // Not enough space in this block
+ if (spaceOnLeft < 0) {
+ list = list->prev;
+ continue;
+ }
+ }
+
+ // Cool we can use this block
+ Block *left = list->prev;
+ Block *right = list->next;
+
+ // If we left a hole on the left, create a new block
+ if (spaceOnLeft) {
+ Block *newBlock = this->newBlock(list->offset, spaceOnLeft);
+ if (left) {
+ left->next = newBlock;
+ newBlock->prev = left;
+ }
+ if (right) {
+ newBlock->next = right;
+ right->prev = newBlock;
+ }
+ left = newBlock;
+ }
+
+ // If we left a hole on the right, create a new block as well
+ if (spaceOnRight) {
+ Block *newBlock = this->newBlock(aligned + size, spaceOnRight);
+ if (left) {
+ left->next = newBlock;
+ newBlock->prev = left;
+ }
+ if (right) {
+ right->prev = newBlock;
+ newBlock->next = right;
+ }
+ right = newBlock;
+ }
+
+ // Chain both successors and predecessors when the entire block was
+ // allocated
+ if (spaceOnLeft == 0 && spaceOnRight == 0) {
+ if (left) left->next = right;
+ if (right) right->prev = left;
+ }
+
+ // Update the head of the free blocks
+ if (list == head) {
+ if (left)
+ head = left;
+ else if (right)
+ head = right;
+ else
+ head = NULL;
+ }
+
+ // Update the tail of the free blocks
+ if (list == tail) {
+ if (right)
+ tail = right;
+ else if (left)
+ tail = left;
+ else
+ tail = NULL;
+ }
+ // Free the block and check the consistency
+ this->deleteBlock(list);
+ if (head && head->next) GBE_ASSERT(head->next->prev == head);
+ if (tail && tail->prev) GBE_ASSERT(tail->prev->next == tail);
+
+ // Track the allocation to retrieve the size later
+ allocatedBlocks.insert(std::make_pair(aligned, size));
+ // update max offset
+ if(aligned + size > maxOffset) maxOffset = aligned + size;
+ // We have a valid offset now
+ return aligned;
+ }
+ GBE_ASSERT( !assertFail );
+ return 0;
+ }
+
+ void SimpleAllocator::deallocate(int16_t offset)
+ {
+ // Retrieve the size in the allocation map
+ auto it = allocatedBlocks.find(offset);
+ GBE_ASSERT(it != allocatedBlocks.end());
+ const int16_t size = it->second;
+
+ // Find the two blocks where to insert the new block
+ Block *list = tail, *next = NULL;
+ while (list != NULL) {
+ if (list->offset < offset)
+ break;
+ next = list;
+ list = list->prev;
+ }
+
+ // Create the block and insert it
+ Block *newBlock = this->newBlock(offset, size);
+ if (list) {
+ GBE_ASSERT(list->offset + list->size <= offset);
+ list->next = newBlock;
+ newBlock->prev = list;
+ } else
+ this->head = newBlock; // list is NULL means newBlock should be the head.
+
+ if (next) {
+ GBE_ASSERT(offset + size <= next->offset);
+ next->prev = newBlock;
+ newBlock->next = next;
+ } else
+ this->tail = newBlock; // next is NULL means newBlock should be the tail.
+
+ if (list != NULL || next != NULL)
+ {
+ // Coalesce the blocks if possible
+ this->coalesce(list, newBlock);
+ this->coalesce(newBlock, next);
+ }
+
+ // Do not track this allocation anymore
+ allocatedBlocks.erase(it);
+ }
+
+ void SimpleAllocator::coalesce(Block *left, Block *right) {
+ if (left == NULL || right == NULL) return;
+ GBE_ASSERT(left->offset < right->offset);
+ GBE_ASSERT(left->next == right);
+ GBE_ASSERT(right->prev == left);
+ if (left->offset + left->size == right->offset) {
+ right->offset = left->offset;
+ right->size += left->size;
+ if (left->prev) left->prev->next = right;
+ right->prev = left->prev;
+ if (left == this->head)
+ this->head = right;
+ this->deleteBlock(left);
+ }
+ }
+
+ void SimpleAllocator::splitBlock(int16_t offset, int16_t subOffset) {
+ // Retrieve the size in the allocation map
+ auto it = allocatedBlocks.find(offset);
+ GBE_ASSERT(it != allocatedBlocks.end());
+
+ while(subOffset > it->second) {
+ subOffset -= it->second;
+ offset += it->second;
+ it = allocatedBlocks.find(offset);
+ GBE_ASSERT(it != allocatedBlocks.end());
+ }
+
+ if(subOffset == 0)
+ return;
+ int16_t size = it->second;
+ allocatedBlocks.erase(it);
+ // Track the allocation to retrieve the size later
+ allocatedBlocks.insert(std::make_pair(offset, subOffset));
+ allocatedBlocks.insert(std::make_pair(offset + subOffset, size - subOffset));
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Generic Context (shared by the simulator and the HW context)
+ ///////////////////////////////////////////////////////////////////////////
+ IVAR(OCL_SIMD_WIDTH, 8, 15, 16);
+
+ Context::Context(const ir::Unit &unit, const std::string &name) :
+ unit(unit), fn(*unit.getFunction(name)), name(name), liveness(NULL), dag(NULL)
+ {
+ GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS);
+ this->liveness = GBE_NEW(ir::Liveness, const_cast<ir::Function&>(fn));
+ this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
+ // r0 (GEN_REG_SIZE) is always set by the HW and used at the end by EOT
+ this->registerAllocator = NULL; //GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE);
+ this->scratchAllocator = NULL; //GBE_NEW(ScratchAllocator, 12*KB);
+ }
+
+ Context::~Context(void) {
+ GBE_SAFE_DELETE(this->registerAllocator);
+ GBE_SAFE_DELETE(this->scratchAllocator);
+ GBE_SAFE_DELETE(this->dag);
+ GBE_SAFE_DELETE(this->liveness);
+ }
+
+ void Context::startNewCG(uint32_t simdWidth) {
+ if (simdWidth == 0 || OCL_SIMD_WIDTH != 15)
+ this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
+ else
+ this->simdWidth = simdWidth;
+ GBE_SAFE_DELETE(this->registerAllocator);
+ GBE_SAFE_DELETE(this->scratchAllocator);
+ GBE_ASSERT(dag != NULL && liveness != NULL);
+ this->registerAllocator = GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE);
+ this->scratchAllocator = GBE_NEW(ScratchAllocator, this->getScratchSize());
+ this->curbeRegs.clear();
+ this->JIPs.clear();
+ }
+
+ Kernel *Context::compileKernel(void) {
+ this->kernel = this->allocateKernel();
+ this->kernel->simdWidth = this->simdWidth;
+ this->buildArgList();
+ if (usedLabels.size() == 0)
+ this->buildUsedLabels();
+ if (JIPs.size() == 0)
+ this->buildJIPs();
+ this->buildStack();
+ this->handleSLM();
+ if (this->emitCode() == false) {
+ GBE_DELETE(this->kernel);
+ this->kernel = NULL;
+ }
+ if(this->kernel != NULL) {
+ this->kernel->scratchSize = this->alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
+ this->kernel->ctx = this;
+ }
+ return this->kernel;
+ }
+
+ int16_t Context::allocate(int16_t size, int16_t alignment) {
+ return registerAllocator->allocate(size, alignment);
+ }
+
+ void Context::deallocate(int16_t offset) { registerAllocator->deallocate(offset); }
+
+ void Context::splitBlock(int16_t offset, int16_t subOffset) {
+ registerAllocator->splitBlock(offset, subOffset);
+ }
+
+ // FIXME TODO as we optimize scratch memory usage using the register interval.
+ // we need to add some dependency in post_reg_alloc scheduler, to keep scratch
+ // memory that are reused still keep the order
+
+ int32_t Context::allocateScratchMem(uint32_t size) {
+ return scratchAllocator->allocate(size, 32, true);
+ }
+ void Context::deallocateScratchMem(int32_t offset) {
+ scratchAllocator->deallocate(offset);
+ }
+
+ void Context::buildStack(void) {
+ const auto &stackUse = dag->getUse(ir::ocl::stackptr);
+ if (stackUse.size() == 0) // no stack is used if stackptr is unused
+ return;
+ // Be sure that the stack pointer is set
+ // GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
+ uint32_t stackSize = 1*KB;
+ while (stackSize < fn.getStackSize()) {
+ stackSize <<= 1;
+ GBE_ASSERT(stackSize <= 64*KB);
+ }
+ this->kernel->stackSize = stackSize;
+ }
+
+ uint32_t Context::newCurbeEntry(gbe_curbe_type value,
+ uint32_t subValue,
+ uint32_t size,
+ uint32_t alignment)
+ {
+ alignment = alignment == 0 ? size : alignment;
+ const uint32_t offset = registerAllocator->allocate(size, alignment, 1);
+ GBE_ASSERT(offset >= GEN_REG_SIZE);
+ kernel->patches.push_back(PatchInfo(value, subValue, offset - GEN_REG_SIZE));
+ kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
+ return offset;
+ }
+
+ uint32_t Context::getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size)
+ {
+ int32_t offset = fn.getImageSet()->getInfoOffset(key);
+ if (offset >= 0)
+ return offset + GEN_REG_SIZE;
+ newCurbeEntry(GBE_CURBE_IMAGE_INFO, key.data, size, 4);
+ std::sort(kernel->patches.begin(), kernel->patches.end());
+
+ offset = kernel->getCurbeOffset(GBE_CURBE_IMAGE_INFO, key.data);
+ GBE_ASSERT(offset >= 0); // XXX do we need to spill it out to bo?
+ fn.getImageSet()->appendInfo(key, offset);
+ return offset + GEN_REG_SIZE;
+ }
+
+ void Context::insertCurbeReg(ir::Register reg, uint32_t offset) {
+ curbeRegs.insert(std::make_pair(reg, offset));
+ }
+ ir::Register Context::getSurfaceBaseReg(unsigned char bti) {
+ return fn.getSurfaceBaseReg(bti);
+ }
+
+ void Context::buildArgList(void) {
+ kernel->argNum = fn.argNum();
+ if (kernel->argNum)
+ kernel->args = GBE_NEW_ARRAY_NO_ARG(KernelArgument, kernel->argNum);
+ else
+ kernel->args = NULL;
+ for (uint32_t argID = 0; argID < kernel->argNum; ++argID) {
+ const auto &arg = fn.getArg(argID);
+
+ kernel->args[argID].align = arg.align;
+ kernel->args[argID].info = arg.info;
+ switch (arg.type) {
+ case ir::FunctionArgument::VALUE:
+ case ir::FunctionArgument::STRUCTURE:
+ kernel->args[argID].type = GBE_ARG_VALUE;
+ kernel->args[argID].size = arg.size;
+ break;
+ case ir::FunctionArgument::GLOBAL_POINTER:
+ kernel->args[argID].type = GBE_ARG_GLOBAL_PTR;
+ kernel->args[argID].size = sizeof(void*);
+ kernel->args[argID].bti = arg.bti;
+ break;
+ case ir::FunctionArgument::CONSTANT_POINTER:
+ kernel->args[argID].type = GBE_ARG_CONSTANT_PTR;
+ kernel->args[argID].size = sizeof(void*);
+ break;
+ case ir::FunctionArgument::LOCAL_POINTER:
+ kernel->args[argID].type = GBE_ARG_LOCAL_PTR;
+ kernel->args[argID].size = 0;
+ break;
+ case ir::FunctionArgument::IMAGE:
+ kernel->args[argID].type = GBE_ARG_IMAGE;
+ kernel->args[argID].size = sizeof(void*);
+ break;
+ case ir::FunctionArgument::SAMPLER:
+ kernel->args[argID].type = GBE_ARG_SAMPLER;
+ kernel->args[argID].size = sizeof(void*);
+ break;
+ }
+ }
+ }
+
+ void Context::buildUsedLabels(void) {
+ usedLabels.clear();
+ fn.foreachInstruction([this](const ir::Instruction &insn) {
+ using namespace ir;
+ if (insn.getOpcode() != OP_BRA) return;
+ const LabelIndex index = cast<BranchInstruction>(insn).getLabelIndex();
+ usedLabels.insert(index);
+ });
+ }
+
+ void Context::buildJIPs(void) {
+ using namespace ir;
+
+ // Linearly store the branch target for each block and its own label
+ const LabelIndex noTarget(fn.labelNum());
+ vector<std::pair<LabelIndex, LabelIndex>> braTargets;
+ int32_t curr = 0, blockNum = fn.blockNum();
+ braTargets.resize(blockNum);
+
+ // If some blocks are unused we mark them as such by setting their own label
+ // as "invalid" (== noTarget)
+ for (auto &bb : braTargets) bb = std::make_pair(noTarget, noTarget);
+ fn.foreachBlock([&](const BasicBlock &bb) {
+ const LabelIndex ownLabel = bb.getLabelIndex();
+ const Instruction *last = bb.getLastInstruction();
+ if (last->getOpcode() != OP_BRA)
+ braTargets[curr++] = std::make_pair(ownLabel, noTarget);
+ else {
+ const BranchInstruction *bra = cast<BranchInstruction>(last);
+ braTargets[curr++] = std::make_pair(ownLabel, bra->getLabelIndex());
+ }
+ });
+
+ // Backward jumps are special. We must insert the label of the next block
+ // when we hit the "DO" i.e. the target label of the backward branch (as in
+ // do { } while) . So, we store the bwd jumps per targets
+ // XXX does not use custom allocator
+ std::multimap<LabelIndex, LabelIndex> bwdTargets;
+ for (int32_t blockID = 0; blockID < blockNum; ++blockID) {
+ const LabelIndex ownLabel = braTargets[blockID].first;
+ const LabelIndex target = braTargets[blockID].second;
+ if (ownLabel == noTarget) continue; // unused block
+ if (target == noTarget) continue; // no branch
+ if (target <= ownLabel) { // This is a backward jump
+ // Last block is just "RET". So, it cannot be the last block
+ GBE_ASSERT(blockID < blockNum - 1);
+ const LabelIndex fallThrough = braTargets[blockID+1].first;
+ bwdTargets.insert(std::make_pair(target, fallThrough));
+ }
+ }
+
+ // Stores the current forward targets
+ set<LabelIndex> fwdTargets;
+
+ // Now retraverse the blocks and figure out all JIPs
+ for (int32_t blockID = 0; blockID < blockNum; ++blockID) {
+ const LabelIndex ownLabel = braTargets[blockID].first;
+ const LabelIndex target = braTargets[blockID].second;
+ const BasicBlock &bb = fn.getBlock(ownLabel);
+ const Instruction *label = bb.getFirstInstruction();
+ const Instruction *bra = bb.getLastInstruction();
+
+ // Expires the branches that point to us (if any)
+ auto it = fwdTargets.find(ownLabel);
+ if (it != fwdTargets.end()) fwdTargets.erase(it);
+
+ // Insert the fall through of the bwd branches that point to us if any
+ auto ii = bwdTargets.equal_range(ownLabel);
+ for (auto it = ii.first; it != ii.second; ++it)
+ fwdTargets.insert(it->second);
+
+ // If there is an outstanding forward branch, compute a JIP for the label
+ auto lower = fwdTargets.lower_bound(LabelIndex(0));
+ GBE_ASSERT(label->isMemberOf<LabelInstruction>() == true);
+ if (lower != fwdTargets.end())
+ JIPs.insert(std::make_pair(label, *lower));
+
+ // Handle special cases and backward branches first
+ if (ownLabel == noTarget) continue; // unused block
+ if (target == noTarget) continue; // no branch at all
+ GBE_ASSERT(bra->isMemberOf<BranchInstruction>() == true);
+ if (target <= ownLabel) { // bwd branch: we always jump
+ JIPs.insert(std::make_pair(bra, LabelIndex(target)));
+ continue;
+ }
+
+ // This is a forward jump, register it and get the JIP
+ fwdTargets.insert(target);
+ auto jip = fwdTargets.lower_bound(LabelIndex(0));
+ JIPs.insert(std::make_pair(bra, *jip));
+ }
+ }
+
+ void Context::handleSLM(void) {
+ const bool useSLM = fn.getUseSLM();
+ kernel->useSLM = useSLM;
+ kernel->slmSize = fn.getSLMSize();
+ }
+
+} /* namespace gbe */
+
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
new file mode 100644
index 0000000..3faead2
--- /dev/null
+++ b/backend/src/backend/context.hpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_CONTEXT_HPP__
+#define __GBE_CONTEXT_HPP__
+
+#include "ir/instruction.hpp"
+#include "backend/program.h"
+#include "sys/set.hpp"
+#include "sys/map.hpp"
+#include "sys/platform.hpp"
+#include <string>
+
+namespace gbe {
+namespace ir {
+
+ class Unit; // Contains the complete program
+ class Function; // We compile a function into a kernel
+ class Liveness; // Describes liveness of each ir function register
+ class FunctionDAG; // Describes the instruction dependencies
+
+} /* namespace ir */
+} /* namespace gbe */
+
+namespace gbe
+{
+ class Kernel; // context creates Kernel
+ class RegisterAllocator; // allocator for physical register allocation
+ class ScratchAllocator; // allocator for scratch memory allocation
+
+ /*! Context is the helper structure to build the Gen ISA or simulation code
+ * from GenIR
+ */
+ class Context : public NonCopyable
+ {
+ public:
+ /*! Create a new context. name is the name of the function we want to
+ * compile
+ */
+ Context(const ir::Unit &unit, const std::string &name);
+ /*! Release everything needed */
+ virtual ~Context(void);
+ /*! start new code generation with specific simd width. */
+ void startNewCG(uint32_t simdWidth);
+ /*! Compile the code */
+ Kernel *compileKernel(void);
+ /*! Tells if the labels is used */
+ INLINE bool isLabelUsed(ir::LabelIndex index) const {
+ return usedLabels.contains(index);
+ }
+ /*! Get the function graph */
+ INLINE const ir::FunctionDAG &getFunctionDAG(void) const { return *dag; }
+ /*! Get the liveness information */
+ INLINE const ir::Liveness &getLiveness(void) const { return *liveness; }
+ /*! Tells if the register is used */
+ bool isRegUsed(const ir::Register ®) const;
+ /*! Get the kernel we are currently compiling */
+ INLINE Kernel *getKernel(void) const { return this->kernel; }
+ /*! Get the function we are currently compiling */
+ INLINE const ir::Function &getFunction(void) const { return this->fn; }
+ /*! Get the target label index for the given instruction */
+ INLINE ir::LabelIndex getLabelIndex(const ir::Instruction *insn) const {
+ GBE_ASSERT(JIPs.find(insn) != JIPs.end());
+ return JIPs.find(insn)->second;
+ }
+ /*! Only GOTO and some LABEL instructions may have JIPs */
+ INLINE bool hasJIP(const ir::Instruction *insn) const {
+ return JIPs.find(insn) != JIPs.end();
+ }
+ /*! Allocate some memory in the register file */
+ int16_t allocate(int16_t size, int16_t alignment);
+ /*! Deallocate previously allocated memory */
+ void deallocate(int16_t offset);
+ /*! Spilt a block into 2 blocks, for some registers allocate together but deallocate seperate */
+ void splitBlock(int16_t offset, int16_t subOffset);
+ /* allocate a new entry for a specific image's information */
+ /*! Get (search or allocate if fail to find one) image info curbeOffset.*/
+ uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
+ /*! allocate size scratch memory and return start address */
+ int32_t allocateScratchMem(uint32_t size);
+ /*! deallocate scratch memory at offset */
+ void deallocateScratchMem(int32_t offset);
+ /*! Preallocated curbe register set including special registers. */
+ map<ir::Register, uint32_t> curbeRegs;
+ ir::Register getSurfaceBaseReg(unsigned char bti);
+ protected:
+ /*! Build the instruction stream. Return false if failed */
+ virtual bool emitCode(void) = 0;
+ /*! Align the scratch size to the device's scratch unit size */
+ virtual uint32_t alignScratchSize(uint32_t) = 0;
+ /*! Get the device's max srcatch size */
+ virtual uint32_t getScratchSize(void) = 0;
+ /*! Allocate a new empty kernel (to be implemented) */
+ virtual Kernel *allocateKernel(void) = 0;
+ /*! Look if a stack is needed and allocate it */
+ void buildStack(void);
+ /*! Build the list of arguments to set to launch the kernel */
+ void buildArgList(void);
+ /*! Build the sets of used labels */
+ void buildUsedLabels(void);
+ /*! Build JIPs for each branch and possibly labels. Can be different from
+ * the branch target due to unstructured branches
+ */
+ void buildJIPs(void);
+ /*! Configure SLM use if needed */
+ void handleSLM(void);
+ /*! Insert a new entry with the given size in the Curbe. Return the offset
+ * of the entry
+ */
+ void insertCurbeReg(ir::Register, uint32_t grfOffset);
+ /*! allocate a curbe entry. */
+ uint32_t newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0);
+ /*! Provide for each branch and label the label index target */
+ typedef map<const ir::Instruction*, ir::LabelIndex> JIPMap;
+ const ir::Unit &unit; //!< Unit that contains the kernel
+ const ir::Function &fn; //!< Function to compile
+ std::string name; //!< Name of the kernel to compile
+ Kernel *kernel; //!< Kernel we are building
+ ir::Liveness *liveness; //!< Liveness info for the variables
+ ir::FunctionDAG *dag; //!< Graph of values on the function
+ RegisterAllocator *registerAllocator; //!< physical register allocation
+ ScratchAllocator *scratchAllocator; //!< scratch memory allocator
+ set<ir::LabelIndex> usedLabels; //!< Set of all used labels
+ JIPMap JIPs; //!< Where to jump all labels/branches
+ uint32_t simdWidth; //!< Number of lanes per HW threads
+ map<unsigned char, ir::Register> btiRegMap;
+ GBE_CLASS(Context); //!< Use custom allocators
+ };
+
+} /* namespace gbe */
+
+#endif /* __GBE_CONTEXT_HPP__ */
+
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
new file mode 100644
index 0000000..c120b60
--- /dev/null
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -0,0 +1,1302 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright � 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. The copyright holders make no representations
+ * about the suitability of this software for any purpose. It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include "backend/gen_defs.hpp"
+#include "src/cl_device_data.h"
+
+static const struct {
+ const char *name;
+ int nsrc;
+ int ndst;
+} opcode[128] = {
+ [GEN_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_FBH] = { .name = "fbh", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_FBL] = { .name = "fbl", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_F16TO32] = { .name = "f16to32", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_F32TO16] = { .name = "f32to16", .nsrc = 1, .ndst = 1 },
+
+ [GEN_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_MAD] = { .name = "mad", .nsrc = 3, .ndst = 1 },
+ [GEN_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
+
+ [GEN_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_ADDC] = { .name = "addc", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_SUBB] = { .name = "subb", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
+
+ [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_IF] = { .name = "if", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
+ [GEN_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
+ [GEN_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
+ [GEN_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
+ [GEN_OPCODE_ENDIF] = { .name = "endif", .nsrc = 1, .ndst = 0 },
+};
+
+static const char *conditional_modifier[16] = {
+ [GEN_CONDITIONAL_NONE] = "",
+ [GEN_CONDITIONAL_Z] = ".e",
+ [GEN_CONDITIONAL_NZ] = ".ne",
+ [GEN_CONDITIONAL_G] = ".g",
+ [GEN_CONDITIONAL_GE] = ".ge",
+ [GEN_CONDITIONAL_L] = ".l",
+ [GEN_CONDITIONAL_LE] = ".le",
+ [GEN_CONDITIONAL_R] = ".r",
+ [GEN_CONDITIONAL_O] = ".o",
+ [GEN_CONDITIONAL_U] = ".u",
+};
+
+static const char *negate[2] = {
+ [0] = "",
+ [1] = "-",
+};
+
+static const char *_abs[2] = {
+ [0] = "",
+ [1] = "(abs)",
+};
+
+static const char *vert_stride[16] = {
+ [0] = "0",
+ [1] = "1",
+ [2] = "2",
+ [3] = "4",
+ [4] = "8",
+ [5] = "16",
+ [6] = "32",
+ [15] = "VxH",
+};
+
+static const char *width[8] = {
+ [0] = "1",
+ [1] = "2",
+ [2] = "4",
+ [3] = "8",
+ [4] = "16",
+};
+
+static const char *horiz_stride[4] = {
+ [0] = "0",
+ [1] = "1",
+ [2] = "2",
+ [3] = "4"
+};
+
+static const char *chan_sel[4] = {
+ [0] = "x",
+ [1] = "y",
+ [2] = "z",
+ [3] = "w",
+};
+
+static const char *debug_ctrl[2] = {
+ [0] = "",
+ [1] = ".breakpoint"
+};
+
+static const char *saturate[2] = {
+ [0] = "",
+ [1] = ".sat"
+};
+
+static const char *accwr[2] = {
+ [0] = "",
+ [1] = "AccWrEnable"
+};
+
+static const char *wectrl[2] = {
+ [0] = "WE_normal",
+ [1] = "WE_all"
+};
+
+static const char *exec_size[8] = {
+ [0] = "1",
+ [1] = "2",
+ [2] = "4",
+ [3] = "8",
+ [4] = "16",
+ [5] = "32"
+};
+
+static const char *pred_inv[2] = {
+ [0] = "+",
+ [1] = "-"
+};
+
+static const char *pred_ctrl_align16[16] = {
+ [1] = "",
+ [2] = ".x",
+ [3] = ".y",
+ [4] = ".z",
+ [5] = ".w",
+ [6] = ".any4h",
+ [7] = ".all4h",
+};
+
+static const char *pred_ctrl_align1[16] = {
+ [1] = "",
+ [2] = ".anyv",
+ [3] = ".allv",
+ [4] = ".any2h",
+ [5] = ".all2h",
+ [6] = ".any4h",
+ [7] = ".all4h",
+ [8] = ".any8h",
+ [9] = ".all8h",
+ [10] = ".any16h",
+ [11] = ".all16h",
+};
+
+static const char *thread_ctrl[4] = {
+ [0] = "",
+ [2] = "switch"
+};
+
+static const char *dep_ctrl[4] = {
+ [0] = "",
+ [1] = "NoDDClr",
+ [2] = "NoDDChk",
+ [3] = "NoDDClr,NoDDChk",
+};
+
+static const char *mask_ctrl[4] = {
+ [0] = "",
+ [1] = "nomask",
+};
+
+static const char *access_mode[2] = {
+ [0] = "align1",
+ [1] = "align16",
+};
+
+static const char *reg_encoding[8] = {
+ [0] = ":UD",
+ [1] = ":D",
+ [2] = ":UW",
+ [3] = ":W",
+ [4] = ":UB",
+ [5] = ":B",
+ [6] = ":DF",
+ [7] = ":F"
+};
+
+int reg_type_size[8] = {
+ [0] = 4,
+ [1] = 4,
+ [2] = 2,
+ [3] = 2,
+ [4] = 1,
+ [5] = 1,
+ [6] = 8,
+ [7] = 4
+};
+
+static const char *reg_file[4] = {
+ [0] = "A",
+ [1] = "g",
+ [2] = "m",
+ [3] = "imm",
+};
+
+static const char *writemask[16] = {
+ [0x0] = ".",
+ [0x1] = ".x",
+ [0x2] = ".y",
+ [0x3] = ".xy",
+ [0x4] = ".z",
+ [0x5] = ".xz",
+ [0x6] = ".yz",
+ [0x7] = ".xyz",
+ [0x8] = ".w",
+ [0x9] = ".xw",
+ [0xa] = ".yw",
+ [0xb] = ".xyw",
+ [0xc] = ".zw",
+ [0xd] = ".xzw",
+ [0xe] = ".yzw",
+ [0xf] = "",
+};
+
+static const char *end_of_thread[2] = {
+ [0] = "",
+ [1] = "EOT"
+};
+
+static const char *target_function_gen6[16] = {
+ [GEN_SFID_NULL] = "null",
+ [GEN_SFID_MATH] = "math",
+ [GEN_SFID_SAMPLER] = "sampler",
+ [GEN_SFID_MESSAGE_GATEWAY] = "gateway",
+ [GEN_SFID_URB] = "urb",
+ [GEN_SFID_THREAD_SPAWNER] = "thread_spawner",
+ [GEN6_SFID_DATAPORT_SAMPLER_CACHE] = "sampler",
+ [GEN6_SFID_DATAPORT_RENDER_CACHE] = "render",
+ [GEN6_SFID_DATAPORT_CONSTANT_CACHE] = "const",
+ [GEN_SFID_DATAPORT_DATA_CACHE] = "data"
+};
+
+static const char *target_function_gen75[16] = {
+ [GEN_SFID_NULL] = "null",
+ [GEN_SFID_MATH] = "math",
+ [GEN_SFID_SAMPLER] = "sampler",
+ [GEN_SFID_MESSAGE_GATEWAY] = "gateway",
+ [GEN_SFID_URB] = "urb",
+ [GEN_SFID_THREAD_SPAWNER] = "thread_spawner",
+ [GEN6_SFID_DATAPORT_SAMPLER_CACHE] = "sampler",
+ [GEN6_SFID_DATAPORT_RENDER_CACHE] = "render",
+ [GEN6_SFID_DATAPORT_CONSTANT_CACHE] = "const",
+ [GEN_SFID_DATAPORT_DATA_CACHE] = "data (0)",
+ [GEN_SFID_DATAPORT1_DATA_CACHE] = "data (1)"
+};
+
+static const char *gateway_sub_function[8] = {
+ [0] = "open gateway",
+ [1] = "close gateway",
+ [2] = "forward gateway",
+ [3] = "get time stamp",
+ [4] = "barrier",
+ [5] = "update gateway state",
+ [6] = "MMIO R/W",
+ [7] = "reserved"
+};
+
+static const char *math_function[16] = {
+ [GEN_MATH_FUNCTION_INV] = "inv",
+ [GEN_MATH_FUNCTION_LOG] = "log",
+ [GEN_MATH_FUNCTION_EXP] = "exp",
+ [GEN_MATH_FUNCTION_SQRT] = "sqrt",
+ [GEN_MATH_FUNCTION_RSQ] = "rsq",
+ [GEN_MATH_FUNCTION_SIN] = "sin",
+ [GEN_MATH_FUNCTION_COS] = "cos",
+ [GEN_MATH_FUNCTION_FDIV] = "fdiv",
+ [GEN_MATH_FUNCTION_POW] = "pow",
+ [GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
+ [GEN_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv",
+ [GEN_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod",
+};
+
+static const char *math_saturate[2] = {
+ [0] = "",
+ [1] = "sat"
+};
+
+static const char *math_signed[2] = {
+ [0] = "",
+ [1] = "signed"
+};
+
+static const char *math_scalar[2] = {
+ [0] = "",
+ [1] = "scalar"
+};
+
+static const char *math_precision[2] = {
+ [0] = "",
+ [1] = "partial_precision"
+};
+
+static const char *data_port_data_cache_simd_mode[] = {
+ "SIMD4x2",
+ "SIMD16",
+ "SIMD8",
+};
+
+static const char *data_port_data_cache_category[] = {
+ "legacy",
+ "scratch",
+};
+
+static const char *data_port_scratch_block_size[] = {
+ "1 register",
+ "2 registers",
+ "Reserve",
+ "4 registers",
+};
+
+static const char *data_port_scratch_invalidate[] = {
+ "no invalidate",
+ "invalidate cache line",
+};
+
+static const char *data_port_scratch_channel_mode[] = {
+ "Oword",
+ "Dword",
+};
+
+static const char *data_port_scratch_msg_type[] = {
+ "Scratch Read",
+ "Scratch Write",
+};
+
+static const char *data_port_data_cache_msg_type[] = {
+ [0] = "OWord Block Read",
+ [1] = "Unaligned OWord Block Read",
+ [2] = "OWord Dual Block Read",
+ [3] = "DWord Scattered Read",
+ [4] = "Byte Scattered Read",
+ [5] = "Untyped Surface Read",
+ [6] = "Untyped Atomic Operation",
+ [7] = "Memory Fence",
+ [8] = "OWord Block Write",
+ [10] = "OWord Dual Block Write",
+ [11] = "DWord Scattered Write",
+ [12] = "Byte Scattered Write",
+ [13] = "Untyped Surface Write",
+};
+
+static const char *data_port1_data_cache_msg_type[] = {
+ [1] = "Untyped Surface Read",
+ [2] = "Untyped Atomic Operation",
+ [3] = "Untyped Atomic Operation SIMD4x2",
+ [4] = "Media Block Read",
+ [5] = "Typed Surface Read",
+ [6] = "Typed Atomic Operation",
+ [7] = "Typed Atomic Operation SIMD4x2",
+ [9] = "Untyped Surface Write",
+ [10] = "Media Block Write",
+ [11] = "Atomic Counter Operation",
+ [12] = "Atomic Counter Operation 4X2",
+ [13] = "Typed Surface Write",
+};
+
+static int column;
+
+static int string (FILE *file, const char *string)
+{
+ fputs (string, file);
+ column += strlen (string);
+ return 0;
+}
+
+static int format (FILE *f, const char *format, ...)
+{
+ char buf[1024];
+ va_list args;
+ va_start (args, format);
+
+ vsnprintf (buf, sizeof (buf) - 1, format, args);
+ va_end (args);
+ string (f, buf);
+ return 0;
+}
+
+static int newline (FILE *f)
+{
+ putc ('\n', f);
+ column = 0;
+ return 0;
+}
+
+static int pad (FILE *f, int c)
+{
+ do
+ string (f, " ");
+ while (column < c);
+ return 0;
+}
+
+static int flag_reg (FILE *file, const int flag_nr, const int flag_sub_reg_nr)
+{
+ if (flag_nr || flag_sub_reg_nr)
+ return format (file, ".f%d.%d", flag_nr, flag_sub_reg_nr);
+ return 0;
+}
+
+static int control (FILE *file, const char *name, const char *ctrl[], uint32_t id, int *space)
+{
+ if (!ctrl[id]) {
+ fprintf (file, "*** invalid %s value %d ",
+ name, id);
+ return 1;
+ }
+ if (ctrl[id][0])
+ {
+ if (space && *space)
+ string (file, " ");
+ string (file, ctrl[id]);
+ if (space)
+ *space = 1;
+ }
+ return 0;
+}
+
+static int print_opcode (FILE *file, int id)
+{
+ if (!opcode[id].name) {
+ format (file, "*** invalid opcode value %d ", id);
+ return 1;
+ }
+ string (file, opcode[id].name);
+ return 0;
+}
+
+static int reg (FILE *file, uint32_t _reg_file, uint32_t _reg_nr)
+{
+ int err = 0;
+
+ if (_reg_file == GEN_ARCHITECTURE_REGISTER_FILE) {
+ switch (_reg_nr & 0xf0) {
+ case GEN_ARF_NULL:
+ string (file, "null");
+ return -1;
+ case GEN_ARF_ADDRESS:
+ format (file, "a%d", _reg_nr & 0x0f);
+ break;
+ case GEN_ARF_ACCUMULATOR:
+ format (file, "acc%d", _reg_nr & 0x0f);
+ break;
+ case GEN_ARF_FLAG:
+ format (file, "f%d", _reg_nr & 0x0f);
+ break;
+ case GEN_ARF_MASK:
+ format (file, "mask%d", _reg_nr & 0x0f);
+ break;
+ case GEN_ARF_MASK_STACK:
+ format (file, "msd%d", _reg_nr & 0x0f);
+ break;
+ case GEN_ARF_STATE:
+ format (file, "sr%d", _reg_nr & 0x0f);
+ break;
+ case GEN_ARF_CONTROL:
+ format (file, "cr%d", _reg_nr & 0x0f);
+ break;
+ case GEN_ARF_NOTIFICATION_COUNT:
+ format (file, "n%d", _reg_nr & 0x0f);
+ break;
+ case GEN_ARF_IP:
+ string (file, "ip");
+ return -1;
+ break;
+ default:
+ format (file, "ARF%d", _reg_nr);
+ break;
+ }
+ } else {
+ err |= control (file, "src reg file", reg_file, _reg_file, NULL);
+ format (file, "%d", _reg_nr);
+ }
+ return err;
+}
+
+static int dest (FILE *file, const union GenNativeInstruction *inst)
+{
+ int err = 0;
+
+ if (inst->header.access_mode == GEN_ALIGN_1)
+ {
+ if (inst->bits1.da1.dest_address_mode == GEN_ADDRESS_DIRECT)
+ {
+ err |= reg (file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr);
+ if (err == -1) {
+ control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
+ return 0;
+ }
+ if (inst->bits1.da1.dest_subreg_nr)
+ format (file, ".%d", inst->bits1.da1.dest_subreg_nr /
+ reg_type_size[inst->bits1.da1.dest_reg_type]);
+ format (file, "<%s>", horiz_stride[inst->bits1.da1.dest_horiz_stride]);
+ err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
+ }
+ else
+ {
+ string (file, "g[a0");
+ if (inst->bits1.ia1.dest_subreg_nr)
+ format (file, ".%d", inst->bits1.ia1.dest_subreg_nr /
+ reg_type_size[inst->bits1.ia1.dest_reg_type]);
+ if (inst->bits1.ia1.dest_indirect_offset)
+ format (file, " %d", inst->bits1.ia1.dest_indirect_offset);
+ string (file, "]");
+ format (file, "<%s>", horiz_stride[inst->bits1.ia1.dest_horiz_stride]);
+ err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.ia1.dest_reg_type, NULL);
+ }
+ }
+ else
+ {
+ if (inst->bits1.da16.dest_address_mode == GEN_ADDRESS_DIRECT)
+ {
+ err |= reg (file, inst->bits1.da16.dest_reg_file, inst->bits1.da16.dest_reg_nr);
+ if (err == -1)
+ return 0;
+ if (inst->bits1.da16.dest_subreg_nr)
+ format (file, ".%d", inst->bits1.da16.dest_subreg_nr /
+ reg_type_size[inst->bits1.da16.dest_reg_type]);
+ string (file, "<1>");
+ err |= control (file, "writemask", writemask, inst->bits1.da16.dest_writemask, NULL);
+ err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da16.dest_reg_type, NULL);
+ }
+ else
+ {
+ err = 1;
+ string (file, "Indirect align16 address mode not supported");
+ }
+ }
+
+ return 0;
+}
+
+static int dest_3src (FILE *file, const union GenNativeInstruction *inst)
+{
+ int err = 0;
+ const uint32_t reg_file = GEN_GENERAL_REGISTER_FILE;
+
+ err |= reg (file, reg_file, inst->bits1.da3src.dest_reg_nr);
+ if (err == -1)
+ return 0;
+ if (inst->bits1.da3src.dest_subreg_nr)
+ format (file, ".%d", inst->bits1.da3src.dest_subreg_nr);
+ string (file, "<1>");
+ err |= control (file, "writemask", writemask, inst->bits1.da3src.dest_writemask, NULL);
+ err |= control (file, "dest reg encoding", reg_encoding, GEN_TYPE_F, NULL);
+
+ return 0;
+}
+
+static int src_align1_region (FILE *file,
+ uint32_t _vert_stride, uint32_t _width, uint32_t _horiz_stride)
+{
+ int err = 0;
+ string (file, "<");
+ err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+ string (file, ",");
+ err |= control (file, "width", width, _width, NULL);
+ string (file, ",");
+ err |= control (file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
+ string (file, ">");
+ return err;
+}
+
+static int src_da1 (FILE *file, uint32_t type, uint32_t _reg_file,
+ uint32_t _vert_stride, uint32_t _width, uint32_t _horiz_stride,
+ uint32_t reg_num, uint32_t sub_reg_num, uint32_t __abs, uint32_t _negate)
+{
+ int err = 0;
+ err |= control (file, "negate", negate, _negate, NULL);
+ err |= control (file, "abs", _abs, __abs, NULL);
+
+ err |= reg (file, _reg_file, reg_num);
+ if (err == -1)
+ return 0;
+ if (sub_reg_num)
+ format (file, ".%d", sub_reg_num / reg_type_size[type]); /* use formal style like spec */
+ src_align1_region (file, _vert_stride, _width, _horiz_stride);
+ err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+ return err;
+}
+
+static int src_ia1 (FILE *file,
+ uint32_t type,
+ uint32_t _reg_file,
+ int32_t _addr_imm,
+ uint32_t _addr_subreg_nr,
+ uint32_t _negate,
+ uint32_t __abs,
+ uint32_t _addr_mode,
+ uint32_t _horiz_stride,
+ uint32_t _width,
+ uint32_t _vert_stride)
+{
+ int err = 0;
+ err |= control (file, "negate", negate, _negate, NULL);
+ err |= control (file, "abs", _abs, __abs, NULL);
+
+ string (file, "g[a0");
+ if (_addr_subreg_nr)
+ format (file, ".%d", _addr_subreg_nr);
+ if (_addr_imm)
+ format (file, " %d", _addr_imm);
+ string (file, "]");
+ src_align1_region (file, _vert_stride, _width, _horiz_stride);
+ err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+ return err;
+}
+
+static int src_da16 (FILE *file,
+ uint32_t _reg_type,
+ uint32_t _reg_file,
+ uint32_t _vert_stride,
+ uint32_t _reg_nr,
+ uint32_t _subreg_nr,
+ uint32_t __abs,
+ uint32_t _negate,
+ uint32_t swz_x,
+ uint32_t swz_y,
+ uint32_t swz_z,
+ uint32_t swz_w)
+{
+ int err = 0;
+ err |= control (file, "negate", negate, _negate, NULL);
+ err |= control (file, "abs", _abs, __abs, NULL);
+
+ err |= reg (file, _reg_file, _reg_nr);
+ if (err == -1)
+ return 0;
+ if (_subreg_nr)
+ /* bit4 for subreg number byte addressing. Make this same meaning as
+ in da1 case, so output looks consistent. */
+ format (file, ".%d", 16 / reg_type_size[_reg_type]);
+ string (file, "<");
+ err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+ string (file, ",4,1>");
+ /*
+ * Three kinds of swizzle display:
+ * identity - nothing printed
+ * 1->all - print the single channel
+ * 1->1 - print the mapping
+ */
+ if (swz_x == GEN_CHANNEL_X &&
+ swz_y == GEN_CHANNEL_Y &&
+ swz_z == GEN_CHANNEL_Z &&
+ swz_w == GEN_CHANNEL_W)
+ {
+ ;
+ }
+ else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
+ {
+ string (file, ".");
+ err |= control (file, "channel select", chan_sel, swz_x, NULL);
+ }
+ else
+ {
+ string (file, ".");
+ err |= control (file, "channel select", chan_sel, swz_x, NULL);
+ err |= control (file, "channel select", chan_sel, swz_y, NULL);
+ err |= control (file, "channel select", chan_sel, swz_z, NULL);
+ err |= control (file, "channel select", chan_sel, swz_w, NULL);
+ }
+ err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL);
+ return err;
+}
+
+static int src0_3src (FILE *file, const union GenNativeInstruction *inst)
+{
+ int err = 0;
+ uint32_t swz_x = (inst->bits2.da3src.src0_swizzle >> 0) & 0x3;
+ uint32_t swz_y = (inst->bits2.da3src.src0_swizzle >> 2) & 0x3;
+ uint32_t swz_z = (inst->bits2.da3src.src0_swizzle >> 4) & 0x3;
+ uint32_t swz_w = (inst->bits2.da3src.src0_swizzle >> 6) & 0x3;
+
+ err |= control (file, "negate", negate, inst->bits1.da3src.src0_negate, NULL);
+ err |= control (file, "abs", _abs, inst->bits1.da3src.src0_abs, NULL);
+
+ err |= reg (file, GEN_GENERAL_REGISTER_FILE, inst->bits2.da3src.src0_reg_nr);
+ if (err == -1)
+ return 0;
+ if (inst->bits2.da3src.src0_subreg_nr)
+ format (file, ".%d", inst->bits2.da3src.src0_subreg_nr);
+ string (file, "<4,1,1>");
+ err |= control (file, "src da16 reg type", reg_encoding,
+ GEN_TYPE_F, NULL);
+ /*
+ * Three kinds of swizzle display:
+ * identity - nothing printed
+ * 1->all - print the single channel
+ * 1->1 - print the mapping
+ */
+ if (swz_x == GEN_CHANNEL_X &&
+ swz_y == GEN_CHANNEL_Y &&
+ swz_z == GEN_CHANNEL_Z &&
+ swz_w == GEN_CHANNEL_W)
+ {
+ ;
+ }
+ else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
+ {
+ string (file, ".");
+ err |= control (file, "channel select", chan_sel, swz_x, NULL);
+ }
+ else
+ {
+ string (file, ".");
+ err |= control (file, "channel select", chan_sel, swz_x, NULL);
+ err |= control (file, "channel select", chan_sel, swz_y, NULL);
+ err |= control (file, "channel select", chan_sel, swz_z, NULL);
+ err |= control (file, "channel select", chan_sel, swz_w, NULL);
+ }
+ return err;
+}
+
+static int src1_3src (FILE *file, const union GenNativeInstruction *inst)
+{
+ int err = 0;
+ uint32_t swz_x = (inst->bits2.da3src.src1_swizzle >> 0) & 0x3;
+ uint32_t swz_y = (inst->bits2.da3src.src1_swizzle >> 2) & 0x3;
+ uint32_t swz_z = (inst->bits2.da3src.src1_swizzle >> 4) & 0x3;
+ uint32_t swz_w = (inst->bits2.da3src.src1_swizzle >> 6) & 0x3;
+ uint32_t src1_subreg_nr = (inst->bits2.da3src.src1_subreg_nr_low |
+ (inst->bits3.da3src.src1_subreg_nr_high << 2));
+
+ err |= control (file, "negate", negate, inst->bits1.da3src.src1_negate,
+ NULL);
+ err |= control (file, "abs", _abs, inst->bits1.da3src.src1_abs, NULL);
+
+ err |= reg (file, GEN_GENERAL_REGISTER_FILE,
+ inst->bits3.da3src.src1_reg_nr);
+ if (err == -1)
+ return 0;
+ if (src1_subreg_nr)
+ format (file, ".%d", src1_subreg_nr);
+ string (file, "<4,1,1>");
+ err |= control (file, "src da16 reg type", reg_encoding,
+ GEN_TYPE_F, NULL);
+ /*
+ * Three kinds of swizzle display:
+ * identity - nothing printed
+ * 1->all - print the single channel
+ * 1->1 - print the mapping
+ */
+ if (swz_x == GEN_CHANNEL_X &&
+ swz_y == GEN_CHANNEL_Y &&
+ swz_z == GEN_CHANNEL_Z &&
+ swz_w == GEN_CHANNEL_W)
+ {
+ ;
+ }
+ else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
+ {
+ string (file, ".");
+ err |= control (file, "channel select", chan_sel, swz_x, NULL);
+ }
+ else
+ {
+ string (file, ".");
+ err |= control (file, "channel select", chan_sel, swz_x, NULL);
+ err |= control (file, "channel select", chan_sel, swz_y, NULL);
+ err |= control (file, "channel select", chan_sel, swz_z, NULL);
+ err |= control (file, "channel select", chan_sel, swz_w, NULL);
+ }
+ return err;
+}
+
+
+static int src2_3src (FILE *file, const union GenNativeInstruction *inst)
+{
+ int err = 0;
+ uint32_t swz_x = (inst->bits3.da3src.src2_swizzle >> 0) & 0x3;
+ uint32_t swz_y = (inst->bits3.da3src.src2_swizzle >> 2) & 0x3;
+ uint32_t swz_z = (inst->bits3.da3src.src2_swizzle >> 4) & 0x3;
+ uint32_t swz_w = (inst->bits3.da3src.src2_swizzle >> 6) & 0x3;
+
+ err |= control (file, "negate", negate, inst->bits1.da3src.src2_negate,
+ NULL);
+ err |= control (file, "abs", _abs, inst->bits1.da3src.src2_abs, NULL);
+
+ err |= reg (file, GEN_GENERAL_REGISTER_FILE,
+ inst->bits3.da3src.src2_reg_nr);
+ if (err == -1)
+ return 0;
+ if (inst->bits3.da3src.src2_subreg_nr)
+ format (file, ".%d", inst->bits3.da3src.src2_subreg_nr);
+ string (file, "<4,1,1>");
+ err |= control (file, "src da16 reg type", reg_encoding,
+ GEN_TYPE_F, NULL);
+ /*
+ * Three kinds of swizzle display:
+ * identity - nothing printed
+ * 1->all - print the single channel
+ * 1->1 - print the mapping
+ */
+ if (swz_x == GEN_CHANNEL_X &&
+ swz_y == GEN_CHANNEL_Y &&
+ swz_z == GEN_CHANNEL_Z &&
+ swz_w == GEN_CHANNEL_W)
+ {
+ ;
+ }
+ else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
+ {
+ string (file, ".");
+ err |= control (file, "channel select", chan_sel, swz_x, NULL);
+ }
+ else
+ {
+ string (file, ".");
+ err |= control (file, "channel select", chan_sel, swz_x, NULL);
+ err |= control (file, "channel select", chan_sel, swz_y, NULL);
+ err |= control (file, "channel select", chan_sel, swz_z, NULL);
+ err |= control (file, "channel select", chan_sel, swz_w, NULL);
+ }
+ return err;
+}
+
+static int imm (FILE *file, uint32_t type, const union GenNativeInstruction *inst) {
+ switch (type) {
+ case GEN_TYPE_UD:
+ format (file, "0x%xUD", inst->bits3.ud);
+ break;
+ case GEN_TYPE_D:
+ format (file, "%dD", inst->bits3.d);
+ break;
+ case GEN_TYPE_UW:
+ format (file, "0x%xUW", (uint16_t) inst->bits3.ud);
+ break;
+ case GEN_TYPE_W:
+ format (file, "%dW", (int16_t) inst->bits3.d);
+ break;
+ case GEN_TYPE_UB:
+ format (file, "0x%xUB", (int8_t) inst->bits3.ud);
+ break;
+ case GEN_TYPE_VF:
+ format (file, "Vector Float");
+ break;
+ case GEN_TYPE_V:
+ format (file, "0x%xV", inst->bits3.ud);
+ break;
+ case GEN_TYPE_F:
+ format (file, "%-gF", inst->bits3.f);
+ }
+ return 0;
+}
+
+static int src0 (FILE *file, const union GenNativeInstruction *inst)
+{
+ if (inst->bits1.da1.src0_reg_file == GEN_IMMEDIATE_VALUE)
+ return imm (file, inst->bits1.da1.src0_reg_type,
+ inst);
+ else if (inst->header.access_mode == GEN_ALIGN_1)
+ {
+ if (inst->bits2.da1.src0_address_mode == GEN_ADDRESS_DIRECT)
+ {
+ return src_da1 (file,
+ inst->bits1.da1.src0_reg_type,
+ inst->bits1.da1.src0_reg_file,
+ inst->bits2.da1.src0_vert_stride,
+ inst->bits2.da1.src0_width,
+ inst->bits2.da1.src0_horiz_stride,
+ inst->bits2.da1.src0_reg_nr,
+ inst->bits2.da1.src0_subreg_nr,
+ inst->bits2.da1.src0_abs,
+ inst->bits2.da1.src0_negate);
+ }
+ else
+ {
+ return src_ia1 (file,
+ inst->bits1.ia1.src0_reg_type,
+ inst->bits1.ia1.src0_reg_file,
+ inst->bits2.ia1.src0_indirect_offset,
+ inst->bits2.ia1.src0_subreg_nr,
+ inst->bits2.ia1.src0_negate,
+ inst->bits2.ia1.src0_abs,
+ inst->bits2.ia1.src0_address_mode,
+ inst->bits2.ia1.src0_horiz_stride,
+ inst->bits2.ia1.src0_width,
+ inst->bits2.ia1.src0_vert_stride);
+ }
+ }
+ else
+ {
+ if (inst->bits2.da16.src0_address_mode == GEN_ADDRESS_DIRECT)
+ {
+ return src_da16 (file,
+ inst->bits1.da16.src0_reg_type,
+ inst->bits1.da16.src0_reg_file,
+ inst->bits2.da16.src0_vert_stride,
+ inst->bits2.da16.src0_reg_nr,
+ inst->bits2.da16.src0_subreg_nr,
+ inst->bits2.da16.src0_abs,
+ inst->bits2.da16.src0_negate,
+ inst->bits2.da16.src0_swz_x,
+ inst->bits2.da16.src0_swz_y,
+ inst->bits2.da16.src0_swz_z,
+ inst->bits2.da16.src0_swz_w);
+ }
+ else
+ {
+ string (file, "Indirect align16 address mode not supported");
+ return 1;
+ }
+ }
+}
+
+static int src1 (FILE *file, const union GenNativeInstruction *inst)
+{
+ if (inst->bits1.da1.src1_reg_file == GEN_IMMEDIATE_VALUE)
+ return imm (file, inst->bits1.da1.src1_reg_type,
+ inst);
+ else if (inst->header.access_mode == GEN_ALIGN_1)
+ {
+ if (inst->bits3.da1.src1_address_mode == GEN_ADDRESS_DIRECT)
+ {
+ return src_da1 (file,
+ inst->bits1.da1.src1_reg_type,
+ inst->bits1.da1.src1_reg_file,
+ inst->bits3.da1.src1_vert_stride,
+ inst->bits3.da1.src1_width,
+ inst->bits3.da1.src1_horiz_stride,
+ inst->bits3.da1.src1_reg_nr,
+ inst->bits3.da1.src1_subreg_nr,
+ inst->bits3.da1.src1_abs,
+ inst->bits3.da1.src1_negate);
+ }
+ else
+ {
+ return src_ia1 (file,
+ inst->bits1.ia1.src1_reg_type,
+ inst->bits1.ia1.src1_reg_file,
+ inst->bits3.ia1.src1_indirect_offset,
+ inst->bits3.ia1.src1_subreg_nr,
+ inst->bits3.ia1.src1_negate,
+ inst->bits3.ia1.src1_abs,
+ inst->bits3.ia1.src1_address_mode,
+ inst->bits3.ia1.src1_horiz_stride,
+ inst->bits3.ia1.src1_width,
+ inst->bits3.ia1.src1_vert_stride);
+ }
+ }
+ else
+ {
+ if (inst->bits3.da16.src1_address_mode == GEN_ADDRESS_DIRECT)
+ {
+ return src_da16 (file,
+ inst->bits1.da16.src1_reg_type,
+ inst->bits1.da16.src1_reg_file,
+ inst->bits3.da16.src1_vert_stride,
+ inst->bits3.da16.src1_reg_nr,
+ inst->bits3.da16.src1_subreg_nr,
+ inst->bits3.da16.src1_abs,
+ inst->bits3.da16.src1_negate,
+ inst->bits3.da16.src1_swz_x,
+ inst->bits3.da16.src1_swz_y,
+ inst->bits3.da16.src1_swz_z,
+ inst->bits3.da16.src1_swz_w);
+ }
+ else
+ {
+ string (file, "Indirect align16 address mode not supported");
+ return 1;
+ }
+ }
+}
+
+static const int esize[6] = {
+ [0] = 1,
+ [1] = 2,
+ [2] = 4,
+ [3] = 8,
+ [4] = 16,
+ [5] = 32,
+};
+
+static int qtr_ctrl(FILE *file, const union GenNativeInstruction *inst)
+{
+ int qtr_ctl = inst->header.quarter_control;
+ int exec_size = esize[inst->header.execution_size];
+
+ if (exec_size == 8) {
+ switch (qtr_ctl) {
+ case 0:
+ string (file, " 1Q");
+ break;
+ case 1:
+ string (file, " 2Q");
+ break;
+ case 2:
+ string (file, " 3Q");
+ break;
+ case 3:
+ string (file, " 4Q");
+ break;
+ }
+ } else if (exec_size == 16){
+ if (qtr_ctl < 2)
+ string (file, " 1H");
+ else
+ string (file, " 2H");
+ }
+ return 0;
+}
+
+int gen_disasm (FILE *file, const void *opaque_insn, uint32_t deviceID, uint32_t compacted)
+{
+ const union GenNativeInstruction *inst = (const union GenNativeInstruction *) opaque_insn;
+ int err = 0;
+ int space = 0;
+ int gen = 70;
+ if (IS_IVYBRIDGE(deviceID)) {
+ gen = 70;
+ } else if (IS_HASWELL(deviceID)) {
+ gen = 75;
+ }
+
+ if (inst->header.predicate_control) {
+ string (file, "(");
+ err |= control (file, "predicate inverse", pred_inv, inst->header.predicate_inverse, NULL);
+ format (file, "f%d", inst->bits2.da1.flag_reg_nr);
+ if (inst->bits2.da1.flag_sub_reg_nr)
+ format (file, ".%d", inst->bits2.da1.flag_sub_reg_nr);
+ if (inst->header.access_mode == GEN_ALIGN_1)
+ err |= control (file, "predicate control align1", pred_ctrl_align1,
+ inst->header.predicate_control, NULL);
+ else
+ err |= control (file, "predicate control align16", pred_ctrl_align16,
+ inst->header.predicate_control, NULL);
+ string (file, ") ");
+ }
+
+ err |= print_opcode (file, inst->header.opcode);
+ err |= control (file, "saturate", saturate, inst->header.saturate, NULL);
+ err |= control (file, "debug control", debug_ctrl, inst->header.debug_control, NULL);
+
+ if (inst->header.opcode == GEN_OPCODE_MATH) {
+ string (file, " ");
+ err |= control (file, "function", math_function,
+ inst->header.destreg_or_condmod, NULL);
+ } else if (inst->header.opcode != GEN_OPCODE_SEND &&
+ inst->header.opcode != GEN_OPCODE_SENDC) {
+ err |= control (file, "conditional modifier", conditional_modifier,
+ inst->header.destreg_or_condmod, NULL);
+ if (inst->header.destreg_or_condmod)
+ err |= flag_reg (file,
+ inst->bits2.da1.flag_reg_nr,
+ inst->bits2.da1.flag_sub_reg_nr);
+ }
+
+ if (inst->header.opcode != GEN_OPCODE_NOP) {
+ string (file, "(");
+ err |= control (file, "execution size", exec_size, inst->header.execution_size, NULL);
+ string (file, ")");
+ }
+
+ if (inst->header.opcode == GEN_OPCODE_SEND && gen < 60)
+ format (file, " %d", inst->header.destreg_or_condmod);
+
+ if (opcode[inst->header.opcode].nsrc == 3) {
+ pad (file, 16);
+ err |= dest_3src (file, inst);
+
+ pad (file, 32);
+ err |= src0_3src (file, inst);
+
+ pad (file, 48);
+ err |= src1_3src (file, inst);
+
+ pad (file, 64);
+ err |= src2_3src (file, inst);
+ } else {
+ if (opcode[inst->header.opcode].ndst > 0) {
+ pad (file, 16);
+ err |= dest (file, inst);
+ } else if (gen >= 60 && (inst->header.opcode == GEN_OPCODE_IF ||
+ inst->header.opcode == GEN_OPCODE_ELSE ||
+ inst->header.opcode == GEN_OPCODE_ENDIF ||
+ inst->header.opcode == GEN_OPCODE_WHILE ||
+ inst->header.opcode == GEN_OPCODE_BRD ||
+ inst->header.opcode == GEN_OPCODE_JMPI)) {
+ format(file, " %d", (int16_t)inst->bits3.gen7_branch.jip);
+ } else if (gen >= 60 && (inst->header.opcode == GEN_OPCODE_BREAK ||
+ inst->header.opcode == GEN_OPCODE_CONTINUE ||
+ inst->header.opcode == GEN_OPCODE_HALT ||
+ inst->header.opcode == GEN_OPCODE_BRC)) {
+ format (file, " %d %d", inst->bits3.gen7_branch.jip, inst->bits3.gen7_branch.uip);
+ }/* else if (inst->header.opcode == GEN_OPCODE_JMPI) {
+ format (file, " %d", inst->bits3.d);
+ }*/
+
+ if (opcode[inst->header.opcode].nsrc > 0) {
+ pad (file, 32);
+ err |= src0 (file, inst);
+ }
+ if (opcode[inst->header.opcode].nsrc > 1) {
+ pad (file, 48);
+ err |= src1 (file, inst);
+ }
+ }
+
+ if (inst->header.opcode == GEN_OPCODE_SEND ||
+ inst->header.opcode == GEN_OPCODE_SENDC) {
+ enum GenMessageTarget target = inst->header.destreg_or_condmod;
+
+ newline (file);
+ pad (file, 16);
+ space = 0;
+
+ if(gen == 75) {
+ err |= control (file, "target function", target_function_gen75,
+ target, &space);
+ } else {
+ err |= control (file, "target function", target_function_gen6,
+ target, &space);
+ }
+
+ switch (target) {
+ case GEN_SFID_MATH:
+ err |= control (file, "math function", math_function,
+ inst->bits3.math_gen5.function, &space);
+ err |= control (file, "math saturate", math_saturate,
+ inst->bits3.math_gen5.saturate, &space);
+ err |= control (file, "math signed", math_signed,
+ inst->bits3.math_gen5.int_type, &space);
+ err |= control (file, "math scalar", math_scalar,
+ inst->bits3.math_gen5.data_type, &space);
+ err |= control (file, "math precision", math_precision,
+ inst->bits3.math_gen5.precision, &space);
+ break;
+ case GEN_SFID_SAMPLER:
+ format (file, " (%d, %d, %d, %d)",
+ inst->bits3.sampler_gen7.bti,
+ inst->bits3.sampler_gen7.sampler,
+ inst->bits3.sampler_gen7.msg_type,
+ inst->bits3.sampler_gen7.simd_mode);
+ break;
+ case GEN_SFID_DATAPORT_DATA_CACHE:
+ if(inst->bits3.gen7_untyped_rw.category == 0) {
+ format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
+ inst->bits3.gen7_untyped_rw.bti,
+ inst->bits3.gen7_untyped_rw.rgba,
+ data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
+ data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
+ data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+ } else {
+ format (file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
+ inst->bits3.gen7_scratch_rw.offset,
+ data_port_scratch_block_size[inst->bits3.gen7_scratch_rw.block_size],
+ data_port_scratch_invalidate[inst->bits3.gen7_scratch_rw.invalidate_after_read],
+ data_port_scratch_channel_mode[inst->bits3.gen7_scratch_rw.channel_mode],
+ data_port_scratch_msg_type[inst->bits3.gen7_scratch_rw.msg_type]);
+ }
+ break;
+ case GEN_SFID_DATAPORT1_DATA_CACHE:
+ format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
+ inst->bits3.gen7_untyped_rw.bti,
+ inst->bits3.gen7_untyped_rw.rgba,
+ data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
+ data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
+ data_port1_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+ break;
+ case GEN6_SFID_DATAPORT_CONSTANT_CACHE:
+ format (file, " (bti: %d, %s)",
+ inst->bits3.gen7_dword_rw.bti,
+ data_port_data_cache_msg_type[inst->bits3.gen7_dword_rw.msg_type]);
+ break;
+ case GEN_SFID_MESSAGE_GATEWAY:
+ format (file, " (subfunc: %s, notify: %d, ackreq: %d)",
+ gateway_sub_function[inst->bits3.gen7_msg_gw.subfunc],
+ inst->bits3.gen7_msg_gw.notify,
+ inst->bits3.gen7_msg_gw.ackreq);
+ break;
+
+ default:
+ format (file, "unsupported target %d", target);
+ break;
+ }
+ if (space)
+ string (file, " ");
+ format (file, "mlen %d", inst->bits3.generic_gen5.msg_length);
+ format (file, " rlen %d", inst->bits3.generic_gen5.response_length);
+ }
+ pad (file, 64);
+ if (inst->header.opcode != GEN_OPCODE_NOP) {
+ string (file, "{");
+ space = 1;
+ err |= control(file, "access mode", access_mode, inst->header.access_mode, &space);
+ if (gen >= 60)
+ err |= control (file, "write enable control", wectrl, inst->header.mask_control, &space);
+ else
+ err |= control (file, "mask control", mask_ctrl, inst->header.mask_control, &space);
+ err |= control (file, "dependency control", dep_ctrl, inst->header.dependency_control, &space);
+
+ err |= qtr_ctrl (file, inst);
+ err |= control (file, "thread control", thread_ctrl, inst->header.thread_control, &space);
+ if (gen >= 60)
+ err |= control (file, "acc write control", accwr, inst->header.acc_wr_control, &space);
+ if (inst->header.opcode == GEN_OPCODE_SEND ||
+ inst->header.opcode == GEN_OPCODE_SENDC)
+ err |= control (file, "end of thread", end_of_thread,
+ inst->bits3.generic_gen5.end_of_thread, &space);
+
+ if(compacted) {
+ string(file, " Compacted");
+ }
+ if (space)
+ string (file, " ");
+ string (file, "}");
+ }
+ string (file, ";");
+ newline (file);
+ return err;
+}
+
diff --git a/backend/src/backend/gen/gen_mesa_disasm.h b/backend/src/backend/gen/gen_mesa_disasm.h
new file mode 100644
index 0000000..ae007a4
--- /dev/null
+++ b/backend/src/backend/gen/gen_mesa_disasm.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_mesa_disasm.h
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * To decode and print one Gen ISA instruction. The code is directly taken
+ * from Mesa
+ */
+
+#ifndef __GBE_GEN_MESA_DISASM_H__
+#define __GBE_GEN_MESA_DISASM_H__
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+extern int gen_disasm(FILE *file, const void *opaque_insn, uint32_t deviceID, uint32_t compacted);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* __GBE_GEN_MESA_DISASM_H__ */
+
+
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
new file mode 100644
index 0000000..da0db85
--- /dev/null
+++ b/backend/src/backend/gen75_context.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen75_context.cpp
+ */
+
+#include "backend/gen75_context.hpp"
+#include "backend/gen75_encoder.hpp"
+#include "backend/gen_program.hpp"
+#include "backend/gen_defs.hpp"
+#include "backend/gen_encoder.hpp"
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_insn_scheduling.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "sys/cvar.hpp"
+#include "ir/function.hpp"
+#include "ir/value.hpp"
+#include <cstring>
+
+namespace gbe
+{
+ void Gen75Context::emitSLMOffset(void) {
+ if(kernel->getUseSLM() == false)
+ return;
+
+ const GenRegister slm_offset = ra->genReg(GenRegister::ud1grf(ir::ocl::slmoffset));
+ const GenRegister slm_index = GenRegister::ud1grf(0, 0);
+ //the slm index is hold in r0.0 24-27 bit, in 4K unit, shift left 12 to get byte unit
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->SHR(slm_offset, slm_index, GenRegister::immud(12));
+ p->pop();
+ }
+
+ void Gen75Context::allocSLMOffsetCurbe(void) {
+ if(fn.getUseSLM())
+ allocCurbeReg(ir::ocl::slmoffset, GBE_CURBE_SLM_OFFSET);
+ }
+
+ uint32_t Gen75Context::alignScratchSize(uint32_t size){
+ if(size == 0)
+ return 0;
+ uint32_t i = 2048;
+ while(i < size) i *= 2;
+ return i;
+ }
+
+ void Gen75Context::emitStackPointer(void) {
+ using namespace ir;
+
+ // Only emit stack pointer computation if we use a stack
+ if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+ return;
+
+ // Check that everything is consistent in the kernel code
+ const uint32_t perLaneSize = kernel->getStackSize();
+ const uint32_t perThreadSize = perLaneSize * this->simdWidth;
+ GBE_ASSERT(perLaneSize > 0);
+ GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
+ GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
+
+ // Use shifts rather than muls which are limited to 32x16 bit sources
+ const uint32_t perLaneShift = logi2(perLaneSize);
+ const uint32_t perThreadShift = logi2(perThreadSize);
+ const GenRegister selStatckPtr = this->simdWidth == 8 ?
+ GenRegister::ud8grf(ir::ocl::stackptr) :
+ GenRegister::ud16grf(ir::ocl::stackptr);
+ const GenRegister stackptr = ra->genReg(selStatckPtr);
+ const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer);
+ const GenRegister bufferptr = ra->genReg(selStackBuffer);
+
+ // We compute the per-lane stack pointer here
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
+ p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
+ p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
+ p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
+ p->curr.execWidth = this->simdWidth;
+ p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
+ p->curr.execWidth = 1;
+ p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
+ p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
+ p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
+ p->curr.execWidth = this->simdWidth;
+ p->ADD(stackptr, stackptr, bufferptr);
+ p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
+ p->pop();
+ }
+
+ void Gen75Context::newSelection(void) {
+ this->sel = GBE_NEW(Selection75, *this);
+ }
+}
diff --git a/backend/src/backend/gen75_context.hpp b/backend/src/backend/gen75_context.hpp
new file mode 100644
index 0000000..6f62b02
--- /dev/null
+++ b/backend/src/backend/gen75_context.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen75_context.hpp
+ */
+#ifndef __GBE_GEN75_CONTEXT_HPP__
+#define __GBE_GEN75_CONTEXT_HPP__
+
+#include "backend/gen_context.hpp"
+#include "backend/gen75_encoder.hpp"
+
+namespace gbe
+{
+ /* This class is used to implement the HSW
+ specific logic for context. */
+ class Gen75Context : public GenContext
+ {
+ public:
+ virtual ~Gen75Context(void) { }
+ Gen75Context(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+ : GenContext(unit, name, deviceID, relaxMath) {
+ };
+ /*! device's max srcatch buffer size */
+ #define GEN75_SCRATCH_SIZE (2 * KB * KB)
+ /*! Emit the per-lane stack pointer computation */
+ virtual void emitStackPointer(void);
+ /*! Align the scratch size to the device's scratch unit size */
+ virtual uint32_t alignScratchSize(uint32_t size);
+ /*! Get the device's max srcatch size */
+ virtual uint32_t getScratchSize(void) {
+ //Because the allocate is use uint16_t, so clamp it, need refine
+ return std::min(GEN75_SCRATCH_SIZE, 0x7fff);
+ }
+
+ protected:
+ virtual GenEncoder* generateEncoder(void) {
+ return GBE_NEW(Gen75Encoder, this->simdWidth, 75, deviceID);
+ }
+
+ private:
+ virtual void emitSLMOffset(void);
+ virtual void allocSLMOffsetCurbe(void);
+ virtual void newSelection(void);
+ };
+}
+#endif /* __GBE_GEN75_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
new file mode 100644
index 0000000..69d2de0
--- /dev/null
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -0,0 +1,269 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+
+#include "backend/gen75_encoder.hpp"
+
+static const uint32_t untypedRWMask[] = {
+ GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN|GEN_UNTYPED_RED,
+ GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN,
+ GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE,
+ GEN_UNTYPED_ALPHA,
+ 0
+};
+
+namespace gbe
+{
+ void Gen75Encoder::setHeader(GenNativeInstruction *insn) {
+ if (this->curr.execWidth == 8)
+ insn->header.execution_size = GEN_WIDTH_8;
+ else if (this->curr.execWidth == 16)
+ insn->header.execution_size = GEN_WIDTH_16;
+ else if (this->curr.execWidth == 1)
+ insn->header.execution_size = GEN_WIDTH_1;
+ else if (this->curr.execWidth == 4)
+ insn->header.execution_size = GEN_WIDTH_4;
+ else
+ NOT_IMPLEMENTED;
+ insn->header.acc_wr_control = this->curr.accWrEnable;
+ insn->header.quarter_control = this->curr.quarterControl;
+ insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
+ insn->header.mask_control = this->curr.noMask;
+ insn->bits2.ia1.flag_reg_nr = this->curr.flag;
+ insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+ if (this->curr.predicate != GEN_PREDICATE_NONE) {
+ insn->header.predicate_control = this->curr.predicate;
+ insn->header.predicate_inverse = this->curr.inversePredicate;
+ }
+ insn->header.saturate = this->curr.saturate;
+ }
+
+ void Gen75Encoder::setDPUntypedRW(GenNativeInstruction *insn,
+ uint32_t bti,
+ uint32_t rgba,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
+ {
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA_CACHE;
+ setMessageDescriptor(insn, sfid, msg_length, response_length);
+ insn->bits3.gen7_untyped_rw.msg_type = msg_type;
+ insn->bits3.gen7_untyped_rw.bti = bti;
+ insn->bits3.gen7_untyped_rw.rgba = rgba;
+ if (curr.execWidth == 8)
+ insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD8;
+ else if (curr.execWidth == 16)
+ insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD16;
+ else
+ NOT_SUPPORTED;
+ }
+
+ void Gen75Encoder::setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+ unsigned char msg_type, uint32_t msg_length, bool header_present)
+ {
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA_CACHE;
+ setMessageDescriptor(insn, sfid, msg_length, 0, header_present);
+ insn->bits3.gen7_typed_rw.bti = bti;
+ insn->bits3.gen7_typed_rw.msg_type = msg_type;
+
+ /* Always using the low 8 slots here. */
+ insn->bits3.gen7_typed_rw.slot = 1;
+ }
+
+ void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+
+ if (this->curr.execWidth == 8) {
+ msg_length = srcNum;
+ response_length = 1;
+ } else if (this->curr.execWidth == 16) {
+ msg_length = 2 * srcNum;
+ response_length = 2;
+ } else
+ NOT_IMPLEMENTED;
+
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA_CACHE;
+ setMessageDescriptor(insn, sfid, msg_length, response_length);
+ insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
+ insn->bits3.gen7_atomic_op.bti = bti;
+ insn->bits3.gen7_atomic_op.return_data = 1;
+ insn->bits3.gen7_atomic_op.aop_type = function;
+
+ if (this->curr.execWidth == 8)
+ insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD8;
+ else if (this->curr.execWidth == 16)
+ insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
+ else
+ NOT_SUPPORTED;
+ }
+
+ void Gen75Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ assert(elemNum >= 1 || elemNum <= 4);
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ if (this->curr.execWidth == 8) {
+ msg_length = 1;
+ response_length = elemNum;
+ } else if (this->curr.execWidth == 16) {
+ msg_length = 2;
+ response_length = 2 * elemNum;
+ } else
+ NOT_IMPLEMENTED;
+
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setDPUntypedRW(insn,
+ bti,
+ untypedRWMask[elemNum],
+ GEN75_P1_UNTYPED_READ,
+ msg_length,
+ response_length);
+ }
+
+ void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ assert(elemNum >= 1 || elemNum <= 4);
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ this->setHeader(insn);
+ if (this->curr.execWidth == 8) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ msg_length = 1 + elemNum;
+ } else if (this->curr.execWidth == 16) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+ msg_length = 2 * (1 + elemNum);
+ }
+ else
+ NOT_IMPLEMENTED;
+ this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setDPUntypedRW(insn,
+ bti,
+ untypedRWMask[elemNum],
+ GEN75_P1_UNTYPED_SURFACE_WRITE,
+ msg_length,
+ response_length);
+ }
+
+ void Gen75Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
+ union { double d; unsigned u[2]; } u;
+ u.d = value;
+ GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
+ push();
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
+ curr.execWidth = 1;
+ MOV(r, GenRegister::immud(u.u[0]));
+ MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[1]));
+ pop();
+ r.type = GEN_TYPE_DF;
+ r.vstride = GEN_VERTICAL_STRIDE_0;
+ r.width = GEN_WIDTH_1;
+ r.hstride = GEN_HORIZONTAL_STRIDE_0;
+ push();
+ uint32_t width = curr.execWidth;
+ curr.execWidth = 8;
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
+ curr.quarterControl = GEN_COMPRESSION_Q1;
+ MOV(dest, r);
+ if (width == 16) {
+ curr.quarterControl = GEN_COMPRESSION_Q2;
+ MOV(GenRegister::offset(dest, 2), r);
+ }
+ pop();
+ }
+
+ void Gen75Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+ GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F));
+ int w = curr.execWidth;
+ GenRegister r0;
+ r0 = GenRegister::h2(r);
+ push();
+ curr.execWidth = 4;
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
+ MOV(r0, src0);
+ MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
+ curr.noMask = 0;
+ curr.quarterControl = 0;
+ curr.nibControl = 0;
+ MOV(dest, r0);
+ curr.nibControl = 1;
+ MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r0, 4));
+ pop();
+ if (w == 16) {
+ push();
+ curr.execWidth = 4;
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
+ MOV(r0, GenRegister::suboffset(src0, 8));
+ MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12));
+ curr.noMask = 0;
+ curr.quarterControl = 1;
+ curr.nibControl = 0;
+ MOV(GenRegister::suboffset(dest, 8), r0);
+ curr.nibControl = 1;
+ MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r0, 4));
+ pop();
+ }
+ }
+
+ void Gen75Encoder::JMPI(GenRegister src, bool longjmp) {
+ alu2(this, GEN_OPCODE_JMPI, GenRegister::ip(), GenRegister::ip(), src);
+ }
+
+ void Gen75Encoder::patchJMPI(uint32_t insnID, int32_t jumpDistance) {
+ GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
+ GBE_ASSERT(insnID < this->store.size());
+ GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
+ insn.header.opcode == GEN_OPCODE_BRD ||
+ insn.header.opcode == GEN_OPCODE_ENDIF ||
+ insn.header.opcode == GEN_OPCODE_IF ||
+ insn.header.opcode == GEN_OPCODE_BRC);
+
+ if (insn.header.opcode == GEN_OPCODE_IF) {
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+ return;
+ }
+ else if (insn.header.opcode == GEN_OPCODE_JMPI) {
+ //jumpDistance'unit is Qword, and the HSW's offset of jmpi is in byte, so multi 8
+ jumpDistance = (jumpDistance - 2) * 8;
+ }
+
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+ }
+} /* End of the name space. */
diff --git a/backend/src/backend/gen75_encoder.hpp b/backend/src/backend/gen75_encoder.hpp
new file mode 100644
index 0000000..c10dac9
--- /dev/null
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen75_context.hpp
+ */
+#ifndef __GBE_GEN75_ENCODER_HPP__
+#define __GBE_GEN75_ENCODER_HPP__
+
+#include "backend/gen_encoder.hpp"
+
+namespace gbe
+{
+ /* This class is used to implement the HSW
+ specific logic for encoder. */
+ class Gen75Encoder : public GenEncoder
+ {
+ public:
+ /*! exec width of the double data type */
+ #define GEN75_DOUBLE_EXEC_WIDTH 4
+ virtual ~Gen75Encoder(void) { }
+
+ Gen75Encoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID)
+ : GenEncoder(simdWidth, gen, deviceID) { }
+
+ /*! Jump indexed instruction */
+ virtual void JMPI(GenRegister src, bool longjmp = false);
+ /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
+ virtual void patchJMPI(uint32_t insnID, int32_t jumpDistance);
+ /*! Get double/long exec width */
+ virtual int getDoubleExecWidth(void) { return GEN75_DOUBLE_EXEC_WIDTH; }
+ virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
+ virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
+ virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
+ virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+ virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+ virtual void setHeader(GenNativeInstruction *insn);
+ virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
+ uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
+ virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+ unsigned char msg_type, uint32_t msg_length,
+ bool header_present);
+ };
+}
+#endif /* __GBE_GEN75_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
new file mode 100644
index 0000000..4f697ef
--- /dev/null
+++ b/backend/src/backend/gen_context.cpp
@@ -0,0 +1,1911 @@
+/*
+ * Copyright © 2012 Intel Corporatin
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_context.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "backend/gen_context.hpp"
+#include "backend/gen_program.hpp"
+#include "backend/gen_defs.hpp"
+#include "backend/gen_encoder.hpp"
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_insn_scheduling.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "backend/gen/gen_mesa_disasm.h"
+#include "ir/function.hpp"
+#include "ir/value.hpp"
+#include "sys/cvar.hpp"
+#include <cstring>
+#include <iostream>
+#include <iomanip>
+
+namespace gbe
+{
+ ///////////////////////////////////////////////////////////////////////////
+ // GenContext implementation
+ ///////////////////////////////////////////////////////////////////////////
+ GenContext::GenContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID,
+ bool relaxMath) :
+ Context(unit, name), deviceID(deviceID), relaxMath(relaxMath)
+ {
+ this->p = NULL;
+ this->sel = NULL;
+ this->ra = NULL;
+ this->ifEndifFix = false;
+ }
+
+ GenContext::~GenContext(void) {
+ GBE_DELETE(this->ra);
+ GBE_DELETE(this->sel);
+ GBE_DELETE(this->p);
+ }
+
+ void GenContext::startNewCG(uint32_t simdWidth, uint32_t reservedSpillRegs, bool limitRegisterPressure) {
+ this->limitRegisterPressure = limitRegisterPressure;
+ this->reservedSpillRegs = reservedSpillRegs;
+ Context::startNewCG(simdWidth);
+ GBE_SAFE_DELETE(ra);
+ GBE_SAFE_DELETE(sel);
+ GBE_SAFE_DELETE(p);
+ this->p = generateEncoder();
+ this->newSelection();
+ this->ra = GBE_NEW(GenRegAllocator, *this);
+ this->branchPos2.clear();
+ this->branchPos3.clear();
+ this->labelPos.clear();
+ this->errCode = NO_ERROR;
+ }
+
+ void GenContext::newSelection(void) {
+ this->sel = GBE_NEW(Selection, *this);
+ }
+
+ uint32_t GenContext::alignScratchSize(uint32_t size){
+ uint32_t i = 0;
+ while(i < size) i+=1024;
+ return i;
+ }
+
+ void GenContext::emitInstructionStream(void) {
+ // Emit Gen ISA
+ for (auto &block : *sel->blockList)
+ for (auto &insn : block.insnList) {
+ const uint32_t opcode = insn.opcode;
+ p->push();
+ // no more virtual register here in that part of the code generation
+ GBE_ASSERT(insn.state.physicalFlag);
+ p->curr = insn.state;
+ switch (opcode) {
+#define DECL_SELECTION_IR(OPCODE, FAMILY) \
+ case SEL_OP_##OPCODE: this->emit##FAMILY(insn); break;
+#include "backend/gen_insn_selection.hxx"
+#undef DECL_INSN
+ }
+ p->pop();
+ }
+ /* per spec, pad the instruction stream with 8 nop to avoid
+ instruction prefetcher prefetch into an invalide page */
+ for(int i = 0; i < 8; i++)
+ p->NOP();
+ }
+
+ bool GenContext::patchBranches(void) {
+ using namespace ir;
+ for (auto pair : branchPos2) {
+ const LabelIndex label = pair.first;
+ const int32_t insnID = pair.second;
+ const int32_t targetID = labelPos.find(label)->second;
+ p->patchJMPI(insnID, (targetID - insnID));
+ }
+ for (auto pair : branchPos3) {
+ const LabelPair labelPair = pair.first;
+ const int32_t insnID = pair.second;
+ const int32_t jip = labelPos.find(labelPair.l0)->second;
+ const int32_t uip = labelPos.find(labelPair.l1)->second;
+ if (((jip - insnID) > 32767 || (jip - insnID) < -32768) ||
+ ((uip - insnID) > 32768 || (uip - insnID) < -32768)) {
+ // The only possible error instruction is if/endif here.
+ errCode = OUT_OF_RANGE_IF_ENDIF;
+ return false;
+ }
+ p->patchJMPI(insnID, (((uip - insnID)) << 16) | ((jip - insnID)));
+ }
+ return true;
+ }
+
+ void GenContext::clearFlagRegister(void) {
+ // when group size not aligned to simdWidth, flag register need clear to
+ // make prediction(any8/16h) work correctly
+ const GenRegister blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
+ const GenRegister zero = ra->genReg(GenRegister::uw1grf(ir::ocl::zero));
+ const GenRegister one = ra->genReg(GenRegister::uw1grf(ir::ocl::one));
+ p->push();
+ p->curr.noMask = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MOV(blockip, GenRegister::immuw(GEN_MAX_LABEL));
+ p->curr.noMask = 0;
+ p->MOV(blockip, GenRegister::immuw(0));
+ p->curr.execWidth = 1;
+ // FIXME, need to get the final use set of zero/one, if there is no user,
+ // no need to generate the following two instructions.
+ p->MOV(zero, GenRegister::immuw(0));
+ p->MOV(one, GenRegister::immw(-1));
+ p->pop();
+ }
+
+ void GenContext::emitStackPointer(void) {
+ using namespace ir;
+
+ // Only emit stack pointer computation if we use a stack
+ if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+ return;
+
+ // Check that everything is consistent in the kernel code
+ const uint32_t perLaneSize = kernel->getStackSize();
+ const uint32_t perThreadSize = perLaneSize * this->simdWidth;
+ GBE_ASSERT(perLaneSize > 0);
+ GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
+ GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
+
+ // Use shifts rather than muls which are limited to 32x16 bit sources
+ const uint32_t perLaneShift = logi2(perLaneSize);
+ const uint32_t perThreadShift = logi2(perThreadSize);
+ const GenRegister selStatckPtr = this->simdWidth == 8 ?
+ GenRegister::ud8grf(ir::ocl::stackptr) :
+ GenRegister::ud16grf(ir::ocl::stackptr);
+ const GenRegister stackptr = ra->genReg(selStatckPtr);
+ const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer);
+ const GenRegister bufferptr = ra->genReg(selStackBuffer);
+
+ // We compute the per-lane stack pointer here
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
+ p->curr.execWidth = this->simdWidth;
+ p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
+ p->curr.execWidth = 1;
+ p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
+ p->curr.execWidth = this->simdWidth;
+ p->ADD(stackptr, stackptr, bufferptr);
+ p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
+ p->pop();
+ }
+
+ void GenContext::emitLabelInstruction(const SelectionInstruction &insn) {
+ const ir::LabelIndex label(insn.index);
+ this->labelPos.insert(std::make_pair(label, p->store.size()));
+ }
+
+ void GenContext::emitUnaryInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+ switch (insn.opcode) {
+ case SEL_OP_MOV: p->MOV(dst, src, insn.extra.function); break;
+ case SEL_OP_FBH: p->FBH(dst, src); break;
+ case SEL_OP_FBL: p->FBL(dst, src); break;
+ case SEL_OP_NOT: p->NOT(dst, src); break;
+ case SEL_OP_RNDD: p->RNDD(dst, src); break;
+ case SEL_OP_RNDU: p->RNDU(dst, src); break;
+ case SEL_OP_RNDE: p->RNDE(dst, src); break;
+ case SEL_OP_RNDZ: p->RNDZ(dst, src); break;
+ case SEL_OP_F16TO32: p->F16TO32(dst, src); break;
+ case SEL_OP_F32TO16: p->F32TO16(dst, src); break;
+ case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
+ case SEL_OP_CONVI64_TO_I:
+ {
+ p->MOV(dst, src.bottom_half());
+ break;
+ }
+ case SEL_OP_BRC:
+ {
+ const ir::LabelIndex label0(insn.index), label1(insn.index1);
+ const LabelPair labelPair(label0, label1);
+ const GenRegister src = ra->genReg(insn.src(0));
+ this->branchPos3.push_back(std::make_pair(labelPair, p->store.size()));
+ p->BRC(src);
+ }
+ break;
+ case SEL_OP_BRD:
+ insertJumpPos(insn);
+ p->BRD(src);
+ break;
+ case SEL_OP_ENDIF:
+ insertJumpPos(insn);
+ p->ENDIF(src);
+ break;
+ case SEL_OP_IF:
+ {
+ const ir::LabelIndex label0(insn.index), label1(insn.index1);
+ const LabelPair labelPair(label0, label1);
+ const GenRegister src = ra->genReg(insn.src(0));
+ this->branchPos3.push_back(std::make_pair(labelPair, p->store.size()));
+ p->IF(src);
+ }
+ break;
+ default: NOT_IMPLEMENTED;
+ }
+ }
+
+ void GenContext::emitUnaryWithTempInstruction(const SelectionInstruction &insn) {
+ GenRegister dst = ra->genReg(insn.dst(0));
+ GenRegister src = ra->genReg(insn.src(0));
+ GenRegister tmp = ra->genReg(insn.dst(1));
+ switch (insn.opcode) {
+ case SEL_OP_LOAD_DF_IMM:
+ p->LOAD_DF_IMM(dst, tmp, src.value.df);
+ break;
+ case SEL_OP_MOV_DF:
+ p->MOV_DF(dst, src, tmp);
+ break;
+ case SEL_OP_CONVI_TO_I64: {
+ GenRegister middle = src;
+ if(src.type == GEN_TYPE_B || src.type == GEN_TYPE_W) {
+ middle = tmp;
+ middle.type = GEN_TYPE_D;
+ p->MOV(middle, src);
+ }
+
+ p->MOV(dst.bottom_half(), middle);
+ if(src.is_signed_int())
+ p->ASR(dst.top_half(this->simdWidth), middle, GenRegister::immud(31));
+ else
+ p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
+ break;
+ }
+ default:
+ NOT_IMPLEMENTED;
+ }
+ }
+
+ void GenContext::emitBinaryWithTempInstruction(const SelectionInstruction &insn) {
+ GenRegister dst = ra->genReg(insn.dst(0));
+ GenRegister src0 = ra->genReg(insn.src(0));
+ GenRegister src1 = ra->genReg(insn.src(1));
+ GenRegister tmp = ra->genReg(insn.dst(1));
+ switch (insn.opcode) {
+ case SEL_OP_I64ADD: {
+ tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
+ GenRegister x = tmp.bottom_half();
+ GenRegister y = tmp.top_half(this->simdWidth);
+
+ loadBottomHalf(x, src0);
+ loadBottomHalf(y, src1);
+ addWithCarry(x, x, y);
+ storeBottomHalf(dst, x);
+ loadTopHalf(x, src0);
+ p->ADD(x, x, y);
+ loadTopHalf(y, src1);
+ p->ADD(x, x, y);
+ storeTopHalf(dst, x);
+ break;
+ }
+ case SEL_OP_I64SUB: {
+ tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
+ GenRegister x = tmp.bottom_half();
+ GenRegister y = tmp.top_half(this->simdWidth);
+
+ loadBottomHalf(x, src0);
+ loadBottomHalf(y, src1);
+ subWithBorrow(x, x, y);
+ storeBottomHalf(dst, x);
+ loadTopHalf(x, src0);
+ subWithBorrow(x, x, y);
+ loadTopHalf(y, src1);
+ subWithBorrow(x, x, y);
+ storeTopHalf(dst, x);
+ break;
+ }
+ case SEL_OP_MUL_HI: {
+ int w = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int i = 0; i < w / 8; i ++) {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
+ p->curr.accWrEnable = 1;
+ p->MACH(tmp, src0, src1);
+ p->pop();
+ p->curr.quarterControl = i;
+ p->MOV(dst, tmp);
+ dst = GenRegister::Qn(dst, 1);
+ src0 = GenRegister::Qn(src0, 1);
+ src1 = GenRegister::Qn(src1, 1);
+ }
+ p->pop();
+ break;
+ }
+ case SEL_OP_HADD: {
+ int w = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int i = 0; i < w / 8; i ++) {
+ p->curr.quarterControl = i;
+ p->ADDC(dst, src0, src1);
+ p->SHR(dst, dst, GenRegister::immud(1));
+ p->SHL(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
+ p->OR(dst, dst, tmp);
+ dst = GenRegister::Qn(dst, 1);
+ src0 = GenRegister::Qn(src0, 1);
+ src1 = GenRegister::Qn(src1, 1);
+ }
+ p->pop();
+ break;
+ }
+ case SEL_OP_RHADD: {
+ int w = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int i = 0; i < w / 8; i ++) {
+ p->curr.quarterControl = i;
+ p->ADDC(dst, src0, src1);
+ p->ADD(dst, dst, GenRegister::immud(1));
+ p->SHR(dst, dst, GenRegister::immud(1));
+ p->SHL(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
+ p->OR(dst, dst, tmp);
+ dst = GenRegister::Qn(dst, 1);
+ src0 = GenRegister::Qn(src0, 1);
+ src1 = GenRegister::Qn(src1, 1);
+ }
+ p->pop();
+ break;
+ }
+ default:
+ NOT_IMPLEMENTED;
+ }
+ }
+
+ void GenContext::emitBinaryInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src0 = ra->genReg(insn.src(0));
+ const GenRegister src1 = ra->genReg(insn.src(1));
+ switch (insn.opcode) {
+ case SEL_OP_SEL: p->SEL(dst, src0, src1); break;
+ case SEL_OP_SEL_INT64:
+ {
+ p->SEL(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+ p->SEL(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
+ }
+ break;
+ case SEL_OP_AND: p->AND(dst, src0, src1, insn.extra.function); break;
+ case SEL_OP_OR: p->OR (dst, src0, src1, insn.extra.function); break;
+ case SEL_OP_XOR: p->XOR(dst, src0, src1, insn.extra.function); break;
+ case SEL_OP_I64AND:
+ {
+ p->AND(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+ p->AND(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
+ }
+ break;
+ case SEL_OP_I64OR:
+ {
+ p->OR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+ p->OR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
+ }
+ break;
+ case SEL_OP_I64XOR:
+ {
+ p->XOR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+ p->XOR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
+ }
+ break;
+ case SEL_OP_SHR: p->SHR(dst, src0, src1); break;
+ case SEL_OP_SHL: p->SHL(dst, src0, src1); break;
+ case SEL_OP_RSR: p->RSR(dst, src0, src1); break;
+ case SEL_OP_RSL: p->RSL(dst, src0, src1); break;
+ case SEL_OP_ASR: p->ASR(dst, src0, src1); break;
+ case SEL_OP_ADD: p->ADD(dst, src0, src1); break;
+ case SEL_OP_MUL: p->MUL(dst, src0, src1); break;
+ case SEL_OP_MACH: p->MACH(dst, src0, src1); break;
+ case SEL_OP_UPSAMPLE_SHORT: p->UPSAMPLE_SHORT(dst, src0, src1); break;
+ case SEL_OP_UPSAMPLE_INT: p->UPSAMPLE_INT(dst, src0, src1); break;
+ case SEL_OP_UPSAMPLE_LONG:
+ {
+ GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
+ xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
+ xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
+ p->MOV(xdst.top_half(this->simdWidth), xsrc0.bottom_half());
+ p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
+ }
+ break;
+ default: NOT_IMPLEMENTED;
+ }
+ }
+
+ void GenContext::collectShifter(GenRegister dest, GenRegister src) {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->AND(dest, src.bottom_half(), GenRegister::immud(63));
+ p->pop();
+ }
+
+ void GenContext::I64FullAdd(GenRegister high1, GenRegister low1, GenRegister high2, GenRegister low2) {
+ addWithCarry(low1, low1, low2);
+ addWithCarry(high1, high1, high2);
+ p->ADD(high1, high1, low2);
+ }
+
+ void GenContext::I64FullMult(GenRegister dst1, GenRegister dst2, GenRegister dst3, GenRegister dst4, GenRegister x_high, GenRegister x_low, GenRegister y_high, GenRegister y_low) {
+ GenRegister &e = dst1, &f = dst2, &g = dst3, &h = dst4,
+ &a = x_high, &b = x_low, &c = y_high, &d = y_low;
+ I32FullMult(e, h, b, d);
+ I32FullMult(f, g, a, d);
+ addWithCarry(g, g, e);
+ addWithCarry(f, f, e);
+ I32FullMult(e, d, b, c);
+ I64FullAdd(f, g, e, d);
+ I32FullMult(b, d, a, c);
+ I64FullAdd(e, f, b, d);
+ }
+
+ void GenContext::I64Neg(GenRegister high, GenRegister low, GenRegister tmp) {
+ p->NOT(high, high);
+ p->NOT(low, low);
+ p->MOV(tmp, GenRegister::immud(1));
+ addWithCarry(low, low, tmp);
+ p->ADD(high, high, tmp);
+ }
+
+ void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg) {
+ p->SHR(sign, high, GenRegister::immud(31));
+ p->push();
+ p->curr.noMask = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ I64Neg(high, low, tmp);
+ p->pop();
+ }
+
+ void GenContext::emitI64MULHIInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister f = ra->genReg(insn.dst(6));
+ GenRegister g = ra->genReg(insn.dst(7));
+ GenRegister h = ra->genReg(insn.dst(8));
+ GenRegister i = ra->genReg(insn.dst(9));
+ GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+ GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+ loadTopHalf(a, x);
+ loadBottomHalf(b, x);
+ loadTopHalf(c, y);
+ loadBottomHalf(d, y);
+ if(x.type == GEN_TYPE_UL) {
+ I64FullMult(e, f, g, h, a, b, c, d);
+ } else {
+ I64ABS(e, a, b, i, flagReg);
+ I64ABS(f, c, d, i, flagReg);
+ p->XOR(i, e, f);
+ I64FullMult(e, f, g, h, a, b, c, d);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->NOT(e, e);
+ p->NOT(f, f);
+ p->NOT(g, g);
+ p->NOT(h, h);
+ p->MOV(i, GenRegister::immud(1));
+ addWithCarry(h, h, i);
+ addWithCarry(g, g, i);
+ addWithCarry(f, f, i);
+ p->ADD(e, e, i);
+ p->pop();
+ }
+ storeTopHalf(dest, e);
+ storeBottomHalf(dest, f);
+ }
+
+ void GenContext::emitI64MADSATInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister z = ra->genReg(insn.src(2));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister f = ra->genReg(insn.dst(6));
+ GenRegister g = ra->genReg(insn.dst(7));
+ GenRegister h = ra->genReg(insn.dst(8));
+ GenRegister i = ra->genReg(insn.dst(9));
+ GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+ GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+ GenRegister zero = GenRegister::immud(0), one = GenRegister::immud(1);
+ loadTopHalf(a, x);
+ loadBottomHalf(b, x);
+ loadTopHalf(c, y);
+ loadBottomHalf(d, y);
+ if(x.type == GEN_TYPE_UL) {
+ I64FullMult(e, f, g, h, a, b, c, d);
+ loadTopHalf(c, z);
+ loadBottomHalf(d, z);
+ addWithCarry(h, h, d);
+ addWithCarry(g, g, d);
+ addWithCarry(f, f, d);
+ p->ADD(e, e, d);
+ addWithCarry(g, g, c);
+ addWithCarry(f, f, c);
+ p->ADD(e, e, c);
+ p->OR(a, e, f);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_NZ, a, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(g, GenRegister::immd(-1));
+ p->MOV(h, GenRegister::immd(-1));
+ p->pop();
+ } else {
+ I64ABS(e, a, b, i, flagReg);
+ I64ABS(f, c, d, i, flagReg);
+ p->XOR(i, e, f);
+ I64FullMult(e, f, g, h, a, b, c, d);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_NZ, i, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->NOT(e, e);
+ p->NOT(f, f);
+ p->NOT(g, g);
+ p->NOT(h, h);
+ p->MOV(i, one);
+ addWithCarry(h, h, i);
+ addWithCarry(g, g, i);
+ addWithCarry(f, f, i);
+ p->ADD(e, e, i);
+ p->pop();
+ loadTopHalf(c, z);
+ loadBottomHalf(d, z);
+ p->ASR(GenRegister::retype(b, GEN_TYPE_D), GenRegister::retype(c, GEN_TYPE_D), GenRegister::immd(31));
+ p->MOV(a, b);
+ addWithCarry(h, h, d);
+ addWithCarry(g, g, d);
+ addWithCarry(f, f, d);
+ p->ADD(e, e, d);
+ addWithCarry(g, g, c);
+ addWithCarry(f, f, c);
+ p->ADD(e, e, c);
+ addWithCarry(f, f, b);
+ p->ADD(e, e, b);
+ p->ADD(e, e, a);
+ p->MOV(b, zero);
+ p->push();
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->CMP(GEN_CONDITIONAL_NZ, e, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_NZ, f, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_G, g, GenRegister::immud(0x7FFFFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->SHR(a, e, GenRegister::immud(31));
+ p->CMP(GEN_CONDITIONAL_NZ, a, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, zero);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_NZ, b, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(g, GenRegister::immud(0x7FFFFFFF));
+ p->MOV(h, GenRegister::immud(0xFFFFFFFFu));
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MOV(b, zero);
+ p->CMP(GEN_CONDITIONAL_NEQ, e, GenRegister::immud(0xFFFFFFFFu));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_NEQ, f, GenRegister::immud(0xFFFFFFFFu));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_LE, g, GenRegister::immud(0x7FFFFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, zero);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_NZ, b, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(g, GenRegister::immud(0x80000000u));
+ p->MOV(h, zero);
+ p->pop();
+ }
+ storeTopHalf(dest, g);
+ storeBottomHalf(dest, h);
+ }
+
+ void GenContext::emitI64HADDInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ a.type = b.type = c.type = d.type = GEN_TYPE_UD;
+ loadBottomHalf(a, x);
+ loadBottomHalf(b, y);
+ loadTopHalf(c, x);
+ loadTopHalf(d, y);
+ addWithCarry(a, a, b);
+ addWithCarry(c, c, b);
+ addWithCarry(c, c, d);
+ p->ADD(b, b, d);
+ p->SHR(a, a, GenRegister::immud(1));
+ p->SHL(d, c, GenRegister::immud(31));
+ p->OR(a, a, d);
+ p->SHR(c, c, GenRegister::immud(1));
+ p->SHL(d, b, GenRegister::immud(31));
+ p->OR(c, c, d);
+ storeBottomHalf(dest, a);
+ storeTopHalf(dest, c);
+ }
+
+ void GenContext::emitI64RHADDInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ a.type = b.type = c.type = d.type = GEN_TYPE_UD;
+ loadBottomHalf(a, x);
+ loadBottomHalf(b, y);
+ addWithCarry(a, a, b);
+ p->MOV(c, GenRegister::immud(1));
+ addWithCarry(a, a, c);
+ p->ADD(b, b, c);
+ loadTopHalf(c, x);
+ loadTopHalf(d, y);
+ addWithCarry(c, c, b);
+ addWithCarry(c, c, d);
+ p->ADD(b, b, d);
+ p->SHR(a, a, GenRegister::immud(1));
+ p->SHL(d, c, GenRegister::immud(31));
+ p->OR(a, a, d);
+ p->SHR(c, c, GenRegister::immud(1));
+ p->SHL(d, b, GenRegister::immud(31));
+ p->OR(c, c, d);
+ storeBottomHalf(dest, a);
+ storeTopHalf(dest, c);
+ }
+
+ void GenContext::emitI64ShiftInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister f = ra->genReg(insn.dst(6));
+ a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
+ GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+ GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+ GenRegister zero = GenRegister::immud(0);
+ switch(insn.opcode) {
+ case SEL_OP_I64SHL:
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ collectShifter(a, y);
+ loadBottomHalf(e, x);
+ loadTopHalf(f, x);
+ p->SHR(b, e, GenRegister::negate(a));
+ p->SHL(c, e, a);
+ p->SHL(d, f, a);
+ p->OR(e, d, b);
+ p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->SEL(d, d, e);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->AND(a, a, GenRegister::immud(32));
+ p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->SEL(d, d, c);
+ p->SEL(c, c, zero);
+ p->pop();
+ storeBottomHalf(dest, c);
+ storeTopHalf(dest, d);
+ break;
+ case SEL_OP_I64SHR:
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ collectShifter(a, y);
+ loadBottomHalf(e, x);
+ loadTopHalf(f, x);
+ p->SHL(b, f, GenRegister::negate(a));
+ p->SHR(c, f, a);
+ p->SHR(d, e, a);
+ p->OR(e, d, b);
+ p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->SEL(d, d, e);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->AND(a, a, GenRegister::immud(32));
+ p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->SEL(d, d, c);
+ p->SEL(c, c, zero);
+ p->pop();
+ storeBottomHalf(dest, d);
+ storeTopHalf(dest, c);
+ break;
+ case SEL_OP_I64ASR:
+ f.type = GEN_TYPE_D;
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ collectShifter(a, y);
+ loadBottomHalf(e, x);
+ loadTopHalf(f, x);
+ p->SHL(b, f, GenRegister::negate(a));
+ p->ASR(c, f, a);
+ p->SHR(d, e, a);
+ p->OR(e, d, b);
+ p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->SEL(d, d, e);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->AND(a, a, GenRegister::immud(32));
+ p->ASR(f, f, GenRegister::immd(31));
+ p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->SEL(d, d, c);
+ p->SEL(c, c, f);
+ p->pop();
+ storeBottomHalf(dest, d);
+ storeTopHalf(dest, c);
+ break;
+ default:
+ NOT_IMPLEMENTED;
+ }
+ }
+
+ void GenContext::saveFlag(GenRegister dest, int flag, int subFlag) {
+ p->push();
+ p->curr.execWidth = 1;
+ p->MOV(dest, GenRegister::flag(flag, subFlag));
+ p->pop();
+ }
+
+ void GenContext::UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister exp,
+ GenRegister mantissa, GenRegister tmp, GenRegister flag) {
+ uint32_t jip0, jip1;
+ GenRegister dst_ud = GenRegister::retype(dst, GEN_TYPE_UD);
+ p->push();
+ p->curr.noMask = 1;
+ p->MOV(exp, GenRegister::immud(32)); // make sure the inactive lane is 1 when check ALL8H/ALL16H condition latter.
+ p->pop();
+ p->FBH(exp, high);
+ p->ADD(exp, GenRegister::negate(exp), GenRegister::immud(31)); //exp = 32 when high == 0
+ p->push();
+ p->curr.useFlag(flag.flag_nr(), flag.flag_subnr());
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(32)); //high == 0
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.noMask = 0;
+ p->MOV(dst, low);
+ p->push();
+ if (simdWidth == 8)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+ else if (simdWidth == 16)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+ else
+ NOT_IMPLEMENTED;
+ p->curr.execWidth = 1;
+ p->curr.noMask = 1;
+ jip0 = p->n_instruction();
+ p->JMPI(GenRegister::immud(0));
+ p->pop();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->CMP(GEN_CONDITIONAL_G, exp, GenRegister::immud(23));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(32)); //exp>23 && high!=0
+ p->ADD(tmp, exp, GenRegister::immud(-23));
+ p->SHR(mantissa, high, tmp);
+ p->AND(mantissa, mantissa, GenRegister::immud(0x7fffff));
+ p->SHR(dst_ud, low, tmp); //dst is temp regitster here
+ p->ADD(tmp, GenRegister::negate(tmp), GenRegister::immud(32));
+ p->SHL(high, high, tmp);
+ p->OR(high, high, dst_ud);
+ p->SHL(low, low, tmp);
+ p->push();
+ if (simdWidth == 8)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+ else if (simdWidth == 16)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+ else
+ NOT_IMPLEMENTED;
+ p->curr.execWidth = 1;
+ p->curr.noMask = 1;
+ jip1 = p->n_instruction();
+ p->JMPI(GenRegister::immud(0));
+ p->pop();
+
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(23));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(dst_ud, GenRegister::immud(0)); //exp==9, SHR == 0
+
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(23));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->ADD(tmp, exp, GenRegister::immud(9));
+ p->SHR(dst_ud, low, tmp); //dst is temp regitster here
+
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(23));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->ADD(tmp, GenRegister::negate(exp), GenRegister::immud(23));
+ p->SHL(mantissa, high, tmp);
+ p->OR(mantissa, mantissa, dst_ud);
+ p->AND(mantissa, mantissa, GenRegister::immud(0x7fffff));
+ p->SHL(high, low, tmp);
+ p->MOV(low, GenRegister::immud(0));
+
+ p->patchJMPI(jip1, (p->n_instruction() - jip1) );
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31)); //update dst where high != 0
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->ADD(exp, exp, GenRegister::immud(159));
+ p->SHL(exp, exp, GenRegister::immud(23));
+ p->OR(dst_ud, exp, mantissa);
+
+ p->CMP(GEN_CONDITIONAL_GE, high, GenRegister::immud(0x80000000));
+ p->ADD(dst_ud, dst_ud, GenRegister::immud(1));
+
+ p->CMP(GEN_CONDITIONAL_EQ, high, GenRegister::immud(0x80000000));
+ p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0));
+ p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe));
+ p->patchJMPI(jip0, (p->n_instruction() - jip0));
+
+ p->pop();
+
+ }
+
+ void GenContext::emitI64ToFloatInstruction(const SelectionInstruction &insn) {
+ GenRegister src = ra->genReg(insn.src(0));
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister high = ra->genReg(insn.dst(1));
+ GenRegister low = ra->genReg(insn.dst(2));
+ GenRegister exp = ra->genReg(insn.dst(3));
+ GenRegister mantissa = ra->genReg(insn.dst(4));
+ GenRegister tmp = ra->genReg(insn.dst(5));
+ GenRegister tmp_high = ra->genReg(insn.dst(6));
+ GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+ GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+ loadTopHalf(high, src);
+ loadBottomHalf(low, src);
+ if(!src.is_signed_int()) {
+ UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, flagReg);
+ } else {
+ p->MOV(tmp_high, high);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->NOT(high, high);
+ p->NOT(low, low);
+ p->MOV(tmp, GenRegister::immud(1));
+ addWithCarry(low, low, tmp);
+ p->ADD(high, high, tmp);
+ p->pop();
+ UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, flagReg);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ dest.type = GEN_TYPE_UD;
+ p->OR(dest, dest, GenRegister::immud(0x80000000));
+ p->pop();
+ }
+ }
+
+
+ void GenContext::emitFloatToI64Instruction(const SelectionInstruction &insn) {
+ GenRegister src = ra->genReg(insn.src(0));
+ GenRegister dst = ra->genReg(insn.dst(0));
+ GenRegister high = ra->genReg(insn.dst(1));
+ GenRegister tmp = ra->genReg(insn.dst(2));
+ GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+ GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+
+ if(dst.is_signed_int())
+ high = GenRegister::retype(high, GEN_TYPE_D);
+ GenRegister low = GenRegister::retype(tmp, GEN_TYPE_UD);
+ float c = (1.f / 65536.f) * (1.f / 65536.f);
+ p->MUL(tmp, src, GenRegister::immf(c));
+ p->RNDZ(tmp, tmp);
+ p->MOV(high, tmp);
+ c = 65536.f * 65536.f;
+ p->MOV(tmp, high); //result may not equal to tmp
+ //mov float to int/uint is sat, so must sub high*0xffffffff
+ p->MUL(tmp, tmp, GenRegister::immf(c));
+ p->ADD(tmp, src, GenRegister::negate(tmp));
+ p->MOV(low, GenRegister::abs(tmp));
+ if(dst.is_signed_int()) {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_L, src, GenRegister::immf(0x0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_NEQ, low, GenRegister::immud(0x0));
+ p->ADD(high, high, GenRegister::immd(-1));
+ p->NOT(low, low);
+ p->ADD(low, low, GenRegister::immud(1));
+ p->pop();
+ }
+ storeTopHalf(dst, high);
+ storeBottomHalf(dst, low);
+ }
+
+ void GenContext::emitI64CompareInstruction(const SelectionInstruction &insn) {
+ GenRegister src0 = ra->genReg(insn.src(0));
+ GenRegister src1 = ra->genReg(insn.src(1));
+ GenRegister tmp0 = ra->genReg(insn.dst(0));
+ GenRegister tmp1 = ra->genReg(insn.dst(1));
+ GenRegister tmp2 = ra->genReg(insn.dst(2));
+ tmp0.type = (src0.type == GEN_TYPE_L) ? GEN_TYPE_D : GEN_TYPE_UD;
+ tmp1.type = (src1.type == GEN_TYPE_L) ? GEN_TYPE_D : GEN_TYPE_UD;
+ int flag = p->curr.flag, subFlag = p->curr.subFlag;
+ GenRegister f1 = GenRegister::retype(tmp2, GEN_TYPE_UW);
+ f1.width = GEN_WIDTH_1;
+ GenRegister f2 = GenRegister::suboffset(f1, 1);
+ GenRegister f3 = GenRegister::suboffset(f1, 2);
+
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ loadTopHalf(tmp0, src0);
+ loadTopHalf(tmp1, src1);
+ switch(insn.extra.function) {
+ case GEN_CONDITIONAL_L:
+ case GEN_CONDITIONAL_LE:
+ case GEN_CONDITIONAL_G:
+ case GEN_CONDITIONAL_GE:
+ {
+ int cmpTopHalf = insn.extra.function;
+ if(insn.extra.function == GEN_CONDITIONAL_LE)
+ cmpTopHalf = GEN_CONDITIONAL_L;
+ if(insn.extra.function == GEN_CONDITIONAL_GE)
+ cmpTopHalf = GEN_CONDITIONAL_G;
+ p->CMP(cmpTopHalf, tmp0, tmp1);
+ }
+ saveFlag(f1, flag, subFlag);
+ p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
+ saveFlag(f2, flag, subFlag);
+ tmp0.type = tmp1.type = GEN_TYPE_UD;
+ loadBottomHalf(tmp0, src0);
+ loadBottomHalf(tmp1, src1);
+ p->CMP(insn.extra.function, tmp0, tmp1);
+ saveFlag(f3, flag, subFlag);
+ p->push();
+ p->curr.execWidth = 1;
+ p->AND(f2, f2, f3);
+ p->OR(f1, f1, f2);
+ p->pop();
+ break;
+ case GEN_CONDITIONAL_EQ:
+ p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
+ saveFlag(f1, flag, subFlag);
+ tmp0.type = tmp1.type = GEN_TYPE_UD;
+ loadBottomHalf(tmp0, src0);
+ loadBottomHalf(tmp1, src1);
+ p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
+ saveFlag(f2, flag, subFlag);
+ p->push();
+ p->curr.execWidth = 1;
+ p->AND(f1, f1, f2);
+ p->pop();
+ break;
+ case GEN_CONDITIONAL_NEQ:
+ p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
+ saveFlag(f1, flag, subFlag);
+ tmp0.type = tmp1.type = GEN_TYPE_UD;
+ loadBottomHalf(tmp0, src0);
+ loadBottomHalf(tmp1, src1);
+ p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
+ saveFlag(f2, flag, subFlag);
+ p->push();
+ p->curr.execWidth = 1;
+ p->OR(f1, f1, f2);
+ p->pop();
+ break;
+ default:
+ NOT_IMPLEMENTED;
+ }
+ p->curr.execWidth = 1;
+ p->MOV(GenRegister::flag(flag, subFlag), f1);
+ p->pop();
+ }
+
+ void GenContext::emitI64SATADDInstruction(const SelectionInstruction &insn) {
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister dst = ra->genReg(insn.dst(0));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+ GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+ loadTopHalf(a, x);
+ loadBottomHalf(b, x);
+ loadTopHalf(c, y);
+ loadBottomHalf(d, y);
+ if(dst.is_signed_int())
+ p->SHR(e, a, GenRegister::immud(31));
+ addWithCarry(b, b, d);
+ addWithCarry(a, a, d);
+ addWithCarry(a, a, c);
+ p->ADD(c, c, d);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ if(! dst.is_signed_int()) {
+ p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(a, GenRegister::immud(0xFFFFFFFFu));
+ p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
+ } else {
+ p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(1));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_L, a, GenRegister::immud(0x80000000u));
+ p->MOV(a, GenRegister::immud(0x80000000u));
+ p->MOV(b, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u));
+ p->MOV(a, GenRegister::immud(0x7FFFFFFFu));
+ p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
+ }
+ p->pop();
+ storeTopHalf(dst, a);
+ storeBottomHalf(dst, b);
+ }
+
+ void GenContext::emitI64SATSUBInstruction(const SelectionInstruction &insn) {
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister dst = ra->genReg(insn.dst(0));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+ GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+ loadTopHalf(a, x);
+ loadBottomHalf(b, x);
+ loadTopHalf(c, y);
+ loadBottomHalf(d, y);
+ if(dst.is_signed_int())
+ p->SHR(e, a, GenRegister::immud(31));
+ subWithBorrow(b, b, d);
+ subWithBorrow(a, a, d);
+ subWithBorrow(a, a, c);
+ p->ADD(c, c, d);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ if(! dst.is_signed_int()) {
+ p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(a, GenRegister::immud(0));
+ p->MOV(b, GenRegister::immud(0));
+ } else {
+ p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(1));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_L, a, GenRegister::immud(0x80000000u));
+ p->MOV(a, GenRegister::immud(0x80000000u));
+ p->MOV(b, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u));
+ p->MOV(a, GenRegister::immud(0x7FFFFFFFu));
+ p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
+ }
+ p->pop();
+ storeTopHalf(dst, a);
+ storeBottomHalf(dst, b);
+ }
+
+ void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
+ p->MOV(dest, src.top_half(this->simdWidth));
+ }
+
+ void GenContext::storeTopHalf(GenRegister dest, GenRegister src) {
+ p->MOV(dest.top_half(this->simdWidth), src);
+ }
+
+ void GenContext::loadBottomHalf(GenRegister dest, GenRegister src) {
+ p->MOV(dest, src.bottom_half());
+ }
+
+ void GenContext::storeBottomHalf(GenRegister dest, GenRegister src) {
+ p->MOV(dest.bottom_half(), src);
+ }
+
+ void GenContext::addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1) {
+ int execWidth = p->curr.execWidth;
+ GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
+ p->push();
+ p->curr.execWidth = 8;
+ p->ADDC(dest, src0, src1);
+ p->MOV(src1, acc0);
+ if (execWidth == 16) {
+ p->curr.quarterControl = 1;
+ p->ADDC(GenRegister::suboffset(dest, 8),
+ GenRegister::suboffset(src0, 8),
+ GenRegister::suboffset(src1, 8));
+ p->MOV(GenRegister::suboffset(src1, 8), acc0);
+ }
+ p->pop();
+ }
+
+ void GenContext::subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1) {
+ int execWidth = p->curr.execWidth;
+ GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
+ p->push();
+ p->curr.execWidth = 8;
+ p->SUBB(dest, src0, src1);
+ p->MOV(src1, acc0);
+ if (execWidth == 16) {
+ p->curr.quarterControl = 1;
+ p->SUBB(GenRegister::suboffset(dest, 8),
+ GenRegister::suboffset(src0, 8),
+ GenRegister::suboffset(src1, 8));
+ p->MOV(GenRegister::suboffset(src1, 8), acc0);
+ }
+ p->pop();
+ }
+
+ void GenContext::I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1) {
+ GenRegister acc = GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD);
+ int execWidth = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for(int i = 0; i < execWidth; i += 8) {
+ p->MUL(acc, src0, src1);
+ p->curr.accWrEnable = 1;
+ p->MACH(high, src0, src1);
+ p->curr.accWrEnable = 0;
+ p->MOV(low, acc);
+ src0 = GenRegister::suboffset(src0, 8);
+ src1 = GenRegister::suboffset(src1, 8);
+ high = GenRegister::suboffset(high, 8);
+ low = GenRegister::suboffset(low, 8);
+ }
+ p->pop();
+ }
+
+ void GenContext::emitI64MULInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister f = ra->genReg(insn.dst(6));
+ a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
+ loadTopHalf(a, x);
+ loadBottomHalf(b, x);
+ loadTopHalf(c, y);
+ loadBottomHalf(d, y);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), e, b, c);
+ I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), f, a, d);
+ p->ADD(e, e, f);
+ I32FullMult(f, a, b, d);
+ p->ADD(e, e, f);
+ p->pop();
+ storeTopHalf(dest, e);
+ storeBottomHalf(dest, a);
+ }
+
+ void GenContext::emitI64DIVREMInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister f = ra->genReg(insn.dst(6));
+ GenRegister g = ra->genReg(insn.dst(7));
+ GenRegister h = ra->genReg(insn.dst(8));
+ GenRegister i = ra->genReg(insn.dst(9));
+ GenRegister j = ra->genReg(insn.dst(10));
+ GenRegister k = ra->genReg(insn.dst(11));
+ GenRegister l = ra->genReg(insn.dst(12));
+ GenRegister m = ra->genReg(insn.dst(13));
+ GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+ GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+ GenRegister zero = GenRegister::immud(0),
+ one = GenRegister::immud(1),
+ imm31 = GenRegister::immud(31);
+ uint32_t jip0;
+ // (a,b) <- x
+ loadTopHalf(a, x);
+ loadBottomHalf(b, x);
+ // (c,d) <- y
+ loadTopHalf(c, y);
+ loadBottomHalf(d, y);
+ // k <- sign_of_result
+ if(x.is_signed_int()) {
+ GBE_ASSERT(y.is_signed_int());
+ GBE_ASSERT(dest.is_signed_int());
+ I64ABS(k, a, b, e, flagReg);
+ I64ABS(l, c, d, e, flagReg);
+ if(insn.opcode == SEL_OP_I64DIV)
+ p->XOR(k, k, l);
+ }
+ // (e,f) <- 0
+ p->MOV(e, zero);
+ p->MOV(f, zero);
+ // (g,h) <- 2**63
+ p->MOV(g, GenRegister::immud(0x80000000));
+ p->MOV(h, zero);
+ // (i,j) <- 0
+ p->MOV(i, zero);
+ p->MOV(j, zero);
+ // m <- 0
+ p->MOV(m, zero);
+ {
+ uint32_t loop_start = p->n_instruction();
+ // (c,d,e,f) <- (c,d,e,f) / 2
+ p->SHR(f, f, one);
+ p->SHL(l, e, imm31);
+ p->OR(f, f, l);
+ p->SHR(e, e, one);
+ p->SHL(l, d, imm31);
+ p->OR(e, e, l);
+ p->SHR(d, d, one);
+ p->SHL(l, c, imm31);
+ p->OR(d, d, l);
+ p->SHR(c, c, one);
+ // condition <- (c,d)==0 && (a,b)>=(e,f)
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(l, zero);
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_EQ, a, e);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_GE, b, f);
+ p->MOV(l, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_G, a, e);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(l, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_NEQ, l, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_EQ, c, zero);
+ p->CMP(GEN_CONDITIONAL_EQ, d, zero);
+ // under condition, (a,b) <- (a,b) - (e,f)
+ p->MOV(l, f);
+ subWithBorrow(b, b, l);
+ subWithBorrow(a, a, l);
+ p->MOV(l, e);
+ subWithBorrow(a, a, l);
+ // under condition, (i,j) <- (i,j) | (g,h)
+ p->OR(i, i, g);
+ p->OR(j, j, h);
+ p->pop();
+ // (g,h) /= 2
+ p->SHR(h, h, one);
+ p->SHL(l, g, imm31);
+ p->OR(h, h, l);
+ p->SHR(g, g, one);
+ // condition: m < 64
+ p->ADD(m, m, one);
+
+ p->push();
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ p->MOV(flagReg, zero);
+ p->pop();
+
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 0;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64));
+
+ p->curr.execWidth = 1;
+ p->curr.noMask = 1;
+ // under condition, jump back to start point
+ if (simdWidth == 8)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ else if (simdWidth == 16)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ else
+ NOT_IMPLEMENTED;
+ int distance = -(int)(p->n_instruction() - loop_start );
+ p->curr.noMask = 1;
+ jip0 = p->n_instruction();
+ p->JMPI(zero);
+ p->patchJMPI(jip0, distance);
+ p->pop();
+ // end of loop
+ }
+ // adjust sign of result
+ if(x.is_signed_int()) {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_NEQ, k, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ if(insn.opcode == SEL_OP_I64DIV)
+ I64Neg(i, j, l);
+ else
+ I64Neg(a, b, l);
+ p->pop();
+ }
+ // write dest
+ if(insn.opcode == SEL_OP_I64DIV) {
+ storeTopHalf(dest, i);
+ storeBottomHalf(dest, j);
+ } else {
+ GBE_ASSERT(insn.opcode == SEL_OP_I64REM);
+ storeTopHalf(dest, a);
+ storeBottomHalf(dest, b);
+ }
+ }
+
+ void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src0 = ra->genReg(insn.src(0));
+ const GenRegister src1 = ra->genReg(insn.src(1));
+ const GenRegister src2 = ra->genReg(insn.src(2));
+ switch (insn.opcode) {
+ case SEL_OP_MAD: p->MAD(dst, src0, src1, src2); break;
+ default: NOT_IMPLEMENTED;
+ }
+ }
+
+ void GenContext::emitNoOpInstruction(const SelectionInstruction &insn) {
+ p->NOP();
+ }
+
+ void GenContext::emitWaitInstruction(const SelectionInstruction &insn) {
+ p->WAIT();
+ }
+
+ void GenContext::emitBarrierInstruction(const SelectionInstruction &insn) {
+ const GenRegister src = ra->genReg(insn.src(0));
+ const GenRegister fenceDst = ra->genReg(insn.dst(0));
+ uint32_t barrierType = insn.extra.barrierType;
+ const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+
+ if (barrierType == ir::syncGlobalBarrier) {
+ p->FENCE(fenceDst);
+ p->MOV(fenceDst, fenceDst);
+ }
+ p->push();
+ // As only the payload.2 is used and all the other regions are ignored
+ // SIMD8 mode here is safe.
+ p->curr.execWidth = 8;
+ p->curr.physicalFlag = 0;
+ p->curr.noMask = 1;
+ // Copy barrier id from r0.
+ p->AND(src, barrierId, GenRegister::immud(0x0f000000));
+ // A barrier is OK to start the thread synchronization *and* SLM fence
+ p->BARRIER(src);
+ p->curr.execWidth = 1;
+ // Now we wait for the other threads
+ p->WAIT();
+ p->pop();
+ }
+
+ void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ p->FENCE(dst);
+ p->MOV(dst, dst);
+ }
+
+ void GenContext::emitMathInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src0 = ra->genReg(insn.src(0));
+ const uint32_t function = insn.extra.function;
+ if (insn.srcNum == 2) {
+ const GenRegister src1 = ra->genReg(insn.src(1));
+ p->MATH(dst, function, src0, src1);
+ } else
+ p->MATH(dst, function, src0);
+ }
+
+ void GenContext::emitCompareInstruction(const SelectionInstruction &insn) {
+ const GenRegister src0 = ra->genReg(insn.src(0));
+ const GenRegister src1 = ra->genReg(insn.src(1));
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ if (insn.opcode == SEL_OP_CMP)
+ p->CMP(insn.extra.function, src0, src1, dst);
+ else {
+ GBE_ASSERT(insn.opcode == SEL_OP_SEL_CMP);
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ p->SEL_CMP(insn.extra.function, dst, src0, src1);
+ }
+ }
+
+ void GenContext::emitAtomicInstruction(const SelectionInstruction &insn) {
+ const GenRegister src = ra->genReg(insn.src(0));
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const uint32_t function = insn.extra.function;
+ const uint32_t bti = insn.getbti();
+
+ p->ATOMIC(dst, function, src, bti, insn.srcNum);
+ }
+
+ void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
+ GenRegister src = ra->genReg(insn.src(0));
+ if(sel->isScalarReg(src.reg()))
+ src = GenRegister::retype(src, GEN_TYPE_UW);
+ else
+ src = GenRegister::unpacked_uw(src.nr, src.subnr / typeSize(GEN_TYPE_UW));
+
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister a0 = GenRegister::addr8(0);
+ uint32_t simdWidth = p->curr.execWidth;
+
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ p->MOV(a0, src);
+ p->MOV(dst, GenRegister::indirect(dst.type, 0, GEN_WIDTH_8));
+ p->pop();
+
+ if (simdWidth == 16) {
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+
+ const GenRegister nextDst = GenRegister::Qn(dst, 1);
+ const GenRegister nextSrc = GenRegister::Qn(src, 1);
+ p->MOV(a0, nextSrc);
+ p->MOV(nextDst, GenRegister::indirect(dst.type, 0, GEN_WIDTH_8));
+ p->pop();
+ }
+ }
+
+ void GenContext::insertJumpPos(const SelectionInstruction &insn) {
+ const ir::LabelIndex label(insn.index);
+ this->branchPos2.push_back(std::make_pair(label, p->store.size()));
+ }
+
+ void GenContext::emitJumpInstruction(const SelectionInstruction &insn) {
+ insertJumpPos(insn);
+ const GenRegister src = ra->genReg(insn.src(0));
+ p->JMPI(src, insn.extra.longjmp);
+ }
+
+ void GenContext::emitEotInstruction(const SelectionInstruction &insn) {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(GenRegister::ud8grf(112, 0), GenRegister::ud8grf(0, 0));
+ p->curr.execWidth = 8;
+ p->EOT(112);
+ p->pop();
+ }
+
+ void GenContext::emitSpillRegInstruction(const SelectionInstruction &insn) {
+ uint32_t simdWidth = p->curr.execWidth;
+ uint32_t scratchOffset = insn.extra.scratchOffset;
+ const uint32_t header = insn.extra.scratchMsgHeader;
+ p->push();
+
+ const GenRegister msg = GenRegister::ud8grf(header, 0);
+ const GenRegister src = ra->genReg(insn.src(0));
+ GenRegister payload = src;
+ payload.nr = header + 1;
+ payload.subnr = 0;
+
+ GBE_ASSERT(src.subnr == 0);
+ uint32_t regType = insn.src(0).type;
+ uint32_t size = typeSize(regType);
+ uint32_t regSize = stride(src.hstride)*size;
+
+ GBE_ASSERT(regSize == 4 || regSize == 8);
+ if(regSize == 4) {
+ if (payload.nr != src.nr)
+ p->MOV(payload, src);
+ uint32_t regNum = (regSize*simdWidth) > 32 ? 2 : 1;
+ this->scratchWrite(msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+ }
+ else { //size == 8
+ payload.type = GEN_TYPE_UD;
+ GBE_ASSERT(payload.hstride == GEN_HORIZONTAL_STRIDE_1);
+ loadBottomHalf(payload, src);
+ uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1;
+ this->scratchWrite(msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+ loadTopHalf(payload, src);
+ this->scratchWrite(msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+ }
+ p->pop();
+ }
+
+ void GenContext::emitUnSpillRegInstruction(const SelectionInstruction &insn) {
+ uint32_t scratchOffset = insn.extra.scratchOffset;
+ const GenRegister dst = insn.dst(0);
+ uint32_t regType = dst.type;
+ uint32_t simdWidth = p->curr.execWidth;
+ const uint32_t header = insn.extra.scratchMsgHeader;
+ uint32_t size = typeSize(regType);
+ uint32_t regSize = stride(dst.hstride)*size;
+
+ const GenRegister msg = GenRegister::ud8grf(header, 0);
+ GenRegister payload = msg;
+ payload.nr = header + 1;
+
+ p->push();
+ assert(regSize == 4 || regSize == 8);
+ if(regSize == 4) {
+ uint32_t regNum = (regSize*simdWidth) > 32 ? 2 : 1;
+ this->scratchRead(GenRegister::ud8grf(dst.nr, dst.subnr), msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+ } else {
+ uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1;
+ this->scratchRead(payload, msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+ storeBottomHalf(dst, payload);
+ this->scratchRead(payload, msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+ storeTopHalf(dst, payload);
+ }
+ p->pop();
+ }
+
+ void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
+ const uint32_t elemNum = insn.extra.elem;
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.getbti();
+ p->UNTYPED_READ(dst, src, bti, elemNum*2);
+ }
+
+ void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.getbti();
+ const uint32_t elemNum = insn.extra.elem;
+ p->UNTYPED_READ(dst, src, bti, elemNum);
+ }
+
+ void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
+ const GenRegister src = ra->genReg(insn.dst(0));
+ const uint32_t elemNum = insn.extra.elem;
+ const uint32_t bti = insn.getbti();
+ p->UNTYPED_WRITE(src, bti, elemNum*2);
+ }
+
+ void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.getbti();
+ const uint32_t elemNum = insn.extra.elem;
+ p->UNTYPED_WRITE(src, bti, elemNum);
+ }
+
+ void GenContext::emitByteGatherInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.getbti();
+ const uint32_t elemSize = insn.extra.elem;
+ p->BYTE_GATHER(dst, src, bti, elemSize);
+ }
+
+ void GenContext::emitByteScatterInstruction(const SelectionInstruction &insn) {
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.getbti();
+ const uint32_t elemSize = insn.extra.elem;
+ p->BYTE_SCATTER(src, bti, elemSize);
+ }
+
+ void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
+ const GenRegister src = ra->genReg(insn.src(0));
+ for(uint32_t i = 0; i < insn.dstNum; i++) {
+ p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.dstNum, i));
+ }
+ }
+
+ void GenContext::emitPackByteInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ p->push();
+ if(simdWidth == 8) {
+ for(uint32_t i = 0; i < insn.srcNum; i++)
+ p->MOV(GenRegister::splitReg(dst, insn.srcNum, i), ra->genReg(insn.src(i)));
+ } else {
+ // when destination expands two registers, the source must span two registers.
+ p->curr.execWidth = 8;
+ for(uint32_t i = 0; i < insn.srcNum; i++) {
+ GenRegister dsti = GenRegister::splitReg(dst, insn.srcNum, i);
+ GenRegister src = ra->genReg(insn.src(i));
+
+ p->curr.quarterControl = 0;
+ p->MOV(dsti, src);
+ p->curr.quarterControl = 1;
+ p->MOV(GenRegister::Qn(dsti,1), GenRegister::Qn(src, 1));
+ }
+ }
+ p->pop();
+ }
+
+ void GenContext::emitDWordGatherInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.getbti();
+ p->DWORD_GATHER(dst, src, bti);
+ }
+
+ void GenContext::emitSampleInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister msgPayload = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
+ const unsigned char bti = insn.getbti();
+ const unsigned char sampler = insn.extra.sampler;
+ const unsigned int msgLen = insn.extra.rdmsglen;
+ uint32_t simdWidth = p->curr.execWidth;
+ p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0, insn.extra.isLD, insn.extra.isUniform);
+ }
+
+ void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
+ p->push();
+ uint32_t simdWidth = p->curr.execWidth;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+
+ p->curr.execWidth = 8;
+ p->MOV(header, GenRegister::ud8grf(0,0));
+ p->pop();
+
+ int size = typeSize(reg_type)*simdWidth;
+ p->push();
+ p->SCRATCH_WRITE(header, offset/32, size, reg_num, channel_mode);
+ p->pop();
+ }
+
+ void GenContext::scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
+ p->push();
+ uint32_t simdWidth = p->curr.execWidth;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 8;
+ p->MOV(header, GenRegister::ud8grf(0,0));
+ p->pop();
+
+ int size = typeSize(reg_type)*simdWidth;
+ p->push();
+ p->SCRATCH_READ(dst, header, offset/32, size, reg_num, channel_mode);
+ p->pop();
+ }
+
+ void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
+ const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+ const uint32_t bti = insn.getbti();
+ p->TYPED_WRITE(header, true, bti);
+ }
+
+ BVAR(OCL_OUTPUT_REG_ALLOC, false);
+ BVAR(OCL_OUTPUT_ASM, false);
+
+ void GenContext::allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue) {
+ uint32_t regSize;
+ regSize = this->ra->getRegSize(reg);
+ insertCurbeReg(reg, newCurbeEntry(value, subValue, regSize));
+ }
+
+ void GenContext::buildPatchList(void) {
+ const uint32_t ptrSize = unit.getPointerSize() == ir::POINTER_32_BITS ? 4u : 8u;
+ kernel->curbeSize = 0u;
+ auto &stackUse = dag->getUse(ir::ocl::stackptr);
+
+ // We insert the block IP mask first
+ using namespace ir::ocl;
+ allocCurbeReg(blockip, GBE_CURBE_BLOCK_IP);
+ allocCurbeReg(lid0, GBE_CURBE_LOCAL_ID_X);
+ allocCurbeReg(lid1, GBE_CURBE_LOCAL_ID_Y);
+ allocCurbeReg(lid2, GBE_CURBE_LOCAL_ID_Z);
+ allocCurbeReg(zero, GBE_CURBE_ZERO);
+ allocCurbeReg(one, GBE_CURBE_ONE);
+ if (stackUse.size() != 0)
+ allocCurbeReg(stackbuffer, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
+ allocSLMOffsetCurbe();
+ // Go over the arguments and find the related patch locations
+ const uint32_t argNum = fn.argNum();
+ for (uint32_t argID = 0u; argID < argNum; ++argID) {
+ const ir::FunctionArgument &arg = fn.getArg(argID);
+ // For pointers and values, we have nothing to do. We just push the values
+ if (arg.type == ir::FunctionArgument::GLOBAL_POINTER ||
+ arg.type == ir::FunctionArgument::LOCAL_POINTER ||
+ arg.type == ir::FunctionArgument::CONSTANT_POINTER ||
+ arg.type == ir::FunctionArgument::VALUE ||
+ arg.type == ir::FunctionArgument::STRUCTURE ||
+ arg.type == ir::FunctionArgument::IMAGE ||
+ arg.type == ir::FunctionArgument::SAMPLER)
+ this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize));
+ }
+
+ // Go over all the instructions and find the special register we need
+ // to push
+ #define INSERT_REG(SPECIAL_REG, PATCH) \
+ if (reg == ir::ocl::SPECIAL_REG) { \
+ if (curbeRegs.find(reg) != curbeRegs.end()) continue; \
+ allocCurbeReg(reg, GBE_CURBE_##PATCH); \
+ } else
+
+ fn.foreachInstruction([&](ir::Instruction &insn) {
+ const uint32_t srcNum = insn.getSrcNum();
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const ir::Register reg = insn.getSrc(srcID);
+ if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
+ if (srcID != 0) continue;
+ const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
+ const unsigned char type = ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
+ ir::ImageInfoKey key(bti, type);
+ const ir::Register imageInfo = insn.getSrc(0);
+ if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
+ uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
+ insertCurbeReg(imageInfo, offset);
+ }
+ continue;
+ }
+ if (fn.isSpecialReg(reg) == false) continue;
+ if (curbeRegs.find(reg) != curbeRegs.end()) continue;
+ if (reg == ir::ocl::stackptr) GBE_ASSERT(stackUse.size() > 0);
+ INSERT_REG(lsize0, LOCAL_SIZE_X)
+ INSERT_REG(lsize1, LOCAL_SIZE_Y)
+ INSERT_REG(lsize2, LOCAL_SIZE_Z)
+ INSERT_REG(gsize0, GLOBAL_SIZE_X)
+ INSERT_REG(gsize1, GLOBAL_SIZE_Y)
+ INSERT_REG(gsize2, GLOBAL_SIZE_Z)
+ INSERT_REG(goffset0, GLOBAL_OFFSET_X)
+ INSERT_REG(goffset1, GLOBAL_OFFSET_Y)
+ INSERT_REG(goffset2, GLOBAL_OFFSET_Z)
+ INSERT_REG(workdim, WORK_DIM)
+ INSERT_REG(numgroup0, GROUP_NUM_X)
+ INSERT_REG(numgroup1, GROUP_NUM_Y)
+ INSERT_REG(numgroup2, GROUP_NUM_Z)
+ INSERT_REG(stackptr, STACK_POINTER)
+ INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
+ INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
+ do {} while(0);
+ }
+ });
+#undef INSERT_REG
+
+
+ // After this point the vector is immutable. Sorting it will make
+ // research faster
+ std::sort(kernel->patches.begin(), kernel->patches.end());
+
+ kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
+ }
+
+ bool GenContext::emitCode(void) {
+ GenKernel *genKernel = static_cast<GenKernel*>(this->kernel);
+ buildPatchList();
+ sel->select();
+ schedulePreRegAllocation(*this, *this->sel);
+ if (UNLIKELY(ra->allocate(*this->sel) == false))
+ return false;
+ schedulePostRegAllocation(*this, *this->sel);
+ if (OCL_OUTPUT_REG_ALLOC)
+ ra->outputAllocation();
+ this->clearFlagRegister();
+ this->emitStackPointer();
+ this->emitSLMOffset();
+ this->emitInstructionStream();
+ if (this->patchBranches() == false)
+ return false;
+ genKernel->insnNum = p->store.size();
+ genKernel->insns = GBE_NEW_ARRAY_NO_ARG(GenInstruction, genKernel->insnNum);
+ std::memcpy(genKernel->insns, &p->store[0], genKernel->insnNum * sizeof(GenInstruction));
+ if (OCL_OUTPUT_ASM) {
+ std::cout << genKernel->getName() << "'s disassemble begin:" << std::endl;
+ ir::LabelIndex curLabel = (ir::LabelIndex)0;
+ GenCompactInstruction * pCom = NULL;
+ GenNativeInstruction insn;
+ std::cout << " L0:" << std::endl;
+ for (uint32_t insnID = 0; insnID < genKernel->insnNum; ) {
+ if (labelPos.find((ir::LabelIndex)(curLabel + 1))->second == insnID &&
+ curLabel < this->getFunction().labelNum()) {
+ std::cout << " L" << curLabel + 1 << ":" << std::endl;
+ curLabel = (ir::LabelIndex)(curLabel + 1);
+ }
+ std::cout << " (" << std::setw(8) << insnID << ") ";
+ pCom = (GenCompactInstruction*)&p->store[insnID];
+ if(pCom->bits1.cmpt_control == 1) {
+ decompactInstruction(pCom, &insn);
+ gen_disasm(stdout, &insn, deviceID, 1);
+ insnID++;
+ } else {
+ gen_disasm(stdout, &p->store[insnID], deviceID, 0);
+ insnID = insnID + 2;
+ }
+ }
+ std::cout << genKernel->getName() << "'s disassemble end." << std::endl;
+ }
+ return true;
+ }
+
+ Kernel *GenContext::allocateKernel(void) {
+ return GBE_NEW(GenKernel, name, deviceID);
+ }
+
+} /* namespace gbe */
+
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
new file mode 100644
index 0000000..02c83d0
--- /dev/null
+++ b/backend/src/backend/gen_context.hpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_context.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_GEN_CONTEXT_HPP__
+#define __GBE_GEN_CONTEXT_HPP__
+
+#include "backend/context.hpp"
+#include "backend/gen_encoder.hpp"
+#include "backend/program.h"
+#include "backend/gen_register.hpp"
+#include "ir/function.hpp"
+#include "ir/liveness.hpp"
+#include "sys/map.hpp"
+#include <string>
+
+namespace gbe
+{
+ class Kernel; // We build this structure
+ class GenEncoder; // Helps emitting Gen ISA
+ class GenRegAllocator; // Handle the register allocation
+ class Selection; // Performs instruction selection
+ class SelectionInstruction; // Pre-RA Gen instruction
+ class SelectionReg; // Pre-RA Gen register
+ class GenRegister;
+ typedef enum {
+ NO_ERROR,
+ REGISTER_ALLOCATION_FAIL,
+ REGISTER_SPILL_EXCEED_THRESHOLD,
+ REGISTER_SPILL_FAIL,
+ OUT_OF_RANGE_IF_ENDIF,
+ } CompileErrorCode;
+
+ /*! Context is the helper structure to build the Gen ISA or simulation code
+ * from GenIR
+ */
+ class GenContext : public Context
+ {
+ public:
+ /*! Create a new context. name is the name of the function we want to
+ * compile
+ */
+ GenContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID,
+ bool relaxMath = false);
+ /*! Release everything needed */
+ virtual ~GenContext(void);
+ /*! device's max srcatch buffer size */
+ #define GEN7_SCRATCH_SIZE (12 * KB)
+ /*! Start new code generation with specific parameters */
+ void startNewCG(uint32_t simdWidth, uint32_t reservedSpillRegs, bool limitRegisterPressure);
+ /*! Target device ID*/
+ uint32_t deviceID;
+ /*! Implements base class */
+ virtual bool emitCode(void);
+ /*! Align the scratch size to the device's scratch unit size */
+ virtual uint32_t alignScratchSize(uint32_t size);
+ /*! Get the device's max srcatch size */
+ virtual uint32_t getScratchSize(void) { return GEN7_SCRATCH_SIZE; }
+ /*! Function we emit code for */
+ INLINE const ir::Function &getFunction(void) const { return fn; }
+ /*! Simd width chosen for the current function */
+ INLINE uint32_t getSimdWidth(void) const { return simdWidth; }
+ void clearFlagRegister(void);
+ /*! check the flag reg, if is grf, use f0.1 instead */
+ GenRegister checkFlagRegister(GenRegister flagReg);
+ /*! Emit the per-lane stack pointer computation */
+ virtual void emitStackPointer(void);
+ /*! Emit the instructions */
+ void emitInstructionStream(void);
+ /*! Set the correct target values for the branches */
+ bool patchBranches(void);
+ /*! Forward ir::Function isSpecialReg method */
+ INLINE bool isSpecialReg(ir::Register reg) const {
+ return fn.isSpecialReg(reg);
+ }
+ /*! Get the liveOut information for the given block */
+ INLINE const ir::Liveness::LiveOut &getLiveOut(const ir::BasicBlock *bb) const {
+ return this->liveness->getLiveOut(bb);
+ }
+ /*! Get the LiveIn information for the given block */
+ INLINE const ir::Liveness::UEVar &getLiveIn(const ir::BasicBlock *bb) const {
+ return this->liveness->getLiveIn(bb);
+ }
+
+ void collectShifter(GenRegister dest, GenRegister src);
+ void loadTopHalf(GenRegister dest, GenRegister src);
+ void storeTopHalf(GenRegister dest, GenRegister src);
+
+ void loadBottomHalf(GenRegister dest, GenRegister src);
+ void storeBottomHalf(GenRegister dest, GenRegister src);
+
+ void addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1);
+ void subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1);
+ void I64Neg(GenRegister high, GenRegister low, GenRegister tmp);
+ void I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg);
+ void I64FullAdd(GenRegister high1, GenRegister low1, GenRegister high2, GenRegister low2);
+ void I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1);
+ void I64FullMult(GenRegister dst1, GenRegister dst2, GenRegister dst3, GenRegister dst4, GenRegister x_high, GenRegister x_low, GenRegister y_high, GenRegister y_low);
+ void saveFlag(GenRegister dest, int flag, int subFlag);
+ void UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister exp, GenRegister mantissa, GenRegister tmp, GenRegister flag);
+
+ /*! Final Gen ISA emission helper functions */
+ void emitLabelInstruction(const SelectionInstruction &insn);
+ void emitUnaryInstruction(const SelectionInstruction &insn);
+ void emitUnaryWithTempInstruction(const SelectionInstruction &insn);
+ void emitBinaryInstruction(const SelectionInstruction &insn);
+ void emitBinaryWithTempInstruction(const SelectionInstruction &insn);
+ void emitTernaryInstruction(const SelectionInstruction &insn);
+ void emitI64MULHIInstruction(const SelectionInstruction &insn);
+ void emitI64MADSATInstruction(const SelectionInstruction &insn);
+ void emitI64HADDInstruction(const SelectionInstruction &insn);
+ void emitI64RHADDInstruction(const SelectionInstruction &insn);
+ void emitI64ShiftInstruction(const SelectionInstruction &insn);
+ void emitI64CompareInstruction(const SelectionInstruction &insn);
+ void emitI64SATADDInstruction(const SelectionInstruction &insn);
+ void emitI64SATSUBInstruction(const SelectionInstruction &insn);
+ void emitI64ToFloatInstruction(const SelectionInstruction &insn);
+ void emitFloatToI64Instruction(const SelectionInstruction &insn);
+ void emitCompareInstruction(const SelectionInstruction &insn);
+ void emitJumpInstruction(const SelectionInstruction &insn);
+ void emitIndirectMoveInstruction(const SelectionInstruction &insn);
+ void emitEotInstruction(const SelectionInstruction &insn);
+ void emitNoOpInstruction(const SelectionInstruction &insn);
+ void emitWaitInstruction(const SelectionInstruction &insn);
+ void emitBarrierInstruction(const SelectionInstruction &insn);
+ void emitFenceInstruction(const SelectionInstruction &insn);
+ void emitMathInstruction(const SelectionInstruction &insn);
+ void emitRead64Instruction(const SelectionInstruction &insn);
+ void emitWrite64Instruction(const SelectionInstruction &insn);
+ void emitUntypedReadInstruction(const SelectionInstruction &insn);
+ void emitUntypedWriteInstruction(const SelectionInstruction &insn);
+ void emitAtomicInstruction(const SelectionInstruction &insn);
+ void emitByteGatherInstruction(const SelectionInstruction &insn);
+ void emitByteScatterInstruction(const SelectionInstruction &insn);
+ void emitPackByteInstruction(const SelectionInstruction &insn);
+ void emitUnpackByteInstruction(const SelectionInstruction &insn);
+ void emitDWordGatherInstruction(const SelectionInstruction &insn);
+ void emitSampleInstruction(const SelectionInstruction &insn);
+ void emitTypedWriteInstruction(const SelectionInstruction &insn);
+ void emitSpillRegInstruction(const SelectionInstruction &insn);
+ void emitUnSpillRegInstruction(const SelectionInstruction &insn);
+ void emitGetImageInfoInstruction(const SelectionInstruction &insn);
+ void emitI64MULInstruction(const SelectionInstruction &insn);
+ void emitI64DIVREMInstruction(const SelectionInstruction &insn);
+ void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+ void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+
+ /*! Implements base class */
+ virtual Kernel *allocateKernel(void);
+ /*! Store the position of each label instruction in the Gen ISA stream */
+ map<ir::LabelIndex, uint32_t> labelPos;
+ typedef struct LabelPair {
+ LabelPair(ir::LabelIndex l0, ir::LabelIndex l1) :
+ l0(l0), l1(l1){};
+ ir::LabelIndex l0;
+ ir::LabelIndex l1;
+ } LabelPair;
+ /*! Store the Gen instructions to patch */
+ vector<std::pair<LabelPair, uint32_t>> branchPos3;
+ vector<std::pair<ir::LabelIndex, uint32_t>> branchPos2;
+
+ void insertJumpPos(const SelectionInstruction &insn);
+ /*! Encode Gen ISA */
+ GenEncoder *p;
+ /*! Instruction selection on Gen ISA (pre-register allocation) */
+ Selection *sel;
+ /*! Perform the register allocation */
+ GenRegAllocator *ra;
+ /*! Indicate if we need to tackle a register pressure issue when
+ * regenerating the code
+ */
+ uint32_t reservedSpillRegs;
+ bool limitRegisterPressure;
+ bool relaxMath;
+ const bool getIFENDIFFix(void) const { return ifEndifFix; }
+ void setIFENDIFFix(bool fix) { ifEndifFix = fix; }
+ const CompileErrorCode getErrCode() { return errCode; }
+
+ protected:
+ virtual GenEncoder* generateEncoder(void) {
+ return GBE_NEW(GenEncoder, this->simdWidth, 7, deviceID);
+ }
+ /*! allocate a new curbe register and insert to curbe pool. */
+ void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
+
+ private:
+ CompileErrorCode errCode;
+ bool ifEndifFix;
+ /*! Build the curbe patch list for the given kernel */
+ void buildPatchList(void);
+ /*! Calc the group's slm offset from R0.0, to work around HSW SLM bug*/
+ virtual void emitSLMOffset(void) { };
+ /*! allocate group's slm offset in curbe, only for HSW */
+ virtual void allocSLMOffsetCurbe(void) { };
+ /*! new selection of device */
+ virtual void newSelection(void);
+ friend class GenRegAllocator; //!< need to access errCode directly.
+
+ };
+
+} /* namespace gbe */
+
+#endif /* __GBE_GEN_CONTEXT_HPP__ */
+
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
new file mode 100644
index 0000000..f0da50a
--- /dev/null
+++ b/backend/src/backend/gen_defs.hpp
@@ -0,0 +1,974 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith at tungstengraphics.com>
+ */
+
+#ifndef __GEN_DEFS_HPP__
+#define __GEN_DEFS_HPP__
+
+#include <stdint.h>
+
+/////////////////////////////////////////////////////////////////////////////
+// Gen EU defines
+/////////////////////////////////////////////////////////////////////////////
+
+/* Execution Unit (EU) defines */
+#define GEN_ALIGN_1 0
+#define GEN_ALIGN_16 1
+
+#define GEN_REG_SIZE 32
+
+#define GEN_ADDRESS_DIRECT 0
+#define GEN_ADDRESS_REGISTER_INDIRECT_REGISTER 1
+
+#define GEN_CHANNEL_X 0
+#define GEN_CHANNEL_Y 1
+#define GEN_CHANNEL_Z 2
+#define GEN_CHANNEL_W 3
+
+#define GEN_COMPRESSION_Q1 0
+#define GEN_COMPRESSION_Q2 1
+#define GEN_COMPRESSION_Q3 2
+#define GEN_COMPRESSION_Q4 3
+#define GEN_COMPRESSION_H1 0
+#define GEN_COMPRESSION_H2 2
+
+#define GEN_CONDITIONAL_NONE 0
+#define GEN_CONDITIONAL_Z 1
+#define GEN_CONDITIONAL_NZ 2
+#define GEN_CONDITIONAL_EQ 1 /* Z */
+#define GEN_CONDITIONAL_NEQ 2 /* NZ */
+#define GEN_CONDITIONAL_G 3
+#define GEN_CONDITIONAL_GE 4
+#define GEN_CONDITIONAL_L 5
+#define GEN_CONDITIONAL_LE 6
+#define GEN_CONDITIONAL_R 7
+#define GEN_CONDITIONAL_O 8
+#define GEN_CONDITIONAL_U 9
+
+#define GEN_DEBUG_NONE 0
+#define GEN_DEBUG_BREAKPOINT 1
+
+#define GEN_DEPENDENCY_NORMAL 0
+#define GEN_DEPENDENCY_NOTCLEARED 1
+#define GEN_DEPENDENCY_NOTCHECKED 2
+#define GEN_DEPENDENCY_DISABLE 3
+
+#define GEN_HORIZONTAL_STRIDE_0 0
+#define GEN_HORIZONTAL_STRIDE_1 1
+#define GEN_HORIZONTAL_STRIDE_2 2
+#define GEN_HORIZONTAL_STRIDE_4 3
+
+#define GEN_INSTRUCTION_NORMAL 0
+#define GEN_INSTRUCTION_SATURATE 1
+
+#define GEN_MASK_ENABLE 0
+#define GEN_MASK_DISABLE 1
+
+/*! Gen opcode */
+enum opcode {
+ GEN_OPCODE_MOV = 1,
+ GEN_OPCODE_SEL = 2,
+ GEN_OPCODE_NOT = 4,
+ GEN_OPCODE_AND = 5,
+ GEN_OPCODE_OR = 6,
+ GEN_OPCODE_XOR = 7,
+ GEN_OPCODE_SHR = 8,
+ GEN_OPCODE_SHL = 9,
+ GEN_OPCODE_RSR = 10,
+ GEN_OPCODE_RSL = 11,
+ GEN_OPCODE_ASR = 12,
+ GEN_OPCODE_CMP = 16,
+ GEN_OPCODE_CMPN = 17,
+ GEN_OPCODE_F32TO16 = 19,
+ GEN_OPCODE_F16TO32 = 20,
+ GEN_OPCODE_JMPI = 32,
+ GEN_OPCODE_BRD = 33,
+ GEN_OPCODE_IF = 34,
+ GEN_OPCODE_BRC = 35,
+ GEN_OPCODE_ELSE = 36,
+ GEN_OPCODE_ENDIF = 37,
+ GEN_OPCODE_DO = 38,
+ GEN_OPCODE_WHILE = 39,
+ GEN_OPCODE_BREAK = 40,
+ GEN_OPCODE_CONTINUE = 41,
+ GEN_OPCODE_HALT = 42,
+ GEN_OPCODE_MSAVE = 44,
+ GEN_OPCODE_MRESTORE = 45,
+ GEN_OPCODE_PUSH = 46,
+ GEN_OPCODE_POP = 47,
+ GEN_OPCODE_WAIT = 48,
+ GEN_OPCODE_SEND = 49,
+ GEN_OPCODE_SENDC = 50,
+ GEN_OPCODE_MATH = 56,
+ GEN_OPCODE_ADD = 64,
+ GEN_OPCODE_MUL = 65,
+ GEN_OPCODE_AVG = 66,
+ GEN_OPCODE_FRC = 67,
+ GEN_OPCODE_RNDU = 68,
+ GEN_OPCODE_RNDD = 69,
+ GEN_OPCODE_RNDE = 70,
+ GEN_OPCODE_RNDZ = 71,
+ GEN_OPCODE_MAC = 72,
+ GEN_OPCODE_MACH = 73,
+ GEN_OPCODE_LZD = 74,
+ GEN_OPCODE_FBH = 75,
+ GEN_OPCODE_FBL = 76,
+ GEN_OPCODE_ADDC = 78,
+ GEN_OPCODE_SUBB = 79,
+ GEN_OPCODE_SAD2 = 80,
+ GEN_OPCODE_SADA2 = 81,
+ GEN_OPCODE_DP4 = 84,
+ GEN_OPCODE_DPH = 85,
+ GEN_OPCODE_DP3 = 86,
+ GEN_OPCODE_DP2 = 87,
+ GEN_OPCODE_DPA2 = 88,
+ GEN_OPCODE_LINE = 89,
+ GEN_OPCODE_PLN = 90,
+ GEN_OPCODE_MAD = 91,
+ GEN_OPCODE_NOP = 126,
+};
+
+#define GEN_ATOMIC_SIMD16 0
+#define GEN_ATOMIC_SIMD8 1
+
+enum GenAtomicOpCode {
+ GEN_ATOMIC_OP_CMPWR8B = 0,
+ GEN_ATOMIC_OP_AND = 1,
+ GEN_ATOMIC_OP_OR = 2,
+ GEN_ATOMIC_OP_XOR = 3,
+ GEN_ATOMIC_OP_MOV = 4,
+ GEN_ATOMIC_OP_INC = 5,
+ GEN_ATOMIC_OP_DEC = 6,
+ GEN_ATOMIC_OP_ADD = 7,
+ GEN_ATOMIC_OP_SUB = 8,
+ GEN_ATOMIC_OP_REVSUB = 9,
+ GEN_ATOMIC_OP_IMAX = 10,
+ GEN_ATOMIC_OP_IMIN = 11,
+ GEN_ATOMIC_OP_UMAX = 12,
+ GEN_ATOMIC_OP_UMIN = 13,
+ GEN_ATOMIC_OP_CMPWR = 14,
+ GEN_ATOMIC_OP_PREDEC = 15
+};
+
+/*! Gen SFID */
+enum GenMessageTarget {
+ GEN_SFID_NULL = 0,
+ GEN_SFID_MATH = 1,
+ GEN_SFID_SAMPLER = 2,
+ GEN_SFID_MESSAGE_GATEWAY = 3,
+ GEN_SFID_DATAPORT_READ = 4,
+ GEN_SFID_DATAPORT_WRITE = 5,
+ GEN_SFID_URB = 6,
+ GEN_SFID_THREAD_SPAWNER = 7,
+ GEN6_SFID_DATAPORT_SAMPLER_CACHE = 4,
+ GEN6_SFID_DATAPORT_RENDER_CACHE = 5,
+ GEN6_SFID_DATAPORT_CONSTANT_CACHE = 9,
+ GEN_SFID_DATAPORT_DATA_CACHE = 10,
+ GEN_SFID_DATAPORT1_DATA_CACHE = 12,
+};
+
+#define GEN_PREDICATE_NONE 0
+#define GEN_PREDICATE_NORMAL 1
+#define GEN_PREDICATE_ALIGN1_ANYV 2
+#define GEN_PREDICATE_ALIGN1_ALLV 3
+#define GEN_PREDICATE_ALIGN1_ANY2H 4
+#define GEN_PREDICATE_ALIGN1_ALL2H 5
+#define GEN_PREDICATE_ALIGN1_ANY4H 6
+#define GEN_PREDICATE_ALIGN1_ALL4H 7
+#define GEN_PREDICATE_ALIGN1_ANY8H 8
+#define GEN_PREDICATE_ALIGN1_ALL8H 9
+#define GEN_PREDICATE_ALIGN1_ANY16H 10
+#define GEN_PREDICATE_ALIGN1_ALL16H 11
+#define GEN_PREDICATE_ALIGN16_REPLICATE_X 2
+#define GEN_PREDICATE_ALIGN16_REPLICATE_Y 3
+#define GEN_PREDICATE_ALIGN16_REPLICATE_Z 4
+#define GEN_PREDICATE_ALIGN16_REPLICATE_W 5
+#define GEN_PREDICATE_ALIGN16_ANY4H 6
+#define GEN_PREDICATE_ALIGN16_ALL4H 7
+
+#define GEN_ARCHITECTURE_REGISTER_FILE 0
+#define GEN_GENERAL_REGISTER_FILE 1
+#define GEN_IMMEDIATE_VALUE 3
+
+#define GEN_TYPE_UD 0
+#define GEN_TYPE_D 1
+#define GEN_TYPE_UW 2
+#define GEN_TYPE_W 3
+#define GEN_TYPE_UB 4
+#define GEN_TYPE_B 5
+#define GEN_TYPE_VF 5 /* packed float vector, immediates only? */
+#define GEN_TYPE_HF 6
+#define GEN_TYPE_V 6 /* packed int vector, immediates only, uword dest only */
+#define GEN_TYPE_DF 6
+#define GEN_TYPE_F 7
+#define GEN_TYPE_UL 8
+#define GEN_TYPE_L 9
+
+#define GEN_ARF_NULL 0x00
+#define GEN_ARF_ADDRESS 0x10
+#define GEN_ARF_ACCUMULATOR 0x20
+#define GEN_ARF_FLAG 0x30
+#define GEN_ARF_MASK 0x40
+#define GEN_ARF_MASK_STACK 0x50
+#define GEN_ARF_MASK_STACK_DEPTH 0x60
+#define GEN_ARF_STATE 0x70
+#define GEN_ARF_CONTROL 0x80
+#define GEN_ARF_NOTIFICATION_COUNT 0x90
+#define GEN_ARF_IP 0xA0
+
+#define GEN_MRF_COMPR4 (1 << 7)
+
+#define GEN_AMASK 0
+#define GEN_IMASK 1
+#define GEN_LMASK 2
+#define GEN_CMASK 3
+
+#define GEN_THREAD_NORMAL 0
+#define GEN_THREAD_ATOMIC 1
+#define GEN_THREAD_SWITCH 2
+
+#define GEN_VERTICAL_STRIDE_0 0
+#define GEN_VERTICAL_STRIDE_1 1
+#define GEN_VERTICAL_STRIDE_2 2
+#define GEN_VERTICAL_STRIDE_4 3
+#define GEN_VERTICAL_STRIDE_8 4
+#define GEN_VERTICAL_STRIDE_16 5
+#define GEN_VERTICAL_STRIDE_32 6
+#define GEN_VERTICAL_STRIDE_64 7
+#define GEN_VERTICAL_STRIDE_128 8
+#define GEN_VERTICAL_STRIDE_256 9
+#define GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL 0xF
+
+/* Execution width */
+#define GEN_WIDTH_1 0
+#define GEN_WIDTH_2 1
+#define GEN_WIDTH_4 2
+#define GEN_WIDTH_8 3
+#define GEN_WIDTH_16 4
+#define GEN_WIDTH_32 5
+
+/* Channels to enable for the untyped reads and writes */
+#define GEN_UNTYPED_RED (1 << 0)
+#define GEN_UNTYPED_GREEN (1 << 1)
+#define GEN_UNTYPED_BLUE (1 << 2)
+#define GEN_UNTYPED_ALPHA (1 << 3)
+
+/* SIMD mode for untyped reads and writes */
+#define GEN_UNTYPED_SIMD4x2 0
+#define GEN_UNTYPED_SIMD16 1
+#define GEN_UNTYPED_SIMD8 2
+
+/* SIMD mode for byte scatters / gathers */
+#define GEN_BYTE_SCATTER_SIMD8 0
+#define GEN_BYTE_SCATTER_SIMD16 1
+
+/* Data port message type for gen7*/
+#define GEN7_OBLOCK_READ 0 //0000: OWord Block Read
+#define GEN7_UNALIGNED_OBLOCK_READ 1 //0001: Unaligned OWord Block Read
+#define GEN7_ODBLOCK_READ 2 //0010: OWord Dual Block Read
+#define GEN7_DWORD_GATHER 3 //0011: DWord Scattered Read
+#define GEN7_BYTE_GATHER 4 //0100: Byte Scattered Read
+#define GEN7_UNTYPED_READ 5 //0101: Untyped Surface Read
+#define GEN7_UNTYPED_ATOMIC_READ 6 //0110: Untyped Atomic Operation
+#define GEN7_MEMORY_FENCE 7 //0111: Memory Fence
+#define GEN7_OBLOCK_WRITE 8 //1000: OWord Block Write
+#define GEN7_ODBLOCK_WRITE 10//1010: OWord Dual Block Write
+#define GEN7_DWORD_SCATTER 11//1011: DWord Scattered Write
+#define GEN7_BYTE_SCATTER 12//1100: Byte Scattered Write
+#define GEN7_UNTYPED_WRITE 13//1101: Untyped Surface Write
+
+/* Data port0 message type for Gen75*/
+#define GEN75_P0_OBLOCK_READ 0 //0000: OWord Block Read
+#define GEN75_P0_UNALIGNED_OBLOCK_READ 1 //0001: Unaligned OWord Block Read
+#define GEN75_P0_ODBLOCK_READ 2 //0010: OWord Dual Block Read
+#define GEN75_P0_DWORD_GATHER 3 //0011: DWord Scattered Read
+#define GEN75_P0_BYTE_GATHER 4 //0100: Byte Scattered Read
+#define GEN75_P0_MEMORY_FENCE 7 //0111: Memory Fence
+#define GEN75_P0_OBLOCK_WRITE 8 //1000: OWord Block Write
+#define GEN75_P0_ODBLOCK_WRITE 10 //1010: OWord Dual Block Write
+#define GEN75_P0_DWORD_SCATTER 11 //1011: DWord Scattered Write
+#define GEN75_P0_BYTE_SCATTER 12 //1100: Byte Scattered Write
+
+/* Data port1 message type for Gen75*/
+#define GEN75_P1_UNTYPED_READ 1 //0001: Untyped Surface Read
+#define GEN75_P1_UNTYPED_ATOMIC_OP 2 //0010: Untyped Atomic Operation
+#define GEN75_P1_UNTYPED_ATOMIC_OP_4X2 3 //0011: Untyped Atomic Operation SIMD4x2
+#define GEN75_P1_MEDIA_BREAD 4 //0100: Media Block Read
+#define GEN75_P1_TYPED_SURFACE_READ 5 //0101: Typed Surface Read
+#define GEN75_P1_TYPED_ATOMIC_OP 6 //0110: Typed Atomic Operation
+#define GEN75_P1_TYPED_ATOMIC_OP_4X2 7 //0111: Typed Atomic Operation SIMD4x2
+#define GEN75_P1_UNTYPED_SURFACE_WRITE 9 //1001: Untyped Surface Write
+#define GEN75_P1_MEDIA_TYPED_BWRITE 10 //1010: Media Block Write
+#define GEN75_P1_ATOMIC_COUNTER 11 //1011: Atomic Counter Operation
+#define GEN75_P1_ATOMIC_COUNTER_4X2 12 //1100: Atomic Counter Operation 4X2
+#define GEN75_P1_TYPED_SURFACE_WRITE 13 //1101: Typed Surface Write
+
+/* Data port data cache scratch messages*/
+#define GEN_SCRATCH_READ 0
+#define GEN_SCRATCH_WRITE 1
+#define GEN_SCRATCH_CHANNEL_MODE_OWORD 0
+#define GEN_SCRATCH_CHANNEL_MODE_DWORD 1
+#define GEN_SCRATCH_BLOCK_SIZE_1 0
+#define GEN_SCRATCH_BLOCK_SIZE_2 1
+#define GEN_SCRATCH_BLOCK_SIZE_4 3
+
+/* Data port render cache Message Type*/
+#define GEN_MBLOCK_READ 4 //0100: Media Block Read
+#define GEN_TYPED_READ 5 //0101: Typed Surface Read
+#define GEN_TYPED_ATOMIC 6 //0110: Typed Atomic Operation
+#define GEN_MEM_FENCE 7 //0111: Memory Fence
+#define GEN_MBLOCK_WRITE 10 //1010: Media Block Write
+#define GEN_RENDER_WRITE 12 //1100: Render Target Write
+#define GEN_TYPED_WRITE 13 //1101: Typed Surface Write
+
+/* For byte scatters and gathers, the element to write */
+#define GEN_BYTE_SCATTER_BYTE 0
+#define GEN_BYTE_SCATTER_WORD 1
+#define GEN_BYTE_SCATTER_DWORD 2
+#define GEN_BYTE_SCATTER_QWORD 3
+
+/* dword scattered rw */
+#define GEN_DWORD_SCATTER_8_DWORDS 2
+#define GEN_DWORD_SCATTER_16_DWORDS 3
+
+#define GEN_SAMPLER_RETURN_FORMAT_FLOAT32 0
+#define GEN_SAMPLER_RETURN_FORMAT_UINT32 2
+#define GEN_SAMPLER_RETURN_FORMAT_SINT32 3
+
+#define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE 0
+#define GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE 0
+#define GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS 0
+#define GEN_SAMPLER_MESSAGE_SIMD8_KILLPIX 1
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD 1
+#define GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD 1
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS 2
+#define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS 2
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE 0
+#define GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE 2
+#define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE 0
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE 1
+#define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE 1
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_RESINFO 2
+#define GEN_SAMPLER_MESSAGE_SIMD16_RESINFO 2
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_LD 7
+#define GEN_SAMPLER_MESSAGE_SIMD8_LD 7
+#define GEN_SAMPLER_MESSAGE_SIMD16_LD 7
+
+#define GEN5_SAMPLER_MESSAGE_SAMPLE 0
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS 1
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD 2
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE 3
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS 4
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE 5
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE 6
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_LD 7
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO 10
+
+/* for GEN5 only */
+#define GEN_SAMPLER_SIMD_MODE_SIMD4X2 0
+#define GEN_SAMPLER_SIMD_MODE_SIMD8 1
+#define GEN_SAMPLER_SIMD_MODE_SIMD16 2
+#define GEN_SAMPLER_SIMD_MODE_SIMD32_64 3
+
+#define GEN_MATH_FUNCTION_INV 1
+#define GEN_MATH_FUNCTION_LOG 2
+#define GEN_MATH_FUNCTION_EXP 3
+#define GEN_MATH_FUNCTION_SQRT 4
+#define GEN_MATH_FUNCTION_RSQ 5
+#define GEN_MATH_FUNCTION_SIN 6 /* was 7 */
+#define GEN_MATH_FUNCTION_COS 7 /* was 8 */
+#define GEN_MATH_FUNCTION_FDIV 9 /* gen6+ */
+#define GEN_MATH_FUNCTION_POW 10
+#define GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11
+#define GEN_MATH_FUNCTION_INT_DIV_QUOTIENT 12
+#define GEN_MATH_FUNCTION_INT_DIV_REMAINDER 13
+
+#define GEN_MATH_INTEGER_UNSIGNED 0
+#define GEN_MATH_INTEGER_SIGNED 1
+
+#define GEN_MATH_PRECISION_FULL 0
+#define GEN_MATH_PRECISION_PARTIAL 1
+
+#define GEN_MATH_SATURATE_NONE 0
+#define GEN_MATH_SATURATE_SATURATE 1
+
+#define GEN_MATH_DATA_VECTOR 0
+#define GEN_MATH_DATA_SCALAR 1
+
+#define GEN_DEREFERENCE_URB 0
+#define GEN_DO_NOT_DEREFERENCE_URB 1
+
+#define GEN_MAX_NUM_BUFFER_ENTRIES (1 << 27)
+
+/* Message gateway */
+#define GEN_OPEN_GATEWAY 0b000
+#define GEN_CLOSE_GATEWAY 0b001
+#define GEN_FORWARD_MSG 0b010
+#define GEN_GET_TIME_STAMP 0b011
+#define GEN_BARRIER_MSG 0b100
+#define GEN_UPDATE_GATEWAT_STATE 0b101
+#define GEN_MMIO_READ_WRITE 0b110
+
+/////////////////////////////////////////////////////////////////////////////
+// Gen EU structures
+/////////////////////////////////////////////////////////////////////////////
+
+/** Number of general purpose registers (VS, WM, etc) */
+#define GEN_MAX_GRF 128
+
+/* Instruction format for the execution units */
+
+struct GenInstruction {
+ uint32_t low;
+ uint32_t high;
+};
+
+union GenCompactInstruction {
+ struct GenInstruction low;
+ struct {
+ struct {
+ uint32_t opcode:7;
+ uint32_t debug_control:1;
+ uint32_t control_index:5;
+ uint32_t data_type_index:5;
+ uint32_t sub_reg_index:5;
+ uint32_t acc_wr_control:1;
+ uint32_t destreg_or_condmod:4;
+ uint32_t pad:1;
+ uint32_t cmpt_control:1;
+ uint32_t src0_index_lo:2;
+ } bits1;
+ struct {
+ uint32_t src0_index_hi:3;
+ uint32_t src1_index:5;
+ uint32_t dest_reg_nr:8;
+ uint32_t src0_reg_nr:8;
+ uint32_t src1_reg_nr:8;
+ } bits2;
+ };
+};
+
+union GenNativeInstruction
+{
+ struct {
+ struct GenInstruction low;
+ struct GenInstruction high;
+ };
+ struct {
+ struct {
+ uint32_t opcode:7;
+ uint32_t pad:1;
+ uint32_t access_mode:1;
+ uint32_t mask_control:1;
+ uint32_t dependency_control:2;
+ uint32_t quarter_control:2;
+ uint32_t thread_control:2;
+ uint32_t predicate_control:4;
+ uint32_t predicate_inverse:1;
+ uint32_t execution_size:3;
+ uint32_t destreg_or_condmod:4;
+ uint32_t acc_wr_control:1;
+ uint32_t cmpt_control:1;
+ uint32_t debug_control:1;
+ uint32_t saturate:1;
+ } header;
+
+ union {
+ struct {
+ uint32_t dest_reg_file:2;
+ uint32_t dest_reg_type:3;
+ uint32_t src0_reg_file:2;
+ uint32_t src0_reg_type:3;
+ uint32_t src1_reg_file:2;
+ uint32_t src1_reg_type:3;
+ uint32_t nib_ctrl:1;
+ uint32_t dest_subreg_nr:5;
+ uint32_t dest_reg_nr:8;
+ uint32_t dest_horiz_stride:2;
+ uint32_t dest_address_mode:1;
+ } da1;
+
+ struct {
+ uint32_t dest_reg_file:2;
+ uint32_t dest_reg_type:3;
+ uint32_t src0_reg_file:2;
+ uint32_t src0_reg_type:3;
+ uint32_t src1_reg_file:2; /* 0x00000c00 */
+ uint32_t src1_reg_type:3; /* 0x00007000 */
+ uint32_t nib_ctrl:1;
+ int dest_indirect_offset:10; /* offset against the deref'd address reg */
+ uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */
+ uint32_t dest_horiz_stride:2;
+ uint32_t dest_address_mode:1;
+ } ia1;
+
+ struct {
+ uint32_t dest_reg_file:2;
+ uint32_t dest_reg_type:3;
+ uint32_t src0_reg_file:2;
+ uint32_t src0_reg_type:3;
+ uint32_t src1_reg_file:2;
+ uint32_t src1_reg_type:3;
+ uint32_t nib_ctrl:1;
+ uint32_t dest_writemask:4;
+ uint32_t dest_subreg_nr:1;
+ uint32_t dest_reg_nr:8;
+ uint32_t dest_horiz_stride:2;
+ uint32_t dest_address_mode:1;
+ } da16;
+
+ struct {
+ uint32_t dest_reg_file:2;
+ uint32_t dest_reg_type:3;
+ uint32_t src0_reg_file:2;
+ uint32_t src0_reg_type:3;
+ uint32_t nib_ctrl:1;
+ uint32_t dest_writemask:4;
+ int dest_indirect_offset:6;
+ uint32_t dest_subreg_nr:3;
+ uint32_t dest_horiz_stride:2;
+ uint32_t dest_address_mode:1;
+ } ia16;
+
+ struct {
+ uint32_t dest_reg_file:2;
+ uint32_t dest_reg_type:3;
+ uint32_t src0_reg_file:2;
+ uint32_t src0_reg_type:3;
+ uint32_t src1_reg_file:2;
+ uint32_t src1_reg_type:3;
+ uint32_t pad:1;
+ int jump_count:16;
+ } branch_gen6;
+
+ struct {
+ uint32_t dest_reg_file:1;
+ uint32_t flag_subreg_num:1;
+ uint32_t pad0:2;
+ uint32_t src0_abs:1;
+ uint32_t src0_negate:1;
+ uint32_t src1_abs:1;
+ uint32_t src1_negate:1;
+ uint32_t src2_abs:1;
+ uint32_t src2_negate:1;
+ uint32_t pad1:7;
+ uint32_t dest_writemask:4;
+ uint32_t dest_subreg_nr:3;
+ uint32_t dest_reg_nr:8;
+ } da3src;
+ } bits1;
+
+ union {
+ struct {
+ uint32_t src0_subreg_nr:5;
+ uint32_t src0_reg_nr:8;
+ uint32_t src0_abs:1;
+ uint32_t src0_negate:1;
+ uint32_t src0_address_mode:1;
+ uint32_t src0_horiz_stride:2;
+ uint32_t src0_width:3;
+ uint32_t src0_vert_stride:4;
+ uint32_t flag_sub_reg_nr:1;
+ uint32_t flag_reg_nr:1;
+ uint32_t pad:5;
+ } da1;
+
+ struct {
+ int src0_indirect_offset:10;
+ uint32_t src0_subreg_nr:3;
+ uint32_t src0_abs:1;
+ uint32_t src0_negate:1;
+ uint32_t src0_address_mode:1;
+ uint32_t src0_horiz_stride:2;
+ uint32_t src0_width:3;
+ uint32_t src0_vert_stride:4;
+ uint32_t flag_sub_reg_nr:1;
+ uint32_t flag_reg_nr:1;
+ uint32_t pad:5;
+ } ia1;
+
+ struct {
+ uint32_t src0_swz_x:2;
+ uint32_t src0_swz_y:2;
+ uint32_t src0_subreg_nr:1;
+ uint32_t src0_reg_nr:8;
+ uint32_t src0_abs:1;
+ uint32_t src0_negate:1;
+ uint32_t src0_address_mode:1;
+ uint32_t src0_swz_z:2;
+ uint32_t src0_swz_w:2;
+ uint32_t pad0:1;
+ uint32_t src0_vert_stride:4;
+ uint32_t flag_sub_reg_nr:1;
+ uint32_t flag_reg_nr:1;
+ uint32_t pad:5;
+ } da16;
+
+ struct {
+ uint32_t src0_swz_x:2;
+ uint32_t src0_swz_y:2;
+ int src0_indirect_offset:6;
+ uint32_t src0_subreg_nr:3;
+ uint32_t src0_abs:1;
+ uint32_t src0_negate:1;
+ uint32_t src0_address_mode:1;
+ uint32_t src0_swz_z:2;
+ uint32_t src0_swz_w:2;
+ uint32_t pad0:1;
+ uint32_t src0_vert_stride:4;
+ uint32_t flag_sub_reg_nr:1;
+ uint32_t flag_reg_nr:1;
+ uint32_t pad:5;
+ } ia16;
+
+ struct {
+ uint32_t src0_rep_ctrl:1;
+ uint32_t src0_swizzle:8;
+ uint32_t src0_subreg_nr:3;
+ uint32_t src0_reg_nr:8;
+ uint32_t pad0:1;
+ uint32_t src1_rep_ctrl:1;
+ uint32_t src1_swizzle:8;
+ uint32_t src1_subreg_nr_low:2;
+ } da3src;
+ } bits2;
+
+ union {
+ struct {
+ uint32_t src1_subreg_nr:5;
+ uint32_t src1_reg_nr:8;
+ uint32_t src1_abs:1;
+ uint32_t src1_negate:1;
+ uint32_t src1_address_mode:1;
+ uint32_t src1_horiz_stride:2;
+ uint32_t src1_width:3;
+ uint32_t src1_vert_stride:4;
+ uint32_t pad0:7;
+ } da1;
+
+ struct {
+ uint32_t src1_swz_x:2;
+ uint32_t src1_swz_y:2;
+ uint32_t src1_subreg_nr:1;
+ uint32_t src1_reg_nr:8;
+ uint32_t src1_abs:1;
+ uint32_t src1_negate:1;
+ uint32_t src1_address_mode:1;
+ uint32_t src1_swz_z:2;
+ uint32_t src1_swz_w:2;
+ uint32_t pad1:1;
+ uint32_t src1_vert_stride:4;
+ uint32_t pad2:7;
+ } da16;
+
+ struct {
+ int src1_indirect_offset:10;
+ uint32_t src1_subreg_nr:3;
+ uint32_t src1_abs:1;
+ uint32_t src1_negate:1;
+ uint32_t src1_address_mode:1;
+ uint32_t src1_horiz_stride:2;
+ uint32_t src1_width:3;
+ uint32_t src1_vert_stride:4;
+ uint32_t pad1:7;
+ } ia1;
+
+ struct {
+ uint32_t src1_swz_x:2;
+ uint32_t src1_swz_y:2;
+ int src1_indirect_offset:6;
+ uint32_t src1_subreg_nr:3;
+ uint32_t src1_abs:1;
+ uint32_t src1_negate:1;
+ uint32_t pad0:1;
+ uint32_t src1_swz_z:2;
+ uint32_t src1_swz_w:2;
+ uint32_t pad1:1;
+ uint32_t src1_vert_stride:4;
+ uint32_t pad2:7;
+ } ia16;
+
+ struct {
+ uint32_t function_control:19;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad1:2;
+ uint32_t end_of_thread:1;
+ } generic_gen5;
+
+ struct {
+ uint32_t sub_function_id:3;
+ uint32_t pad0:11;
+ uint32_t ack_req:1;
+ uint32_t notify:2;
+ uint32_t pad1:2;
+ uint32_t header:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } msg_gateway;
+
+ struct {
+ uint32_t opcode:1;
+ uint32_t request:1;
+ uint32_t pad0:2;
+ uint32_t resource:1;
+ uint32_t pad1:14;
+ uint32_t header:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } spawner_gen5;
+
+ /** Ironlake PRM, Volume 4 Part 1, Section 6.1.1.1 */
+ struct {
+ uint32_t function:4;
+ uint32_t int_type:1;
+ uint32_t precision:1;
+ uint32_t saturate:1;
+ uint32_t data_type:1;
+ uint32_t snapshot:1;
+ uint32_t pad0:10;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad1:2;
+ uint32_t end_of_thread:1;
+ } math_gen5;
+
+ struct {
+ uint32_t bti:8;
+ uint32_t sampler:4;
+ uint32_t msg_type:5;
+ uint32_t simd_mode:2;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad1:2;
+ uint32_t end_of_thread:1;
+ } sampler_gen7;
+
+ /**
+ * Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
+ *
+ * See the Sandybridge PRM, Volume 4 Part 1, Section 3.9.2.1.1.
+ **/
+ struct {
+ uint32_t bti:8;
+ uint32_t msg_control:5;
+ uint32_t msg_type:3;
+ uint32_t pad0:3;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad1:2;
+ uint32_t end_of_thread:1;
+ } gen6_dp_sampler_const_cache;
+
+ /*! Data port untyped read / write messages */
+ struct {
+ uint32_t bti:8;
+ uint32_t rgba:4;
+ uint32_t simd_mode:2;
+ uint32_t msg_type:4;
+ uint32_t category:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen7_untyped_rw;
+
+ /*! Data port byte scatter / gather */
+ struct {
+ uint32_t bti:8;
+ uint32_t simd_mode:1;
+ uint32_t ignored0:1;
+ uint32_t data_size:2;
+ uint32_t ignored1:2;
+ uint32_t msg_type:4;
+ uint32_t category:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen7_byte_rw;
+
+ /*! Data port Scratch Read/ write */
+ struct {
+ uint32_t offset:12;
+ uint32_t block_size:2;
+ uint32_t ignored0:1;
+ uint32_t invalidate_after_read:1;
+ uint32_t channel_mode:1;
+ uint32_t msg_type:1;
+ uint32_t category:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen7_scratch_rw;
+
+ /*! Data port OBlock read / write */
+ struct {
+ uint32_t bti:8;
+ uint32_t block_size:3;
+ uint32_t ignored:2;
+ uint32_t invalidate_after_read:1;
+ uint32_t msg_type:4;
+ uint32_t category:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen7_oblock_rw;
+
+ /*! Data port dword scatter / gather */
+ struct {
+ uint32_t bti:8;
+ uint32_t block_size:2;
+ uint32_t ignored0:3;
+ uint32_t invalidate_after_read:1;
+ uint32_t msg_type:4;
+ uint32_t ignored1:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen7_dword_rw;
+
+ /*! Data port typed read / write messages */
+ struct {
+ uint32_t bti:8;
+ uint32_t chan_mask:4;
+ uint32_t slot:2;
+ uint32_t msg_type:4;
+ uint32_t pad2:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad3:2;
+ uint32_t end_of_thread:1;
+ } gen7_typed_rw;
+
+ /*! Memory fence */
+ struct {
+ uint32_t bti:8;
+ uint32_t pad:5;
+ uint32_t commit_enable:1;
+ uint32_t msg_type:4;
+ uint32_t pad2:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad3:2;
+ uint32_t end_of_thread:1;
+ } gen7_memory_fence;
+
+ /*! atomic messages */
+ struct {
+ uint32_t bti:8;
+ uint32_t aop_type:4;
+ uint32_t simd_mode:1;
+ uint32_t return_data:1;
+ uint32_t msg_type:4;
+ uint32_t category:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad3:2;
+ uint32_t end_of_thread:1;
+ } gen7_atomic_op;
+
+ struct {
+ uint32_t src1_subreg_nr_high:1;
+ uint32_t src1_reg_nr:8;
+ uint32_t pad0:1;
+ uint32_t src2_rep_ctrl:1;
+ uint32_t src2_swizzle:8;
+ uint32_t src2_subreg_nr:3;
+ uint32_t src2_reg_nr:8;
+ uint32_t pad1:2;
+ } da3src;
+
+ /*! Message gateway */
+ struct {
+ uint32_t subfunc:3;
+ uint32_t pad:11;
+ uint32_t ackreq:1;
+ uint32_t notify:2;
+ uint32_t pad2:2;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad3:2;
+ uint32_t end_of_thread:1;
+ } gen7_msg_gw;
+
+ struct {
+ uint32_t jip:16;
+ uint32_t uip:16;
+ } gen7_branch;
+
+ int d;
+ uint32_t ud;
+ float f;
+ } bits3;
+ };
+};
+
+#endif /* __GEN_DEFS_HPP__ */
+
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
new file mode 100644
index 0000000..182752a
--- /dev/null
+++ b/backend/src/backend/gen_encoder.cpp
@@ -0,0 +1,1311 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith at tungstengraphics.com>
+ */
+
+#include "backend/gen_encoder.hpp"
+#include <cstring>
+
+
+namespace gbe
+{
+ extern bool compactAlu2(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1, uint32_t condition, bool split);
+ extern bool compactAlu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src, uint32_t condition, bool split);
+ //////////////////////////////////////////////////////////////////////////
+ // Some helper functions to encode
+ //////////////////////////////////////////////////////////////////////////
+ INLINE bool isVectorOfBytes(GenRegister reg) {
+ if (reg.hstride != GEN_HORIZONTAL_STRIDE_0 &&
+ (reg.type == GEN_TYPE_UB || reg.type == GEN_TYPE_B))
+ return true;
+ else
+ return false;
+ }
+
+ INLINE bool needToSplitAlu1(GenEncoder *p, GenRegister dst, GenRegister src) {
+ if (p->curr.execWidth != 16 || src.hstride == GEN_HORIZONTAL_STRIDE_0) return false;
+ if (isVectorOfBytes(dst) == true) return true;
+ if (isVectorOfBytes(src) == true) return true;
+ return false;
+ }
+
+ INLINE bool needToSplitAlu2(GenEncoder *p, GenRegister dst, GenRegister src0, GenRegister src1) {
+ if (p->curr.execWidth != 16 ||
+ (src0.hstride == GEN_HORIZONTAL_STRIDE_0 &&
+ src1.hstride == GEN_HORIZONTAL_STRIDE_0))
+ return false;
+ if (isVectorOfBytes(dst) == true) return true;
+ if (isVectorOfBytes(src0) == true) return true;
+ if (isVectorOfBytes(src1) == true) return true;
+ return false;
+ }
+
+ INLINE bool needToSplitCmp(GenEncoder *p, GenRegister src0, GenRegister src1) {
+ if (p->curr.execWidth != 16 ||
+ (src0.hstride == GEN_HORIZONTAL_STRIDE_0 &&
+ src1.hstride == GEN_HORIZONTAL_STRIDE_0))
+ return false;
+ if (isVectorOfBytes(src0) == true) return true;
+ if (isVectorOfBytes(src1) == true) return true;
+ if (src0.type == GEN_TYPE_D || src0.type == GEN_TYPE_UD || src0.type == GEN_TYPE_F)
+ return true;
+ if (src1.type == GEN_TYPE_D || src1.type == GEN_TYPE_UD || src1.type == GEN_TYPE_F)
+ return true;
+ return false;
+ }
+
+ void GenEncoder::setMessageDescriptor(GenNativeInstruction *inst, enum GenMessageTarget sfid,
+ unsigned msg_length, unsigned response_length,
+ bool header_present, bool end_of_thread)
+ {
+ setSrc1(inst, GenRegister::immd(0));
+ inst->bits3.generic_gen5.header_present = header_present;
+ inst->bits3.generic_gen5.response_length = response_length;
+ inst->bits3.generic_gen5.msg_length = msg_length;
+ inst->bits3.generic_gen5.end_of_thread = end_of_thread;
+ inst->header.destreg_or_condmod = sfid;
+ }
+
+ void GenEncoder::setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+ unsigned char msg_type, uint32_t msg_length,
+ bool header_present)
+ {
+ const GenMessageTarget sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+ setMessageDescriptor(insn, sfid, msg_length, 0, header_present);
+ insn->bits3.gen7_typed_rw.bti = bti;
+ insn->bits3.gen7_typed_rw.msg_type = msg_type;
+ }
+
+ void GenEncoder::setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti,
+ uint32_t rgba, uint32_t msg_type,
+ uint32_t msg_length, uint32_t response_length)
+ {
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+ setMessageDescriptor(insn, sfid, msg_length, response_length);
+ insn->bits3.gen7_untyped_rw.msg_type = msg_type;
+ insn->bits3.gen7_untyped_rw.bti = bti;
+ insn->bits3.gen7_untyped_rw.rgba = rgba;
+ if (curr.execWidth == 8)
+ insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD8;
+ else if (curr.execWidth == 16)
+ insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD16;
+ else
+ NOT_SUPPORTED;
+ }
+
+ static void setDPByteScatterGather(GenEncoder *p,
+ GenNativeInstruction *insn,
+ uint32_t bti,
+ uint32_t elem_size,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
+ {
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+ p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+ insn->bits3.gen7_byte_rw.msg_type = msg_type;
+ insn->bits3.gen7_byte_rw.bti = bti;
+ insn->bits3.gen7_byte_rw.data_size = elem_size;
+ if (p->curr.execWidth == 8)
+ insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD8;
+ else if (p->curr.execWidth == 16)
+ insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD16;
+ else
+ NOT_SUPPORTED;
+ }
+#if 0
+ static void setOBlockRW(GenEncoder *p,
+ GenNativeInstruction *insn,
+ uint32_t bti,
+ uint32_t size,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
+ {
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+ p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+ assert(size == 2 || size == 4);
+ insn->bits3.gen7_oblock_rw.msg_type = msg_type;
+ insn->bits3.gen7_oblock_rw.bti = bti;
+ insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3;
+ insn->bits3.gen7_oblock_rw.header_present = 1;
+ }
+#endif
+
+ static void setSamplerMessage(GenEncoder *p,
+ GenNativeInstruction *insn,
+ unsigned char bti,
+ unsigned char sampler,
+ uint32_t msg_type,
+ uint32_t response_length,
+ uint32_t msg_length,
+ bool header_present,
+ uint32_t simd_mode,
+ uint32_t return_format)
+ {
+ const GenMessageTarget sfid = GEN_SFID_SAMPLER;
+ p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+ insn->bits3.sampler_gen7.bti = bti;
+ insn->bits3.sampler_gen7.sampler = sampler;
+ insn->bits3.sampler_gen7.msg_type = msg_type;
+ insn->bits3.sampler_gen7.simd_mode = simd_mode;
+ }
+
+ static void setDWordScatterMessgae(GenEncoder *p,
+ GenNativeInstruction *insn,
+ uint32_t bti,
+ uint32_t block_size,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
+ {
+ // FIXME there is a unknown issue with baytrail-t platform, the DWORD scatter
+ // message causes a hang at unit test case compiler_global_constant.
+ // We workaround it to use DATA CACHE instead.
+ const GenMessageTarget sfid = (p->deviceID == PCI_CHIP_BAYTRAIL_T) ?
+ GEN_SFID_DATAPORT_DATA_CACHE : GEN6_SFID_DATAPORT_CONSTANT_CACHE;
+ p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+ insn->bits3.gen7_dword_rw.msg_type = msg_type;
+ insn->bits3.gen7_dword_rw.bti = bti;
+ insn->bits3.gen7_dword_rw.block_size = block_size;
+ insn->bits3.gen7_dword_rw.invalidate_after_read = 0;
+ }
+ //////////////////////////////////////////////////////////////////////////
+ // Gen Emitter encoding class
+ //////////////////////////////////////////////////////////////////////////
+ GenEncoder::GenEncoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID) :
+ stateNum(0), gen(gen), deviceID(deviceID)
+ {
+ this->simdWidth = simdWidth;
+ this->curr.execWidth = simdWidth;
+ this->curr.quarterControl = GEN_COMPRESSION_Q1;
+ this->curr.noMask = 0;
+ this->curr.flag = 0;
+ this->curr.subFlag = 0;
+ this->curr.predicate = GEN_PREDICATE_NORMAL;
+ this->curr.inversePredicate = 0;
+ }
+
+ void GenEncoder::push(void) {
+ assert(stateNum < MAX_STATE_NUM);
+ stack[stateNum++] = curr;
+ }
+
+ void GenEncoder::pop(void) {
+ assert(stateNum > 0);
+ curr = stack[--stateNum];
+ }
+
+ void GenEncoder::setHeader(GenNativeInstruction *insn) {
+ if (this->curr.execWidth == 8)
+ insn->header.execution_size = GEN_WIDTH_8;
+ else if (this->curr.execWidth == 16)
+ insn->header.execution_size = GEN_WIDTH_16;
+ else if (this->curr.execWidth == 4)
+ insn->header.execution_size = GEN_WIDTH_4;
+ else if (this->curr.execWidth == 1)
+ insn->header.execution_size = GEN_WIDTH_1;
+ else
+ NOT_IMPLEMENTED;
+ insn->header.acc_wr_control = this->curr.accWrEnable;
+ insn->header.quarter_control = this->curr.quarterControl;
+ insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
+ insn->header.mask_control = this->curr.noMask;
+ insn->bits2.ia1.flag_reg_nr = this->curr.flag;
+ insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+ if (this->curr.predicate != GEN_PREDICATE_NONE) {
+ insn->header.predicate_control = this->curr.predicate;
+ insn->header.predicate_inverse = this->curr.inversePredicate;
+ }
+ insn->header.saturate = this->curr.saturate;
+ }
+
+ void GenEncoder::setDst(GenNativeInstruction *insn, GenRegister dest) {
+ if (dest.file != GEN_ARCHITECTURE_REGISTER_FILE)
+ assert(dest.nr < 128);
+
+ insn->bits1.da1.dest_reg_file = dest.file;
+ insn->bits1.da1.dest_reg_type = dest.type;
+ insn->bits1.da1.dest_address_mode = dest.address_mode;
+ insn->bits1.da1.dest_reg_nr = dest.nr;
+ insn->bits1.da1.dest_subreg_nr = dest.subnr;
+ if (dest.hstride == GEN_HORIZONTAL_STRIDE_0) {
+ if (dest.type == GEN_TYPE_UB || dest.type == GEN_TYPE_B)
+ dest.hstride = GEN_HORIZONTAL_STRIDE_4;
+ else if (dest.type == GEN_TYPE_UW || dest.type == GEN_TYPE_W)
+ dest.hstride = GEN_HORIZONTAL_STRIDE_2;
+ else
+ dest.hstride = GEN_HORIZONTAL_STRIDE_1;
+ }
+ insn->bits1.da1.dest_horiz_stride = dest.hstride;
+ }
+
+ void GenEncoder::setSrc0(GenNativeInstruction *insn, GenRegister reg) {
+ if (reg.file != GEN_ARCHITECTURE_REGISTER_FILE)
+ assert(reg.nr < 128);
+
+ if (reg.address_mode == GEN_ADDRESS_DIRECT) {
+ insn->bits1.da1.src0_reg_file = reg.file;
+ insn->bits1.da1.src0_reg_type = reg.type;
+ insn->bits2.da1.src0_abs = reg.absolute;
+ insn->bits2.da1.src0_negate = reg.negation;
+ insn->bits2.da1.src0_address_mode = reg.address_mode;
+
+ if (reg.file == GEN_IMMEDIATE_VALUE) {
+ insn->bits3.ud = reg.value.ud;
+
+ /* Required to set some fields in src1 as well: */
+ insn->bits1.da1.src1_reg_file = 0; /* arf */
+ insn->bits1.da1.src1_reg_type = reg.type;
+ }
+ else {
+ if (insn->header.access_mode == GEN_ALIGN_1) {
+ insn->bits2.da1.src0_subreg_nr = reg.subnr;
+ insn->bits2.da1.src0_reg_nr = reg.nr;
+ } else {
+ insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
+ insn->bits2.da16.src0_reg_nr = reg.nr;
+ }
+
+ if (reg.width == GEN_WIDTH_1 &&
+ insn->header.execution_size == GEN_WIDTH_1) {
+ insn->bits2.da1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+ insn->bits2.da1.src0_width = GEN_WIDTH_1;
+ insn->bits2.da1.src0_vert_stride = GEN_VERTICAL_STRIDE_0;
+ }
+ else {
+ insn->bits2.da1.src0_horiz_stride = reg.hstride;
+ insn->bits2.da1.src0_width = reg.width;
+ insn->bits2.da1.src0_vert_stride = reg.vstride;
+ }
+ }
+ } else {
+ insn->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
+ insn->bits1.ia1.src0_reg_type = reg.type;
+ insn->bits2.ia1.src0_subreg_nr = 0;
+ insn->bits2.ia1.src0_indirect_offset = 0;
+ insn->bits2.ia1.src0_abs = 0;
+ insn->bits2.ia1.src0_negate = 0;
+ insn->bits2.ia1.src0_address_mode = reg.address_mode;
+ insn->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+ insn->bits2.ia1.src0_width = GEN_WIDTH_1;
+ insn->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+ }
+ }
+
+ void GenEncoder::setSrc1(GenNativeInstruction *insn, GenRegister reg) {
+ assert(reg.nr < 128);
+ assert(reg.file != GEN_ARCHITECTURE_REGISTER_FILE || reg.nr == 0);
+
+ insn->bits1.da1.src1_reg_file = reg.file;
+ insn->bits1.da1.src1_reg_type = reg.type;
+ insn->bits3.da1.src1_abs = reg.absolute;
+ insn->bits3.da1.src1_negate = reg.negation;
+
+ assert(insn->bits1.da1.src0_reg_file != GEN_IMMEDIATE_VALUE);
+
+ if (reg.file == GEN_IMMEDIATE_VALUE)
+ insn->bits3.ud = reg.value.ud;
+ else {
+ assert (reg.address_mode == GEN_ADDRESS_DIRECT);
+ if (insn->header.access_mode == GEN_ALIGN_1) {
+ insn->bits3.da1.src1_subreg_nr = reg.subnr;
+ insn->bits3.da1.src1_reg_nr = reg.nr;
+ } else {
+ insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
+ insn->bits3.da16.src1_reg_nr = reg.nr;
+ }
+
+ if (reg.width == GEN_WIDTH_1 &&
+ insn->header.execution_size == GEN_WIDTH_1) {
+ insn->bits3.da1.src1_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+ insn->bits3.da1.src1_width = GEN_WIDTH_1;
+ insn->bits3.da1.src1_vert_stride = GEN_VERTICAL_STRIDE_0;
+ } else {
+ insn->bits3.da1.src1_horiz_stride = reg.hstride;
+ insn->bits3.da1.src1_width = reg.width;
+ insn->bits3.da1.src1_vert_stride = reg.vstride;
+ }
+ }
+ }
+
+ static const uint32_t untypedRWMask[] = {
+ GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN|GEN_UNTYPED_RED,
+ GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN,
+ GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE,
+ GEN_UNTYPED_ALPHA,
+ 0
+ };
+
+ void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ assert(elemNum >= 1 || elemNum <= 4);
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ if (this->curr.execWidth == 8) {
+ msg_length = 1;
+ response_length = elemNum;
+ } else if (this->curr.execWidth == 16) {
+ msg_length = 2;
+ response_length = 2*elemNum;
+ } else
+ NOT_IMPLEMENTED;
+
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setDPUntypedRW(insn,
+ bti,
+ untypedRWMask[elemNum],
+ GEN7_UNTYPED_READ,
+ msg_length,
+ response_length);
+ }
+
+ void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ assert(elemNum >= 1 || elemNum <= 4);
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ this->setHeader(insn);
+ if (this->curr.execWidth == 8) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ msg_length = 1+elemNum;
+ } else if (this->curr.execWidth == 16) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+ msg_length = 2*(1+elemNum);
+ }
+ else
+ NOT_IMPLEMENTED;
+ this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setDPUntypedRW(insn,
+ bti,
+ untypedRWMask[elemNum],
+ GEN7_UNTYPED_WRITE,
+ msg_length,
+ response_length);
+ }
+
+ void GenEncoder::BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ if (this->curr.execWidth == 8) {
+ msg_length = 1;
+ response_length = 1;
+ } else if (this->curr.execWidth == 16) {
+ msg_length = 2;
+ response_length = 2;
+ } else
+ NOT_IMPLEMENTED;
+
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setDPByteScatterGather(this,
+ insn,
+ bti,
+ elemSize,
+ GEN7_BYTE_GATHER,
+ msg_length,
+ response_length);
+ }
+
+ void GenEncoder::BYTE_SCATTER(GenRegister msg, uint32_t bti, uint32_t elemSize) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ this->setHeader(insn);
+ if (this->curr.execWidth == 8) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ msg_length = 2;
+ } else if (this->curr.execWidth == 16) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+ msg_length = 4;
+ } else
+ NOT_IMPLEMENTED;
+ this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setDPByteScatterGather(this,
+ insn,
+ bti,
+ elemSize,
+ GEN7_BYTE_SCATTER,
+ msg_length,
+ response_length);
+ }
+
+ void GenEncoder::DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ uint32_t block_size = 0;
+ if (this->curr.execWidth == 8) {
+ msg_length = 1;
+ response_length = 1;
+ block_size = GEN_DWORD_SCATTER_8_DWORDS;
+ } else if (this->curr.execWidth == 16) {
+ msg_length = 2;
+ response_length = 2;
+ block_size = GEN_DWORD_SCATTER_16_DWORDS;
+ } else
+ NOT_IMPLEMENTED;
+
+ this->setHeader(insn);
+ this->setDst(insn, dst);
+ this->setSrc0(insn, src);
+ this->setSrc1(insn, GenRegister::immud(0));
+ setDWordScatterMessgae(this,
+ insn,
+ bti,
+ block_size,
+ GEN7_DWORD_GATHER,
+ msg_length,
+ response_length);
+
+ }
+
+ void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+
+ if (this->curr.execWidth == 8) {
+ msg_length = srcNum;
+ response_length = 1;
+ } else if (this->curr.execWidth == 16) {
+ msg_length = 2*srcNum;
+ response_length = 2;
+ } else
+ NOT_IMPLEMENTED;
+
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+ setMessageDescriptor(insn, sfid, msg_length, response_length);
+ insn->bits3.gen7_atomic_op.msg_type = GEN7_UNTYPED_ATOMIC_READ;
+ insn->bits3.gen7_atomic_op.bti = bti;
+ insn->bits3.gen7_atomic_op.return_data = 1;
+ insn->bits3.gen7_atomic_op.aop_type = function;
+
+ if (this->curr.execWidth == 8)
+ insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD8;
+ else if (this->curr.execWidth == 16)
+ insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
+ else
+ NOT_SUPPORTED;
+
+ }
+ GenCompactInstruction *GenEncoder::nextCompact(uint32_t opcode) {
+ GenCompactInstruction insn;
+ std::memset(&insn, 0, sizeof(GenCompactInstruction));
+ insn.bits1.opcode = opcode;
+ this->store.push_back(insn.low);
+ return (GenCompactInstruction *)&this->store.back();
+ }
+
+ GenNativeInstruction *GenEncoder::next(uint32_t opcode) {
+ GenNativeInstruction insn;
+ std::memset(&insn, 0, sizeof(GenNativeInstruction));
+ insn.header.opcode = opcode;
+ this->store.push_back(insn.low);
+ this->store.push_back(insn.high);
+ return (GenNativeInstruction *)(&this->store.back()-1);
+ }
+
+ INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
+ GenRegister src0, GenRegister src1 = GenRegister::null()) {
+ int w = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = p->getDoubleExecWidth();
+ p->curr.nibControl = 0;
+ GenNativeInstruction *insn = p->next(opcode);
+ p->setHeader(insn);
+ p->setDst(insn, dst);
+ p->setSrc0(insn, src0);
+ if (!GenRegister::isNull(src1))
+ p->setSrc1(insn, src1);
+ if (w == 8)
+ p->curr.nibControl = 1; // second 1/8 mask
+ insn = p->next(opcode);
+ p->setHeader(insn);
+ p->setDst(insn, GenRegister::suboffset(dst, w / 2));
+ p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
+ if (!GenRegister::isNull(src1))
+ p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
+ p->pop();
+ }
+
+ // Double register accessing is a little special,
+ // Per Gen spec, then only supported mode is SIMD8 and, it only
+ // handles four doubles each time.
+ // We need to lower down SIMD16 to two SIMD8 and lower down SIMD8
+ // to two SIMD1x4.
+ INLINE void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
+ GenRegister src0, GenRegister src1 = GenRegister::null()) {
+ if (p->curr.execWidth == 8)
+ _handleDouble(p, opcode, dst, src0, src1);
+ else if (p->curr.execWidth == 16) {
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ _handleDouble(p, opcode, dst, src0, src1);
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ if (!GenRegister::isNull(src1))
+ src1 = GenRegister::offset(src1, 2);
+ _handleDouble(p, opcode, GenRegister::offset(dst, 2), GenRegister::offset(src0, 2), src1);
+ p->pop();
+ }
+ }
+
+ void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst,
+ GenRegister src, uint32_t condition) {
+ if (dst.isdf() && src.isdf()) {
+ handleDouble(p, opcode, dst, src);
+ } else if (dst.isint64() && src.isint64()) { // handle int64
+ p->MOV(dst.bottom_half(), src.bottom_half());
+ p->MOV(dst.top_half(p->simdWidth), src.top_half(p->simdWidth));
+ } else if (needToSplitAlu1(p, dst, src) == false) {
+ if(compactAlu1(p, opcode, dst, src, condition, false))
+ return;
+ GenNativeInstruction *insn = p->next(opcode);
+ if (condition != 0) {
+ GBE_ASSERT(opcode == GEN_OPCODE_MOV ||
+ opcode == GEN_OPCODE_NOT);
+ insn->header.destreg_or_condmod = condition;
+ }
+ p->setHeader(insn);
+ p->setDst(insn, dst);
+ p->setSrc0(insn, src);
+ } else {
+ GenNativeInstruction *insnQ1, *insnQ2;
+
+ // Instruction for the first quarter
+ insnQ1 = p->next(opcode);
+ p->setHeader(insnQ1);
+ insnQ1->header.quarter_control = GEN_COMPRESSION_Q1;
+ insnQ1->header.execution_size = GEN_WIDTH_8;
+ p->setDst(insnQ1, dst);
+ p->setSrc0(insnQ1, src);
+
+ // Instruction for the second quarter
+ insnQ2 = p->next(opcode);
+ p->setHeader(insnQ2);
+ insnQ2->header.quarter_control = GEN_COMPRESSION_Q2;
+ insnQ2->header.execution_size = GEN_WIDTH_8;
+ p->setDst(insnQ2, GenRegister::Qn(dst, 1));
+ p->setSrc0(insnQ2, GenRegister::Qn(src, 1));
+ }
+ }
+
+ void alu2(GenEncoder *p,
+ uint32_t opcode,
+ GenRegister dst,
+ GenRegister src0,
+ GenRegister src1,
+ uint32_t condition)
+ {
+ if (dst.isdf() && src0.isdf() && src1.isdf()) {
+ handleDouble(p, opcode, dst, src0, src1);
+ } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
+ if(compactAlu2(p, opcode, dst, src0, src1, condition, false))
+ return;
+ GenNativeInstruction *insn = p->next(opcode);
+ if (condition != 0) {
+ GBE_ASSERT(opcode == GEN_OPCODE_OR ||
+ opcode == GEN_OPCODE_XOR ||
+ opcode == GEN_OPCODE_AND);
+ insn->header.destreg_or_condmod = condition;
+ }
+ p->setHeader(insn);
+ p->setDst(insn, dst);
+ p->setSrc0(insn, src0);
+ p->setSrc1(insn, src1);
+ } else {
+ GenNativeInstruction *insnQ1, *insnQ2;
+
+ // Instruction for the first quarter
+ insnQ1 = p->next(opcode);
+ p->setHeader(insnQ1);
+ insnQ1->header.quarter_control = GEN_COMPRESSION_Q1;
+ insnQ1->header.execution_size = GEN_WIDTH_8;
+ p->setDst(insnQ1, dst);
+ p->setSrc0(insnQ1, src0);
+ p->setSrc1(insnQ1, src1);
+
+ // Instruction for the second quarter
+ insnQ2 = p->next(opcode);
+ p->setHeader(insnQ2);
+ insnQ2->header.quarter_control = GEN_COMPRESSION_Q2;
+ insnQ2->header.execution_size = GEN_WIDTH_8;
+ p->setDst(insnQ2, GenRegister::Qn(dst, 1));
+ p->setSrc0(insnQ2, GenRegister::Qn(src0, 1));
+ p->setSrc1(insnQ2, GenRegister::Qn(src1, 1));
+ }
+ }
+
+#define NO_SWIZZLE ((0<<0) | (1<<2) | (2<<4) | (3<<6))
+
+ static GenNativeInstruction *alu3(GenEncoder *p,
+ uint32_t opcode,
+ GenRegister dest,
+ GenRegister src0,
+ GenRegister src1,
+ GenRegister src2)
+ {
+ GenNativeInstruction *insn = p->next(opcode);
+
+ assert(dest.file == GEN_GENERAL_REGISTER_FILE);
+ assert(dest.nr < 128);
+ assert(dest.address_mode == GEN_ADDRESS_DIRECT);
+ assert(dest.type = GEN_TYPE_F);
+ insn->bits1.da3src.dest_reg_file = 0;
+ insn->bits1.da3src.dest_reg_nr = dest.nr;
+ insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
+ insn->bits1.da3src.dest_writemask = 0xf;
+ p->setHeader(insn);
+ insn->header.access_mode = GEN_ALIGN_16;
+ insn->header.execution_size = GEN_WIDTH_8;
+
+ assert(src0.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src0.address_mode == GEN_ADDRESS_DIRECT);
+ assert(src0.nr < 128);
+ assert(src0.type == GEN_TYPE_F);
+ insn->bits2.da3src.src0_swizzle = NO_SWIZZLE;
+ insn->bits2.da3src.src0_subreg_nr = src0.subnr / 4 ;
+ insn->bits2.da3src.src0_reg_nr = src0.nr;
+ insn->bits1.da3src.src0_abs = src0.absolute;
+ insn->bits1.da3src.src0_negate = src0.negation;
+ insn->bits2.da3src.src0_rep_ctrl = src0.vstride == GEN_VERTICAL_STRIDE_0;
+
+ assert(src1.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src1.address_mode == GEN_ADDRESS_DIRECT);
+ assert(src1.nr < 128);
+ assert(src1.type == GEN_TYPE_F);
+ insn->bits2.da3src.src1_swizzle = NO_SWIZZLE;
+ insn->bits2.da3src.src1_subreg_nr_low = (src1.subnr / 4) & 0x3;
+ insn->bits3.da3src.src1_subreg_nr_high = (src1.subnr / 4) >> 2;
+ insn->bits2.da3src.src1_rep_ctrl = src1.vstride == GEN_VERTICAL_STRIDE_0;
+ insn->bits3.da3src.src1_reg_nr = src1.nr;
+ insn->bits1.da3src.src1_abs = src1.absolute;
+ insn->bits1.da3src.src1_negate = src1.negation;
+
+ assert(src2.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src2.address_mode == GEN_ADDRESS_DIRECT);
+ assert(src2.nr < 128);
+ assert(src2.type == GEN_TYPE_F);
+ insn->bits3.da3src.src2_swizzle = NO_SWIZZLE;
+ insn->bits3.da3src.src2_subreg_nr = src2.subnr / 4;
+ insn->bits3.da3src.src2_rep_ctrl = src2.vstride == GEN_VERTICAL_STRIDE_0;
+ insn->bits3.da3src.src2_reg_nr = src2.nr;
+ insn->bits1.da3src.src2_abs = src2.absolute;
+ insn->bits1.da3src.src2_negate = src2.negation;
+
+ // Emit second half of the instruction
+ if (p->curr.execWidth == 16) {
+ GenNativeInstruction q1Insn = *insn;
+ insn = p->next(opcode);
+ *insn = q1Insn;
+ insn->header.quarter_control = GEN_COMPRESSION_Q2;
+ insn->bits1.da3src.dest_reg_nr++;
+ if (insn->bits2.da3src.src0_rep_ctrl == 0)
+ insn->bits2.da3src.src0_reg_nr++;
+ if (insn->bits2.da3src.src1_rep_ctrl == 0)
+ insn->bits3.da3src.src1_reg_nr++;
+ if (insn->bits3.da3src.src2_rep_ctrl == 0)
+ insn->bits3.da3src.src2_reg_nr++;
+ }
+
+ return insn;
+ }
+
+#undef NO_SWIZZLE
+
+#define ALU1(OP) \
+ void GenEncoder::OP(GenRegister dest, GenRegister src0, uint32_t condition) { \
+ alu1(this, GEN_OPCODE_##OP, dest, src0, condition); \
+ }
+
+#define ALU2(OP) \
+ void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1) { \
+ alu2(this, GEN_OPCODE_##OP, dest, src0, src1, 0); \
+ }
+
+#define ALU2_MOD(OP) \
+ void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1, uint32_t condition) { \
+ alu2(this, GEN_OPCODE_##OP, dest, src0, src1, condition); \
+ }
+
+
+#define ALU3(OP) \
+ void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2) { \
+ alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
+ }
+
+ void GenEncoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
+ union { double d; unsigned u[2]; } u;
+ u.d = value;
+ GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
+ push();
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
+ curr.execWidth = 1;
+ MOV(r, GenRegister::immud(u.u[1]));
+ MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
+ pop();
+ r.type = GEN_TYPE_DF;
+ r.vstride = GEN_VERTICAL_STRIDE_0;
+ r.width = GEN_WIDTH_1;
+ r.hstride = GEN_HORIZONTAL_STRIDE_0;
+ push();
+ uint32_t width = curr.execWidth;
+ curr.execWidth = 8;
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
+ curr.quarterControl = GEN_COMPRESSION_Q1;
+ MOV(dest, r);
+ if (width == 16) {
+ curr.quarterControl = GEN_COMPRESSION_Q2;
+ MOV(GenRegister::offset(dest, 2), r);
+ }
+ pop();
+ }
+
+ void GenEncoder::UPSAMPLE_SHORT(GenRegister dest, GenRegister src0, GenRegister src1) {
+ dest.type = GEN_TYPE_B;
+ dest.hstride = GEN_HORIZONTAL_STRIDE_2;
+ src0.type = GEN_TYPE_B;
+ src0.hstride = GEN_HORIZONTAL_STRIDE_2;
+ src1.type = GEN_TYPE_B;
+ src1.hstride = GEN_HORIZONTAL_STRIDE_2;
+ MOV(dest, src1);
+ dest.subnr ++;
+ MOV(dest, src0);
+ }
+
+ void GenEncoder::UPSAMPLE_INT(GenRegister dest, GenRegister src0, GenRegister src1) {
+ dest.type = GEN_TYPE_W;
+ dest.hstride = GEN_HORIZONTAL_STRIDE_2;
+ src0.type = GEN_TYPE_W;
+ src0.hstride = GEN_HORIZONTAL_STRIDE_2;
+ src1.type = GEN_TYPE_W;
+ src1.hstride = GEN_HORIZONTAL_STRIDE_2;
+ MOV(dest, src1);
+ dest.subnr += 2;
+ MOV(dest, src0);
+ }
+
+ void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) {
+ GenRegister u0 = GenRegister::immd((int)value), u1 = GenRegister::immd(value >> 32);
+ MOV(dest.bottom_half(), u0);
+ MOV(dest.top_half(this->simdWidth), u1);
+ }
+
+ void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+ GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F));
+ int w = curr.execWidth;
+ GenRegister r0;
+ int factor = 1;
+ if (dest.type == GEN_TYPE_F) {
+ r0 = r;
+ r = GenRegister::h2(r);
+ factor = 2;
+ } else {
+ r0 = GenRegister::h2(r);
+ }
+ push();
+ curr.execWidth = 8;
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
+ MOV(r0, src0);
+ MOV(GenRegister::suboffset(r0, 4 * factor), GenRegister::suboffset(src0, 4));
+ curr.noMask = 0;
+ curr.quarterControl = 0;
+ curr.nibControl = 0;
+ MOV(dest, r);
+ curr.nibControl = 1;
+ MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r, 8 / factor));
+ pop();
+ if (w == 16) {
+ push();
+ curr.execWidth = 8;
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
+ MOV(r0, GenRegister::suboffset(src0, 8));
+ MOV(GenRegister::suboffset(r0, 4 * factor), GenRegister::suboffset(src0, 12));
+ curr.noMask = 0;
+ curr.quarterControl = 1;
+ curr.nibControl = 0;
+ MOV(GenRegister::suboffset(dest, 8), r);
+ curr.nibControl = 1;
+ MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r, 8 / factor));
+ pop();
+ }
+ }
+
+ ALU1(MOV)
+ ALU1(RNDZ)
+ ALU1(RNDE)
+ ALU1(RNDD)
+ ALU1(RNDU)
+ ALU1(FBH)
+ ALU1(FBL)
+ ALU1(F16TO32)
+ ALU1(F32TO16)
+ ALU2(SEL)
+ ALU1(NOT)
+ ALU2_MOD(AND)
+ ALU2_MOD(OR)
+ ALU2_MOD(XOR)
+ ALU2(SHR)
+ ALU2(SHL)
+ ALU2(RSR)
+ ALU2(RSL)
+ ALU2(ASR)
+ ALU1(FRC)
+ ALU2(MAC)
+ ALU1(LZD)
+ ALU2(LINE)
+ ALU2(PLN)
+ ALU2(MACH)
+ ALU3(MAD)
+ // ALU2(BRC)
+ // ALU1(ENDIF)
+ // ALU1(IF)
+
+ void GenEncoder::SUBB(GenRegister dest, GenRegister src0, GenRegister src1) {
+ push();
+ curr.accWrEnable = 1;
+ alu2(this, GEN_OPCODE_SUBB, dest, src0, src1);
+ pop();
+ }
+
+ void GenEncoder::ADDC(GenRegister dest, GenRegister src0, GenRegister src1) {
+ push();
+ curr.accWrEnable = 1;
+ alu2(this, GEN_OPCODE_ADDC, dest, src0, src1);
+ pop();
+ }
+
+ void GenEncoder::ADD(GenRegister dest, GenRegister src0, GenRegister src1) {
+ if (src0.type == GEN_TYPE_F ||
+ (src0.file == GEN_IMMEDIATE_VALUE &&
+ src0.type == GEN_TYPE_VF)) {
+ assert(src1.type != GEN_TYPE_UD);
+ assert(src1.type != GEN_TYPE_D);
+ }
+
+ if (src1.type == GEN_TYPE_F ||
+ (src1.file == GEN_IMMEDIATE_VALUE &&
+ src1.type == GEN_TYPE_VF)) {
+ assert(src0.type != GEN_TYPE_UD);
+ assert(src0.type != GEN_TYPE_D);
+ }
+
+ alu2(this, GEN_OPCODE_ADD, dest, src0, src1);
+ }
+
+ void GenEncoder::MUL(GenRegister dest, GenRegister src0, GenRegister src1) {
+ if (src0.type == GEN_TYPE_D ||
+ src0.type == GEN_TYPE_UD ||
+ src1.type == GEN_TYPE_D ||
+ src1.type == GEN_TYPE_UD)
+ assert(dest.type != GEN_TYPE_F);
+
+ if (src0.type == GEN_TYPE_F ||
+ (src0.file == GEN_IMMEDIATE_VALUE &&
+ src0.type == GEN_TYPE_VF)) {
+ assert(src1.type != GEN_TYPE_UD);
+ assert(src1.type != GEN_TYPE_D);
+ }
+
+ if (src1.type == GEN_TYPE_F ||
+ (src1.file == GEN_IMMEDIATE_VALUE &&
+ src1.type == GEN_TYPE_VF)) {
+ assert(src0.type != GEN_TYPE_UD);
+ assert(src0.type != GEN_TYPE_D);
+ }
+
+ assert(src0.file != GEN_ARCHITECTURE_REGISTER_FILE ||
+ src0.nr != GEN_ARF_ACCUMULATOR);
+ assert(src1.file != GEN_ARCHITECTURE_REGISTER_FILE ||
+ src1.nr != GEN_ARF_ACCUMULATOR);
+
+ alu2(this, GEN_OPCODE_MUL, dest, src0, src1);
+ }
+
+
+ void GenEncoder::NOP(void) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_NOP);
+ this->setDst(insn, GenRegister::retype(GenRegister::f4grf(0,0), GEN_TYPE_UD));
+ this->setSrc0(insn, GenRegister::retype(GenRegister::f4grf(0,0), GEN_TYPE_UD));
+ this->setSrc1(insn, GenRegister::immud(0x0));
+ }
+
+ void GenEncoder::BARRIER(GenRegister src) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::null());
+ this->setSrc0(insn, src);
+ setMessageDescriptor(insn, GEN_SFID_MESSAGE_GATEWAY, 1, 0);
+ insn->bits3.msg_gateway.sub_function_id = GEN_BARRIER_MSG;
+ insn->bits3.msg_gateway.notify = 0x1;
+ }
+ void GenEncoder::FENCE(GenRegister dst) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, dst);
+ this->setSrc0(insn, dst);
+ setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA_CACHE, 1, 1, 1);
+ insn->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE;
+ insn->bits3.gen7_memory_fence.commit_enable = 0x1;
+ }
+
+ void GenEncoder::JMPI(GenRegister src, bool longjmp) {
+ alu2(this, GEN_OPCODE_JMPI, GenRegister::ip(), GenRegister::ip(), src);
+ if (longjmp)
+ NOP();
+ }
+
+#define ALU2_BRA(OP) \
+ void GenEncoder::OP(GenRegister src) { \
+ alu2(this, GEN_OPCODE_##OP, GenRegister::nullud(), GenRegister::nullud(), src); \
+ }
+
+ ALU2_BRA(IF)
+ ALU2_BRA(ENDIF)
+ ALU2_BRA(BRD)
+ ALU2_BRA(BRC)
+
+ void GenEncoder::patchJMPI(uint32_t insnID, int32_t jumpDistance) {
+ GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
+ GBE_ASSERT(insnID < this->store.size());
+ GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
+ insn.header.opcode == GEN_OPCODE_BRD ||
+ insn.header.opcode == GEN_OPCODE_ENDIF ||
+ insn.header.opcode == GEN_OPCODE_IF ||
+ insn.header.opcode == GEN_OPCODE_BRC);
+
+ if (insn.header.opcode != GEN_OPCODE_JMPI || (jumpDistance > -32769 && jumpDistance < 32768)) {
+ if (insn.header.opcode == GEN_OPCODE_IF) {
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+ return;
+ }
+ else if (insn.header.opcode == GEN_OPCODE_JMPI) {
+ jumpDistance = jumpDistance - 2;
+ }
+
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+ } else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) {
+ // For the conditional jump distance out of S15 range, we need to use an
+ // inverted jmp followed by a add ip, ip, distance to implement.
+ // A little hacky as we need to change the nop instruction to add
+ // instruction manually.
+ // If this is a unconditional jump, we just need to add the IP directly.
+ // FIXME there is an optimization method which we can insert a
+ // ADD instruction on demand. But that will need some extra analysis
+ // for all the branching instruction. And need to adjust the distance
+ // for those branch instruction's start point and end point contains
+ // this instruction.
+ GenNativeInstruction *insn2 = (GenNativeInstruction *)&this->store[insnID+2];
+ GBE_ASSERT(insn2->header.opcode == GEN_OPCODE_NOP);
+ insn2 = insn2;
+ insn.header.opcode = GEN_OPCODE_ADD;
+ this->setDst(&insn, GenRegister::ip());
+ this->setSrc0(&insn, GenRegister::ip());
+ this->setSrc1(&insn, GenRegister::immd(jumpDistance * 8));
+ } else {
+ GenNativeInstruction &insn2 = *(GenNativeInstruction *)&this->store[insnID+2];
+ insn.header.predicate_inverse ^= 1;
+ this->setSrc1(&insn, GenRegister::immd(2));
+ GBE_ASSERT(insn2.header.opcode == GEN_OPCODE_NOP);
+ GBE_ASSERT(insnID < this->store.size());
+ insn2.header.predicate_control = GEN_PREDICATE_NONE;
+ insn2.header.opcode = GEN_OPCODE_ADD;
+ this->setDst(&insn2, GenRegister::ip());
+ this->setSrc0(&insn2, GenRegister::ip());
+ this->setSrc1(&insn2, GenRegister::immd((jumpDistance - 2) * 8));
+ }
+ }
+
+ void GenEncoder::CMP(uint32_t conditional, GenRegister src0, GenRegister src1, GenRegister dst) {
+ if (needToSplitCmp(this, src0, src1) == false) {
+ if(!GenRegister::isNull(dst) && compactAlu2(this, GEN_OPCODE_CMP, dst, src0, src1, conditional, false)) {
+ return;
+ }
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_CMP);
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = conditional;
+ if (GenRegister::isNull(dst))
+ insn->header.thread_control = GEN_THREAD_SWITCH;
+ this->setDst(insn, dst);
+ this->setSrc0(insn, src0);
+ this->setSrc1(insn, src1);
+ } else {
+ GenNativeInstruction *insnQ1, *insnQ2;
+
+ // Instruction for the first quarter
+ insnQ1 = this->next(GEN_OPCODE_CMP);
+ this->setHeader(insnQ1);
+ if (GenRegister::isNull(dst))
+ insnQ1->header.thread_control = GEN_THREAD_SWITCH;
+ insnQ1->header.quarter_control = GEN_COMPRESSION_Q1;
+ insnQ1->header.execution_size = GEN_WIDTH_8;
+ insnQ1->header.destreg_or_condmod = conditional;
+ this->setDst(insnQ1, dst);
+ this->setSrc0(insnQ1, src0);
+ this->setSrc1(insnQ1, src1);
+
+ // Instruction for the second quarter
+ insnQ2 = this->next(GEN_OPCODE_CMP);
+ this->setHeader(insnQ2);
+ if (GenRegister::isNull(dst))
+ insnQ2->header.thread_control = GEN_THREAD_SWITCH;
+ insnQ2->header.quarter_control = GEN_COMPRESSION_Q2;
+ insnQ2->header.execution_size = GEN_WIDTH_8;
+ insnQ2->header.destreg_or_condmod = conditional;
+ this->setDst(insnQ2, GenRegister::Qn(dst, 1));
+ this->setSrc0(insnQ2, GenRegister::Qn(src0, 1));
+ this->setSrc1(insnQ2, GenRegister::Qn(src1, 1));
+ }
+ }
+
+ void GenEncoder::SEL_CMP(uint32_t conditional,
+ GenRegister dst,
+ GenRegister src0,
+ GenRegister src1)
+ {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEL);
+ GBE_ASSERT(curr.predicate == GEN_PREDICATE_NONE);
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = conditional;
+ this->setDst(insn, dst);
+ this->setSrc0(insn, src0);
+ this->setSrc1(insn, src1);
+ }
+
+ void GenEncoder::WAIT(void) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_WAIT);
+ GenRegister src = GenRegister::notification1();
+ this->setDst(insn, GenRegister::null());
+ this->setSrc0(insn, src);
+ this->setSrc1(insn, GenRegister::null());
+ insn->header.execution_size = 0; /* must */
+ insn->header.predicate_control = 0;
+ insn->header.quarter_control = 0;
+ }
+
+ void GenEncoder::MATH(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_MATH);
+ assert(dst.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src0.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src1.file == GEN_GENERAL_REGISTER_FILE);
+ assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1 || dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+
+ if (function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT ||
+ function == GEN_MATH_FUNCTION_INT_DIV_REMAINDER ||
+ function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
+ assert(src0.type != GEN_TYPE_F);
+ assert(src1.type != GEN_TYPE_F);
+ } else {
+ assert(src0.type == GEN_TYPE_F);
+ assert(src1.type == GEN_TYPE_F);
+ }
+
+ insn->header.destreg_or_condmod = function;
+ this->setHeader(insn);
+ this->setDst(insn, dst);
+ this->setSrc0(insn, src0);
+ this->setSrc1(insn, src1);
+
+ if (function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT ||
+ function == GEN_MATH_FUNCTION_INT_DIV_REMAINDER) {
+ insn->header.execution_size = this->curr.execWidth == 1 ? GEN_WIDTH_1 : GEN_WIDTH_8;
+ insn->header.quarter_control = GEN_COMPRESSION_Q1;
+
+ if(this->curr.execWidth == 16) {
+ GenNativeInstruction *insn2 = this->next(GEN_OPCODE_MATH);
+ GenRegister new_dest, new_src0, new_src1;
+ new_dest = GenRegister::QnPhysical(dst, 1);
+ new_src0 = GenRegister::QnPhysical(src0, 1);
+ new_src1 = GenRegister::QnPhysical(src1, 1);
+ insn2->header.destreg_or_condmod = function;
+ this->setHeader(insn2);
+ insn2->header.execution_size = GEN_WIDTH_8;
+ insn2->header.quarter_control = GEN_COMPRESSION_Q2;
+ this->setDst(insn2, new_dest);
+ this->setSrc0(insn2, new_src0);
+ this->setSrc1(insn2, new_src1);
+ }
+
+ }
+ }
+
+ void GenEncoder::MATH(GenRegister dst, uint32_t function, GenRegister src) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_MATH);
+ assert(dst.file == GEN_GENERAL_REGISTER_FILE);
+ assert(src.file == GEN_GENERAL_REGISTER_FILE);
+ assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1 || dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+ assert(src.type == GEN_TYPE_F);
+
+ insn->header.destreg_or_condmod = function;
+ this->setHeader(insn);
+ this->setDst(insn, dst);
+ this->setSrc0(insn, src);
+ }
+
+ void GenEncoder::SAMPLE(GenRegister dest,
+ GenRegister msg,
+ unsigned int msg_len,
+ bool header_present,
+ unsigned char bti,
+ unsigned char sampler,
+ uint32_t simdWidth,
+ uint32_t writemask,
+ uint32_t return_format,
+ bool isLD,
+ bool isUniform)
+ {
+ if (writemask == 0) return;
+ uint32_t msg_type = isLD ? GEN_SAMPLER_MESSAGE_SIMD8_LD :
+ GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+ uint32_t response_length = (4 * (simdWidth / 8));
+ uint32_t msg_length = (msg_len * (simdWidth / 8));
+ if (header_present)
+ msg_length++;
+ uint32_t simd_mode = (simdWidth == 16) ?
+ GEN_SAMPLER_SIMD_MODE_SIMD16 : GEN_SAMPLER_SIMD_MODE_SIMD8;
+ if(isUniform) {
+ response_length = 1;
+ msg_type = GEN_SAMPLER_MESSAGE_SIMD4X2_LD;
+ msg_length = 1;
+ simd_mode = GEN_SAMPLER_SIMD_MODE_SIMD4X2;
+ }
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, dest);
+ this->setSrc0(insn, msg);
+ setSamplerMessage(this, insn, bti, sampler, msg_type,
+ response_length, msg_length,
+ header_present,
+ simd_mode, return_format);
+ }
+
+ void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned char bti)
+ {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ uint32_t msg_type = GEN_TYPED_WRITE;
+ uint32_t msg_length = header_present ? 9 : 8;
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ this->setSrc0(insn, msg);
+ setTypedWriteMessage(insn, bti, msg_type, msg_length, header_present);
+ }
+ static void setScratchMessage(GenEncoder *p,
+ GenNativeInstruction *insn,
+ uint32_t offset,
+ uint32_t block_size,
+ uint32_t channel_mode,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
+ {
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+ p->setMessageDescriptor(insn, sfid, msg_length, response_length, true);
+ insn->bits3.gen7_scratch_rw.block_size = block_size;
+ insn->bits3.gen7_scratch_rw.msg_type = msg_type;
+ insn->bits3.gen7_scratch_rw.channel_mode = channel_mode;
+ insn->bits3.gen7_scratch_rw.offset = offset;
+ insn->bits3.gen7_scratch_rw.category = 1;
+ }
+
+ void GenEncoder::SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode)
+ {
+ assert(src_num == 1 || src_num ==2);
+ uint32_t block_size = src_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2;
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
+ // here src_num means register that will be write out: in terms of 32byte register number
+ setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_WRITE, src_num+1, 0);
+ }
+
+ void GenEncoder::SCRATCH_READ(GenRegister dst, GenRegister src, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode)
+ {
+ assert(dst_num == 1 || dst_num ==2);
+ uint32_t block_size = dst_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2;
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, dst);
+ this->setSrc0(insn, src);
+ this->setSrc1(insn, GenRegister::immud(0));
+ // here dst_num is the register that will be write-back: in terms of 32byte register
+ setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
+ }
+
+ void GenEncoder::EOT(uint32_t msg) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ this->setSrc0(insn, GenRegister::ud8grf(msg,0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ insn->header.execution_size = GEN_WIDTH_8;
+ insn->bits3.spawner_gen5.resource = GEN_DO_NOT_DEREFERENCE_URB;
+ insn->bits3.spawner_gen5.msg_length = 1;
+ insn->bits3.spawner_gen5.end_of_thread = 1;
+ insn->header.destreg_or_condmod = GEN_SFID_THREAD_SPAWNER;
+ }
+} /* namespace gbe */
+
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
new file mode 100644
index 0000000..d6e2b97
--- /dev/null
+++ b/backend/src/backend/gen_encoder.hpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith at tungstengraphics.com>
+ */
+
+#ifndef __GBE_GEN_ENCODER_HPP__
+#define __GBE_GEN_ENCODER_HPP__
+
+#include "backend/gen_defs.hpp"
+#include "backend/gen_register.hpp"
+#include "sys/platform.hpp"
+#include "sys/vector.hpp"
+#include <cassert>
+#include "src/cl_device_data.h"
+
+namespace gbe
+{
+ /*! Helper structure to emit Gen instructions */
+ class GenEncoder
+ {
+ public:
+ /*! simdWidth is the default width for the instructions */
+ GenEncoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID);
+
+ virtual ~GenEncoder(void) { }
+ /*! Size of the stack (should be large enough) */
+ enum { MAX_STATE_NUM = 16 };
+ /*! gen7 exec width of the double data type */
+ #define GEN7_DOUBLE_EXEC_WIDTH 8
+ /*! Push the current instruction state */
+ void push(void);
+ /*! Pop the latest pushed state */
+ void pop(void);
+ /*! The instruction stream we are building */
+ vector<GenInstruction> store;
+ /*! Current instruction state to use */
+ GenInstructionState curr;
+ /*! State used to encode the instructions */
+ GenInstructionState stack[MAX_STATE_NUM];
+ /*! Number of states currently pushed */
+ uint32_t stateNum;
+ /*! Gen generation to encode */
+ uint32_t gen;
+ /*! Device ID */
+ uint32_t deviceID;
+ /*! simd width for this codegen */
+ uint32_t simdWidth;
+ ////////////////////////////////////////////////////////////////////////
+ // Encoding functions
+ ////////////////////////////////////////////////////////////////////////
+
+#define ALU1(OP) void OP(GenRegister dest, GenRegister src0, uint32_t condition = 0);
+#define ALU2(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1);
+#define ALU2_MOD(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1, uint32_t condition = 0);
+#define ALU3(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2);
+ ALU1(MOV)
+ ALU1(FBH)
+ ALU1(FBL)
+ ALU2(SUBB)
+ ALU2(UPSAMPLE_SHORT)
+ ALU2(UPSAMPLE_INT)
+ ALU1(RNDZ)
+ ALU1(RNDE)
+ ALU1(RNDD)
+ ALU1(RNDU)
+ ALU1(F16TO32)
+ ALU1(F32TO16)
+ ALU2(SEL)
+ ALU1(NOT)
+ ALU2_MOD(AND)
+ ALU2_MOD(OR)
+ ALU2_MOD(XOR)
+ ALU2(SHR)
+ ALU2(SHL)
+ ALU2(RSR)
+ ALU2(RSL)
+ ALU2(ASR)
+ ALU2(ADD)
+ ALU2(ADDC)
+ ALU2(MUL)
+ ALU1(FRC)
+ ALU2(MAC)
+ ALU2(MACH)
+ ALU1(LZD)
+ ALU2(LINE)
+ ALU2(PLN)
+ ALU3(MAD)
+ //ALU2(MOV_DF);
+ ALU2(BRC)
+ ALU1(BRD)
+#undef ALU1
+#undef ALU2
+#undef ALU2_MOD
+#undef ALU3
+ /*! Get double/long exec width */
+ virtual int getDoubleExecWidth(void) { return GEN7_DOUBLE_EXEC_WIDTH; }
+ virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
+ virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
+ void LOAD_INT64_IMM(GenRegister dest, int64_t value);
+ /*! Barrier message (to synchronize threads of a workgroup) */
+ void BARRIER(GenRegister src);
+ /*! Memory fence message (to order loads and stores between threads) */
+ void FENCE(GenRegister dst);
+ /*! Jump indexed instruction */
+ virtual void JMPI(GenRegister src, bool longjmp = false);
+ /*! IF indexed instruction */
+ void IF(GenRegister src);
+ /*! ENDIF indexed instruction */
+ void ENDIF(GenRegister src);
+ /*! BRC indexed instruction */
+ void BRC(GenRegister src);
+ /*! BRD indexed instruction */
+ void BRD(GenRegister src);
+ /*! Compare instructions */
+ void CMP(uint32_t conditional, GenRegister src0, GenRegister src1, GenRegister dst = GenRegister::null());
+ /*! Select with embedded compare (like sel.le ...) */
+ void SEL_CMP(uint32_t conditional, GenRegister dst, GenRegister src0, GenRegister src1);
+ /*! EOT is used to finish GPGPU threads */
+ void EOT(uint32_t msg_nr);
+ /*! No-op */
+ void NOP(void);
+ /*! Wait instruction (used for the barrier) */
+ void WAIT(void);
+ /*! Atomic instructions */
+ virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
+ /*! Untyped read (upto 4 channels) */
+ virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+ /*! Untyped write (upto 4 channels) */
+ virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+ /*! Byte gather (for unaligned bytes, shorts and ints) */
+ void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
+ /*! Byte scatter (for unaligned bytes, shorts and ints) */
+ void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
+ /*! DWord gather (for constant cache read) */
+ void DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti);
+ /*! for scratch memory read */
+ void SCRATCH_READ(GenRegister msg, GenRegister dst, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode);
+ /*! for scratch memory write */
+ void SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode);
+ /*! Send instruction for the sampler */
+ void SAMPLE(GenRegister dest,
+ GenRegister msg,
+ unsigned int msg_len,
+ bool header_present,
+ unsigned char bti,
+ unsigned char sampler,
+ unsigned int simdWidth,
+ uint32_t writemask,
+ uint32_t return_format,
+ bool isLD,
+ bool isUniform);
+
+ /*! TypedWrite instruction for texture */
+ virtual void TYPED_WRITE(GenRegister header,
+ bool header_present,
+ unsigned char bti);
+ /*! Extended math function (2 sources) */
+ void MATH(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1);
+ /*! Extended math function (1 source) */
+ void MATH(GenRegister dst, uint32_t function, GenRegister src);
+
+ /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
+ virtual void patchJMPI(uint32_t insnID, int32_t jumpDistance);
+
+ ////////////////////////////////////////////////////////////////////////
+ // Helper functions to encode
+ ////////////////////////////////////////////////////////////////////////
+ virtual void setHeader(GenNativeInstruction *insn);
+ virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
+ uint32_t msg_type, uint32_t msg_length,
+ uint32_t response_length);
+ virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+ unsigned char msg_type, uint32_t msg_length,
+ bool header_present);
+ void setMessageDescriptor(GenNativeInstruction *inst, enum GenMessageTarget sfid,
+ unsigned msg_length, unsigned response_length,
+ bool header_present = false, bool end_of_thread = false);
+ void setDst(GenNativeInstruction *insn, GenRegister dest);
+ void setSrc0(GenNativeInstruction *insn, GenRegister reg);
+ void setSrc1(GenNativeInstruction *insn, GenRegister reg);
+ GenCompactInstruction *nextCompact(uint32_t opcode);
+ GenNativeInstruction *next(uint32_t opcode);
+ uint32_t n_instruction(void) const { return store.size(); }
+ GBE_CLASS(GenEncoder); //!< Use custom allocators
+ };
+
+ void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst,
+ GenRegister src, uint32_t condition = 0);
+
+ void alu2(GenEncoder *p, uint32_t opcode, GenRegister dst,
+ GenRegister src0, GenRegister src1, uint32_t condition = 0);
+} /* namespace gbe */
+
+#endif /* __GBE_GEN_ENCODER_HPP__ */
+
+
diff --git a/backend/src/backend/gen_insn_compact.cpp b/backend/src/backend/gen_insn_compact.cpp
new file mode 100644
index 0000000..f19c364
--- /dev/null
+++ b/backend/src/backend/gen_insn_compact.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Ruiling Song <ruiling.song at intel.com>
+ */
+#include "backend/gen_defs.hpp"
+#include "backend/gen_encoder.hpp"
+#include <cstring>
+
+namespace gbe {
+
+ struct compact_table_entry {
+ uint32_t bit_pattern;
+ uint32_t index;
+ };
+
+ static compact_table_entry control_table[] = {
+ {0b0000000000000000010, 0},
+ {0b0000100000000000000, 1},
+ {0b0000100000000000001, 2},
+ {0b0000100000000000010, 3},
+ {0b0000100000000000011, 4},
+ {0b0000100000000000100, 5},
+ {0b0000100000000000101, 6},
+ {0b0000100000000000111, 7},
+ {0b0000100000000001000, 8},
+ {0b0000100000000001001, 9},
+ {0b0000100000000001101, 10},
+ {0b0000110000000000000, 11},
+ {0b0000110000000000001, 12},
+ {0b0000110000000000010, 13},
+ {0b0000110000000000011, 14},
+ {0b0000110000000000100, 15},
+ {0b0000110000000000101, 16},
+ {0b0000110000000000111, 17},
+ {0b0000110000000001001, 18},
+ {0b0000110000000001101, 19},
+ {0b0000110000000010000, 20},
+ {0b0000110000100000000, 21},
+ {0b0001000000000000000, 22},
+ {0b0001000000000000010, 23},
+ {0b0001000000000000100, 24},
+ {0b0001000000100000000, 25},
+ {0b0010110000000000000, 26},
+ {0b0010110000000010000, 27},
+ {0b0011000000000000000, 28},
+ {0b0011000000100000000, 29},
+ {0b0101000000000000000, 30},
+ {0b0101000000100000000, 31},
+ };
+
+ static compact_table_entry data_type_table[] = {
+ {0b000000001000001100, 20},
+ {0b001000000000000001, 0},
+ {0b001000000000100000, 1},
+ {0b001000000000100001, 2},
+ {0b001000000000111101, 21},
+ {0b001000000001100001, 3},
+ {0b001000000010100101, 22},
+ {0b001000000010111101, 4},
+ {0b001000001011111101, 5},
+ {0b001000001110100001, 6},
+ {0b001000001110100101, 7},
+ {0b001000001110111101, 8},
+ {0b001000010000100000, 23},
+ {0b001000010000100001, 9},
+ {0b001000110000100000, 10},
+ {0b001000110000100001, 11},
+ {0b001001010010100100, 24},
+ {0b001001010010100101, 12},
+ {0b001001110010000100, 25},
+ {0b001001110010100100, 13},
+ {0b001001110010100101, 14},
+ {0b001010010100001001, 26},
+ {0b001010010100101000, 30},
+ {0b001010110100101000, 31},
+ {0b001011110110101100, 29},
+ {0b001101111110111101, 27},
+ {0b001111001110111101, 15},
+ {0b001111011110011101, 16},
+ {0b001111011110111100, 17},
+ {0b001111011110111101, 18},
+ {0b001111111110111100, 19},
+ {0b001111111110111101, 28},
+ };
+
+ static compact_table_entry data_type_decompact[] = {
+ {0b001000000000000001, 0},
+ {0b001000000000100000, 1},
+ {0b001000000000100001, 2},
+ {0b001000000001100001, 3},
+ {0b001000000010111101, 4},
+ {0b001000001011111101, 5},
+ {0b001000001110100001, 6},
+ {0b001000001110100101, 7},
+ {0b001000001110111101, 8},
+ {0b001000010000100001, 9},
+ {0b001000110000100000, 10},
+ {0b001000110000100001, 11},
+ {0b001001010010100101, 12},
+ {0b001001110010100100, 13},
+ {0b001001110010100101, 14},
+ {0b001111001110111101, 15},
+ {0b001111011110011101, 16},
+ {0b001111011110111100, 17},
+ {0b001111011110111101, 18},
+ {0b001111111110111100, 19},
+ {0b000000001000001100, 20},
+ {0b001000000000111101, 21},
+ {0b001000000010100101, 22},
+ {0b001000010000100000, 23},
+ {0b001001010010100100, 24},
+ {0b001001110010000100, 25},
+ {0b001010010100001001, 26},
+ {0b001101111110111101, 27},
+ {0b001111111110111101, 28},
+ {0b001011110110101100, 29},
+ {0b001010010100101000, 30},
+ {0b001010110100101000, 31},
+ };
+
+ static compact_table_entry subreg_table[] = {
+ {0b000000000000000, 0},
+ {0b000000000000001, 1},
+ {0b000000000001000, 2},
+ {0b000000000001111, 3},
+ {0b000000000010000, 4},
+ {0b000000010000000, 5},
+ {0b000000100000000, 6},
+ {0b000000110000000, 7},
+ {0b000001000000000, 8},
+ {0b000001000010000, 9},
+ {0b000001010000000, 10},
+ {0b001000000000000, 11},
+ {0b001000000000001, 12},
+ {0b001000010000001, 13},
+ {0b001000010000010, 14},
+ {0b001000010000011, 15},
+ {0b001000010000100, 16},
+ {0b001000010000111, 17},
+ {0b001000010001000, 18},
+ {0b001000010001110, 19},
+ {0b001000010001111, 20},
+ {0b001000110000000, 21},
+ {0b001000111101000, 22},
+ {0b010000000000000, 23},
+ {0b010000110000000, 24},
+ {0b011000000000000, 25},
+ {0b011110010000111, 26},
+ {0b100000000000000, 27},
+ {0b101000000000000, 28},
+ {0b110000000000000, 29},
+ {0b111000000000000, 30},
+ {0b111000000011100, 31},
+ };
+
+ static compact_table_entry srcreg_table[] = {
+ {0b000000000000, 0},
+ {0b000000000010, 1},
+ {0b000000010000, 2},
+ {0b000000010010, 3},
+ {0b000000011000, 4},
+ {0b000000100000, 5},
+ {0b000000101000, 6},
+ {0b000001001000, 7},
+ {0b000001010000, 8},
+ {0b000001110000, 9},
+ {0b000001111000, 10},
+ {0b001100000000, 11},
+ {0b001100000010, 12},
+ {0b001100001000, 13},
+ {0b001100010000, 14},
+ {0b001100010010, 15},
+ {0b001100100000, 16},
+ {0b001100101000, 17},
+ {0b001100111000, 18},
+ {0b001101000000, 19},
+ {0b001101000010, 20},
+ {0b001101001000, 21},
+ {0b001101010000, 22},
+ {0b001101100000, 23},
+ {0b001101101000, 24},
+ {0b001101110000, 25},
+ {0b001101110001, 26},
+ {0b001101111000, 27},
+ {0b010001101000, 28},
+ {0b010001101001, 29},
+ {0b010001101010, 30},
+ {0b010110001000, 31},
+ };
+
+ static int cmp_key(const void *p1, const void*p2) {
+ const compact_table_entry * px = (compact_table_entry *)p1;
+ const compact_table_entry * py = (compact_table_entry *)p2;
+ return (px->bit_pattern) - py->bit_pattern;
+ }
+ union ControlBits{
+ struct {
+ uint32_t access_mode:1;
+ uint32_t mask_control:1;
+ uint32_t dependency_control:2;
+ uint32_t quarter_control:2;
+ uint32_t thread_control:2;
+ uint32_t predicate_control:4;
+ uint32_t predicate_inverse:1;
+ uint32_t execution_size:3;
+ uint32_t saturate:1;
+ uint32_t flag_sub_reg_nr:1;
+ uint32_t flag_reg_nr:1;
+ uint32_t pad:23;
+ };
+ uint32_t data;
+ };
+ union DataTypeBits{
+ struct {
+ uint32_t dest_reg_file:2;
+ uint32_t dest_reg_type:3;
+ uint32_t src0_reg_file:2;
+ uint32_t src0_reg_type:3;
+ uint32_t src1_reg_file:2;
+ uint32_t src1_reg_type:3;
+ uint32_t dest_horiz_stride:2;
+ uint32_t dest_address_mode:1;
+ uint32_t pad:14;
+ };
+ uint32_t data;
+ };
+ union SubRegBits {
+ struct {
+ uint32_t dest_subreg_nr:5;
+ uint32_t src0_subreg_nr:5;
+ uint32_t src1_subreg_nr:5;
+ uint32_t pad:17;
+ };
+ uint32_t data;
+ };
+ union SrcRegBits {
+ struct {
+ uint32_t src_abs:1;
+ uint32_t src_negate:1;
+ uint32_t src_address_mode:1;
+ uint32_t src_horiz_stride:2;
+ uint32_t src_width:3;
+ uint32_t src_vert_stride:4;
+ uint32_t pad:20;
+ };
+ uint32_t data;
+ };
+
+ void decompactInstruction(GenCompactInstruction * p, GenNativeInstruction *pOut) {
+
+ memset(pOut, 0, sizeof(GenNativeInstruction));
+ union ControlBits control_bits;
+ control_bits.data = control_table[(uint32_t)p->bits1.control_index].bit_pattern;
+ pOut->low.low = (uint32_t)p->bits1.opcode | ((control_bits.data & 0xffff) << 8);
+ pOut->header.destreg_or_condmod = p->bits1.destreg_or_condmod;
+ pOut->header.saturate = control_bits.saturate;
+ pOut->header.acc_wr_control = p->bits1.acc_wr_control;
+ pOut->header.cmpt_control = p->bits1.cmpt_control;
+ pOut->header.debug_control = p->bits1.debug_control;
+
+ union DataTypeBits data_type_bits;
+ union SubRegBits subreg_bits;
+ union SrcRegBits src0_bits;
+ data_type_bits.data = data_type_decompact[(uint32_t)p->bits1.data_type_index].bit_pattern;
+ subreg_bits.data = subreg_table[(uint32_t)p->bits1.sub_reg_index].bit_pattern;
+ src0_bits.data = srcreg_table[p->bits1.src0_index_lo | p->bits2.src0_index_hi << 2].bit_pattern;
+
+ pOut->low.high |= data_type_bits.data & 0x7fff;
+ pOut->bits1.da1.dest_horiz_stride = data_type_bits.dest_horiz_stride;
+ pOut->bits1.da1.dest_address_mode = data_type_bits.dest_address_mode;
+ pOut->bits1.da1.dest_reg_nr = p->bits2.dest_reg_nr;
+ pOut->bits1.da1.dest_subreg_nr = subreg_bits.dest_subreg_nr;
+
+ pOut->bits2.da1.src0_subreg_nr = subreg_bits.src0_subreg_nr;
+ pOut->bits2.da1.src0_reg_nr = p->bits2.src0_reg_nr;
+ pOut->high.low |= (src0_bits.data << 13);
+ pOut->bits2.da1.flag_sub_reg_nr = control_bits.flag_sub_reg_nr;
+ pOut->bits2.da1.flag_reg_nr = control_bits.flag_reg_nr;
+
+ if(data_type_bits.src1_reg_file == GEN_IMMEDIATE_VALUE) {
+ uint32_t imm = (uint32_t)p->bits2.src1_reg_nr | (p->bits2.src1_index<<8);
+ pOut->bits3.ud = imm & 0x1000 ? (imm | 0xfffff000) : imm;
+ } else {
+ union SrcRegBits src1_bits;
+ src1_bits.data = srcreg_table[p->bits2.src1_index].bit_pattern;
+ pOut->bits3.da1.src1_subreg_nr = subreg_bits.src1_subreg_nr;
+ pOut->bits3.da1.src1_reg_nr = p->bits2.src1_reg_nr;
+ pOut->high.high |= (src1_bits.data << 13);
+ }
+ }
+
+ int compactControlBits(GenEncoder *p, uint32_t quarter, uint32_t execWidth) {
+
+ const GenInstructionState *s = &p->curr;
+ // some quick check
+ if(s->nibControl != 0)
+ return -1;
+ if(s->predicate > GEN_PREDICATE_NORMAL)
+ return -1;
+ if(s->flag == 1)
+ return -1;
+
+ ControlBits b;
+ b.data = 0;
+
+ if (execWidth == 8)
+ b.execution_size = GEN_WIDTH_8;
+ else if (execWidth == 16)
+ b.execution_size = GEN_WIDTH_16;
+ else if (execWidth == 4)
+ b.execution_size = GEN_WIDTH_4;
+ else if (execWidth == 1)
+ b.execution_size = GEN_WIDTH_1;
+ else
+ NOT_IMPLEMENTED;
+
+ b.mask_control = s->noMask;
+ b.quarter_control = quarter;
+ b.predicate_control = s->predicate;
+ b.predicate_inverse = s->inversePredicate;
+
+ b.saturate = s->saturate;
+ b.flag_sub_reg_nr = s->subFlag;
+ b.flag_reg_nr = s->flag;
+
+ compact_table_entry key;
+ key.bit_pattern = b.data;
+
+ compact_table_entry *r = (compact_table_entry *)bsearch(&key, control_table,
+ sizeof(control_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+ if (r == NULL)
+ return -1;
+ return r->index;
+ }
+
+ int compactDataTypeBits(GenEncoder *p, GenRegister *dst, GenRegister *src0, GenRegister *src1) {
+
+ // compact does not support any indirect acess
+ if(dst->address_mode != GEN_ADDRESS_DIRECT)
+ return -1;
+
+ if(src0->file == GEN_IMMEDIATE_VALUE)
+ return -1;
+
+ DataTypeBits b;
+ b.data = 0;
+
+ b.dest_horiz_stride = dst->hstride == GEN_HORIZONTAL_STRIDE_0 ? GEN_HORIZONTAL_STRIDE_1 : dst->hstride;
+ b.dest_address_mode = dst->address_mode;
+ b.dest_reg_file = dst->file;
+ b.dest_reg_type = dst->type;
+
+ b.src0_reg_file = src0->file;
+ b.src0_reg_type = src0->type;
+
+ if(src1) {
+ b.src1_reg_type = src1->type;
+ b.src1_reg_file = src1->file;
+ } else {
+ // default to zero
+ b.src1_reg_type = 0;
+ b.src1_reg_file = 0;
+ }
+
+ compact_table_entry key;
+ key.bit_pattern = b.data;
+
+ compact_table_entry *r = (compact_table_entry *)bsearch(&key, data_type_table,
+ sizeof(data_type_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+ if (r == NULL)
+ return -1;
+ return r->index;
+ }
+ int compactSubRegBits(GenEncoder *p, GenRegister *dst, GenRegister *src0, GenRegister *src1) {
+ SubRegBits b;
+ b.data = 0;
+ b.dest_subreg_nr = dst->subnr;
+ b.src0_subreg_nr = src0->subnr;
+ if(src1)
+ b.src1_subreg_nr = src1->subnr;
+ else
+ b.src1_subreg_nr = 0;
+
+ compact_table_entry key;
+ key.bit_pattern = b.data;
+
+ compact_table_entry *r = (compact_table_entry *)bsearch(&key, subreg_table,
+ sizeof(subreg_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+ if (r == NULL)
+ return -1;
+ return r->index;
+ }
+ int compactSrcRegBits(GenEncoder *p, GenRegister *src) {
+ // As we only use GEN_ALIGN_1 and compact only support direct register access,
+ // we only need to verify [hstride, width, vstride]
+ if(src->file == GEN_IMMEDIATE_VALUE)
+ return -1;
+ if(src->address_mode != GEN_ADDRESS_DIRECT)
+ return -1;
+
+ SrcRegBits b;
+ b.data = 0;
+ b.src_abs = src->absolute;
+ b.src_negate = src->negation;
+ b.src_address_mode = src->address_mode;
+ if(p->curr.execWidth == 1 && src->width == GEN_WIDTH_1) {
+ b.src_width = src->width;
+ b.src_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+ b.src_vert_stride = GEN_VERTICAL_STRIDE_0;
+ }
+ else {
+ b.src_horiz_stride = src->hstride;
+ b.src_width = src->width;
+ b.src_vert_stride = src->vstride;
+ }
+ compact_table_entry key;
+ key.bit_pattern = b.data;
+
+ compact_table_entry *r = (compact_table_entry *)bsearch(&key, srcreg_table,
+ sizeof(srcreg_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+ if (r == NULL)
+ return -1;
+ return r->index;
+ }
+
+ bool compactAlu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src, uint32_t condition, bool split) {
+ if(split) {
+ // TODO support it
+ return false;
+ } else {
+ int control_index = compactControlBits(p, p->curr.quarterControl, p->curr.execWidth);
+ if(control_index == -1) return false;
+
+ int data_type_index = compactDataTypeBits(p, &dst, &src, NULL);
+ if(data_type_index == -1) return false;
+
+ int sub_reg_index = compactSubRegBits(p, &dst, &src, NULL);
+ if(sub_reg_index == -1) return false;
+
+ int src_reg_index = compactSrcRegBits(p, &src);
+ if(src_reg_index == -1) return false;
+
+ GenCompactInstruction * insn = p->nextCompact(opcode);
+ insn->bits1.control_index = control_index;
+ insn->bits1.data_type_index = data_type_index;
+ insn->bits1.sub_reg_index = sub_reg_index;
+ insn->bits1.acc_wr_control = p->curr.accWrEnable;
+ insn->bits1.destreg_or_condmod = condition;
+ insn->bits1.cmpt_control = 1;
+ insn->bits1.src0_index_lo = src_reg_index & 3;
+
+ insn->bits2.src0_index_hi = src_reg_index >> 2;
+ insn->bits2.src1_index = 0;
+ insn->bits2.dest_reg_nr = dst.nr;
+ insn->bits2.src0_reg_nr = src.nr;
+ insn->bits2.src1_reg_nr = 0;
+ return true;
+ }
+ }
+
+ bool compactAlu2(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1, uint32_t condition, bool split) {
+ if(split) {
+ // TODO support it
+ return false;
+ } else {
+ if(opcode == GEN_OPCODE_IF || opcode == GEN_OPCODE_ENDIF || opcode == GEN_OPCODE_JMPI) return false;
+
+ int control_index = compactControlBits(p, p->curr.quarterControl, p->curr.execWidth);
+ if(control_index == -1) return false;
+
+ int data_type_index = compactDataTypeBits(p, &dst, &src0, &src1);
+ if(data_type_index == -1) return false;
+
+ int sub_reg_index = compactSubRegBits(p, &dst, &src0, &src1);
+ if(sub_reg_index == -1) return false;
+
+ int src0_reg_index = compactSrcRegBits(p, &src0);
+ if(src0_reg_index == -1) return false;
+
+ bool src1_imm = false;
+ int src1_reg_index;
+ if(src1.file == GEN_IMMEDIATE_VALUE) {
+ if(src1.absolute != 0 || src1.negation != 0 || src1.type == GEN_TYPE_F)
+ return false;
+ if(src1.value.d < -4096 || src1.value.d > 4095) // 13bit signed imm
+ return false;
+ src1_imm = true;
+ } else {
+ src1_reg_index = compactSrcRegBits(p, &src1);
+ if(src1_reg_index == -1) return false;
+ }
+ GenCompactInstruction * insn = p->nextCompact(opcode);
+ insn->bits1.control_index = control_index;
+ insn->bits1.data_type_index = data_type_index;
+ insn->bits1.sub_reg_index = sub_reg_index;
+ insn->bits1.acc_wr_control = p->curr.accWrEnable;
+ insn->bits1.destreg_or_condmod = condition;
+ insn->bits1.cmpt_control = 1;
+ insn->bits1.src0_index_lo = src0_reg_index & 3;
+
+ insn->bits2.src0_index_hi = src0_reg_index >> 2;
+ insn->bits2.src1_index = src1_imm ? (src1.value.ud & 8191)>> 8 : src1_reg_index;
+ insn->bits2.dest_reg_nr = dst.nr;
+ insn->bits2.src0_reg_nr = src0.nr;
+ insn->bits2.src1_reg_nr = src1_imm ? (src1.value.ud & 0xff): src1.nr;
+ return true;
+ }
+ }
+};
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
new file mode 100644
index 0000000..8535b4a
--- /dev/null
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -0,0 +1,42 @@
+// Family Latency SIMD16 SIMD8
+DECL_GEN7_SCHEDULE(Label, 0, 0, 0)
+DECL_GEN7_SCHEDULE(Unary, 20, 4, 2)
+DECL_GEN7_SCHEDULE(UnaryWithTemp, 20, 40, 20)
+DECL_GEN7_SCHEDULE(Binary, 20, 4, 2)
+DECL_GEN7_SCHEDULE(BinaryWithTemp, 20, 40, 20)
+DECL_GEN7_SCHEDULE(Ternary, 20, 4, 2)
+DECL_GEN7_SCHEDULE(I64Shift, 20, 40, 20)
+DECL_GEN7_SCHEDULE(I64HADD, 20, 40, 20)
+DECL_GEN7_SCHEDULE(I64RHADD, 20, 40, 20)
+DECL_GEN7_SCHEDULE(I64ToFloat, 20, 40, 20)
+DECL_GEN7_SCHEDULE(FloatToI64, 20, 40, 20)
+DECL_GEN7_SCHEDULE(I64MULHI, 20, 40, 20)
+DECL_GEN7_SCHEDULE(I64MADSAT, 20, 40, 20)
+DECL_GEN7_SCHEDULE(Compare, 20, 4, 2)
+DECL_GEN7_SCHEDULE(I64Compare, 20, 80, 20)
+DECL_GEN7_SCHEDULE(I64DIVREM, 20, 80, 20)
+DECL_GEN7_SCHEDULE(Jump, 14, 1, 1)
+DECL_GEN7_SCHEDULE(IndirectMove, 20, 2, 2)
+DECL_GEN7_SCHEDULE(Eot, 20, 1, 1)
+DECL_GEN7_SCHEDULE(NoOp, 20, 2, 2)
+DECL_GEN7_SCHEDULE(Wait, 20, 2, 2)
+DECL_GEN7_SCHEDULE(Math, 20, 4, 2)
+DECL_GEN7_SCHEDULE(Barrier, 80, 1, 1)
+DECL_GEN7_SCHEDULE(Fence, 80, 1, 1)
+DECL_GEN7_SCHEDULE(Read64, 80, 1, 1)
+DECL_GEN7_SCHEDULE(Write64, 80, 1, 1)
+DECL_GEN7_SCHEDULE(UntypedRead, 160, 1, 1)
+DECL_GEN7_SCHEDULE(UntypedWrite, 160, 1, 1)
+DECL_GEN7_SCHEDULE(ByteGather, 160, 1, 1)
+DECL_GEN7_SCHEDULE(ByteScatter, 160, 1, 1)
+DECL_GEN7_SCHEDULE(DWordGather, 160, 1, 1)
+DECL_GEN7_SCHEDULE(PackByte, 40, 1, 1)
+DECL_GEN7_SCHEDULE(UnpackByte, 40, 1, 1)
+DECL_GEN7_SCHEDULE(Sample, 160, 1, 1)
+DECL_GEN7_SCHEDULE(TypedWrite, 80, 1, 1)
+DECL_GEN7_SCHEDULE(SpillReg, 20, 1, 1)
+DECL_GEN7_SCHEDULE(UnSpillReg, 160, 1, 1)
+DECL_GEN7_SCHEDULE(Atomic, 80, 1, 1)
+DECL_GEN7_SCHEDULE(I64MUL, 20, 40, 20)
+DECL_GEN7_SCHEDULE(I64SATADD, 20, 40, 20)
+DECL_GEN7_SCHEDULE(I64SATSUB, 20, 40, 20)
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
new file mode 100644
index 0000000..106d608
--- /dev/null
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -0,0 +1,722 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_insn_scheduling.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Overall idea:
+ * =============
+ *
+ * This is the instruction scheduling part of the code. With Gen, we actually
+ * have a simple strategy to follow. Indeed, here are the constraints:
+ *
+ * 1 - the number of registers per HW thread is constant and given (128 32 bytes
+ * GRF per thread). So, we can use all these registers with no penalty
+ * 2 - spilling is super bad. Instruction latency matters but the top priority
+ * is to avoid as much as possible spilling
+ *
+ *
+ * We schedule twice using at each time a local forward list scheduler
+ *
+ * Before the register allocation
+ * ==============================
+ *
+ * We try to limit the register pressure.
+ * Well, this is a hard problem and we have a decent strategy now that we called
+ * "zero cycled LIFO scheduling".
+ * We use a local forward list scheduling and we schedule the instructions in a
+ * LIFO order i.e. as a stack. Basically, we take the most recent instruction
+ * and schedule it right away. Obviously we ignore completely the real latencies
+ * and throuputs and just simulate instructions that are issued and completed in
+ * zero cycle. For the complex kernels we already have (like menger sponge),
+ * this provides a pretty good strategy enabling SIMD16 code generation where
+ * when scheduling is deactivated, even SIMD8 fails
+ *
+ * One may argue that this strategy is bad, latency wise. This is not true since
+ * the register allocator will anyway try to burn as many registers as possible.
+ * So, there is still opportunities to schedule after register allocation.
+ *
+ * Our idea seems to work decently. There is however a strong research article
+ * that is able to near-optimally reschudle the instructions to minimize
+ * register use. This is:
+ *
+ * "Minimum Register Instruction Sequence Problem: Revisiting Optimal Code
+ * Generation for DAGs"
+ *
+ * After the register allocation
+ * ==============================
+ *
+ * This is here a pretty simple strategy based on a regular forward list
+ * scheduling. Since Gen is a co-issue based machine, this is useless to take
+ * into account really precise timings since instruction issues will happen
+ * out-of-order based on other thread executions.
+ *
+ * Note that we over-simplify the problem. Indeed, Gen register file is flexible
+ * and we are able to use sub-registers of GRF in particular when we handle
+ * uniforms or mask registers which are spilled in GRFs. Thing is that two
+ * uniforms may not interfere even if they belong to the same GRF (i.e. they use
+ * two different sub-registers). This means that the interference relation is
+ * not transitive for Gen. To simplify everything, we just take consider full
+ * GRFs (in SIMD8) or double full GRFs (in SIMD16) regardless of the fact this
+ * is a uniform, a mask or a regular GRF.
+ *
+ * Obviously, this leads to extra dependencies in the code.
+ */
+
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "sys/cvar.hpp"
+#include "sys/intrusive_list.hpp"
+
+namespace gbe
+{
+ // Helper structure to schedule the basic blocks
+ struct SelectionScheduler;
+
+ // Node for the schedule DAG
+ struct ScheduleDAGNode;
+
+ typedef enum {
+ WRITE_AFTER_WRITE,
+ WRITE_AFTER_READ,
+ READ_AFTER_WRITE,
+ READ_AFTER_WRITE_MEMORY
+ } DepMode;
+
+ /*! We need to chain together the node we point */
+ struct ScheduleListNode : public intrusive_list_node
+ {
+ INLINE ScheduleListNode(ScheduleDAGNode *node, DepMode m = READ_AFTER_WRITE) : node(node), depMode(m) {}
+ ScheduleDAGNode *node;
+ DepMode depMode;
+ };
+
+ /*! Node of the DAG */
+ struct ScheduleDAGNode
+ {
+ INLINE ScheduleDAGNode(SelectionInstruction &insn) :
+ insn(insn), refNum(0), retiredCycle(0), preRetired(false), readDistance(0x7fffffff) {}
+ bool dependsOn(ScheduleDAGNode *node) const {
+ GBE_ASSERT(node != NULL);
+ for (auto child : node->children)
+ if (child.node == this)
+ return true;
+ return false;
+ }
+ /*! Children that depends on us */
+ intrusive_list<ScheduleListNode> children;
+ /*! Instruction after code selection */
+ SelectionInstruction &insn;
+ /*! Number of nodes that point to us (i.e. nodes we depend on) */
+ uint32_t refNum;
+ /*! Cycle when the instruction is retired */
+ uint32_t retiredCycle;
+ bool preRetired;
+ uint32_t readDistance;
+ };
+
+ /*! To track loads and stores */
+ enum GenMemory : uint8_t {
+ GLOBAL_MEMORY = 0,
+ LOCAL_MEMORY,
+ SCRATCH_MEMORY,
+ MAX_MEM_SYSTEM
+ };
+
+ /*! Do we allocate after or before the register allocation? */
+ enum SchedulePolicy {
+ PRE_ALLOC = 0, // LIFO scheduling (tends to limit register pressure)
+ POST_ALLOC // FIFO scheduling (limits latency problems)
+ };
+
+ /*! Helper structure to handle dependencies while scheduling. Takes into
+ * account virtual and physical registers and memory sub-systems
+ */
+ struct DependencyTracker : public NonCopyable
+ {
+ DependencyTracker(const Selection &selection, SelectionScheduler &scheduler);
+ /*! Reset it before scheduling a new block */
+ void clear(bool fullClear = false);
+ /*! Get an index in the node array for the given register */
+ uint32_t getIndex(GenRegister reg) const;
+ /*! Get an index in the node array for the given memory system */
+ uint32_t getIndex(uint32_t bti) const;
+ /*! Add a new dependency "node0 depends on node1" */
+ void addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1, DepMode m);
+ /*! Add a new dependency "node0 depends on node located at index" */
+ void addDependency(ScheduleDAGNode *node0, uint32_t index, DepMode m);
+ /*! Add a new dependency "node located at index depends on node0" */
+ void addDependency(uint32_t index, ScheduleDAGNode *node0, DepMode m);
+ /*! No dependency for null registers and immediate */
+ INLINE bool ignoreDependency(GenRegister reg) const {
+ if (reg.file == GEN_IMMEDIATE_VALUE)
+ return true;
+ else if (reg.file == GEN_ARCHITECTURE_REGISTER_FILE) {
+ if ((reg.nr & 0xf0) == GEN_ARF_NULL)
+ return true;
+ }
+ return false;
+ }
+ /*! Owns the tracker */
+ SelectionScheduler &scheduler;
+ /*! Add a new dependency "node0 depends on node set for register reg" */
+ void addDependency(ScheduleDAGNode *node0, GenRegister reg, DepMode m);
+ /*! Add a new dependency "node set for register reg depends on node0" */
+ void addDependency(GenRegister reg, ScheduleDAGNode *node0, DepMode m);
+ /*! Make the node located at insnID a barrier */
+ void makeBarrier(int32_t insnID, int32_t insnNum);
+ /*! Update all the writes (memory, predicates, registers) */
+ void updateWrites(ScheduleDAGNode *node);
+ /*! Maximum number of *physical* flag registers */
+ static const uint32_t MAX_FLAG_REGISTER = 8u;
+ /*! Maximum number of *physical* accumulators registers */
+ static const uint32_t MAX_ACC_REGISTER = 1u;
+ /*! Stores the last node that wrote to a register / memory ... */
+ vector<ScheduleDAGNode*> nodes;
+ /*! store nodes each node depends on */
+ map<ScheduleDAGNode *, vector<ScheduleDAGNode*>> deps;
+ /*! Stores the nodes per instruction */
+ vector<ScheduleDAGNode*> insnNodes;
+ /*! Number of virtual register in the selection */
+ uint32_t grfNum;
+ };
+
+ /*! Perform the instruction scheduling */
+ struct SelectionScheduler : public NonCopyable
+ {
+ /*! Init the book keeping structures */
+ SelectionScheduler(GenContext &ctx, Selection &selection, SchedulePolicy policy);
+ /*! Make all lists empty */
+ void clearLists(void);
+ /*! Return the number of instructions to schedule in the DAG */
+ int32_t buildDAG(SelectionBlock &bb);
+ /*! traverse read node and update read distance for all the child. */
+ void traverseReadNode(ScheduleDAGNode *node, uint32_t degree = 0);
+ /*! Schedule the DAG, pre register allocation and post register allocation. */
+ void preScheduleDAG(SelectionBlock &bb, int32_t insnNum);
+ void postScheduleDAG(SelectionBlock &bb, int32_t insnNum);
+ /*! To limit register pressure or limit insn latency problems */
+ SchedulePolicy policy;
+ /*! Make ScheduleListNode allocation faster */
+ DECL_POOL(ScheduleListNode, listPool);
+ /*! Make ScheduleDAGNode allocation faster */
+ DECL_POOL(ScheduleDAGNode, nodePool);
+ /*! Ready list is instructions that can be scheduled */
+ intrusive_list<ScheduleListNode> ready;
+ /*! Active list is instructions that are executing */
+ intrusive_list<ScheduleListNode> active;
+ /*! Handle complete compilation */
+ GenContext &ctx;
+ /*! Code to schedule */
+ Selection &selection;
+ /*! To help tracking dependencies */
+ DependencyTracker tracker;
+ };
+
+ DependencyTracker::DependencyTracker(const Selection &selection, SelectionScheduler &scheduler) :
+ scheduler(scheduler)
+ {
+ if (scheduler.policy == PRE_ALLOC) {
+ this->grfNum = selection.getRegNum();
+ nodes.resize(grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER + MAX_MEM_SYSTEM);
+ } else {
+ const uint32_t simdWidth = scheduler.ctx.getSimdWidth();
+ GBE_ASSERT(simdWidth == 8 || simdWidth == 16);
+ this->grfNum = simdWidth == 8 ? 128 : 64;
+ nodes.resize(grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER + MAX_MEM_SYSTEM);
+ }
+ insnNodes.resize(selection.getLargestBlockSize());
+ }
+
+ void DependencyTracker::clear(bool fullClear) { for (auto &x : nodes) x = NULL; if (fullClear) deps.clear(); }
+ void DependencyTracker::addDependency(ScheduleDAGNode *node0, GenRegister reg, DepMode m) {
+ if (this->ignoreDependency(reg) == false) {
+ const uint32_t index = this->getIndex(reg);
+ this->addDependency(node0, index, m);
+ if (scheduler.policy == POST_ALLOC && (reg.isdf() || reg.isint64()))
+ this->addDependency(node0, index + 1, m);
+ }
+ }
+
+ void DependencyTracker::addDependency(GenRegister reg, ScheduleDAGNode *node0, DepMode m) {
+ if (this->ignoreDependency(reg) == false) {
+ const uint32_t index = this->getIndex(reg);
+ this->addDependency(index, node0, m);
+ if (scheduler.policy == POST_ALLOC && (reg.isdf() || reg.isint64()))
+ this->addDependency(index + 1, node0, m);
+ }
+ }
+
+ void DependencyTracker::addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1, DepMode depMode) {
+ if (node0 != NULL && node1 != NULL && node0 != node1 && node0->dependsOn(node1) == false) {
+ if (node1->insn.isRead())
+ depMode = depMode == READ_AFTER_WRITE ? READ_AFTER_WRITE_MEMORY : depMode;
+ ScheduleListNode *dep = scheduler.newScheduleListNode(node0, depMode);
+ node0->refNum++;
+ node1->children.push_back(dep);
+ auto it = deps.find(node0);
+ if (it != deps.end()) {
+ it->second.push_back(node1);
+ } else {
+ vector<ScheduleDAGNode*> vn;
+ vn.push_back(node1);
+ deps.insert(std::make_pair(node0, vn));
+ }
+ }
+ }
+
+ void DependencyTracker::addDependency(ScheduleDAGNode *node, uint32_t index, DepMode m) {
+ this->addDependency(node, this->nodes[index], m);
+ }
+
+ void DependencyTracker::addDependency(uint32_t index, ScheduleDAGNode *node, DepMode m) {
+ this->addDependency(this->nodes[index], node, m);
+ }
+
+ void DependencyTracker::makeBarrier(int32_t barrierID, int32_t insnNum) {
+ ScheduleDAGNode *barrier = this->insnNodes[barrierID];
+
+ // The barrier depends on all nodes before it
+ for (int32_t insnID = 0; insnID < barrierID; ++insnID)
+ this->addDependency(barrier, this->insnNodes[insnID], WRITE_AFTER_WRITE);
+
+ // All nodes after barriers depend on the barrier
+ for (int32_t insnID = barrierID + 1; insnID < insnNum; ++insnID)
+ this->addDependency(this->insnNodes[insnID], barrier, WRITE_AFTER_WRITE);
+ }
+
+ static GenRegister getFlag(const SelectionInstruction &insn) {
+ if (insn.state.physicalFlag) {
+ const uint32_t nr = insn.state.flag;
+ const uint32_t subnr = insn.state.subFlag;
+ return GenRegister::flag(nr, subnr);
+ } else
+ return GenRegister::uw1grf(ir::Register(insn.state.flagIndex));
+ }
+
+ uint32_t DependencyTracker::getIndex(GenRegister reg) const {
+ // Non GRF physical register
+ if (reg.physical) {
+ //GBE_ASSERT (reg.file == GEN_ARCHITECTURE_REGISTER_FILE);
+ if(reg.file == GEN_ARCHITECTURE_REGISTER_FILE) {
+ const uint32_t file = reg.nr & 0xf0;
+ const uint32_t nr = reg.nr & 0x0f;
+ if (file == GEN_ARF_FLAG) {
+ const uint32_t subnr = reg.subnr / sizeof(uint16_t);
+ GBE_ASSERT(nr < MAX_FLAG_REGISTER && (subnr == 0 || subnr == 1));
+ return grfNum + 2*nr + subnr;
+ } else if (file == GEN_ARF_ACCUMULATOR) {
+ GBE_ASSERT(nr < MAX_ACC_REGISTER);
+ return grfNum + MAX_FLAG_REGISTER + nr;
+ } else {
+ NOT_SUPPORTED;
+ return 0;
+ }
+ } else {
+ const uint32_t simdWidth = scheduler.ctx.getSimdWidth();
+ return simdWidth == 8 ? reg.nr : reg.nr / 2;
+ }
+ }
+ // We directly manipulate physical GRFs here
+ else if (scheduler.policy == POST_ALLOC) {
+ const GenRegister physical = scheduler.ctx.ra->genReg(reg);
+ const uint32_t simdWidth = scheduler.ctx.getSimdWidth();
+ return simdWidth == 8 ? physical.nr : physical.nr / 2;
+ }
+ // We use virtual registers since allocation is not done yet
+ else
+ return reg.value.reg;
+ }
+
+ uint32_t DependencyTracker::getIndex(uint32_t bti) const {
+ const uint32_t memDelta = grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER;
+ return bti == 0xfe ? memDelta + LOCAL_MEMORY : (bti == 0xff ? memDelta + SCRATCH_MEMORY : memDelta + GLOBAL_MEMORY);
+ }
+
+ void DependencyTracker::updateWrites(ScheduleDAGNode *node) {
+ const SelectionInstruction &insn = node->insn;
+
+ // Track writes in registers
+ for (uint32_t dstID = 0; dstID < insn.dstNum; ++dstID) {
+ const GenRegister dst = insn.dst(dstID);
+ if (this->ignoreDependency(dst) == false) {
+ const uint32_t index = this->getIndex(dst);
+ this->nodes[index] = node;
+ if (scheduler.policy == POST_ALLOC && (dst.isdf() || dst.isint64()))
+ this->nodes[index + 1] = node;
+ }
+ }
+
+ // Track writes in predicates
+ if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP || insn.state.modFlag) {
+ const uint32_t index = this->getIndex(getFlag(insn));
+ this->nodes[index] = node;
+ }
+
+ // Track writes in accumulators
+ if (insn.state.accWrEnable) {
+ const uint32_t index = this->getIndex(GenRegister::acc());
+ this->nodes[index] = node;
+ }
+
+ // Track writes in memory
+ if (insn.isWrite()) {
+ const uint32_t index = this->getIndex(insn.getbti());
+ this->nodes[index] = node;
+ }
+
+ // Track writes in scratch memory
+ if(insn.opcode == SEL_OP_SPILL_REG) {
+ const uint32_t index = this->getIndex(0xff);
+ this->nodes[index] = node;
+ }
+ // Consider barriers and wait write to memory
+ if (insn.opcode == SEL_OP_BARRIER ||
+ insn.opcode == SEL_OP_FENCE ||
+ insn.opcode == SEL_OP_WAIT) {
+ const uint32_t local = this->getIndex(0xfe);
+ const uint32_t global = this->getIndex(0x00);
+ this->nodes[local] = this->nodes[global] = node;
+ }
+ }
+
+ /*! Kind-of roughly estimated latency. Nothing real here */
+ static uint32_t getLatencyGen7(const SelectionInstruction &insn) {
+#define DECL_GEN7_SCHEDULE(FAMILY, LATENCY, SIMD16, SIMD8)\
+ const uint32_t FAMILY##InstructionLatency = LATENCY;
+#include "gen_insn_gen7_schedule_info.hxx"
+#undef DECL_GEN7_SCHEDULE
+
+ switch (insn.opcode) {
+#define DECL_SELECTION_IR(OP, FAMILY) case SEL_OP_##OP: return FAMILY##Latency;
+#include "backend/gen_insn_selection.hxx"
+#undef DECL_SELECTION_IR
+ };
+ return 0;
+ }
+
+ /*! Throughput in cycles for SIMD8 or SIMD16 */
+ static uint32_t getThroughputGen7(const SelectionInstruction &insn, bool isSIMD8) {
+#define DECL_GEN7_SCHEDULE(FAMILY, LATENCY, SIMD16, SIMD8)\
+ const uint32_t FAMILY##InstructionThroughput = isSIMD8 ? SIMD8 : SIMD16;
+#include "gen_insn_gen7_schedule_info.hxx"
+#undef DECL_GEN7_SCHEDULE
+
+ switch (insn.opcode) {
+#define DECL_SELECTION_IR(OP, FAMILY) case SEL_OP_##OP: return FAMILY##Throughput;
+#include "backend/gen_insn_selection.hxx"
+#undef DECL_SELECTION_IR
+ };
+ return 0;
+ }
+
+ SelectionScheduler::SelectionScheduler(GenContext &ctx,
+ Selection &selection,
+ SchedulePolicy policy) :
+ policy(policy), listPool(nextHighestPowerOf2(selection.getLargestBlockSize())),
+ ctx(ctx), selection(selection), tracker(selection, *this)
+ {
+ this->clearLists();
+ }
+
+ void SelectionScheduler::clearLists(void) {
+ this->ready.fast_clear();
+ this->active.fast_clear();
+ }
+
+ void SelectionScheduler::traverseReadNode(ScheduleDAGNode *node, uint32_t degree) {
+ GBE_ASSERT(degree != 0 || node->insn.isRead());
+ if (node->readDistance != 0x7FFFFFFF)
+ return;
+ node->readDistance = degree;
+ if (degree > 5)
+ return;
+ //printf("node id %d op %d degree %d \n", node->insn.ID, node->insn.opcode, degree);
+ auto it = tracker.deps.find(node);
+ if (it != tracker.deps.end()) {
+ for (auto &depNode : it->second) {
+ if (depNode && !depNode->insn.isRead())
+ traverseReadNode(depNode, degree + 1);
+ }
+ }
+ }
+
+ int32_t SelectionScheduler::buildDAG(SelectionBlock &bb) {
+ nodePool.rewind();
+ listPool.rewind();
+ tracker.clear(true);
+ this->clearLists();
+
+ // Track write-after-write and read-after-write dependencies
+ int32_t insnNum = 0;
+ for (auto &insn : bb.insnList) {
+ // Create a new node for this instruction
+ ScheduleDAGNode *node = this->newScheduleDAGNode(insn);
+ tracker.insnNodes[insnNum++] = node;
+
+ // read-after-write in registers
+ for (uint32_t srcID = 0; srcID < insn.srcNum; ++srcID)
+ tracker.addDependency(node, insn.src(srcID), READ_AFTER_WRITE);
+
+ // read-after-write for predicate
+ if (insn.state.predicate != GEN_PREDICATE_NONE)
+ tracker.addDependency(node, getFlag(insn), READ_AFTER_WRITE);
+
+ // read-after-write in memory
+ if (insn.isRead()) {
+ const uint32_t index = tracker.getIndex(insn.getbti());
+ tracker.addDependency(node, index, READ_AFTER_WRITE);
+ }
+ //read-after-write of scratch memory
+ if (insn.opcode == SEL_OP_UNSPILL_REG) {
+ const uint32_t index = tracker.getIndex(0xff);
+ tracker.addDependency(node, index, READ_AFTER_WRITE);
+ }
+
+ // Consider barriers and wait are reading memory (local and global)
+ if (insn.opcode == SEL_OP_BARRIER ||
+ insn.opcode == SEL_OP_FENCE ||
+ insn.opcode == SEL_OP_WAIT) {
+ const uint32_t local = tracker.getIndex(0xfe);
+ const uint32_t global = tracker.getIndex(0x00);
+ tracker.addDependency(node, local, READ_AFTER_WRITE);
+ tracker.addDependency(node, global, READ_AFTER_WRITE);
+ }
+
+ // write-after-write in registers
+ for (uint32_t dstID = 0; dstID < insn.dstNum; ++dstID)
+ tracker.addDependency(node, insn.dst(dstID), WRITE_AFTER_WRITE);
+
+ // write-after-write for predicate
+ if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP || insn.state.modFlag)
+ tracker.addDependency(node, getFlag(insn), WRITE_AFTER_WRITE);
+
+ // write-after-write for accumulators
+ if (insn.state.accWrEnable)
+ tracker.addDependency(node, GenRegister::acc(), WRITE_AFTER_WRITE);
+
+ // write-after-write in memory
+ if (insn.isWrite()) {
+ const uint32_t index = tracker.getIndex(insn.getbti());
+ tracker.addDependency(node, index, WRITE_AFTER_WRITE);
+ }
+
+ // write-after-write in scratch memory
+ if (insn.opcode == SEL_OP_SPILL_REG) {
+ const uint32_t index = tracker.getIndex(0xff);
+ tracker.addDependency(node, index, WRITE_AFTER_WRITE);
+ }
+
+ // Track all writes done by the instruction
+ tracker.updateWrites(node);
+ }
+
+ // Track write-after-read dependencies
+ tracker.clear();
+ for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
+ ScheduleDAGNode *node = tracker.insnNodes[insnID];
+ const SelectionInstruction &insn = node->insn;
+
+ // write-after-read in registers
+ for (uint32_t srcID = 0; srcID < insn.srcNum; ++srcID)
+ tracker.addDependency(insn.src(srcID), node, WRITE_AFTER_READ);
+
+ // write-after-read for predicate
+ if (insn.state.predicate != GEN_PREDICATE_NONE)
+ tracker.addDependency(getFlag(insn), node, WRITE_AFTER_READ);
+
+ // write-after-read in memory
+ if (insn.isRead()) {
+ const uint32_t index = tracker.getIndex(insn.getbti());
+ tracker.addDependency(index, node, WRITE_AFTER_READ);
+ }
+
+ // write-after-read in scratch memory
+ if (insn.opcode == SEL_OP_UNSPILL_REG) {
+ const uint32_t index = tracker.getIndex(0xff);
+ tracker.addDependency(index, node, WRITE_AFTER_READ);
+ }
+
+ // Consider barriers and wait are reading memory (local and global)
+ if (insn.opcode == SEL_OP_BARRIER ||
+ insn.opcode == SEL_OP_FENCE ||
+ insn.opcode == SEL_OP_WAIT) {
+ const uint32_t local = tracker.getIndex(0xfe);
+ const uint32_t global = tracker.getIndex(0x00);
+ tracker.addDependency(local, node, WRITE_AFTER_READ);
+ tracker.addDependency(global, node, WRITE_AFTER_READ);
+ }
+
+ // Track all writes done by the instruction
+ tracker.updateWrites(node);
+ }
+
+ // Update distance to read for each read node.
+ for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
+ ScheduleDAGNode *node = tracker.insnNodes[insnID];
+ const SelectionInstruction &insn = node->insn;
+ if (insn.isRead())
+ traverseReadNode(node);
+ }
+
+ // Make labels and branches non-schedulable (i.e. they act as barriers)
+ for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
+ ScheduleDAGNode *node = tracker.insnNodes[insnID];
+ if (node->insn.isBranch() || node->insn.isLabel()
+ || node->insn.opcode == SEL_OP_EOT || node->insn.opcode == SEL_OP_IF
+ || node->insn.opcode == SEL_OP_BARRIER)
+ tracker.makeBarrier(insnID, insnNum);
+ }
+
+ // Build the initial ready list (should only be the label actually)
+ for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
+ ScheduleDAGNode *node = tracker.insnNodes[insnID];
+ if (node->refNum == 0) {
+ ScheduleListNode *listNode = this->newScheduleListNode(node);
+ this->ready.push_back(listNode);
+ }
+ }
+
+ return insnNum;
+ }
+
+ void SelectionScheduler::preScheduleDAG(SelectionBlock &bb, int32_t insnNum) {
+ printf("Not implemented yet. \n");
+ }
+
+ void SelectionScheduler::postScheduleDAG(SelectionBlock &bb, int32_t insnNum) {
+ uint32_t cycle = 0;
+ const bool isSIMD8 = this->ctx.getSimdWidth() == 8;
+ vector <ScheduleDAGNode *> scheduledNodes;
+ while (insnNum) {
+
+ // Retire all the instructions that finished
+ //printf("cycle = %d \n", cycle);
+ for (auto toRetireIt = active.begin(); toRetireIt != active.end();) {
+ ScheduleDAGNode *toRetireNode = toRetireIt.node()->node;
+ // Firstly, put all write after read children to ready.
+ if (toRetireNode->preRetired == false) {
+ auto &children = toRetireNode->children;
+ toRetireNode->preRetired = true;
+ //printf("id %d pre retired \n", toRetireNode->insn.ID);
+ for (auto it = children.begin(); it != children.end();) {
+ ScheduleListNode *listNode = it.node();
+ if (listNode->depMode != WRITE_AFTER_READ) {
+ ++it;
+ continue;
+ }
+ if (--it->node->refNum == 0) {
+ //printf("pre push id %d to ready list. \n", listNode->node->insn.ID);
+ it = children.erase(it);
+ this->ready.push_back(listNode);
+ } else
+ ++it;
+ }
+ if (children.size() == 0) {
+ toRetireIt = this->active.erase(toRetireIt);
+ continue;
+ }
+ }
+ // Instruction is now complete
+ if (toRetireNode->retiredCycle <= cycle) {
+ toRetireIt = this->active.erase(toRetireIt);
+ //printf("id %d retired \n", toRetireNode->insn.ID);
+ // Traverse all children and make them ready if no more dependency
+ auto &children = toRetireNode->children;
+ for (auto it = children.begin(); it != children.end();) {
+ ScheduleListNode *listNode = it.node();
+ if (listNode->depMode == WRITE_AFTER_READ) {
+ ++it;
+ continue;
+ }
+ if (--it->node->refNum == 0) {
+ it = children.erase(it);
+ if (listNode->depMode != WRITE_AFTER_READ)
+ this->ready.push_back(listNode);
+ //printf("push id %d to ready list. \n", listNode->node->insn.ID);
+ } else
+ ++it;
+ }
+ } else
+ ++toRetireIt;
+ }
+
+ // Try to schedule something from the ready list
+ intrusive_list<ScheduleListNode>::iterator toSchedule;
+ toSchedule = this->ready.begin();
+ float minCost = 1000;
+ for(auto it = this->ready.begin(); it != this->ready.end(); ++it) {
+ float cost = (it->depMode == WRITE_AFTER_READ) ? 0 : ((it->depMode == WRITE_AFTER_WRITE) ? 5 : 10)
+ - 5.0 / (it->node->readDistance == 0 ? 0.1 : it->node->readDistance);
+ if (cost < minCost) {
+ toSchedule = it;
+ minCost = cost;
+ }
+ }
+ if (toSchedule != this->ready.end()) {
+ //printf("get id %d op %d to schedule \n", toSchedule->node->insn.ID, toSchedule->node->insn.opcode);
+ // The instruction is instantaneously issued to simulate zero cycle
+ // scheduling
+ cycle += getThroughputGen7(toSchedule->node->insn, isSIMD8);
+
+ this->ready.erase(toSchedule);
+ this->active.push_back(toSchedule.node());
+ // When we schedule before allocation, instruction is instantaneously
+ // ready. This allows to have a real LIFO strategy
+ toSchedule->node->retiredCycle = cycle + getLatencyGen7(toSchedule->node->insn);
+ bb.append(&toSchedule->node->insn);
+ scheduledNodes.push_back(toSchedule->node);
+ insnNum--;
+ } else
+ cycle++;
+ }
+ }
+
+ BVAR(OCL_POST_ALLOC_INSN_SCHEDULE, true);
+ BVAR(OCL_PRE_ALLOC_INSN_SCHEDULE, false);
+
+ void schedulePostRegAllocation(GenContext &ctx, Selection &selection) {
+ if (OCL_POST_ALLOC_INSN_SCHEDULE) {
+ SelectionScheduler scheduler(ctx, selection, POST_ALLOC);
+ for (auto &bb : *selection.blockList) {
+ const int32_t insnNum = scheduler.buildDAG(bb);
+ bb.insnList.clear();
+ scheduler.postScheduleDAG(bb, insnNum);
+ }
+ }
+ }
+
+ void schedulePreRegAllocation(GenContext &ctx, Selection &selection) {
+ if (OCL_PRE_ALLOC_INSN_SCHEDULE) {
+ SelectionScheduler scheduler(ctx, selection, PRE_ALLOC);
+ // FIXME, need to implement proper pre reg allocation scheduling algorithm.
+ return;
+ for (auto &bb : *selection.blockList) {
+ const int32_t insnNum = scheduler.buildDAG(bb);
+ bb.insnList.clear();
+ scheduler.preScheduleDAG(bb, insnNum);
+ }
+ }
+ }
+
+} /* namespace gbe */
+
diff --git a/backend/src/backend/gen_insn_scheduling.hpp b/backend/src/backend/gen_insn_scheduling.hpp
new file mode 100644
index 0000000..534557d
--- /dev/null
+++ b/backend/src/backend/gen_insn_scheduling.hpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_insn_scheduling.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_GEN_INSN_SCHEDULING_HPP__
+#define __GBE_GEN_INSN_SCHEDULING_HPP__
+
+namespace gbe
+{
+ class Selection; // Pre ISA code
+ class GenContext; // Handle compilation for Gen
+
+ /*! Schedule the code per basic block (tends to limit register number) */
+ void schedulePreRegAllocation(GenContext &ctx, Selection &selection);
+
+ /*! Schedule the code per basic block (tends to deal with insn latency) */
+ void schedulePostRegAllocation(GenContext &ctx, Selection &selection);
+
+} /* namespace gbe */
+
+#endif /* __GBE_GEN_INSN_SCHEDULING_HPP__ */
+
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
new file mode 100644
index 0000000..96d3965
--- /dev/null
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -0,0 +1,4032 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_insn_selection.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/* This is the instruction selection code. First of all, this is a bunch of c++
+ * crap. Sorry if this is not that readable. Anyway, the goal here is to take
+ * GenIR code (i.e. the very regular, very RISC IR) and to produce GenISA with
+ * virtual registers (i.e. regular GenIR registers).
+ *
+ * Overall idea:
+ * =============
+ *
+ * There is a lot of papers and research about that but I tried to keep it
+ * simple. No dynamic programming, nothing like this. Just a recursive maximal
+ * munch.
+ *
+ * Basically, the code is executed per basic block from bottom to top. Patterns
+ * of GenIR instructions are defined and each instruction is matched against the
+ * best pattern i.e. the pattern that catches the largest number of
+ * instructions. Once matched, a sequence of instructions is output.
+ *
+ * Each instruction the match depends on is then marked as "root" i.e. we
+ * indicate that each of these instructions must be generated: we indeed need their
+ * destinations for the next instructions (remember that we generate the code in
+ * reverse order)
+ *
+ * Patterns:
+ * =========
+ *
+ * There is a lot of patterns and I did not implement all of them obviously. I
+ * just quickly gather the complete code to make pattern implementation kind of
+ * easy. This is pretty verbose to add a pattern but it should be not too hard
+ * to add new ones.
+ *
+ * To create and register patterns, I just abused C++ pre-main. A bunch of
+ * patterns is then created and sorted per opcode (i.e. the opcode of the root
+ * of the pattern): this creates a library of patterns that may be used in
+ * run-time.
+ *
+ * Predication / Masking and CFG linearization
+ * ===========================================
+ *
+ * The current version is based on an unfortunate choice. Basically, the problem
+ * to solve is how to map unstructured branches (i.e. regular gotos) onto Gen.
+ * Gen has a native support for structured branches (if/else/endif/while...) but
+ * nothing really native for unstructured branches.
+ *
+ * The idea we implemented is simple. We stole one flag register (here f0.0) to
+ * mask all the instructions (and only activate the proper SIMD lanes) and we
+ * use the CFG linearization technique to properly handle the control flow. This
+ * is not really good for one particular reason: Gen instructions must use the
+ * *same* flag register for the predicates (used for masking) and the
+ * conditional modifier (used as a destination for CMP). This leads to extra
+ * complications with compare instructions and select instructions. Basically,
+ * we need to insert extra MOVs.
+ *
+ * Also, there is some extra kludge to handle the predicates for JMPI.
+ *
+ * TODO:
+ * =====
+ *
+ * Sadly, I recreated here a new DAG class. This is just a bad idea since we
+ * already have the DAG per basic block with the Function graph i.e. the
+ * complete graph of uses and definitions. I think we should be able to save a
+ * lot of code here if we can simply reuse the code from UD / DU chains.
+ *
+ * Finally, cross-block instruction selection is quite possible with this simple
+ * approach. Basically, instructions from dominating blocks could be merged and
+ * matched with other instructions in the dominated block. This leads to the
+ * interesting approach which consists in traversing the dominator tree in post
+ * order
+ *
+ * We already use if/endif to enclose each basic block. We will continue to identify
+ * those blocks which could match to structured branching and use pure structured
+ * instruction to handle them completely.
+ */
+
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_context.hpp"
+#include "ir/function.hpp"
+#include "ir/liveness.hpp"
+#include "ir/profile.hpp"
+#include "sys/cvar.hpp"
+#include "sys/vector.hpp"
+#include <algorithm>
+#include <climits>
+
+namespace gbe
+{
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Helper functions
+ ///////////////////////////////////////////////////////////////////////////
+
+ uint32_t getGenType(ir::Type type) {
+ using namespace ir;
+ switch (type) {
+ case TYPE_BOOL: return GEN_TYPE_UW;
+ case TYPE_S8: return GEN_TYPE_B;
+ case TYPE_U8: return GEN_TYPE_UB;
+ case TYPE_S16: return GEN_TYPE_W;
+ case TYPE_U16: return GEN_TYPE_UW;
+ case TYPE_S32: return GEN_TYPE_D;
+ case TYPE_U32: return GEN_TYPE_UD;
+ case TYPE_S64: return GEN_TYPE_L;
+ case TYPE_U64: return GEN_TYPE_UL;
+ case TYPE_FLOAT: return GEN_TYPE_F;
+ case TYPE_DOUBLE: return GEN_TYPE_DF;
+ default: NOT_SUPPORTED; return GEN_TYPE_F;
+ }
+ }
+
+ ir::Type getIRType(uint32_t genType) {
+ using namespace ir;
+ switch (genType) {
+ case GEN_TYPE_B: return TYPE_S8;
+ case GEN_TYPE_UB: return TYPE_U8;
+ case GEN_TYPE_W: return TYPE_S16;
+ case GEN_TYPE_UW: return TYPE_U16;
+ case GEN_TYPE_D: return TYPE_S32;
+ case GEN_TYPE_UD: return TYPE_U32;
+ case GEN_TYPE_L: return TYPE_S64;
+ case GEN_TYPE_UL: return TYPE_U64;
+ case GEN_TYPE_F: return TYPE_FLOAT;
+ case GEN_TYPE_DF: return TYPE_DOUBLE;
+ default: NOT_SUPPORTED; return TYPE_FLOAT;
+ }
+ }
+
+ uint32_t getGenCompare(ir::Opcode opcode, bool inverse = false) {
+ using namespace ir;
+ switch (opcode) {
+ case OP_LE: return (!inverse) ? GEN_CONDITIONAL_LE : GEN_CONDITIONAL_G;
+ case OP_LT: return (!inverse) ? GEN_CONDITIONAL_L : GEN_CONDITIONAL_GE;
+ case OP_GE: return (!inverse) ? GEN_CONDITIONAL_GE : GEN_CONDITIONAL_L;
+ case OP_GT: return (!inverse) ? GEN_CONDITIONAL_G : GEN_CONDITIONAL_LE;
+ case OP_EQ: return (!inverse) ? GEN_CONDITIONAL_EQ : GEN_CONDITIONAL_NEQ;
+ case OP_NE: return (!inverse) ? GEN_CONDITIONAL_NEQ : GEN_CONDITIONAL_EQ;
+ default: NOT_SUPPORTED; return 0u;
+ };
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ // SelectionInstruction
+ ///////////////////////////////////////////////////////////////////////////
+
+ SelectionInstruction::SelectionInstruction(SelectionOpcode op, uint32_t dst, uint32_t src) :
+ parent(NULL), opcode(op), dstNum(dst), srcNum(src)
+ {
+ extra.function = 0;
+ }
+
+ void SelectionInstruction::prepend(SelectionInstruction &other) {
+ gbe::prepend(&other, this);
+ other.parent = this->parent;
+ }
+
+ void SelectionInstruction::append(SelectionInstruction &other) {
+ gbe::append(&other, this);
+ other.parent = this->parent;
+ }
+
+ bool SelectionInstruction::isRead(void) const {
+ return this->opcode == SEL_OP_UNTYPED_READ ||
+ this->opcode == SEL_OP_READ64 ||
+ this->opcode == SEL_OP_ATOMIC ||
+ this->opcode == SEL_OP_BYTE_GATHER ||
+ this->opcode == SEL_OP_SAMPLE ||
+ this->opcode == SEL_OP_DWORD_GATHER;
+ }
+
+ bool SelectionInstruction::isWrite(void) const {
+ return this->opcode == SEL_OP_UNTYPED_WRITE ||
+ this->opcode == SEL_OP_WRITE64 ||
+ this->opcode == SEL_OP_ATOMIC ||
+ this->opcode == SEL_OP_BYTE_SCATTER ||
+ this->opcode == SEL_OP_TYPED_WRITE;
+ }
+
+ bool SelectionInstruction::isBranch(void) const {
+ return this->opcode == SEL_OP_JMPI;
+ }
+
+ bool SelectionInstruction::isLabel(void) const {
+ return this->opcode == SEL_OP_LABEL;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ // SelectionVector
+ ///////////////////////////////////////////////////////////////////////////
+
+ SelectionVector::SelectionVector(void) :
+ insn(NULL), reg(NULL), regNum(0), isSrc(0)
+ {}
+
+ ///////////////////////////////////////////////////////////////////////////
+ // SelectionBlock
+ ///////////////////////////////////////////////////////////////////////////
+
+ SelectionBlock::SelectionBlock(const ir::BasicBlock *bb) : bb(bb), isLargeBlock(false), endifLabel( (ir::LabelIndex) 0){}
+
+ void SelectionBlock::append(ir::Register reg) { tmp.push_back(reg); }
+
+ void SelectionBlock::append(SelectionInstruction *insn) {
+ this->insnList.push_back(insn);
+ insn->parent = this;
+ }
+
+ void SelectionBlock::prepend(SelectionInstruction *insn) {
+ this->insnList.push_front(insn);
+ insn->parent = this;
+ }
+
+ void SelectionBlock::append(SelectionVector *vec) {
+ this->vectorList.push_back(vec);
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Maximal munch selection on DAG
+ ///////////////////////////////////////////////////////////////////////////
+
+ /*! All instructions in a block are organized into a DAG */
+ class SelectionDAG
+ {
+ public:
+ INLINE SelectionDAG(const ir::Instruction &insn) :
+ insn(insn), mergeable(0), childNum(insn.getSrcNum()), isRoot(0) {
+ GBE_ASSERT(insn.getSrcNum() < 127);
+ for (uint32_t childID = 0; childID < childNum; ++childID)
+ this->child[childID] = NULL;
+ computeBool = false;
+ isUsed = false;
+ }
+ /*! Mergeable are non-root instructions with valid sources */
+ INLINE void setAsMergeable(uint32_t which) { mergeable|=(1<<which); }
+ /*! Mergeable are non-root instructions with valid sources */
+ INLINE bool isMergeable(uint32_t which) const { return mergeable&(1<<which); }
+ /*! Children that need to be matched */
+ SelectionDAG *child[ir::Instruction::MAX_SRC_NUM];
+ /*! Instruction that needs to be matched */
+ const ir::Instruction &insn;
+ /*! When sources have been overwritten, a child insn cannot be merged */
+ uint32_t mergeable:ir::Instruction::MAX_SRC_NUM;
+ /*! Number of children we have in the pattern */
+ uint32_t childNum:7;
+ /*! A root must be generated, no matter what */
+ uint32_t isRoot:1;
+ /*! A bool register is used as normal computing sources. */
+ bool computeBool;
+ /*! is used in this block */
+ bool isUsed;
+ };
+
+ /*! A pattern is a tree to match. This is the general interface for them. For
+ * pattern to be matched, we need to match the complete tree i.e. this node
+ * and its child nodes
+ */
+ class SelectionPattern
+ {
+ public:
+ SelectionPattern(uint32_t insnNum, uint32_t cost) :
+ insnNum(insnNum), cost(cost) {}
+ /*! This is an abstract class */
+ virtual ~SelectionPattern(void) {}
+ /*! Emit Gen code in the selection. Return false if no match */
+ virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const = 0;
+ /*! All the possible opcodes for this pattern (for fast sort) */
+ vector<ir::Opcode> opcodes;
+ /*! Number of instruction generated */
+ uint32_t insnNum;
+ /*! Cost of the pattern */
+ uint32_t cost;
+ };
+
+ /*! Store and sort all the patterns. This is our global library we use for the
+ * code selection
+ */
+ class SelectionLibrary
+ {
+ public:
+ /*! Will register all the patterns */
+ SelectionLibrary(void);
+ /*! Release and destroy all the registered patterns */
+ ~SelectionLibrary(void);
+ /*! Insert the given pattern for all associated opcodes */
+ template <typename PatternType> void insert(void);
+ /*! One list of pattern per opcode */
+ typedef vector<const SelectionPattern*> PatternList;
+ /*! All lists of patterns properly sorted per opcode */
+ PatternList patterns[ir::OP_INVALID];
+ /*! All patterns to free */
+ vector<const SelectionPattern*> toFree;
+ };
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Code selection internal implementation
+ ///////////////////////////////////////////////////////////////////////////
+
+ /*! Actual implementation of the instruction selection engine */
+ class Selection::Opaque
+ {
+ public:
+ /*! simdWidth is the default width for the instructions */
+ Opaque(GenContext &ctx);
+ /*! Release everything */
+ virtual ~Opaque(void);
+ /*! Implements the instruction selection itself */
+ void select(void);
+ /*! Start a backward generation (from the end of the block) */
+ void startBackwardGeneration(void);
+ /*! End backward code generation and output the code in the block */
+ void endBackwardGeneration(void);
+ /*! Implement public class */
+ uint32_t getLargestBlockSize(void) const;
+ /*! Implement public class */
+ INLINE uint32_t getVectorNum(void) const { return this->vectorNum; }
+ /*! Implement public class */
+ INLINE ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov);
+ /*! Implement public class */
+ INLINE ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov);
+ /*! spill a register (insert spill/unspill instructions) */
+ INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
+ /*! should add per thread offset to the local memory address when load/store/atomic */
+ bool needPatchSLMAddr() const { return patchSLMAddr; }
+ void setPatchSLMAddr(bool b) { patchSLMAddr = b; }
+ /*! indicate whether a register is a scalar/uniform register. */
+ INLINE bool isScalarReg(const ir::Register ®) const {
+ const ir::RegisterData ®Data = getRegisterData(reg);
+ return regData.isUniform();
+ }
+
+ INLINE GenRegister unpacked_uw(const ir::Register ®) const {
+ return GenRegister::unpacked_uw(reg, isScalarReg(reg));
+ }
+
+ INLINE GenRegister unpacked_ub(const ir::Register ®) const {
+ return GenRegister::unpacked_ub(reg, isScalarReg(reg));
+ }
+ /*! Implement public class */
+ INLINE uint32_t getRegNum(void) const { return file.regNum(); }
+ /*! Implements public interface */
+ INLINE ir::RegisterData getRegisterData(ir::Register reg) const {
+ return file.get(reg);
+ }
+ /*! Implement public class */
+ INLINE ir::RegisterFamily getRegisterFamily(ir::Register reg) const {
+ return file.get(reg).family;
+ }
+ /*! Implement public class */
+ SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
+ /*! Return the selection register from the GenIR one */
+ GenRegister selReg(ir::Register, ir::Type type = ir::TYPE_FLOAT) const;
+ /*! Compute the nth register part when using SIMD8 with Qn (n in 2,3,4) */
+ GenRegister selRegQn(ir::Register, uint32_t quarter, ir::Type type = ir::TYPE_FLOAT) const;
+ /*! Size of the stack (should be large enough) */
+ enum { MAX_STATE_NUM = 16 };
+ /*! Push the current instruction state */
+ INLINE void push(void) {
+ assert(stateNum < MAX_STATE_NUM);
+ stack[stateNum++] = curr;
+ }
+ /*! Pop the latest pushed state */
+ INLINE void pop(void) {
+ assert(stateNum > 0);
+ curr = stack[--stateNum];
+ }
+ /*! Create a new register in the register file and append it in the
+ * temporary list of the current block
+ */
+ INLINE ir::Register reg(ir::RegisterFamily family, bool scalar = false) {
+ GBE_ASSERT(block != NULL);
+ const ir::Register reg = file.append(family, scalar);
+ block->append(reg);
+ return reg;
+ }
+ /*! Append a block at the block stream tail. It becomes the current block */
+ void appendBlock(const ir::BasicBlock &bb);
+ /*! Append an instruction in the current block */
+ SelectionInstruction *appendInsn(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
+ /*! Append a new vector of registers in the current block */
+ SelectionVector *appendVector(void);
+ /*! Build a DAG for the basic block (return number of instructions) */
+ uint32_t buildBasicBlockDAG(const ir::BasicBlock &bb);
+ /*! Perform the selection on the basic block */
+ void matchBasicBlock(const ir::BasicBlock &bb, uint32_t insnNum);
+ /*! A root instruction needs to be generated */
+ bool isRoot(const ir::Instruction &insn) const;
+
+ /*! To handle selection block allocation */
+ DECL_POOL(SelectionBlock, blockPool);
+ /*! To handle selection instruction allocation */
+ LinearAllocator insnAllocator;
+ /*! To handle selection vector allocation */
+ DECL_POOL(SelectionVector, vecPool);
+ /*! Per register information used with top-down block sweeping */
+ vector<SelectionDAG*> regDAG;
+ /*! Store one DAG per instruction */
+ vector<SelectionDAG*> insnDAG;
+ /*! Owns this structure */
+ GenContext &ctx;
+ /*! Tail of the code fragment for backward code generation */
+ intrusive_list<SelectionInstruction> bwdList;
+ /*! List of emitted blocks */
+ intrusive_list<SelectionBlock> blockList;
+ /*! Currently processed block */
+ SelectionBlock *block;
+ /*! Current instruction state to use */
+ GenInstructionState curr;
+ /*! We append new registers so we duplicate the function register file */
+ ir::RegisterFile file;
+ /*! State used to encode the instructions */
+ GenInstructionState stack[MAX_STATE_NUM];
+ /*! Maximum number of instructions in the basic blocks */
+ uint32_t maxInsnNum;
+ /*! Speed up instruction dag allocation */
+ DECL_POOL(SelectionDAG, dagPool);
+ /*! Total number of registers in the function we encode */
+ uint32_t regNum;
+ /*! Number of states currently pushed */
+ uint32_t stateNum;
+ /*! Number of vector allocated */
+ uint32_t vectorNum;
+ /*! If true, generate code backward */
+ bool bwdCodeGeneration;
+ /*! To make function prototypes more readable */
+ typedef const GenRegister &Reg;
+
+#define ALU1(OP) \
+ INLINE void OP(Reg dst, Reg src) { ALU1(SEL_OP_##OP, dst, src); }
+#define ALU1WithTemp(OP) \
+ INLINE void OP(Reg dst, Reg src, Reg temp) { ALU1WithTemp(SEL_OP_##OP, dst, src, temp); }
+#define ALU2(OP) \
+ INLINE void OP(Reg dst, Reg src0, Reg src1) { ALU2(SEL_OP_##OP, dst, src0, src1); }
+#define ALU2WithTemp(OP) \
+ INLINE void OP(Reg dst, Reg src0, Reg src1, Reg temp) { ALU2WithTemp(SEL_OP_##OP, dst, src0, src1, temp); }
+#define ALU3(OP) \
+ INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
+#define I64Shift(OP) \
+ INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
+ ALU1(MOV)
+ ALU1WithTemp(MOV_DF)
+ ALU1WithTemp(LOAD_DF_IMM)
+ ALU1(LOAD_INT64_IMM)
+ ALU1(RNDZ)
+ ALU1(RNDE)
+ ALU1(F16TO32)
+ ALU1(F32TO16)
+ ALU2(SEL)
+ ALU2(SEL_INT64)
+ ALU1(NOT)
+ ALU2(AND)
+ ALU2(OR)
+ ALU2(XOR)
+ ALU2(I64AND)
+ ALU2(I64OR)
+ ALU2(I64XOR)
+ ALU2(SHR)
+ ALU2(SHL)
+ ALU2(RSR)
+ ALU2(RSL)
+ ALU2(ASR)
+ ALU2(ADD)
+ ALU2WithTemp(I64ADD)
+ ALU2WithTemp(I64SUB)
+ ALU2(MUL)
+ ALU1(FRC)
+ ALU1(RNDD)
+ ALU1(RNDU)
+ ALU2(MACH)
+ ALU1(LZD)
+ ALU3(MAD)
+ ALU2WithTemp(MUL_HI)
+ ALU1(FBH)
+ ALU1(FBL)
+ ALU2WithTemp(HADD)
+ ALU2WithTemp(RHADD)
+ ALU2(UPSAMPLE_SHORT)
+ ALU2(UPSAMPLE_INT)
+ ALU2(UPSAMPLE_LONG)
+ ALU1WithTemp(CONVI_TO_I64)
+ ALU1WithTemp(CONVF_TO_I64)
+ ALU1(CONVI64_TO_I)
+ I64Shift(I64SHL)
+ I64Shift(I64SHR)
+ I64Shift(I64ASR)
+#undef ALU1
+#undef ALU1WithTemp
+#undef ALU2
+#undef ALU2WithTemp
+#undef ALU3
+#undef I64Shift
+ /*! Convert 64-bit integer to 32-bit float */
+ void CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[6]);
+ /*! Convert 64-bit integer to 32-bit float */
+ void CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[2]);
+ /*! Saturated 64bit x*y + z */
+ void I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[9]);
+ /*! High 64bit of x*y */
+ void I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[9]);
+ /*! (x+y)>>1 without mod. overflow */
+ void I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
+ /*! (x+y+1)>>1 without mod. overflow */
+ void I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
+ /*! Shift a 64-bit integer */
+ void I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[7]);
+ /*! Compare 64-bit integer */
+ void I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]);
+ /*! Saturated addition of 64-bit integer */
+ void I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]);
+ /*! Saturated subtraction of 64-bit integer */
+ void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]);
+ /*! Encode a barrier instruction */
+ void BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType);
+ /*! Encode a barrier instruction */
+ void FENCE(GenRegister dst);
+ /*! Encode a label instruction */
+ void LABEL(ir::LabelIndex label);
+ /*! Jump indexed instruction, return the encoded instruction count according to jump distance. */
+ int JMPI(Reg src, ir::LabelIndex target, ir::LabelIndex origin);
+ /*! IF indexed instruction */
+ void IF(Reg src, ir::LabelIndex jip, ir::LabelIndex uip);
+ /*! ENDIF indexed instruction */
+ void ENDIF(Reg src, ir::LabelIndex jip);
+ /*! BRD indexed instruction */
+ void BRD(Reg src, ir::LabelIndex jip);
+ /*! BRC indexed instruction */
+ void BRC(Reg src, ir::LabelIndex jip, ir::LabelIndex uip);
+ /*! Compare instructions */
+ void CMP(uint32_t conditional, Reg src0, Reg src1, Reg dst = GenRegister::null());
+ /*! Select instruction with embedded comparison */
+ void SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1);
+ /* Constant buffer move instruction */
+ void INDIRECT_MOVE(Reg dst, Reg src);
+ /*! EOT is used to finish GPGPU threads */
+ void EOT(void);
+ /*! No-op */
+ void NOP(void);
+ /*! Wait instruction (used for the barrier) */
+ void WAIT(void);
+ /*! Atomic instruction */
+ void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
+ /*! Read 64 bits float/int array */
+ void READ64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+ /*! Write 64 bits float/int array */
+ void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, uint32_t bti);
+ /*! Untyped read (up to 4 elements) */
+ void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+ /*! Untyped write (up to 4 elements) */
+ void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
+ /*! Byte gather (for unaligned bytes, shorts and ints) */
+ void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti);
+ /*! Byte scatter (for unaligned bytes, shorts and ints) */
+ void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
+ /*! DWord scatter (for constant cache read) */
+ void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
+ /*! Unpack the uint to char4 */
+ void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum);
+ /*! pack the char4 to uint */
+ void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum);
+ /*! Extended math function (2 arguments) */
+ void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
+ /*! Extended math function (1 argument) */
+ void MATH(Reg dst, uint32_t function, Reg src);
+ /*! Encode unary instructions */
+ void ALU1(SelectionOpcode opcode, Reg dst, Reg src);
+ /*! Encode unary with temp reg instructions */
+ void ALU1WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg temp);
+ /*! Encode binary instructions */
+ void ALU2(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1);
+ /*! Encode binary with temp reg instructions */
+ void ALU2WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg temp);
+ /*! Encode ternary instructions */
+ void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
+ /*! Encode sample instructions */
+ void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool isLD, bool isUniform);
+ /*! Encode typed write instructions */
+ void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
+ /*! Get image information */
+ void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
+ /*! Multiply 64-bit integers */
+ void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+ /*! 64-bit integer division */
+ void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]);
+ /*! 64-bit integer remainder of division */
+ void I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]);
+ /* common functions for both binary instruction and sel_cmp and compare instruction.
+ It will handle the IMM or normal register assignment, and will try to avoid LOADI
+ as much as possible. */
+ void getSrcGenRegImm(SelectionDAG &dag, GenRegister &src0,
+ GenRegister &src1, ir::Type type, bool &inverse);
+ void getSrcGenRegImm(SelectionDAG &dag,
+ SelectionDAG *dag0, SelectionDAG *dag1,
+ GenRegister &src0, GenRegister &src1,
+ ir::Type type, bool &inverse);
+ /*! Use custom allocators */
+ GBE_CLASS(Opaque);
+ friend class SelectionBlock;
+ friend class SelectionInstruction;
+ private:
+ /*! Auxiliary label for if/endif. */
+ uint16_t currAuxLabel;
+ bool patchSLMAddr;
+ INLINE ir::LabelIndex newAuxLabel()
+ {
+ currAuxLabel++;
+ return (ir::LabelIndex)currAuxLabel;
+ }
+
+ };
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Helper function
+ ///////////////////////////////////////////////////////////////////////////
+
+ /*! Directly mark all sources as root (when no match is found) */
+ static void markAllChildren(SelectionDAG &dag) {
+ // Do not merge anything, so all sources become roots
+ for (uint32_t childID = 0; childID < dag.childNum; ++childID)
+ if (dag.child[childID])
+ dag.child[childID]->isRoot = 1;
+ }
+
+ /*! Helper function to figure if two sources are the same */
+ static bool sourceMatch(SelectionDAG *src0DAG, uint32_t src0ID,
+ SelectionDAG *src1DAG, uint32_t src1ID)
+ {
+ GBE_ASSERT(src0DAG && src1DAG);
+ // Ensure they are the same physical registers
+ const ir::Register src0 = src0DAG->insn.getSrc(src0ID);
+ const ir::Register src1 = src1DAG->insn.getSrc(src1ID);
+ if (src0 != src1)
+ return false;
+ // Ensure they contain the same values
+ return src0DAG->child[src0ID] == src1DAG->child[src1ID];
+ }
+
+ Selection::Opaque::Opaque(GenContext &ctx) :
+ ctx(ctx), block(NULL),
+ curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
+ maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
+ stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()), patchSLMAddr(false)
+ {
+ const ir::Function &fn = ctx.getFunction();
+ this->regNum = fn.regNum();
+ this->regDAG.resize(regNum);
+ this->insnDAG.resize(maxInsnNum);
+ }
+
+ Selection::Opaque::~Opaque(void) {
+ for (auto it = blockList.begin(); it != blockList.end();) {
+ SelectionBlock &block = *it;
+ ++it;
+ this->deleteSelectionBlock(&block);
+ }
+ }
+
+ SelectionInstruction*
+ Selection::Opaque::create(SelectionOpcode opcode, uint32_t dstNum, uint32_t srcNum)
+ {
+ const size_t regSize = (dstNum+srcNum)*sizeof(GenRegister);
+ const size_t size = sizeof(SelectionInstruction) + regSize;
+ void *ptr = insnAllocator.allocate(size);
+ return new (ptr) SelectionInstruction(opcode, dstNum, srcNum);
+ }
+
+ void Selection::Opaque::startBackwardGeneration(void) {
+ this->bwdCodeGeneration = true;
+ }
+
+ void Selection::Opaque::endBackwardGeneration(void) {
+ for (auto it = bwdList.rbegin(); it != bwdList.rend();) {
+ SelectionInstruction &insn = *it;
+ auto toRemoveIt = it--;
+ bwdList.erase(toRemoveIt);
+ this->block->prepend(&insn);
+ }
+
+ this->bwdCodeGeneration = false;
+ }
+
+ uint32_t Selection::Opaque::getLargestBlockSize(void) const {
+ size_t maxInsnNum = 0;
+ for (const auto &bb : blockList)
+ maxInsnNum = std::max(maxInsnNum, bb.insnList.size());
+ return uint32_t(maxInsnNum);
+ }
+
+ void Selection::Opaque::appendBlock(const ir::BasicBlock &bb) {
+ this->block = this->newSelectionBlock(&bb);
+ this->blockList.push_back(this->block);
+ }
+
+ SelectionInstruction *Selection::Opaque::appendInsn(SelectionOpcode opcode,
+ uint32_t dstNum,
+ uint32_t srcNum)
+ {
+ GBE_ASSERT(dstNum <= SelectionInstruction::MAX_DST_NUM && srcNum <= SelectionInstruction::MAX_SRC_NUM);
+ GBE_ASSERT(this->block != NULL);
+ SelectionInstruction *insn = this->create(opcode, dstNum, srcNum);
+ if (this->bwdCodeGeneration)
+ this->bwdList.push_back(insn);
+ else
+ this->block->append(insn);
+ insn->state = this->curr;
+ return insn;
+ }
+
+ SelectionVector *Selection::Opaque::appendVector(void) {
+ GBE_ASSERT(this->block != NULL);
+ SelectionVector *vector = this->newSelectionVector();
+
+ if (this->bwdCodeGeneration)
+ vector->insn = this->bwdList.back();
+ else
+ vector->insn = this->block->insnList.back();
+ this->block->append(vector);
+ this->vectorNum++;
+ return vector;
+ }
+
+ bool Selection::Opaque::spillRegs(const SpilledRegs &spilledRegs,
+ uint32_t registerPool) {
+ GBE_ASSERT(registerPool != 0);
+
+ for (auto &block : blockList)
+ for (auto &insn : block.insnList) {
+ // spill / unspill insn should be skipped when do spilling
+ if(insn.opcode == SEL_OP_SPILL_REG
+ || insn.opcode == SEL_OP_UNSPILL_REG)
+ continue;
+ const int simdWidth = insn.state.execWidth;
+
+ const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
+ struct RegSlot {
+ RegSlot(ir::Register _reg, uint8_t _srcID,
+ uint8_t _poolOffset, bool _isTmp, uint32_t _addr)
+ : reg(_reg), srcID(_srcID), poolOffset(_poolOffset), isTmpReg(_isTmp), addr(_addr)
+ {};
+ ir::Register reg;
+ union {
+ uint8_t srcID;
+ uint8_t dstID;
+ };
+ uint8_t poolOffset;
+ bool isTmpReg;
+ int32_t addr;
+ };
+ uint8_t poolOffset = 1; // keep one for scratch message header
+ vector <struct RegSlot> regSet;
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const GenRegister selReg = insn.src(srcID);
+ const ir::Register reg = selReg.reg();
+ auto it = spilledRegs.find(reg);
+ if(it != spilledRegs.end()
+ && selReg.file == GEN_GENERAL_REGISTER_FILE
+ && selReg.physical == 0) {
+ ir::RegisterFamily family = getRegisterFamily(reg);
+ if(family == ir::FAMILY_QWORD && poolOffset == 1) {
+ poolOffset += simdWidth / 8; // qword register fill could not share the scratch read message payload register
+ }
+ struct RegSlot regSlot(reg, srcID, poolOffset,
+ it->second.isTmpReg,
+ it->second.addr);
+ if(family == ir::FAMILY_QWORD) {
+ poolOffset += 2 * simdWidth / 8;
+ } else {
+ poolOffset += simdWidth / 8;
+ }
+ regSet.push_back(regSlot);
+ }
+ }
+
+ if (poolOffset > ctx.reservedSpillRegs) {
+ if (GBE_DEBUG)
+ std::cerr << "Instruction (#" << (uint32_t)insn.opcode
+ << ") src too large pooloffset "
+ << (uint32_t)poolOffset << std::endl;
+ return false;
+ }
+ // FIXME, to support post register allocation scheduling,
+ // put all the reserved register to the spill/unspill's destination registers.
+ // This is not the best way. We need to refine the spill/unspill instruction to
+ // only use passed in registers and don't access hard coded offset in the future.
+ while(!regSet.empty()) {
+ struct RegSlot regSlot = regSet.back();
+ regSet.pop_back();
+ const GenRegister selReg = insn.src(regSlot.srcID);
+ if (!regSlot.isTmpReg) {
+ /* For temporary registers, we don't need to unspill. */
+ SelectionInstruction *unspill = this->create(SEL_OP_UNSPILL_REG,
+ 1 + (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(), 0);
+ unspill->state = GenInstructionState(simdWidth);
+ unspill->state.noMask = 1;
+ unspill->dst(0) = GenRegister(GEN_GENERAL_REGISTER_FILE,
+ registerPool + regSlot.poolOffset, 0,
+ selReg.type, selReg.vstride,
+ selReg.width, selReg.hstride);
+ for(uint32_t i = 1; i < 1 + (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(); i++)
+ unspill->dst(i) = ctx.getSimdWidth() == 8 ?
+ GenRegister::vec8(GEN_GENERAL_REGISTER_FILE, registerPool + (i - 1), 0 ) :
+ GenRegister::vec16(GEN_GENERAL_REGISTER_FILE, registerPool + (i - 1) * 2, 0);
+ unspill->extra.scratchOffset = regSlot.addr + selReg.quarter * 4 * simdWidth;
+ unspill->extra.scratchMsgHeader = registerPool;
+ insn.prepend(*unspill);
+ }
+
+ GenRegister src = insn.src(regSlot.srcID);
+ // change nr/subnr, keep other register settings
+ src.nr = registerPool + regSlot.poolOffset; src.subnr = 0; src.physical = 1;
+ insn.src(regSlot.srcID) = src;
+ };
+
+ /*
+ To save one register, registerPool + 1 was used by both
+ the src0 as source and other operands as payload. To avoid
+ side effect, we use a stack model to push all operands
+ register, and spill the 0th dest at last. As all the spill
+ will be append to the current instruction. Then the last spill
+ instruction will be the first instruction after current
+ instruction. Thus the registerPool + 1 still contain valid
+ data.
+ */
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const GenRegister selReg = insn.dst(dstID);
+ const ir::Register reg = selReg.reg();
+ auto it = spilledRegs.find(reg);
+ if(it != spilledRegs.end()
+ && selReg.file == GEN_GENERAL_REGISTER_FILE
+ && selReg.physical == 0) {
+ ir::RegisterFamily family = getRegisterFamily(reg);
+ if(family == ir::FAMILY_QWORD && poolOffset == 1) {
+ poolOffset += simdWidth / 8; // qword register spill could not share the scratch write message payload register
+ }
+ struct RegSlot regSlot(reg, dstID, poolOffset,
+ it->second.isTmpReg,
+ it->second.addr);
+ if (family == ir::FAMILY_QWORD) poolOffset += 2 * simdWidth / 8;
+ else poolOffset += simdWidth / 8;
+ regSet.push_back(regSlot);
+ }
+ }
+
+ if (poolOffset > ctx.reservedSpillRegs){
+ if (GBE_DEBUG)
+ std::cerr << "Instruction (#" << (uint32_t)insn.opcode
+ << ") dst too large pooloffset "
+ << (uint32_t)poolOffset << std::endl;
+ return false;
+ }
+ while(!regSet.empty()) {
+ struct RegSlot regSlot = regSet.back();
+ regSet.pop_back();
+ const GenRegister selReg = insn.dst(regSlot.dstID);
+ if(!regSlot.isTmpReg) {
+ /* For temporary registers, we don't need to unspill. */
+ SelectionInstruction *spill = this->create(SEL_OP_SPILL_REG,
+ (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth() , 1);
+ spill->state = insn.state;//GenInstructionState(simdWidth);
+ spill->state.accWrEnable = 0;
+ spill->state.saturate = 0;
+ if (insn.opcode == SEL_OP_SEL)
+ spill->state.predicate = GEN_PREDICATE_NONE;
+ spill->src(0) = GenRegister(GEN_GENERAL_REGISTER_FILE,
+ registerPool + regSlot.poolOffset, 0,
+ selReg.type, selReg.vstride,
+ selReg.width, selReg.hstride);
+ spill->extra.scratchOffset = regSlot.addr + selReg.quarter * 4 * simdWidth;
+ spill->extra.scratchMsgHeader = registerPool;
+ for(uint32_t i = 0; i < 0 + (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(); i++)
+ spill->dst(i) = ctx.getSimdWidth() == 8 ?
+ GenRegister::vec8(GEN_GENERAL_REGISTER_FILE, registerPool + (i), 0 ) :
+ GenRegister::vec16(GEN_GENERAL_REGISTER_FILE, registerPool + (i) * 2, 0);
+ insn.append(*spill);
+ }
+
+ GenRegister dst = insn.dst(regSlot.dstID);
+ // change nr/subnr, keep other register settings
+ dst.physical =1; dst.nr = registerPool + regSlot.poolOffset; dst.subnr = 0;
+ insn.dst(regSlot.dstID)= dst;
+ }
+ }
+ return true;
+ }
+
+ ir::Register Selection::Opaque::replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
+ SelectionBlock *block = insn->parent;
+ const uint32_t simdWidth = insn->state.execWidth;
+ ir::Register tmp;
+ GenRegister gr;
+
+ // This will append the temporary register in the instruction block
+ this->block = block;
+ tmp = this->reg(ir::getFamily(type), simdWidth == 1);
+ gr = this->selReg(tmp, type);
+ if (needMov) {
+ // Generate the MOV instruction and replace the register in the instruction
+ SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
+ mov->src(0) = GenRegister::retype(insn->src(regID), gr.type);
+ mov->state = GenInstructionState(simdWidth);
+ if (this->isScalarReg(insn->src(regID).reg()))
+ mov->state.noMask = 1;
+ mov->dst(0) = gr;
+ insn->prepend(*mov);
+ }
+ insn->src(regID) = gr;
+
+ return tmp;
+ }
+
+ ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
+ SelectionBlock *block = insn->parent;
+ uint32_t simdWidth;
+ if (!GenRegister::isNull(insn->dst(regID)))
+ simdWidth = this->isScalarReg(insn->dst(regID).reg()) ? 1 : insn->state.execWidth;
+ else {
+ GBE_ASSERT(needMov == false);
+ simdWidth = insn->state.execWidth;
+ }
+ ir::Register tmp;
+ GenRegister gr;
+ this->block = block;
+ tmp = this->reg(ir::getFamily(type));
+ gr = this->selReg(tmp, type);
+ if (needMov) {
+ // Generate the MOV instruction and replace the register in the instruction
+ SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
+ mov->dst(0) = GenRegister::retype(insn->dst(regID), gr.type);
+ mov->state = GenInstructionState(simdWidth);
+ if (simdWidth == 1) {
+ mov->state.noMask = 1;
+ mov->src(0) = GenRegister::retype(GenRegister::vec1(GEN_GENERAL_REGISTER_FILE, gr.reg()), gr.type);
+ } else
+ mov->src(0) = gr;
+ insn->append(*mov);
+ }
+ insn->dst(regID) = gr;
+ return tmp;
+ }
+
+#define SEL_REG(SIMD16, SIMD8, SIMD1) \
+ if (ctx.sel->isScalarReg(reg) == true) \
+ return GenRegister::retype(GenRegister::SIMD1(reg), genType); \
+ else if (simdWidth == 8) \
+ return GenRegister::retype(GenRegister::SIMD8(reg), genType); \
+ else { \
+ GBE_ASSERT (simdWidth == 16); \
+ return GenRegister::retype(GenRegister::SIMD16(reg), genType); \
+ }
+
+ GenRegister Selection::Opaque::selReg(ir::Register reg, ir::Type type) const {
+ using namespace ir;
+ const uint32_t genType = getGenType(type);
+ const uint32_t simdWidth = ctx.getSimdWidth();
+ const RegisterData data = file.get(reg);
+ const RegisterFamily family = data.family;
+ switch (family) {
+ case FAMILY_BOOL: SEL_REG(uw16grf, uw8grf, uw1grf); break;
+ case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
+ case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
+ case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
+ case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
+ default: NOT_SUPPORTED;
+ }
+ GBE_ASSERT(false);
+ return GenRegister();
+ }
+
+#undef SEL_REG
+
+ GenRegister Selection::Opaque::selRegQn(ir::Register reg, uint32_t q, ir::Type type) const {
+ GenRegister sreg = this->selReg(reg, type);
+ sreg.quarter = q;
+ return sreg;
+ }
+
+ /*! Syntactic sugar for method declaration */
+ typedef const GenRegister &Reg;
+
+ void Selection::Opaque::LABEL(ir::LabelIndex index) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_LABEL, 0, 0);
+ insn->index = uint16_t(index);
+ }
+
+ void Selection::Opaque::BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 1, 1);
+ insn->src(0) = src;
+ insn->dst(0) = fence;
+ insn->extra.barrierType = barrierType;
+ }
+
+ void Selection::Opaque::FENCE(GenRegister dst) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_FENCE, 1, 0);
+ insn->dst(0) = dst;
+ }
+
+ int Selection::Opaque::JMPI(Reg src, ir::LabelIndex index, ir::LabelIndex origin) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_JMPI, 0, 1);
+ insn->src(0) = src;
+ insn->index = uint16_t(index);
+ insn->extra.longjmp = abs(index - origin) > 800;
+ return insn->extra.longjmp ? 2 : 1;
+ }
+
+ void Selection::Opaque::BRD(Reg src, ir::LabelIndex jip) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_BRD, 0, 1);
+ insn->src(0) = src;
+ insn->index = uint16_t(jip);
+ }
+
+ void Selection::Opaque::BRC(Reg src, ir::LabelIndex jip, ir::LabelIndex uip) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_BRC, 0, 1);
+ insn->src(0) = src;
+ insn->index = uint16_t(jip);
+ insn->index1 = uint16_t(uip);
+ }
+
+ void Selection::Opaque::IF(Reg src, ir::LabelIndex jip, ir::LabelIndex uip) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1);
+ insn->src(0) = src;
+ insn->index = uint16_t(jip);
+ insn->index1 = uint16_t(uip);
+ }
+
+ void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip) {
+ this->block->endifLabel = this->newAuxLabel();
+ this->LABEL(this->block->endifLabel);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1);
+ insn->src(0) = src;
+ insn->index = uint16_t(this->block->endifLabel);
+ }
+
+ void Selection::Opaque::CMP(uint32_t conditional, Reg src0, Reg src1, Reg dst) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_CMP, 1, 2);
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ insn->dst(0) = dst;
+ insn->extra.function = conditional;
+ }
+
+ void Selection::Opaque::SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_SEL_CMP, 1, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ insn->extra.function = conditional;
+ }
+ void Selection::Opaque::INDIRECT_MOVE(Reg dst, Reg src) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_INDIRECT_MOVE, 1, 1);
+ insn->dst(0) = dst;
+ insn->src(0) = src;
+ }
+
+ void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
+ uint32_t srcNum, Reg src0,
+ Reg src1, Reg src2, uint32_t bti) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, 1, srcNum);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ if(srcNum > 1) insn->src(1) = src1;
+ if(srcNum > 2) insn->src(2) = src2;
+ insn->extra.function = function;
+ insn->setbti(bti);
+ SelectionVector *vector = this->appendVector();
+
+ vector->regNum = srcNum;
+ vector->reg = &insn->src(0);
+ vector->isSrc = 1;
+ }
+
+ void Selection::Opaque::EOT(void) { this->appendInsn(SEL_OP_EOT, 0, 0); }
+ void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
+ void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
+
+ void Selection::Opaque::READ64(Reg addr,
+ const GenRegister *dst,
+ uint32_t elemNum,
+ uint32_t bti)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum, 1);
+ SelectionVector *srcVector = this->appendVector();
+ SelectionVector *dstVector = this->appendVector();
+
+ // Regular instruction to encode
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+ insn->dst(elemID) = dst[elemID];
+ insn->src(0) = addr;
+ insn->setbti(bti);
+ insn->extra.elem = elemNum;
+
+ dstVector->regNum = elemNum;
+ dstVector->isSrc = 0;
+ dstVector->reg = &insn->dst(0);
+
+ srcVector->regNum = 1;
+ srcVector->isSrc = 1;
+ srcVector->reg = &insn->src(0);
+ }
+
+ void Selection::Opaque::UNTYPED_READ(Reg addr,
+ const GenRegister *dst,
+ uint32_t elemNum,
+ uint32_t bti)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, elemNum, 1);
+ SelectionVector *srcVector = this->appendVector();
+ SelectionVector *dstVector = this->appendVector();
+ if (this->isScalarReg(dst[0].reg()))
+ insn->state.noMask = 1;
+ // Regular instruction to encode
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+ insn->dst(elemID) = dst[elemID];
+ insn->src(0) = addr;
+ insn->setbti(bti);
+ insn->extra.elem = elemNum;
+
+ // Sends require contiguous allocation
+ dstVector->regNum = elemNum;
+ dstVector->isSrc = 0;
+ dstVector->reg = &insn->dst(0);
+
+ srcVector->regNum = 1;
+ srcVector->isSrc = 1;
+ srcVector->reg = &insn->src(0);
+ }
+
+ void Selection::Opaque::WRITE64(Reg addr,
+ const GenRegister *src,
+ uint32_t srcNum,
+ uint32_t bti)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, srcNum + 1);
+ SelectionVector *vector = this->appendVector();
+
+ // Regular instruction to encode
+ insn->src(0) = addr;
+ for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+ insn->src(elemID + 1) = src[elemID];
+
+ insn->setbti(bti);
+ insn->extra.elem = srcNum;
+
+ vector->regNum = srcNum + 1;
+ vector->reg = &insn->src(0);
+ vector->isSrc = 1;
+ }
+
+ void Selection::Opaque::UNTYPED_WRITE(Reg addr,
+ const GenRegister *src,
+ uint32_t elemNum,
+ uint32_t bti)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, 0, elemNum+1);
+ SelectionVector *vector = this->appendVector();
+
+ // Regular instruction to encode
+ insn->src(0) = addr;
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+ insn->src(elemID+1) = src[elemID];
+ insn->setbti(bti);
+ insn->extra.elem = elemNum;
+
+ // Sends require contiguous allocation for the sources
+ vector->regNum = elemNum+1;
+ vector->reg = &insn->src(0);
+ vector->isSrc = 1;
+ }
+
+ void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHER, 1, 1);
+ SelectionVector *srcVector = this->appendVector();
+ SelectionVector *dstVector = this->appendVector();
+
+ if (this->isScalarReg(dst.reg()))
+ insn->state.noMask = 1;
+ // Instruction to encode
+ insn->src(0) = addr;
+ insn->dst(0) = dst;
+ insn->setbti(bti);
+ insn->extra.elem = elemSize;
+
+ // byte gather requires vector in the sense that scalar are not allowed
+ // (yet)
+ dstVector->regNum = 1;
+ dstVector->isSrc = 0;
+ dstVector->reg = &insn->dst(0);
+ srcVector->regNum = 1;
+ srcVector->isSrc = 1;
+ srcVector->reg = &insn->src(0);
+ }
+
+ void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, 0, 2);
+ SelectionVector *vector = this->appendVector();
+
+ // Instruction to encode
+ insn->src(0) = addr;
+ insn->src(1) = src;
+ insn->setbti(bti);
+ insn->extra.elem = elemSize;
+
+ // value and address are contiguous in the send
+ vector->regNum = 2;
+ vector->isSrc = 1;
+ vector->reg = &insn->src(0);
+ }
+
+ void Selection::Opaque::DWORD_GATHER(Reg dst, Reg addr, uint32_t bti) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_DWORD_GATHER, 1, 1);
+ SelectionVector *vector = this->appendVector();
+ SelectionVector *srcVector = this->appendVector();
+
+ if (this->isScalarReg(dst.reg()))
+ insn->state.noMask = 1;
+ insn->src(0) = addr;
+ insn->dst(0) = dst;
+ insn->setbti(bti);
+ vector->regNum = 1;
+ vector->isSrc = 0;
+ vector->reg = &insn->dst(0);
+ srcVector->regNum = 1;
+ srcVector->isSrc = 1;
+ srcVector->reg = &insn->src(0);
+ }
+
+ void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_UNPACK_BYTE, elemNum, 1);
+ insn->src(0) = src;
+ for(uint32_t i = 0; i < elemNum; i++)
+ insn->dst(i) = dst[i];
+ }
+ void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_PACK_BYTE, 1, elemNum);
+ for(uint32_t i = 0; i < elemNum; i++)
+ insn->src(i) = src[i];
+ insn->dst(0) = dst;
+ }
+
+ void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src0, Reg src1) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_MATH, 1, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ insn->extra.function = function;
+ }
+
+ void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_MATH, 1, 1);
+ insn->dst(0) = dst;
+ insn->src(0) = src;
+ insn->extra.function = function;
+ }
+
+ void Selection::Opaque::I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MUL, 7, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i = 0; i < 6; i++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ void Selection::Opaque::I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64DIV, 14, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i = 0; i < 13; i++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ void Selection::Opaque::I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64REM, 14, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i = 0; i < 13; i++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
+ SelectionInstruction *insn = this->appendInsn(opcode, 1, 1);
+ insn->dst(0) = dst;
+ insn->src(0) = src;
+ }
+
+ void Selection::Opaque::ALU1WithTemp(SelectionOpcode opcode, Reg dst, Reg src, Reg temp) {
+ SelectionInstruction *insn = this->appendInsn(opcode, 2, 1);
+ insn->dst(0) = dst;
+ insn->src(0) = src;
+ insn->dst(1) = temp;
+ }
+
+ void Selection::Opaque::ALU2(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1) {
+ SelectionInstruction *insn = this->appendInsn(opcode, 1, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ }
+
+ void Selection::Opaque::ALU2WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg temp) {
+ SelectionInstruction *insn = this->appendInsn(opcode, 2, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ insn->dst(1) = temp;
+ }
+
+ void Selection::Opaque::ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2) {
+ SelectionInstruction *insn = this->appendInsn(opcode, 1, 3);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ insn->src(2) = src2;
+ }
+
+ void Selection::Opaque::I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64CMP, 3, 2);
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i=0; i<3; i++)
+ insn->dst(i) = tmp[i];
+ insn->extra.function = conditional;
+ }
+
+ void Selection::Opaque::I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64SATADD, 6, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i=0; i<5; i++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ void Selection::Opaque::I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64SATSUB, 6, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i=0; i<5; i++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ void Selection::Opaque::CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[6]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVI64_TO_F, 7, 1);
+ insn->dst(0) = dst;
+ insn->src(0) = src;
+ for(int i = 0; i < 6; i ++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ void Selection::Opaque::CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[2]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVF_TO_I64, 3, 1);
+ insn->dst(0) = dst;
+ insn->src(0) = src;
+ for(int i = 0; i < 2; i ++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ void Selection::Opaque::I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[9]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MADSAT, 10, 3);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ insn->src(2) = src2;
+ for(int i = 0; i < 9; i ++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ void Selection::Opaque::I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[9]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64_MUL_HI, 10, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i = 0; i < 9; i ++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ void Selection::Opaque::I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64HADD, 5, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i = 0; i < 4; i ++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ void Selection::Opaque::I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64RHADD, 5, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i = 0; i < 4; i ++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ void Selection::Opaque::I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+ SelectionInstruction *insn = this->appendInsn(opcode, 7, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i = 0; i < 6; i ++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ // Boiler plate to initialize the selection library at c++ pre-main
+ static SelectionLibrary *selLib = NULL;
+ static void destroySelectionLibrary(void) { GBE_DELETE(selLib); }
+ static struct SelectionLibraryInitializer {
+ SelectionLibraryInitializer(void) {
+ selLib = GBE_NEW_NO_ARG(SelectionLibrary);
+ atexit(destroySelectionLibrary);
+ }
+ } selectionLibraryInitializer;
+
+ bool Selection::Opaque::isRoot(const ir::Instruction &insn) const {
+ if (insn.getDstNum() > 1 ||
+ insn.hasSideEffect() ||
+ insn.isMemberOf<ir::BranchInstruction>() ||
+ insn.isMemberOf<ir::LabelInstruction>())
+ return true;
+
+ // No side effect, not a branch and no destination? Impossible
+ GBE_ASSERT(insn.getDstNum() == 1);
+
+ // Root if alive outside the block.
+ // XXX we should use Value and not registers in liveness info
+ const ir::BasicBlock *insnBlock = insn.getParent();
+ const ir::Liveness &liveness = this->ctx.getLiveness();
+ const ir::Liveness::LiveOut &liveOut = liveness.getLiveOut(insnBlock);
+ const ir::Register reg = insn.getDst(0);
+ if (liveOut.contains(reg))
+ return true;
+
+ // The instruction is only used in the current basic block
+ return false;
+ }
+
+ uint32_t Selection::Opaque::buildBasicBlockDAG(const ir::BasicBlock &bb)
+ {
+ using namespace ir;
+
+ // Clear all registers
+ for (uint32_t regID = 0; regID < this->regNum; ++regID)
+ this->regDAG[regID] = NULL;
+
+ this->block->hasBarrier = false;
+ this->block->hasBranch = bb.getLastInstruction()->getOpcode() == OP_BRA ||
+ bb.getLastInstruction()->getOpcode() == OP_RET;
+ if (!this->block->hasBranch)
+ this->block->endifOffset = -1;
+
+ // Build the DAG on the fly
+ uint32_t insnNum = 0;
+ const_cast<BasicBlock&>(bb).foreach([&](const Instruction &insn) {
+ if (insn.getOpcode() == OP_SYNC)
+ this->block->hasBarrier = true;
+
+ // Build a selectionDAG node for instruction
+ SelectionDAG *dag = this->newSelectionDAG(insn);
+
+ // Point to non-root children
+ const uint32_t srcNum = insn.getSrcNum();
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const ir::Register reg = insn.getSrc(srcID);
+ SelectionDAG *child = this->regDAG[reg];
+ if (child) {
+ const ir::Instruction &childInsn = child->insn;
+ const uint32_t childSrcNum = childInsn.getSrcNum();
+
+ // We can merge a child only if its sources are still valid
+ bool mergeable = true;
+ for (uint32_t otherID = 0; otherID < childSrcNum; ++otherID) {
+ const SelectionDAG *srcDAG = child->child[otherID];
+ const ir::Register srcReg = childInsn.getSrc(otherID);
+ SelectionDAG *currDAG = this->regDAG[srcReg];
+ if (srcDAG != currDAG) {
+ mergeable = false;
+ break;
+ }
+ }
+ if (mergeable) dag->setAsMergeable(srcID);
+ dag->child[srcID] = child;
+ // Check whether this bool is used as a normal source
+ // oprand other than BRA/SEL.
+ if (getRegisterFamily(reg) == FAMILY_BOOL) {
+ if (insn.getOpcode() != OP_BRA &&
+ (insn.getOpcode() != OP_SEL ||
+ (insn.getOpcode() == OP_SEL && srcID != 0)))
+ child->computeBool = true;
+ }
+ child->isUsed = true;
+ } else
+ dag->child[srcID] = NULL;
+ }
+
+ // Make it a root if we must
+ if (this->isRoot(insn)) dag->isRoot = 1;
+
+ // Save the DAG <-> instruction mapping
+ this->insnDAG[insnNum++] = dag;
+
+ // Associate all output registers to this instruction
+ const uint32_t dstNum = insn.getDstNum();
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const ir::Register reg = insn.getDst(dstID);
+ this->regDAG[reg] = dag;
+ }
+ });
+
+ return insnNum;
+ }
+
+ void Selection::Opaque::matchBasicBlock(const ir::BasicBlock &bb, uint32_t insnNum)
+ {
+ // Bottom up code generation
+ bool needEndif = this->block->hasBranch == false && !this->block->hasBarrier;
+
+ if(needEndif) {
+ const ir::BasicBlock *next = bb.getNextBlock();
+ this->ENDIF(GenRegister::immd(0), next->getLabelIndex());
+ }
+
+ for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
+ // Process all possible patterns for this instruction
+ SelectionDAG &dag = *insnDAG[insnID];
+ if (dag.isRoot) {
+ const ir::Instruction &insn = dag.insn;
+ const ir::Opcode opcode = insn.getOpcode();
+ auto it = selLib->patterns[opcode].begin();
+ const auto end = selLib->patterns[opcode].end();
+
+ // Start a new code fragment
+ this->startBackwardGeneration();
+ // If there is no branch at the end of this block.
+
+ // Try all the patterns from best to worst
+ do {
+ if ((*it)->emit(*this, dag))
+ break;
+ ++it;
+ } while (it != end);
+ GBE_ASSERT(it != end);
+ // If we are in if/endif fix mode, and this block is
+ // large enough, we need to insert endif/if pair to eliminate
+ // the too long if/endif block.
+ if (this->ctx.getIFENDIFFix() &&
+ this->block->insnList.size() != 0 &&
+ this->block->insnList.size() % 1000 == 0 &&
+ (uint16_t)this->block->endifLabel != 0) {
+ ir::LabelIndex jip = this->block->endifLabel;
+ this->ENDIF(GenRegister::immd(0), jip);
+ this->push();
+ this->curr.predicate = GEN_PREDICATE_NORMAL;
+ this->IF(GenRegister::immd(0), jip, jip);
+ this->pop();
+ this->block->isLargeBlock = true;
+ }
+
+ // Output the code in the current basic block
+ this->endBackwardGeneration();
+ }
+ }
+ }
+
+ void Selection::Opaque::select(void)
+ {
+ using namespace ir;
+ const Function &fn = ctx.getFunction();
+
+ // Perform the selection per basic block
+ fn.foreachBlock([&](const BasicBlock &bb) {
+ this->dagPool.rewind();
+ this->appendBlock(bb);
+ const uint32_t insnNum = this->buildBasicBlockDAG(bb);
+ this->matchBasicBlock(bb, insnNum);
+ });
+ }
+
+ void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum,
+ GenRegister *msgPayloads, uint32_t msgNum,
+ uint32_t bti, uint32_t sampler, bool isLD, bool isUniform) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum);
+ SelectionVector *dstVector = this->appendVector();
+ SelectionVector *msgVector = this->appendVector();
+
+ // Regular instruction to encode
+ for (uint32_t elemID = 0; elemID < dstNum; ++elemID)
+ insn->dst(elemID) = dst[elemID];
+ for (uint32_t elemID = 0; elemID < msgNum; ++elemID)
+ insn->src(elemID) = msgPayloads[elemID];
+
+ // Sends require contiguous allocation
+ dstVector->regNum = dstNum;
+ dstVector->isSrc = 0;
+ dstVector->reg = &insn->dst(0);
+
+ // Only the messages require contiguous registers.
+ msgVector->regNum = msgNum;
+ msgVector->isSrc = 1;
+ msgVector->reg = &insn->src(0);
+
+ insn->setbti(bti);
+ insn->extra.sampler = sampler;
+ insn->extra.rdmsglen = msgNum;
+ insn->extra.isLD = isLD;
+ insn->extra.isUniform = isUniform;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Code selection public implementation
+ ///////////////////////////////////////////////////////////////////////////
+
+ Selection::Selection(GenContext &ctx) {
+ this->blockList = NULL;
+ this->opaque = GBE_NEW(Selection::Opaque, ctx);
+ }
+
+ Selection75::Selection75(GenContext &ctx) : Selection(ctx) {
+ this->opaque->setPatchSLMAddr(true);
+ }
+
+ void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
+ uint32_t bti, bool is3D) {
+ uint32_t elemID = 0;
+ uint32_t i;
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum);
+ SelectionVector *msgVector = this->appendVector();;
+
+ for( i = 0; i < msgNum; ++i, ++elemID)
+ insn->src(elemID) = msgs[i];
+
+ insn->setbti(bti);
+ insn->extra.msglen = msgNum;
+ insn->extra.is3DWrite = is3D;
+ // Sends require contiguous allocation
+ msgVector->regNum = msgNum;
+ msgVector->isSrc = 1;
+ msgVector->reg = &insn->src(0);
+ }
+
+ Selection::~Selection(void) { GBE_DELETE(this->opaque); }
+
+ void Selection::select(void) {
+ this->opaque->select();
+ this->blockList = &this->opaque->blockList;
+ }
+
+ uint32_t Selection::getLargestBlockSize(void) const {
+ return this->opaque->getLargestBlockSize();
+ }
+
+ uint32_t Selection::getVectorNum(void) const {
+ return this->opaque->getVectorNum();
+ }
+
+ uint32_t Selection::getRegNum(void) const {
+ return this->opaque->getRegNum();
+ }
+
+ ir::RegisterFamily Selection::getRegisterFamily(ir::Register reg) const {
+ return this->opaque->getRegisterFamily(reg);
+ }
+
+ ir::RegisterData Selection::getRegisterData(ir::Register reg) const {
+ return this->opaque->getRegisterData(reg);
+ }
+
+ ir::Register Selection::replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
+ return this->opaque->replaceSrc(insn, regID, type, needMov);
+ }
+
+ ir::Register Selection::replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
+ return this->opaque->replaceDst(insn, regID, type, needMov);
+ }
+ bool Selection::spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool) {
+ return this->opaque->spillRegs(spilledRegs, registerPool);
+ }
+
+ bool Selection::isScalarReg(const ir::Register ®) const {
+ return this->opaque->isScalarReg(reg);
+ }
+
+ SelectionInstruction *Selection::create(SelectionOpcode opcode, uint32_t dstNum, uint32_t srcNum) {
+ return this->opaque->create(opcode, dstNum, srcNum);
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Implementation of all patterns
+ ///////////////////////////////////////////////////////////////////////////
+
+ bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
+ using namespace ir;
+ const auto &childInsn = cast<LoadImmInstruction>(insn);
+ const auto &imm = childInsn.getImmediate();
+ if(imm.getType() != TYPE_DOUBLE && imm.getType() != TYPE_S64 && imm.getType() != TYPE_U64)
+ return true;
+ return false;
+ }
+
+ GenRegister getRegisterFromImmediate(ir::Immediate imm, ir::Type type, bool negate = false)
+ {
+ using namespace ir;
+ int sign = negate ? -1 : 1;
+ switch (type) {
+ case TYPE_U32: return GenRegister::immud(imm.getIntegerValue() * sign);
+ case TYPE_S32: return GenRegister::immd(imm.getIntegerValue() * sign);
+ case TYPE_FLOAT: return GenRegister::immf(imm.getFloatValue() * sign);
+ case TYPE_U16: return GenRegister::immuw(imm.getIntegerValue() * sign);
+ case TYPE_S16: return GenRegister::immw((int16_t)imm.getIntegerValue() * sign);
+ case TYPE_U8: return GenRegister::immuw(imm.getIntegerValue() * sign);
+ case TYPE_S8: return GenRegister::immw((int8_t)imm.getIntegerValue() * sign);
+ case TYPE_DOUBLE: return GenRegister::immdf(imm.getDoubleValue() * sign);
+ case TYPE_BOOL: return GenRegister::immuw(-imm.getIntegerValue()); //return 0xffff when true
+ default: NOT_SUPPORTED; return GenRegister::immuw(0);
+ }
+ }
+
+ BVAR(OCL_OPTIMIZE_IMMEDIATE, true);
+ void Selection::Opaque::getSrcGenRegImm(SelectionDAG &dag,
+ SelectionDAG *dag0, SelectionDAG *dag1,
+ GenRegister &src0, GenRegister &src1,
+ ir::Type type, bool &inverse) {
+ using namespace ir;
+ inverse = false;
+ // Right source can always be an immediate
+ const int src0Index = dag.insn.isMemberOf<SelectInstruction>() ? SelectInstruction::src0Index : 0;
+ const int src1Index = dag.insn.isMemberOf<SelectInstruction>() ? SelectInstruction::src1Index : 1;
+ if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
+ canGetRegisterFromImmediate(dag1->insn)) {
+ const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
+ src0 = this->selReg(dag.insn.getSrc(src0Index), type);
+ src1 = getRegisterFromImmediate(childInsn.getImmediate(), type);
+ if (dag0) dag0->isRoot = 1;
+ }
+ // Left source cannot be immediate but it is OK if we can commute
+ else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && dag.insn.isMemberOf<BinaryInstruction>() &&
+ ((cast<BinaryInstruction>(dag.insn)).commutes() || dag.insn.getOpcode() == OP_SUB) &&
+ dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) {
+ const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
+ src0 = dag.insn.getOpcode() != OP_SUB ?
+ this->selReg(dag.insn.getSrc(src1Index), type) :
+ GenRegister::negate(this->selReg(dag.insn.getSrc(src1Index), type));
+ Immediate imm = childInsn.getImmediate();
+ src1 = getRegisterFromImmediate(imm, type, dag.insn.getOpcode() == OP_SUB);
+ if (dag1) dag1->isRoot = 1;
+ }
+ // If it's a compare instruction, theoritically, we can easily revert the condition code to
+ // switch the two operands. But we can't do that for float due to the NaN's exist.
+ // For a normal select instruction, we can always inverse the predication to switch the two
+ // operands' position.
+ else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
+ dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn) &&
+ ((dag.insn.isMemberOf<CompareInstruction>() && type != TYPE_FLOAT && type != TYPE_DOUBLE) ||
+ (dag.insn.isMemberOf<SelectInstruction>()))) {
+ const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
+ src0 = this->selReg(dag.insn.getSrc(src1Index), type);
+ src1 = getRegisterFromImmediate(childInsn.getImmediate(), type);
+ inverse = true;
+ if (dag1) dag1->isRoot = 1;
+ }
+ // Just grab the two sources
+ else {
+ src0 = this->selReg(dag.insn.getSrc(src0Index), type);
+ src1 = this->selReg(dag.insn.getSrc(src1Index), type);
+ markAllChildren(dag);
+ }
+ }
+
+ void Selection::Opaque::getSrcGenRegImm(SelectionDAG &dag, GenRegister &src0,
+ GenRegister &src1, ir::Type type,
+ bool &inverse) {
+ SelectionDAG *dag0 = dag.child[0];
+ SelectionDAG *dag1 = dag.child[1];
+ getSrcGenRegImm(dag, dag0, dag1, src0, src1, type, inverse);
+ }
+
+
+ /*! Template for the one-to-many instruction patterns */
+ template <typename T, typename U>
+ class OneToManyPattern : public SelectionPattern
+ {
+ public:
+ /*! Register the pattern for all opcodes of the family */
+ OneToManyPattern(uint32_t insnNum, uint32_t cost) :
+ SelectionPattern(insnNum, cost)
+ {
+ for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+ if (ir::isOpcodeFrom<U>(ir::Opcode(op)) == true)
+ this->opcodes.push_back(ir::Opcode(op));
+ }
+ /*! Call the child method with the proper prototype */
+ virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+ bool markChildren = true;
+ if (static_cast<const T*>(this)->emitOne(sel, ir::cast<U>(dag.insn), markChildren)) {
+ if (markChildren)
+ markAllChildren(dag);
+ return true;
+ }
+ return false;
+ }
+ };
+
+/*! Declare a naive one-to-many pattern */
+#define DECL_PATTERN(FAMILY) \
+ struct FAMILY##Pattern : public OneToManyPattern<FAMILY##Pattern, ir::FAMILY>
+
+#define DECL_CTOR(FAMILY, INSN_NUM, COST) \
+ FAMILY##Pattern(void) : OneToManyPattern<FAMILY##Pattern, ir::FAMILY>(INSN_NUM, COST) {}
+
+ /*! Unary instruction patterns */
+ DECL_PATTERN(UnaryInstruction)
+ {
+ static ir::Type getType(const ir::Opcode opcode, const ir::Type insnType) {
+ if (insnType == ir::TYPE_S64 || insnType == ir::TYPE_U64 || insnType == ir::TYPE_S8 || insnType == ir::TYPE_U8)
+ return insnType;
+ if (opcode == ir::OP_FBH || opcode == ir::OP_FBL)
+ return ir::TYPE_U32;
+ if (insnType == ir::TYPE_S16 || insnType == ir::TYPE_U16)
+ return insnType;
+ if (insnType == ir::TYPE_BOOL)
+ return ir::TYPE_U16;
+ return ir::TYPE_FLOAT;
+ }
+
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::UnaryInstruction &insn, bool &markChildren) const {
+ const ir::Opcode opcode = insn.getOpcode();
+ const ir::Type insnType = insn.getType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), getType(opcode, insnType));
+ const GenRegister src = sel.selReg(insn.getSrc(0), getType(opcode, insnType));
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ switch (opcode) {
+ case ir::OP_ABS:
+ if (insn.getType() == ir::TYPE_S32) {
+ const GenRegister src_ = GenRegister::retype(src, GEN_TYPE_D);
+ const GenRegister dst_ = GenRegister::retype(dst, GEN_TYPE_D);
+ sel.MOV(dst_, GenRegister::abs(src_));
+ } else {
+ GBE_ASSERT(insn.getType() == ir::TYPE_FLOAT);
+ sel.MOV(dst, GenRegister::abs(src));
+ }
+ break;
+ case ir::OP_MOV:
+ if (dst.isdf()) {
+ ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+ sel.MOV_DF(dst, src, sel.selReg(r));
+ } else {
+ sel.push();
+ auto dag = sel.regDAG[insn.getDst(0)];
+ if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL &&
+ dag->isUsed) {
+ sel.curr.physicalFlag = 0;
+ sel.curr.flagIndex = (uint16_t)(insn.getDst(0));
+ sel.curr.modFlag = 1;
+ }
+ sel.MOV(dst, src);
+ sel.pop();
+ }
+ break;
+ case ir::OP_RNDD: sel.RNDD(dst, src); break;
+ case ir::OP_RNDE: sel.RNDE(dst, src); break;
+ case ir::OP_RNDU: sel.RNDU(dst, src); break;
+ case ir::OP_RNDZ: sel.RNDZ(dst, src); break;
+ case ir::OP_FBH: sel.FBH(dst, src); break;
+ case ir::OP_FBL: sel.FBL(dst, src); break;
+ case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
+ case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
+ case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
+ case ir::OP_EXP: sel.MATH(dst, GEN_MATH_FUNCTION_EXP, src); break;
+ case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
+ case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
+ case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
+ case ir::OP_SIMD_ANY:
+ {
+ const GenRegister constZero = GenRegister::immuw(0);;
+ const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one);
+ const GenRegister flag01 = GenRegister::flag(0, 1);
+
+ sel.push();
+ int simdWidth = sel.curr.execWidth;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.execWidth = 1;
+ sel.curr.noMask = 1;
+ sel.MOV(flag01, constZero);
+ sel.curr.execWidth = simdWidth;
+ sel.curr.noMask = 0;
+
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.CMP(GEN_CONDITIONAL_NEQ, src, constZero);
+
+ if (sel.curr.execWidth == 16)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ else if (sel.curr.execWidth == 8)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ else
+ NOT_IMPLEMENTED;
+ sel.SEL(dst, regOne, constZero);
+ sel.pop();
+ }
+ break;
+ case ir::OP_SIMD_ALL:
+ {
+ const GenRegister constZero = GenRegister::immuw(0);
+ const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one);
+ const GenRegister flag01 = GenRegister::flag(0, 1);
+
+ sel.push();
+ int simdWidth = sel.curr.execWidth;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.execWidth = 1;
+ sel.curr.noMask = 1;
+ sel.MOV(flag01, regOne);
+
+ sel.curr.execWidth = simdWidth;
+ sel.curr.noMask = 0;
+
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.CMP(GEN_CONDITIONAL_NEQ, src, constZero);
+
+ if (sel.curr.execWidth == 16)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+ else if (sel.curr.execWidth == 8)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+ else
+ NOT_IMPLEMENTED;
+ sel.SEL(dst, regOne, constZero);
+ sel.pop();
+ }
+ break;
+
+ default: NOT_SUPPORTED;
+ }
+ sel.pop();
+ return true;
+ }
+ DECL_CTOR(UnaryInstruction, 1, 1)
+ };
+
+
+ /*! Binary regular instruction pattern */
+ class BinaryInstructionPattern : public SelectionPattern
+ {
+ public:
+ BinaryInstructionPattern(void) : SelectionPattern(1,1) {
+ for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+ if (ir::isOpcodeFrom<ir::BinaryInstruction>(ir::Opcode(op)) == true)
+ this->opcodes.push_back(ir::Opcode(op));
+ }
+
+ bool emitDivRemInst(Selection::Opaque &sel, SelectionDAG &dag, ir::Opcode op) const
+ {
+ using namespace ir;
+ const ir::BinaryInstruction &insn = cast<BinaryInstruction>(dag.insn);
+ const Type type = insn.getType();
+ GenRegister dst = sel.selReg(insn.getDst(0), type);
+ GenRegister src0 = sel.selReg(insn.getSrc(0), type);
+ GenRegister src1 = sel.selReg(insn.getSrc(1), type);
+ const uint32_t simdWidth = sel.curr.execWidth;
+ const RegisterFamily family = getFamily(type);
+ uint32_t function = (op == OP_DIV)?
+ GEN_MATH_FUNCTION_INT_DIV_QUOTIENT :
+ GEN_MATH_FUNCTION_INT_DIV_REMAINDER;
+
+ //bytes and shorts must be converted to int for DIV and REM per GEN restriction
+ if((family == FAMILY_WORD || family == FAMILY_BYTE)) {
+ GenRegister tmp0, tmp1;
+ ir::Register reg = sel.reg(FAMILY_DWORD, simdWidth == 1);
+
+ tmp0 = GenRegister::udxgrf(simdWidth, reg);
+ tmp0 = GenRegister::retype(tmp0, GEN_TYPE_D);
+ sel.MOV(tmp0, src0);
+
+ tmp1 = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+ tmp1 = GenRegister::retype(tmp1, GEN_TYPE_D);
+ sel.MOV(tmp1, src1);
+
+ sel.MATH(tmp0, function, tmp0, tmp1);
+ GenRegister unpacked;
+ if(family == FAMILY_WORD) {
+ unpacked = sel.unpacked_uw(reg);
+ } else {
+ unpacked = sel.unpacked_ub(reg);
+ }
+ unpacked = GenRegister::retype(unpacked, getGenType(type));
+ sel.MOV(dst, unpacked);
+ } else if (type == TYPE_S32 || type == TYPE_U32 ) {
+ sel.MATH(dst, function, src0, src1);
+ } else if(type == TYPE_FLOAT) {
+ GBE_ASSERT(op != OP_REM);
+ sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1);
+ } else if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[13];
+ for(int i=0; i < 13; i++) {
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp[i].type = GEN_TYPE_UD;
+ }
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ if(op == OP_DIV)
+ sel.I64DIV(dst, src0, src1, tmp);
+ else
+ sel.I64REM(dst, src0, src1, tmp);
+ sel.pop();
+ }
+ markAllChildren(dag);
+ return true;
+ }
+
+ INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+ {
+ using namespace ir;
+ const ir::BinaryInstruction &insn = cast<BinaryInstruction>(dag.insn);
+ const Opcode opcode = insn.getOpcode();
+ const Type type = insn.getType();
+ GenRegister dst = sel.selReg(insn.getDst(0), type);
+
+ sel.push();
+
+ // Boolean values use scalars
+ if (sel.isScalarReg(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+
+ if(opcode == OP_DIV || opcode == OP_REM) {
+ bool ret = this->emitDivRemInst(sel, dag, opcode);
+ sel.pop();
+ return ret;
+ }
+ // Immediates not supported
+ if (opcode == OP_POW) {
+ GenRegister src0 = sel.selReg(insn.getSrc(0), type);
+ GenRegister src1 = sel.selReg(insn.getSrc(1), type);
+
+ if(type == TYPE_FLOAT) {
+ sel.MATH(dst, GEN_MATH_FUNCTION_POW, src0, src1);
+ } else {
+ NOT_IMPLEMENTED;
+ }
+ markAllChildren(dag);
+ sel.pop();
+ return true;
+ }
+
+ // Look for immediate values
+ GenRegister src0, src1;
+ bool inverse = false;
+ sel.getSrcGenRegImm(dag, src0, src1, type, inverse);
+ // Output the binary instruction
+ if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL &&
+ dag.isUsed) {
+ GBE_ASSERT(insn.getOpcode() == OP_AND ||
+ insn.getOpcode() == OP_OR ||
+ insn.getOpcode() == OP_XOR);
+ sel.curr.physicalFlag = 0;
+ sel.curr.flagIndex = (uint16_t)(insn.getDst(0));
+ sel.curr.modFlag = 1;
+ }
+
+ switch (opcode) {
+ case OP_ADD:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+ GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
+ sel.I64ADD(dst, src0, src1, t);
+ } else
+ sel.ADD(dst, src0, src1);
+ break;
+ case OP_ADDSAT:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+ GenRegister tmp[5];
+ for(int i=0; i<5; i++) {
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp[i].type = GEN_TYPE_UD;
+ }
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.I64SATADD(dst, src0, src1, tmp);
+ sel.pop();
+ break;
+ }
+ sel.push();
+ sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
+ sel.ADD(dst, src0, src1);
+ sel.pop();
+ break;
+ case OP_XOR:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+ sel.I64XOR(dst, src0, src1);
+ else
+ sel.XOR(dst, src0, src1);
+ break;
+ case OP_OR:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+ sel.I64OR(dst, src0, src1);
+ else
+ sel.OR(dst, src0, src1);
+ break;
+ case OP_AND:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+ sel.I64AND(dst, src0, src1);
+ else
+ sel.AND(dst, src0, src1);
+ break;
+ case OP_SUB:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+ GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
+ sel.I64SUB(dst, src0, src1, t);
+ } else
+ sel.ADD(dst, src0, GenRegister::negate(src1));
+ break;
+ case OP_SUBSAT:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+ GenRegister tmp[5];
+ for(int i=0; i<5; i++) {
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp[i].type = GEN_TYPE_UD;
+ }
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.I64SATSUB(dst, src0, src1, tmp);
+ sel.pop();
+ break;
+ }
+ sel.push();
+ sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
+ sel.ADD(dst, src0, GenRegister::negate(src1));
+ sel.pop();
+ break;
+ case OP_SHL:
+ if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[6];
+ for(int i = 0; i < 6; i ++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.I64SHL(dst, src0, src1, tmp);
+ sel.pop();
+ } else
+ sel.SHL(dst, src0, src1);
+ break;
+ case OP_SHR:
+ if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[6];
+ for(int i = 0; i < 6; i ++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.I64SHR(dst, src0, src1, tmp);
+ sel.pop();
+ } else
+ sel.SHR(dst, src0, src1);
+ break;
+ case OP_ASR:
+ if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[6];
+ for(int i = 0; i < 6; i ++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.I64ASR(dst, src0, src1, tmp);
+ sel.pop();
+ } else
+ sel.ASR(dst, src0, src1);
+ break;
+ case OP_MUL_HI: {
+ GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+ sel.MUL_HI(dst, src0, src1, temp);
+ break;
+ }
+ case OP_I64_MUL_HI:
+ {
+ GenRegister temp[9];
+ for(int i=0; i<9; i++) {
+ temp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ temp[i].type = GEN_TYPE_UD;
+ }
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.I64_MUL_HI(dst, src0, src1, temp);
+ sel.pop();
+ break;
+ }
+ case OP_MUL:
+ if (type == TYPE_U32 || type == TYPE_S32) {
+ sel.pop();
+ return false;
+ } else if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[6];
+ for(int i = 0; i < 6; i++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.I64MUL(dst, src0, src1, tmp);
+ } else
+ sel.MUL(dst, src0, src1);
+ break;
+ case OP_HADD: {
+ GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D);
+ sel.HADD(dst, src0, src1, temp);
+ break;
+ }
+ case OP_RHADD: {
+ GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D);
+ sel.RHADD(dst, src0, src1, temp);
+ break;
+ }
+ case OP_I64HADD:
+ {
+ GenRegister tmp[4];
+ for(int i=0; i<4; i++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.I64HADD(dst, src0, src1, tmp);
+ break;
+ }
+ case OP_I64RHADD:
+ {
+ GenRegister tmp[4];
+ for(int i=0; i<4; i++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.I64RHADD(dst, src0, src1, tmp);
+ break;
+ }
+ case OP_UPSAMPLE_SHORT:
+ sel.UPSAMPLE_SHORT(dst, src0, src1);
+ break;
+ case OP_UPSAMPLE_INT:
+ sel.UPSAMPLE_INT(dst, src0, src1);
+ break;
+ case OP_UPSAMPLE_LONG:
+ sel.UPSAMPLE_LONG(dst, src0, src1);
+ break;
+ default: NOT_IMPLEMENTED;
+ }
+ sel.pop();
+ return true;
+ }
+ };
+
+ /*! MAD pattern */
+ class MulAddInstructionPattern : public SelectionPattern
+ {
+ public:
+ /*! Register the pattern for all opcodes of the family */
+ MulAddInstructionPattern(void) : SelectionPattern(2, 1) {
+ this->opcodes.push_back(ir::OP_ADD);
+ this->opcodes.push_back(ir::OP_SUB);
+ }
+
+ /*! Implements base class */
+ virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+ {
+ using namespace ir;
+
+ // XXX TODO: we need a clean support of FP_CONTRACT to remove below line 'return false'
+ // if 'pragma FP_CONTRACT OFF' is used in cl kernel, we should not do mad optimization.
+ if (!sel.ctx.relaxMath || sel.ctx.getSimdWidth() == 16)
+ return false;
+ // MAD tend to increase liveness of the sources (since there are three of
+ // them). TODO refine this strategy. Well, we should be able at least to
+ // evaluate per basic block register pressure and selectively enable
+ // disable MADs
+ if (sel.ctx.limitRegisterPressure)
+ return false;
+
+ // We are good to try. We need a MUL for one of the two sources
+ const ir::BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
+ if (insn.getType() != TYPE_FLOAT)
+ return false;
+ SelectionDAG *child0 = dag.child[0];
+ SelectionDAG *child1 = dag.child[1];
+ const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_FLOAT);
+ if (child0 && child0->insn.getOpcode() == OP_MUL) {
+ GBE_ASSERT(cast<ir::BinaryInstruction>(child0->insn).getType() == TYPE_FLOAT);
+ SelectionDAG *child00 = child0->child[0];
+ SelectionDAG *child01 = child0->child[1];
+ if ((child00 && child00->insn.getOpcode() == OP_LOADI) ||
+ (child01 && child01->insn.getOpcode() == OP_LOADI) ||
+ (child1 && child1->insn.getOpcode() == OP_LOADI))
+ return false;
+ const GenRegister src0 = sel.selReg(child0->insn.getSrc(0), TYPE_FLOAT);
+ const GenRegister src1 = sel.selReg(child0->insn.getSrc(1), TYPE_FLOAT);
+ GenRegister src2 = sel.selReg(insn.getSrc(1), TYPE_FLOAT);
+ if(insn.getOpcode() == ir::OP_SUB) src2 = GenRegister::negate(src2);
+ sel.MAD(dst, src2, src0, src1); // order different on HW!
+ if (child0->child[0]) child0->child[0]->isRoot = 1;
+ if (child0->child[1]) child0->child[1]->isRoot = 1;
+ if (child1) child1->isRoot = 1;
+ return true;
+ }
+ if (child1 && child1->insn.getOpcode() == OP_MUL) {
+ GBE_ASSERT(cast<ir::BinaryInstruction>(child1->insn).getType() == TYPE_FLOAT);
+ SelectionDAG *child10 = child1->child[0];
+ SelectionDAG *child11 = child1->child[1];
+ if ((child10 && child10->insn.getOpcode() == OP_LOADI) ||
+ (child11 && child11->insn.getOpcode() == OP_LOADI) ||
+ (child0 && child0->insn.getOpcode() == OP_LOADI))
+ return false;
+ GenRegister src0 = sel.selReg(child1->insn.getSrc(0), TYPE_FLOAT);
+ const GenRegister src1 = sel.selReg(child1->insn.getSrc(1), TYPE_FLOAT);
+ const GenRegister src2 = sel.selReg(insn.getSrc(0), TYPE_FLOAT);
+ if(insn.getOpcode() == ir::OP_SUB) src0 = GenRegister::negate(src0);
+ sel.MAD(dst, src2, src0, src1); // order different on HW!
+ if (child1->child[0]) child1->child[0]->isRoot = 1;
+ if (child1->child[1]) child1->child[1]->isRoot = 1;
+ if (child0) child0->isRoot = 1;
+ return true;
+ }
+ return false;
+ }
+ };
+
+ /*! sel.{le,l,ge...} like patterns */
+ class SelectModifierInstructionPattern : public SelectionPattern
+ {
+ public:
+ /*! Register the pattern for all opcodes of the family */
+ SelectModifierInstructionPattern(void) : SelectionPattern(2, 1) {
+ this->opcodes.push_back(ir::OP_SEL);
+ }
+
+ /*! Implements base class */
+ virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+ {
+ using namespace ir;
+ SelectionDAG *cmp = dag.child[0];
+ const SelectInstruction &insn = cast<SelectInstruction>(dag.insn);
+
+ if (insn.getType() == TYPE_S64 || insn.getType() == TYPE_U64) // not support
+ return false;
+
+ // Not in this block
+ if (cmp == NULL) return false;
+
+ // We need to match a compare
+ if (cmp->insn.isMemberOf<CompareInstruction>() == false) return false;
+
+ // We look for something like that:
+ // cmp.{le,ge...} flag src0 src1
+ // sel dst flag src0 src1
+ // So both sources must match
+ if (sourceMatch(cmp, 0, &dag, 1) == false) return false;
+ if (sourceMatch(cmp, 1, &dag, 2) == false) return false;
+ // OK, we merge the instructions
+ const ir::CompareInstruction &cmpInsn = cast<CompareInstruction>(cmp->insn);
+ const ir::Opcode opcode = cmpInsn.getOpcode();
+ if(opcode == OP_ORD) return false;
+ GenRegister src0, src1;
+ const ir::Type type = cmpInsn.getType();
+ bool inverse = false;
+ sel.getSrcGenRegImm(*cmp, src0, src1, type, inverse);
+
+ const uint32_t genCmp = getGenCompare(opcode, inverse);
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+
+ // Like for regular selects, we need a temporary since we cannot predicate
+ // properly
+ const uint32_t simdWidth = sel.curr.execWidth;
+ const GenRegister dst = sel.selReg(insn.getDst(0), type);
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.execWidth = simdWidth;
+ sel.SEL_CMP(genCmp, dst, src0, src1);
+ sel.pop();
+ return true;
+ }
+ };
+
+ /*! 32 bits integer multiply needs more instructions */
+ class Int32x32MulInstructionPattern : public SelectionPattern
+ {
+ public:
+ /*! Register the pattern for all opcodes of the family */
+ Int32x32MulInstructionPattern(void) : SelectionPattern(1, 4) {
+ this->opcodes.push_back(ir::OP_MUL);
+ }
+
+ /*! Implements base class */
+ virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+ {
+ using namespace ir;
+ const ir::BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
+ const Type type = insn.getType();
+ if (type == TYPE_U32 || type == TYPE_S32) {
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ const uint32_t simdWidth = sel.curr.execWidth;
+
+ GenRegister dst = sel.selReg(insn.getDst(0), type);
+ GenRegister src0 = sel.selReg(insn.getSrc(0), type);
+ GenRegister src1 = sel.selReg(insn.getSrc(1), type);
+
+ // Either left part of the 16-wide register or just a simd 8 register
+ dst = GenRegister::retype(dst, GEN_TYPE_D);
+ src0 = GenRegister::retype(src0, GEN_TYPE_D);
+ src1 = GenRegister::retype(src1, GEN_TYPE_D);
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), src0, src1);
+ sel.curr.accWrEnable = 1;
+ sel.MACH(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), src0, src1);
+ sel.curr.accWrEnable = 0;
+ if (simdWidth == 1) {
+ sel.curr.execWidth = 1;
+ sel.MOV(GenRegister::retype(dst, GEN_TYPE_F), GenRegister::vec1(GenRegister::acc()));
+ } else {
+ sel.curr.execWidth = 8;
+ sel.MOV(GenRegister::retype(dst, GEN_TYPE_F), GenRegister::acc());
+ }
+
+ // Right part of the 16-wide register now
+ if (simdWidth == 16) {
+ int predicate = sel.curr.predicate;
+ int noMask = sel.curr.noMask;
+ sel.curr.noMask = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ const GenRegister nextSrc0 = sel.selRegQn(insn.getSrc(0), 1, TYPE_S32);
+ const GenRegister nextSrc1 = sel.selRegQn(insn.getSrc(1), 1, TYPE_S32);
+ sel.MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), nextSrc0, nextSrc1);
+ sel.curr.accWrEnable = 1;
+ sel.MACH(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), nextSrc0, nextSrc1);
+ sel.curr.accWrEnable = 0;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ if (predicate != GEN_PREDICATE_NONE || noMask != 1) {
+ const ir::Register reg = sel.reg(FAMILY_DWORD);
+ sel.MOV(GenRegister::f8grf(reg), GenRegister::acc());
+ sel.curr.noMask = noMask;;
+ sel.curr.predicate = predicate;
+ sel.MOV(GenRegister::retype(GenRegister::next(dst), GEN_TYPE_F),
+ GenRegister::f8grf(reg));
+ } else
+ sel.MOV(GenRegister::retype(GenRegister::next(dst), GEN_TYPE_F), GenRegister::acc());
+ }
+
+ sel.pop();
+ // All children are marked as root
+ markAllChildren(dag);
+ return true;
+ } else
+ return false;
+ }
+ };
+
+ /*! 32x16 bits integer can be done in one instruction */
+ class Int32x16MulInstructionPattern : public SelectionPattern
+ {
+ public:
+ /*! Register the pattern for all opcodes of the family */
+ Int32x16MulInstructionPattern(void) : SelectionPattern(1, 1) {
+ this->opcodes.push_back(ir::OP_MUL);
+ }
+
+ bool is16BitSpecialReg(ir::Register reg) const {
+ if (reg == ir::ocl::lid0 ||
+ reg == ir::ocl::lid1 ||
+ reg == ir::ocl::lid2 ||
+ reg == ir::ocl::lsize0 ||
+ reg == ir::ocl::lsize1||
+ reg == ir::ocl::lsize2)
+ return true;
+ else
+ return false;
+ }
+
+ /*! Try to emit a multiply where child childID is a 16 immediate */
+ bool emitMulImmediate(Selection::Opaque &sel, SelectionDAG &dag, uint32_t childID) const {
+ using namespace ir;
+ const ir::BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
+ const Register dst = insn.getDst(0);
+ const Register src1 = insn.getSrc(childID ^ 1);
+ const SelectionDAG *src0DAG = dag.child[childID];
+ if (src0DAG != NULL) {
+ if (src0DAG->insn.getOpcode() == OP_LOADI) {
+ const auto &loadimm = cast<LoadImmInstruction>(src0DAG->insn);
+ const Immediate imm = loadimm.getImmediate();
+ const Type type = imm.getType();
+ GBE_ASSERT(type == TYPE_U32 || type == TYPE_S32);
+ if (type == TYPE_U32 && imm.getIntegerValue() <= 0xffff) {
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+
+ sel.MUL(sel.selReg(dst, type),
+ sel.selReg(src1, type),
+ GenRegister::immuw(imm.getIntegerValue()));
+ sel.pop();
+ if (dag.child[childID ^ 1] != NULL)
+ dag.child[childID ^ 1]->isRoot = 1;
+ return true;
+ }
+ if (type == TYPE_S32 && (imm.getIntegerValue() >= -32768 && imm.getIntegerValue() <= 32767)) {
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+
+ sel.MUL(sel.selReg(dst, type),
+ sel.selReg(src1, type),
+ GenRegister::immw(imm.getIntegerValue()));
+ sel.pop();
+ if (dag.child[childID ^ 1] != NULL)
+ dag.child[childID ^ 1]->isRoot = 1;
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ /*! Try to emit a multiply with a 16 bit special register */
+ bool emitMulSpecialReg(Selection::Opaque &sel, SelectionDAG &dag, uint32_t childID) const {
+ using namespace ir;
+ const BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
+ const Type type = insn.getType();
+ const Register dst = insn.getDst(0);
+ const Register src0 = insn.getSrc(childID);
+ const Register src1 = insn.getSrc(childID ^ 1);
+ if (is16BitSpecialReg(src0)) {
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ sel.MUL(sel.selReg(dst, type),
+ sel.selReg(src1, type),
+ sel.selReg(src0, TYPE_U32));
+ sel.pop();
+ markAllChildren(dag);
+ return true;
+ }
+ return false;
+ }
+
+ virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+ {
+ using namespace ir;
+ const BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
+ const Type type = insn.getType();
+ if (type == TYPE_U32 || type == TYPE_S32) {
+ if (this->emitMulSpecialReg(sel, dag, 0))
+ return true;
+ if (this->emitMulSpecialReg(sel, dag, 1))
+ return true;
+ if (this->emitMulImmediate(sel, dag, 0))
+ return true;
+ if (this->emitMulImmediate(sel, dag, 1))
+ return true;
+ }
+ return false;
+ }
+ };
+
+#define DECL_NOT_IMPLEMENTED_ONE_TO_MANY(FAMILY) \
+ struct FAMILY##Pattern : public OneToManyPattern<FAMILY##Pattern, ir::FAMILY>\
+ {\
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::FAMILY &insn, bool &markChildren) const {\
+ NOT_IMPLEMENTED;\
+ return false;\
+ }\
+ DECL_CTOR(FAMILY, 1, 1); \
+ }
+#undef DECL_NOT_IMPLEMENTED_ONE_TO_MANY
+
+ /*! Load immediate pattern */
+ DECL_PATTERN(LoadImmInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadImmInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type type = insn.getType();
+ const Immediate imm = insn.getImmediate();
+ const GenRegister dst = sel.selReg(insn.getDst(0), type);
+
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+
+ switch (type) {
+ case TYPE_BOOL:
+ if (!sel.isScalarReg(insn.getDst(0)) && sel.regDAG[insn.getDst(0)]->isUsed) {
+ sel.curr.modFlag = 1;
+ sel.curr.physicalFlag = 0;
+ sel.curr.flagIndex = (uint16_t) insn.getDst(0);
+ }
+ sel.MOV(dst, imm.getIntegerValue() ? GenRegister::immuw(0xffff) : GenRegister::immuw(0));
+ break;
+ case TYPE_U32:
+ case TYPE_S32:
+ case TYPE_FLOAT:
+ sel.MOV(GenRegister::retype(dst, GEN_TYPE_F),
+ GenRegister::immf(imm.asFloatValue()));
+ break;
+ case TYPE_U16: sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break;
+ case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break;
+ case TYPE_U8: sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break;
+ case TYPE_S8: sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break;
+ case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.getDoubleValue()), sel.selReg(sel.reg(FAMILY_QWORD))); break;
+ case TYPE_S64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.getIntegerValue())); break;
+ case TYPE_U64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.getIntegerValue())); break;
+ default: NOT_SUPPORTED;
+ }
+ sel.pop();
+ return true;
+ }
+
+ DECL_CTOR(LoadImmInstruction, 1,1);
+ };
+
+ /*! Sync instruction */
+ DECL_PATTERN(SyncInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::SyncInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const ir::Register reg = sel.reg(FAMILY_DWORD);
+ const uint32_t params = insn.getParameters();
+
+ // A barrier is OK to start the thread synchronization *and* SLM fence
+ sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
+ return true;
+ }
+
+ DECL_CTOR(SyncInstruction, 1,1);
+ };
+
+ INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
+ using namespace ir;
+ switch (type) {
+ case TYPE_DOUBLE:
+ case TYPE_S64:
+ case TYPE_U64:
+ return GEN_BYTE_SCATTER_QWORD;
+ case TYPE_FLOAT:
+ case TYPE_U32:
+ case TYPE_S32:
+ return GEN_BYTE_SCATTER_DWORD;
+ case TYPE_BOOL:
+ case TYPE_U16:
+ case TYPE_S16:
+ return GEN_BYTE_SCATTER_WORD;
+ case TYPE_U8:
+ case TYPE_S8:
+ return GEN_BYTE_SCATTER_BYTE;
+ default: NOT_SUPPORTED;
+ return GEN_BYTE_SCATTER_BYTE;
+ }
+ }
+
+ /*! Load instruction pattern */
+ DECL_PATTERN(LoadInstruction)
+ {
+ void readDWord(Selection::Opaque &sel,
+ vector<GenRegister> &dst,
+ vector<GenRegister> &dst2,
+ GenRegister addr,
+ uint32_t valueNum,
+ ir::AddressSpace space,
+ ir::BTI bti) const
+ {
+ for (uint32_t x = 0; x < bti.count; x++) {
+ if(x > 0)
+ for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+ dst2[dstID] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+
+ GenRegister temp = getRelativeAddress(sel, addr, space, bti.bti[x]);
+ sel.UNTYPED_READ(temp, dst2.data(), valueNum, bti.bti[x]);
+ if(x > 0) {
+ sel.push();
+ if(sel.isScalarReg(dst[0].reg())) {
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ }
+ for (uint32_t y = 0; y < valueNum; y++)
+ sel.ADD(dst[y], dst[y], dst2[y]);
+ sel.pop();
+ }
+ }
+ }
+
+ void emitUntypedRead(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ GenRegister addr,
+ ir::BTI bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ vector<GenRegister> dst(valueNum);
+ vector<GenRegister> dst2(valueNum);
+ for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+ dst2[dstID] = dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
+ readDWord(sel, dst, dst2, addr, valueNum, insn.getAddressSpace(), bti);
+ }
+
+ void emitDWordGather(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ GenRegister addr,
+ ir::BTI bti) const
+ {
+ using namespace ir;
+ GBE_ASSERT(bti.count == 1);
+ const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ? 1 : sel.ctx.getSimdWidth();
+ GBE_ASSERT(insn.getValueNum() == 1);
+
+ if(simdWidth == 1) {
+ GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
+ sel.push();
+ sel.curr.noMask = 1;
+ sel.SAMPLE(&dst, 1, &addr, 1, bti.bti[0], 0, true, true);
+ sel.pop();
+ return;
+ }
+
+ GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
+ // get dword based address
+ GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+
+ sel.push();
+ if (sel.isScalarReg(addr.reg())) {
+ sel.curr.noMask = 1;
+ }
+ sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
+ sel.pop();
+
+ sel.DWORD_GATHER(dst, addrDW, bti.bti[0]);
+ }
+
+ void emitRead64(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ GenRegister addr,
+ ir::BTI bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ /* XXX support scalar only right now. */
+ GBE_ASSERT(valueNum == 1);
+ GBE_ASSERT(bti.count == 1);
+ GenRegister dst[valueNum];
+ GenRegister tmpAddr = getRelativeAddress(sel, addr, insn.getAddressSpace(), bti.bti[0]);
+ for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
+ dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
+ sel.READ64(tmpAddr, dst, valueNum, bti.bti[0]);
+ }
+
+ void readByteAsDWord(Selection::Opaque &sel,
+ const uint32_t elemSize,
+ GenRegister address,
+ GenRegister dst,
+ uint32_t simdWidth,
+ uint8_t bti) const
+ {
+ using namespace ir;
+ Register tmpReg = sel.reg(FAMILY_DWORD, simdWidth == 1);
+ GenRegister tmpAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD, simdWidth == 1));
+ GenRegister tmpData = GenRegister::udxgrf(simdWidth, tmpReg);
+ // Get dword aligned addr
+ sel.push();
+ if (simdWidth == 1) {
+ sel.curr.execWidth = 1;
+ sel.curr.noMask = 1;
+ }
+ sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0xfffffffc));
+ sel.pop();
+ sel.push();
+ if (simdWidth == 1)
+ sel.curr.noMask = 1;
+ sel.UNTYPED_READ(tmpAddr, &tmpData, 1, bti);
+
+ if (simdWidth == 1)
+ sel.curr.execWidth = 1;
+ // Get the remaining offset from aligned addr
+ sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0x3));
+ sel.SHL(tmpAddr, tmpAddr, GenRegister::immud(0x3));
+ sel.SHR(tmpData, tmpData, tmpAddr);
+
+ if (elemSize == GEN_BYTE_SCATTER_WORD)
+ sel.MOV(GenRegister::retype(dst, GEN_TYPE_UW), sel.unpacked_uw(tmpReg));
+ else if (elemSize == GEN_BYTE_SCATTER_BYTE)
+ sel.MOV(GenRegister::retype(dst, GEN_TYPE_UB), sel.unpacked_ub(tmpReg));
+ sel.pop();
+ }
+
+ void emitByteGather(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ const uint32_t elemSize,
+ GenRegister address,
+ ir::BTI bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
+ 1 : sel.ctx.getSimdWidth();
+ RegisterFamily family = getFamily(insn.getValueType());
+
+ if(valueNum > 1) {
+ vector<GenRegister> dst(valueNum);
+ const uint32_t typeSize = getFamilySize(family);
+
+ for(uint32_t i = 0; i < valueNum; i++)
+ dst[i] = sel.selReg(insn.getValue(i), getType(family));
+
+ uint32_t tmpRegNum = typeSize*valueNum / 4;
+ vector<GenRegister> tmp(tmpRegNum);
+ vector<GenRegister> tmp2(tmpRegNum);
+ for(uint32_t i = 0; i < tmpRegNum; i++) {
+ tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+ }
+
+ readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
+
+ for(uint32_t i = 0; i < tmpRegNum; i++) {
+ sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
+ }
+ } else {
+ GBE_ASSERT(insn.getValueNum() == 1);
+ const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
+ GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
+ GenRegister tmp = value;
+
+ for (int x = 0; x < bti.count; x++) {
+ if (x > 0)
+ tmp = sel.selReg(sel.reg(family, simdWidth == 1), insn.getValueType());
+
+ GenRegister addr = getRelativeAddress(sel, address, insn.getAddressSpace(), bti.bti[x]);
+ readByteAsDWord(sel, elemSize, addr, tmp, simdWidth, bti.bti[x]);
+ if (x > 0) {
+ sel.push();
+ if (simdWidth == 1) {
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ }
+ sel.ADD(value, value, tmp);
+ sel.pop();
+ }
+ }
+ }
+ }
+
+ void emitIndirectMove(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ GenRegister address) const
+ {
+ using namespace ir;
+ GBE_ASSERT(insn.getValueNum() == 1); //todo: handle vec later
+
+ const GenRegister dst = sel.selReg(insn.getValue(0), insn.getValueType());
+ const GenRegister src = address;
+ sel.INDIRECT_MOVE(dst, src);
+ }
+
+ INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, ir::AddressSpace space, uint8_t bti) const {
+ if(space == ir::MEM_LOCAL || space == ir::MEM_CONSTANT)
+ return address;
+
+ sel.push();
+ sel.curr.noMask = 1;
+ GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+ sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
+ sel.pop();
+ return temp;
+ }
+
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn, bool &markChildren) const {
+ using namespace ir;
+ GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
+ const AddressSpace space = insn.getAddressSpace();
+ GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
+ insn.getAddressSpace() == MEM_CONSTANT ||
+ insn.getAddressSpace() == MEM_PRIVATE ||
+ insn.getAddressSpace() == MEM_LOCAL);
+ //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
+ const Type type = insn.getValueType();
+ const uint32_t elemSize = getByteScatterGatherSize(type);
+ if(space == MEM_LOCAL && sel.needPatchSLMAddr()) {
+ GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+ sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+ address = temp;
+ }
+ BTI bti;
+ if (space == MEM_CONSTANT || space == MEM_LOCAL) {
+ bti.bti[0] = space == MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
+ bti.count = 1;
+ } else {
+ bti = insn.getBTI();
+ }
+ if (space == MEM_CONSTANT) {
+ // XXX TODO read 64bit constant through constant cache
+ // Per HW Spec, constant cache messages can read at least DWORD data.
+ // So, byte/short data type, we have to read through data cache.
+ if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitRead64(sel, insn, address, bti);
+ else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ this->emitDWordGather(sel, insn, address, bti);
+ else {
+ this->emitByteGather(sel, insn, elemSize, address, bti);
+ }
+ } else {
+ if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitRead64(sel, insn, address, bti);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ this->emitUntypedRead(sel, insn, address, bti);
+ else {
+ this->emitByteGather(sel, insn, elemSize, address, bti);
+ }
+ }
+ return true;
+ }
+ DECL_CTOR(LoadInstruction, 1, 1);
+ };
+
+ /*! Store instruction pattern */
+ DECL_PATTERN(StoreInstruction)
+ {
+ void emitUntypedWrite(Selection::Opaque &sel,
+ const ir::StoreInstruction &insn,
+ GenRegister addr,
+ uint32_t bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ vector<GenRegister> value(valueNum);
+
+ addr = GenRegister::retype(addr, GEN_TYPE_F);
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+ value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
+ sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
+ }
+
+ void emitWrite64(Selection::Opaque &sel,
+ const ir::StoreInstruction &insn,
+ GenRegister addr,
+ uint32_t bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ /* XXX support scalar only right now. */
+ GBE_ASSERT(valueNum == 1);
+ addr = GenRegister::retype(addr, GEN_TYPE_UD);
+ GenRegister src[valueNum];
+
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+ src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
+ sel.WRITE64(addr, src, valueNum, bti);
+ }
+
+ void emitByteScatter(Selection::Opaque &sel,
+ const ir::StoreInstruction &insn,
+ const uint32_t elemSize,
+ GenRegister addr,
+ uint32_t bti) const
+ {
+ using namespace ir;
+ const uint32_t simdWidth = sel.ctx.getSimdWidth();
+ uint32_t valueNum = insn.getValueNum();
+
+ if(valueNum > 1) {
+ const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
+ vector<GenRegister> value(valueNum);
+
+ if(elemSize == GEN_BYTE_SCATTER_WORD) {
+ for(uint32_t i = 0; i < valueNum; i++)
+ value[i] = sel.selReg(insn.getValue(i), ir::TYPE_U16);
+ } else if(elemSize == GEN_BYTE_SCATTER_BYTE) {
+ for(uint32_t i = 0; i < valueNum; i++)
+ value[i] = sel.selReg(insn.getValue(i), ir::TYPE_U8);
+ }
+
+ uint32_t tmpRegNum = typeSize*valueNum / 4;
+ vector<GenRegister> tmp(tmpRegNum);
+ for(uint32_t i = 0; i < tmpRegNum; i++) {
+ tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+ sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, 4/typeSize);
+ }
+
+ sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
+ } else {
+ const GenRegister value = sel.selReg(insn.getValue(0));
+ GBE_ASSERT(insn.getValueNum() == 1);
+ const GenRegister tmp = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+ if (elemSize == GEN_BYTE_SCATTER_WORD) {
+ sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW));
+ } else if (elemSize == GEN_BYTE_SCATTER_BYTE) {
+ sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
+ }
+ sel.BYTE_SCATTER(addr, tmp, elemSize, bti);
+ }
+ }
+
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::StoreInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const AddressSpace space = insn.getAddressSpace();
+ const Type type = insn.getValueType();
+ const uint32_t elemSize = getByteScatterGatherSize(type);
+ GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
+ if(space == MEM_LOCAL && sel.needPatchSLMAddr()) {
+ GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+ sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+ address = temp;
+ }
+ if(space == MEM_LOCAL) {
+ if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitWrite64(sel, insn, address, 0xfe);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ this->emitUntypedWrite(sel, insn, address, 0xfe);
+ else
+ this->emitByteScatter(sel, insn, elemSize, address, 0xfe);
+ } else {
+ BTI bti = insn.getBTI();
+ for (int x = 0; x < bti.count; x++) {
+ GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+ sel.push();
+ sel.curr.noMask = 1;
+ sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti.bti[x]), ir::TYPE_U32)));
+ sel.pop();
+ if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitWrite64(sel, insn, temp, bti.bti[x]);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ this->emitUntypedWrite(sel, insn, temp, bti.bti[x]);
+ else {
+ this->emitByteScatter(sel, insn, elemSize, temp, bti.bti[x]);
+ }
+ }
+ }
+ return true;
+ }
+ DECL_CTOR(StoreInstruction, 1, 1);
+ };
+
+ /*! Compare instruction pattern */
+ class CompareInstructionPattern : public SelectionPattern
+ {
+ public:
+ CompareInstructionPattern(void) : SelectionPattern(1,1) {
+ for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+ if (ir::isOpcodeFrom<ir::CompareInstruction>(ir::Opcode(op)) == true)
+ this->opcodes.push_back(ir::Opcode(op));
+ }
+
+ INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+ {
+ using namespace ir;
+ const ir::CompareInstruction &insn = cast<CompareInstruction>(dag.insn);
+ const Opcode opcode = insn.getOpcode();
+ const Type type = insn.getType();
+ const Register dst = insn.getDst(0);
+ GenRegister tmpDst;
+ const BasicBlock *curr = insn.getParent();
+ const ir::Liveness &liveness = sel.ctx.getLiveness();
+ const ir::Liveness::LiveOut &liveOut = liveness.getLiveOut(curr);
+ bool needStoreBool = false;
+ if (liveOut.contains(dst) || dag.computeBool)
+ needStoreBool = true;
+
+ if(type == TYPE_S64 || type == TYPE_U64 ||
+ type == TYPE_DOUBLE || type == TYPE_FLOAT ||
+ type == TYPE_U32 || type == TYPE_S32 /*||
+ (!needStoreBool)*/)
+ tmpDst = GenRegister::retype(GenRegister::null(), GEN_TYPE_F);
+ else
+ tmpDst = sel.selReg(dst, TYPE_BOOL);
+
+ // Look for immediate values for the right source
+ GenRegister src0, src1;
+ bool inverseCmp = false;
+ sel.getSrcGenRegImm(dag, src0, src1, type, inverseCmp);
+ sel.push();
+ if (sel.isScalarReg(dst))
+ sel.curr.noMask = 1;
+ sel.curr.physicalFlag = 0;
+ sel.curr.modFlag = 1;
+ sel.curr.flagIndex = (uint16_t)dst;
+ sel.curr.grfFlag = needStoreBool; // indicate whether we need to allocate grf to store this boolean.
+ if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[3];
+ for(int i=0; i<3; i++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.curr.flagGen = 1;
+ sel.I64CMP(getGenCompare(opcode, inverseCmp), src0, src1, tmp);
+ } else if(opcode == OP_ORD) {
+ sel.push();
+ sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.curr.flagGen = 1;
+ sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
+ sel.pop();
+ } else {
+ if((type == TYPE_S64 || type == TYPE_U64 ||
+ type == TYPE_DOUBLE || type == TYPE_FLOAT ||
+ type == TYPE_U32 || type == TYPE_S32))
+ sel.curr.flagGen = 1;
+ else if (sel.isScalarReg(dst)) {
+ // If the dest reg is a scalar bool, we can't set it as
+ // dst register, as the execution width is still 8 or 16.
+ // Instead, we set the needStoreBool to flagGen, and change
+ // the dst to null register. And let the flag reg allocation
+ // function to generate the flag grf on demand correctly latter.
+ sel.curr.flagGen = needStoreBool;
+ tmpDst = GenRegister::retype(GenRegister::null(), GEN_TYPE_UW);
+ }
+ sel.CMP(getGenCompare(opcode, inverseCmp), src0, src1, tmpDst);
+ }
+ sel.pop();
+ return true;
+ }
+ };
+
+ /*! Bit cast instruction pattern */
+ DECL_PATTERN(BitCastInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::BitCastInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const uint32_t dstNum = insn.getDstNum();
+ const uint32_t srcNum = insn.getSrcNum();
+ int index = 0, multiple, narrowNum;
+ bool narrowDst;
+ Type narrowType;
+
+ if(dstNum > srcNum) {
+ multiple = dstNum / srcNum;
+ narrowType = dstType;
+ narrowNum = dstNum;
+ narrowDst = 1;
+ } else {
+ multiple = srcNum / dstNum;
+ narrowType = srcType;
+ narrowNum = srcNum;
+ narrowDst = 0;
+ }
+
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+
+ // As we store long/ulong low/high part separately,
+ // we need to deal with it separately, we need to change it back again
+ // when hardware support native long type.
+ const bool isInt64 = (srcType == TYPE_S64 || srcType == TYPE_U64 || dstType == TYPE_S64 || dstType == TYPE_U64);
+ const int simdWidth = sel.curr.execWidth;
+
+ for(int i = 0; i < narrowNum; i++, index++) {
+ GenRegister narrowReg, wideReg;
+ if(narrowDst) {
+ narrowReg = sel.selReg(insn.getDst(i), narrowType);
+ wideReg = sel.selReg(insn.getSrc(index/multiple), narrowType); //retype to narrow type
+ } else {
+ wideReg = sel.selReg(insn.getDst(index/multiple), narrowType);
+ narrowReg = sel.selReg(insn.getSrc(i), narrowType); //retype to narrow type
+ }
+
+ // set correct horizontal stride
+ if(wideReg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+ if(multiple == 2) {
+ wideReg = sel.unpacked_uw(wideReg.reg());
+ wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+ if(isInt64) {
+ wideReg.hstride = GEN_HORIZONTAL_STRIDE_1;
+ wideReg.vstride = GEN_VERTICAL_STRIDE_8;
+ }
+ } else if(multiple == 4) {
+ wideReg = sel.unpacked_ub(wideReg.reg());
+ wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+ if(isInt64) {
+ wideReg.hstride = GEN_HORIZONTAL_STRIDE_2;
+ wideReg.vstride = GEN_VERTICAL_STRIDE_16;
+ }
+ } else if(multiple == 8) {
+ // we currently store high/low 32bit separately in register,
+ // so, its hstride is 4 here.
+ wideReg = sel.unpacked_ub(wideReg.reg());
+ wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+ } else {
+ GBE_ASSERT(0);
+ }
+ }
+
+ if(!isInt64 && index % multiple) {
+ wideReg = GenRegister::offset(wideReg, 0, (index % multiple) * typeSize(wideReg.type));
+ wideReg.subphysical = 1;
+ }
+ if(isInt64) {
+ wideReg.subphysical = 1;
+ // Offset to next half
+ if((i % multiple) >= multiple/2)
+ wideReg = GenRegister::offset(wideReg, 0, sel.isScalarReg(wideReg.reg()) ? 4 : simdWidth*4);
+ // Offset to desired narrow element in wideReg
+ if(index % (multiple/2))
+ wideReg = GenRegister::offset(wideReg, 0, (index % (multiple/2)) * typeSize(wideReg.type));
+ }
+
+ GenRegister xdst = narrowDst ? narrowReg : wideReg;
+ GenRegister xsrc = narrowDst ? wideReg : narrowReg;
+
+ if(isInt64) {
+ sel.MOV(xdst, xsrc);
+ } else if(srcType == TYPE_DOUBLE || dstType == TYPE_DOUBLE) {
+ sel.push();
+ sel.curr.execWidth = 8;
+ xdst.subphysical = 1;
+ xsrc.subphysical = 1;
+ for(int i = 0; i < simdWidth/4; i ++) {
+ sel.curr.chooseNib(i);
+ sel.MOV(xdst, xsrc);
+ xdst = GenRegister::offset(xdst, 0, 4 * typeSize(getGenType(dstType)));
+ xsrc = GenRegister::offset(xsrc, 0, 4 * typeSize(getGenType(srcType)));
+ }
+ sel.pop();
+ } else
+ sel.MOV(xdst, xsrc);
+ }
+ sel.pop();
+
+ return true;
+ }
+ DECL_CTOR(BitCastInstruction, 1, 1);
+ };
+
+ /*! Convert instruction pattern */
+ DECL_PATTERN(ConvertInstruction)
+ {
+
+ INLINE bool lowerI64Reg(Selection::Opaque &sel, SelectionDAG *dag, GenRegister &src, uint32_t type) const {
+ using namespace ir;
+ GBE_ASSERT(type == GEN_TYPE_UD || type == GEN_TYPE_F);
+ if (dag->insn.getOpcode() == OP_LOADI) {
+ const auto &immInsn = cast<LoadImmInstruction>(dag->insn);
+ const auto imm = immInsn.getImmediate();
+ const Type immType = immInsn.getType();
+ if (immType == TYPE_S64 &&
+ imm.getIntegerValue() <= INT_MAX &&
+ imm.getIntegerValue() >= INT_MIN) {
+ src = GenRegister::immd((int32_t)imm.getIntegerValue());
+ return true;
+ } else if (immType == TYPE_U64 &&
+ imm.getIntegerValue() <= UINT_MAX) {
+ src = GenRegister::immud((uint32_t)imm.getIntegerValue());
+ return true;
+ }
+ } else if (dag->insn.getOpcode() == OP_CVT) {
+ const auto cvtInsn = cast<ConvertInstruction>(dag->insn);
+ auto srcType = cvtInsn.getSrcType();
+ if (((srcType == TYPE_U32 || srcType == TYPE_S32) &&
+ (type == GEN_TYPE_UD || type == GEN_TYPE_D)) ||
+ ((srcType == TYPE_FLOAT) && type == GEN_TYPE_F)) {
+ src = GenRegister::retype(sel.selReg(cvtInsn.getSrc(0), srcType), type);
+ dag->isRoot = 1;
+ return true;
+ } else if (srcType == TYPE_FLOAT ||
+ srcType == TYPE_U16 ||
+ srcType == TYPE_S16 ||
+ srcType == TYPE_U32 ||
+ srcType == TYPE_S32) {
+ src = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32), type);
+ dag->isRoot = 1;
+ sel.MOV(src, sel.selReg(cvtInsn.getSrc(0), srcType));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const Type dstType = insn.getDstType();
+ const Type srcType = insn.getSrcType();
+ const RegisterFamily dstFamily = getFamily(dstType);
+ const RegisterFamily srcFamily = getFamily(srcType);
+ const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+ const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+ const Opcode opcode = insn.getOpcode();
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ if(opcode == ir::OP_SAT_CVT)
+ sel.curr.saturate = 1;
+
+ // We need two instructions to make the conversion
+ if (opcode == OP_F16TO32) {
+ sel.F16TO32(dst, src);
+ } else if (opcode == OP_F32TO16) {
+ GenRegister unpacked;
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ sel.F32TO16(unpacked, src);
+ sel.pop();
+ sel.MOV(dst, unpacked);
+ } else if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) {
+ GenRegister unpacked;
+ if (dstFamily == FAMILY_WORD) {
+ const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
+ if (!sel.isScalarReg(dst.reg())) {
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, type);
+ } else
+ unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type);
+ } else {
+ const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
+ if (!sel.isScalarReg(dst.reg())) {
+ unpacked = sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, type);
+ } else
+ unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
+ }
+ if(srcFamily == FAMILY_QWORD) {
+ GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp.type = GEN_TYPE_D;
+ sel.CONVI64_TO_I(tmp, src);
+ sel.MOV(unpacked, tmp);
+ } else {
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ sel.MOV(unpacked, src);
+ sel.pop();
+ }
+ if (unpacked.reg() != dst.reg())
+ sel.MOV(dst, unpacked);
+ } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) &&
+ (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64))
+ sel.CONVI64_TO_I(dst, src);
+ else if (dstType == ir::TYPE_FLOAT && (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) {
+ auto dag = sel.regDAG[src.reg()];
+ // FIXME, in the future, we need to do a common I64 lower to I32 analysis
+ // at llvm IR layer which could cover more cases then just this one.
+ SelectionDAG *dag0, *dag1;
+ if (dag && dag->child[0] && dag->child[1]) {
+ if (dag->child[0]->insn.getOpcode() == OP_LOADI) {
+ dag0 = dag->child[1];
+ dag1 = dag->child[0];
+ } else {
+ dag0 = dag->child[0];
+ dag1 = dag->child[1];
+ }
+ GBE_ASSERT(!(dag->child[0]->insn.getOpcode() == OP_LOADI &&
+ dag->child[1]->insn.getOpcode() == OP_LOADI));
+ if (dag->insn.getOpcode() == OP_AND ||
+ dag->insn.getOpcode() == OP_OR ||
+ dag->insn.getOpcode() == OP_XOR) {
+ GenRegister src0;
+ GenRegister src1;
+ if (lowerI64Reg(sel, dag0, src0, GEN_TYPE_UD) &&
+ lowerI64Reg(sel, dag1, src1, GEN_TYPE_UD)) {
+ switch (dag->insn.getOpcode()) {
+ default:
+ case OP_AND: sel.AND(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
+ case OP_OR: sel.OR(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
+ case OP_XOR: sel.XOR(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
+ }
+ sel.MOV(dst, GenRegister::retype(dst, GEN_TYPE_UD));
+ markChildren = false;
+ return true;
+ }
+ }
+ }
+ GenRegister tmp[6];
+ for(int i=0; i<6; i++) {
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ }
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.CONVI64_TO_F(dst, src, tmp);
+ sel.pop();
+ } else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
+ (src.isdf() && dstType == ir::TYPE_FLOAT)) {
+ ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+ sel.MOV_DF(dst, src, sel.selReg(r));
+ } else if (dst.isint64()) {
+ switch(src.type) {
+ case GEN_TYPE_F:
+ {
+ GenRegister tmp[2];
+ tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_FLOAT);
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.CONVF_TO_I64(dst, src, tmp);
+ sel.pop();
+ break;
+ }
+ case GEN_TYPE_DF:
+ NOT_IMPLEMENTED;
+ default:
+ sel.CONVI_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD)));
+ }
+ } else
+ sel.MOV(dst, src);
+
+ sel.pop();
+
+ return true;
+ }
+ DECL_CTOR(ConvertInstruction, 1, 1);
+ };
+
+ /*! Convert instruction pattern */
+ DECL_PATTERN(AtomicInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::AtomicInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const AtomicOps atomicOp = insn.getAtomicOpcode();
+ const AddressSpace space = insn.getAddressSpace();
+ const uint32_t srcNum = insn.getSrcNum();
+
+ GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32); //address
+ GenRegister src1 = src0, src2 = src0;
+ if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
+ if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
+ GenRegister dst = sel.selReg(insn.getDst(0), TYPE_U32);
+ GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
+ if(space == MEM_LOCAL) {
+ if (sel.needPatchSLMAddr()) {
+ GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ sel.ADD(temp, src0, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+ src0 = temp;
+ }
+ sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, 0xfe);
+ } else {
+ ir::BTI b = insn.getBTI();
+ for (int x = 0; x < b.count; x++) {
+ sel.push();
+ sel.curr.noMask = 1;
+ GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+ sel.ADD(temp, src0, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(b.bti[x]), ir::TYPE_U32)));
+ sel.pop();
+ sel.ATOMIC(dst, genAtomicOp, srcNum, temp, src1, src2, b.bti[x]);
+ }
+ }
+ return true;
+ }
+ DECL_CTOR(AtomicInstruction, 1, 1);
+ };
+
+ /*! Select instruction pattern */
+ class SelectInstructionPattern : public SelectionPattern
+ {
+ public:
+ SelectInstructionPattern(void) : SelectionPattern(1,1) {
+ for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+ if (ir::isOpcodeFrom<ir::SelectInstruction>(ir::Opcode(op)) == true)
+ this->opcodes.push_back(ir::Opcode(op));
+ }
+
+ INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+ {
+ using namespace ir;
+ const ir::SelectInstruction &insn = cast<SelectInstruction>(dag.insn);
+
+ // Get all registers for the instruction
+ const Type type = insn.getType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), type);
+
+ // Look for immediate values for the right source
+ GenRegister src0, src1;
+ SelectionDAG *dag0 = dag.child[0]; // source 0 is the predicate!
+ SelectionDAG *dag1 = dag.child[1];
+ SelectionDAG *dag2 = dag.child[2];
+
+ if (dag0) dag0->isRoot = 1;
+ bool inverse = false;
+ sel.getSrcGenRegImm(dag, dag1, dag2, src0, src1, type, inverse);
+ const Register pred = insn.getPredicate();
+ sel.push();
+ if (sel.isScalarReg(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+ sel.curr.inversePredicate ^= inverse;
+ sel.curr.physicalFlag = 0;
+ sel.curr.flagIndex = (uint16_t) pred;
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ if (!dag0)
+ sel.curr.externFlag = 1;
+ if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
+ sel.SEL_INT64(dst, src0, src1);
+ else
+ sel.SEL(dst, src0, src1);
+ sel.pop();
+
+ return true;
+ }
+ };
+
+ DECL_PATTERN(TernaryInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::TernaryInstruction &insn, bool &markChildren) const {
+ using namespace ir;
+ const Type type = insn.getType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), type),
+ src0 = sel.selReg(insn.getSrc(0), type),
+ src1 = sel.selReg(insn.getSrc(1), type),
+ src2 = sel.selReg(insn.getSrc(2), type);
+ switch(insn.getOpcode()) {
+ case OP_I64MADSAT:
+ {
+ GenRegister tmp[9];
+ for(int i=0; i<9; i++) {
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp[i].type = GEN_TYPE_UD;
+ }
+ sel.push();
+ sel.curr.flag = 0;
+ sel.curr.subFlag = 1;
+ sel.I64MADSAT(dst, src0, src1, src2, tmp);
+ sel.pop();
+ break;
+ }
+ case OP_MAD:
+ {
+ sel.MAD(dst, src2, src0, src1);
+ break;
+ }
+ default:
+ NOT_IMPLEMENTED;
+ }
+ return true;
+ }
+
+ DECL_CTOR(TernaryInstruction, 1, 1);
+ };
+
+
+ /*! Label instruction pattern */
+ DECL_PATTERN(LabelInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::LabelInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const LabelIndex label = insn.getLabelIndex();
+ const GenRegister src0 = sel.selReg(ocl::blockip);
+ const GenRegister src1 = GenRegister::immuw(label);
+ const uint32_t simdWidth = sel.ctx.getSimdWidth();
+ GBE_ASSERTM(label < GEN_MAX_LABEL, "We reached the maximum label number which is reserved for barrier handling");
+ sel.LABEL(label);
+
+ // Do not emit any code for the "returning" block. There is no need for it
+ if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
+ return true;
+
+ LabelIndex jip;
+ const LabelIndex nextLabel = insn.getParent()->getNextBlock()->getLabelIndex();
+ if (sel.ctx.hasJIP(&insn))
+ jip = sel.ctx.getLabelIndex(&insn);
+ else
+ jip = nextLabel;
+
+ // Emit the mask computation at the head of each basic block
+ sel.push();
+ sel.curr.noMask = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, GEN_TYPE_UW), src1,
+ GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+ sel.pop();
+
+ if (sel.block->hasBarrier) {
+ // If this block has barrier, we don't execute the block until all lanes
+ // are 1s. Set each reached lane to 1, then check all lanes. If there is any
+ // lane not reached, we jump to jip. And no need to issue if/endif for
+ // this block, as it will always excute with all lanes activated.
+ sel.push();
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.CMP(GEN_CONDITIONAL_EQ, GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL),
+ GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+ if (simdWidth == 8)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+ else if (simdWidth == 16)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+ else
+ NOT_IMPLEMENTED;
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ sel.curr.inversePredicate = 1;
+ sel.JMPI(GenRegister::immd(0), jip, label);
+ sel.pop();
+ // FIXME, if the last BRA is unconditional jump, we don't need to update the label here.
+ sel.push();
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw((uint16_t)label));
+ sel.pop();
+ }
+ else {
+ if (sel.ctx.hasJIP(&insn) &&
+ // If jump to next label and the endif offset is -1, then
+ // We don't need to add a jmpi here, as the following IF will do the same
+ // thing if all channels are disabled.
+ (jip != nextLabel || sel.block->endifOffset != -1)) {
+ // If it is required, insert a JUMP to bypass the block
+ sel.push();
+ if (simdWidth == 8)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ else if (simdWidth == 16)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ else
+ NOT_IMPLEMENTED;
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ sel.curr.inversePredicate = 1;
+ sel.JMPI(GenRegister::immd(0), jip, label);
+ sel.pop();
+ }
+ sel.push();
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.IF(GenRegister::immd(0), sel.block->endifLabel, sel.block->endifLabel);
+ sel.pop();
+ }
+
+ return true;
+ }
+ DECL_CTOR(LabelInstruction, 1, 1);
+ };
+
+ DECL_PATTERN(SampleInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::SampleInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ GenRegister msgPayloads[4];
+ GenRegister dst[insn.getDstNum()];
+ uint32_t srcNum = insn.getSrcNum();
+ uint32_t valueID = 0;
+ uint32_t msgLen = 0;
+
+ for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
+ dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
+
+ GBE_ASSERT(srcNum == 3);
+ if (insn.getSrc(1) == ir::ocl::invalid) //not 3D
+ srcNum = 1;
+ else if (insn.getSrc(2) == ir::ocl::invalid)
+ srcNum = 2;
+
+ if (insn.getSamplerOffset() != 0) {
+ // U, lod, [V], [W]
+ GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT);
+ msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType());
+ msgPayloads[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ if (srcNum > 1)
+ msgPayloads[2] = sel.selReg(insn.getSrc(1), insn.getSrcType());
+ if (srcNum > 2)
+ msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType());
+ // Clear the lod to zero.
+ sel.MOV(msgPayloads[1], GenRegister::immud(0));
+ msgLen = srcNum + 1;
+ } else {
+ // U, V, [W]
+ GBE_ASSERT(insn.getSrcType() == TYPE_FLOAT);
+ for (valueID = 0; valueID < srcNum; ++valueID)
+ msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+ msgLen = srcNum;
+ }
+ // We switch to a fixup bti for linear filter on a image1d array sampling.
+ uint32_t bti = insn.getImageIndex() + (insn.getSamplerOffset() == 2 ? BTI_MAX_IMAGE_NUM : 0);
+ if (bti > 253) {
+ std::cerr << "Too large bti " << bti;
+ return false;
+ }
+ uint32_t sampler = insn.getSamplerIndex();
+
+ sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, msgLen, bti, sampler, insn.getSamplerOffset() != 0, false);
+ return true;
+ }
+ DECL_CTOR(SampleInstruction, 1, 1);
+ };
+
+ /*! Typed write instruction pattern. */
+ DECL_PATTERN(TypedWriteInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ const uint32_t simdWidth = sel.ctx.getSimdWidth();
+ GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
+ const uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
+ const uint32_t coordNum = 3;
+
+ if (simdWidth == 16) {
+ for(uint32_t i = 0; i < msgNum; i++)
+ msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ } else {
+ uint32_t valueID = 0;
+ msgs[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ for(uint32_t msgID = 1; msgID < 1 + coordNum; msgID++, valueID++)
+ msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType());
+
+ // fake u.
+ if (insn.getSrc(1) == ir::ocl::invalid)
+ msgs[2] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ // fake w.
+ if (insn.getSrc(2) == ir::ocl::invalid)
+ msgs[3] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ // LOD.
+ msgs[4] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ for(uint32_t msgID = 5; valueID < insn.getSrcNum(); msgID++, valueID++)
+ msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+ }
+
+ sel.push();
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(msgs[0], GenRegister::immud(0));
+ sel.curr.execWidth = 1;
+
+ GenRegister channelEn = GenRegister::offset(msgs[0], 0, 7*4);
+ channelEn.subphysical = 1;
+ // Enable all channels.
+ sel.MOV(channelEn, GenRegister::immud(0xffff));
+ sel.curr.execWidth = 8;
+ // Set zero LOD.
+ if (simdWidth == 8)
+ sel.MOV(msgs[4], GenRegister::immud(0));
+ else
+ sel.MOV(GenRegister::Qn(msgs[2], 0), GenRegister::immud(0));
+ sel.pop();
+
+ uint32_t bti = insn.getImageIndex();
+ if (simdWidth == 8)
+ sel.TYPED_WRITE(msgs, msgNum, bti, insn.getSrc(2) != ir::ocl::invalid);
+ else {
+ sel.push();
+ sel.curr.execWidth = 8;
+ for( uint32_t quarter = 0; quarter < 2; quarter++)
+ {
+ #define QUARTER_MOV0(msgs, msgid, src) \
+ sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], GEN_TYPE_UD), msgid % 2), \
+ GenRegister::Qn(src, quarter))
+
+ #define QUARTER_MOV1(msgs, msgid, src) \
+ sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], src.type), msgid % 2), \
+ GenRegister::Qn(src, quarter))
+ sel.curr.quarterControl = (quarter == 0) ? GEN_COMPRESSION_Q1 : GEN_COMPRESSION_Q2;
+ // Set U,V,W
+ QUARTER_MOV0(msgs, 1, sel.selReg(insn.getSrc(0), insn.getCoordType()));
+ if (insn.getSrc(1) != ir::ocl::invalid) //not 2D
+ QUARTER_MOV0(msgs, 2, sel.selReg(insn.getSrc(1), insn.getCoordType()));
+ if (insn.getSrc(2) != ir::ocl::invalid) //not 3D
+ QUARTER_MOV0(msgs, 3, sel.selReg(insn.getSrc(2), insn.getCoordType()));
+ // Set R, G, B, A
+ QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(3), insn.getSrcType()));
+ QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(4), insn.getSrcType()));
+ QUARTER_MOV1(msgs, 7, sel.selReg(insn.getSrc(5), insn.getSrcType()));
+ QUARTER_MOV1(msgs, 8, sel.selReg(insn.getSrc(6), insn.getSrcType()));
+ sel.TYPED_WRITE(msgs, msgNum, bti, insn.getSrc(2) != ir::ocl::invalid);
+ #undef QUARTER_MOV0
+ #undef QUARTER_MOV1
+ }
+ sel.pop();
+ }
+ return true;
+ }
+ DECL_CTOR(TypedWriteInstruction, 1, 1);
+ };
+
+ /*! get image info instruction pattern. */
+ DECL_PATTERN(GetImageInfoInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::GetImageInfoInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ GenRegister dst;
+ dst = sel.selReg(insn.getDst(0), TYPE_U32);
+ GenRegister imageInfoReg = GenRegister::ud1grf(insn.getSrc(0));
+ sel.MOV(dst, imageInfoReg);
+
+ return true;
+ }
+ DECL_CTOR(GetImageInfoInstruction, 1, 1);
+ };
+
+ /*! Branch instruction pattern */
+ class BranchInstructionPattern : public SelectionPattern
+ {
+ public:
+ BranchInstructionPattern(void) : SelectionPattern(1,1) {
+ for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+ if (ir::isOpcodeFrom<ir::BranchInstruction>(ir::Opcode(op)) == true)
+ this->opcodes.push_back(ir::Opcode(op));
+ }
+
+ void emitForwardBranch(Selection::Opaque &sel,
+ const ir::BranchInstruction &insn,
+ ir::LabelIndex dst,
+ ir::LabelIndex src) const
+ {
+ using namespace ir;
+ const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
+
+ // We will not emit any jump if we must go the next block anyway
+ const BasicBlock *curr = insn.getParent();
+ const BasicBlock *next = curr->getNextBlock();
+ const LabelIndex nextLabel = next->getLabelIndex();
+ if (insn.isPredicated() == true) {
+ const Register pred = insn.getPredicateIndex();
+ sel.push();
+ // we don't need to set next label to the pcip
+ // as if there is no backward jump latter, then obviously everything will work fine.
+ // If there is backward jump latter, then all the pcip will be updated correctly there.
+ sel.curr.physicalFlag = 0;
+ sel.curr.flagIndex = (uint16_t) pred;
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), nextLabel);
+ sel.block->endifOffset = -1;
+ sel.pop();
+ } else {
+ // Update the PcIPs
+ const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
+ sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), nextLabel);
+ sel.block->endifOffset = -1;
+ if (nextLabel == jip) return;
+ // Branch to the jump target
+ sel.push();
+ sel.curr.execWidth = 1;
+ sel.curr.noMask = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, curr->getLabelIndex());
+ sel.pop();
+ }
+ }
+
+ void emitBackwardBranch(Selection::Opaque &sel,
+ const ir::BranchInstruction &insn,
+ ir::LabelIndex dst,
+ ir::LabelIndex src) const
+ {
+ using namespace ir;
+ const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
+ const Function &fn = sel.ctx.getFunction();
+ const BasicBlock &bb = fn.getBlock(src);
+ const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
+ const LabelIndex label = bb.getLabelIndex();
+ const uint32_t simdWidth = sel.ctx.getSimdWidth();
+ GBE_ASSERT(bb.getNextBlock() != NULL);
+
+ if (insn.isPredicated() == true) {
+ const Register pred = insn.getPredicateIndex();
+
+ // Update the PcIPs for all the branches. Just put the IPs of the next
+ // block. Next instruction will properly update the IPs of the lanes
+ // that actually take the branch
+ const LabelIndex next = bb.getNextBlock()->getLabelIndex();
+ sel.MOV(ip, GenRegister::immuw(uint16_t(next)));
+ GBE_ASSERT(jip == dst);
+ sel.push();
+ sel.curr.physicalFlag = 0;
+ sel.curr.flagIndex = (uint16_t) pred;
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+ sel.block->endifOffset = -1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), next);
+ sel.curr.execWidth = 1;
+ if (simdWidth == 16)
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ else
+ sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+ sel.curr.noMask = 1;
+ sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, label);
+ sel.pop();
+ } else {
+ const LabelIndex next = bb.getNextBlock()->getLabelIndex();
+ // Update the PcIPs
+ sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+ sel.block->endifOffset = -1;
+ if (!sel.block->hasBarrier)
+ sel.ENDIF(GenRegister::immd(0), next);
+ // Branch to the jump target
+ sel.push();
+ sel.curr.execWidth = 1;
+ sel.curr.noMask = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, label);
+ sel.pop();
+ }
+ }
+
+ INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+ using namespace ir;
+ const ir::BranchInstruction &insn = cast<BranchInstruction>(dag.insn);
+ const Opcode opcode = insn.getOpcode();
+ if (opcode == OP_RET)
+ sel.EOT();
+ else if (opcode == OP_BRA) {
+ const LabelIndex dst = insn.getLabelIndex();
+ const LabelIndex src = insn.getParent()->getLabelIndex();
+
+ sel.push();
+ if (insn.isPredicated() == true) {
+ if (dag.child[0] == NULL)
+ sel.curr.externFlag = 1;
+ }
+
+ // We handle foward and backward branches differently
+ if (uint32_t(dst) <= uint32_t(src))
+ this->emitBackwardBranch(sel, insn, dst, src);
+ else
+ this->emitForwardBranch(sel, insn, dst, src);
+ sel.pop();
+ } else
+ NOT_IMPLEMENTED;
+
+ markAllChildren(dag);
+ return true;
+ }
+
+ };
+
+ /*! Sort patterns */
+ INLINE bool cmp(const SelectionPattern *p0, const SelectionPattern *p1) {
+ if (p0->insnNum != p1->insnNum)
+ return p0->insnNum > p1->insnNum;
+ return p0->cost < p1->cost;
+ }
+
+ SelectionLibrary::SelectionLibrary(void) {
+ this->insert<UnaryInstructionPattern>();
+ this->insert<BinaryInstructionPattern>();
+ this->insert<TypedWriteInstructionPattern>();
+ this->insert<SyncInstructionPattern>();
+ this->insert<LoadImmInstructionPattern>();
+ this->insert<LoadInstructionPattern>();
+ this->insert<StoreInstructionPattern>();
+ this->insert<SelectInstructionPattern>();
+ this->insert<CompareInstructionPattern>();
+ this->insert<BitCastInstructionPattern>();
+ this->insert<ConvertInstructionPattern>();
+ this->insert<AtomicInstructionPattern>();
+ this->insert<TernaryInstructionPattern>();
+ this->insert<LabelInstructionPattern>();
+ this->insert<BranchInstructionPattern>();
+ this->insert<Int32x32MulInstructionPattern>();
+ this->insert<Int32x16MulInstructionPattern>();
+ this->insert<MulAddInstructionPattern>();
+ this->insert<SelectModifierInstructionPattern>();
+ this->insert<SampleInstructionPattern>();
+ this->insert<GetImageInfoInstructionPattern>();
+
+ // Sort all the patterns with the number of instructions they output
+ for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+ std::sort(this->patterns[op].begin(), this->patterns[op].end(), cmp);
+ }
+
+ SelectionLibrary::~SelectionLibrary(void) {
+ for (auto pattern : this->toFree)
+ GBE_DELETE(const_cast<SelectionPattern*>(pattern));
+ }
+
+ template <typename PatternType>
+ void SelectionLibrary::insert(void) {
+ const SelectionPattern *pattern = GBE_NEW_NO_ARG(PatternType);
+ this->toFree.push_back(pattern);
+ for (auto opcode : pattern->opcodes)
+ this->patterns[opcode].push_back(pattern);
+ }
+
+} /* namespace gbe */
+
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
new file mode 100644
index 0000000..9bcce6f
--- /dev/null
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -0,0 +1,290 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_insn_selection.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GEN_INSN_SELECTION_HPP__
+#define __GEN_INSN_SELECTION_HPP__
+
+#include "ir/register.hpp"
+#include "ir/instruction.hpp"
+#include "backend/gen_register.hpp"
+#include "backend/gen_encoder.hpp"
+#include "backend/gen_context.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "sys/vector.hpp"
+#include "sys/intrusive_list.hpp"
+
+namespace gbe
+{
+ /*! Translate IR type to Gen type */
+ uint32_t getGenType(ir::Type type);
+ /*! Translate Gen type to IR type */
+ ir::Type getIRType(uint32_t genType);
+
+ /*! Translate IR compare to Gen compare */
+ uint32_t getGenCompare(ir::Opcode opcode);
+
+ #define GEN_MAX_LABEL 0xFFFF
+
+ /*! Selection opcodes properly encoded from 0 to n for fast jump tables
+ * generations
+ */
+ enum SelectionOpcode {
+#define DECL_SELECTION_IR(OP, FN) SEL_OP_##OP,
+#include "backend/gen_insn_selection.hxx"
+#undef DECL_SELECTION_IR
+ };
+
+ // Owns and Allocates selection instructions
+ class Selection;
+
+ // List of SelectionInstruction forms a block
+ class SelectionBlock;
+
+ /*! A selection instruction is also almost a Gen instruction but *before* the
+ * register allocation
+ */
+ class SelectionInstruction : public NonCopyable, public intrusive_list_node
+ {
+ public:
+ /*! Owns the instruction */
+ SelectionBlock *parent;
+ /*! Append an instruction before this one */
+ void prepend(SelectionInstruction &insn);
+ /*! Append an instruction after this one */
+ void append(SelectionInstruction &insn);
+ /*! Does it read memory? */
+ bool isRead(void) const;
+ /*! Does it write memory? */
+ bool isWrite(void) const;
+ /*! Is it a branch instruction (i.e. modify control flow) */
+ bool isBranch(void) const;
+ /*! Is it a label instruction (i.e. change the implicit mask) */
+ bool isLabel(void) const;
+ /*! Get the destination register */
+ GenRegister &dst(uint32_t dstID) { return regs[dstID]; }
+ /*! Get the source register */
+ GenRegister &src(uint32_t srcID) { return regs[dstNum+srcID]; }
+ /*! Damn C++ */
+ const GenRegister &dst(uint32_t dstID) const { return regs[dstID]; }
+ /*! Damn C++ */
+ const GenRegister &src(uint32_t srcID) const { return regs[dstNum+srcID]; }
+ /*! No more than 9 sources (used by typed writes on simd8 mode.) */
+ enum { MAX_SRC_NUM = 9 };
+ /*! No more than 16 destinations (15 used by I64DIV/I64REM) */
+ enum { MAX_DST_NUM = 16 };
+ /*! State of the instruction (extra fields neeed for the encoding) */
+ GenInstructionState state;
+ union {
+ struct {
+ /*! Store bti for loads/stores and function for math, atomic and compares */
+ uint16_t function:8;
+ /*! elemSize for byte scatters / gathers, elemNum for untyped msg, bti for atomic */
+ uint16_t elem:8;
+ };
+ struct {
+ /*! Number of sources in the tuple */
+ uint16_t width:4;
+ /*! vertical stride (0,1,2,4,8 or 16) */
+ uint16_t vstride:5;
+ /*! horizontal stride (0,1,2,4,8 or 16) */
+ uint16_t hstride:5;
+ /*! offset (0 to 7) */
+ uint16_t offset:5;
+ };
+ struct {
+ uint16_t scratchOffset;
+ uint16_t scratchMsgHeader;
+ };
+ struct {
+ uint16_t bti:8;
+ uint16_t msglen:5;
+ uint16_t is3DWrite:1;
+ };
+ struct {
+ uint16_t rdbti:8;
+ uint16_t sampler:5;
+ uint16_t rdmsglen:3;
+ bool isLD; // is this a ld message?
+ bool isUniform;
+ };
+ uint32_t barrierType;
+ bool longjmp;
+ } extra;
+ /*! Gen opcode */
+ uint8_t opcode;
+ /*! Number of destinations */
+ uint8_t dstNum:5;
+ /*! Number of sources */
+ uint8_t srcNum:4;
+ /*! To store various indices */
+ uint16_t index;
+ /*! For BRC/IF to store the UIP */
+ uint16_t index1;
+ /*! instruction ID used for vector allocation. */
+ uint32_t ID;
+ /*! Variable sized. Destinations and sources go here */
+ GenRegister regs[0];
+ INLINE uint32_t getbti() const {
+ GBE_ASSERT(isRead() || isWrite());
+ switch (opcode) {
+ case SEL_OP_ATOMIC: return extra.elem;
+ case SEL_OP_BYTE_SCATTER:
+ case SEL_OP_WRITE64:
+ case SEL_OP_DWORD_GATHER:
+ case SEL_OP_UNTYPED_WRITE:
+ case SEL_OP_UNTYPED_READ:
+ case SEL_OP_BYTE_GATHER:
+ case SEL_OP_READ64: return extra.function;
+ case SEL_OP_SAMPLE: return extra.rdbti;
+ case SEL_OP_TYPED_WRITE: return extra.bti;
+ default:
+ GBE_ASSERT(0);
+ }
+ return 0;
+ }
+ private:
+ INLINE void setbti(uint32_t bti) {
+ GBE_ASSERT(isRead() || isWrite());
+ switch (opcode) {
+ case SEL_OP_ATOMIC: extra.elem = bti; return;
+ case SEL_OP_BYTE_SCATTER:
+ case SEL_OP_WRITE64:
+ case SEL_OP_UNTYPED_WRITE:
+ case SEL_OP_DWORD_GATHER:
+ case SEL_OP_UNTYPED_READ:
+ case SEL_OP_BYTE_GATHER:
+ case SEL_OP_READ64: extra.function = bti; return;
+ case SEL_OP_SAMPLE: extra.rdbti = bti; return;
+ case SEL_OP_TYPED_WRITE: extra.bti = bti; return;
+ default:
+ GBE_ASSERT(0);
+ }
+ }
+ /*! Just Selection class can create SelectionInstruction */
+ SelectionInstruction(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
+ // Allocates (with a linear allocator) and owns SelectionInstruction
+ friend class Selection;
+ };
+
+ /*! Instructions like sends require to make registers contiguous in GRF */
+ class SelectionVector : public NonCopyable, public intrusive_list_node
+ {
+ public:
+ SelectionVector(void);
+ /*! The instruction that requires the vector of registers */
+ SelectionInstruction *insn;
+ /*! Directly points to the selection instruction registers */
+ GenRegister *reg;
+ /*! Number of registers in the vector */
+ uint16_t regNum;
+ /*! Indicate if this a destination or a source vector */
+ uint16_t isSrc;
+ };
+
+ // Owns the selection block
+ class Selection;
+
+ /*! A selection block is the counterpart of the IR Basic block. It contains
+ * the instructions generated from an IR basic block
+ */
+ class SelectionBlock : public NonCopyable, public intrusive_list_node
+ {
+ public:
+ SelectionBlock(const ir::BasicBlock *bb);
+ /*! All the emitted instructions in the block */
+ intrusive_list<SelectionInstruction> insnList;
+ /*! The vectors that may be required by some instructions of the block */
+ intrusive_list<SelectionVector> vectorList;
+ /*! Extra registers needed by the block (only live in the block) */
+ gbe::vector<ir::Register> tmp;
+ /*! Associated IR basic block */
+ const ir::BasicBlock *bb;
+ /*! Append a new temporary register */
+ void append(ir::Register reg);
+ /*! Append a new selection vector in the block */
+ void append(SelectionVector *vec);
+ /*! Append a new selection instruction at the end of the block */
+ void append(SelectionInstruction *insn);
+ /*! Append a new selection instruction at the beginning of the block */
+ void prepend(SelectionInstruction *insn);
+ bool isLargeBlock;
+ ir::LabelIndex endifLabel;
+ int endifOffset;
+ bool hasBarrier;
+ bool hasBranch;
+ };
+
+ /*! Owns the selection engine */
+ class GenContext;
+ /*! Selection engine produces the pre-ISA instruction blocks */
+ class Selection
+ {
+ public:
+ /*! Initialize internal structures used for the selection */
+ Selection(GenContext &ctx);
+ /*! Release everything */
+ ~Selection(void);
+ /*! Implements the instruction selection itself */
+ void select(void);
+ /*! Get the number of instructions of the largest block */
+ uint32_t getLargestBlockSize(void) const;
+ /*! Number of register vectors in the selection */
+ uint32_t getVectorNum(void) const;
+ /*! Number of registers (temporaries are created during selection) */
+ uint32_t getRegNum(void) const;
+ /*! Get the family for the given register */
+ ir::RegisterFamily getRegisterFamily(ir::Register reg) const;
+ /*! Get the data for the given register */
+ ir::RegisterData getRegisterData(ir::Register reg) const;
+ /*! Replace a source by the returned temporary register */
+ ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type = ir::TYPE_FLOAT, bool needMov = true);
+ /*! Replace a destination to the returned temporary register */
+ ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type = ir::TYPE_FLOAT, bool needMov = true);
+ /*! spill a register (insert spill/unspill instructions) */
+ bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
+ /*! Indicate if a register is scalar or not */
+ bool isScalarReg(const ir::Register ®) const;
+ /*! Create a new selection instruction */
+ SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
+ /*! List of emitted blocks */
+ intrusive_list<SelectionBlock> *blockList;
+ /*! Actual implementation of the register allocator (use Pimpl) */
+ class Opaque;
+ /*! Created and destroyed in cpp */
+ Opaque *opaque;
+ /*! Use custom allocators */
+ GBE_CLASS(Selection);
+ };
+
+ class Selection75: public Selection
+ {
+ public:
+ /*! Initialize internal structures used for the selection */
+ Selection75(GenContext &ctx);
+ };
+
+} /* namespace gbe */
+
+#endif /* __GEN_INSN_SELECTION_HPP__ */
+
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
new file mode 100644
index 0000000..ddc9d5e
--- /dev/null
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -0,0 +1,86 @@
+DECL_SELECTION_IR(LABEL, LabelInstruction)
+DECL_SELECTION_IR(MOV, UnaryInstruction)
+DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
+DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
+DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
+DECL_SELECTION_IR(NOT, UnaryInstruction)
+DECL_SELECTION_IR(LZD, UnaryInstruction)
+DECL_SELECTION_IR(RNDZ, UnaryInstruction)
+DECL_SELECTION_IR(RNDE, UnaryInstruction)
+DECL_SELECTION_IR(RNDD, UnaryInstruction)
+DECL_SELECTION_IR(RNDU, UnaryInstruction)
+DECL_SELECTION_IR(FRC, UnaryInstruction)
+DECL_SELECTION_IR(F16TO32, UnaryInstruction)
+DECL_SELECTION_IR(F32TO16, UnaryInstruction)
+DECL_SELECTION_IR(SEL, BinaryInstruction)
+DECL_SELECTION_IR(SEL_INT64, BinaryInstruction)
+DECL_SELECTION_IR(AND, BinaryInstruction)
+DECL_SELECTION_IR(OR, BinaryInstruction)
+DECL_SELECTION_IR(XOR, BinaryInstruction)
+DECL_SELECTION_IR(I64AND, BinaryInstruction)
+DECL_SELECTION_IR(I64OR, BinaryInstruction)
+DECL_SELECTION_IR(I64XOR, BinaryInstruction)
+DECL_SELECTION_IR(SHR, BinaryInstruction)
+DECL_SELECTION_IR(SHL, BinaryInstruction)
+DECL_SELECTION_IR(RSR, BinaryInstruction)
+DECL_SELECTION_IR(RSL, BinaryInstruction)
+DECL_SELECTION_IR(ASR, BinaryInstruction)
+DECL_SELECTION_IR(I64SHR, I64ShiftInstruction)
+DECL_SELECTION_IR(I64SHL, I64ShiftInstruction)
+DECL_SELECTION_IR(I64ASR, I64ShiftInstruction)
+DECL_SELECTION_IR(ADD, BinaryInstruction)
+DECL_SELECTION_IR(I64ADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64SATADD, I64SATADDInstruction)
+DECL_SELECTION_IR(I64SUB, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64SATSUB, I64SATSUBInstruction)
+DECL_SELECTION_IR(MUL, BinaryInstruction)
+DECL_SELECTION_IR(I64MUL, I64MULInstruction)
+DECL_SELECTION_IR(I64DIV, I64DIVREMInstruction)
+DECL_SELECTION_IR(I64REM, I64DIVREMInstruction)
+DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
+DECL_SELECTION_IR(MACH, BinaryInstruction)
+DECL_SELECTION_IR(CMP, CompareInstruction)
+DECL_SELECTION_IR(I64CMP, I64CompareInstruction)
+DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
+DECL_SELECTION_IR(MAD, TernaryInstruction)
+DECL_SELECTION_IR(JMPI, JumpInstruction)
+DECL_SELECTION_IR(EOT, EotInstruction)
+DECL_SELECTION_IR(INDIRECT_MOVE, IndirectMoveInstruction)
+DECL_SELECTION_IR(NOP, NoOpInstruction)
+DECL_SELECTION_IR(WAIT, WaitInstruction)
+DECL_SELECTION_IR(MATH, MathInstruction)
+DECL_SELECTION_IR(BARRIER, BarrierInstruction)
+DECL_SELECTION_IR(FENCE, FenceInstruction)
+DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction)
+DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction)
+DECL_SELECTION_IR(READ64, Read64Instruction)
+DECL_SELECTION_IR(WRITE64, Write64Instruction)
+DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
+DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
+DECL_SELECTION_IR(DWORD_GATHER, DWordGatherInstruction)
+DECL_SELECTION_IR(PACK_BYTE, PackByteInstruction)
+DECL_SELECTION_IR(UNPACK_BYTE, UnpackByteInstruction)
+DECL_SELECTION_IR(SAMPLE, SampleInstruction)
+DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
+DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
+DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
+DECL_SELECTION_IR(MUL_HI, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64_MUL_HI, I64MULHIInstruction)
+DECL_SELECTION_IR(FBH, UnaryInstruction)
+DECL_SELECTION_IR(FBL, UnaryInstruction)
+DECL_SELECTION_IR(HADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(RHADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64HADD, I64HADDInstruction)
+DECL_SELECTION_IR(I64RHADD, I64RHADDInstruction)
+DECL_SELECTION_IR(UPSAMPLE_SHORT, BinaryInstruction)
+DECL_SELECTION_IR(UPSAMPLE_INT, BinaryInstruction)
+DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
+DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)
+DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
+DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
+DECL_SELECTION_IR(CONVF_TO_I64, FloatToI64Instruction)
+DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
+DECL_SELECTION_IR(BRC, UnaryInstruction)
+DECL_SELECTION_IR(BRD, UnaryInstruction)
+DECL_SELECTION_IR(IF, UnaryInstruction)
+DECL_SELECTION_IR(ENDIF, UnaryInstruction)
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
new file mode 100644
index 0000000..5324587
--- /dev/null
+++ b/backend/src/backend/gen_program.cpp
@@ -0,0 +1,444 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file program.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/DataLayout.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/Linker/Linker.h"
+#else
+#include "llvm/Linker.h"
+#endif
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IRReader/IRReader.h"
+
+#include "backend/program.h"
+#include "backend/gen_program.h"
+#include "backend/gen_program.hpp"
+#include "backend/gen_context.hpp"
+#include "backend/gen75_context.hpp"
+#include "backend/gen_defs.hpp"
+#include "backend/gen/gen_mesa_disasm.h"
+#include "backend/gen_reg_allocation.hpp"
+#include "ir/unit.hpp"
+#include "llvm/llvm_to_gen.hpp"
+#include "llvm/llvm_gen_backend.hpp"
+
+#include <clang/CodeGen/CodeGenAction.h>
+
+#include <cstring>
+#include <sstream>
+#include <memory>
+#include <iostream>
+#include <fstream>
+#include <mutex>
+#include <unistd.h>
+
+namespace gbe {
+
+ GenKernel::GenKernel(const std::string &name, uint32_t deviceID) :
+ Kernel(name), deviceID(deviceID), insns(NULL), insnNum(0)
+ {}
+ GenKernel::~GenKernel(void) { GBE_SAFE_DELETE_ARRAY(insns); }
+ const char *GenKernel::getCode(void) const { return (const char*) insns; }
+ const void GenKernel::setCode(const char * ins, size_t size) {
+ insns = (GenInstruction *)ins;
+ insnNum = size / sizeof(GenInstruction);
+ }
+ size_t GenKernel::getCodeSize(void) const { return insnNum * sizeof(GenInstruction); }
+
+ void GenKernel::printStatus(int indent, std::ostream& outs) {
+#ifdef GBE_COMPILER_AVAILABLE
+ Kernel::printStatus(indent, outs);
+
+ FILE *f = fopen("/dev/null", "w");
+ char *buf = new char[4096];
+ setbuffer(f, buf, 4096);
+ GenCompactInstruction * pCom = NULL;
+ GenNativeInstruction nativeInsn;
+
+ for (uint32_t i = 0; i < insnNum;) {
+ pCom = (GenCompactInstruction*)(insns+i);
+ if(pCom->bits1.cmpt_control == 1) {
+ decompactInstruction(pCom, &nativeInsn);
+ gen_disasm(f, &nativeInsn, deviceID, 1);
+ i++;
+ } else {
+ gen_disasm(f, insns+i, deviceID, 0);
+ i = i + 2;
+ }
+ outs << buf;
+ fflush(f);
+ setbuffer(f, NULL, 0);
+ setbuffer(f, buf, 4096);
+ }
+
+ setbuffer(f, NULL, 0);
+ delete [] buf;
+ fclose(f);
+#endif
+ }
+
+ void GenProgram::CleanLlvmResource(void){
+#ifdef GBE_COMPILER_AVAILABLE
+ if(module){
+ delete (llvm::Module*)module;
+ module = NULL;
+ }
+
+ if(llvm_ctx){
+ delete (llvm::LLVMContext*)llvm_ctx;
+ llvm_ctx = NULL;
+ }
+#endif
+ }
+
+ /*! We must avoid spilling at all cost with Gen */
+ static const struct CodeGenStrategy {
+ uint32_t simdWidth;
+ uint32_t reservedSpillRegs;
+ bool limitRegisterPressure;
+ } codeGenStrategy[] = {
+ {16, 0, false},
+ {16, 10, false},
+ {8, 0, false},
+ {8, 8, false},
+ {8, 16, false},
+ };
+
+ Kernel *GenProgram::compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath) {
+#ifdef GBE_COMPILER_AVAILABLE
+ // Be careful when the simdWidth is forced by the programmer. We can see it
+ // when the function already provides the simd width we need to use (i.e.
+ // non zero)
+ const ir::Function *fn = unit.getFunction(name);
+ uint32_t codeGenNum = sizeof(codeGenStrategy) / sizeof(codeGenStrategy[0]);
+ uint32_t codeGen = 0;
+ GenContext *ctx = NULL;
+ if (fn->getSimdWidth() == 8) {
+ codeGen = 2;
+ } else if (fn->getSimdWidth() == 16) {
+ codeGenNum = 2;
+ } else if (fn->getSimdWidth() == 0) {
+ codeGen = 0;
+ } else
+ GBE_ASSERT(0);
+ Kernel *kernel = NULL;
+
+ // Stop when compilation is successful
+ if (IS_IVYBRIDGE(deviceID)) {
+ ctx = GBE_NEW(GenContext, unit, name, deviceID, relaxMath);
+ } else if (IS_HASWELL(deviceID)) {
+ ctx = GBE_NEW(Gen75Context, unit, name, deviceID, relaxMath);
+ }
+ GBE_ASSERTM(ctx != NULL, "Fail to create the gen context\n");
+
+ for (; codeGen < codeGenNum; ++codeGen) {
+ const uint32_t simdWidth = codeGenStrategy[codeGen].simdWidth;
+ const bool limitRegisterPressure = codeGenStrategy[codeGen].limitRegisterPressure;
+ const uint32_t reservedSpillRegs = codeGenStrategy[codeGen].reservedSpillRegs;
+
+ // Force the SIMD width now and try to compile
+ unit.getFunction(name)->setSimdWidth(simdWidth);
+ ctx->startNewCG(simdWidth, reservedSpillRegs, limitRegisterPressure);
+ kernel = ctx->compileKernel();
+ if (kernel != NULL) {
+ GBE_ASSERT(ctx->getErrCode() == NO_ERROR);
+ break;
+ }
+ fn->getImageSet()->clearInfo();
+ // If we get a out of range if/endif error.
+ // We need to set the context to if endif fix mode and restart the previous compile.
+ if ( ctx->getErrCode() == OUT_OF_RANGE_IF_ENDIF && !ctx->getIFENDIFFix() ) {
+ ctx->setIFENDIFFix(true);
+ codeGen--;
+ } else
+ GBE_ASSERT(!(ctx->getErrCode() == OUT_OF_RANGE_IF_ENDIF && ctx->getIFENDIFFix()));
+ }
+
+ GBE_ASSERTM(kernel != NULL, "Fail to compile kernel, may need to increase reserved registers for spilling.");
+ return kernel;
+#else
+ return NULL;
+#endif
+ }
+
+#define BINARY_HEADER_LENGTH 8
+#define IS_GEN_BINARY(binary) (*binary == '\0' && *(binary+1) == 'G'&& *(binary+2) == 'E' &&*(binary+3) == 'N' &&*(binary+4) == 'C')
+#define FILL_GEN_BINARY(binary) do{*binary = '\0'; *(binary+1) = 'G'; *(binary+2) = 'E'; *(binary+3) = 'N'; *(binary+4) = 'C';}while(0)
+#define FILL_DEVICE_ID(binary, src_hw_info) do {*(binary+5) = src_hw_info[0]; *(binary+6) = src_hw_info[1]; *(binary+7) = src_hw_info[2];}while(0)
+#define DEVICE_MATCH(typeA, src_hw_info) ((IS_IVYBRIDGE(typeA) && !strcmp(src_hw_info, "IVB")) || \
+ (IS_IVYBRIDGE(typeA) && !strcmp(src_hw_info, "BYT")) || \
+ (IS_BAYTRAIL_T(typeA) && !strcmp(src_hw_info, "BYT")) || \
+ (IS_HASWELL(typeA) && !strcmp(src_hw_info, "HSW")) )
+
+ static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char *binary, size_t size) {
+ using namespace gbe;
+ std::string binary_content;
+ //the header length is 8 bytes: 1 byte is binary type, 4 bytes are bitcode header, 3 bytes are hw info.
+ char src_hw_info[4]="";
+ src_hw_info[0] = *(binary+5);
+ src_hw_info[1] = *(binary+6);
+ src_hw_info[2] = *(binary+7);
+
+ // check whether is gen binary ('/0GENC')
+ if(!IS_GEN_BINARY(binary)){
+ return NULL;
+ }
+ // check the whether the current device ID match the binary file's.
+ if(!DEVICE_MATCH(deviceID, src_hw_info)){
+ return NULL;
+ }
+
+ binary_content.assign(binary+BINARY_HEADER_LENGTH, size-BINARY_HEADER_LENGTH);
+ GenProgram *program = GBE_NEW(GenProgram, deviceID);
+ std::istringstream ifs(binary_content, std::ostringstream::binary);
+
+ if (!program->deserializeFromBin(ifs)) {
+ delete program;
+ return NULL;
+ }
+
+ //program->printStatus(0, std::cout);
+ return reinterpret_cast<gbe_program>(program);
+ }
+
+ static gbe_program genProgramNewFromLLVMBinary(uint32_t deviceID, const char *binary, size_t size) {
+#ifdef GBE_COMPILER_AVAILABLE
+ using namespace gbe;
+ std::string binary_content;
+ //the first byte stands for binary_type.
+ binary_content.assign(binary+1, size-1);
+ llvm::StringRef llvm_bin_str(binary_content);
+ llvm::LLVMContext& c = llvm::getGlobalContext();
+ llvm::SMDiagnostic Err;
+ llvm::MemoryBuffer* memory_buffer = llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
+ acquireLLVMContextLock();
+ llvm::Module* module = llvm::ParseIR(memory_buffer, Err, c);
+ releaseLLVMContextLock();
+ if(module == NULL){
+ GBE_ASSERT(0);
+ }
+
+ GenProgram *program = GBE_NEW(GenProgram, deviceID, module);
+
+ //program->printStatus(0, std::cout);
+ return reinterpret_cast<gbe_program>(program);
+#else
+ return NULL;
+#endif
+ }
+
+ static size_t genProgramSerializeToBinary(gbe_program program, char **binary, int binary_type) {
+ using namespace gbe;
+ size_t sz;
+ std::ostringstream oss;
+ GenProgram *prog = (GenProgram*)program;
+
+ //0 means GEN binary, 1 means LLVM bitcode compiled object, 2 means LLVM bitcode library
+ if(binary_type == 0){
+ if ((sz = prog->serializeToBin(oss)) == 0) {
+ *binary = NULL;
+ return 0;
+ }
+
+ //add header to differetiate from llvm bitcode binary.
+ //the header length is 8 bytes: 1 byte is binary type, 4 bytes are bitcode header, 3 bytes are hw info.
+ *binary = (char *)malloc(sizeof(char) * (sz+BINARY_HEADER_LENGTH) );
+ memset(*binary, 0, sizeof(char) * (sz+BINARY_HEADER_LENGTH) );
+ FILL_GEN_BINARY(*binary);
+ char src_hw_info[4]="";
+ if(IS_IVYBRIDGE(prog->deviceID)){
+ src_hw_info[0]='I';
+ src_hw_info[1]='V';
+ src_hw_info[2]='B';
+ if(IS_BAYTRAIL_T(prog->deviceID)){
+ src_hw_info[0]='B';
+ src_hw_info[1]='Y';
+ src_hw_info[2]='T';
+ }
+ }else if(IS_HASWELL(prog->deviceID)){
+ src_hw_info[0]='H';
+ src_hw_info[1]='S';
+ src_hw_info[2]='W';
+ }
+ FILL_DEVICE_ID(*binary, src_hw_info);
+ memcpy(*binary+BINARY_HEADER_LENGTH, oss.str().c_str(), sz*sizeof(char));
+ return sz+BINARY_HEADER_LENGTH;
+ }else{
+#ifdef GBE_COMPILER_AVAILABLE
+ std::string str;
+ llvm::raw_string_ostream OS(str);
+ llvm::WriteBitcodeToFile((llvm::Module*)prog->module, OS);
+ std::string& bin_str = OS.str();
+ int llsz = bin_str.size();
+ *binary = (char *)malloc(sizeof(char) * (llsz+1) );
+ *(*binary) = binary_type;
+ memcpy(*binary+1, bin_str.c_str(), llsz);
+ return llsz+1;
+#else
+ return 0;
+#endif
+ }
+ }
+
+ static gbe_program genProgramNewFromLLVM(uint32_t deviceID,
+ const char *fileName,
+ const void* module,
+ const void* llvm_ctx,
+ size_t stringSize,
+ char *err,
+ size_t *errSize,
+ int optLevel)
+ {
+ using namespace gbe;
+ GenProgram *program = GBE_NEW(GenProgram, deviceID, module, llvm_ctx);
+#ifdef GBE_COMPILER_AVAILABLE
+ std::string error;
+ // Try to compile the program
+ if (program->buildFromLLVMFile(fileName, module, error, optLevel) == false) {
+ if (err != NULL && errSize != NULL && stringSize > 0u) {
+ const size_t msgSize = std::min(error.size(), stringSize-1u);
+ std::memcpy(err, error.c_str(), msgSize);
+ *errSize = error.size();
+ }
+ GBE_DELETE(program);
+ return NULL;
+ }
+#endif
+ // Everything run fine
+ return (gbe_program) program;
+ }
+
+ static gbe_program genProgramNewGenProgram(uint32_t deviceID, const void* module,
+ const void* llvm_ctx) {
+ using namespace gbe;
+ GenProgram *program = GBE_NEW(GenProgram, deviceID, module, llvm_ctx);
+ // Everything run fine
+ return (gbe_program) program;
+ }
+
+ static void genProgramLinkFromLLVM(gbe_program dst_program,
+ gbe_program src_program,
+ size_t stringSize,
+ char * err,
+ size_t * errSize)
+ {
+#ifdef GBE_COMPILER_AVAILABLE
+ using namespace gbe;
+ std::string errMsg;
+ if(((GenProgram*)dst_program)->module == NULL){
+ ((GenProgram*)dst_program)->module = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module);
+ errSize = 0;
+ }else{
+ //set the global variables and functions to link once to fix redefine.
+ llvm::Module* src = (llvm::Module*)((GenProgram*)src_program)->module;
+ for (llvm::Module::global_iterator I = src->global_begin(), E = src->global_end(); I != E; ++I) {
+ I->setLinkage(llvm::GlobalValue::LinkOnceAnyLinkage);
+ }
+
+ for (llvm::Module::iterator I = src->begin(), E = src->end(); I != E; ++I) {
+ llvm::Function *F = llvm::dyn_cast<llvm::Function>(I);
+ if (F && isKernelFunction(*F)) continue;
+ I->setLinkage(llvm::GlobalValue::LinkOnceAnyLinkage);
+ }
+ llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)->module;
+ llvm::Linker::LinkModules( dst,
+ src,
+ llvm::Linker::PreserveSource,
+ &errMsg);
+ if (errMsg.c_str() != NULL) {
+ if (err != NULL && errSize != NULL && stringSize > 0u) {
+ if(errMsg.length() < stringSize )
+ stringSize = errMsg.length();
+ strcpy(err, errMsg.c_str());
+ err[stringSize+1] = '\0';
+ }
+ }
+ }
+ // Everything run fine
+#endif
+ }
+
+ static void genProgramBuildFromLLVM(gbe_program program,
+ size_t stringSize,
+ char *err,
+ size_t *errSize,
+ const char * options)
+ {
+#ifdef GBE_COMPILER_AVAILABLE
+ using namespace gbe;
+ std::string error;
+
+ int optLevel = 1;
+
+ if(options) {
+ char *p;
+ p = strstr(const_cast<char *>(options), "-cl-opt-disable");
+ if (p)
+ optLevel = 0;
+ }
+
+ GenProgram* p = (GenProgram*) program;
+ // Try to compile the program
+ acquireLLVMContextLock();
+ llvm::Module* module = (llvm::Module*)p->module;
+
+ if (p->buildFromLLVMFile(NULL, module, error, optLevel) == false) {
+ if (err != NULL && errSize != NULL && stringSize > 0u) {
+ const size_t msgSize = std::min(error.size(), stringSize-1u);
+ std::memcpy(err, error.c_str(), msgSize);
+ *errSize = error.size();
+ }
+ GBE_DELETE(p);
+ }
+ releaseLLVMContextLock();
+#endif
+ }
+
+} /* namespace gbe */
+
+void genSetupCallBacks(void)
+{
+ gbe_program_new_from_binary = gbe::genProgramNewFromBinary;
+ gbe_program_new_from_llvm_binary = gbe::genProgramNewFromLLVMBinary;
+ gbe_program_serialize_to_binary = gbe::genProgramSerializeToBinary;
+ gbe_program_new_from_llvm = gbe::genProgramNewFromLLVM;
+ gbe_program_new_gen_program = gbe::genProgramNewGenProgram;
+ gbe_program_link_from_llvm = gbe::genProgramLinkFromLLVM;
+ gbe_program_build_from_llvm = gbe::genProgramBuildFromLLVM;
+}
diff --git a/backend/src/backend/gen_program.h b/backend/src/backend/gen_program.h
new file mode 100644
index 0000000..8d37a70
--- /dev/null
+++ b/backend/src/backend/gen_program.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file program.h
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * C-like interface for the gen kernels and programs
+ */
+
+#ifndef __GBE_GEN_PROGRAM_H__
+#define __GBE_GEN_PROGRAM_H__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <semaphore.h>
+
+/*! This will make the compiler output Gen ISA code */
+extern void genSetupCallBacks(void);
+
+#endif /* __GBE_GEN_PROGRAM_H__ */
+
diff --git a/backend/src/backend/gen_program.hpp b/backend/src/backend/gen_program.hpp
new file mode 100644
index 0000000..1b5136e
--- /dev/null
+++ b/backend/src/backend/gen_program.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file program.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_GEN_PROGRAM_HPP__
+#define __GBE_GEN_PROGRAM_HPP__
+
+#include "backend/program.h"
+#include "backend/program.hpp"
+#include "backend/gen_defs.hpp"
+
+// Gen ISA instruction
+struct GenInstruction;
+namespace gbe
+{
+ /*! Describe a compiled kernel */
+ class GenKernel : public Kernel
+ {
+ public:
+ /*! Create an empty kernel with the given name */
+ GenKernel(const std::string &name, uint32_t deviceID);
+ /*! Destroy it */
+ virtual ~GenKernel(void);
+ /*! Implements base class */
+ virtual const char *getCode(void) const;
+ /*! Set the instruction stream (to be implemented) */
+ virtual const void setCode(const char *, size_t size);
+ /*! Implements get the code size */
+ virtual size_t getCodeSize(void) const;
+ /*! Implements printStatus*/
+ virtual void printStatus(int indent, std::ostream& outs);
+ uint32_t deviceID; //!< Current device ID
+ GenInstruction *insns; //!< Instruction stream
+ uint32_t insnNum; //!< Number of instructions
+ GBE_CLASS(GenKernel); //!< Use custom allocators
+ };
+
+ /*! Describe a compiled program */
+ class GenProgram : public Program
+ {
+ public:
+ /*! Create an empty program */
+ GenProgram(uint32_t deviceID, const void* mod = NULL, const void* ctx = NULL) : deviceID(deviceID),module((void*)mod), llvm_ctx((void*)ctx) {}
+ /*! Current device ID*/
+ uint32_t deviceID;
+ /*! Destroy the program */
+ virtual ~GenProgram(void) {};
+ /*! Clean LLVM resource */
+ virtual void CleanLlvmResource(void);
+ /*! Implements base class */
+ virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath);
+ /*! Allocate an empty kernel. */
+ virtual Kernel *allocateKernel(const std::string &name) {
+ return GBE_NEW(GenKernel, name, deviceID);
+ }
+ void* module;
+ void* llvm_ctx;
+ /*! Use custom allocators */
+ GBE_CLASS(GenProgram);
+ };
+ /*! decompact GEN ASM if it is in compacted format */
+ extern void decompactInstruction(union GenCompactInstruction *p, union GenNativeInstruction *pOut);
+} /* namespace gbe */
+
+#endif /* __GBE_GEN_PROGRAM_HPP__ */
+
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
new file mode 100644
index 0000000..b7fbc93
--- /dev/null
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -0,0 +1,1218 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_reg_allocation.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/profile.hpp"
+#include "ir/function.hpp"
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "backend/gen_register.hpp"
+#include "backend/program.hpp"
+#include "sys/exception.hpp"
+#include "sys/cvar.hpp"
+#include <algorithm>
+#include <climits>
+#include <iostream>
+#include <iomanip>
+
+
+namespace gbe
+{
+ /////////////////////////////////////////////////////////////////////////////
+ // Register allocator internal implementation
+ /////////////////////////////////////////////////////////////////////////////
+
+ /*! Provides the location of a register in a vector */
+ typedef std::pair<SelectionVector*, uint32_t> VectorLocation;
+ /*! Interval as used in linear scan allocator. Basically, stores the first and
+ * the last instruction where the register is alive
+ */
+ struct GenRegInterval {
+ INLINE GenRegInterval(ir::Register reg) :
+ reg(reg), minID(INT_MAX), maxID(-INT_MAX) {}
+ ir::Register reg; //!< (virtual) register of the interval
+ int32_t minID, maxID; //!< Starting and ending points
+ };
+
+ typedef struct GenRegIntervalKey {
+ GenRegIntervalKey(uint16_t reg, int32_t maxID) {
+ key = ((uint64_t)maxID << 16) | reg;
+ }
+ const ir::Register getReg() const {
+ return (ir::Register)(key & 0xFFFF);
+ }
+ const int32_t getMaxID() const {
+ return key >> 16;
+ }
+ uint64_t key;
+ } GenRegIntervalKey;
+
+ struct spillCmp {
+ bool operator () (const GenRegIntervalKey &lhs, const GenRegIntervalKey &rhs) const
+ { return lhs.key > rhs.key; }
+ };
+
+ typedef set <GenRegIntervalKey, spillCmp> SpillSet;
+
+ class SpillCandidateSet : public SpillSet
+ {
+ public:
+ std::set<GenRegIntervalKey, spillCmp>::iterator find(GenRegInterval interval) {
+ GenRegIntervalKey key(interval.reg, interval.maxID);
+ return SpillSet::find(key);
+ }
+ void insert(GenRegInterval interval) {
+ GenRegIntervalKey key(interval.reg, interval.maxID);
+ SpillSet::insert(key);
+ }
+ void erase(GenRegInterval interval) {
+ GenRegIntervalKey key(interval.reg, interval.maxID);
+ SpillSet::erase(key);
+ }
+ };
+
+ /*! Implements the register allocation */
+ class GenRegAllocator::Opaque
+ {
+ public:
+ /*! Initialize the register allocator */
+ Opaque(GenContext &ctx);
+ /*! Release all taken resources */
+ ~Opaque(void);
+ /*! Perform the register allocation. Return true if success */
+ bool allocate(Selection &selection);
+ /*! Return the Gen register from the selection register */
+ GenRegister genReg(const GenRegister ®);
+ /*! Output the register allocation */
+ void outputAllocation(void);
+ INLINE void getRegAttrib(ir::Register reg, uint32_t ®Size, ir::RegisterFamily *regFamily = NULL) const {
+ // Note that byte vector registers use two bytes per byte (and can be
+ // interleaved)
+ static const size_t familyVectorSize[] = {2,2,2,4,8};
+ static const size_t familyScalarSize[] = {2,2,2,4,8};
+ using namespace ir;
+ const bool isScalar = ctx.sel->isScalarReg(reg);
+ const RegisterData regData = ctx.sel->getRegisterData(reg);
+ const RegisterFamily family = regData.family;
+ const uint32_t typeSize = isScalar ? familyScalarSize[family] : familyVectorSize[family];
+ regSize = isScalar ? typeSize : ctx.getSimdWidth() * typeSize;
+ if (regFamily != NULL)
+ *regFamily = family;
+ }
+ private:
+ /*! Expire one GRF interval. Return true if one was successfully expired */
+ bool expireGRF(const GenRegInterval &limit);
+ /*! Expire a flag register. Return true if one was successfully expired */
+ bool expireFlag(const GenRegInterval &limit);
+ /*! Allocate the virtual boolean (== flags) registers */
+ void allocateFlags(Selection &selection);
+ /*! validated flags which contains valid value in the physical flag register */
+ set<uint16_t> validatedFlags;
+ /*! validated temp flag register which indicate the flag 0,1 contains which virtual flag register. */
+ uint16_t validTempFlagReg;
+ /*! validate flag for the current flag user instruction */
+ void validateFlag(Selection &selection, SelectionInstruction &insn);
+ /*! Allocate the GRF registers */
+ bool allocateGRFs(Selection &selection);
+ /*! Create gen registers for all preallocated curbe registers. */
+ void allocatePayloadRegs(void);
+ /*! Create a Gen register from a register set in the payload */
+ void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset = 0);
+ /*! Create the intervals for each register */
+ /*! Allocate the vectors detected in the instruction selection pass */
+ void allocateVector(Selection &selection);
+ /*! Allocate the given interval. Return true if success */
+ bool createGenReg(const GenRegInterval &interval);
+ /*! Indicate if the registers are already allocated in vectors */
+ bool isAllocated(const SelectionVector *vector) const;
+ /*! Reallocate registers if needed to make the registers in the vector
+ * contigous in memory
+ */
+ void coalesce(Selection &selection, SelectionVector *vector);
+ /*! The context owns the register allocator */
+ GenContext &ctx;
+ /*! Map virtual registers to offset in the (physical) register file */
+ map<ir::Register, uint32_t> RA;
+ /*! Map offset to virtual registers. */
+ map<uint32_t, ir::Register> offsetReg;
+ /*! Provides the position of each register in a vector */
+ map<ir::Register, VectorLocation> vectorMap;
+ /*! All vectors used in the selection */
+ vector<SelectionVector*> vectors;
+ /*! The set of booleans that will go to GRF (cannot be kept into flags) */
+ set<ir::Register> grfBooleans;
+ /*! The set of booleans which be held in flags, don't need to allocate grf */
+ set<ir::Register> flagBooleans;
+ /*! All the register intervals */
+ vector<GenRegInterval> intervals;
+ /*! All the boolean register intervals on the corresponding BB*/
+ typedef map<ir::Register, GenRegInterval> RegIntervalMap;
+ set<SelectionBlock *> flag0ReservedBlocks;
+ map<SelectionBlock *, RegIntervalMap *> boolIntervalsMap;
+ /*! Intervals sorting based on starting point positions */
+ vector<GenRegInterval*> starting;
+ /*! Intervals sorting based on ending point positions */
+ vector<GenRegInterval*> ending;
+ /*! registers that are spilled */
+ SpilledRegs spilledRegs;
+ /*! register which could be spilled.*/
+ SpillCandidateSet spillCandidate;
+ /* reserved registers for register spill/reload */
+ uint32_t reservedReg;
+ /*! Current vector to expire */
+ uint32_t expiringID;
+ INLINE void insertNewReg(ir::Register reg, uint32_t grfOffset, bool isVector = false);
+ INLINE bool expireReg(ir::Register reg);
+ INLINE bool spillAtInterval(GenRegInterval interval, int size, uint32_t alignment);
+ INLINE uint32_t allocateReg(GenRegInterval interval, uint32_t size, uint32_t alignment);
+ INLINE bool spillReg(GenRegInterval interval, bool isAllocated = false);
+ INLINE bool spillReg(ir::Register reg, bool isAllocated = false);
+ INLINE bool vectorCanSpill(SelectionVector *vector);
+ INLINE void allocateScratchForSpilled();
+
+ /*! replace specified source/dst register with temporary register and update interval */
+ INLINE ir::Register replaceReg(Selection &sel, SelectionInstruction *insn,
+ uint32_t regID, bool isSrc,
+ ir::Type type = ir::TYPE_FLOAT, bool needMov = true) {
+ ir::Register reg;
+ if (isSrc)
+ reg = sel.replaceSrc(insn, regID, type, needMov);
+ else
+ reg = sel.replaceDst(insn, regID, type, needMov);
+ intervals.push_back(reg);
+ intervals[reg].minID = insn->ID;
+ intervals[reg].maxID = insn->ID;
+ return reg;
+ }
+ /*! Use custom allocator */
+ GBE_CLASS(Opaque);
+ };
+
+
+ GenRegAllocator::Opaque::Opaque(GenContext &ctx) : ctx(ctx) {}
+ GenRegAllocator::Opaque::~Opaque(void) {}
+
+ void GenRegAllocator::Opaque::allocatePayloadReg(ir::Register reg,
+ uint32_t offset,
+ uint32_t subOffset)
+ {
+ using namespace ir;
+ assert(offset >= GEN_REG_SIZE);
+ offset += subOffset;
+ RA.insert(std::make_pair(reg, offset));
+ GBE_ASSERT(reg != ocl::blockip || (offset % GEN_REG_SIZE == 0));
+ this->intervals[reg].minID = 0;
+ this->intervals[reg].maxID = 0;
+ }
+
+ INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) {
+ using namespace ir;
+ for(auto &it : this->ctx.curbeRegs)
+ allocatePayloadReg(it.first, it.second);
+
+ // Allocate all pushed registers (i.e. structure kernel arguments)
+ const Function &fn = ctx.getFunction();
+ GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
+ const Function::PushMap &pushMap = fn.getPushMap();
+ for (auto rit = pushMap.rbegin(); rit != pushMap.rend(); ++rit) {
+ const uint32_t argID = rit->second.argID;
+ const FunctionArgument arg = fn.getArg(argID);
+
+ const uint32_t subOffset = rit->second.offset;
+ const Register reg = rit->second.getRegister();
+ auto it = this->ctx.curbeRegs.find(arg.reg);
+ assert(it != ctx.curbeRegs.end());
+ allocatePayloadReg(reg, it->second, subOffset);
+ ctx.splitBlock(it->second, subOffset);
+ }
+ }
+
+ bool GenRegAllocator::Opaque::createGenReg(const GenRegInterval &interval) {
+ using namespace ir;
+ const ir::Register reg = interval.reg;
+ if (RA.contains(reg) == true)
+ return true; // already allocated
+ uint32_t regSize;
+ ir::RegisterFamily family;
+ getRegAttrib(reg, regSize, &family);
+ uint32_t grfOffset = allocateReg(interval, regSize, regSize);
+ if (grfOffset == 0) {
+ return false;
+ }
+ insertNewReg(reg, grfOffset);
+ return true;
+ }
+
+ bool GenRegAllocator::Opaque::isAllocated(const SelectionVector *vector) const {
+ const ir::Register first = vector->reg[0].reg();
+ const auto it = vectorMap.find(first);
+
+ // If the first register is not allocated we are done
+ if (it == vectorMap.end())
+ return false;
+
+ // If there are more left registers than in the found vector, there are
+ // still registers to allocate
+ const SelectionVector *other = it->second.first;
+ const uint32_t otherFirst = it->second.second;
+ const uint32_t leftNum = other->regNum - otherFirst;
+ if (leftNum < vector->regNum)
+ return false;
+
+ // Now check that all the registers in the already allocated vector match
+ // the current vector
+ for (uint32_t regID = 1; regID < vector->regNum; ++regID) {
+ const ir::Register from = vector->reg[regID].reg();
+ const ir::Register to = other->reg[regID + otherFirst].reg();
+ if (from != to)
+ return false;
+ }
+ return true;
+ }
+
+ void GenRegAllocator::Opaque::coalesce(Selection &selection, SelectionVector *vector) {
+ for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
+ const ir::Register reg = vector->reg[regID].reg();
+ const auto it = this->vectorMap.find(reg);
+ // case 1: the register is not already in a vector, so it can stay in this
+ // vector. Note that local IDs are *non-scalar* special registers but will
+ // require a MOV anyway since pre-allocated in the CURBE
+ // If an element has very long interval, we don't want to put it into a
+ // vector as it will add more pressure to the register allocation.
+ if (it == vectorMap.end() &&
+ ctx.sel->isScalarReg(reg) == false &&
+ ctx.isSpecialReg(reg) == false &&
+ (intervals[reg].maxID - intervals[reg].minID) < 2048)
+ {
+ const VectorLocation location = std::make_pair(vector, regID);
+ this->vectorMap.insert(std::make_pair(reg, location));
+ }
+ // case 2: the register is already in another vector, so we need to move
+ // it to a temporary register.
+ // TODO: we can do better than that if we analyze the liveness of the
+ // already allocated registers in the vector. If there is no inteference
+ // and the order is maintained, we can reuse the previous vector and avoid
+ // the MOVs
+ else {
+ ir::Register tmp;
+ ir::Type type = getIRType(vector->reg[regID].type);
+ tmp = this->replaceReg(selection, vector->insn, regID, vector->isSrc, type);
+ const VectorLocation location = std::make_pair(vector, regID);
+ this->vectorMap.insert(std::make_pair(tmp, location));
+ }
+ }
+ }
+
+ /*! Will sort vector in decreasing order */
+ inline bool cmp(const SelectionVector *v0, const SelectionVector *v1) {
+ return v0->regNum > v1->regNum;
+ }
+
+ void GenRegAllocator::Opaque::allocateVector(Selection &selection) {
+ const uint32_t vectorNum = selection.getVectorNum();
+ this->vectors.resize(vectorNum);
+
+ // First we find and store all vectors
+ uint32_t vectorID = 0;
+ for (auto &block : *selection.blockList)
+ for (auto &v : block.vectorList)
+ this->vectors[vectorID++] = &v;
+ GBE_ASSERT(vectorID == vectorNum);
+
+ // Heuristic (really simple...): sort them by the number of registers they
+ // contain
+ std::sort(this->vectors.begin(), this->vectors.end(), cmp);
+
+ // Insert MOVs when this is required
+ for (vectorID = 0; vectorID < vectorNum; ++vectorID) {
+ SelectionVector *vector = this->vectors[vectorID];
+ if (this->isAllocated(vector))
+ continue;
+ this->coalesce(selection, vector);
+ }
+ }
+
+ template <bool sortStartingPoint>
+ inline bool cmp(const GenRegInterval *i0, const GenRegInterval *i1) {
+ return sortStartingPoint ? i0->minID < i1->minID : i0->maxID < i1->maxID;
+ }
+
+ bool GenRegAllocator::Opaque::expireGRF(const GenRegInterval &limit) {
+ bool ret = false;
+ while (this->expiringID != ending.size()) {
+ const GenRegInterval *toExpire = this->ending[this->expiringID];
+ const ir::Register reg = toExpire->reg;
+
+ // Dead code produced by the insn selection -> we skip it
+ if (toExpire->minID > toExpire->maxID) {
+ this->expiringID++;
+ continue;
+ }
+
+ //ignore register that already spilled
+ if(spilledRegs.find(reg) != spilledRegs.end()) {
+ this->expiringID++;
+ continue;
+ }
+
+ if (toExpire->maxID >= limit.minID)
+ break;
+
+ if (expireReg(reg))
+ ret = true;
+ this->expiringID++;
+ }
+
+ // We were not able to expire anything
+ return ret;
+ }
+
+
+ #define IS_IMPLICITLY_MOD_FLAG(insn) (insn.state.modFlag == 1 && \
+ (insn.opcode == SEL_OP_MOV || \
+ insn.opcode == SEL_OP_AND || \
+ insn.opcode == SEL_OP_OR || \
+ insn.opcode == SEL_OP_XOR))
+
+ #define IS_SCALAR_FLAG(insn) selection.isScalarReg(ir::Register(insn.state.flagIndex))
+ #define GET_FLAG_REG(insn) GenRegister::uwxgrf(IS_SCALAR_FLAG(insn) ? 1 : 8,\
+ ir::Register(insn.state.flagIndex));
+ #define IS_TEMP_FLAG(insn) (insn.state.flag == 0 && insn.state.subFlag == 1)
+ // Flag is a virtual flag, this function is to validate the virtual flag
+ // to a physical flag. It is used to validate both temporary flag and the
+ // non-temporary flag registers.
+ // We track the last temporary validate register, if it's the same as
+ // current, we can avoid the revalidation.
+ void GenRegAllocator::Opaque::validateFlag(Selection &selection,
+ SelectionInstruction &insn) {
+ GBE_ASSERT(insn.state.physicalFlag == 1);
+ if (!IS_TEMP_FLAG(insn) && validatedFlags.find(insn.state.flagIndex) != validatedFlags.end())
+ return;
+ else if (IS_TEMP_FLAG(insn) && validTempFlagReg == insn.state.flagIndex)
+ return;
+ SelectionInstruction *cmp0 = selection.create(SEL_OP_CMP, 1, 2);
+ cmp0->state = GenInstructionState(ctx.getSimdWidth());
+ cmp0->state.flag = insn.state.flag;
+ cmp0->state.subFlag = insn.state.subFlag;
+ if (IS_SCALAR_FLAG(insn))
+ cmp0->state.noMask = 1;
+ cmp0->src(0) = GET_FLAG_REG(insn);
+ cmp0->src(1) = GenRegister::immuw(0);
+ cmp0->dst(0) = GenRegister::retype(GenRegister::null(), GEN_TYPE_UW);
+ cmp0->extra.function = GEN_CONDITIONAL_NEQ;
+ insn.prepend(*cmp0);
+ if (!IS_TEMP_FLAG(insn))
+ validatedFlags.insert(insn.state.flagIndex);
+ else {
+ if (insn.state.modFlag == 0)
+ validTempFlagReg = insn.state.flagIndex;
+ else
+ validTempFlagReg = 0;
+ }
+ }
+
+
+ void GenRegAllocator::Opaque::allocateFlags(Selection &selection) {
+ // Previously, we have a global flag allocation implemntation.
+ // After some analysis, I found the global flag allocation is not
+ // the best solution here.
+ // As for the cross block reference of bool value, we have to
+ // combine it with current emask. There is no obvious advantage to
+ // allocate deadicate physical flag register for those cross block usage.
+ // We just need to allocate physical flag within each BB. We need to handle
+ // the following cases:
+ //
+ // 1. The bool's liveness never beyond this BB. And the bool is only used as
+ // a dst register or a pred register. This bool value could be
+ // allocated in physical flag only if there is enough physical flag.
+ // We already identified those bool at the instruction select stage, and
+ // put them in the flagBooleans set.
+ // 2. The bool is defined in another BB and used in this BB, then we need
+ // to prepend an instruction at the position where we use it.
+ // 3. The bool is defined in this BB but is also used as some instruction's
+ // source registers rather than the pred register. We have to keep the normal
+ // grf (UW8/UW16) register for this bool. For some CMP instruction, we need to
+ // append a SEL instruction convert the flag to the grf register.
+ // 4. Even for the spilling flag, if there is only one spilling flag, we will also
+ // try to reuse the temporary flag register latter. This requires all the
+ // instructions should got it flag at the instruction selection stage. And should
+ // not use the flag physical number directly at the gen_context stage. Otherwise,
+ // may break the algorithm here.
+ // We will track all the validated bool value and to avoid any redundant
+ // validation for the same flag. But if there is no enough physical flag,
+ // we have to spill the previous allocated physical flag. And the spilling
+ // policy is to spill the allocate flag which live to the last time end point.
+
+ // we have three flags we use for booleans f0.0 , f1.0 and f1.1
+ for (auto &block : *selection.blockList) {
+ // Store the registers allocated in the map
+ map<ir::Register, uint32_t> allocatedFlags;
+ map<const GenRegInterval*, uint32_t> allocatedFlagIntervals;
+
+ const uint32_t flagNum = flag0ReservedBlocks.contains(&block) ? 2 : 3;
+ uint32_t freeFlags[] = {2, 3, 0};
+ uint32_t freeNum = flagNum;
+ if (boolIntervalsMap.find(&block) == boolIntervalsMap.end())
+ continue;
+ const auto boolsMap = boolIntervalsMap[&block];
+ vector<const GenRegInterval*> flagStarting;
+ vector<const GenRegInterval*> flagEnding;
+ GBE_ASSERT(boolsMap->size() > 0);
+ uint32_t regNum = boolsMap->size();
+ flagStarting.resize(regNum);
+ flagEnding.resize(regNum);
+ uint32_t id = 0;
+ for (auto &interval : *boolsMap) {
+ flagStarting[id] = flagEnding[id] = &interval.second;
+ id++;
+ }
+ std::sort(flagStarting.begin(), flagStarting.end(), cmp<true>);
+ std::sort(flagEnding.begin(), flagEnding.end(), cmp<false>);
+
+ uint32_t endID = 0; // interval to expire
+ for (uint32_t startID = 0; startID < regNum; ++startID) {
+ const GenRegInterval *interval = flagStarting[startID];
+ const ir::Register reg = interval->reg;
+ GBE_ASSERT(ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL);
+ if (freeNum != 0) {
+ allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
+ allocatedFlagIntervals.insert(std::make_pair(interval, freeFlags[freeNum]));
+ } else {
+ // Try to expire one register
+ while (endID != flagEnding.size()) {
+ const GenRegInterval *toExpire = flagEnding[endID];
+ // Dead code produced by the insn selection -> we skip it
+ if (toExpire->minID > toExpire->maxID) {
+ endID++;
+ continue;
+ }
+ // We cannot expire this interval and the next ones
+ if (toExpire->maxID >= interval->minID)
+ break;
+ // We reuse a flag from a previous interval (the oldest one)
+ auto it = allocatedFlags.find(toExpire->reg);
+ if (it == allocatedFlags.end()) {
+ endID++;
+ continue;
+ }
+ freeFlags[freeNum++] = it->second;
+ endID++;
+ break;
+ }
+ if (freeNum != 0) {
+ allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
+ allocatedFlagIntervals.insert(std::make_pair(interval, freeFlags[freeNum]));
+ }
+ else {
+ // FIXME we may sort the allocated flags before do the spilling in the furture.
+ int32_t spill = -1;
+ const GenRegInterval *spillInterval = NULL;
+ int32_t maxID = 0;
+ for (auto &it : allocatedFlagIntervals) {
+ if (it.first->maxID <= interval->minID)
+ continue;
+ if (it.first->maxID > maxID && it.second != 0) {
+ maxID = it.first->maxID;
+ spill = it.second;
+ spillInterval = it.first;
+ }
+ }
+ if (spill != -1) {
+ allocatedFlags.insert(std::make_pair(reg, spill));
+ allocatedFlagIntervals.insert(std::make_pair(interval, spill));
+ allocatedFlags.erase(spillInterval->reg);
+ allocatedFlagIntervals.erase(spillInterval);
+ // We spill this flag booleans register, so erase it from the flag boolean set.
+ if (flagBooleans.contains(spillInterval->reg))
+ flagBooleans.erase(spillInterval->reg);
+ } else {
+ GBE_ASSERT(0);
+ }
+ }
+ }
+ }
+ delete boolsMap;
+
+ // Now, we traverse all the selection instructions and we patch them to make
+ // them use flag registers
+ validTempFlagReg = 0;
+ validatedFlags.clear();
+ for (auto &insn : block.insnList) {
+ // Patch the predicate now. Note that only compares actually modify it (it
+ // is called a "conditional modifier"). The other instructions just read
+ // it
+ if (insn.state.physicalFlag == 0) {
+ auto it = allocatedFlags.find(ir::Register(insn.state.flagIndex));
+ if (it != allocatedFlags.end()) {
+ insn.state.physicalFlag = 1;
+ insn.state.flag = it->second / 2;
+ insn.state.subFlag = it->second & 1;
+
+ // modFlag is for the LOADI/MOV/AND/OR/XOR instructions which will modify a
+ // flag register. We set the condition for them to save one instruction if possible.
+ if (IS_IMPLICITLY_MOD_FLAG(insn)) {
+ // If this is a modFlag on a scalar bool, we need to remove it
+ // from the allocated flags map. Then latter, the user could
+ // validate the flag from the scalar value correctly.
+ if (IS_SCALAR_FLAG(insn)) {
+ allocatedFlags.erase(ir::Register(insn.state.flagIndex));
+ continue;
+ }
+ insn.extra.function = GEN_CONDITIONAL_NEQ;
+ }
+ // If this is an external bool, we need to validate it if it is not validated yet.
+ if ((insn.state.externFlag &&
+ insn.state.predicate != GEN_PREDICATE_NONE))
+ validateFlag(selection, insn);
+ } else {
+ insn.state.physicalFlag = 1;
+ insn.state.flag = 0;
+ insn.state.subFlag = 1;
+
+ // If this is for MOV/AND/OR/... we don't need to waste an extra instruction
+ // to generate the flag here, just continue to next instruction. And the validTempFlagReg
+ // will not be destroyed.
+ if (IS_IMPLICITLY_MOD_FLAG(insn))
+ continue;
+ // This bool doesn't have a deadicated flag, we use temporary flag here.
+ // each time we need to validate it from the grf register.
+ if (insn.state.predicate != GEN_PREDICATE_NONE)
+ validateFlag(selection, insn);
+ }
+ // This is a CMP for a pure flag booleans, we don't need to write result to
+ // the grf. And latter, we will not allocate grf for it.
+ if (insn.opcode == SEL_OP_CMP &&
+ (flagBooleans.contains(insn.dst(0).reg()) ||
+ GenRegister::isNull(insn.dst(0)))) {
+ // set a temporary register to avoid switch in this block.
+ bool isSrc = false;
+ bool needMov = false;
+ this->replaceReg(selection, &insn, 0, isSrc, ir::TYPE_FLOAT, needMov);
+ }
+ // If the instruction requires to generate (CMP for long/int/float..)
+ // the flag value to the register, and it's not a pure flag boolean,
+ // we need to use SEL instruction to generate the flag value to the UW8
+ // register.
+ if (insn.state.flagGen == 1 &&
+ !flagBooleans.contains((ir::Register)(insn.state.flagIndex))) {
+ SelectionInstruction *sel0 = selection.create(SEL_OP_SEL, 1, 2);
+ uint32_t simdWidth;
+ simdWidth = IS_SCALAR_FLAG(insn) ? 1 : ctx.getSimdWidth();
+
+ sel0->state = GenInstructionState(simdWidth);
+ if (IS_SCALAR_FLAG(insn))
+ sel0->state.noMask = 1;
+ sel0->state.flag = insn.state.flag;
+ sel0->state.subFlag = insn.state.subFlag;
+ sel0->state.predicate = GEN_PREDICATE_NORMAL;
+ sel0->src(0) = GenRegister::uw1grf(ir::ocl::one);
+ sel0->src(1) = GenRegister::uw1grf(ir::ocl::zero);
+ sel0->dst(0) = GET_FLAG_REG(insn);
+ insn.append(*sel0);
+ // We use the zero one after the liveness analysis, we have to update
+ // the liveness data manually here.
+ GenRegInterval &interval0 = intervals[ir::ocl::zero];
+ GenRegInterval &interval1 = intervals[ir::ocl::one];
+ interval0.minID = std::min(interval0.minID, (int32_t)insn.ID);
+ interval0.maxID = std::max(interval0.maxID, (int32_t)insn.ID);
+ interval1.minID = std::min(interval1.minID, (int32_t)insn.ID);
+ interval1.maxID = std::max(interval1.maxID, (int32_t)insn.ID);
+ }
+ } else {
+ // If the instruction use the temporary flag register manually,
+ // we should invalidate the temp flag reg here.
+ if (insn.state.flag == 0 && insn.state.subFlag == 1)
+ validTempFlagReg = 0;
+ }
+ }
+ }
+ }
+
+ IVAR(OCL_SIMD16_SPILL_THRESHOLD, 0, 16, 256);
+ bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
+ // Perform the linear scan allocator
+ ctx.errCode = REGISTER_ALLOCATION_FAIL;
+ const uint32_t regNum = ctx.sel->getRegNum();
+ for (uint32_t startID = 0; startID < regNum; ++startID) {
+ const GenRegInterval &interval = *this->starting[startID];
+ const ir::Register reg = interval.reg;
+ if (interval.maxID == -INT_MAX)
+ continue; // Unused register
+ if (RA.contains(reg))
+ continue; // already allocated
+
+ if (flagBooleans.contains(reg))
+ continue;
+
+ // Case 1: the register belongs to a vector, allocate all the registers in
+ // one piece
+ auto it = vectorMap.find(reg);
+ if (it != vectorMap.end()) {
+ const SelectionVector *vector = it->second.first;
+ // all the reg in the SelectionVector are spilled
+ if(spilledRegs.find(vector->reg[0].reg())
+ != spilledRegs.end())
+ continue;
+
+ uint32_t alignment;
+ uint32_t size = 0;
+ for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
+ getRegAttrib(vector->reg[regID].reg(), alignment, NULL);
+ size += alignment;
+ }
+ // FIXME this is workaround for scheduling limitation, which requires 2*GEN_REG_SIZE under SIMD16.
+ const uint32_t maxAlignment = ctx.getSimdWidth()/8*GEN_REG_SIZE;
+ const uint32_t grfOffset = allocateReg(interval, size, maxAlignment);
+ if(grfOffset == 0) {
+ for(int i = vector->regNum-1; i >= 0; i--) {
+ if (!spillReg(vector->reg[i].reg()))
+ return false;
+ }
+ continue;
+ }
+ uint32_t subOffset = 0;
+ for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
+ const ir::Register reg = vector->reg[regID].reg();
+ GBE_ASSERT(RA.contains(reg) == false);
+ getRegAttrib(reg, alignment, NULL);
+ // check all sub registers aligned correctly
+ GBE_ASSERT((grfOffset + subOffset) % alignment == 0 || (grfOffset + subOffset) % GEN_REG_SIZE == 0);
+ insertNewReg(reg, grfOffset + subOffset, true);
+ ctx.splitBlock(grfOffset, subOffset); //splitBlock will not split if regID == 0
+ subOffset += alignment;
+ }
+ }
+ // Case 2: This is a regular scalar register, allocate it alone
+ else if (this->createGenReg(interval) == false) {
+ if (!spillReg(interval))
+ return false;
+ }
+ }
+ if (!spilledRegs.empty()) {
+ GBE_ASSERT(reservedReg != 0);
+ if (ctx.getSimdWidth() == 16) {
+ if (spilledRegs.size() > (unsigned int)OCL_SIMD16_SPILL_THRESHOLD) {
+ ctx.errCode = REGISTER_SPILL_EXCEED_THRESHOLD;
+ return false;
+ }
+ }
+ allocateScratchForSpilled();
+ bool success = selection.spillRegs(spilledRegs, reservedReg);
+ if (!success) {
+ ctx.errCode = REGISTER_SPILL_FAIL;
+ return false;
+ }
+ }
+ ctx.errCode = NO_ERROR;
+ return true;
+ }
+
+ INLINE void GenRegAllocator::Opaque::allocateScratchForSpilled()
+ {
+ const uint32_t regNum = spilledRegs.size();
+ this->starting.resize(regNum);
+ this->ending.resize(regNum);
+ uint32_t regID = 0;
+ for(auto it = spilledRegs.begin(); it != spilledRegs.end(); ++it) {
+ this->starting[regID] = this->ending[regID] = &intervals[it->first];
+ regID++;
+ }
+ std::sort(this->starting.begin(), this->starting.end(), cmp<true>);
+ std::sort(this->ending.begin(), this->ending.end(), cmp<false>);
+ int toExpire = 0;
+ for(uint32_t i = 0; i < regNum; i++) {
+ const GenRegInterval * cur = starting[i];
+ const GenRegInterval * exp = ending[toExpire];
+ if (exp->maxID < cur->minID) {
+ auto it = spilledRegs.find(exp->reg);
+ GBE_ASSERT(it != spilledRegs.end());
+ if(it->second.addr != -1) {
+ ctx.deallocateScratchMem(it->second.addr);
+ }
+ toExpire++;
+ }
+ auto it = spilledRegs.find(cur->reg);
+ GBE_ASSERT(it != spilledRegs.end());
+ if(cur->minID == cur->maxID) {
+ it->second.addr = -1;
+ continue;
+ }
+
+ ir::RegisterFamily family = ctx.sel->getRegisterFamily(cur->reg);
+ it->second.addr = ctx.allocateScratchMem(getFamilySize(family)
+ * ctx.getSimdWidth());
+ }
+ }
+
+ INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg)
+ {
+ auto it = RA.find(reg);
+ if (flagBooleans.contains(reg))
+ return false;
+ GBE_ASSERT(it != RA.end());
+ // offset less than 32 means it is not managed by our reg allocator.
+ if (it->second < 32)
+ return false;
+
+ ctx.deallocate(it->second);
+ if (reservedReg != 0
+ && (spillCandidate.find(intervals[reg]) != spillCandidate.end())) {
+ spillCandidate.erase(intervals[reg]);
+ /* offset --> reg map should keep updated. */
+ offsetReg.erase(it->second);
+ }
+
+ return true;
+ }
+
+ // insert a new register with allocated offset,
+ // put it to the RA map and the spill map if it could be spilled.
+ INLINE void GenRegAllocator::Opaque::insertNewReg(ir::Register reg, uint32_t grfOffset, bool isVector)
+ {
+ RA.insert(std::make_pair(reg, grfOffset));
+
+ if (reservedReg != 0) {
+
+ uint32_t regSize;
+ ir::RegisterFamily family;
+ getRegAttrib(reg, regSize, &family);
+ // At simd16 mode, we may introduce some simd8 registers in te instruction selection stage.
+ // To spill those simd8 temporary registers will introduce unecessary complexity. We just simply
+ // avoid to spill those temporary registers here.
+ if (ctx.getSimdWidth() == 16 && reg.value() >= ctx.getFunction().getRegisterFile().regNum())
+ return;
+
+ if ((regSize == ctx.getSimdWidth()/8 * GEN_REG_SIZE && family == ir::FAMILY_DWORD)
+ || (regSize == 2 * ctx.getSimdWidth()/8 * GEN_REG_SIZE && family == ir::FAMILY_QWORD)) {
+ GBE_ASSERT(offsetReg.find(grfOffset) == offsetReg.end());
+ offsetReg.insert(std::make_pair(grfOffset, reg));
+ spillCandidate.insert(intervals[reg]);
+ }
+ }
+ }
+
+ INLINE bool GenRegAllocator::Opaque::spillReg(ir::Register reg,
+ bool isAllocated) {
+ return spillReg(intervals[reg], isAllocated);
+ }
+
+ INLINE bool GenRegAllocator::Opaque::spillReg(GenRegInterval interval,
+ bool isAllocated) {
+ if (reservedReg == 0)
+ return false;
+
+ if (interval.reg.value() >= ctx.getFunction().getRegisterFile().regNum() &&
+ ctx.getSimdWidth() == 16)
+ return false;
+
+ ir::RegisterFamily family = ctx.sel->getRegisterFamily(interval.reg);
+ // we currently only support DWORD/QWORD spill
+ if(family != ir::FAMILY_DWORD && family != ir::FAMILY_QWORD)
+ return false;
+
+ SpillRegTag spillTag;
+ spillTag.isTmpReg = interval.maxID == interval.minID;
+ spillTag.addr = -1;
+
+ if (isAllocated) {
+ // If this register is allocated, we need to expire it and erase it
+ // from the RA map.
+ bool success = expireReg(interval.reg);
+ GBE_ASSERT(success);
+ success = success;
+ RA.erase(interval.reg);
+ }
+ spilledRegs.insert(std::make_pair(interval.reg, spillTag));
+ return true;
+ }
+
+ // Check whethere a vector which is allocated can be spilled out
+ // If a partial of a vector is expired, the vector will be unspillable, currently.
+ // FIXME we may need to fix those unspillable vector in the furture.
+ INLINE bool GenRegAllocator::Opaque::vectorCanSpill(SelectionVector *vector) {
+ for(uint32_t id = 0; id < vector->regNum; id++)
+ if (spillCandidate.find(intervals[(ir::Register)(vector->reg[id].value.reg)])
+ == spillCandidate.end())
+ return false;
+ return true;
+ }
+
+ INLINE bool GenRegAllocator::Opaque::spillAtInterval(GenRegInterval interval,
+ int size,
+ uint32_t alignment) {
+ if (reservedReg == 0)
+ return false;
+ auto it = spillCandidate.begin();
+ // If there is no spill candidate or current register is spillable and current register's
+ // endpoint is after all the spillCandidate register's endpoint we return false. The
+ // caller will spill current register.
+ // At simd16 mode, we will always try to spill here rather than return to the caller.
+ // The reason is that the caller may have a vector to allocate, and some element may be
+ // temporary registers which could not be spilled.
+ if (it == spillCandidate.end()
+ || (ctx.getSimdWidth() == 8 && (it->getMaxID() <= interval.maxID
+ && alignment == ctx.getSimdWidth()/8 * GEN_REG_SIZE)))
+ return false;
+
+ ir::Register reg = it->getReg();
+ set<ir::Register> spillSet;
+ int32_t savedSize = size;
+ while(size > 0) {
+ auto vectorIt = vectorMap.find(reg);
+ bool isVector = vectorIt != vectorMap.end();
+ bool needRestart = false;
+ ir::RegisterFamily family = ctx.sel->getRegisterFamily(reg);
+ if (isVector
+ && (vectorCanSpill(vectorIt->second.first))) {
+ const SelectionVector *vector = vectorIt->second.first;
+ for (uint32_t id = 0; id < vector->regNum; id++) {
+ GBE_ASSERT(spilledRegs.find(vector->reg[id].reg())
+ == spilledRegs.end());
+ spillSet.insert(vector->reg[id].reg());
+ reg = vector->reg[id].reg();
+ family = ctx.sel->getRegisterFamily(reg);
+ size -= family == ir::FAMILY_QWORD ? 2 * GEN_REG_SIZE * ctx.getSimdWidth()/8
+ : GEN_REG_SIZE * ctx.getSimdWidth()/8;
+ }
+ } else if (!isVector) {
+ spillSet.insert(reg);
+ size -= family == ir::FAMILY_QWORD ? 2 * GEN_REG_SIZE * ctx.getSimdWidth()/8
+ : GEN_REG_SIZE * ctx.getSimdWidth()/8;
+ } else
+ needRestart = true; // is a vector which could not be spilled.
+
+ if (size <= 0)
+ break;
+ if (!needRestart) {
+ uint32_t offset = RA.find(reg)->second;
+ uint32_t nextOffset = (family == ir::FAMILY_QWORD) ? (offset + 2 * GEN_REG_SIZE * ctx.getSimdWidth() / 8)
+ : (offset + GEN_REG_SIZE * ctx.getSimdWidth() / 8);
+ auto nextRegIt = offsetReg.find(nextOffset);
+ if (nextRegIt != offsetReg.end())
+ reg = nextRegIt->second;
+ else
+ needRestart = true;
+ }
+
+ if (needRestart) {
+#if 0
+ // FIXME, we should enable this code block in the future.
+ // If the spill set is not zero and we need a restart, we can
+ // simply return to try to allocate the registers at first.
+ // As some vectors which have expired elements may be marked as
+ // unspillable vector.
+ if (spillSet.size() > 0)
+ break;
+#endif
+ it++;
+ // next register is not in spill candidate.
+ // let's move to next candidate and start over.
+ if (it == spillCandidate.end())
+ return false;
+ reg = it->getReg();
+ size = savedSize;
+ spillSet.clear();
+ }
+ }
+
+ for(auto spillreg : spillSet)
+ spillReg(spillreg, true);
+ return true;
+ }
+
+ INLINE uint32_t GenRegAllocator::Opaque::allocateReg(GenRegInterval interval,
+ uint32_t size,
+ uint32_t alignment) {
+ uint32_t grfOffset;
+ static uint32_t tick = 0;
+ // Doing expireGRF too freqently will cause the post register allocation
+ // scheduling very hard. As it will cause a very high register conflict rate.
+ // The tradeoff here is to reduce the freqency here. And if we are under spilling
+ // then no need to reduce that freqency as the register pressure is the most
+ // important factor.
+ if (tick % 12 == 0 || ctx.reservedSpillRegs != 0)
+ this->expireGRF(interval);
+ tick++;
+ // For some scalar byte register, it may be used as a destination register
+ // and the source is a scalar Dword. If that is the case, the byte register
+ // must get 4byte alignment register offset.
+ alignment = (alignment + 3) & ~3;
+ while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
+ const bool success = this->expireGRF(interval);
+ if (success == false) {
+ if (spillAtInterval(interval, size, alignment) == false)
+ return 0;
+ }
+ }
+ return grfOffset;
+ }
+
+ INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) {
+ using namespace ir;
+ if (ctx.reservedSpillRegs != 0) {
+ reservedReg = ctx.allocate(ctx.reservedSpillRegs * GEN_REG_SIZE, GEN_REG_SIZE);
+ reservedReg /= GEN_REG_SIZE;
+ } else {
+ reservedReg = 0;
+ }
+ // schedulePreRegAllocation(ctx, selection);
+
+ // Now start the linear scan allocation
+ for (uint32_t regID = 0; regID < ctx.sel->getRegNum(); ++regID)
+ this->intervals.push_back(ir::Register(regID));
+
+ // Allocate the special registers (only those which are actually used)
+ this->allocatePayloadRegs();
+
+ // Group and barrier IDs are always allocated by the hardware in r0
+ RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1
+ RA.insert(std::make_pair(ocl::groupid1, 6*sizeof(float))); // r0.6
+ RA.insert(std::make_pair(ocl::groupid2, 7*sizeof(float))); // r0.7
+ RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
+
+ // block IP used to handle the mask in SW is always allocated
+
+ // Compute the intervals
+ int32_t insnID = 0;
+ for (auto &block : *selection.blockList) {
+ int32_t lastID = insnID;
+ int32_t firstID = insnID;
+ // Update the intervals of each used register. Note that we do not
+ // register allocate R0, so we skip all sub-registers in r0
+ RegIntervalMap *boolsMap = new RegIntervalMap;
+ if (block.isLargeBlock)
+ flag0ReservedBlocks.insert(&block);
+ for (auto &insn : block.insnList) {
+ const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
+ insn.ID = insnID;
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const GenRegister &selReg = insn.src(srcID);
+ const ir::Register reg = selReg.reg();
+ if (selReg.file != GEN_GENERAL_REGISTER_FILE ||
+ reg == ir::ocl::barrierid ||
+ reg == ir::ocl::groupid0 ||
+ reg == ir::ocl::groupid1 ||
+ reg == ir::ocl::groupid2)
+ continue;
+ this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
+ this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
+ }
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const GenRegister &selReg = insn.dst(dstID);
+ const ir::Register reg = selReg.reg();
+ if (selReg.file != GEN_GENERAL_REGISTER_FILE ||
+ reg == ir::ocl::barrierid ||
+ reg == ir::ocl::groupid0 ||
+ reg == ir::ocl::groupid1 ||
+ reg == ir::ocl::groupid2)
+ continue;
+ this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
+ this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
+ }
+
+ // OK, a flag is used as a predicate or a conditional modifier
+ if (insn.state.physicalFlag == 0) {
+ const ir::Register reg = ir::Register(insn.state.flagIndex);
+ this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
+ this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
+ // Check whether this is a pure flag booleans candidate.
+ if (insn.state.grfFlag == 0)
+ flagBooleans.insert(reg);
+ GBE_ASSERT(ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL);
+ // update the bool register's per-BB's interval data
+ if (boolsMap->find(reg) == boolsMap->end()) {
+ GenRegInterval boolInterval(reg);
+ boolsMap->insert(std::make_pair(reg, boolInterval));
+ }
+ boolsMap->find(reg)->second.minID = std::min(boolsMap->find(reg)->second.minID, insnID);
+ boolsMap->find(reg)->second.maxID = std::max(boolsMap->find(reg)->second.maxID, insnID);
+ if (&insn == block.insnList.back() &&
+ insn.opcode == SEL_OP_JMPI &&
+ insn.state.predicate != GEN_PREDICATE_NONE) {
+ // If this is the last instruction and is a predicated JMPI.
+ // We must extent its liveness before any other instrution.
+ // As we need to allocate f0 to it, and need to keep the f0
+ // unchanged during the block. The root cause is this instruction
+ // is out-of the if/endif region, so we have to borrow the f0
+ // to get correct bits for all channels.
+ boolsMap->find(reg)->second.minID = 0;
+ if (flag0ReservedBlocks.contains(&block))
+ flag0ReservedBlocks.erase(&block);
+ }
+ } else {
+ // Make sure that instruction selection stage didn't use physiacl flags incorrectly.
+ GBE_ASSERT ((insn.opcode == SEL_OP_LABEL ||
+ insn.opcode == SEL_OP_IF ||
+ insn.opcode == SEL_OP_JMPI ||
+ insn.state.predicate == GEN_PREDICATE_NONE ||
+ (block.hasBarrier && insn.opcode == SEL_OP_MOV) ||
+ (insn.state.flag == 0 && insn.state.subFlag == 1)));
+ }
+ lastID = insnID;
+ insnID++;
+ }
+
+ // All registers alive at the begining of the block must update their intervals.
+ const ir::BasicBlock *bb = block.bb;
+ for (auto reg : ctx.getLiveIn(bb))
+ this->intervals[reg].minID = std::min(this->intervals[reg].minID, firstID);
+
+ // All registers alive at the end of the block must have their intervals
+ // updated as well
+ for (auto reg : ctx.getLiveOut(bb))
+ this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, lastID);
+
+ if (boolsMap->size() > 0)
+ boolIntervalsMap.insert(std::make_pair(&block, boolsMap));
+ else
+ delete boolsMap;
+ }
+
+ this->intervals[ocl::retVal].minID = INT_MAX;
+ this->intervals[ocl::retVal].maxID = -INT_MAX;
+
+ // Allocate all the vectors first since they need to be contiguous
+ this->allocateVector(selection);
+
+ // First we try to put all booleans registers into flags
+ this->allocateFlags(selection);
+
+ // Sort both intervals in starting point and ending point increasing orders
+ const uint32_t regNum = ctx.sel->getRegNum();
+ this->starting.resize(regNum);
+ this->ending.resize(regNum);
+ for (uint32_t regID = 0; regID < regNum; ++regID)
+ this->starting[regID] = this->ending[regID] = &intervals[regID];
+ std::sort(this->starting.begin(), this->starting.end(), cmp<true>);
+ std::sort(this->ending.begin(), this->ending.end(), cmp<false>);
+
+ // Remove the registers that were not allocated
+ this->expiringID = 0;
+ while (this->expiringID < regNum) {
+ const GenRegInterval *interval = ending[this->expiringID];
+ if (interval->maxID == -INT_MAX)
+ this->expiringID++;
+ else
+ break;
+ }
+
+ // Allocate all the GRFs now (regular register and boolean that are not in
+ // flag registers)
+ return this->allocateGRFs(selection);
+ }
+
+ INLINE void GenRegAllocator::Opaque::outputAllocation(void) {
+ using namespace std;
+ cout << "## register allocation ##" << endl;
+ for(auto &i : RA) {
+ ir::Register vReg = (ir::Register)i.first;
+ ir::RegisterFamily family;
+ uint32_t regSize;
+ getRegAttrib(vReg, regSize, &family);
+ int offst = (int)i.second;// / sizeof(float);
+ int reg = offst / 32;
+ int subreg = (offst % 32) / regSize;
+ cout << "%" << setiosflags(ios::left) << setw(8) << vReg
+ << "g" << setiosflags(ios::left) << setw(3) << reg << "."
+ << setiosflags(ios::left) << setw(3) << subreg << ir::getFamilyName(family)
+ << " " << setw(-3) << regSize << "B\t"
+ << "[ " << setw(8) << this->intervals[(uint)vReg].minID
+ << " -> " << setw(8) << this->intervals[(uint)vReg].maxID
+ << "]" << endl;
+ }
+ if (!spilledRegs.empty())
+ cout << "## spilled registers: " << spilledRegs.size() << endl;
+ for(auto it = spilledRegs.begin(); it != spilledRegs.end(); it++) {
+ ir::Register vReg = it->first;
+ ir::RegisterFamily family;
+ uint32_t regSize;
+ getRegAttrib(vReg, regSize, &family);
+ cout << "%" << setiosflags(ios::left) << setw(8) << vReg
+ << "@" << setw(8) << it->second.addr
+ << " " << ir::getFamilyName(family)
+ << " " << setw(-3) << regSize << "B\t"
+ << "[ " << setw(8) << this->intervals[(uint)vReg].minID
+ << " -> " << setw(8) << this->intervals[(uint)vReg].maxID
+ << "]" << endl;
+ }
+ cout << endl;
+ }
+
+ INLINE GenRegister setGenReg(const GenRegister &src, uint32_t grfOffset) {
+ GenRegister dst;
+ dst = src;
+ dst.physical = 1;
+ dst.nr = grfOffset / GEN_REG_SIZE;
+ dst.subnr = grfOffset % GEN_REG_SIZE;
+ return dst;
+ }
+
+ INLINE GenRegister GenRegAllocator::Opaque::genReg(const GenRegister ®) {
+ if (reg.file == GEN_GENERAL_REGISTER_FILE) {
+ if(reg.physical == 1) {
+ return reg;
+ }
+ GBE_ASSERT(RA.contains(reg.reg()) != false);
+ const uint32_t grfOffset = RA.find(reg.reg())->second;
+ const uint32_t suboffset = reg.subphysical ? reg.subnr : 0;
+ const GenRegister dst = setGenReg(reg, grfOffset + suboffset);
+ if (reg.quarter != 0)
+ return GenRegister::Qn(dst, reg.quarter);
+ else
+ return dst;
+ }
+ else
+ return reg;
+ }
+
+ /////////////////////////////////////////////////////////////////////////////
+ // Register allocator public implementation
+ /////////////////////////////////////////////////////////////////////////////
+
+ GenRegAllocator::GenRegAllocator(GenContext &ctx) {
+ this->opaque = GBE_NEW(GenRegAllocator::Opaque, ctx);
+ }
+
+ GenRegAllocator::~GenRegAllocator(void) {
+ GBE_DELETE(this->opaque);
+ }
+
+ bool GenRegAllocator::allocate(Selection &selection) {
+ return this->opaque->allocate(selection);
+ }
+
+ GenRegister GenRegAllocator::genReg(const GenRegister ®) {
+ return this->opaque->genReg(reg);
+ }
+
+ void GenRegAllocator::outputAllocation(void) {
+ this->opaque->outputAllocation();
+ }
+
+ uint32_t GenRegAllocator::getRegSize(ir::Register reg) {
+ uint32_t regSize;
+ this->opaque->getRegAttrib(reg, regSize);
+ return regSize;
+ }
+
+} /* namespace gbe */
+
diff --git a/backend/src/backend/gen_reg_allocation.hpp b/backend/src/backend/gen_reg_allocation.hpp
new file mode 100644
index 0000000..e41f503
--- /dev/null
+++ b/backend/src/backend/gen_reg_allocation.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_reg_allocation.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_GEN_REG_ALLOCATION_HPP__
+#define __GBE_GEN_REG_ALLOCATION_HPP__
+
+#include "ir/register.hpp"
+#include "backend/gen_register.hpp"
+
+namespace gbe
+{
+ class Selection; // Pre-register allocation code generation
+ class GenRegister; // Pre-register allocation Gen register
+ struct GenRegInterval; // Liveness interval for each register
+ class GenContext; // Gen specific context
+
+ typedef struct SpillRegTag {
+ bool isTmpReg;
+ int32_t addr;
+ } SpillRegTag;
+
+ typedef map<ir::Register, SpillRegTag> SpilledRegs;
+
+ /*! Register allocate (i.e. virtual to physical register mapping) */
+ class GenRegAllocator
+ {
+ public:
+ /*! Initialize the register allocator */
+ GenRegAllocator(GenContext &ctx);
+ /*! Release all taken resources */
+ ~GenRegAllocator(void);
+ /*! Perform the register allocation */
+ bool allocate(Selection &selection);
+ /*! Virtual to physical translation */
+ GenRegister genReg(const GenRegister ®);
+ /*! Output the register allocation */
+ void outputAllocation(void);
+ /*! Get register actual size in byte. */
+ uint32_t getRegSize(ir::Register reg);
+ private:
+ /*! Actual implementation of the register allocator (use Pimpl) */
+ class Opaque;
+ /*! Created and destroyed in cpp */
+ Opaque *opaque;
+ /*! Use custom allocator */
+ GBE_CLASS(GenRegAllocator);
+ };
+
+} /* namespace gbe */
+
+#endif /* __GBE_GEN_REG_ALLOCATION_HPP__ */
+
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
new file mode 100644
index 0000000..da58c06
--- /dev/null
+++ b/backend/src/backend/gen_register.hpp
@@ -0,0 +1,1060 @@
+/*
+ * Copyright 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith at tungstengraphics.com>
+ */
+/**
+ * \file gen_register.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GEN_REGISTER_HPP__
+#define __GEN_REGISTER_HPP__
+
+#include "backend/gen_defs.hpp"
+#include "ir/register.hpp"
+#include "sys/platform.hpp"
+
+namespace gbe
+{
+
+ /*! Type size in bytes for each Gen type */
+ INLINE int typeSize(uint32_t type) {
+ switch(type) {
+ case GEN_TYPE_DF:
+ case GEN_TYPE_UL:
+ case GEN_TYPE_L:
+ return 8;
+ case GEN_TYPE_UD:
+ case GEN_TYPE_D:
+ case GEN_TYPE_F:
+ return 4;
+ case GEN_TYPE_UW:
+ case GEN_TYPE_W:
+ return 2;
+ case GEN_TYPE_UB:
+ case GEN_TYPE_B:
+ return 1;
+ default:
+ assert(0);
+ return 0;
+ }
+ }
+
+ /*! Convert a hstride to a number of element */
+ INLINE uint32_t stride(uint32_t stride) {
+ switch (stride) {
+ case 0: return 0;
+ case 1: return 1;
+ case 2: return 2;
+ case 3: return 4;
+ case 4: return 8;
+ case 5: return 16;
+ default: assert(0); return 0;
+ }
+ }
+
+ /*! Encode the instruction state. Note that the flag register can be either
+ * physical (i.e. a real Gen flag) or a virtual boolean register. The flag
+ * register allocation will turn all virtual boolean registers into flag
+ * registers
+ */
+ class GenInstructionState
+ {
+ public:
+ INLINE GenInstructionState(uint32_t simdWidth = 8) {
+ this->execWidth = simdWidth;
+ this->quarterControl = GEN_COMPRESSION_Q1;
+ this->nibControl = 0;
+ this->accWrEnable = 0;
+ this->noMask = 0;
+ this->flag = 0;
+ this->subFlag = 0;
+ this->grfFlag = 1;
+ this->externFlag = 0;
+ this->modFlag = 0;
+ this->flagGen = 0;
+ this->predicate = GEN_PREDICATE_NONE;
+ this->inversePredicate = 0;
+ this->physicalFlag = 1;
+ this->flagIndex = 0;
+ this->saturate = GEN_MATH_SATURATE_NONE;
+ }
+ uint32_t physicalFlag:1; //!< Physical or virtual flag register
+ uint32_t flag:1; //!< Only if physical flag,
+ uint32_t subFlag:1; //!< Only if physical flag
+ uint32_t flagIndex:16; //!< Only if virtual flag (index of the register)
+ uint32_t grfFlag:1; //!< Only if virtual flag, 0 means we do not need to allocate GRF.
+ uint32_t externFlag:1; //!< Only if virtual flag, 1 means this flag is from external BB.
+ uint32_t modFlag:1; //!< Only if virtual flag, 1 means will modify flag.
+ uint32_t flagGen:1; //!< Only if virtual flag, 1 means the gen_context stage may need to
+ //!< generate the flag.
+ uint32_t execWidth:5;
+ uint32_t quarterControl:1;
+ uint32_t nibControl:1;
+ uint32_t accWrEnable:1;
+ uint32_t noMask:1;
+ uint32_t predicate:4;
+ uint32_t inversePredicate:1;
+ uint32_t saturate:1;
+ void chooseNib(int nib) {
+ switch (nib) {
+ case 0:
+ quarterControl = 0;
+ nibControl = 0;
+ break;
+ case 1:
+ quarterControl = 0;
+ nibControl = 1;
+ break;
+ case 2:
+ quarterControl = 1;
+ nibControl = 0;
+ break;
+ case 3:
+ quarterControl = 1;
+ nibControl = 1;
+ break;
+ default:
+ NOT_IMPLEMENTED;
+ }
+ }
+ void useFlag(int nr, int subnr) {
+ flag = nr;
+ subFlag = subnr;
+ physicalFlag = 1;
+ }
+ };
+
+ /*! This is a book-keeping structure used to encode both virtual and physical
+ * registers
+ */
+ class GenRegister
+ {
+ public:
+ /*! Empty constructor */
+ INLINE GenRegister(void) {}
+
+ /*! General constructor */
+ INLINE GenRegister(uint32_t file,
+ ir::Register reg,
+ uint32_t type,
+ uint32_t vstride,
+ uint32_t width,
+ uint32_t hstride)
+ {
+ this->type = type;
+ this->file = file;
+ this->physical = 0;
+ this->subphysical = 0;
+ this->value.reg = reg;
+ this->negation = 0;
+ this->absolute = 0;
+ this->vstride = vstride;
+ this->width = width;
+ this->hstride = hstride;
+ this->quarter = 0;
+ this->nr = this->subnr = 0;
+ this->address_mode = GEN_ADDRESS_DIRECT;
+ }
+
+ /*! For specific physical registers only */
+ INLINE GenRegister(uint32_t file,
+ uint32_t nr,
+ uint32_t subnr,
+ uint32_t type,
+ uint32_t vstride,
+ uint32_t width,
+ uint32_t hstride)
+ {
+ this->type = type;
+ this->file = file;
+ this->nr = nr;
+ this->physical = 1;
+ this->subphysical = 1;
+ this->subnr = subnr * typeSize(type);
+ this->negation = 0;
+ this->absolute = 0;
+ this->vstride = vstride;
+ this->width = width;
+ this->hstride = hstride;
+ this->quarter = 0;
+ this->address_mode = GEN_ADDRESS_DIRECT;
+ }
+
+ /*! Return the IR virtual register */
+ INLINE ir::Register reg(void) const { return ir::Register(value.reg); }
+
+ /*! For immediates or virtual register */
+ union {
+ double df;
+ float f;
+ int32_t d;
+ uint32_t ud;
+ uint16_t reg;
+ int64_t i64;
+ } value;
+
+ uint32_t nr:8; //!< Just for some physical registers (acc, null)
+ uint32_t subnr:8; //!< Idem
+ uint32_t physical:1; //!< 1 if physical, 0 otherwise
+ uint32_t subphysical:1;//!< 1 if subnr is physical, 0 otherwise
+ uint32_t type:4; //!< Gen type
+ uint32_t file:2; //!< Register file
+ uint32_t negation:1; //!< For source
+ uint32_t absolute:1; //!< For source
+ uint32_t vstride:4; //!< Vertical stride
+ uint32_t width:3; //!< Width
+ uint32_t hstride:2; //!< Horizontal stride
+ uint32_t quarter:1; //!< To choose which part we want (Q1 / Q2)
+ uint32_t address_mode:1; //!< direct or indirect
+
+ static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
+ GenRegister r = reg;
+ r.nr += nr;
+ r.subnr += subnr;
+ return r;
+ }
+
+ // split a DWORD register into unpacked Byte or Short register
+ static INLINE GenRegister splitReg(GenRegister reg, uint32_t count, uint32_t sub_part) {
+ GenRegister r = reg;
+ GBE_ASSERT(count == 4 || count == 2);
+ GBE_ASSERT(reg.type == GEN_TYPE_UD || reg.type == GEN_TYPE_D);
+
+ if(reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+ GBE_ASSERT(reg.hstride == GEN_HORIZONTAL_STRIDE_1);
+ r.hstride = count == 4 ? GEN_HORIZONTAL_STRIDE_4 : GEN_HORIZONTAL_STRIDE_2;
+ }
+ if(count == 4) {
+ r.type = reg.type == GEN_TYPE_UD ? GEN_TYPE_UB : GEN_TYPE_B;
+ r.vstride = GEN_VERTICAL_STRIDE_32;
+ } else {
+ r.type = reg.type == GEN_TYPE_UD ? GEN_TYPE_UW : GEN_TYPE_W;
+ r.vstride = GEN_VERTICAL_STRIDE_16;
+ }
+
+ r.subnr += sub_part*typeSize(r.type);
+ r.nr += r.subnr / 32;
+ r.subnr %= 32;
+
+ return r;
+ }
+
+ INLINE bool isint64(void) const {
+ if ((type == GEN_TYPE_UL || type == GEN_TYPE_L) && file == GEN_GENERAL_REGISTER_FILE)
+ return true;
+ return false;
+ }
+
+ INLINE bool isimmdf(void) const {
+ if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
+ return true;
+ return false;
+ }
+
+ INLINE GenRegister top_half(int simdWidth) const {
+ GBE_ASSERT(isint64());
+ GenRegister reg = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
+
+ if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+ reg.subnr += simdWidth * typeSize(reg.type) * hstride_size(reg);
+ reg.nr += reg.subnr / 32;
+ reg.subnr %= 32;
+ } else {
+ reg.subnr += typeSize(reg.type);
+ reg.nr += reg.subnr/32;
+ reg.subnr %= 32;
+ }
+ return reg;
+ }
+
+ INLINE GenRegister bottom_half(void) const {
+ GBE_ASSERT(isint64());
+ GenRegister r = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
+ return r;
+ }
+
+ INLINE bool is_signed_int(void) const {
+ if ((type == GEN_TYPE_B || type == GEN_TYPE_W || type == GEN_TYPE_D || type == GEN_TYPE_L) && file == GEN_GENERAL_REGISTER_FILE)
+ return true;
+ return false;
+ }
+
+ INLINE bool isdf(void) const {
+ if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
+ return true;
+ return false;
+ }
+
+ INLINE int flag_nr(void) const {
+ assert(file == GEN_ARCHITECTURE_REGISTER_FILE);
+ assert(nr >= GEN_ARF_FLAG && nr < GEN_ARF_FLAG + 2);
+ return nr & 15;
+ }
+
+ INLINE int flag_subnr(void) const {
+ return subnr / typeSize(type);
+ }
+
+ static INLINE GenRegister h2(GenRegister reg) {
+ GenRegister r = reg;
+ if(r.hstride != GEN_HORIZONTAL_STRIDE_0)
+ r.hstride = GEN_HORIZONTAL_STRIDE_2;
+ return r;
+ }
+
+ static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t quarter) {
+ GBE_ASSERT(reg.physical == 0);
+ if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register
+ return reg;
+ else {
+ reg.quarter = quarter;
+ return reg;
+ }
+ }
+
+ static INLINE GenRegister QnPhysical(GenRegister reg, uint32_t quarter) {
+ GBE_ASSERT(reg.physical);
+ if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register
+ return reg;
+ else {
+ const uint32_t typeSz = typeSize(reg.type);
+ const uint32_t horizontal = stride(reg.hstride);
+ const uint32_t grfOffset = reg.nr*GEN_REG_SIZE + reg.subnr;
+ const uint32_t nextOffset = grfOffset + 8*quarter*horizontal*typeSz;
+ reg.nr = nextOffset / GEN_REG_SIZE;
+ reg.subnr = (nextOffset % GEN_REG_SIZE);
+ return reg;
+ }
+ }
+
+ static INLINE GenRegister Qn(GenRegister reg, uint32_t quarter) {
+ if (reg.physical)
+ return QnPhysical(reg, quarter);
+ else
+ return QnVirtual(reg, quarter);
+ }
+
+ static INLINE GenRegister vec16(uint32_t file, ir::Register reg) {
+ return GenRegister(file,
+ reg,
+ GEN_TYPE_F,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister vec8(uint32_t file, ir::Register reg) {
+ return GenRegister(file,
+ reg,
+ GEN_TYPE_F,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister vec4(uint32_t file, ir::Register reg) {
+ return GenRegister(file,
+ reg,
+ GEN_TYPE_F,
+ GEN_VERTICAL_STRIDE_4,
+ GEN_WIDTH_4,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister vec2(uint32_t file, ir::Register reg) {
+ return GenRegister(file,
+ reg,
+ GEN_TYPE_F,
+ GEN_VERTICAL_STRIDE_2,
+ GEN_WIDTH_2,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister vec1(uint32_t file, ir::Register reg) {
+ return GenRegister(file,
+ reg,
+ GEN_TYPE_F,
+ GEN_VERTICAL_STRIDE_0,
+ GEN_WIDTH_1,
+ GEN_HORIZONTAL_STRIDE_0);
+ }
+
+ static INLINE GenRegister retype(GenRegister reg, uint32_t type) {
+ reg.type = type;
+ return reg;
+ }
+
+ static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
+ return retype(vec16(file, reg), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
+ return retype(vec8(file, reg), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
+ return retype(vec1(file, reg), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
+ return retype(vec16(file, reg), GEN_TYPE_UD);
+ }
+
+ static INLINE GenRegister ud8(uint32_t file, ir::Register reg) {
+ return retype(vec8(file, reg), GEN_TYPE_UD);
+ }
+
+ static INLINE GenRegister ud1(uint32_t file, ir::Register reg) {
+ return retype(vec1(file, reg), GEN_TYPE_UD);
+ }
+
+ static INLINE GenRegister d8(uint32_t file, ir::Register reg) {
+ return retype(vec8(file, reg), GEN_TYPE_D);
+ }
+
+ static INLINE GenRegister uw16(uint32_t file, ir::Register reg) {
+ return retype(vec16(file, reg), GEN_TYPE_UW);
+ }
+
+ static INLINE GenRegister uw8(uint32_t file, ir::Register reg) {
+ return retype(vec8(file, reg), GEN_TYPE_UW);
+ }
+
+ static INLINE GenRegister uw1(uint32_t file, ir::Register reg) {
+ return retype(vec1(file, reg), GEN_TYPE_UW);
+ }
+
+ static INLINE GenRegister ub16(uint32_t file, ir::Register reg) {
+ return GenRegister(file,
+ reg,
+ GEN_TYPE_UB,
+ GEN_VERTICAL_STRIDE_16,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_2);
+ }
+
+ static INLINE GenRegister ub8(uint32_t file, ir::Register reg) {
+ return GenRegister(file,
+ reg,
+ GEN_TYPE_UB,
+ GEN_VERTICAL_STRIDE_16,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_2);
+ }
+
+ static INLINE GenRegister ub1(uint32_t file, ir::Register reg) {
+ return retype(vec1(file, reg), GEN_TYPE_UB);
+ }
+
+ static INLINE GenRegister unpacked_uw(ir::Register reg, bool uniform = false) {
+ return GenRegister(GEN_GENERAL_REGISTER_FILE,
+ reg,
+ GEN_TYPE_UW,
+ uniform ? GEN_VERTICAL_STRIDE_0 : GEN_VERTICAL_STRIDE_16,
+ uniform ? GEN_WIDTH_1 : GEN_WIDTH_8,
+ uniform ? GEN_HORIZONTAL_STRIDE_0 : GEN_HORIZONTAL_STRIDE_2);
+ }
+
+ static INLINE GenRegister unpacked_ub(ir::Register reg, bool uniform = false) {
+ return GenRegister(GEN_GENERAL_REGISTER_FILE,
+ reg,
+ GEN_TYPE_UB,
+ uniform ? GEN_VERTICAL_STRIDE_0 : GEN_VERTICAL_STRIDE_32,
+ uniform ? GEN_WIDTH_1 : GEN_WIDTH_8,
+ uniform ? GEN_HORIZONTAL_STRIDE_0 : GEN_HORIZONTAL_STRIDE_4);
+ }
+
+ static INLINE GenRegister imm(uint32_t type) {
+ return GenRegister(GEN_IMMEDIATE_VALUE,
+ 0,
+ 0,
+ type,
+ GEN_VERTICAL_STRIDE_0,
+ GEN_WIDTH_1,
+ GEN_HORIZONTAL_STRIDE_0);
+ }
+
+ static INLINE GenRegister immint64(int64_t i) {
+ GenRegister immediate = imm(GEN_TYPE_L);
+ immediate.value.i64 = i;
+ return immediate;
+ }
+
+ static INLINE GenRegister immdf(double df) {
+ GenRegister immediate = imm(GEN_TYPE_DF);
+ immediate.value.df = df;
+ return immediate;
+ }
+
+ static INLINE GenRegister immf(float f) {
+ GenRegister immediate = imm(GEN_TYPE_F);
+ immediate.value.f = f;
+ return immediate;
+ }
+
+ static INLINE GenRegister immd(int d) {
+ GenRegister immediate = imm(GEN_TYPE_D);
+ immediate.value.d = d;
+ return immediate;
+ }
+
+ static INLINE GenRegister immud(uint32_t ud) {
+ GenRegister immediate = imm(GEN_TYPE_UD);
+ immediate.value.ud = ud;
+ return immediate;
+ }
+
+ static INLINE GenRegister immuw(uint16_t uw) {
+ GenRegister immediate = imm(GEN_TYPE_UW);
+ immediate.value.ud = uw;
+ return immediate;
+ }
+
+ static INLINE GenRegister immw(int16_t w) {
+ GenRegister immediate = imm(GEN_TYPE_W);
+ immediate.value.d = w;
+ return immediate;
+ }
+
+ static INLINE GenRegister immv(uint32_t v) {
+ GenRegister immediate = imm(GEN_TYPE_V);
+ immediate.vstride = GEN_VERTICAL_STRIDE_0;
+ immediate.width = GEN_WIDTH_8;
+ immediate.hstride = GEN_HORIZONTAL_STRIDE_1;
+ immediate.value.ud = v;
+ return immediate;
+ }
+
+ static INLINE GenRegister immvf(uint32_t v) {
+ GenRegister immediate = imm(GEN_TYPE_VF);
+ immediate.vstride = GEN_VERTICAL_STRIDE_0;
+ immediate.width = GEN_WIDTH_4;
+ immediate.hstride = GEN_HORIZONTAL_STRIDE_1;
+ immediate.value.ud = v;
+ return immediate;
+ }
+
+ static INLINE GenRegister immvf4(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3) {
+ GenRegister immediate = imm(GEN_TYPE_VF);
+ immediate.vstride = GEN_VERTICAL_STRIDE_0;
+ immediate.width = GEN_WIDTH_4;
+ immediate.hstride = GEN_HORIZONTAL_STRIDE_1;
+ immediate.value.ud = ((v0 << 0) | (v1 << 8) | (v2 << 16) | (v3 << 24));
+ return immediate;
+ }
+
+ static INLINE GenRegister f1grf(ir::Register reg) {
+ return vec1(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister f2grf(ir::Register reg) {
+ return vec2(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister f4grf(ir::Register reg) {
+ return vec4(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister f8grf(ir::Register reg) {
+ return vec8(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister f16grf(ir::Register reg) {
+ return vec16(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister df1grf(ir::Register reg) {
+ return df1(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister df8grf(ir::Register reg) {
+ return df8(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister df16grf(ir::Register reg) {
+ return df16(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister ud16grf(ir::Register reg) {
+ return ud16(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister ud8grf(ir::Register reg) {
+ return ud8(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister ud1grf(ir::Register reg) {
+ return ud1(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister uw1grf(ir::Register reg) {
+ return uw1(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister uw8grf(ir::Register reg) {
+ return uw8(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister uw16grf(ir::Register reg) {
+ return uw16(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister ub1grf(ir::Register reg) {
+ return ub1(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister ub8grf(ir::Register reg) {
+ return ub8(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister ub16grf(ir::Register reg) {
+ return ub16(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister null(void) {
+ return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+ GEN_ARF_NULL,
+ 0,
+ GEN_TYPE_F,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister nullud(void) {
+ return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+ GEN_ARF_NULL,
+ 0,
+ GEN_TYPE_UD,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+
+ static INLINE bool isNull(GenRegister reg) {
+ return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
+ && reg.nr == GEN_ARF_NULL);
+ }
+
+ static INLINE GenRegister vec1(GenRegister reg) {
+ reg.width = GEN_WIDTH_1;
+ reg.hstride = GEN_HORIZONTAL_STRIDE_0;
+ reg.vstride = GEN_VERTICAL_STRIDE_0;
+ return reg;
+ }
+
+ static INLINE GenRegister acc(void) {
+ return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+ GEN_ARF_ACCUMULATOR,
+ 0,
+ GEN_TYPE_F,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister ip(void) {
+ return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+ GEN_ARF_IP,
+ 0,
+ GEN_TYPE_D,
+ GEN_VERTICAL_STRIDE_4,
+ GEN_WIDTH_1,
+ GEN_HORIZONTAL_STRIDE_0);
+ }
+
+ static INLINE GenRegister notification1(void) {
+ return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+ GEN_ARF_NOTIFICATION_COUNT,
+ 0,
+ GEN_TYPE_UD,
+ GEN_VERTICAL_STRIDE_0,
+ GEN_WIDTH_1,
+ GEN_HORIZONTAL_STRIDE_0);
+ }
+
+ static INLINE GenRegister flag(uint32_t nr, uint32_t subnr) {
+ return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+ GEN_ARF_FLAG | nr,
+ subnr,
+ GEN_TYPE_UW,
+ GEN_VERTICAL_STRIDE_0,
+ GEN_WIDTH_1,
+ GEN_HORIZONTAL_STRIDE_0);
+ }
+
+ static INLINE GenRegister next(GenRegister reg) {
+ if (reg.physical)
+ reg.nr++;
+ else
+ reg.quarter++;
+ return reg;
+ }
+
+ /*! Build an indirectly addressed source */
+ static INLINE GenRegister indirect(uint32_t type, uint32_t subnr, uint32_t width) {
+ GenRegister reg;
+ reg.type = type;
+ reg.file = GEN_GENERAL_REGISTER_FILE;
+ reg.address_mode = GEN_ADDRESS_REGISTER_INDIRECT_REGISTER;
+ reg.width = width;
+ reg.subnr = subnr;
+ reg.nr = 0;
+ reg.negation = 0;
+ reg.absolute = 0;
+ reg.vstride = 0;
+ reg.hstride = 0;
+ return reg;
+ }
+
+ static INLINE GenRegister vec16(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return GenRegister(file,
+ nr,
+ subnr,
+ GEN_TYPE_F,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister vec8(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return GenRegister(file,
+ nr,
+ subnr,
+ GEN_TYPE_F,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister vec4(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return GenRegister(file,
+ nr,
+ subnr,
+ GEN_TYPE_F,
+ GEN_VERTICAL_STRIDE_4,
+ GEN_WIDTH_4,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister vec2(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return GenRegister(file,
+ nr,
+ subnr,
+ GEN_TYPE_F,
+ GEN_VERTICAL_STRIDE_2,
+ GEN_WIDTH_2,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister vec1(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return GenRegister(file,
+ nr,
+ subnr,
+ GEN_TYPE_F,
+ GEN_VERTICAL_STRIDE_0,
+ GEN_WIDTH_1,
+ GEN_HORIZONTAL_STRIDE_0);
+ }
+
+ static INLINE int hstride_size(GenRegister reg) {
+ switch (reg.hstride) {
+ case GEN_HORIZONTAL_STRIDE_0: return 0;
+ case GEN_HORIZONTAL_STRIDE_1: return 1;
+ case GEN_HORIZONTAL_STRIDE_2: return 2;
+ case GEN_HORIZONTAL_STRIDE_4: return 4;
+ default: NOT_IMPLEMENTED; return 0;
+ }
+ }
+
+ static INLINE GenRegister suboffset(GenRegister reg, uint32_t delta) {
+ if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+ reg.subnr += delta * typeSize(reg.type) * hstride_size(reg);
+ reg.nr += reg.subnr / 32;
+ reg.subnr %= 32;
+ }
+ return reg;
+ }
+
+ static INLINE GenRegister df16(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister df8(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec8(file, nr, subnr), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister df1(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister ud16(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
+ }
+
+ static INLINE GenRegister ud8(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec8(file, nr, subnr), GEN_TYPE_UD);
+ }
+
+ static INLINE GenRegister ud1(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec1(file, nr, subnr), GEN_TYPE_UD);
+ }
+
+ static INLINE GenRegister d8(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec8(file, nr, subnr), GEN_TYPE_D);
+ }
+
+ static INLINE GenRegister uw16(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return suboffset(retype(vec16(file, nr, 0), GEN_TYPE_UW), subnr);
+ }
+
+ static INLINE GenRegister uw8(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return suboffset(retype(vec8(file, nr, 0), GEN_TYPE_UW), subnr);
+ }
+
+ static INLINE GenRegister uw1(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return suboffset(retype(vec1(file, nr, 0), GEN_TYPE_UW), subnr);
+ }
+
+ static INLINE GenRegister ub16(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return GenRegister(file,
+ nr,
+ subnr,
+ GEN_TYPE_UB,
+ GEN_VERTICAL_STRIDE_16,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_2);
+ }
+
+ static INLINE GenRegister ub8(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return GenRegister(file,
+ nr,
+ subnr,
+ GEN_TYPE_UB,
+ GEN_VERTICAL_STRIDE_16,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_2);
+ }
+
+ static INLINE GenRegister ub1(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return suboffset(retype(vec1(file, nr, 0), GEN_TYPE_UB), subnr);
+ }
+
+ static INLINE GenRegister f1grf(uint32_t nr, uint32_t subnr) {
+ return vec1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister f2grf(uint32_t nr, uint32_t subnr) {
+ return vec2(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister f4grf(uint32_t nr, uint32_t subnr) {
+ return vec4(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister f8grf(uint32_t nr, uint32_t subnr) {
+ return vec8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister f16grf(uint32_t nr, uint32_t subnr) {
+ return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) {
+ return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) {
+ return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) {
+ return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
+ return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister ud8grf(uint32_t nr, uint32_t subnr) {
+ return ud8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister ud1grf(uint32_t nr, uint32_t subnr) {
+ return ud1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister ud1arf(uint32_t nr, uint32_t subnr) {
+ return ud1(GEN_ARCHITECTURE_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister uw1grf(uint32_t nr, uint32_t subnr) {
+ return uw1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister uw8grf(uint32_t nr, uint32_t subnr) {
+ return uw8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister uw16grf(uint32_t nr, uint32_t subnr) {
+ return uw16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister ub1grf(uint32_t nr, uint32_t subnr) {
+ return ub1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister ub8grf(uint32_t nr, uint32_t subnr) {
+ return ub8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister ub16grf(uint32_t nr, uint32_t subnr) {
+ return ub16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister unpacked_uw(uint32_t nr, uint32_t subnr) {
+ return GenRegister(GEN_GENERAL_REGISTER_FILE,
+ nr,
+ subnr,
+ GEN_TYPE_UW,
+ GEN_VERTICAL_STRIDE_16,
+ GEN_WIDTH_8,
+ GEN_HORIZONTAL_STRIDE_2);
+ }
+
+ static INLINE GenRegister packed_ud(uint32_t nr, uint32_t subnr) {
+ return GenRegister(GEN_GENERAL_REGISTER_FILE,
+ nr,
+ subnr,
+ GEN_TYPE_UD,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_4,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister unpacked_ud(uint32_t nr, uint32_t subnr) {
+ return GenRegister(GEN_GENERAL_REGISTER_FILE,
+ nr,
+ subnr,
+ GEN_TYPE_UD,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_4,
+ GEN_HORIZONTAL_STRIDE_2);
+ }
+
+ static INLINE GenRegister mask(uint32_t subnr) {
+ return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr);
+ }
+
+ static INLINE GenRegister addr1(uint32_t subnr) {
+ return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_ADDRESS, subnr);
+ }
+
+ static INLINE GenRegister addr8(uint32_t subnr) {
+ return uw8(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_ADDRESS, subnr);
+ }
+
+ static INLINE GenRegister negate(GenRegister reg) {
+ if (reg.file != GEN_IMMEDIATE_VALUE)
+ reg.negation ^= 1;
+ else {
+ if (reg.type == GEN_TYPE_F)
+ reg.value.f = -reg.value.f;
+ else if (reg.type == GEN_TYPE_UD)
+ reg.value.ud = -reg.value.ud;
+ else if (reg.type == GEN_TYPE_D)
+ reg.value.d = -reg.value.d;
+ else if (reg.type == GEN_TYPE_UW) {
+ const uint16_t uw = reg.value.ud & 0xffff;
+ reg = GenRegister::immuw(-uw);
+ } else if (reg.type == GEN_TYPE_W) {
+ const uint16_t uw = reg.value.ud & 0xffff;
+ reg = GenRegister::immw(-(int16_t)uw);
+ } else
+ NOT_SUPPORTED;
+ }
+ return reg;
+ }
+
+ static INLINE GenRegister abs(GenRegister reg) {
+ reg.absolute = 1;
+ reg.negation = 0;
+ return reg;
+ }
+
+ /*! Generate register encoding with run-time simdWidth */
+#define DECL_REG_ENCODER(NAME, SIMD16, SIMD8, SIMD1) \
+ template <typename... Args> \
+ static INLINE GenRegister NAME(uint32_t simdWidth, Args... values) { \
+ if (simdWidth == 16) \
+ return SIMD16(values...); \
+ else if (simdWidth == 8) \
+ return SIMD8(values...); \
+ else if (simdWidth == 1) \
+ return SIMD1(values...); \
+ else { \
+ NOT_IMPLEMENTED; \
+ return SIMD1(values...); \
+ } \
+ }
+ DECL_REG_ENCODER(dfxgrf, df16grf, df8grf, df1grf);
+ DECL_REG_ENCODER(fxgrf, f16grf, f8grf, f1grf);
+ DECL_REG_ENCODER(uwxgrf, uw16grf, uw8grf, uw1grf);
+ DECL_REG_ENCODER(udxgrf, ud16grf, ud8grf, ud1grf);
+#undef DECL_REG_ENCODER
+ };
+} /* namespace gbe */
+
+#endif /* __GEN_REGISTER_HPP__ */
+
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
new file mode 100644
index 0000000..787d111
--- /dev/null
+++ b/backend/src/backend/program.cpp
@@ -0,0 +1,1317 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file callback interface for the compiler
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "program.h"
+#include "program.hpp"
+#include "gen_program.h"
+#include "sys/platform.hpp"
+#include "sys/cvar.hpp"
+#include "ir/liveness.hpp"
+#include "ir/value.hpp"
+#include "ir/unit.hpp"
+#include "ir/printf.hpp"
+#include "llvm/llvm_to_gen.hpp"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IR/LLVMContext.h"
+#include <cstring>
+#include <algorithm>
+#include <fstream>
+#include <dlfcn.h>
+#include <sstream>
+#include <iostream>
+#include <unistd.h>
+#include <mutex>
+
+/* Not defined for LLVM 3.0 */
+#if !defined(LLVM_VERSION_MAJOR)
+#define LLVM_VERSION_MAJOR 3
+#endif /* !defined(LLVM_VERSION_MAJOR) */
+
+/* Not defined for LLVM 3.0 */
+#if !defined(LLVM_VERSION_MINOR)
+#define LLVM_VERSION_MINOR 0
+#endif /* !defined(LLVM_VERSION_MINOR) */
+
+#include <clang/CodeGen/CodeGenAction.h>
+#include <clang/Frontend/CompilerInstance.h>
+#include <clang/Frontend/CompilerInvocation.h>
+#if LLVM_VERSION_MINOR <= 1
+#include <clang/Frontend/DiagnosticOptions.h>
+#else
+#include <clang/Basic/DiagnosticOptions.h>
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include <clang/Frontend/TextDiagnosticPrinter.h>
+#include <clang/Basic/TargetInfo.h>
+#include <clang/Basic/TargetOptions.h>
+#include <llvm/ADT/IntrusiveRefCntPtr.h>
+#if LLVM_VERSION_MINOR <= 2
+#include <llvm/Module.h>
+#else
+#include <llvm/IR/Module.h>
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/Support/raw_ostream.h>
+#include "src/GBEConfig.h"
+
+namespace gbe {
+
+ Kernel::Kernel(const std::string &name) :
+ name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false),
+ slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL), printfSet(NULL) {}
+ Kernel::~Kernel(void) {
+ if(ctx) GBE_DELETE(ctx);
+ if(samplerSet) GBE_DELETE(samplerSet);
+ if(imageSet) GBE_DELETE(imageSet);
+ if(printfSet) GBE_DELETE(printfSet);
+ GBE_SAFE_DELETE_ARRAY(args);
+ }
+ int32_t Kernel::getCurbeOffset(gbe_curbe_type type, uint32_t subType) const {
+ const PatchInfo patch(type, subType);
+ const auto it = std::lower_bound(patches.begin(), patches.end(), patch);
+ if (it == patches.end()) return -1; // nothing found
+ if (patch < *it) return -1; // they are not equal
+ return it->offset; // we found it!
+ }
+
+ Program::Program(void) : constantSet(NULL) {}
+ Program::~Program(void) {
+ for (auto &kernel : kernels) GBE_DELETE(kernel.second);
+ if (constantSet) delete constantSet;
+ }
+
+#ifdef GBE_COMPILER_AVAILABLE
+ BVAR(OCL_OUTPUT_GEN_IR, false);
+
+ bool Program::buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel) {
+ ir::Unit *unit = new ir::Unit();
+ llvm::Module * cloned_module = NULL;
+ if(module){
+ cloned_module = llvm::CloneModule((llvm::Module*)module);
+ }
+ if (llvmToGen(*unit, fileName, module, optLevel) == false) {
+ if (fileName)
+ error = std::string(fileName) + " not found";
+ delete unit;
+ return false;
+ }
+ //If unit is not valid, maybe some thing don't support by backend, introduce by some passes
+ //use optLevel 0 to try again.
+ if(!unit->getValid()) {
+ delete unit; //clear unit
+ unit = new ir::Unit();
+ if(cloned_module){
+ llvmToGen(*unit, fileName, cloned_module, 0); //suppose file exists and llvmToGen will not return false.
+ }else{
+ llvmToGen(*unit, fileName, module, 0); //suppose file exists and llvmToGen will not return false.
+ }
+ }
+ assert(unit->getValid());
+ this->buildFromUnit(*unit, error);
+ delete unit;
+ if(cloned_module){
+ delete (llvm::Module*) cloned_module;
+ }
+ return true;
+ }
+
+ BVAR(OCL_STRICT_CONFORMANCE, false);
+
+ bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
+ constantSet = new ir::ConstantSet(unit.getConstantSet());
+ const auto &set = unit.getFunctionSet();
+ const uint32_t kernelNum = set.size();
+ if (OCL_OUTPUT_GEN_IR) std::cout << unit;
+ if (kernelNum == 0) return true;
+ for (const auto &pair : set) {
+ const std::string &name = pair.first;
+ Kernel *kernel = this->compileKernel(unit, name, !OCL_STRICT_CONFORMANCE);
+ kernel->setSamplerSet(pair.second->getSamplerSet());
+ kernel->setImageSet(pair.second->getImageSet());
+ kernel->setPrintfSet(pair.second->getPrintfSet());
+ kernel->setCompileWorkGroupSize(pair.second->getCompileWorkGroupSize());
+ kernel->setFunctionAttributes(pair.second->getFunctionAttributes());
+ kernels.insert(std::make_pair(name, kernel));
+ }
+ return true;
+ }
+#endif
+
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+ size_t Program::serializeToBin(std::ostream& outs) {
+ size_t ret_size = 0;
+ size_t ker_num = kernels.size();
+ int has_constset = 0;
+
+ OUT_UPDATE_SZ(magic_begin);
+
+ if (constantSet) {
+ has_constset = 1;
+ OUT_UPDATE_SZ(has_constset);
+ size_t sz = constantSet->serializeToBin(outs);
+ if (!sz)
+ return 0;
+
+ ret_size += sz;
+ } else {
+ OUT_UPDATE_SZ(has_constset);
+ }
+
+ OUT_UPDATE_SZ(ker_num);
+ for (auto ker : kernels) {
+ size_t sz = ker.second->serializeToBin(outs);
+ if (!sz)
+ return 0;
+
+ ret_size += sz;
+ }
+
+ OUT_UPDATE_SZ(magic_end);
+
+ OUT_UPDATE_SZ(ret_size);
+ return ret_size;
+ }
+
+ size_t Program::deserializeFromBin(std::istream& ins) {
+ size_t total_size = 0;
+ int has_constset = 0;
+ size_t ker_num;
+ uint32_t magic;
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_begin)
+ return 0;
+
+ IN_UPDATE_SZ(has_constset);
+ if(has_constset) {
+ constantSet = new ir::ConstantSet;
+ size_t sz = constantSet->deserializeFromBin(ins);
+
+ if (sz == 0) {
+ return 0;
+ }
+
+ total_size += sz;
+ }
+
+ IN_UPDATE_SZ(ker_num);
+
+ for (size_t i = 0; i < ker_num; i++) {
+ size_t ker_serial_sz;
+ std::string ker_name; // Just a empty name here.
+ Kernel* ker = allocateKernel(ker_name);
+
+ if(!(ker_serial_sz = ker->deserializeFromBin(ins)))
+ return 0;
+
+ kernels.insert(std::make_pair(ker->getName(), ker));
+ total_size += ker_serial_sz;
+ }
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_end)
+ return 0;
+
+ size_t total_bytes;
+ IN_UPDATE_SZ(total_bytes);
+ if (total_bytes + sizeof(total_size) != total_size)
+ return 0;
+
+ return total_size;
+ }
+
+ size_t Kernel::serializeToBin(std::ostream& outs) {
+ unsigned int i;
+ size_t ret_size = 0;
+ int has_samplerset = 0;
+ int has_imageset = 0;
+
+ OUT_UPDATE_SZ(magic_begin);
+
+ OUT_UPDATE_SZ(name.size());
+ outs.write(name.c_str(), name.size());
+ ret_size += sizeof(char)*name.size();
+
+ OUT_UPDATE_SZ(argNum);
+ for (i = 0; i < argNum; i++) {
+ KernelArgument& arg = args[i];
+ OUT_UPDATE_SZ(arg.type);
+ OUT_UPDATE_SZ(arg.size);
+ OUT_UPDATE_SZ(arg.align);
+ OUT_UPDATE_SZ(arg.bti);
+ }
+
+ OUT_UPDATE_SZ(patches.size());
+ for (auto patch : patches) {
+ unsigned int tmp;
+ tmp = patch.type;
+ OUT_UPDATE_SZ(tmp);
+ tmp = patch.subType;
+ OUT_UPDATE_SZ(tmp);
+ tmp = patch.offset;
+ OUT_UPDATE_SZ(tmp);
+ }
+
+ OUT_UPDATE_SZ(curbeSize);
+ OUT_UPDATE_SZ(simdWidth);
+ OUT_UPDATE_SZ(stackSize);
+ OUT_UPDATE_SZ(scratchSize);
+ OUT_UPDATE_SZ(useSLM);
+ OUT_UPDATE_SZ(slmSize);
+ OUT_UPDATE_SZ(compileWgSize[0]);
+ OUT_UPDATE_SZ(compileWgSize[1]);
+ OUT_UPDATE_SZ(compileWgSize[2]);
+ /* samplers. */
+ if (!samplerSet->empty()) { //samplerSet is always valid, allocated in Function::Function
+ has_samplerset = 1;
+ OUT_UPDATE_SZ(has_samplerset);
+ size_t sz = samplerSet->serializeToBin(outs);
+ if (!sz)
+ return 0;
+
+ ret_size += sz;
+ } else {
+ OUT_UPDATE_SZ(has_samplerset);
+ }
+
+ /* images. */
+ if (!imageSet->empty()) { //imageSet is always valid, allocated in Function::Function
+ has_imageset = 1;
+ OUT_UPDATE_SZ(has_imageset);
+ size_t sz = imageSet->serializeToBin(outs);
+ if (!sz)
+ return 0;
+
+ ret_size += sz;
+ } else {
+ OUT_UPDATE_SZ(has_imageset);
+ }
+
+ /* Code. */
+ const char * code = getCode();
+ OUT_UPDATE_SZ(getCodeSize());
+ outs.write(code, getCodeSize()*sizeof(char));
+ ret_size += getCodeSize()*sizeof(char);
+
+ OUT_UPDATE_SZ(magic_end);
+
+ OUT_UPDATE_SZ(ret_size);
+ return ret_size;
+ }
+
+ size_t Kernel::deserializeFromBin(std::istream& ins) {
+ size_t total_size = 0;
+ int has_samplerset = 0;
+ int has_imageset = 0;
+ size_t code_size = 0;
+ uint32_t magic = 0;
+ size_t patch_num = 0;
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_begin)
+ return 0;
+
+ size_t name_len;
+ IN_UPDATE_SZ(name_len);
+ char* c_name = new char[name_len+1];
+ ins.read(c_name, name_len*sizeof(char));
+ total_size += sizeof(char)*name_len;
+ c_name[name_len] = 0;
+ name = c_name;
+ delete[] c_name;
+
+ IN_UPDATE_SZ(argNum);
+ args = GBE_NEW_ARRAY_NO_ARG(KernelArgument, argNum);
+ for (uint32_t i = 0; i < argNum; i++) {
+ KernelArgument& arg = args[i];
+ IN_UPDATE_SZ(arg.type);
+ IN_UPDATE_SZ(arg.size);
+ IN_UPDATE_SZ(arg.align);
+ IN_UPDATE_SZ(arg.bti);
+ }
+
+ IN_UPDATE_SZ(patch_num);
+ for (uint32_t i = 0; i < patch_num; i++) {
+ unsigned int tmp;
+ PatchInfo patch;
+ IN_UPDATE_SZ(tmp);
+ patch.type = tmp;
+ IN_UPDATE_SZ(tmp);
+ patch.subType = tmp;
+ IN_UPDATE_SZ(tmp);
+ patch.offset = tmp;
+
+ patches.push_back(patch);
+ }
+
+ IN_UPDATE_SZ(curbeSize);
+ IN_UPDATE_SZ(simdWidth);
+ IN_UPDATE_SZ(stackSize);
+ IN_UPDATE_SZ(scratchSize);
+ IN_UPDATE_SZ(useSLM);
+ IN_UPDATE_SZ(slmSize);
+ IN_UPDATE_SZ(compileWgSize[0]);
+ IN_UPDATE_SZ(compileWgSize[1]);
+ IN_UPDATE_SZ(compileWgSize[2]);
+
+ IN_UPDATE_SZ(has_samplerset);
+ if (has_samplerset) {
+ samplerSet = GBE_NEW(ir::SamplerSet);
+ size_t sz = samplerSet->deserializeFromBin(ins);
+ if (sz == 0) {
+ return 0;
+ }
+
+ total_size += sz;
+ }
+ else
+ samplerSet = NULL;
+
+ IN_UPDATE_SZ(has_imageset);
+ if (has_imageset) {
+ imageSet = GBE_NEW(ir::ImageSet);
+ size_t sz = imageSet->deserializeFromBin(ins);
+ if (sz == 0) {
+ return 0;
+ }
+
+ total_size += sz;
+ }
+ else
+ imageSet = NULL;
+
+ IN_UPDATE_SZ(code_size);
+ if (code_size) {
+ char* code = GBE_NEW_ARRAY_NO_ARG(char, code_size);
+ ins.read(code, code_size*sizeof(char));
+ total_size += sizeof(char)*code_size;
+ setCode(code, code_size);
+ }
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_end)
+ return 0;
+
+ size_t total_bytes;
+ IN_UPDATE_SZ(total_bytes);
+ if (total_bytes + sizeof(total_size) != total_size)
+ return 0;
+
+ return total_size;
+ }
+
+#undef OUT_UPDATE_SZ
+#undef IN_UPDATE_SZ
+
+ void Program::printStatus(int indent, std::ostream& outs) {
+ using namespace std;
+ string spaces = indent_to_str(indent);
+
+ outs << spaces << "=============== Begin Program ===============" << "\n";
+
+ if (constantSet) {
+ constantSet->printStatus(indent + 4, outs);
+ }
+
+ for (auto ker : kernels) {
+ ker.second->printStatus(indent + 4, outs);
+ }
+
+ outs << spaces << "================ End Program ================" << "\n";
+ }
+
+ void Kernel::printStatus(int indent, std::ostream& outs) {
+ using namespace std;
+ string spaces = indent_to_str(indent);
+ string spaces_nl = indent_to_str(indent + 4);
+ int num;
+
+ outs << spaces << "+++++++++++ Begin Kernel +++++++++++" << "\n";
+ outs << spaces_nl << "Kernel Name: " << name << "\n";
+ outs << spaces_nl << " curbeSize: " << curbeSize << "\n";
+ outs << spaces_nl << " simdWidth: " << simdWidth << "\n";
+ outs << spaces_nl << " stackSize: " << stackSize << "\n";
+ outs << spaces_nl << " scratchSize: " << scratchSize << "\n";
+ outs << spaces_nl << " useSLM: " << useSLM << "\n";
+ outs << spaces_nl << " slmSize: " << slmSize << "\n";
+ outs << spaces_nl << " compileWgSize: " << compileWgSize[0] << compileWgSize[1] << compileWgSize[2] << "\n";
+
+ outs << spaces_nl << " Argument Number is " << argNum << "\n";
+ for (uint32_t i = 0; i < argNum; i++) {
+ KernelArgument& arg = args[i];
+ outs << spaces_nl << " Arg " << i << ":\n";
+ outs << spaces_nl << " type value: "<< arg.type << "\n";
+ outs << spaces_nl << " size: "<< arg.size << "\n";
+ outs << spaces_nl << " align: "<< arg.align << "\n";
+ outs << spaces_nl << " bti: "<< arg.bti << "\n";
+ }
+
+ outs << spaces_nl << " Patches Number is " << patches.size() << "\n";
+ num = 0;
+ for (auto patch : patches) {
+ num++;
+ outs << spaces_nl << " patch " << num << ":\n";
+ outs << spaces_nl << " type value: "<< patch.type << "\n";
+ outs << spaces_nl << " subtype value: "<< patch.subType << "\n";
+ outs << spaces_nl << " offset: "<< patch.offset << "\n";
+ }
+
+ if (samplerSet) {
+ samplerSet->printStatus(indent + 4, outs);
+ }
+
+ if (imageSet) {
+ imageSet->printStatus(indent + 4, outs);
+ }
+
+ outs << spaces << "++++++++++++ End Kernel ++++++++++++" << "\n";
+ }
+
+ /*********************** End of Program class member function *************************/
+
+#define REDEF_MATH_FUNC(x) "#ifdef "#x"\n#undef "#x"\n#endif\n#define "#x" __gen_ocl_internal_fastpath_"#x"\n"
+ std::string ocl_mathfunc_fastpath_str =
+ REDEF_MATH_FUNC(acosh)
+ REDEF_MATH_FUNC(asinh)
+ REDEF_MATH_FUNC(atanh)
+ REDEF_MATH_FUNC(cbrt)
+ REDEF_MATH_FUNC(cos)
+ REDEF_MATH_FUNC(cosh)
+ REDEF_MATH_FUNC(cospi)
+ REDEF_MATH_FUNC(exp)
+ REDEF_MATH_FUNC(exp10)
+ REDEF_MATH_FUNC(expm1)
+ REDEF_MATH_FUNC(fmod)
+ REDEF_MATH_FUNC(hypot)
+ REDEF_MATH_FUNC(ilogb)
+ REDEF_MATH_FUNC(ldexp)
+ REDEF_MATH_FUNC(log)
+ REDEF_MATH_FUNC(log2)
+ REDEF_MATH_FUNC(log10)
+ REDEF_MATH_FUNC(log1p)
+ REDEF_MATH_FUNC(logb)
+ REDEF_MATH_FUNC(remainder)
+ REDEF_MATH_FUNC(rootn)
+ REDEF_MATH_FUNC(sin)
+ REDEF_MATH_FUNC(sincos)
+ REDEF_MATH_FUNC(sinh)
+ REDEF_MATH_FUNC(sinpi)
+ REDEF_MATH_FUNC(tan)
+ REDEF_MATH_FUNC(tanh)
+ "\n"
+ ;
+
+ static void programDelete(gbe_program gbeProgram) {
+ gbe::Program *program = (gbe::Program*)(gbeProgram);
+ GBE_SAFE_DELETE(program);
+ }
+
+ static void programCleanLlvmResource(gbe_program gbeProgram) {
+ gbe::Program *program = (gbe::Program*)(gbeProgram);
+ program->CleanLlvmResource();
+ }
+
+#ifdef GBE_COMPILER_AVAILABLE
+ BVAR(OCL_OUTPUT_BUILD_LOG, false);
+ SVAR(OCL_PCH_PATH, PCH_OBJECT_DIR);
+ SVAR(OCL_PCM_PATH, PCM_OBJECT_DIR);
+
+ static bool buildModuleFromSource(const char* input, llvm::Module** out_module, llvm::LLVMContext* llvm_ctx, std::string options,
+ size_t stringSize, char *err, size_t *errSize) {
+ // Arguments to pass to the clang frontend
+ vector<const char *> args;
+ bool bFastMath = false;
+
+ vector<std::string> useless; //hold substrings to avoid c_str free
+ size_t start = 0, end = 0;
+ /* FIXME
+ clang unsupport options:
+ -cl-denorms-are-zero, -cl-strict-aliasing
+ -cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt
+ all support options, refer to clang/include/clang/Driver/Options.inc
+ */
+ //Handle -cl-opt-disable in llvmToGen, skip here
+ const std::string unsupportedOptions("-cl-denorms-are-zero, -cl-strict-aliasing, -cl-opt-disable,"
+ "-cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt");
+ bool useDefaultCLCVersion = true;
+ while (end != std::string::npos) {
+ end = options.find(' ', start);
+ std::string str = options.substr(start, end - start);
+ start = end + 1;
+ if(str.size() == 0)
+ continue;
+ if(str == "-cl-fast-relaxed-math") bFastMath = true;
+ if(unsupportedOptions.find(str) != std::string::npos)
+ continue;
+ if(str.find("-cl-std=") != std::string::npos) {
+ useDefaultCLCVersion = false;
+ if (str == "-cl-std=CL1.1")
+ args.push_back("-D__OPENCL_C_VERSION__=110");
+ else if (str == "-cl-std=CL1.2")
+ args.push_back("-D__OPENCL_C_VERSION__=120");
+ else {
+ if (err && stringSize > 0 && errSize)
+ *errSize = snprintf(err, stringSize, "Invalid build option: %s\n", str.c_str());
+ return false;
+ }
+ }
+ useless.push_back(str);
+ args.push_back(str.c_str());
+ }
+ if (useDefaultCLCVersion) {
+ args.push_back("-D__OPENCL_C_VERSION__=120");
+ args.push_back("-cl-std=CL1.2");
+ }
+ args.push_back("-mllvm");
+ args.push_back("-inline-threshold=200000");
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+ args.push_back("-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND");
+#endif
+ args.push_back("-emit-llvm");
+ // FIXME we haven't implement those builtin functions,
+ // so disable it currently.
+ args.push_back("-fno-builtin");
+ args.push_back("-disable-llvm-optzns");
+ if(bFastMath)
+ args.push_back("-D __FAST_RELAXED_MATH__=1");
+#if LLVM_VERSION_MINOR <= 2
+ args.push_back("-triple");
+ args.push_back("nvptx");
+#else
+ args.push_back("-x");
+ args.push_back("cl");
+ args.push_back("-triple");
+ args.push_back("spir");
+#endif /* LLVM_VERSION_MINOR <= 2 */
+ args.push_back(input);
+
+ // The compiler invocation needs a DiagnosticsEngine so it can report problems
+ std::string ErrorString;
+ llvm::raw_string_ostream ErrorInfo(ErrorString);
+ llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> DiagOpts = new clang::DiagnosticOptions();
+ DiagOpts->ShowCarets = false;
+ DiagOpts->ShowPresumedLoc = true;
+#if LLVM_VERSION_MINOR <= 1
+ args.push_back("-triple");
+ args.push_back("ptx32");
+
+ clang::TextDiagnosticPrinter *DiagClient =
+ new clang::TextDiagnosticPrinter(ErrorInfo, *DiagOpts)
+ llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> DiagID(new clang::DiagnosticIDs());
+ clang::DiagnosticsEngine Diags(DiagID, DiagClient);
+#else
+ args.push_back("-ffp-contract=off");
+
+ clang::TextDiagnosticPrinter *DiagClient =
+ new clang::TextDiagnosticPrinter(ErrorInfo, &*DiagOpts);
+ llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> DiagID(new clang::DiagnosticIDs());
+ clang::DiagnosticsEngine Diags(DiagID, &*DiagOpts, DiagClient);
+#endif /* LLVM_VERSION_MINOR <= 1 */
+ // Create the compiler invocation
+ std::unique_ptr<clang::CompilerInvocation> CI(new clang::CompilerInvocation);
+ clang::CompilerInvocation::CreateFromArgs(*CI,
+ &args[0],
+ &args[0] + args.size(),
+ Diags);
+
+ // Create the compiler instance
+ clang::CompilerInstance Clang;
+ Clang.setInvocation(CI.release());
+ // Get ready to report problems
+#if LLVM_VERSION_MINOR <= 2
+ Clang.createDiagnostics(args.size(), &args[0]);
+#else
+ Clang.createDiagnostics(DiagClient, false);
+#endif /* LLVM_VERSION_MINOR <= 2 */
+
+ Clang.getDiagnosticOpts().ShowCarets = false;
+ if (!Clang.hasDiagnostics())
+ return false;
+
+ // Set Language
+ clang::LangOptions & lang_opts = Clang.getLangOpts();
+ lang_opts.OpenCL = 1;
+
+ clang::PreprocessorOptions& prep_opt = Clang.getPreprocessorOpts();
+ prep_opt.DisablePCHValidation = 1;
+
+ //llvm flags need command line parsing to take effect
+ if (!Clang.getFrontendOpts().LLVMArgs.empty()) {
+ unsigned NumArgs = Clang.getFrontendOpts().LLVMArgs.size();
+ const char **Args = new const char*[NumArgs + 2];
+ Args[0] = "clang (LLVM option parsing)";
+ for (unsigned i = 0; i != NumArgs; ++i){
+ Args[i + 1] = Clang.getFrontendOpts().LLVMArgs[i].c_str();
+ }
+ Args[NumArgs + 1] = 0;
+ llvm::cl::ParseCommandLineOptions(NumArgs + 1, Args);
+ delete [] Args;
+ }
+
+ // Create an action and make the compiler instance carry it out
+ std::unique_ptr<clang::CodeGenAction> Act(new clang::EmitLLVMOnlyAction(llvm_ctx));
+
+ std::string dirs = OCL_PCM_PATH;
+ std::string pcmFileName;
+ std::istringstream idirs(dirs);
+ bool findPcm = false;
+
+ while (getline(idirs, pcmFileName, ':')) {
+ if(access(pcmFileName.c_str(), R_OK) == 0) {
+ findPcm |= true;
+ break;
+ }
+ }
+
+ GBE_ASSERT(findPcm && "Could not find pre compiled module library.\n");
+
+ Clang.getCodeGenOpts().LinkBitcodeFile = pcmFileName;
+ auto retVal = Clang.ExecuteAction(*Act);
+
+ if (err != NULL) {
+ GBE_ASSERT(errSize != NULL);
+ *errSize = ErrorString.copy(err, stringSize - 1, 0);
+ }
+
+ if (err == NULL || OCL_OUTPUT_BUILD_LOG) {
+ // flush the error messages to the errs() if there is no
+ // error string buffer.
+ llvm::errs() << ErrorString;
+ }
+ ErrorString.clear();
+ if (!retVal)
+ return false;
+
+ llvm::Module *module = Act->takeModule();
+
+ *out_module = module;
+ return true;
+ }
+
+ extern std::string ocl_stdlib_str;
+
+ BVAR(OCL_USE_PCH, true);
+ static void processSourceAndOption(const char *source,
+ const char *options,
+ const char *temp_header_path,
+ std::string& clOpt,
+ std::string& clName,
+ int& optLevel)
+ {
+ char clStr[] = "/tmp/XXXXXX.cl";
+ int clFd = mkstemps(clStr, 3);
+ clName = std::string(clStr);
+
+ FILE *clFile = fdopen(clFd, "w");
+ FATAL_IF(clFile == NULL, "Failed to open temporary file");
+
+ bool usePCH = OCL_USE_PCH;
+ bool findPCH = false;
+
+ /* Because our header file is so big, we want to avoid recompile the header from
+ scratch. We use the PCH support of Clang to save the huge compiling time.
+ We just use the most general build opt to build the PCH header file, so if
+ user pass new build options here, the PCH can not pass the Clang's compitable
+ validating. Clang will do three kinds of compatible check: Language Option,
+ Target Option and Preprocessing Option. Other kinds of options such as the
+ CodeGen options will not affect the AST result, so no need to check.
+
+ According to OpenCL 1.1's spec, the CL build options:
+ -D name=definition
+ If the definition is not used in our header, it is compitable
+
+ -cl-single-precision-constant
+ -cl-denorms-are-zero
+ -cl-std=
+ Language options, really affect.
+
+ -cl-opt-disable
+ -cl-mad-enable
+ -cl-no-signed-zeros
+ -cl-unsafe-math-optimizations
+ -cl-finite-math-only
+ -cl-fast-relaxed-math
+ CodeGen options, not affect
+
+ -Werror
+ -w
+ Our header should not block the compiling because of warning.
+
+ So we just disable the PCH validation of Clang and do the judgement by ourself. */
+
+ /* We always add -cl-kernel-arg-info to the options. This option just generate the arg
+ information for the backend, no other side effect and does not have performance issue. */
+ if (!options || !strstr(const_cast<char *>(options), "-cl-kernel-arg-info"))
+ clOpt += "-cl-kernel-arg-info ";
+
+ if (options) {
+ char *p;
+ /* FIXME: Though we can disable the pch valid check, and load pch successfully,
+ but these language opts and pre-defined macro will still generate the diag msg
+ to the diag engine of the Clang and cause the Clang to report error.
+ We filter them all here to avoid these. */
+ const char * incompatible_opts[] = {
+ "-cl-single-precision-constant",
+// "-cl-denorms-are-zero",
+ "-cl-fast-relaxed-math",
+ "-cl-std=CL1.1"
+ };
+ const char * incompatible_defs[] = {
+ "GET_FLOAT_WORD",
+ "__NV_CL_C_VERSION",
+ "GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND"
+ };
+
+ for (unsigned int i = 0; i < sizeof(incompatible_opts)/sizeof(char *); i++ ) {
+ p = strstr(const_cast<char *>(options), incompatible_opts[i]);
+ if (p) {
+ usePCH = false;
+ break;
+ }
+ }
+
+ if (usePCH) {
+ for (unsigned int i = 0; i < sizeof(incompatible_defs)/sizeof(char *); i++ ) {
+ p = strstr(const_cast<char *>(options), incompatible_defs[i]);
+ if (p) {
+ usePCH = false;
+ break;
+ }
+ }
+ }
+
+ p = strstr(const_cast<char *>(options), "-cl-opt-disable");
+ if (p)
+ optLevel = 0;
+ // XXX enable cl_khr_fp64 may cause some potential bugs.
+ // we may need to revisit here latter when we want to support fp64 completely.
+ // For now, as we don't support fp64 actually, just disable it by default.
+#if 0
+ #define ENABLE_CL_KHR_FP64_STR "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+ if (!strstr(const_cast<char *>(options), "-cl-std=CL1.1"))
+ fwrite(ENABLE_CL_KHR_FP64_STR, strlen(ENABLE_CL_KHR_FP64_STR), 1, clFile);
+#endif
+
+ clOpt += options;
+ }
+
+ std::string dirs = OCL_PCH_PATH;
+ std::istringstream idirs(dirs);
+ std::string pchFileName;
+
+ while (getline(idirs, pchFileName, ':')) {
+ if(access(pchFileName.c_str(), R_OK) == 0) {
+ findPCH = true;
+ break;
+ }
+ }
+
+ if (usePCH && findPCH) {
+ clOpt += " -include-pch ";
+ clOpt += pchFileName;
+ clOpt += " ";
+ } else
+ fwrite(ocl_stdlib_str.c_str(), strlen(ocl_stdlib_str.c_str()), 1, clFile);
+
+ //for clCompilerProgram usage.
+ if(temp_header_path){
+ clOpt += " -I ";
+ clOpt += temp_header_path;
+ clOpt += " ";
+ }
+
+ if (!OCL_STRICT_CONFORMANCE) {
+ fwrite(ocl_mathfunc_fastpath_str.c_str(), strlen(ocl_mathfunc_fastpath_str.c_str()), 1, clFile);
+ }
+
+ // reset the file number in case we have inserted something into the kernel
+ std::string resetFileNum = "#line 1\n";
+ fwrite(resetFileNum.c_str(), strlen(resetFileNum.c_str()), 1, clFile);
+
+ // Write the source to the cl file
+ fwrite(source, strlen(source), 1, clFile);
+ fclose(clFile);
+ }
+
+ static gbe_program programNewFromSource(uint32_t deviceID,
+ const char *source,
+ size_t stringSize,
+ const char *options,
+ char *err,
+ size_t *errSize)
+ {
+ int optLevel = 1;
+ std::string clOpt;
+ std::string clName;
+ processSourceAndOption(source, options, NULL, clOpt, clName, optLevel);
+
+ gbe_program p;
+ // will delete the module and act in GenProgram::CleanLlvmResource().
+ llvm::Module * out_module;
+ llvm::LLVMContext* llvm_ctx = new llvm::LLVMContext;
+
+ static std::mutex llvm_mutex;
+ if (!llvm::llvm_is_multithreaded())
+ llvm_mutex.lock();
+
+ if (buildModuleFromSource(clName.c_str(), &out_module, llvm_ctx, clOpt.c_str(),
+ stringSize, err, errSize)) {
+ // Now build the program from llvm
+ size_t clangErrSize = 0;
+ if (err != NULL) {
+ GBE_ASSERT(errSize != NULL);
+ stringSize -= *errSize;
+ err += *errSize;
+ clangErrSize = *errSize;
+ }
+
+ p = gbe_program_new_from_llvm(deviceID, NULL, out_module, llvm_ctx, stringSize,
+ err, errSize, optLevel);
+ if (err != NULL)
+ *errSize += clangErrSize;
+ if (OCL_OUTPUT_BUILD_LOG && options)
+ llvm::errs() << options;
+ } else
+ p = NULL;
+
+ if (!llvm::llvm_is_multithreaded())
+ llvm_mutex.unlock();
+
+ remove(clName.c_str());
+ return p;
+ }
+#endif
+
+#ifdef GBE_COMPILER_AVAILABLE
+
+ static gbe_program programCompileFromSource(uint32_t deviceID,
+ const char *source,
+ const char *temp_header_path,
+ size_t stringSize,
+ const char *options,
+ char *err,
+ size_t *errSize)
+ {
+ int optLevel = 1;
+ std::string clOpt;
+ std::string clName;
+ processSourceAndOption(source, options, temp_header_path, clOpt, clName, optLevel);
+
+ gbe_program p;
+ acquireLLVMContextLock();
+ //FIXME: if use new allocated context to link two modules there would be context mismatch
+ //for some functions, so we use global context now, need switch to new context later.
+ llvm::Module * out_module;
+ llvm::LLVMContext* llvm_ctx = &llvm::getGlobalContext();
+ if (buildModuleFromSource(clName.c_str(), &out_module, llvm_ctx, clOpt.c_str(),
+ stringSize, err, errSize)) {
+ // Now build the program from llvm
+ if (err != NULL) {
+ GBE_ASSERT(errSize != NULL);
+ stringSize -= *errSize;
+ err += *errSize;
+ }
+
+ p = gbe_program_new_gen_program(deviceID, out_module, NULL);
+
+ if (OCL_OUTPUT_BUILD_LOG && options)
+ llvm::errs() << options;
+ } else
+ p = NULL;
+ remove(clName.c_str());
+ releaseLLVMContextLock();
+ return p;
+ }
+#endif
+
+#ifdef GBE_COMPILER_AVAILABLE
+ static void programLinkProgram(gbe_program dst_program,
+ gbe_program src_program,
+ size_t stringSize,
+ char * err,
+ size_t * errSize)
+ {
+ acquireLLVMContextLock();
+
+ gbe_program_link_from_llvm(dst_program, src_program, stringSize, err, errSize);
+
+ releaseLLVMContextLock();
+
+ if (OCL_OUTPUT_BUILD_LOG && err)
+ llvm::errs() << err;
+ }
+#endif
+
+ static size_t programGetGlobalConstantSize(gbe_program gbeProgram) {
+ if (gbeProgram == NULL) return 0;
+ const gbe::Program *program = (const gbe::Program*) gbeProgram;
+ return program->getGlobalConstantSize();
+ }
+
+ static void programGetGlobalConstantData(gbe_program gbeProgram, char *mem) {
+ if (gbeProgram == NULL) return;
+ const gbe::Program *program = (const gbe::Program*) gbeProgram;
+ program->getGlobalConstantData(mem);
+ }
+
+ static uint32_t programGetKernelNum(gbe_program gbeProgram) {
+ if (gbeProgram == NULL) return 0;
+ const gbe::Program *program = (const gbe::Program*) gbeProgram;
+ return program->getKernelNum();
+ }
+
+ static gbe_kernel programGetKernelByName(gbe_program gbeProgram, const char *name) {
+ if (gbeProgram == NULL) return NULL;
+ const gbe::Program *program = (gbe::Program*) gbeProgram;
+ return (gbe_kernel) program->getKernel(std::string(name));
+ }
+
+ static gbe_kernel programGetKernel(const gbe_program gbeProgram, uint32_t ID) {
+ if (gbeProgram == NULL) return NULL;
+ const gbe::Program *program = (gbe::Program*) gbeProgram;
+ return (gbe_kernel) program->getKernel(ID);
+ }
+
+ static const char *kernelGetName(gbe_kernel genKernel) {
+ if (genKernel == NULL) return NULL;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getName();
+ }
+
+ static const char *kernelGetAttributes(gbe_kernel genKernel) {
+ if (genKernel == NULL) return NULL;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getFunctionAttributes();
+ }
+
+ static const char *kernelGetCode(gbe_kernel genKernel) {
+ if (genKernel == NULL) return NULL;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getCode();
+ }
+
+ static size_t kernelGetCodeSize(gbe_kernel genKernel) {
+ if (genKernel == NULL) return 0u;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getCodeSize();
+ }
+
+ static uint32_t kernelGetArgNum(gbe_kernel genKernel) {
+ if (genKernel == NULL) return 0u;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getArgNum();
+ }
+
+ static void *kernelGetArgInfo(gbe_kernel genKernel, uint32_t argID, uint32_t value) {
+ if (genKernel == NULL) return NULL;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ ir::FunctionArgument::InfoFromLLVM* info = kernel->getArgInfo(argID);
+
+ switch (value) {
+ case GBE_GET_ARG_INFO_ADDRSPACE:
+ return (void*)((unsigned long)info->addrSpace);
+ case GBE_GET_ARG_INFO_TYPE:
+ return (void *)(info->typeName.c_str());
+ case GBE_GET_ARG_INFO_ACCESS:
+ return (void *)(info->accessQual.c_str());
+ case GBE_GET_ARG_INFO_TYPEQUAL:
+ return (void *)(info->typeQual.c_str());
+ case GBE_GET_ARG_INFO_NAME:
+ return (void *)(info->argName.c_str());
+ default:
+ assert(0);
+ }
+
+ return NULL;
+ }
+
+ static uint32_t kernelGetArgSize(gbe_kernel genKernel, uint32_t argID) {
+ if (genKernel == NULL) return 0u;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getArgSize(argID);
+ }
+
+ static uint8_t kernelGetArgBTI(gbe_kernel genKernel, uint32_t argID) {
+ if (genKernel == NULL) return 0u;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getArgBTI(argID);
+ }
+
+ static uint32_t kernelGetArgAlign(gbe_kernel genKernel, uint32_t argID) {
+ if (genKernel == NULL) return 0u;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getArgAlign(argID);
+ }
+ static gbe_arg_type kernelGetArgType(gbe_kernel genKernel, uint32_t argID) {
+ if (genKernel == NULL) return GBE_ARG_INVALID;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getArgType(argID);
+ }
+
+ static uint32_t kernelGetSIMDWidth(gbe_kernel genKernel) {
+ if (genKernel == NULL) return GBE_ARG_INVALID;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getSIMDWidth();
+ }
+
+ static int32_t kernelGetCurbeOffset(gbe_kernel genKernel, gbe_curbe_type type, uint32_t subType) {
+ if (genKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getCurbeOffset(type, subType);
+ }
+
+ static int32_t kernelGetCurbeSize(gbe_kernel genKernel) {
+ if (genKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getCurbeSize();
+ }
+
+ static int32_t kernelGetStackSize(gbe_kernel genKernel) {
+ if (genKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getStackSize();
+ }
+
+ static int32_t kernelGetScratchSize(gbe_kernel genKernel) {
+ if (genKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getScratchSize();
+ }
+
+ static int32_t kernelUseSLM(gbe_kernel genKernel) {
+ if (genKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getUseSLM() ? 1 : 0;
+ }
+
+ static int32_t kernelGetSLMSize(gbe_kernel genKernel) {
+ if (genKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getSLMSize();
+ }
+
+ static size_t kernelGetSamplerSize(gbe_kernel gbeKernel) {
+ if (gbeKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ return kernel->getSamplerSize();
+ }
+
+ static void kernelGetSamplerData(gbe_kernel gbeKernel, uint32_t *samplers) {
+ if (gbeKernel == NULL) return;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ kernel->getSamplerData(samplers);
+ }
+
+ static uint32_t kernelGetPrintfNum(void * printf_info) {
+ if (printf_info == NULL) return 0;
+ const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+ return ps->getPrintfNum();
+ }
+
+ static void* kernelDupPrintfSet(gbe_kernel gbeKernel) {
+ if (gbeKernel == NULL) return NULL;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ return kernel->dupPrintfSet();
+ }
+
+ static uint8_t kernelGetPrintfBufBTI(void * printf_info) {
+ if (printf_info == NULL) return 0;
+ const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+ return ps->getBufBTI();
+ }
+
+ static uint8_t kernelGetPrintfIndexBufBTI(void * printf_info) {
+ if (printf_info == NULL) return 0;
+ const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+ return ps->getIndexBufBTI();
+ }
+
+ static void kernelReleasePrintfSet(void * printf_info) {
+ if (printf_info == NULL) return;
+ ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+ delete ps;
+ }
+
+ static uint32_t kernelGetPrintfSizeOfSize(void * printf_info) {
+ if (printf_info == NULL) return 0;
+ const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+ return ps->getPrintfSizeOfSize();
+ }
+
+ static void kernelOutputPrintf(void * printf_info, void* index_addr,
+ void* buf_addr, size_t global_wk_sz0,
+ size_t global_wk_sz1, size_t global_wk_sz2)
+ {
+ if (printf_info == NULL) return;
+ ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+ ps->outputPrintf(index_addr, buf_addr, global_wk_sz0,
+ global_wk_sz1, global_wk_sz2);
+ }
+
+ static void kernelGetCompileWorkGroupSize(gbe_kernel gbeKernel, size_t wg_size[3]) {
+ if (gbeKernel == NULL) return;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ kernel->getCompileWorkGroupSize(wg_size);
+ }
+
+ static size_t kernelGetImageSize(gbe_kernel gbeKernel) {
+ if (gbeKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ return kernel->getImageSize();
+ }
+
+ static void kernelGetImageData(gbe_kernel gbeKernel, ImageInfo *images) {
+ if (gbeKernel == NULL) return;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ kernel->getImageData(images);
+ }
+
+ static uint32_t kernelGetRequiredWorkGroupSize(gbe_kernel kernel, uint32_t dim) {
+ return 0u;
+ }
+} /* namespace gbe */
+
+std::mutex llvm_ctx_mutex;
+void acquireLLVMContextLock()
+{
+ llvm_ctx_mutex.lock();
+}
+
+void releaseLLVMContextLock()
+{
+ llvm_ctx_mutex.unlock();
+}
+
+GBE_EXPORT_SYMBOL gbe_program_new_from_source_cb *gbe_program_new_from_source = NULL;
+GBE_EXPORT_SYMBOL gbe_program_compile_from_source_cb *gbe_program_compile_from_source = NULL;
+GBE_EXPORT_SYMBOL gbe_program_link_program_cb *gbe_program_link_program = NULL;
+GBE_EXPORT_SYMBOL gbe_program_new_from_binary_cb *gbe_program_new_from_binary = NULL;
+GBE_EXPORT_SYMBOL gbe_program_new_from_llvm_binary_cb *gbe_program_new_from_llvm_binary = NULL;
+GBE_EXPORT_SYMBOL gbe_program_serialize_to_binary_cb *gbe_program_serialize_to_binary = NULL;
+GBE_EXPORT_SYMBOL gbe_program_new_from_llvm_cb *gbe_program_new_from_llvm = NULL;
+GBE_EXPORT_SYMBOL gbe_program_new_gen_program_cb *gbe_program_new_gen_program = NULL;
+GBE_EXPORT_SYMBOL gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm = NULL;
+GBE_EXPORT_SYMBOL gbe_program_build_from_llvm_cb *gbe_program_build_from_llvm = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data = NULL;
+GBE_EXPORT_SYMBOL gbe_program_clean_llvm_resource_cb *gbe_program_clean_llvm_resource = NULL;
+GBE_EXPORT_SYMBOL gbe_program_delete_cb *gbe_program_delete = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_kernel_by_name_cb *gbe_program_get_kernel_by_name = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_kernel_cb *gbe_program_get_kernel = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_name_cb *gbe_kernel_get_name = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_attributes_cb *gbe_kernel_get_attributes = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_code_cb *gbe_kernel_get_code = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_code_size_cb *gbe_kernel_get_code_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_num_cb *gbe_kernel_get_arg_num = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_info_cb *gbe_kernel_get_arg_info = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_bti_cb *gbe_kernel_get_arg_bti = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_align_cb *gbe_kernel_get_arg_align = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
+GBE_EXPORT_SYMBOL gbe_get_printf_num_cb *gbe_get_printf_num = NULL;
+GBE_EXPORT_SYMBOL gbe_dup_printfset_cb *gbe_dup_printfset = NULL;
+GBE_EXPORT_SYMBOL gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti = NULL;
+GBE_EXPORT_SYMBOL gbe_get_printf_indexbuf_bti_cb *gbe_get_printf_indexbuf_bti = NULL;
+GBE_EXPORT_SYMBOL gbe_release_printf_info_cb *gbe_release_printf_info = NULL;
+GBE_EXPORT_SYMBOL gbe_get_printf_sizeof_size_cb *gbe_get_printf_sizeof_size = NULL;
+GBE_EXPORT_SYMBOL gbe_output_printf_cb *gbe_output_printf = NULL;
+
+#ifdef GBE_COMPILER_AVAILABLE
+namespace gbe
+{
+ /* Use pre-main to setup the call backs */
+ struct CallBackInitializer
+ {
+ CallBackInitializer(void) {
+ gbe_program_new_from_source = gbe::programNewFromSource;
+ gbe_program_compile_from_source = gbe::programCompileFromSource;
+ gbe_program_link_program = gbe::programLinkProgram;
+ gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
+ gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+ gbe_program_clean_llvm_resource = gbe::programCleanLlvmResource;
+ gbe_program_delete = gbe::programDelete;
+ gbe_program_get_kernel_num = gbe::programGetKernelNum;
+ gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
+ gbe_program_get_kernel = gbe::programGetKernel;
+ gbe_kernel_get_name = gbe::kernelGetName;
+ gbe_kernel_get_attributes = gbe::kernelGetAttributes;
+ gbe_kernel_get_code = gbe::kernelGetCode;
+ gbe_kernel_get_code_size = gbe::kernelGetCodeSize;
+ gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
+ gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
+ gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
+ gbe_kernel_get_arg_bti = gbe::kernelGetArgBTI;
+ gbe_kernel_get_arg_type = gbe::kernelGetArgType;
+ gbe_kernel_get_arg_align = gbe::kernelGetArgAlign;
+ gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
+ gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
+ gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
+ gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
+ gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
+ gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
+ gbe_kernel_use_slm = gbe::kernelUseSLM;
+ gbe_kernel_get_slm_size = gbe::kernelGetSLMSize;
+ gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize;
+ gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
+ gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
+ gbe_kernel_get_image_size = gbe::kernelGetImageSize;
+ gbe_kernel_get_image_data = gbe::kernelGetImageData;
+ gbe_get_printf_num = gbe::kernelGetPrintfNum;
+ gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
+ gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
+ gbe_dup_printfset = gbe::kernelDupPrintfSet;
+ gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
+ gbe_release_printf_info = gbe::kernelReleasePrintfSet;
+ gbe_output_printf = gbe::kernelOutputPrintf;
+ genSetupCallBacks();
+ }
+
+ ~CallBackInitializer() {
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR > 3)
+ llvm::llvm_shutdown();
+#endif
+ }
+ };
+
+ static CallBackInitializer cbInitializer;
+} /* namespace gbe */
+#endif
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
new file mode 100644
index 0000000..1421993
--- /dev/null
+++ b/backend/src/backend/program.h
@@ -0,0 +1,358 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file program.h
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * C interface for the Gen kernels and programs (either real Gen ISA or Gen
+ * simulator). This is the only thing the run-time can see from the compiler
+ */
+
+#ifndef __GBE_PROGRAM_H__
+#define __GBE_PROGRAM_H__
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/*! Opaque structure that interfaces a GBE program */
+typedef struct _gbe_program *gbe_program;
+
+/*! Opaque structure that interfaces a GBE kernel (ie one OCL function) */
+typedef struct _gbe_kernel *gbe_kernel;
+
+/*! Argument type for each function call */
+enum gbe_arg_type {
+ GBE_ARG_VALUE = 0, // int, float and so on
+ GBE_ARG_GLOBAL_PTR = 1, // __global
+ GBE_ARG_CONSTANT_PTR = 2, // __constant
+ GBE_ARG_LOCAL_PTR = 3, // __local
+ GBE_ARG_IMAGE = 4, // image2d_t, image3d_t
+ GBE_ARG_SAMPLER = 5, // sampler_t
+ GBE_ARG_INVALID = 0xffffffff
+};
+
+/*! Get argument info values */
+enum gbe_get_arg_info_value {
+ GBE_GET_ARG_INFO_ADDRSPACE = 0,
+ GBE_GET_ARG_INFO_ACCESS = 1,
+ GBE_GET_ARG_INFO_TYPE = 2,
+ GBE_GET_ARG_INFO_TYPEQUAL = 3,
+ GBE_GET_ARG_INFO_NAME = 4,
+ GBE_GET_ARG_INFO_INVALID = 0xffffffff
+};
+
+// BTI magic number
+#define BTI_CONSTANT 0
+#define BTI_PRIVATE 1
+#define BTI_RESERVED_NUM 2
+#define BTI_MAX_IMAGE_NUM 128
+#define BTI_MAX_ID (BTI_MAX_IMAGE_NUM + BTI_RESERVED_NUM - 1)
+
+/*! Constant buffer values (ie values to setup in the constant buffer) */
+enum gbe_curbe_type {
+ GBE_CURBE_LOCAL_ID_X = 0,
+ GBE_CURBE_LOCAL_ID_Y,
+ GBE_CURBE_LOCAL_ID_Z,
+ GBE_CURBE_LOCAL_SIZE_X,
+ GBE_CURBE_LOCAL_SIZE_Y,
+ GBE_CURBE_LOCAL_SIZE_Z,
+ GBE_CURBE_GLOBAL_SIZE_X,
+ GBE_CURBE_GLOBAL_SIZE_Y,
+ GBE_CURBE_GLOBAL_SIZE_Z,
+ GBE_CURBE_GLOBAL_OFFSET_X,
+ GBE_CURBE_GLOBAL_OFFSET_Y,
+ GBE_CURBE_GLOBAL_OFFSET_Z,
+ GBE_CURBE_GROUP_NUM_X,
+ GBE_CURBE_GROUP_NUM_Y,
+ GBE_CURBE_GROUP_NUM_Z,
+ GBE_CURBE_WORK_DIM,
+ GBE_CURBE_IMAGE_INFO,
+ GBE_CURBE_STACK_POINTER,
+ GBE_CURBE_PRINTF_BUF_POINTER,
+ GBE_CURBE_PRINTF_INDEX_POINTER,
+ GBE_CURBE_KERNEL_ARGUMENT,
+ GBE_CURBE_EXTRA_ARGUMENT,
+ GBE_CURBE_BLOCK_IP,
+ GBE_CURBE_THREAD_NUM,
+ GBE_CURBE_ZERO,
+ GBE_CURBE_ONE,
+ GBE_CURBE_SLM_OFFSET,
+};
+
+/*! Extra arguments use the negative range of sub-values */
+enum gbe_extra_argument {
+ GBE_STACK_BUFFER = 0, /* Give stack location in curbe */
+ GBE_CONSTANT_BUFFER = 1 /* constant buffer argument location in curbe */
+};
+
+typedef struct ImageInfo {
+ int32_t arg_idx;
+ int32_t idx;
+ int32_t wSlot;
+ int32_t hSlot;
+ int32_t depthSlot;
+ int32_t dataTypeSlot;
+ int32_t channelOrderSlot;
+ int32_t dimOrderSlot;
+} ImageInfo;
+
+typedef void (gbe_set_image_base_index_cb)(uint32_t base_idx);
+extern gbe_set_image_base_index_cb *gbe_set_image_base_index;
+
+typedef uint32_t (gbe_get_image_base_index_cb)();
+extern gbe_get_image_base_index_cb *gbe_get_image_base_index;
+
+/*! Get the size of defined images */
+typedef size_t (gbe_kernel_get_image_size_cb)(gbe_kernel gbeKernel);
+extern gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size;
+
+/*! Get the content of defined images */
+typedef void (gbe_kernel_get_image_data_cb)(gbe_kernel gbeKernel, ImageInfo *images);
+extern gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data;
+
+/*! Get the printf number */
+typedef uint32_t (gbe_get_printf_num_cb)(void* printf_info);
+extern gbe_get_printf_num_cb *gbe_get_printf_num;
+
+/*! Get the printf buffer bti */
+typedef uint8_t (gbe_get_printf_buf_bti_cb)(void* printf_info);
+extern gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti;
+
+typedef uint8_t (gbe_get_printf_indexbuf_bti_cb)(void* printf_info);
+extern gbe_get_printf_indexbuf_bti_cb *gbe_get_printf_indexbuf_bti;
+
+/*! Release the printfset */
+typedef void (gbe_release_printf_info_cb)(void* printf_info);
+extern gbe_release_printf_info_cb *gbe_release_printf_info;
+
+/*! Dup the printf set */
+typedef void* (gbe_dup_printfset_cb)(gbe_kernel gbeKernel);
+extern gbe_dup_printfset_cb *gbe_dup_printfset;
+
+/*! Get the printf buffer const offset */
+typedef uint32_t (gbe_get_printf_sizeof_size_cb)(void* printf_info);
+extern gbe_get_printf_sizeof_size_cb *gbe_get_printf_sizeof_size;
+
+typedef void (gbe_output_printf_cb) (void* printf_info, void* index_addr, void* buf_addr,
+ size_t global_wk_sz0, size_t global_wk_sz1, size_t global_wk_sz2);
+extern gbe_output_printf_cb* gbe_output_printf;
+
+/*! Create a new program from the given source code (zero terminated string) */
+typedef gbe_program (gbe_program_new_from_source_cb)(uint32_t deviceID,
+ const char *source,
+ size_t stringSize,
+ const char *options,
+ char *err,
+ size_t *err_size);
+extern gbe_program_new_from_source_cb *gbe_program_new_from_source;
+/*! Create a new program from the given source code and compile it (zero terminated string) */
+typedef gbe_program (gbe_program_compile_from_source_cb)(uint32_t deviceID,
+ const char *source,
+ const char *temp_header_path,
+ size_t stringSize,
+ const char *options,
+ char *err,
+ size_t *err_size);
+extern gbe_program_compile_from_source_cb *gbe_program_compile_from_source;
+/*! link the programs. */
+typedef void (gbe_program_link_program_cb)(gbe_program dst_program,
+ gbe_program src_program,
+ size_t stringSize,
+ char * err,
+ size_t * errSize);
+extern gbe_program_link_program_cb *gbe_program_link_program;
+
+/*! create s new genprogram for link. */
+typedef gbe_program (gbe_program_new_gen_program_cb)(uint32_t deviceID,
+ const void *module,
+ const void *act);
+extern gbe_program_new_gen_program_cb *gbe_program_new_gen_program;
+
+/*! Create a new program from the given blob */
+typedef gbe_program (gbe_program_new_from_binary_cb)(uint32_t deviceID, const char *binary, size_t size);
+extern gbe_program_new_from_binary_cb *gbe_program_new_from_binary;
+
+/*! Create a new program from the llvm bitcode*/
+typedef gbe_program (gbe_program_new_from_llvm_binary_cb)(uint32_t deviceID, const char *binary, size_t size);
+extern gbe_program_new_from_llvm_binary_cb *gbe_program_new_from_llvm_binary;
+
+/*! Serialize a program to a bin, 0 means executable, 1 means llvm bitcode*/
+typedef size_t (gbe_program_serialize_to_binary_cb)(gbe_program program, char **binary, int binary_type);
+extern gbe_program_serialize_to_binary_cb *gbe_program_serialize_to_binary;
+
+/*! Create a new program from the given LLVM file */
+typedef gbe_program (gbe_program_new_from_llvm_cb)(uint32_t deviceID,
+ const char *fileName,
+ const void *module,
+ const void *llvm_ctx,
+ size_t string_size,
+ char *err,
+ size_t *err_size,
+ int optLevel);
+extern gbe_program_new_from_llvm_cb *gbe_program_new_from_llvm;
+
+/*! create s new genprogram for link. */
+typedef gbe_program (gbe_program_new_gen_program_cb)(uint32_t deviceID,
+ const void *module,
+ const void *act);
+extern gbe_program_new_gen_program_cb *gbe_program_new_gen_program;
+
+/*! link the programs from llvm level. */
+typedef void (gbe_program_link_from_llvm_cb)(gbe_program dst_program,
+ gbe_program src_program,
+ size_t stringSize,
+ char * err,
+ size_t * errSize);
+extern gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm;
+/* build the program to gen binary */
+typedef void gbe_program_build_from_llvm_cb(gbe_program program,
+ size_t stringSize,
+ char *err,
+ size_t *errSize,
+ const char * options);
+extern gbe_program_build_from_llvm_cb *gbe_program_build_from_llvm;
+
+/*! Get the size of global constants */
+typedef size_t (gbe_program_get_global_constant_size_cb)(gbe_program gbeProgram);
+extern gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size;
+
+/*! Get the content of global constants */
+typedef void (gbe_program_get_global_constant_data_cb)(gbe_program gbeProgram, char *mem);
+extern gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data;
+
+/*! Get the size of defined samplers */
+typedef size_t (gbe_kernel_get_sampler_size_cb)(gbe_kernel gbeKernel);
+extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
+
+/*! Get the content of defined samplers */
+typedef void (gbe_kernel_get_sampler_data_cb)(gbe_kernel gbeKernel, uint32_t *samplers);
+extern gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data;
+
+/*! Get the content of defined samplers */
+typedef void (gbe_kernel_get_compile_wg_size_cb)(gbe_kernel gbeKernel, size_t wg_sz[3]);
+extern gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size;
+
+/*! Clean LLVM resource of the given program */
+typedef void (gbe_program_clean_llvm_resource_cb)(gbe_program);
+extern gbe_program_clean_llvm_resource_cb *gbe_program_clean_llvm_resource;
+
+/*! Destroy and deallocate the given program */
+typedef void (gbe_program_delete_cb)(gbe_program);
+extern gbe_program_delete_cb *gbe_program_delete;
+
+/*! Get the number of functions in the program */
+typedef uint32_t (gbe_program_get_kernel_num_cb)(gbe_program);
+extern gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num;
+
+/*! Get the kernel from its name */
+typedef gbe_kernel (gbe_program_get_kernel_by_name_cb)(gbe_program, const char *name);
+extern gbe_program_get_kernel_by_name_cb *gbe_program_get_kernel_by_name;
+
+/*! Get the kernel from its ID */
+typedef gbe_kernel (gbe_program_get_kernel_cb)(gbe_program, uint32_t ID);
+extern gbe_program_get_kernel_cb *gbe_program_get_kernel;
+
+/*! Get the kernel name */
+typedef const char *(gbe_kernel_get_name_cb)(gbe_kernel);
+extern gbe_kernel_get_name_cb *gbe_kernel_get_name;
+
+/*! Get the kernel attributes*/
+typedef const char *(gbe_kernel_get_attributes_cb)(gbe_kernel);
+extern gbe_kernel_get_attributes_cb *gbe_kernel_get_attributes;
+
+/*! Get the kernel source code */
+typedef const char *(gbe_kernel_get_code_cb)(gbe_kernel);
+extern gbe_kernel_get_code_cb *gbe_kernel_get_code;
+
+/*! Get the size of the source code */
+typedef size_t (gbe_kernel_get_code_size_cb)(gbe_kernel);
+extern gbe_kernel_get_code_size_cb *gbe_kernel_get_code_size;
+
+/*! Get the total number of arguments */
+typedef uint32_t (gbe_kernel_get_arg_num_cb)(gbe_kernel);
+extern gbe_kernel_get_arg_num_cb *gbe_kernel_get_arg_num;
+
+/*! Get the argument info */
+typedef void* (gbe_kernel_get_arg_info_cb)(gbe_kernel, uint32_t argID, uint32_t value);
+extern gbe_kernel_get_arg_info_cb *gbe_kernel_get_arg_info;
+
+/*! Get the size of the given argument */
+typedef uint32_t (gbe_kernel_get_arg_size_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size;
+
+/*! Get the the bti of a __global buffer */
+typedef uint8_t (gbe_kernel_get_arg_bti_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_bti_cb *gbe_kernel_get_arg_bti;
+
+/*! Get the type of the given argument */
+typedef enum gbe_arg_type (gbe_kernel_get_arg_type_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type;
+
+/*! Get the align of the given argument */
+typedef uint32_t (gbe_kernel_get_arg_align_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_align_cb *gbe_kernel_get_arg_align;
+
+/*! Get the simd width for the kernel */
+typedef uint32_t (gbe_kernel_get_simd_width_cb)(gbe_kernel);
+extern gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width;
+
+/*! Get the curbe size required by the kernel */
+typedef int32_t (gbe_kernel_get_curbe_size_cb)(gbe_kernel);
+extern gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size;
+
+/*! Get the stack size (zero if no stack is required) */
+typedef int32_t (gbe_kernel_get_stack_size_cb)(gbe_kernel);
+extern gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size;
+
+/*! Get the scratch size (zero if no scratch is required) */
+typedef int32_t (gbe_kernel_get_scratch_size_cb)(gbe_kernel);
+extern gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size;
+
+/*! Get the curbe offset where to put the data. Returns -1 if not required */
+typedef int32_t (gbe_kernel_get_curbe_offset_cb)(gbe_kernel, enum gbe_curbe_type type, uint32_t sub_type);
+extern gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset;
+
+/*! Indicates if a work group size is required. Return the required width or 0
+ * if none
+ */
+typedef uint32_t (gbe_kernel_get_required_work_group_size_cb)(gbe_kernel, uint32_t dim);
+extern gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size;
+
+/*! Says if SLM is used. Required to reconfigure the L3 complex */
+typedef int32_t (gbe_kernel_use_slm_cb)(gbe_kernel);
+extern gbe_kernel_use_slm_cb *gbe_kernel_use_slm;
+/*! Get slm size needed for kernel local variables */
+typedef int32_t (gbe_kernel_get_slm_size_cb)(gbe_kernel);
+extern gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size;
+
+/*mutex to lock global llvmcontext access.*/
+extern void acquireLLVMContextLock();
+extern void releaseLLVMContextLock();
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* __GBE_PROGRAM_H__ */
+
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
new file mode 100644
index 0000000..56f60af
--- /dev/null
+++ b/backend/src/backend/program.hpp
@@ -0,0 +1,320 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file program.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_PROGRAM_HPP__
+#define __GBE_PROGRAM_HPP__
+
+#include "backend/program.h"
+#include "backend/context.hpp"
+#include "ir/constant.hpp"
+#include "ir/unit.hpp"
+#include "ir/function.hpp"
+#include "ir/printf.hpp"
+#include "ir/sampler.hpp"
+#include "sys/hash_map.hpp"
+#include "sys/vector.hpp"
+#include <string>
+
+namespace gbe {
+namespace ir {
+ class Unit; // Compilation unit. Contains the program to compile
+} /* namespace ir */
+} /* namespace gbe */
+
+namespace gbe {
+
+ /*! Info for the kernel argument */
+ struct KernelArgument {
+ gbe_arg_type type; //!< Pointer, structure, image, regular value?
+ uint32_t size; //!< Size of the argument
+ uint32_t align; //!< addr alignment of the argument
+ uint8_t bti; //!< binding table index for __global buffer
+ ir::FunctionArgument::InfoFromLLVM info;
+ };
+
+ /*! Stores the offset where to patch where to patch */
+ struct PatchInfo {
+ INLINE PatchInfo(gbe_curbe_type type, uint32_t subType = 0u, uint32_t offset = 0u) :
+ type(uint32_t(type)), subType(subType), offset(offset) {}
+ INLINE PatchInfo(void) {}
+ uint64_t type : 16; //!< Type of the patch (see program.h for the list)
+ uint64_t subType : 32; //!< Optional sub-type of the patch (see program.h)
+ uint64_t offset : 16; //!< Optional offset to encode
+ };
+
+ /*! We will sort PatchInfo to make binary search */
+ INLINE bool operator< (PatchInfo i0, PatchInfo i1) {
+ if (i0.type != i1.type) return i0.type < i1.type;
+ return i0.subType < i1.subType;
+ }
+
+ /*! Describe a compiled kernel */
+ class Kernel : public NonCopyable, public Serializable
+ {
+ public:
+ /*! Create an empty kernel with the given name */
+ Kernel(const std::string &name);
+ /*! Destroy it */
+ virtual ~Kernel(void);
+ /*! Return the instruction stream (to be implemented) */
+ virtual const char *getCode(void) const = 0;
+ /*! Set the instruction stream.*/
+ virtual const void setCode(const char *, size_t size) = 0;
+ /*! Return the instruction stream size (to be implemented) */
+ virtual size_t getCodeSize(void) const = 0;
+ /*! Get the kernel name */
+ INLINE const char *getName(void) const { return name.c_str(); }
+ /*! Return the number of arguments for the kernel call */
+ INLINE uint32_t getArgNum(void) const { return argNum; }
+ /*! Return the size of the given argument */
+ INLINE uint32_t getArgSize(uint32_t argID) const {
+ return argID >= argNum ? 0u : args[argID].size;
+ }
+ /*! Return the bti for __global buffer */
+ INLINE uint8_t getArgBTI(uint32_t argID) const {
+ return argID >= argNum ? 0u : args[argID].bti;
+ }
+ /*! Return the alignment of buffer argument */
+ INLINE uint32_t getArgAlign(uint32_t argID) const {
+ return argID >= argNum ? 0u : args[argID].align;
+ }
+ /*! Return the type of the given argument */
+ INLINE gbe_arg_type getArgType(uint32_t argID) const {
+ return argID >= argNum ? GBE_ARG_INVALID : args[argID].type;
+ }
+ /*! Get the offset where to patch. Returns -1 if no patch needed */
+ int32_t getCurbeOffset(gbe_curbe_type type, uint32_t subType) const;
+ /*! Get the curbe size required by the kernel */
+ INLINE uint32_t getCurbeSize(void) const { return this->curbeSize; }
+ /*! Return the size of the stack (zero if none) */
+ INLINE uint32_t getStackSize(void) const { return this->stackSize; }
+ /*! Return the size of the scratch memory needed (zero if none) */
+ INLINE uint32_t getScratchSize(void) const { return this->scratchSize; }
+ /*! Get the SIMD width for the kernel */
+ INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; }
+ /*! Says if SLM is needed for it */
+ INLINE bool getUseSLM(void) const { return this->useSLM; }
+ /*! get slm size for kernel local variable */
+ INLINE uint32_t getSLMSize(void) const { return this->slmSize; }
+ /*! Set sampler set. */
+ void setSamplerSet(ir::SamplerSet *from) {
+ samplerSet = from;
+ }
+ /*! Get defined sampler size */
+ size_t getSamplerSize(void) const { return (samplerSet == NULL ? 0 : samplerSet->getDataSize()); }
+ /*! Get defined sampler value array */
+ void getSamplerData(uint32_t *samplers) const { samplerSet->getData(samplers); }
+ /*! Set image set. */
+ void setImageSet(ir::ImageSet * from) {
+ imageSet = from;
+ }
+ /*! Set printf set. */
+ void setPrintfSet(ir::PrintfSet * from) {
+ printfSet = from;
+ }
+ /* ! Return the offset in the sizeof(xxx). */
+ uint32_t getPrintfSizeOfSize(void) const {
+ return printfSet ? printfSet->getPrintfSizeOfSize() : 0;
+ }
+ uint32_t getPrintfNum() const {
+ return printfSet ? printfSet->getPrintfNum() : 0;
+ }
+
+ void * dupPrintfSet() const {
+ void* ptr = printfSet ? (void *)(new ir::PrintfSet(*printfSet)) : NULL;
+ return ptr;
+ }
+ uint8_t getPrintfBufBTI() const {
+ GBE_ASSERT(printfSet);
+ return printfSet->getBufBTI();
+ }
+
+ uint8_t getPrintfIndexBufBTI() const {
+ GBE_ASSERT(printfSet);
+ return printfSet->getIndexBufBTI();
+ }
+
+ void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
+ size_t global_wk_sz1, size_t global_wk_sz2) {
+ if(printfSet)
+ printfSet->outputPrintf(index_addr, buf_addr, global_wk_sz0,
+ global_wk_sz1, global_wk_sz2);
+ }
+
+ ir::FunctionArgument::InfoFromLLVM* getArgInfo(uint32_t id) const { return &args[id].info; }
+
+ /*! Set compile work group size */
+ void setCompileWorkGroupSize(const size_t wg_sz[3]) {
+ compileWgSize[0] = wg_sz[0];
+ compileWgSize[1] = wg_sz[1];
+ compileWgSize[2] = wg_sz[2];
+ }
+ /*! Get compile work group size */
+ void getCompileWorkGroupSize (size_t wg_sz[3]) const {
+ wg_sz[0] = compileWgSize[0];
+ wg_sz[1] = compileWgSize[1];
+ wg_sz[2] = compileWgSize[2];
+ }
+ /*! Set function attributes string. */
+ void setFunctionAttributes(const std::string& functionAttributes) { this->functionAttributes= functionAttributes; }
+ /*! Get function attributes string. */
+ const char* getFunctionAttributes(void) const {return this->functionAttributes.c_str();}
+
+ /*! Get defined image size */
+ size_t getImageSize(void) const { return (imageSet == NULL ? 0 : imageSet->getDataSize()); }
+ /*! Get defined image value array */
+ void getImageData(ImageInfo *images) const { imageSet->getData(images); }
+
+ static const uint32_t magic_begin = TO_MAGIC('K', 'E', 'R', 'N');
+ static const uint32_t magic_end = TO_MAGIC('N', 'R', 'E', 'K');
+
+ /* format:
+ magic_begin |
+ name_size |
+ name |
+ arg_num |
+ args |
+ PatchInfo_num |
+ PatchInfo |
+ curbeSize |
+ simdWidth |
+ stackSize |
+ scratchSize |
+ useSLM |
+ slmSize |
+ samplers |
+ images |
+ code_size |
+ code |
+ magic_end
+ */
+
+ /*! Implements the serialization. */
+ virtual size_t serializeToBin(std::ostream& outs);
+ virtual size_t deserializeFromBin(std::istream& ins);
+ virtual void printStatus(int indent, std::ostream& outs);
+
+ protected:
+ friend class Context; //!< Owns the kernels
+ friend class GenContext;
+ std::string name; //!< Kernel name
+ KernelArgument *args; //!< Each argument
+ vector<PatchInfo> patches; //!< Indicates how to build the curbe
+ uint32_t argNum; //!< Number of function arguments
+ uint32_t curbeSize; //!< Size of the data to push
+ uint32_t simdWidth; //!< SIMD size for the kernel (lane number)
+ uint32_t stackSize; //!< Stack size (may be 0 if unused)
+ uint32_t scratchSize; //!< Scratch memory size (may be 0 if unused)
+ bool useSLM; //!< SLM requires a special HW config
+ uint32_t slmSize; //!< slm size for kernel variable
+ Context *ctx; //!< Save context after compiler to alloc constant buffer curbe
+ ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
+ ir::ImageSet *imageSet; //!< Copy from the corresponding function.
+ ir::PrintfSet *printfSet; //!< Copy from the corresponding function.
+ size_t compileWgSize[3]; //!< required work group size by kernel attribute.
+ std::string functionAttributes; //!< function attribute qualifiers combined.
+ GBE_CLASS(Kernel); //!< Use custom allocators
+ };
+
+ /*! Describe a compiled program */
+ class Program : public NonCopyable, public Serializable
+ {
+ public:
+ /*! Create an empty program */
+ Program(void);
+ /*! Destroy the program */
+ virtual ~Program(void);
+ /*! Clean LLVM resource of the program */
+ virtual void CleanLlvmResource() = 0;
+ /*! Get the number of kernels in the program */
+ uint32_t getKernelNum(void) const { return kernels.size(); }
+ /*! Get the kernel from its name */
+ Kernel *getKernel(const std::string &name) const {
+ auto it = kernels.find(name);
+ if (it == kernels.end())
+ return NULL;
+ else
+ return it->second;
+ }
+ /*! Get the kernel from its ID */
+ Kernel *getKernel(uint32_t ID) const {
+ uint32_t currID = 0;
+ Kernel *kernel = NULL;
+ for (const auto &pair : kernels) {
+ if (currID == ID) {
+ kernel = pair.second;
+ break;
+ }
+ currID++;
+ }
+ return kernel;
+ }
+ /*! Build a program from a ir::Unit */
+ bool buildFromUnit(const ir::Unit &unit, std::string &error);
+ /*! Buils a program from a LLVM source code */
+ bool buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel);
+ /*! Buils a program from a OCL string */
+ bool buildFromSource(const char *source, std::string &error);
+ /*! Get size of the global constant arrays */
+ size_t getGlobalConstantSize(void) const { return constantSet->getDataSize(); }
+ /*! Get the content of global constant arrays */
+ void getGlobalConstantData(char *mem) const { constantSet->getData(mem); }
+
+ static const uint32_t magic_begin = TO_MAGIC('P', 'R', 'O', 'G');
+ static const uint32_t magic_end = TO_MAGIC('G', 'O', 'R', 'P');
+
+ /* format:
+ magic_begin |
+ constantSet_flag |
+ constSet_data |
+ kernel_num |
+ kernel_1 |
+ ........ |
+ kernel_n |
+ magic_end |
+ total_size
+ */
+
+ /*! Implements the serialization. */
+ virtual size_t serializeToBin(std::ostream& outs);
+ virtual size_t deserializeFromBin(std::istream& ins);
+ virtual void printStatus(int indent, std::ostream& outs);
+
+ protected:
+ /*! Compile a kernel */
+ virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath) = 0;
+ /*! Allocate an empty kernel. */
+ virtual Kernel *allocateKernel(const std::string &name) = 0;
+ /*! Kernels sorted by their name */
+ hash_map<std::string, Kernel*> kernels;
+ /*! Global (constants) outside any kernel */
+ ir::ConstantSet *constantSet;
+ /*! Use custom allocators */
+ GBE_CLASS(Program);
+ };
+
+} /* namespace gbe */
+
+#endif /* __GBE_PROGRAM_HPP__ */
+
diff --git a/backend/src/builtin_vector_proto.def b/backend/src/builtin_vector_proto.def
new file mode 100644
index 0000000..18d23ca
--- /dev/null
+++ b/backend/src/builtin_vector_proto.def
@@ -0,0 +1,295 @@
+##math
+gentype acos (gentype)
+gentype acosh (gentype)
+gentype acospi (gentype x)
+gentype asin (gentype)
+gentype asinh (gentype)
+gentype asinpi (gentype x)
+gentype atan (gentype y_over_x)
+gentype atan2 (gentype y, gentype x)
+gentype atanh (gentype)
+gentype atanpi (gentype x)
+gentype atan2pi (gentype y, gentype x)
+gentype cbrt (gentype)
+gentype ceil (gentype)
+gentype copysign (gentype x, gentype y)
+gentype cos (gentype)
+gentype cosh (gentype)
+gentype cospi (gentype x)
+gentype erfc (gentype)
+gentype erf (gentype)
+gentype exp (gentype x)
+gentype exp2 (gentype)
+gentype exp10 (gentype)
+gentype expm1 (gentype x)
+gentype fabs (gentype)
+gentype fdim (gentype x, gentype y)
+gentype floor (gentype)
+# XXX we use madd for fma
+#gentype fma (gentype a, gentype b, gentype c)
+gentype fmax (gentype x, gentype y)
+gentypef fmax (gentypef x, float y)
+gentyped fmax (gentyped x, double y)
+gentype fmin (gentype x, gentype y)
+gentypef fmin (gentypef x, float y)
+gentyped fmin (gentyped x, double y)
+gentype fmod (gentype x, gentype y)
+gentype fract (gentype x, __global gentype *iptr)
+gentype fract (gentype x, __local gentype *iptr)
+gentype fract (gentype x, __private gentype *iptr)
+floatn frexp (floatn x, __global intn *exp)
+floatn frexp (floatn x, __local intn *exp)
+floatn frexp (floatn x, __private intn *exp)
+float frexp (float x, __global int *exp)
+float frexp (float x, __local int *exp)
+float frexp (float x, __private int *exp)
+doublen frexp (doublen x, __global intn *exp)
+doublen frexp (doublen x, __local intn *exp)
+doublen frexp (doublen x, __private intn *exp)
+double frexp (double x, __global int *exp)
+double frexp (double x, __local int *exp)
+double frexp (double x, __private int *exp)
+gentype hypot (gentype x, gentype y)
+intn ilogb (floatn x)
+int ilogb (float x)
+intn ilogb (doublen x)
+int ilogb (double x)
+floatn ldexp (floatn x, intn k)
+floatn ldexp (floatn x, int k)
+float ldexp (float x, int k)
+doublen ldexp (doublen x, intn k)
+doublen ldexp (doublen x, int k)
+double ldexp (double x, int k)
+gentype lgamma (gentype x)
+floatn lgamma_r (floatn x, __global intn *signp)
+floatn lgamma_r (floatn x, __local intn *signp)
+floatn lgamma_r (floatn x, __private intn *signp)
+float lgamma_r (float x, __global int *signp)
+float lgamma_r (float x, __local int *signp)
+float lgamma_r (float x, __private int *signp)
+#doublen lgamma_r (doublen x, __global intn *signp)
+#doublen lgamma_r (doublen x, __local intn *signp)
+#doublen lgamma_r (doublen x, __private intn *signp)
+#double lgamma_r (double x, __global int *signp)
+#double lgamma_r (double x, __local int *signp)
+#double lgamma_r (double x, __private int *signp)
+gentype log (gentype)
+gentype log2 (gentype)
+gentype log10 (gentype)
+gentype log1p (gentype x)
+gentype logb (gentype x)
+gentype mad (gentype a, gentype b, gentype c)
+gentype maxmag (gentype x, gentype y)
+gentype minmag (gentype x, gentype y)
+gentype modf (gentype x, __global gentype *iptr)
+gentype modf (gentype x, __local gentype *iptr)
+gentype modf (gentype x, __private gentype *iptr)
+floatn nan (uintn nancode)
+float nan (uint nancode)
+doublen nan (ulongn nancode)
+double nan (ulong nancode)
+gentype nextafter (gentype x, gentype y)
+gentype pow (gentype x, gentype y)
+floatn pown (floatn x, intn y)
+float pown (float x, int y)
+doublen pown (doublen x, intn y)
+double pown (double x, int y)
+#XXX we define powr as pow
+#gentype powr (gentype x, gentype y)
+gentype remainder (gentype x, gentype y)
+floatn remquo (floatn x, floatn y, __global intn *quo)
+floatn remquo (floatn x, floatn y, __local intn *quo)
+floatn remquo (floatn x, floatn y, __private intn *quo)
+float remquo (float x, float y, __global int *quo)
+float remquo (float x, float y, __local int *quo)
+float remquo (float x, float y, __private int *quo)
+doublen remquo (doublen x, doublen y, __global intn *quo)
+doublen remquo (doublen x, doublen y, __local intn *quo)
+doublen remquo (doublen x, doublen y, __private intn *quo)
+double remquo (double x, double y, __global int *quo)
+double remquo (double x, double y, __local int *quo)
+double remquo (double x, double y, __private int *quo)
+gentype rint (gentype)
+floatn rootn (floatn x, intn y)
+
+doublen rootn (doublen x, intn y)
+doublen rootn (double x, int y)
+gentype round (gentype x)
+gentype rsqrt (gentype)
+gentype sin (gentype)
+gentype sincos (gentype x, __global gentype *cosval)
+gentype sincos (gentype x, __local gentype *cosval)
+gentype sincos (gentype x, __private gentype *cosval)
+gentype sinh (gentype)
+gentype sinpi (gentype x)
+gentype sqrt (gentype)
+gentype tan (gentype)
+gentype tanh (gentype)
+gentype tanpi (gentype x)
+gentype tgamma (gentype)
+gentype trunc (gentype)
+
+##math function fast path
+gentype __gen_ocl_internal_fastpath_acosh (gentype x)
+gentype __gen_ocl_internal_fastpath_asinh (gentype x)
+gentype __gen_ocl_internal_fastpath_atanh (gentype x)
+gentype __gen_ocl_internal_fastpath_cbrt (gentype x)
+gentype __gen_ocl_internal_fastpath_cos (gentype x)
+gentype __gen_ocl_internal_fastpath_cosh (gentype x)
+gentype __gen_ocl_internal_fastpath_cospi (gentype x)
+gentype __gen_ocl_internal_fastpath_exp (gentype x)
+gentype __gen_ocl_internal_fastpath_exp10 (gentype x)
+gentype __gen_ocl_internal_fastpath_expm1 (gentype x)
+gentype __gen_ocl_internal_fastpath_fmod (gentype x, gentype y)
+gentype __gen_ocl_internal_fastpath_hypot (gentype x, gentype y)
+intn __gen_ocl_internal_fastpath_ilogb (floatn x)
+int __gen_ocl_internal_fastpath_ilogb (float x)
+intn __gen_ocl_internal_fastpath_ilogb (doublen x)
+int __gen_ocl_internal_fastpath_ilogb (double x)
+floatn __gen_ocl_internal_fastpath_ldexp (floatn x, intn k)
+floatn __gen_ocl_internal_fastpath_ldexp (floatn x, int k)
+float __gen_ocl_internal_fastpath_ldexp (float x, int k)
+doublen __gen_ocl_internal_fastpath_ldexp (doublen x, intn k)
+doublen __gen_ocl_internal_fastpath_ldexp (doublen x, int k)
+double __gen_ocl_internal_fastpath_ldexp (double x, int k)
+gentype __gen_ocl_internal_fastpath_log (gentype x)
+gentype __gen_ocl_internal_fastpath_log2 (gentype x)
+gentype __gen_ocl_internal_fastpath_log10 (gentype x)
+gentype __gen_ocl_internal_fastpath_log1p (gentype x)
+gentype __gen_ocl_internal_fastpath_logb (gentype x)
+gentype __gen_ocl_internal_fastpath_remainder (gentype x, gentype y)
+floatn __gen_ocl_internal_fastpath_rootn (floatn x, intn k)
+gentype __gen_ocl_internal_fastpath_sin (gentype x)
+gentype __gen_ocl_internal_fastpath_sincos (gentype x, __global gentype *cosval)
+gentype __gen_ocl_internal_fastpath_sincos (gentype x, __local gentype *cosval)
+gentype __gen_ocl_internal_fastpath_sincos (gentype x, __private gentype *cosval)
+gentype __gen_ocl_internal_fastpath_sinh (gentype x)
+gentype __gen_ocl_internal_fastpath_sinpi (gentype x)
+gentype __gen_ocl_internal_fastpath_tan (gentype x)
+gentype __gen_ocl_internal_fastpath_tanh (gentype x)
+
+##half_native_math
+#gentype half_cos (gentype x)
+#gentype half_divide (gentype x, gentype y)
+#gentype half_exp (gentype x)
+#gentype half_exp2 (gentype x)
+#gentype half_exp10 (gentype x)
+#gentype half_log (gentype x)
+#gentype half_log2 (gentype x)
+#gentype half_log10 (gentype x)
+#gentype half_powr (gentype x, gentype y)
+#gentype half_recip (gentype x)
+#gentype half_rsqrt (gentype x)
+#gentype half_sin (gentype x)
+#gentype half_sqrt (gentype x)
+#gentype half_tan (gentype x)
+
+# XXX we already defined all native and non-native
+# functions to the same one.
+gentype native_cos (gentype x)
+gentype native_divide (gentype x, gentype y)
+gentype native_exp (gentype x)
+#gentype native_exp2 (gentype x)
+gentype native_exp10 (gentype x)
+gentype native_log (gentype x)
+gentype native_log2 (gentype x)
+gentype native_log10 (gentype x)
+gentype native_powr (gentype x, gentype y)
+gentype native_recip (gentype x)
+gentype native_rsqrt (gentype x)
+gentype native_sin (gentype x)
+#gentype native_sqrt (gentype x)
+gentype native_tan (gentype x)
+
+##integer
+ugentype abs (gentype x)
+ugentype abs_diff (gentype x, gentype y)
+gentype add_sat (gentype x, gentype y)
+gentype hadd (gentype x, gentype y)
+gentype rhadd (gentype x, gentype y)
+gentype clamp (gentype x, gentype minval, gentype maxval)
+gentype clamp (gentype x, sgentype minval, sgentype maxval)
+gentype clz (gentype x)
+gentype mad_hi (gentype a, gentype b, gentype c)
+gentype mad_sat (gentype a, gentype b, gentype c)
+gentype max (gentype x, gentype y)
+gentype max (gentype x, sgentype y)
+gentype min (gentype x, gentype y)
+gentype min (gentype x, sgentype y)
+gentype mul_hi (gentype x, gentype y)
+gentype rotate (gentype v, gentype i)
+gentype sub_sat (gentype x, gentype y)
+shortn upsample (charn hi, ucharn lo)
+ushortn upsample (ucharn hi, ucharn lo)
+intn upsample (shortn hi, ushortn lo)
+uintn upsample (ushortn hi, ushortn lo)
+longn upsample (intn hi, uintn lo)
+ulongn upsample (uintn hi, uintn lo)
+# XXX not implemented
+#gentype popcount (gentype x)
+
+##fast_integer
+gentype mad24 (gentype x, gentype y, gentype z)
+gentype mul24 (gentype x, gentype y)
+
+##common
+gentype clamp (gentype x, gentype minval, gentype maxval)
+gentypef clamp (gentypef x, float minval, float maxval)
+gentyped clamp (gentyped x, double minval, double maxval)
+gentype degrees (gentype radians)
+gentype max (gentype x, gentype y)
+gentypef max (gentypef x, float y)
+gentyped max (gentyped x, double y)
+gentype min (gentype x, gentype y)
+gentypef min (gentypef x, float y)
+gentyped min (gentyped x, double y)
+gentype mix (gentype x, gentype y, gentype a)
+gentypef mix (gentypef x, gentypef y, float a)
+gentyped mix (gentyped x, gentyped y, double a)
+gentype radians (gentype degrees)
+gentype step (gentype edge, gentype x)
+gentypef step (float edge, gentypef x)
+gentyped step (double edge, gentyped x)
+gentype smoothstep (gentype edge0, gentype edge1, gentype x)
+gentypef smoothstep (float edge0, float edge1, gentypef x)
+gentyped smoothstep (double edge0, double edge1, gentyped x)
+gentype sign (gentype x)
+
+##relational
+intn isequal (floatn x, floatn y)
+longn isequal (doublen x, doublen y)
+intn isnotequal (floatn x, floatn y)
+longn isnotequal (doublen x, doublen y)
+intn isgreater (floatn x, floatn y)
+longn isgreater (doublen x, doublen y)
+intn isgreaterequal (floatn x, floatn y)
+longn isgreaterequal (doublen x, doublen y)
+intn isless (floatn x, floatn y)
+longn isless (doublen x, doublen y)
+intn islessequal (floatn x, floatn y)
+longn islessequal (doublen x, doublen y)
+intn islessgreater (floatn x, floatn y)
+longn islessgreater (doublen x, doublen y)
+intn isfinite (floatn
+longn isfinite (doublen)
+intn isinf (floatn)
+longn isinf (doublen)
+intn isnan (floatn)
+longn isnan (doublen)
+intn isnormal (floatn)
+longn isnormal (doublen)
+intn isordered (floatn x, floatn y)
+longn isordered (doublen x, doublen y)
+intn isunordered (floatn x, floatn y)
+longn isunordered (doublen x, doublen y)
+intn signbit (floatn)
+longn signbit (doublen)
+int any (igentype x)
+int all (igentype x)
+gentype bitselect (gentype a, gentype b, gentype c)
+gentype select (gentype a, gentype b, igentype c)
+gentype select (gentype a, gentype b, ugentype c)
+
+##misc
+#gentypen shuffle (gentypem x, ugentypen mask)
+#gentypen shuffle2 (gentypem x, gentypem y, ugentypen mask)
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
new file mode 100644
index 0000000..79e3935
--- /dev/null
+++ b/backend/src/gbe_bin_generater.cpp
@@ -0,0 +1,437 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*******************************************************************************
+ This file is used to generating the gbe kernel binary. These binary may be
+ used in CL API, such as enqueue memory We generate the binary in build time
+ to improve the performance.
+ *******************************************************************************/
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <fstream>
+#include <deque>
+#include <vector>
+#include <algorithm>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "backend/program.h"
+#include "backend/program.hpp"
+#include "backend/src/sys/platform.hpp"
+#include "src/cl_device_data.h"
+
+using namespace std;
+
+#define FILE_NOT_FIND_ERR 1
+#define FILE_MAP_ERR 2
+#define FILE_BUILD_FAILED 3
+#define FILE_SERIALIZATION_FAILED 4
+
+static uint32_t gen_pci_id = 0;
+
+class program_build_instance {
+
+protected:
+ string prog_path;
+ string build_opt;
+ static string bin_path;
+ static bool str_fmt_out;
+ int fd;
+ int file_len;
+ const char* code;
+ gbe::Program* gbe_prog;
+
+public:
+ program_build_instance (void) : fd(-1), file_len(0), code(NULL), gbe_prog(NULL) { }
+ explicit program_build_instance (const char* file_path, const char* option = NULL)
+ : prog_path(file_path), build_opt(option), fd(-1), file_len(0),
+ code(NULL), gbe_prog(NULL) { }
+
+ ~program_build_instance () {
+ if (code) {
+ munmap((void *)(code), file_len);
+ code = NULL;
+ }
+
+ if (fd >= 0)
+ close(fd);
+
+ if (gbe_prog)
+ gbe_program_delete(reinterpret_cast<gbe_program>(gbe_prog));
+ }
+
+ program_build_instance(program_build_instance&& other) = default;
+#if 0
+ {
+#define SWAP(ELT) \
+ do { \
+ auto elt = this->ELT; \
+ this->ELT = other.ELT; \
+ other.ELT = elt; \
+ } while(0)
+
+ SWAP(fd);
+ SWAP(code);
+ SWAP(file_len);
+ SWAP(prog_path);
+ SWAP(build_opt);
+#undef SWAP
+ }
+#endif
+
+ explicit program_build_instance(const program_build_instance& other) = delete;
+ program_build_instance& operator= (const program_build_instance& other) {
+ /* we do not want to be Lvalue copied, but operator is needed to instance the
+ template of vector<program_build_instance>. */
+ assert(1);
+ return *this;
+ }
+
+ const char* file_map_open (void) throw (int);
+
+ const char* get_code (void) {
+ return code;
+ }
+
+ const string& get_program_path (void) {
+ return prog_path;
+ }
+
+ int get_size (void) {
+ return file_len;
+ }
+
+ void print_file (void) {
+ cout << code << endl;
+ }
+
+ void dump (void) {
+ cout << "program path: " << prog_path << endl;
+ cout << "Build option: " << build_opt << endl;
+ print_file();
+ }
+
+ static void set_str_fmt_out (bool flag) {
+ str_fmt_out = flag;
+ }
+
+ static int set_bin_path (const char* path) {
+ if (bin_path.size())
+ return 0;
+
+ bin_path = path;
+ return 1;
+ }
+
+ void build_program(void) throw(int);
+ void serialize_program(void) throw(int);
+};
+
+string program_build_instance::bin_path;
+bool program_build_instance::str_fmt_out = false;
+#define OUTS_UPDATE_SZ(elt) SERIALIZE_OUT(elt, oss, header_sz)
+#define OUTF_UPDATE_SZ(elt) SERIALIZE_OUT(elt, ofs, header_sz)
+
+void program_build_instance::serialize_program(void) throw(int)
+{
+ ofstream ofs;
+ ostringstream oss;
+ size_t sz = 0, header_sz = 0;
+ ofs.open(bin_path, ofstream::out | ofstream::trunc | ofstream::binary);
+
+ char src_hw_info[4]="";
+ if(IS_IVYBRIDGE(gen_pci_id)){
+ src_hw_info[0]='I';
+ src_hw_info[1]='V';
+ src_hw_info[2]='B';
+ if(IS_BAYTRAIL_T(gen_pci_id)){
+ src_hw_info[0]='B';
+ src_hw_info[1]='Y';
+ src_hw_info[2]='T';
+ }
+ }else if(IS_HASWELL(gen_pci_id)){
+ src_hw_info[0]='H';
+ src_hw_info[1]='S';
+ src_hw_info[2]='W';
+ }
+
+ if (str_fmt_out) {
+
+ if(gen_pci_id){
+ //add header to differeciate from llvm bitcode binary.
+ // (5 bytes: 1 byte for binary type, 4 byte for bc code, 'GENC' is for gen binary.)
+ char gen_header[6] = "\0GENC";
+ OUTS_UPDATE_SZ(gen_header[0]);
+ OUTS_UPDATE_SZ(gen_header[1]);
+ OUTS_UPDATE_SZ(gen_header[2]);
+ OUTS_UPDATE_SZ(gen_header[3]);
+ OUTS_UPDATE_SZ(gen_header[4]);
+ OUTS_UPDATE_SZ(src_hw_info[0]);
+ OUTS_UPDATE_SZ(src_hw_info[1]);
+ OUTS_UPDATE_SZ(src_hw_info[2]);
+ }
+
+ string array_name = "Unknown_name_array";
+ unsigned long last_slash = bin_path.rfind("/");
+ unsigned long last_dot = bin_path.rfind(".");
+
+ if (last_slash != string::npos && last_dot != string::npos)
+ array_name = bin_path.substr(last_slash + 1, last_dot - 1 - last_slash);
+
+ ofs << "#include <stddef.h>" << "\n";
+ ofs << "char " << array_name << "[] = {" << "\n";
+
+ if(gen_pci_id){
+ sz = gbe_prog->serializeToBin(oss);
+ sz += header_sz;
+ }else{
+ char *llvm_binary;
+ size_t bin_length = gbe_program_serialize_to_binary((gbe_program)gbe_prog, &llvm_binary, 1);
+ oss.write(llvm_binary, bin_length);
+ sz += bin_length;
+ }
+
+ for (size_t i = 0; i < sz; i++) {
+ unsigned char c = oss.str().c_str()[i];
+ char asic_str[9];
+ sprintf(asic_str, "%2.2x", c);
+ ofs << "0x";
+ ofs << asic_str << ((i == sz - 1) ? "" : ", ");
+ }
+ ofs << "};\n";
+
+ string array_size = array_name + "_size";
+ ofs << "size_t " << array_size << " = " << sz << ";" << "\n";
+ } else {
+ if(gen_pci_id){
+ //add header to differeciate from llvm bitcode binary.
+ // (5 bytes: 1 byte for binary type, 4 byte for bc code, 'GENC' is for gen binary.)
+ char gen_header[6] = "\0GENC";
+ OUTF_UPDATE_SZ(gen_header[0]);
+ OUTF_UPDATE_SZ(gen_header[1]);
+ OUTF_UPDATE_SZ(gen_header[2]);
+ OUTF_UPDATE_SZ(gen_header[3]);
+ OUTF_UPDATE_SZ(gen_header[4]);
+ OUTF_UPDATE_SZ(src_hw_info[0]);
+ OUTF_UPDATE_SZ(src_hw_info[1]);
+ OUTF_UPDATE_SZ(src_hw_info[2]);
+ sz = gbe_prog->serializeToBin(ofs);
+ }else{
+ char *llvm_binary;
+ size_t bin_length = gbe_program_serialize_to_binary((gbe_program)gbe_prog, &llvm_binary, 1);
+ ofs.write(llvm_binary, bin_length);
+ sz+=bin_length;
+ }
+ }
+
+ ofs.close();
+
+ if (!sz) {
+ throw FILE_SERIALIZATION_FAILED;
+ }
+}
+
+
+void program_build_instance::build_program(void) throw(int)
+{
+ gbe_program opaque = NULL;
+ if(gen_pci_id){
+ opaque = gbe_program_new_from_source(gen_pci_id, code, 0, build_opt.c_str(), NULL, NULL);
+ }else{
+ opaque = gbe_program_compile_from_source(0, code, NULL, 0, build_opt.c_str(), NULL, NULL);
+ }
+ if (!opaque)
+ throw FILE_BUILD_FAILED;
+
+ gbe_prog = reinterpret_cast<gbe::Program*>(opaque);
+
+ if(gen_pci_id){
+ assert(gbe_program_get_kernel_num(opaque));
+ }
+}
+
+const char* program_build_instance::file_map_open(void) throw(int)
+{
+ void * address;
+
+ /* Open the file */
+ fd = ::open(prog_path.c_str(), O_RDONLY);
+ if (fd < 0) {
+ throw FILE_NOT_FIND_ERR;
+ }
+
+ /* Map it */
+ file_len = lseek(fd, 0, SEEK_END);
+ lseek(fd, 0, SEEK_SET);
+ address = mmap(0, file_len, PROT_READ, MAP_SHARED, fd, 0);
+ if (address == NULL) {
+ throw FILE_MAP_ERR;
+ }
+
+ code = reinterpret_cast<const char*>(address);
+ return code;
+}
+
+typedef vector<program_build_instance> prog_vector;
+
+int main (int argc, const char **argv)
+{
+ prog_vector prog_insts;
+ vector<string> argv_saved;
+ const char* build_opt;
+ const char* file_path;
+ int i;
+ int oc;
+ deque<int> used_index;
+
+ if (argc < 2) {
+ cout << "Usage: kernel_path [-pbuild_parameter]\n[-obin_path]" << endl;
+ return 0;
+ }
+
+ used_index.assign(argc, 0);
+
+ /* because getopt will re-sort the argv, so we save here. */
+ for (i=0; i< argc; i++) {
+ argv_saved.push_back(string(argv[i]));
+ }
+
+ while ( (oc = getopt(argc, (char * const *)argv, "t:o:p:s")) != -1 ) {
+ switch (oc) {
+ case 'p':
+ {
+ int opt_index;
+
+ if (argv[optind-1][0] == '-') {// -pXXX like
+ opt_index = optind - 1;
+ } else { // Must be -p XXXX mode
+ opt_index = optind - 2;
+ used_index[opt_index + 1] = 1;
+ }
+
+ /* opt must follow the file name.*/
+ if ((opt_index < 2 ) || argv[opt_index-1][0] == '-') {
+ cout << "Usage note: Building option must follow file name" << endl;
+ return 1;
+ }
+
+ file_path = argv[opt_index - 1];
+ build_opt = optarg;
+
+ prog_insts.push_back(program_build_instance(file_path, build_opt));
+ break;
+ }
+
+ case 'o':
+ if (!program_build_instance::set_bin_path(optarg)) {
+ cout << "Can not specify the bin path more than once." << endl;
+ return 1;
+ }
+ used_index[optind-1] = 1;
+ break;
+
+ case 't':
+ {
+ char *s = optarg;
+ if (optarg[0] == '0' && (optarg[1] == 'x' || optarg[1] == 'X'))
+ s += 2;
+
+ if (s[0] < '0' || s[0] > '9') {
+ cout << "Invalid target option argument" << endl;
+ return 1;
+ }
+
+ std::stringstream str(s);
+ str >> std::hex >> gen_pci_id;
+
+ used_index[optind-1] = 1;
+ break;
+ }
+
+ case 's':
+ program_build_instance::set_str_fmt_out(true);
+ used_index[optind-1] = 1;
+ break;
+
+ case ':':
+ cout << "Miss the file option argument" << endl;
+ return 1;
+
+ default:
+ cout << "Unknown opt" << endl;
+ }
+ }
+
+ for (i=1; i < argc; i++) {
+ //cout << argv_saved[i] << endl;
+ if (argv_saved[i].size() && argv_saved[i][0] != '-') {
+ if (used_index[i])
+ continue;
+
+ string file_name = argv_saved[i];
+ prog_vector::iterator result = find_if(prog_insts.begin(), prog_insts.end(),
+ [&](program_build_instance & prog_inst)-> bool {
+ bool result = false;
+ if (prog_inst.get_program_path() == file_name)
+ result = true;
+
+ return result;
+ });
+
+ if (result == prog_insts.end()) {
+ prog_insts.push_back(program_build_instance(file_name.c_str(), ""));
+ }
+ }
+ }
+
+ for (auto& inst : prog_insts) {
+ try {
+ inst.file_map_open();
+ inst.build_program();
+ inst.serialize_program();
+ }
+ catch (int & err_no) {
+ if (err_no == FILE_NOT_FIND_ERR) {
+ cout << "can not open the file " <<
+ inst.get_program_path() << endl;
+ } else if (err_no == FILE_MAP_ERR) {
+ cout << "map the file " <<
+ inst.get_program_path() << " failed" << endl;
+ } else if (err_no == FILE_BUILD_FAILED) {
+ cout << "build the file " <<
+ inst.get_program_path() << " failed" << endl;
+ } else if (err_no == FILE_SERIALIZATION_FAILED) {
+ cout << "Serialize the file " <<
+ inst.get_program_path() << " failed" << endl;
+ }
+ return -1;
+ }
+ }
+
+ //for (auto& inst : prog_insts) {
+ // inst.dump();
+ //}
+
+ return 0;
+}
diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp
new file mode 100644
index 0000000..1c67a4b
--- /dev/null
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "sys/alloc.cpp"
+#include "sys/cvar.cpp"
+#include "sys/assert.cpp"
+#include "sys/platform.cpp"
+#include "ir/constant.cpp"
+#include "ir/printf.cpp"
+
+#pragma GCC diagnostic ignored "-Wunused-function"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#undef GBE_COMPILER_AVAILABLE
+#include "backend/program.cpp"
+#include "backend/gen_program.cpp"
+#include "ir/sampler.cpp"
+#include "ir/image.cpp"
+
+struct BinInterpCallBackInitializer
+{
+ BinInterpCallBackInitializer() {
+ gbe_program_new_from_binary = gbe::genProgramNewFromBinary;
+ gbe_program_get_kernel_num = gbe::programGetKernelNum;
+ gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
+ gbe_program_get_kernel = gbe::programGetKernel;
+ gbe_kernel_get_code_size = gbe::kernelGetCodeSize;
+ gbe_kernel_get_code = gbe::kernelGetCode;
+ gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
+ gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
+ gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize;
+ gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
+ gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
+ gbe_kernel_get_image_size = gbe::kernelGetImageSize;
+ gbe_kernel_get_name = gbe::kernelGetName;
+ gbe_kernel_get_attributes = gbe::kernelGetAttributes;
+ gbe_kernel_get_arg_type = gbe::kernelGetArgType;
+ gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
+ gbe_kernel_get_arg_bti = gbe::kernelGetArgBTI;
+ gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
+ gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
+ gbe_kernel_use_slm = gbe::kernelUseSLM;
+ gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
+ gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
+ gbe_kernel_get_slm_size = gbe::kernelGetSLMSize;
+ gbe_kernel_get_arg_align = gbe::kernelGetArgAlign;
+ gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
+ gbe_program_delete = gbe::programDelete;
+ gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+ gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
+ gbe_kernel_get_image_data = gbe::kernelGetImageData;
+ gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
+ gbe_get_printf_num = gbe::kernelGetPrintfNum;
+ gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
+ gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
+ gbe_dup_printfset = gbe::kernelDupPrintfSet;
+ gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
+ gbe_release_printf_info = gbe::kernelReleasePrintfSet;
+ gbe_output_printf = gbe::kernelOutputPrintf;
+ }
+
+ ~BinInterpCallBackInitializer() {
+ }
+};
+
+static struct BinInterpCallBackInitializer binInterpCB;
diff --git a/backend/src/gen_as.sh b/backend/src/gen_as.sh
new file mode 100755
index 0000000..7dea15d
--- /dev/null
+++ b/backend/src/gen_as.sh
@@ -0,0 +1,101 @@
+#! /bin/sh -e
+
+. ./genconfig.sh
+
+# Generate list of union sizes
+for type in $TYPES; do
+ size=`IFS=:; set -- dummy $type; echo $3`
+ for vector_length in $VECTOR_LENGTHS; do
+ if test $vector_length -eq 3; then
+ continue;
+ fi
+ union_sizes="$union_sizes `expr $vector_length \* $size`"
+ done
+done
+union_sizes="`echo $union_sizes | tr ' ' '\n' | sort -n | uniq`"
+
+# For each union size
+for union_size in $union_sizes; do
+
+ # Define an union that contains all vector types that have the same size as the union
+ unionname="union _type_cast_${union_size}_b"
+ echo "$unionname {"
+ for type in $TYPES; do
+ basetype=`IFS=:; set -- dummy $type; echo $2`
+ basesize=`IFS=:; set -- dummy $type; echo $3`
+ for vector_length in $VECTOR_LENGTHS; do
+ if test $vector_length -eq 3; then
+ vector_size_length="4"
+ else
+ vector_size_length=$vector_length;
+ fi
+ vector_size_in_union="`expr $vector_size_length \* $basesize`"
+ if test $union_size -ne $vector_size_in_union; then
+ continue
+ fi
+ if test $vector_length -eq 1; then
+ vectortype=$basetype
+ else
+ vectortype=$basetype$vector_length
+ fi
+ echo " $vectortype _$vectortype;"
+ done
+
+ done
+ echo "};"
+ echo
+
+ # For each tuple of vector types that has the same size as the current union size,
+ # define an as_* function that converts types without changing binary representation.
+ for ftype in $TYPES; do
+ fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+ fbasesize=`IFS=:; set -- dummy $ftype; echo $3`
+ for fvector_length in $VECTOR_LENGTHS; do
+ if test $fvector_length -eq 3; then
+ fvector_size_length="4"
+ else
+ fvector_size_length=$fvector_length;
+ fi
+ fvector_size_in_union="`expr $fvector_size_length \* $fbasesize`"
+ if test $union_size -ne $fvector_size_in_union; then
+ continue
+ fi
+ if test $fvector_length -eq 1; then
+ fvectortype=$fbasetype
+ else
+ fvectortype=$fbasetype$fvector_length
+ fi
+ for ttype in $TYPES; do
+ tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+ tbasesize=`IFS=:; set -- dummy $ttype; echo $3`
+ if test $fbasetype = $tbasetype; then
+ continue
+ fi
+ for tvector_length in $VECTOR_LENGTHS; do
+ if test $tvector_length -eq 3; then
+ tvector_size_length="4"
+ else
+ tvector_size_length=$tvector_length;
+ fi
+ tvector_size_in_union="`expr $tvector_size_length \* $tbasesize`"
+ if test $union_size -ne $tvector_size_in_union; then
+ continue
+ fi
+ if test $tvector_length -eq 1; then
+ tvectortype=$tbasetype
+ else
+ tvectortype=$tbasetype$tvector_length
+ fi
+ echo "INLINE OVERLOADABLE $tvectortype as_$tvectortype($fvectortype v) {"
+ echo " $unionname u;"
+ echo " u._$fvectortype = v;"
+ echo " return u._$tvectortype;"
+ echo "}"
+ echo
+ done
+ done
+ done
+
+ done
+
+done
diff --git a/backend/src/gen_builtin_vector.py b/backend/src/gen_builtin_vector.py
new file mode 100755
index 0000000..2d602c8
--- /dev/null
+++ b/backend/src/gen_builtin_vector.py
@@ -0,0 +1,384 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2012 Intel Corporation
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library. If not, see <http://www.gnu.org/licenses/>.
+#
+# Author: Zhigang Gong <zhigang.gong at linux.intel.com>
+#/
+
+# This file is to generate inline code to lower down those builtin
+# vector functions to scalar functions.
+import re
+import sys
+import os
+
+if len(sys.argv) != 3:
+ print "Invalid argument {0}".format(sys.argv)
+ print "use {0} spec_file_name output_file_name".format(sys.argv[0])
+ raise
+
+all_vector = 1,2,3,4,8,16
+
+# generate generic type sets
+def gen_vector_type(type_set, vector_set = all_vector):
+ ret = []
+ for t in type_set:
+ for i in vector_set:
+ ret.append((t, i))
+ return ret
+
+def set_vector_memspace(vector_type_set, memspace):
+ ret = []
+ if memspace == '':
+ return vector_type_set
+ for t in vector_type_set:
+ ret.append((t[0], t[1], memspace))
+ return ret
+
+# if we have 3 elements in the type tuple, we are a pointer with a memory space type
+# at the third element.
+def isPointer(t):
+ return len(t) == 3
+
+all_itype = "char","short","int","long"
+all_utype = "uchar","ushort","uint","ulong"
+all_int_type = all_itype + all_utype
+
+all_float_type = "float","double"
+all_type = all_int_type + all_float_type
+
+# all vector/scalar types
+for t in all_type:
+ exec "{0}n = [\"{0}n\", gen_vector_type([\"{0}\"])]".format(t)
+ exec "s{0} = [\"{0}\", gen_vector_type([\"{0}\"], [1])]".format(t)
+
+# Predefined type sets according to the Open CL spec.
+math_gentype = ["math_gentype", gen_vector_type(all_float_type)]
+math_gentypef = ["math_gentypef", gen_vector_type(["float"])]
+math_gentyped = ["math_gentyped", gen_vector_type(["double"])]
+
+half_native_math_gentype = ["half_native_math_gentype", gen_vector_type(["float"])]
+
+integer_gentype = ["integer_gentype", gen_vector_type(all_int_type)]
+integer_ugentype = ["integer_ugentype", gen_vector_type(all_utype)]
+integer_sgentype = ["integer_sgentype", gen_vector_type(all_int_type, [1])]
+
+fast_integer_gentype = ["fast_integer_gentype", gen_vector_type(["uint", "int"])]
+
+common_gentype = ["common_gentype", gen_vector_type(all_float_type)]
+common_gentypef = ["common_gentypef", gen_vector_type(["float"])]
+common_gentyped = ["common_gentyped", gen_vector_type(["double"])]
+
+relational_gentype = ["relational_gentype", gen_vector_type(all_type)]
+relational_igentype = ["relational_igentype", gen_vector_type(all_itype)]
+relational_ugentype = ["relational_ugentype", gen_vector_type(all_utype)]
+
+misc_gentypem = ["misc_gentypem", gen_vector_type(all_type, [2, 4, 8, 16])]
+misc_gentypen = ["misc_gentypen", gen_vector_type(all_type, [2, 4, 8, 16])]
+misc_ugentypem = ["misc_ugentypem", gen_vector_type(all_utype, [2, 4, 8, 16])]
+misc_ugentypen = ["misc_ugentypen", gen_vector_type(all_utype, [2, 4, 8, 16])]
+
+all_predefined_type = math_gentype, math_gentypef, math_gentyped, \
+ half_native_math_gentype, integer_gentype,integer_sgentype,\
+ integer_ugentype, charn, ucharn, shortn, ushortn, intn, \
+ uintn, longn, ulongn, floatn, doublen, \
+ fast_integer_gentype, common_gentype, common_gentypef, \
+ common_gentyped, relational_gentype, relational_igentype, \
+ relational_ugentype, schar, suchar, sshort, sint, suint, \
+ slong, sulong, sfloat, sdouble, misc_gentypem, \
+ misc_ugentypem, misc_gentypen, misc_ugentypen
+
+# type dictionary contains all the predefined type sets.
+type_dict = {}
+
+for t in all_predefined_type:
+ type_dict.update({t[0]:t[1]})
+
+def _prefix(prefix, dtype):
+ if dtype.count("gentype") != 0:
+ return prefix + '_' + dtype
+ return dtype
+
+memspaces = ["__local ", "__private ", "__global "]
+
+def stripMemSpace(t):
+ if t[0:2] == '__':
+ for memspace in memspaces :
+ if t[0:len(memspace)] == memspace:
+ return memspace, t[len(memspace):]
+ return '', t
+
+def check_type(types):
+ for t in types:
+ memspace, t = stripMemSpace(t)
+ if not t in type_dict:
+ print t
+ raise "found invalid type."
+
+def match_unsigned(dtype):
+ if dtype[0] == 'float':
+ return ["uint", dtype[1]]
+ if dtype[0] == 'double':
+ return ["ulong", dtype[1]]
+ if dtype[0][0] == 'u':
+ return dtype
+ return ['u' + dtype[0], dtype[1]]
+
+def match_signed(dtype):
+ if dtype[0] == 'float':
+ return ["int", dtype[1]]
+ if dtype[0] == 'double':
+ return ["long", dtype[1]]
+ if dtype[0][0] != 'u':
+ return dtype
+ return [dtype[0][1:], dtype[1]]
+
+def match_scalar(dtype):
+ return [dtype[0], 1]
+
+# The dstType is the expected type, srcType is
+# the reference type. Sometimes, the dstType and
+# srcType are different. We need to fix this issue
+# and return correct dst type.
+def fixup_type(dstType, srcType, n):
+ if dstType == srcType:
+ return dstType[n]
+
+ if dstType != srcType:
+ # scalar dst type
+ if len(dstType) == 1:
+ return dstType[0]
+ # dst is not scalar bug src is scalar
+ if len(srcType) == 1:
+ return dstType[n]
+ if dstType == integer_sgentype[1] and srcType == integer_gentype[1]:
+ return match_scalar(srcType[n])
+
+ if dstType == integer_gentype[1] and \
+ (srcType == integer_sgentype[1] or \
+ srcType == integer_ugentype[1]):
+ return dstType[n]
+
+ if dstType == integer_ugentype[1] and srcType == integer_gentype[1]:
+ return match_unsigned(srcType[n])
+
+ if dstType == relational_igentype[1] and srcType == relational_gentype[1]:
+ return match_signed(srcType[n])
+ if dstType == relational_ugentype[1] and srcType == relational_gentype[1]:
+ return match_unsigned(srcType[n])
+
+ if dstType == relational_gentype[1] and \
+ (srcType == relational_igentype[1] or \
+ srcType == relational_ugentype[1]):
+ return dstType[n]
+
+ if (len(dstType) == len(srcType)):
+ return dstType[n]
+
+ print dstType, srcType
+ raise "type mispatch"
+
+class builtinProto():
+ valueTypeStr = ""
+ functionName = ""
+ paramTypeStrs = []
+ paramCount = 0
+ outputStr = []
+ prefix = ""
+
+ def init(self, sectionHeader, sectionPrefix):
+ self.valueTypeStr = ""
+ self.functionName = ""
+ self.paramTypeStrs = []
+ self.paramCount = 0
+ if sectionHeader != "":
+ self.outputStr = [sectionHeader]
+ else:
+ self.outputStr = []
+ if sectionPrefix != "":
+ self.prefix = sectionPrefix
+ self.indent = 0
+
+ def append(self, line, nextInit = ""):
+ self.outputStr.append(line);
+ return nextInit;
+
+ def indentSpace(self):
+ ret = ""
+ for i in range(self.indent):
+ ret += ' '
+
+ return ret
+
+ def init_from_line(self, t):
+ self.append('//{0}'.format(t))
+ line = filter(None, re.split(',| |\(', t.rstrip(')\n')))
+ self.paramCount = 0
+ stripped = 0
+ memSpace = ''
+ for i, text in enumerate(line):
+ idx = i - stripped
+ if idx == 0:
+ self.valueTypeStr = _prefix(self.prefix, line[i])
+ continue
+
+ if idx == 1:
+ self.functionName = line[i];
+ continue
+
+ if idx % 2 == 0:
+ if line[i][0] == '(':
+ tmpType = line[i][1:]
+ else:
+ tmpType = line[i]
+ if tmpType == '__local' or \
+ tmpType == '__private' or \
+ tmpType == '__global':
+ memSpace = tmpType + ' '
+ stripped += 1
+ continue
+ self.paramTypeStrs.append(memSpace + _prefix(self.prefix, tmpType))
+ memSpace = ''
+ self.paramCount += 1
+
+ def gen_proto_str_1(self, vtypeSeq, ptypeSeqs, i):
+ for n in range(0, self.paramCount):
+ ptype = fixup_type(ptypeSeqs[n], vtypeSeq, i);
+ vtype = fixup_type(vtypeSeq, ptypeSeqs[n], i);
+ # XXX FIXME now skip all double vector, as we don't
+ # defined those scalar version's prototype.
+ if ptype[0].find('double') != -1 or \
+ vtype[0].find('double') != -1:
+ return
+
+ if (n == 0):
+ formatStr = 'INLINE_OVERLOADABLE {0}{1} {2} ('.format(vtype[0], vtype[1], self.functionName)
+ else:
+ formatStr += ', '
+
+ if vtype[1] == 1:
+ return
+
+ if isPointer(ptype):
+ formatStr += ptype[2]
+ pointerStr = '*'
+ else:
+ pointerStr = ''
+
+ if ptype[1] != 1:
+ formatStr += '{0}{1} {2}param{3}'.format(ptype[0], ptype[1], pointerStr, n)
+ else:
+ formatStr += '{0} {1}param{2}'.format(ptype[0], pointerStr, n)
+
+ formatStr += ')'
+ formatStr = self.append(formatStr, '{{return ({0}{1})('.format(vtype[0], vtype[1]))
+ self.indent = len(formatStr)
+ for j in range(0, vtype[1]):
+ if (j != 0):
+ formatStr += ','
+ if (j + 1) % 2 == 0:
+ formatStr += ' '
+ if j % 2 == 0:
+ formatStr = self.append(formatStr, self.indentSpace())
+
+ if self.prefix == 'relational' and self.functionName != 'bitselect' and self.functionName != 'select':
+ formatStr += '-'
+ formatStr += '{0}('.format(self.functionName)
+ for n in range(0, self.paramCount):
+ if n != 0:
+ formatStr += ', '
+
+ ptype = fixup_type(ptypeSeqs[n], vtypeSeq, i)
+ vtype = fixup_type(vtypeSeq, ptypeSeqs[n], i)
+ if vtype[1] != ptype[1]:
+ if ptype[1] != 1:
+ raise "parameter is not a scalar but has different width with result value."
+ if isPointer(ptype):
+ formatStr += '&'
+ formatStr += 'param{0}'.format(n)
+ continue
+
+ if (isPointer(ptype)):
+ formatStr += '({0} {1} *)param{2} + {3:2d}'.format(ptype[2], ptype[0], n, j)
+ else:
+ if (self.functionName == 'select' and n == 2):
+ formatStr += '({0})(param{1}.s{2:X} & (({0})1 << (sizeof({0})*8 - 1)))'.format(ptype[0], n, j)
+ else:
+ formatStr += 'param{0}.s{1:X}'.format(n, j)
+
+ formatStr += ')'
+
+ formatStr += '); }\n'
+ self.append(formatStr)
+
+ return formatStr
+
+ def output(self):
+ for line in self.outputStr:
+ print line
+
+ def output(self, outFile):
+ for line in self.outputStr:
+ outFile.write('{0}\n'.format(line))
+
+ def gen_proto_str(self):
+ check_type([self.valueTypeStr] + self.paramTypeStrs)
+ vtypeSeq = type_dict[self.valueTypeStr]
+ ptypeSeqs = []
+ count = len(vtypeSeq);
+ for t in self.paramTypeStrs:
+ memspace,t = stripMemSpace(t)
+ ptypeSeqs.append(set_vector_memspace(type_dict[t], memspace))
+ count = max(count, len(type_dict[t]))
+
+ for i in range(count):
+ formatStr = self.gen_proto_str_1(vtypeSeq, ptypeSeqs, i)
+
+ self.append("")
+
+def safeUnlink(filename):
+ try:
+ os.remove(filename)
+ except OSError:
+ pass
+
+# save the prototypes into ocl_vector.h
+specFile = open(sys.argv[1], 'r')
+headerFileName = sys.argv[2]
+tempHeaderFileName = sys.argv[2] + '.tmp'
+safeUnlink(headerFileName)
+tempHeader = open(tempHeaderFileName, 'w')
+
+tempHeader.write("//This file is autogenerated by {0}.\n".format(sys.argv[0]))
+tempHeader.write("//Don't modify it manually.\n")
+
+functionProto = builtinProto()
+for line in specFile:
+ if line.isspace():
+ continue
+ if line[0] == '#':
+ if line[1] == '#':
+ sectionHeader = "//{0} builtin functions".format(line[2:].rstrip())
+ sectionPrefix=(line[2:].split())[0]
+ continue
+ functionProto.init(sectionHeader, sectionPrefix)
+ sectionHeader = ""
+ setionPrefix = ""
+ functionProto.init_from_line(line)
+ functionProto.gen_proto_str()
+ functionProto.output(tempHeader)
+
+tempHeader.close()
+os.rename(tempHeaderFileName, headerFileName)
diff --git a/backend/src/gen_convert.sh b/backend/src/gen_convert.sh
new file mode 100755
index 0000000..b940222
--- /dev/null
+++ b/backend/src/gen_convert.sh
@@ -0,0 +1,553 @@
+#! /bin/sh -e
+
+. ./genconfig.sh
+
+# For all vector lengths and types, generate conversion functions
+for vector_length in $VECTOR_LENGTHS; do
+ if test $vector_length -eq 1; then
+ for ftype in $TYPES; do
+ fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+ for ttype in $TYPES; do
+ tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+ echo "INLINE OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) {"
+ echo " return ($tbasetype)v;"
+ echo "}"
+ echo
+ done
+ done
+ else
+ for ftype in $TYPES; do
+ fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+ for ttype in $TYPES; do
+ tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+ if test $fbasetype = $tbasetype; then
+ if test $vector_length -gt 1; then
+ fvectortype=$fbasetype$vector_length
+ tvectortype=$tbasetype$vector_length
+ echo "INLINE OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) { return v; }"
+ else
+ echo "INLINE OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) { return v; }"
+ fi
+ continue
+ fi
+ fvectortype=$fbasetype$vector_length
+ tvectortype=$tbasetype$vector_length
+ construct="($tbasetype)(v.s0)"
+ if test $vector_length -gt 1; then
+ construct="$construct, ($tbasetype)(v.s1)"
+ fi
+ if test $vector_length -gt 2; then
+ construct="$construct, ($tbasetype)(v.s2)"
+ fi
+ if test $vector_length -gt 3; then
+ construct="$construct, ($tbasetype)(v.s3)"
+ fi
+ if test $vector_length -gt 4; then
+ construct="$construct, ($tbasetype)(v.s4)"
+ construct="$construct, ($tbasetype)(v.s5)"
+ construct="$construct, ($tbasetype)(v.s6)"
+ construct="$construct, ($tbasetype)(v.s7)"
+ fi
+ if test $vector_length -gt 8; then
+ construct="$construct, ($tbasetype)(v.s8)"
+ construct="$construct, ($tbasetype)(v.s9)"
+ construct="$construct, ($tbasetype)(v.sA)"
+ construct="$construct, ($tbasetype)(v.sB)"
+ construct="$construct, ($tbasetype)(v.sC)"
+ construct="$construct, ($tbasetype)(v.sD)"
+ construct="$construct, ($tbasetype)(v.sE)"
+ construct="$construct, ($tbasetype)(v.sF)"
+ fi
+
+ echo "INLINE OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) {"
+ echo " return ($tvectortype)($construct);"
+ echo "}"
+ echo
+ done
+ done
+ fi
+done
+
+echo '
+#define DEF(DSTTYPE, SRCTYPE) \
+ OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);
+DEF(char, uchar);
+DEF(char, short);
+DEF(char, ushort);
+DEF(char, int);
+DEF(char, uint);
+DEF(char, float);
+DEF(uchar, char);
+DEF(uchar, short);
+DEF(uchar, ushort);
+DEF(uchar, int);
+DEF(uchar, uint);
+DEF(uchar, float);
+DEF(short, ushort);
+DEF(short, int);
+DEF(short, uint);
+DEF(short, float);
+DEF(ushort, short);
+DEF(ushort, int);
+DEF(ushort, uint);
+DEF(ushort, float);
+DEF(int, uint);
+DEF(int, float);
+DEF(uint, int);
+DEF(uint, float);
+#undef DEF
+
+#define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
+ INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+ return x >= MAX ? (DSTTYPE)MAX : x <= MIN ? (DSTTYPE)MIN : x; \
+ }
+DEF(char, long, -128, 127);
+DEF(uchar, long, 0, 255);
+DEF(short, long, -32768, 32767);
+DEF(ushort, long, 0, 65535);
+DEF(int, long, -0x7fffffff-1, 0x7fffffff);
+DEF(uint, long, 0, 0xffffffffu);
+DEF(long, float, -9.223372036854776e+18f, 9.223372036854776e+18f);
+DEF(ulong, float, 0, 1.8446744073709552e+19f);
+#undef DEF
+
+#define DEF(DSTTYPE, SRCTYPE, MAX) \
+ INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+ return x >= MAX ? (DSTTYPE)MAX : x; \
+ }
+DEF(char, ulong, 127);
+DEF(uchar, ulong, 255);
+DEF(short, ulong, 32767);
+DEF(ushort, ulong, 65535);
+DEF(int, ulong, 0x7fffffff);
+DEF(uint, ulong, 0xffffffffu);
+#undef DEF
+
+INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
+ ulong MAX = 0x7ffffffffffffffful;
+ return x >= MAX ? MAX : x;
+}
+
+#define DEF(DSTTYPE, SRCTYPE) \
+ INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+ return x <= 0 ? 0 : x; \
+ }
+DEF(ushort, char);
+DEF(uint, char);
+DEF(uint, short);
+DEF(ulong, char);
+DEF(ulong, short);
+DEF(ulong, int);
+DEF(ulong, long);
+#undef DEF
+
+#define DEF(DSTTYPE, SRCTYPE) \
+ INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+ return x; \
+ }
+DEF(char, char);
+DEF(uchar, uchar);
+DEF(short, char);
+DEF(short, uchar);
+DEF(short, short);
+DEF(ushort, uchar);
+DEF(ushort, ushort);
+DEF(int, char);
+DEF(int, uchar);
+DEF(int, short);
+DEF(int, ushort);
+DEF(int, int);
+DEF(uint, uchar);
+DEF(uint, ushort);
+DEF(uint, uint);
+DEF(long, char);
+DEF(long, uchar);
+DEF(long, short);
+DEF(long, ushort);
+DEF(long, int);
+DEF(long, uint);
+DEF(long, long);
+DEF(ulong, uchar);
+DEF(ulong, ushort);
+DEF(ulong, uint);
+DEF(ulong, ulong);
+#undef DEF
+'
+
+# vector convert_DSTTYPE_sat function
+for vector_length in $VECTOR_LENGTHS; do
+ if test $vector_length -eq 1; then continue; fi
+
+ for ftype in $TYPES; do
+ fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+ if test $fbasetype = "double"; then continue; fi
+
+ for ttype in $TYPES; do
+ tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+ if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
+
+ fvectortype=$fbasetype$vector_length
+ tvectortype=$tbasetype$vector_length
+ conv="convert_${tbasetype}_sat"
+
+ construct="$conv(v.s0)"
+ if test $vector_length -gt 1; then
+ construct="$construct, $conv(v.s1)"
+ fi
+ if test $vector_length -gt 2; then
+ construct="$construct, $conv(v.s2)"
+ fi
+ if test $vector_length -gt 3; then
+ construct="$construct, $conv(v.s3)"
+ fi
+ if test $vector_length -gt 4; then
+ construct="$construct, $conv(v.s4)"
+ construct="$construct, $conv(v.s5)"
+ construct="$construct, $conv(v.s6)"
+ construct="$construct, $conv(v.s7)"
+ fi
+ if test $vector_length -gt 8; then
+ construct="$construct, $conv(v.s8)"
+ construct="$construct, $conv(v.s9)"
+ construct="$construct, $conv(v.sA)"
+ construct="$construct, $conv(v.sB)"
+ construct="$construct, $conv(v.sC)"
+ construct="$construct, $conv(v.sD)"
+ construct="$construct, $conv(v.sE)"
+ construct="$construct, $conv(v.sF)"
+ fi
+
+ echo "INLINE OVERLOADABLE $tvectortype convert_${tvectortype}_sat($fvectortype v) {"
+ echo " return ($tvectortype)($construct);"
+ echo "}"
+ echo
+ done
+ done
+done
+
+echo '
+float __gen_ocl_rndz(float x);
+float __gen_ocl_rnde(float x);
+float __gen_ocl_rndu(float x);
+float __gen_ocl_rndd(float x);
+INLINE_OVERLOADABLE float __convert_float_rtz(long x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ long l = u.f;
+ if((l > x && x > 0) || x >= 0x7fffffc000000000 ||
+ (l < x && x < 0)) {
+ u.u -= 1;
+ }
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(long x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ long l = u.f; //can not use u.f < x
+ if(l < x && x < 0x7fffffc000000000) {
+ if(x > 0)
+ u.u = u.u + 1;
+ else
+ u.u = u.u - 1;
+ }
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(long x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ long l = u.f; //avoid overflow
+ if(l > x || x >= 0x7fffffc000000000) {
+ if(x > 0)
+ u.u = u.u - 1;
+ else
+ u.u = u.u + 1;
+ }
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(ulong x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ ulong l = u.f;
+ if(l > x || x >= 0xffffff8000000000)
+ u.u -= 1;
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(ulong x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ ulong l = u.f; //can not use u.f < x
+ if(l < x && x < 0xffffff8000000000)
+ u.u = u.u + 1;
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(ulong x)
+{
+ return __convert_float_rtz(x);
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(int x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ long i = u.f;
+ if((i > x && x > 0) ||
+ (i < x && x < 0)) {
+ u.u -= 1;
+ }
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(int x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ int i = u.f;
+ if(i < x) {
+ if(x > 0)
+ u.u += 1;
+ else
+ u.u -= 1;
+ }
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(int x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ long i = u.f; //avoid overflow
+ if(i > x) {
+ if(x > 0)
+ u.u = u.u - 1;
+ else
+ u.u = u.u + 1;
+ }
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(uint x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ ulong i = u.f;
+ if(i > x)
+ u.u -= 1;
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(uint x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ uint i = u.f;
+ if(i < x)
+ u.u += 1;
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(uint x)
+{
+ return __convert_float_rtz(x);
+}
+'
+
+# convert_DSTTYPE_ROUNDING function
+for vector_length in $VECTOR_LENGTHS; do
+ for ftype in $TYPES; do
+ fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+ if test $fbasetype = "double"; then continue; fi
+
+ for ttype in $TYPES; do
+ tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+ if test $tbasetype = "double"; then continue; fi
+
+ if test $vector_length -eq 1; then
+ echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rte($fbasetype x)"
+ if test $fbasetype = "float" -a $tbasetype != "float"; then
+ echo "{ return __gen_ocl_rnde(x); }"
+ else
+ echo "{ return x; }"
+ fi
+
+ echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rtz($fbasetype x)"
+ if test $fbasetype = "float" -a $tbasetype != "float"; then
+ echo "{ return __gen_ocl_rndz(x); }"
+ elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+ echo "{ return __convert_${tbasetype}_rtz(x); }"
+ else
+ echo "{ return x; }"
+ fi
+
+ echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rtp($fbasetype x)"
+ if test $fbasetype = "float" -a $tbasetype != "float"; then
+ echo "{ return __gen_ocl_rndu(x); }"
+ elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+ echo "{ return __convert_${tbasetype}_rtp(x); }"
+ else
+ echo "{ return x; }"
+ fi
+
+ echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rtn($fbasetype x)"
+ if test $fbasetype = "float" -a $tbasetype != "float"; then
+ echo "{ return __gen_ocl_rndd(x); }"
+ elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+ echo "{ return __convert_${tbasetype}_rtn(x); }"
+ else
+ echo "{ return x; }"
+ fi
+
+ continue
+ fi
+
+ for rounding in $ROUNDING_MODES; do
+ fvectortype=$fbasetype$vector_length
+ tvectortype=$tbasetype$vector_length
+ conv="convert_${tbasetype}_${rounding}"
+
+ construct="$conv(v.s0)"
+ if test $vector_length -gt 1; then
+ construct="$construct, $conv(v.s1)"
+ fi
+ if test $vector_length -gt 2; then
+ construct="$construct, $conv(v.s2)"
+ fi
+ if test $vector_length -gt 3; then
+ construct="$construct, $conv(v.s3)"
+ fi
+ if test $vector_length -gt 4; then
+ construct="$construct, $conv(v.s4)"
+ construct="$construct, $conv(v.s5)"
+ construct="$construct, $conv(v.s6)"
+ construct="$construct, $conv(v.s7)"
+ fi
+ if test $vector_length -gt 8; then
+ construct="$construct, $conv(v.s8)"
+ construct="$construct, $conv(v.s9)"
+ construct="$construct, $conv(v.sA)"
+ construct="$construct, $conv(v.sB)"
+ construct="$construct, $conv(v.sC)"
+ construct="$construct, $conv(v.sD)"
+ construct="$construct, $conv(v.sE)"
+ construct="$construct, $conv(v.sF)"
+ fi
+
+ echo "INLINE OVERLOADABLE $tvectortype convert_${tvectortype}_${rounding}($fvectortype v) {"
+ echo " return ($tvectortype)($construct);"
+ echo "}"
+ echo
+ done
+ done
+ done
+done
+
+# convert_DSTTYPE_sat_ROUNDING function
+for vector_length in $VECTOR_LENGTHS; do
+ for ftype in $TYPES; do
+ fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+ if test $fbasetype = "double"; then continue; fi
+
+ for ttype in $TYPES; do
+ tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+ if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
+
+ if test $vector_length -eq 1; then
+ echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rte($fbasetype x)"
+ if test $fbasetype = "float"; then
+ echo "{ return convert_${tbasetype}_sat(__gen_ocl_rnde(x)); }"
+ else
+ echo "{ return convert_${tbasetype}_sat(x); }"
+ fi
+
+ echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtz($fbasetype x)"
+ if test $fbasetype = "float"; then
+ echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndz(x)); }"
+ else
+ echo "{ return convert_${tbasetype}_sat(x); }"
+ fi
+
+ echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtp($fbasetype x)"
+ if test $fbasetype = "float"; then
+ echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndu(x)); }"
+ else
+ echo "{ return convert_${tbasetype}_sat(x); }"
+ fi
+
+ echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtn($fbasetype x)"
+ if test $fbasetype = "float"; then
+ echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndd(x)); }"
+ else
+ echo "{ return convert_${tbasetype}_sat(x); }"
+ fi
+
+ continue
+ fi
+
+ for rounding in $ROUNDING_MODES; do
+ fvectortype=$fbasetype$vector_length
+ tvectortype=$tbasetype$vector_length
+ conv="convert_${tbasetype}_sat_${rounding}"
+
+ construct="$conv(v.s0)"
+ if test $vector_length -gt 1; then
+ construct="$construct, $conv(v.s1)"
+ fi
+ if test $vector_length -gt 2; then
+ construct="$construct, $conv(v.s2)"
+ fi
+ if test $vector_length -gt 3; then
+ construct="$construct, $conv(v.s3)"
+ fi
+ if test $vector_length -gt 4; then
+ construct="$construct, $conv(v.s4)"
+ construct="$construct, $conv(v.s5)"
+ construct="$construct, $conv(v.s6)"
+ construct="$construct, $conv(v.s7)"
+ fi
+ if test $vector_length -gt 8; then
+ construct="$construct, $conv(v.s8)"
+ construct="$construct, $conv(v.s9)"
+ construct="$construct, $conv(v.sA)"
+ construct="$construct, $conv(v.sB)"
+ construct="$construct, $conv(v.sC)"
+ construct="$construct, $conv(v.sD)"
+ construct="$construct, $conv(v.sE)"
+ construct="$construct, $conv(v.sF)"
+ fi
+
+ echo "INLINE OVERLOADABLE $tvectortype convert_${tvectortype}_sat_${rounding}($fvectortype v) {"
+ echo " return ($tvectortype)($construct);"
+ echo "}"
+ echo
+ done
+ done
+ done
+done
diff --git a/backend/src/genconfig.sh b/backend/src/genconfig.sh
new file mode 100644
index 0000000..689499e
--- /dev/null
+++ b/backend/src/genconfig.sh
@@ -0,0 +1,11 @@
+#! /bin/false
+# This is to be sourced by the generation scripts
+
+# Supported base types and their lengths
+TYPES="long:8 ulong:8 int:4 uint:4 short:2 ushort:2 char:1 uchar:1 double:8 float:4"
+
+# Supported vector lengths
+VECTOR_LENGTHS="1 2 3 4 8 16"
+
+ROUNDING_MODES="rte rtz rtp rtn"
+## No user serviceable parts below here
diff --git a/backend/src/ir/constant.cpp b/backend/src/ir/constant.cpp
new file mode 100644
index 0000000..a38d392
--- /dev/null
+++ b/backend/src/ir/constant.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file constant.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "constant.hpp"
+
+namespace gbe {
+namespace ir {
+
+ void ConstantSet::append(const char *data,
+ const std::string &name,
+ uint32_t size,
+ uint32_t alignment)
+ {
+ const uint32_t offset = ALIGN(this->data.size(), alignment);
+ const uint32_t padding = offset - this->data.size();
+ const Constant constant(name, size, alignment, offset);
+ constants.push_back(constant);
+ for (uint32_t i = 0; i < padding; ++i) this->data.push_back(0);
+ for (uint32_t i = 0; i < size; ++i) this->data.push_back(data[i]);
+ }
+
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+ size_t ConstantSet::serializeToBin(std::ostream& outs) {
+ size_t ret_size = 0;
+
+ OUT_UPDATE_SZ(magic_begin);
+
+ /* output the const data. */
+ OUT_UPDATE_SZ((data.size()*sizeof(char)));
+ if(data.size() > 0) {
+ outs.write(data.data(), data.size()*sizeof(char));
+ ret_size += data.size()*sizeof(char);
+ }
+
+ OUT_UPDATE_SZ(constants.size());
+ for (auto const &cnst : constants) {
+ size_t bytes = sizeof(cnst.getName().size()) //name length self
+ + cnst.getName().size()*sizeof(char) //name
+ + sizeof(cnst.getSize()) //size
+ + sizeof(cnst.getAlignment()) //alignment
+ + sizeof(cnst.getOffset()); //offset
+ OUT_UPDATE_SZ(bytes);
+
+ OUT_UPDATE_SZ(cnst.getName().size());
+ outs.write(cnst.getName().c_str(), cnst.getName().size());
+ ret_size += sizeof(char)*cnst.getName().size();
+ OUT_UPDATE_SZ(cnst.getSize());
+ OUT_UPDATE_SZ(cnst.getAlignment());
+ OUT_UPDATE_SZ(cnst.getOffset());
+ }
+
+ OUT_UPDATE_SZ(magic_end);
+ OUT_UPDATE_SZ(ret_size);
+
+ return ret_size;
+ }
+
+ size_t ConstantSet::deserializeFromBin(std::istream& ins) {
+ size_t total_size = 0;
+ size_t global_data_sz = 0;
+ size_t const_num;
+ uint32_t magic;
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_begin)
+ return 0;
+
+ IN_UPDATE_SZ(global_data_sz);
+ for (size_t i = 0; i < global_data_sz; i++) {
+ char elt;
+ IN_UPDATE_SZ(elt);
+ data.push_back(elt);
+ }
+
+ IN_UPDATE_SZ(const_num);
+ for (size_t i = 0; i < const_num; i++) {
+ size_t bytes;
+ IN_UPDATE_SZ(bytes);
+
+ size_t name_len;
+ IN_UPDATE_SZ(name_len);
+
+ char* c_name = new char[name_len+1];
+ ins.read(c_name, name_len);
+ total_size += sizeof(char)*name_len;
+ c_name[name_len] = 0;
+
+ uint32_t size, align, offset;
+ IN_UPDATE_SZ(size);
+ IN_UPDATE_SZ(align);
+ IN_UPDATE_SZ(offset);
+
+ ir::Constant constant(c_name, size, align, offset);
+ constants.push_back(constant);
+
+ delete[] c_name;
+
+ /* Saint check */
+ if (bytes != sizeof(name_len) + sizeof(char)*name_len + sizeof(size)
+ + sizeof(align) + sizeof(offset))
+ return 0;
+ }
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_end)
+ return 0;
+
+ size_t total_bytes;
+ IN_UPDATE_SZ(total_bytes);
+ if (total_bytes + sizeof(total_size) != total_size)
+ return 0;
+
+ return total_size;
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/constant.hpp b/backend/src/ir/constant.hpp
new file mode 100644
index 0000000..70d09aa
--- /dev/null
+++ b/backend/src/ir/constant.hpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file constant.cpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_CONSTANT_HPP__
+#define __GBE_IR_CONSTANT_HPP__
+
+#include "sys/vector.hpp"
+
+namespace gbe {
+namespace ir {
+
+ /*! Describe one constant (may be a scalar or an array) */
+ class Constant
+ {
+ public:
+ /*! Build a constant description */
+ INLINE Constant(const std::string &name, uint32_t size, uint32_t alignment, uint32_t offset) :
+ name(name), size(size), alignment(alignment), offset(offset) {}
+ /*! Copy constructor */
+ INLINE Constant(const Constant &other) :
+ name(other.name), size(other.size), alignment(other.alignment), offset(other.offset) {}
+ /*! Copy operator */
+ INLINE Constant& operator= (const Constant &other) {
+ this->name = other.name;
+ this->size = other.size;
+ this->alignment = other.alignment;
+ this->offset = other.offset;
+ return *this;
+ }
+ /*! Nothing happens here */
+ INLINE ~Constant(void) {}
+ const std::string& getName(void) const { return name; }
+ uint32_t getSize (void) const { return size; }
+ uint32_t getAlignment (void) const { return alignment; }
+ uint32_t getOffset(void) const { return offset; }
+ private:
+ std::string name; //!< Optional name of the constant
+ uint32_t size; //!< Size of the constant
+ uint32_t alignment; //!< Alignment required for each constant
+ uint32_t offset; //!< Offset of the constant in the data segment
+ GBE_CLASS(Constant);
+ };
+
+ /*! A constant set is a set of immutable data associated to a compilation
+ * unit
+ */
+ class ConstantSet : public Serializable
+ {
+ public:
+ /*! Append a new constant in the constant set */
+ void append(const char*, const std::string&, uint32_t size, uint32_t alignment);
+ /*! Number of constants */
+ size_t getConstantNum(void) const { return constants.size(); }
+ /*! Get a special constant */
+ Constant& getConstant(size_t i) { return constants[i]; }
+ /*! Get a special constant */
+ Constant& getConstant(const std::string & name) {
+ for (auto & c : constants) {
+ if (c.getName() == name)
+ return c;
+ }
+ GBE_ASSERT(false);
+ return *(Constant *)nullptr;
+ }
+ /*! Number of bytes of serialized constant data */
+ size_t getDataSize(void) const { return data.size(); }
+ /*! Store serialized constant data into an array */
+ void getData(char *mem) const {
+ for (size_t i = 0; i < data.size(); i ++)
+ mem[i] = data[i];
+ }
+ ConstantSet() {}
+ ConstantSet(const ConstantSet& other) : Serializable(other),
+ data(other.data), constants(other.constants) {}
+ ConstantSet & operator = (const ConstantSet& other) {
+ if (&other != this) {
+ data = other.data;
+ constants = other.constants;
+ }
+ return *this;
+ }
+
+ static const uint32_t magic_begin = TO_MAGIC('C', 'N', 'S', 'T');
+ static const uint32_t magic_end = TO_MAGIC('T', 'S', 'N', 'C');
+
+ /* format:
+ magic_begin |
+ const_data_size |
+ const_data |
+ constant_1_size |
+ constant_1 |
+ ........ |
+ constant_n_size |
+ constant_n |
+ magic_end |
+ total_size
+ */
+
+ /*! Implements the serialization. */
+ virtual size_t serializeToBin(std::ostream& outs);
+ virtual size_t deserializeFromBin(std::istream& ins);
+
+ private:
+ vector<char> data; //!< The constant data serialized in one array
+ vector<Constant> constants;//!< Each constant description
+ GBE_CLASS(ConstantSet);
+ };
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_CONSTANT_HPP__ */
+
diff --git a/backend/src/ir/context.cpp b/backend/src/ir/context.cpp
new file mode 100644
index 0000000..1528a8d
--- /dev/null
+++ b/backend/src/ir/context.cpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file context.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/context.hpp"
+#include "ir/unit.hpp"
+#include "ir/lowering.hpp"
+
+namespace gbe {
+namespace ir {
+
+ Context::Context(Unit &unit) :
+ unit(unit), fn(NULL), bb(NULL), usedLabels(NULL) {}
+
+ Context::~Context(void) {
+ for (const auto &elem : fnStack) GBE_SAFE_DELETE(elem.usedLabels);
+ GBE_SAFE_DELETE(usedLabels);
+ }
+
+ Function &Context::getFunction(void) {
+ GBE_ASSERTM(fn != NULL, "No function currently defined");
+ return *fn;
+ }
+
+ void Context::appendPushedConstant(Register reg, const PushLocation &pushed)
+ {
+ GBE_ASSERTM(fn != NULL, "No function currently defined");
+ GBE_ASSERTM(fn->pushMap.contains(reg) == false, "Register already pushed");
+ fn->pushMap.insert(std::make_pair(reg, pushed));
+ fn->locationMap.insert(std::make_pair(pushed, reg));
+ }
+
+ void Context::startFunction(const std::string &name) {
+ fnStack.push_back(StackElem(fn,bb,usedLabels));
+ fn = unit.newFunction(name);
+ usedLabels = GBE_NEW_NO_ARG(vector<uint8_t>);
+ bb = NULL;
+ }
+
+ void Context::endFunction(void) {
+ GBE_ASSERTM(fn != NULL, "No function to end");
+ GBE_ASSERT(fnStack.size() != 0);
+ GBE_ASSERT(usedLabels != NULL);
+
+ // Empty function -> append a return
+ if (fn->blockNum() == 0) this->RET();
+
+ // Check first that all branch instructions point to valid labels
+ GBE_ASSERT(usedLabels);
+#if GBE_DEBUG
+ for (auto usage : *usedLabels)
+ GBE_ASSERTM(usage != LABEL_IS_POINTED, "A label is used and not defined");
+#endif /* GBE_DEBUG */
+ GBE_DELETE(usedLabels);
+
+ // Remove all returns and insert one unique return block at the end of the
+ // function
+ lowerReturn(unit, fn->getName());
+ // check if there is empty labels at first
+ fn->checkEmptyLabels();
+ // Properly order labels and compute the CFG, it's needed by FunctionArgumentLower
+ fn->sortLabels();
+ fn->computeCFG();
+
+ // Spill function argument to the stack if required and identify which
+ // function arguments can use constant push
+ lowerFunctionArguments(unit, fn->getName());
+
+ const StackElem elem = fnStack.back();
+ fnStack.pop_back();
+ fn = elem.fn;
+ bb = elem.bb;
+ usedLabels = elem.usedLabels;
+ }
+
+ Register Context::reg(RegisterFamily family, bool uniform) {
+ GBE_ASSERTM(fn != NULL, "No function currently defined");
+ return fn->newRegister(family, uniform);
+ }
+
+ LabelIndex Context::label(void) {
+ GBE_ASSERTM(fn != NULL, "No function currently defined");
+ const LabelIndex index = fn->newLabel();
+ if (index >= usedLabels->size()) {
+ usedLabels->resize(index + 1);
+ (*usedLabels)[index] = 0;
+ }
+ return index;
+ }
+
+ void Context::input(const std::string &name, FunctionArgument::Type type, Register reg,
+ FunctionArgument::InfoFromLLVM& info, uint32_t elementSize, uint32_t align, unsigned char bti) {
+ GBE_ASSERTM(fn != NULL, "No function currently defined");
+ GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
+ FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name, align, info, bti);
+ fn->args.push_back(arg);
+ }
+
+ void Context::output(Register reg) {
+ GBE_ASSERTM(fn != NULL, "No function currently defined");
+ GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
+ fn->outputs.push_back(reg);
+ }
+
+ void Context::startBlock(void) {
+ GBE_ASSERTM(fn != NULL, "No function currently defined");
+ this->bb = GBE_NEW(BasicBlock, *fn);
+ fn->blocks.push_back(bb);
+ }
+
+ void Context::endBlock(void) {
+ this->bb = NULL;
+ }
+
+ void Context::append(const Instruction &insn) {
+ GBE_ASSERTM(fn != NULL, "No function currently defined");
+
+ // Start a new block if this is a label
+ if (insn.isMemberOf<LabelInstruction>() == true) {
+ this->endBlock();
+ this->startBlock();
+ const LabelIndex index = cast<LabelInstruction>(insn).getLabelIndex();
+ GBE_ASSERTM(index < fn->labelNum(), "Out-of-bound label");
+ GBE_ASSERTM(fn->labels[index] == NULL, "Label used in a previous block");
+ fn->labels[index] = bb;
+
+ // Now the label index is properly defined
+ GBE_ASSERT(index < usedLabels->size());
+ (*usedLabels)[index] |= LABEL_IS_DEFINED;
+ }
+ // We create a new label for a new block if the user did not do it
+ else if (bb == NULL) {
+ // this->startBlock();
+ const LabelIndex index = this->label();
+ const Instruction insn = ir::LABEL(index);
+ this->append(insn);
+ }
+
+ // Append the instruction in the stream
+ Instruction *insnPtr = fn->newInstruction(insn);
+ bb->append(*insnPtr);
+#if GBE_DEBUG
+ std::string whyNot;
+ if(getUnit().getValid())
+ GBE_ASSERTM(insnPtr->wellFormed(whyNot), whyNot.c_str());
+#endif /* GBE_DEBUG */
+
+ // Close the current block if this is a branch
+ if (insn.isMemberOf<BranchInstruction>() == true) {
+ // We must book keep the fact that the label is used
+ if (insn.getOpcode() == OP_BRA) {
+ const BranchInstruction &branch = cast<BranchInstruction>(insn);
+ const LabelIndex index = branch.getLabelIndex();
+ GBE_ASSERT(index < usedLabels->size());
+ (*usedLabels)[index] |= LABEL_IS_POINTED;
+ }
+ this->endBlock();
+ }
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
new file mode 100644
index 0000000..cd09413
--- /dev/null
+++ b/backend/src/ir/context.hpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file context.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_CONTEXT_HPP__
+#define __GBE_IR_CONTEXT_HPP__
+
+#include "ir/instruction.hpp"
+#include "ir/function.hpp"
+#include "ir/register.hpp"
+#include "ir/immediate.hpp"
+#include "ir/unit.hpp"
+#include "sys/vector.hpp"
+#include <tuple>
+
+namespace gbe {
+namespace ir {
+
+ /*! A context allows an easy creation of the functions (instruction stream and
+ * the set of immediates and registers needed for it) and constant arrays
+ */
+ class Context
+ {
+ public:
+ /*! Create a new context for this unit */
+ Context(Unit &unit);
+ /*! Free resources needed by context */
+ virtual ~Context(void);
+ /*! Create a new function "name" */
+ void startFunction(const std::string &name);
+ /*! Close the function */
+ void endFunction(void);
+ /*! Get the current processed unit */
+ INLINE Unit &getUnit(void) { return unit; }
+ /*! Get the current processed function */
+ Function &getFunction(void);
+ /*! Get the current processed block */
+ BasicBlock *getBlock(void) { return bb; }
+ /*! Set the SIMD width of the function */
+ void setSimdWidth(uint32_t width) const {
+ GBE_ASSERT(width == 8 || width == 16);
+ fn->simdWidth = width;
+ }
+ /*! Append a new pushed constant */
+ void appendPushedConstant(Register reg, const PushLocation &pushed);
+ /*! Create a new register with the given family for the current function */
+ Register reg(RegisterFamily family, bool uniform = false);
+ /*! Create a new immediate value */
+ template <typename T> INLINE ImmediateIndex newImmediate(T value) {
+ const Immediate imm(value);
+ return fn->newImmediate(imm);
+ }
+ template <typename T> INLINE ImmediateIndex newImmediate(T value, uint32_t num) {
+ const Immediate imm(value, num);
+ return fn->newImmediate(imm);
+ }
+ /*! Create a new immediate value */
+ INLINE ImmediateIndex newImmediate(vector<ImmediateIndex>indexVector) {
+ vector<const Immediate*> immVector;
+ for( uint32_t i = 0; i < indexVector.size(); i++)
+ immVector.push_back(&fn->getImmediate(indexVector[i]));
+ const Immediate imm(immVector);
+ return fn->newImmediate(imm);
+ }
+ /*! Create an integer immediate value */
+ INLINE ImmediateIndex newIntegerImmediate(int64_t x, Type type) {
+ switch (type) {
+ case TYPE_S8: return this->newImmediate(int8_t(x));
+ case TYPE_U8: return this->newImmediate(uint8_t(x));
+ case TYPE_S16: return this->newImmediate(int16_t(x));
+ case TYPE_U16: return this->newImmediate(uint16_t(x));
+ case TYPE_S32: return this->newImmediate(int32_t(x));
+ case TYPE_U32: return this->newImmediate(uint32_t(x));
+ case TYPE_S64: return this->newImmediate(int64_t(x));
+ case TYPE_U64: return this->newImmediate(uint64_t(x));
+ default: NOT_SUPPORTED; return ImmediateIndex(0);
+ }
+ return ImmediateIndex(0);
+ }
+ INLINE ImmediateIndex newFloatImmediate(float x) {
+ return this->newImmediate(x);
+ }
+ INLINE ImmediateIndex newDoubleImmediate(double x) {
+ return this->newImmediate(x);
+ }
+
+ INLINE ImmediateIndex processImm(ImmOpCode op, ImmediateIndex src, Type type) {
+ const Immediate &imm = fn->getImmediate(src);
+ const Immediate &dstImm = Immediate(op, imm, type);
+ return fn->newImmediate(dstImm);
+ }
+
+ INLINE ImmediateIndex processImm(ImmOpCode op, ImmediateIndex src0,
+ ImmediateIndex src1, Type type) {
+ const Immediate &imm0 = fn->getImmediate(src0);
+ const Immediate &imm1 = fn->getImmediate(src1);
+ const Immediate &dstImm = Immediate(op, imm0, imm1, type);
+ return fn->newImmediate(dstImm);
+ }
+
+ /*! Set an immediate value */
+ template <typename T> INLINE void setImmediate(ImmediateIndex index, T value) {
+ const Immediate imm(value);
+ fn->immediates[index] = imm;
+ }
+ /*! Create a new register holding the given value. A LOADI is pushed */
+ template <typename T> INLINE Register immReg(T value) {
+ GBE_ASSERTM(fn != NULL, "No function currently defined");
+ const Immediate imm(value);
+ const ImmediateIndex index = fn->newImmediate(imm);
+ const RegisterFamily family = getFamily(imm.getType());
+ const Register reg = this->reg(family);
+ this->LOADI(imm.getType(), reg, index);
+ return reg;
+ }
+ /*! Create a new label for the current function */
+ LabelIndex label(void);
+ /*! Append a new input register for the function */
+ void input(const std::string &name, FunctionArgument::Type type, Register reg,
+ FunctionArgument::InfoFromLLVM& info, uint32_t elemSz = 0u, uint32_t align = 0, uint8_t bti = 0);
+ /*! Append a new output register for the function */
+ void output(Register reg);
+ /*! Get the immediate value */
+ INLINE Immediate getImmediate(ImmediateIndex index) const {
+ return fn->getImmediate(index);
+ }
+ /*! Append a new tuple */
+ template <typename... Args> INLINE Tuple tuple(Args...args) {
+ GBE_ASSERTM(fn != NULL, "No function currently defined");
+ return fn->file.appendTuple(args...);
+ }
+ /*! Make a tuple from an array of register */
+ INLINE Tuple arrayTuple(const Register *reg, uint32_t regNum) {
+ GBE_ASSERTM(fn != NULL, "No function currently defined");
+ return fn->file.appendArrayTuple(reg, regNum);
+ }
+ /*! We just use variadic templates to forward instruction functions */
+#define DECL_INSN(NAME, FAMILY) \
+ template <typename... Args> INLINE void NAME(Args...args);
+#include "ir/instruction.hxx"
+#undef DECL_INSN
+ /*! Return the pointer size handled by the unit */
+ INLINE PointerSize getPointerSize(void) const {
+ return unit.getPointerSize();
+ }
+ /*! Return the family of registers that contain pointer */
+ INLINE RegisterFamily getPointerFamily(void) const {
+ return unit.getPointerFamily();
+ }
+#define DECL_THREE_SRC_INSN(NAME) \
+ INLINE void NAME(Type type, \
+ Register dst, \
+ Register src0, \
+ Register src1, \
+ Register src2) \
+ { \
+ const Tuple index = this->tuple(src0, src1, src2); \
+ this->NAME(type, dst, index); \
+ }
+ DECL_THREE_SRC_INSN(SEL);
+ DECL_THREE_SRC_INSN(I64MADSAT);
+ DECL_THREE_SRC_INSN(MAD);
+#undef DECL_THREE_SRC_INSN
+
+ /*! For all unary functions */
+ void ALU1(Opcode opcode, Type type, Register dst, Register src) {
+ const Instruction insn = gbe::ir::ALU1(opcode, type, dst, src);
+ this->append(insn);
+ }
+
+ /*! LOAD with the destinations directly specified */
+ template <typename... Args>
+ void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
+ {
+ const Tuple index = this->tuple(values...);
+ const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
+ GBE_ASSERT(valueNum > 0);
+ this->LOAD(type, index, offset, space, valueNum, dwAligned, bti);
+ }
+
+ /*! STORE with the sources directly specified */
+ template <typename... Args>
+ void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
+ {
+ const Tuple index = this->tuple(values...);
+ const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
+ GBE_ASSERT(valueNum > 0);
+ this->STORE(type, index, offset, space, valueNum, dwAligned, bti);
+ }
+ void appendSurface(uint8_t bti, Register reg) { fn->appendSurface(bti, reg); }
+
+ protected:
+ /*! A block must be started with a label */
+ void startBlock(void);
+ /*! A block must be ended with a branch */
+ void endBlock(void);
+ /*! Append the instruction in the current basic block */
+ void append(const Instruction &insn);
+ Unit &unit; //!< A unit is associated to a contect
+ Function *fn; //!< Current function we are processing
+ BasicBlock *bb; //!< Current basic block we are filling
+ static const uint8_t LABEL_IS_POINTED = 1 << 0; //!< Branch is using it
+ static const uint8_t LABEL_IS_DEFINED = 1 << 1; //!< Label is defining it
+ vector<uint8_t> *usedLabels;
+ /*! Functions can be defined recursiely */
+ struct StackElem {
+ INLINE StackElem(Function *fn, BasicBlock *bb, vector<uint8_t> *usedLabels)
+ : fn(fn), bb(bb), usedLabels(usedLabels)
+ {}
+ Function *fn; //!< Function to process
+ BasicBlock *bb; //!< Basic block currently processed
+ vector<uint8_t> *usedLabels; //!< Store all labels that are defined
+ };
+ vector<StackElem> fnStack; //!< Stack of functions still to finish
+ GBE_CLASS(Context);
+ };
+
+ // Use argument checker to assert argument value correctness
+#define DECL_INSN(NAME, FAMILY) \
+ template <typename... Args> \
+ INLINE void Context::NAME(Args...args) { \
+ GBE_ASSERTM(fn != NULL, "No function currently defined"); \
+ const Instruction insn = gbe::ir::NAME(args...); \
+ this->append(insn); \
+ }
+#include "ir/instruction.hxx"
+#undef DECL_INSN
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_CONTEXT_HPP__ */
+
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
new file mode 100644
index 0000000..85e7934
--- /dev/null
+++ b/backend/src/ir/function.cpp
@@ -0,0 +1,359 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file function.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/function.hpp"
+#include "ir/unit.hpp"
+#include "sys/map.hpp"
+
+namespace gbe {
+namespace ir {
+
+ ///////////////////////////////////////////////////////////////////////////
+ // PushLocation
+ ///////////////////////////////////////////////////////////////////////////
+
+ Register PushLocation::getRegister(void) const {
+ const Function::LocationMap &locationMap = fn.getLocationMap();
+ GBE_ASSERT(locationMap.contains(*this) == true);
+ return locationMap.find(*this)->second;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Function
+ ///////////////////////////////////////////////////////////////////////////
+
+ Function::Function(const std::string &name, const Unit &unit, Profile profile) :
+ name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0), stackSize(0)
+ {
+ initProfile(*this);
+ samplerSet = GBE_NEW(SamplerSet);
+ imageSet = GBE_NEW(ImageSet);
+ printfSet = GBE_NEW(PrintfSet);
+ }
+
+ Function::~Function(void) {
+ for (auto block : blocks) GBE_DELETE(block);
+ for (auto loop : loops) GBE_DELETE(loop);
+ for (auto arg : args) GBE_DELETE(arg);
+ }
+
+ RegisterFamily Function::getPointerFamily(void) const {
+ return unit.getPointerFamily();
+ }
+
+ void Function::addLoop(const vector<LabelIndex> &bbs, const vector<std::pair<LabelIndex, LabelIndex>> &exits) {
+ loops.push_back(GBE_NEW(Loop, bbs, exits));
+ }
+
+ void Function::checkEmptyLabels(void) {
+ // Empty label map, we map the removed label to the next label.
+ map<LabelIndex, LabelIndex> labelMap;
+ map<LabelIndex, LabelIndex> revLabelMap;
+ foreachBlock([&](BasicBlock &BB) {
+ Instruction * insn = BB.getLastInstruction();
+ if (insn->getOpcode() == OP_LABEL) {
+ GBE_ASSERTM(0, "Found empty block. ");
+ }
+ });
+ }
+
+ void Function::sortLabels(void) {
+ uint32_t last = 0;
+
+ // Compute the new labels and patch the label instruction
+ map<LabelIndex, LabelIndex> labelMap;
+ foreachInstruction([&](Instruction &insn) {
+ if (insn.getOpcode() != OP_LABEL) return;
+
+ // Create the new label
+ const Instruction newLabel = LABEL(LabelIndex(last));
+
+ // Replace the previous label instruction
+ LabelInstruction &label = cast<LabelInstruction>(insn);
+ const LabelIndex index = label.getLabelIndex();
+ labelMap.insert(std::make_pair(index, LabelIndex(last++)));
+ newLabel.replace(&insn);
+ });
+
+ // Patch all branch instructions with the new labels
+ foreachInstruction([&](Instruction &insn) {
+ if (insn.getOpcode() != OP_BRA) return;
+
+ // Get the current branch instruction
+ BranchInstruction &bra = cast<BranchInstruction>(insn);
+ const LabelIndex index = bra.getLabelIndex();
+ const LabelIndex newIndex = labelMap.find(index)->second;
+
+ // Insert the patched branch instruction
+ if (bra.isPredicated() == true) {
+ const Instruction newBra = BRA(newIndex, bra.getPredicateIndex());
+ newBra.replace(&insn);
+ } else {
+ const Instruction newBra = BRA(newIndex);
+ newBra.replace(&insn);
+ }
+ });
+
+ // fix labels for loops
+ for (auto &x : loops) {
+ for (auto &y : x->bbs)
+ y = labelMap[y];
+
+ for (auto &z : x->exits) {
+ z.first = labelMap[z.first];
+ z.second = labelMap[z.second];
+ }
+ }
+
+ // Reset the label to block mapping
+ this->labels.resize(last);
+ foreachBlock([&](BasicBlock &bb) {
+ const Instruction *first = bb.getFirstInstruction();
+ const LabelInstruction *label = cast<LabelInstruction>(first);
+ const LabelIndex index = label->getLabelIndex();
+ this->labels[index] = &bb;
+ });
+ }
+
+ LabelIndex Function::newLabel(void) {
+ GBE_ASSERTM(labels.size() < 0xffff,
+ "Too many labels are defined (65536 only are supported)");
+ const LabelIndex index(labels.size());
+ labels.push_back(NULL);
+ return index;
+ }
+
+ void Function::outImmediate(std::ostream &out, ImmediateIndex index) const {
+ GBE_ASSERT(index < immediates.size());
+ const Immediate imm = immediates[index];
+ switch (imm.getType()) {
+ case TYPE_BOOL: out << !!imm.getIntegerValue(); break;
+ case TYPE_S8:
+ case TYPE_U8:
+ case TYPE_S16:
+ case TYPE_U16:
+ case TYPE_S32:
+ case TYPE_U32:
+ case TYPE_S64: out << imm.getIntegerValue(); break;
+ case TYPE_U64: out << (uint64_t)imm.getIntegerValue(); break;
+ case TYPE_HALF: out << "half(" << imm.getIntegerValue() << ")"; break;
+ case TYPE_FLOAT: out << imm.getFloatValue(); break;
+ case TYPE_DOUBLE: out << imm.getDoubleValue(); break;
+ default:
+ GBE_ASSERT(0 && "unsupported imm type.\n");
+ }
+ }
+
+ uint32_t Function::getLargestBlockSize(void) const {
+ uint32_t insnNum = 0;
+ foreachBlock([&insnNum](const ir::BasicBlock &bb) {
+ insnNum = std::max(insnNum, uint32_t(bb.size()));
+ });
+ return insnNum;
+ }
+
+ uint32_t Function::getFirstSpecialReg(void) const {
+ return this->profile == PROFILE_OCL ? 0u : ~0u;
+ }
+
+ uint32_t Function::getSpecialRegNum(void) const {
+ return this->profile == PROFILE_OCL ? ocl::regNum : ~0u;
+ }
+
+ bool Function::isEntryBlock(const BasicBlock &bb) const {
+ if (this->blockNum() == 0)
+ return false;
+ else
+ return &bb == this->blocks[0];
+ }
+
+ const BasicBlock &Function::getTopBlock(void) const {
+ GBE_ASSERT(blockNum() > 0 && blocks[0] != NULL);
+ return *blocks[0];
+ }
+
+ const BasicBlock &Function::getBottomBlock(void) const {
+ const uint32_t n = blockNum();
+ GBE_ASSERT(n > 0 && blocks[n-1] != NULL);
+ return *blocks[n-1];
+ }
+
+ BasicBlock &Function::getBottomBlock(void) {
+ const uint32_t n = blockNum();
+ GBE_ASSERT(n > 0 && blocks[n-1] != NULL);
+ return *blocks[n-1];
+ }
+
+ const BasicBlock &Function::getBlock(LabelIndex label) const {
+ GBE_ASSERT(label < labelNum() && labels[label] != NULL);
+ return *labels[label];
+ }
+
+ const LabelInstruction *Function::getLabelInstruction(LabelIndex index) const {
+ const BasicBlock *bb = this->labels[index];
+ const Instruction *first = bb->getFirstInstruction();
+ return cast<LabelInstruction>(first);
+ }
+
+ /*! Indicate if the given register is a special one (like localID in OCL) */
+ bool Function::isSpecialReg(const Register ®) const {
+ const uint32_t ID = uint32_t(reg);
+ const uint32_t firstID = this->getFirstSpecialReg();
+ const uint32_t specialNum = this->getSpecialRegNum();
+ return ID >= firstID && ID < firstID + specialNum;
+ }
+ Register Function::getSurfaceBaseReg(uint8_t bti) const {
+ map<uint8_t, Register>::const_iterator iter = btiRegMap.find(bti);
+ GBE_ASSERT(iter != btiRegMap.end());
+ return iter->second;
+ }
+
+ void Function::appendSurface(uint8_t bti, Register reg) {
+ btiRegMap.insert(std::make_pair(bti, reg));
+ }
+
+ void Function::computeCFG(void) {
+ // Clear possible previously computed CFG and compute the direct
+ // predecessors and successors
+ BasicBlock *prev = NULL;
+ this->foreachBlock([this, &prev](BasicBlock &bb) {
+ bb.successors.clear();
+ bb.predecessors.clear();
+ if (prev != NULL) {
+ prev->nextBlock = &bb;
+ bb.prevBlock = prev;
+ }
+ prev = &bb;
+ });
+
+ // Update it. Do not forget that a branch can also jump to the next block
+ BasicBlock *jumpToNext = NULL;
+ this->foreachBlock([this, &jumpToNext](BasicBlock &bb) {
+ if (jumpToNext) {
+ jumpToNext->successors.insert(&bb);
+ bb.predecessors.insert(jumpToNext);
+ jumpToNext = NULL;
+ }
+ if (bb.size() == 0) return;
+ Instruction *last = bb.getLastInstruction();
+ if (last->isMemberOf<BranchInstruction>() == false) {
+ jumpToNext = &bb;
+ return;
+ }
+ const BranchInstruction &insn = cast<BranchInstruction>(*last);
+ if (insn.getOpcode() == OP_BRA) {
+ const LabelIndex label = insn.getLabelIndex();
+ BasicBlock *target = this->blocks[label];
+ GBE_ASSERT(target != NULL);
+ target->predecessors.insert(&bb);
+ bb.successors.insert(target);
+ if ( insn.isPredicated() == true) jumpToNext = &bb;
+ }
+ });
+ }
+
+ std::ostream &operator<< (std::ostream &out, const Function &fn)
+ {
+ out << ".decl_function " << fn.getName() << std::endl;
+ out << fn.getRegisterFile();
+ out << "## " << fn.argNum() << " input register"
+ << (fn.argNum() ? "s" : "") << " ##" << std::endl;
+ for (uint32_t i = 0; i < fn.argNum(); ++i) {
+ const FunctionArgument &input = fn.getArg(i);
+ out << "decl_input.";
+ switch (input.type) {
+ case FunctionArgument::GLOBAL_POINTER: out << "global"; break;
+ case FunctionArgument::LOCAL_POINTER: out << "local"; break;
+ case FunctionArgument::CONSTANT_POINTER: out << "constant"; break;
+ case FunctionArgument::VALUE: out << "value"; break;
+ case FunctionArgument::STRUCTURE:
+ out << "structure." << input.size;
+ break;
+ case FunctionArgument::IMAGE: out << "image"; break;
+ default: break;
+ }
+ out << " %" << input.reg << " " << input.name << std::endl;
+ }
+ out << "## " << fn.outputNum() << " output register"
+ << (fn.outputNum() ? "s" : "") << " ##" << std::endl;
+ for (uint32_t i = 0; i < fn.outputNum(); ++i)
+ out << "decl_output %" << fn.getOutput(i) << std::endl;
+ out << "## " << fn.pushedNum() << " pushed register" << std::endl;
+ const Function::PushMap &pushMap = fn.getPushMap();
+ for (const auto &pushed : pushMap) {
+ out << "decl_pushed %" << pushed.first
+ << " @{" << pushed.second.argID << ","
+ << pushed.second.offset << "}" << std::endl;
+ }
+ out << "## " << fn.blockNum() << " block"
+ << (fn.blockNum() ? "s" : "") << " ##" << std::endl;
+ fn.foreachBlock([&](const BasicBlock &bb) {
+ const_cast<BasicBlock&>(bb).foreach([&out] (const Instruction &insn) {
+ out << insn << std::endl;
+ });
+ out << std::endl;
+ });
+ out << ".end_function" << std::endl;
+ return out;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Basic Block
+ ///////////////////////////////////////////////////////////////////////////
+
+ BasicBlock::BasicBlock(Function &fn) : fn(fn) {
+ this->nextBlock = this->prevBlock = NULL;
+ }
+
+ BasicBlock::~BasicBlock(void) {
+ this->foreach([this] (Instruction &insn) {
+ this->fn.deleteInstruction(&insn);
+ });
+ }
+
+ void BasicBlock::append(Instruction &insn) {
+ insn.setParent(this);
+ this->push_back(&insn);
+ }
+
+ Instruction *BasicBlock::getFirstInstruction(void) const {
+ GBE_ASSERT(this->begin() != this->end());
+ const Instruction &insn = *this->begin();
+ return const_cast<Instruction*>(&insn);
+ }
+
+ Instruction *BasicBlock::getLastInstruction(void) const {
+ GBE_ASSERT(this->begin() != this->end());
+ const Instruction &insn = *(--this->end());
+ return const_cast<Instruction*>(&insn);
+ }
+
+ LabelIndex BasicBlock::getLabelIndex(void) const {
+ const Instruction *first = this->getFirstInstruction();
+ const LabelInstruction *label = cast<LabelInstruction>(first);
+ return label->getLabelIndex();
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
new file mode 100644
index 0000000..9aa1e8d
--- /dev/null
+++ b/backend/src/ir/function.hpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file function.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_FUNCTION_HPP__
+#define __GBE_IR_FUNCTION_HPP__
+
+#include "ir/immediate.hpp"
+#include "ir/register.hpp"
+#include "ir/instruction.hpp"
+#include "ir/profile.hpp"
+#include "ir/sampler.hpp"
+#include "ir/printf.hpp"
+#include "ir/image.hpp"
+#include "sys/vector.hpp"
+#include "sys/set.hpp"
+#include "sys/map.hpp"
+#include "sys/alloc.hpp"
+
+#include <ostream>
+
+namespace gbe {
+namespace ir {
+
+ /*! Commonly used in the CFG */
+ typedef set<BasicBlock*> BlockSet;
+ class Unit; // Function belongs to a unit
+
+ /*! Function basic blocks really belong to a function since:
+ * 1 - registers used in the basic blocks belongs to the function register
+ * file
+ * 2 - branches point to basic blocks of the same function
+ */
+ class BasicBlock : public NonCopyable, public intrusive_list<Instruction>
+ {
+ public:
+ /*! Empty basic block */
+ BasicBlock(Function &fn);
+ /*! Releases all the instructions */
+ ~BasicBlock(void);
+ /*! Append a new instruction at the end of the stream */
+ void append(Instruction &insn);
+ /*! Get the parent function */
+ Function &getParent(void) { return fn; }
+ const Function &getParent(void) const { return fn; }
+ /*! Get the next and previous allocated block */
+ BasicBlock *getNextBlock(void) const { return this->nextBlock; }
+ BasicBlock *getPrevBlock(void) const { return this->prevBlock; }
+ /*! Get / set the first and last instructions */
+ Instruction *getFirstInstruction(void) const;
+ Instruction *getLastInstruction(void) const;
+ /*! Get successors and predecessors */
+ const BlockSet &getSuccessorSet(void) const { return successors; }
+ const BlockSet &getPredecessorSet(void) const { return predecessors; }
+ /*! Get the label index of this block */
+ LabelIndex getLabelIndex(void) const;
+ /*! Apply the given functor on all instructions */
+ template <typename T>
+ INLINE void foreach(const T &functor) {
+ auto it = this->begin();
+ while (it != this->end()) {
+ auto curr = it++;
+ functor(*curr);
+ }
+ }
+ set <Register> undefPhiRegs;
+ set <Register> definedPhiRegs;
+ private:
+ friend class Function; //!< Owns the basic blocks
+ BlockSet predecessors; //!< Incoming blocks
+ BlockSet successors; //!< Outgoing blocks
+ BasicBlock *nextBlock; //!< Block allocated just after this one
+ BasicBlock *prevBlock; //!< Block allocated just before this one
+ Function &fn; //!< Function the block belongs to
+ GBE_CLASS(BasicBlock);
+ };
+
+ /*! In fine, function input arguments can be pushed from the constant
+ * buffer if they are structures. Other arguments can be images (textures)
+ * and will also require special treatment.
+ */
+ struct FunctionArgument {
+ enum Type {
+ GLOBAL_POINTER = 0, // __global
+ CONSTANT_POINTER = 1, // __constant
+ LOCAL_POINTER = 2, // __local
+ VALUE = 3, // int, float
+ STRUCTURE = 4, // struct foo
+ IMAGE = 5, // image*d_t
+ SAMPLER = 6
+ };
+
+ struct InfoFromLLVM { // All the info about passed by llvm, using -cl-kernel-arg-info
+ uint32_t addrSpace;
+ std::string typeName;
+ std::string accessQual;
+ std::string typeQual;
+ std::string argName; // My different from arg->getName()
+ };
+
+ /*! Create a function input argument */
+ INLINE FunctionArgument(Type type, Register reg, uint32_t size, const std::string &name, uint32_t align, InfoFromLLVM& info, uint8_t bti) :
+ type(type), reg(reg), size(size), align(align), name(name), info(info), bti(bti) { }
+
+ Type type; //!< Gives the type of argument we have
+ Register reg; //!< Holds the argument
+ uint32_t size; //!< == sizeof(void*) for ptr, sizeof(elem) for the rest
+ uint32_t align; //!< address alignment for the argument
+ const std::string name; //!< Holds the function name for IR output
+ InfoFromLLVM info; //!< Holds the llvm passed info
+ uint8_t bti; //!< binding table index
+ GBE_STRUCT(FunctionArgument); // Use custom allocator
+ };
+
+ /*! Maps the pushed register to the function argument */
+ struct PushLocation {
+ INLINE PushLocation(const Function &fn, uint32_t argID, uint32_t offset) :
+ fn(fn), argID(argID), offset(offset) {}
+ /*! Get the pushed virtual register */
+ Register getRegister(void) const;
+ const Function &fn; //!< Function it belongs to
+ uint32_t argID; //!< Function argument
+ uint32_t offset; //!< Offset in the function argument
+ GBE_STRUCT(PushLocation); // Use custom allocator
+ };
+
+ /*! For maps and sets */
+ INLINE bool operator< (const PushLocation &arg0, const PushLocation &arg1) {
+ if (arg0.argID != arg1.argID) return arg0.argID < arg1.argID;
+ return arg0.offset < arg1.offset;
+ }
+
+ /*! CFG loops */
+ struct Loop : public NonCopyable
+ {
+ public:
+ Loop(const vector<LabelIndex> &in, const vector<std::pair<LabelIndex, LabelIndex>> &exit) :
+ bbs(in), exits(exit) {}
+ vector<LabelIndex> bbs;
+ vector<std::pair<LabelIndex, LabelIndex>> exits;
+ GBE_STRUCT(Loop);
+ };
+
+ /*! A function is :
+ * - a register file
+ * - a set of basic block layout into a CGF
+ * - input arguments
+ */
+ class Function : public NonCopyable
+ {
+ public:
+ /*! Map of all pushed registers */
+ typedef map<Register, PushLocation> PushMap;
+ /*! Map of all pushed location (i.e. part of function argument) */
+ typedef map<PushLocation, Register> LocationMap;
+ /*! Create an empty function */
+ Function(const std::string &name, const Unit &unit, Profile profile = PROFILE_OCL);
+ /*! Release everything *including* the basic block pointers */
+ ~Function(void);
+ /*! Get the function profile */
+ INLINE Profile getProfile(void) const { return profile; }
+ /*! Get a new valid register */
+ INLINE Register newRegister(RegisterFamily family, bool uniform = false) {
+ return this->file.append(family, uniform);
+ }
+ /*! Get the function name */
+ const std::string &getName(void) const { return name; }
+ /*! When set, we do not have choice any more in the back end for it */
+ INLINE void setSimdWidth(uint32_t width) { simdWidth = width; }
+ /*! Get the SIMD width (0 if not forced) */
+ uint32_t getSimdWidth(void) const { return simdWidth; }
+ /*! Extract the register from the register file */
+ INLINE RegisterData getRegisterData(Register reg) const { return file.get(reg); }
+ /*! set a register to uniform or nonuniform type. */
+ INLINE void setRegisterUniform(Register reg, bool uniform) { file.setUniform(reg, uniform); }
+ /*! return true if the specified regsiter is uniform type */
+ INLINE bool isUniformRegister(Register reg) { return file.isUniform(reg); }
+ /*! Get the register family from the register itself */
+ INLINE RegisterFamily getRegisterFamily(Register reg) const {
+ return this->getRegisterData(reg).family;
+ }
+ /*! Get the register from the tuple vector */
+ INLINE Register getRegister(Tuple ID, uint32_t which) const {
+ return file.get(ID, which);
+ }
+ /*! Set the register from the tuple vector */
+ INLINE void setRegister(Tuple ID, uint32_t which, Register reg) {
+ file.set(ID, which, reg);
+ }
+ /*! Get the register file */
+ INLINE const RegisterFile &getRegisterFile(void) const { return file; }
+ /*! Get the given value ie immediate from the function */
+ INLINE const Immediate &getImmediate(ImmediateIndex ID) const {
+ return immediates[ID];
+ }
+ /*! Create a new immediate and returns its index */
+ INLINE ImmediateIndex newImmediate(const Immediate &imm) {
+ const ImmediateIndex index(this->immediateNum());
+ this->immediates.push_back(imm);
+ return index;
+ }
+ /*! Fast allocation / deallocation of instructions */
+ DECL_POOL(Instruction, insnPool);
+ /*! Get input argument */
+ INLINE const FunctionArgument &getArg(uint32_t ID) const {
+ GBE_ASSERT(args[ID] != NULL);
+ return *args[ID];
+ }
+ INLINE FunctionArgument &getArg(uint32_t ID) {
+ GBE_ASSERT(args[ID] != NULL);
+ return *args[ID];
+ }
+
+ /*! Get arg ID. */
+ INLINE int32_t getArgID(FunctionArgument *requestArg) {
+ for (uint32_t ID = 0; ID < args.size(); ID++)
+ {
+ if ( args[ID] == requestArg )
+ return ID;
+ }
+ GBE_ASSERTM(0, "Failed to get a valid argument ID.");
+ return -1;
+ }
+
+ /*! Get the number of pushed registers */
+ INLINE uint32_t pushedNum(void) const { return pushMap.size(); }
+ /*! Get the pushed data location for the given register */
+ INLINE const PushLocation *getPushLocation(Register reg) const {
+ auto it = pushMap.find(reg);
+ if (it == pushMap.end())
+ return NULL;
+ else
+ return &it->second;
+ }
+ /*! Get the map of pushed registers */
+ const PushMap &getPushMap(void) const { return this->pushMap; }
+ /*! Get the map of pushed registers */
+ const LocationMap &getLocationMap(void) const { return this->locationMap; }
+ /*! Get input argument from the register (linear research). Return NULL if
+ * this is not an input argument
+ */
+ INLINE const FunctionArgument *getArg(const Register ®) const {
+ for (auto arg : args) if (arg->reg == reg) return arg;
+ return NULL;
+ }
+
+ INLINE FunctionArgument *getArg(const Register ®) {
+ for (auto arg : args) if (arg->reg == reg) return arg;
+ return NULL;
+ }
+
+ /*! Get output register */
+ INLINE Register getOutput(uint32_t ID) const { return outputs[ID]; }
+ /*! Get the argument location for the pushed register */
+ INLINE const PushLocation &getPushLocation(Register reg) {
+ GBE_ASSERT(pushMap.contains(reg) == true);
+ return pushMap.find(reg)->second;
+ }
+ /*! Says if this is the top basic block (entry point) */
+ bool isEntryBlock(const BasicBlock &bb) const;
+ /*! Get function the entry point block */
+ const BasicBlock &getTopBlock(void) const;
+ /*! Get the last block */
+ const BasicBlock &getBottomBlock(void) const;
+ /*! Get the last block */
+ BasicBlock &getBottomBlock(void);
+ /*! Get block from its label */
+ const BasicBlock &getBlock(LabelIndex label) const;
+ /*! Get the label instruction from its label index */
+ const LabelInstruction *getLabelInstruction(LabelIndex index) const;
+ /*! Return the number of instructions of the largest basic block */
+ uint32_t getLargestBlockSize(void) const;
+ /*! Get the first index of the special registers and number of them */
+ uint32_t getFirstSpecialReg(void) const;
+ uint32_t getSpecialRegNum(void) const;
+ /*! Indicate if the given register is a special one (like localID in OCL) */
+ bool isSpecialReg(const Register ®) const;
+ /*! Create a new label (still not bound to a basic block) */
+ LabelIndex newLabel(void);
+ /*! Create the control flow graph */
+ void computeCFG(void);
+ /*! Sort labels in increasing orders (top block has the smallest label) */
+ void sortLabels(void);
+ /*! check empty Label. */
+ void checkEmptyLabels(void);
+ /*! Get the pointer family */
+ RegisterFamily getPointerFamily(void) const;
+ /*! Number of registers in the register file */
+ INLINE uint32_t regNum(void) const { return file.regNum(); }
+ /*! Number of register tuples in the register file */
+ INLINE uint32_t tupleNum(void) const { return file.tupleNum(); }
+ /*! Number of labels in the function */
+ INLINE uint32_t labelNum(void) const { return labels.size(); }
+ /*! Number of immediate values in the function */
+ INLINE uint32_t immediateNum(void) const { return immediates.size(); }
+ /*! Get the number of argument register */
+ INLINE uint32_t argNum(void) const { return args.size(); }
+ /*! Get the number of output register */
+ INLINE uint32_t outputNum(void) const { return outputs.size(); }
+ /*! Number of blocks in the function */
+ INLINE uint32_t blockNum(void) const { return blocks.size(); }
+ /*! Output an immediate value in a stream */
+ void outImmediate(std::ostream &out, ImmediateIndex index) const;
+ /*! Apply the given functor on all basic blocks */
+ template <typename T>
+ INLINE void foreachBlock(const T &functor) const {
+ for (auto block : blocks) functor(*block);
+ }
+ /*! Apply the given functor on all instructions */
+ template <typename T>
+ INLINE void foreachInstruction(const T &functor) const {
+ for (auto block : blocks) block->foreach(functor);
+ }
+ /*! Does it use SLM */
+ INLINE bool getUseSLM(void) const { return this->useSLM; }
+ /*! Change the SLM config for the function */
+ INLINE bool setUseSLM(bool useSLM) { return this->useSLM = useSLM; }
+ /*! get SLM size needed for local variable inside kernel function */
+ INLINE uint32_t getSLMSize(void) const { return this->slmSize; }
+ /*! set slm size needed for local variable inside kernel function */
+ INLINE void setSLMSize(uint32_t size) { this->slmSize = size; }
+ /*! Get sampler set in this function */
+ SamplerSet* getSamplerSet(void) const {return samplerSet; }
+ /*! Get image set in this function */
+ ImageSet* getImageSet(void) const {return imageSet; }
+ /*! Get printf set in this function */
+ PrintfSet* getPrintfSet(void) const {return printfSet; }
+ /*! Set required work group size. */
+ void setCompileWorkGroupSize(size_t x, size_t y, size_t z) { compileWgSize[0] = x; compileWgSize[1] = y; compileWgSize[2] = z; }
+ /*! Get required work group size. */
+ const size_t *getCompileWorkGroupSize(void) const {return compileWgSize;}
+ /*! Set function attributes string. */
+ void setFunctionAttributes(const std::string& functionAttributes) { this->functionAttributes= functionAttributes; }
+ /*! Get function attributes string. */
+ const std::string& getFunctionAttributes(void) const {return this->functionAttributes;}
+ /*! Get stack size. */
+ INLINE const uint32_t getStackSize(void) const { return this->stackSize; }
+ /*! Push stack size. */
+ INLINE void pushStackSize(uint32_t step) { this->stackSize += step; }
+ /*! add the loop info for later liveness analysis */
+ void addLoop(const vector<LabelIndex> &bbs, const vector<std::pair<LabelIndex, LabelIndex>> &exits);
+ INLINE const vector<Loop * > &getLoops() { return loops; }
+ /*! Get surface starting address register from bti */
+ Register getSurfaceBaseReg(uint8_t bti) const;
+ void appendSurface(uint8_t bti, Register reg);
+ private:
+ friend class Context; //!< Can freely modify a function
+ std::string name; //!< Function name
+ const Unit &unit; //!< Function belongs to this unit
+ vector<FunctionArgument*> args; //!< Input registers of the function
+ vector<Register> outputs; //!< Output registers of the function
+ vector<BasicBlock*> labels; //!< Each label points to a basic block
+ vector<Immediate> immediates; //!< All immediate values in the function
+ vector<BasicBlock*> blocks; //!< All chained basic blocks
+ vector<Loop *> loops; //!< Loops info of the function
+ map<uint8_t, Register> btiRegMap;//!< map bti to surface base address
+ RegisterFile file; //!< RegisterDatas used by the instructions
+ Profile profile; //!< Current function profile
+ PushMap pushMap; //!< Pushed function arguments (reg->loc)
+ LocationMap locationMap; //!< Pushed function arguments (loc->reg)
+ uint32_t simdWidth; //!< 8 or 16 if forced, 0 otherwise
+ bool useSLM; //!< Is SLM required?
+ uint32_t slmSize; //!< local variable size inside kernel function
+ uint32_t stackSize; //!< stack size for private memory.
+ SamplerSet *samplerSet; //!< samplers used in this function.
+ ImageSet* imageSet; //!< Image set in this function's arguments..
+ PrintfSet *printfSet; //!< printfSet store the printf info.
+ size_t compileWgSize[3]; //!< required work group size specified by
+ // __attribute__((reqd_work_group_size(X, Y, Z))).
+ std::string functionAttributes; //!< function attribute qualifiers combined.
+ GBE_CLASS(Function); //!< Use custom allocator
+ };
+
+ /*! Output the function string in the given stream */
+ std::ostream &operator<< (std::ostream &out, const Function &fn);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_FUNCTION_HPP__ */
+
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
new file mode 100644
index 0000000..a9b1563
--- /dev/null
+++ b/backend/src/ir/image.cpp
@@ -0,0 +1,278 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file image.cpp
+ *
+ */
+#include "image.hpp"
+#include "context.hpp"
+#include "ocl_common_defines.h"
+#include "backend/program.h"
+
+namespace gbe {
+namespace ir {
+
+ static uint32_t getInfoOffset4Type(struct ImageInfo *imageInfo, int type)
+ {
+ switch (type) {
+ case GetImageInfoInstruction::WIDTH: return imageInfo->wSlot;
+ case GetImageInfoInstruction::HEIGHT: return imageInfo->hSlot;
+ case GetImageInfoInstruction::DEPTH: return imageInfo->depthSlot;
+ case GetImageInfoInstruction::CHANNEL_DATA_TYPE: return imageInfo->dataTypeSlot;
+ case GetImageInfoInstruction::CHANNEL_ORDER: return imageInfo->channelOrderSlot;
+ default:
+ NOT_IMPLEMENTED;
+ }
+ return 0;
+ }
+
+ static uint32_t setInfoOffset4Type(struct ImageInfo *imageInfo, int type, uint32_t offset)
+ {
+ switch (type) {
+ case GetImageInfoInstruction::WIDTH: imageInfo->wSlot = offset; break;
+ case GetImageInfoInstruction::HEIGHT: imageInfo->hSlot = offset; break;
+ case GetImageInfoInstruction::DEPTH: imageInfo->depthSlot = offset; break;
+ case GetImageInfoInstruction::CHANNEL_DATA_TYPE: imageInfo->dataTypeSlot = offset; break;
+ case GetImageInfoInstruction::CHANNEL_ORDER: imageInfo->channelOrderSlot = offset; break;
+ default:
+ NOT_IMPLEMENTED;
+ }
+ return 0;
+ }
+
+ void ImageSet::appendInfo(ImageInfoKey key, uint32_t offset)
+ {
+ auto it = indexMap.find(key.index);
+ assert(it != indexMap.end());
+ struct ImageInfo *imageInfo = it->second;
+ setInfoOffset4Type(imageInfo, key.type, offset);
+ }
+
+ void ImageSet::clearInfo()
+ {
+ struct ImageInfo *imageInfo;
+ for(auto &it : indexMap) {
+ imageInfo = it.second;
+ imageInfo->wSlot = -1;
+ imageInfo->hSlot = -1;
+ imageInfo->depthSlot = -1;
+ imageInfo->dataTypeSlot = -1;
+ imageInfo->channelOrderSlot = -1;
+ }
+ }
+
+ const int32_t ImageSet::getInfoOffset(ImageInfoKey key) const
+ {
+ auto it = indexMap.find(key.index);
+ if (it == indexMap.end())
+ return -1;
+ struct ImageInfo *imageInfo = it->second;
+ return getInfoOffset4Type(imageInfo, key.type);
+ }
+
+ const uint32_t ImageSet::getIdx(const Register imageReg) const
+ {
+ auto it = regMap.find(imageReg);
+ GBE_ASSERT(it != regMap.end());
+ return it->second->idx;
+ }
+
+ void ImageSet::getData(struct ImageInfo *imageInfos) const {
+ int id = 0;
+ for(auto &it : regMap)
+ imageInfos[id++] = *it.second;
+ }
+
+ ImageSet::~ImageSet() {
+ for(auto &it : regMap)
+ GBE_DELETE(it.second);
+ }
+
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+ /*! Implements the serialization. */
+ size_t ImageSet::serializeToBin(std::ostream& outs) {
+ size_t ret_size = 0;
+
+ OUT_UPDATE_SZ(magic_begin);
+
+ OUT_UPDATE_SZ(regMap.size());
+ for (auto iter : regMap) {
+ OUT_UPDATE_SZ(iter.first);
+ OUT_UPDATE_SZ(iter.second->arg_idx);
+ OUT_UPDATE_SZ(iter.second->idx);
+ OUT_UPDATE_SZ(iter.second->wSlot);
+ OUT_UPDATE_SZ(iter.second->hSlot);
+ OUT_UPDATE_SZ(iter.second->depthSlot);
+ OUT_UPDATE_SZ(iter.second->dataTypeSlot);
+ OUT_UPDATE_SZ(iter.second->channelOrderSlot);
+ OUT_UPDATE_SZ(iter.second->dimOrderSlot);
+ }
+
+ OUT_UPDATE_SZ(indexMap.size());
+ for (auto iter : indexMap) {
+ OUT_UPDATE_SZ(iter.first);
+ OUT_UPDATE_SZ(iter.second->arg_idx);
+ OUT_UPDATE_SZ(iter.second->idx);
+ OUT_UPDATE_SZ(iter.second->wSlot);
+ OUT_UPDATE_SZ(iter.second->hSlot);
+ OUT_UPDATE_SZ(iter.second->depthSlot);
+ OUT_UPDATE_SZ(iter.second->dataTypeSlot);
+ OUT_UPDATE_SZ(iter.second->channelOrderSlot);
+ OUT_UPDATE_SZ(iter.second->dimOrderSlot);
+ }
+
+ OUT_UPDATE_SZ(magic_end);
+ OUT_UPDATE_SZ(ret_size);
+
+ return ret_size;
+ }
+
+ size_t ImageSet::deserializeFromBin(std::istream& ins) {
+ size_t total_size = 0;
+ uint32_t magic;
+ size_t image_map_sz = 0;
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_begin)
+ return 0;
+
+ IN_UPDATE_SZ(image_map_sz); //regMap
+ for (size_t i = 0; i < image_map_sz; i++) {
+ ir::Register reg;
+ ImageInfo *img_info = GBE_NEW(struct ImageInfo);;
+
+ IN_UPDATE_SZ(reg);
+ IN_UPDATE_SZ(img_info->arg_idx);
+ IN_UPDATE_SZ(img_info->idx);
+ IN_UPDATE_SZ(img_info->wSlot);
+ IN_UPDATE_SZ(img_info->hSlot);
+ IN_UPDATE_SZ(img_info->depthSlot);
+ IN_UPDATE_SZ(img_info->dataTypeSlot);
+ IN_UPDATE_SZ(img_info->channelOrderSlot);
+ IN_UPDATE_SZ(img_info->dimOrderSlot);
+
+ regMap.insert(std::make_pair(reg, img_info));
+ }
+
+ IN_UPDATE_SZ(image_map_sz); //indexMap
+ for (uint32_t i = 0; i < image_map_sz; i++) {
+ uint32_t index;
+ ImageInfo *img_info = GBE_NEW(struct ImageInfo);;
+
+ IN_UPDATE_SZ(index);
+ IN_UPDATE_SZ(img_info->arg_idx);
+ IN_UPDATE_SZ(img_info->idx);
+ IN_UPDATE_SZ(img_info->wSlot);
+ IN_UPDATE_SZ(img_info->hSlot);
+ IN_UPDATE_SZ(img_info->depthSlot);
+ IN_UPDATE_SZ(img_info->dataTypeSlot);
+ IN_UPDATE_SZ(img_info->channelOrderSlot);
+ IN_UPDATE_SZ(img_info->dimOrderSlot);
+
+ indexMap.insert(std::make_pair(img_info->idx, img_info));
+ }
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_end)
+ return 0;
+
+ size_t total_bytes;
+ IN_UPDATE_SZ(total_bytes);
+ if (total_bytes + sizeof(total_size) != total_size)
+ return 0;
+
+ return total_size;
+ }
+
+ void ImageSet::printStatus(int indent, std::ostream& outs) {
+ using namespace std;
+ string spaces = indent_to_str(indent);
+ string spaces_nl = indent_to_str(indent + 4);
+
+ outs << spaces << "------------ Begin ImageSet ------------" << "\n";
+
+ outs << spaces_nl << " ImageSet Map: [reg, arg_idx, idx, wSlot, hSlot, depthSlot, "
+ "dataTypeSlot, channelOrderSlot, dimOrderSlot]\n";
+ outs << spaces_nl << " regMap size: " << regMap.size() << "\n";
+ for (auto iter : regMap) {
+ outs << spaces_nl << " [" << iter.first << ", "
+ << iter.second->arg_idx << ", "
+ << iter.second->idx << ", "
+ << iter.second->wSlot << ", "
+ << iter.second->hSlot << ", "
+ << iter.second->depthSlot << ", "
+ << iter.second->dataTypeSlot << ", "
+ << iter.second->channelOrderSlot << ", "
+ << iter.second->dimOrderSlot << "]" << "\n";
+ }
+
+ outs << spaces_nl << " ImageSet Map: [index, arg_idx, idx, wSlot, hSlot, depthSlot, "
+ "dataTypeSlot, channelOrderSlot, dimOrderSlot]\n";
+ outs << spaces_nl << " regMap size: " << indexMap.size() << "\n";
+ for (auto iter : indexMap) {
+ outs << spaces_nl << " [" << iter.first << ", "
+ << iter.second->arg_idx << ", "
+ << iter.second->idx << ", "
+ << iter.second->wSlot << ", "
+ << iter.second->hSlot << ", "
+ << iter.second->depthSlot << ", "
+ << iter.second->dataTypeSlot << ", "
+ << iter.second->channelOrderSlot << ", "
+ << iter.second->dimOrderSlot << ", " << "\n";
+ }
+
+ outs << spaces << "------------- End ImageSet -------------" << "\n";
+ }
+
+#ifdef GBE_COMPILER_AVAILABLE
+ Register ImageSet::appendInfo(ImageInfoKey key, Context *ctx)
+ {
+ auto it = infoRegMap.find(key.data);
+ if (it != infoRegMap.end())
+ return it->second;
+ Register reg = ctx->reg(FAMILY_DWORD);
+ infoRegMap.insert(std::make_pair(key.data, reg));
+ return reg;
+ }
+
+ void ImageSet::append(Register imageReg, Context *ctx, uint8_t bti)
+ {
+ ir::FunctionArgument *arg = ctx->getFunction().getArg(imageReg);
+ GBE_ASSERTM(arg && arg->type == ir::FunctionArgument::IMAGE, "Append an invalid reg to image set.");
+ GBE_ASSERTM(regMap.find(imageReg) == regMap.end(), "Append the same image reg twice.");
+
+ int32_t id = ctx->getFunction().getArgID(arg);
+ struct ImageInfo *imageInfo = GBE_NEW(struct ImageInfo);
+ imageInfo->arg_idx = id;
+ imageInfo->idx = bti;
+ imageInfo->wSlot = -1;
+ imageInfo->hSlot = -1;
+ imageInfo->depthSlot = -1;
+ imageInfo->dataTypeSlot = -1;
+ imageInfo->channelOrderSlot = -1;
+ imageInfo->dimOrderSlot = -1;
+ regMap.insert(std::make_pair(imageReg, imageInfo));
+ indexMap.insert(std::make_pair(imageInfo->idx, imageInfo));
+ }
+#endif
+
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/image.hpp b/backend/src/ir/image.hpp
new file mode 100644
index 0000000..b31c7da
--- /dev/null
+++ b/backend/src/ir/image.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file image.hpp
+ *
+ */
+#ifndef __GBE_IR_IMAGE_HPP__
+#define __GBE_IR_IMAGE_HPP__
+
+#include "ir/register.hpp"
+#include "ir/instruction.hpp" // for ImageInfoKey
+#include "sys/map.hpp"
+
+extern "C" {
+ struct ImageInfo;
+}
+
+namespace gbe {
+namespace ir {
+
+ class Context;
+ /*! An image set is a set of images which are defined in kernel args.
+ * We use this set to gather the images here and allocate a unique index
+ * for each individual image. And that individual image could be used
+ * at backend to identify this image's location.
+ */
+ class ImageSet : public Serializable
+ {
+ public:
+ /*! Append an image argument. */
+ void append(Register imageReg, Context *ctx, uint8_t bti);
+ /*! Append an image info slot. */
+ void appendInfo(ImageInfoKey key, uint32_t offset);
+ /*! Append an image info register. */
+ Register appendInfo(ImageInfoKey, Context *ctx);
+ /*! clear image info. */
+ void clearInfo();
+ /*! Get the image's index(actual location). */
+ const uint32_t getIdx(const Register imageReg) const;
+ size_t getDataSize(void) { return regMap.size(); }
+ size_t getDataSize(void) const { return regMap.size(); }
+
+ const int32_t getInfoOffset(ImageInfoKey key) const;
+ void getData(struct ImageInfo *imageInfos) const;
+ void operator = (const ImageSet& other) {
+ regMap.insert(other.regMap.begin(), other.regMap.end());
+ }
+
+ bool empty() const { return regMap.empty(); }
+
+ ImageSet(const ImageSet& other) : regMap(other.regMap.begin(), other.regMap.end()) { }
+ ImageSet() {}
+ ~ImageSet();
+
+ static const uint32_t magic_begin = TO_MAGIC('I', 'M', 'A', 'G');
+ static const uint32_t magic_end = TO_MAGIC('G', 'A', 'M', 'I');
+
+ /* format:
+ magic_begin |
+ regMap_size |
+ element_1 |
+ ........ |
+ element_n |
+ indexMap_size |
+ element_1 |
+ ........ |
+ element_n |
+ magic_end |
+ total_size
+ */
+
+ /*! Implements the serialization. */
+ virtual size_t serializeToBin(std::ostream& outs);
+ virtual size_t deserializeFromBin(std::istream& ins);
+ virtual void printStatus(int indent, std::ostream& outs);
+
+ private:
+ map<Register, struct ImageInfo *> regMap;
+ map<uint32_t, struct ImageInfo *> indexMap;
+ map<uint16_t, Register> infoRegMap;
+ GBE_CLASS(ImageSet);
+ };
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_IMAGE_HPP__ */
diff --git a/backend/src/ir/immediate.cpp b/backend/src/ir/immediate.cpp
new file mode 100644
index 0000000..3a6b9a2
--- /dev/null
+++ b/backend/src/ir/immediate.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "immediate.hpp"
+
+using namespace gbe;
+using namespace ir;
+
+#define SCALAR_SAME_TYPE_ASSERT() \
+ GBE_ASSERT(this->getType() == right.getType() && \
+ this->getElemNum() == right.getElemNum() && \
+ this->getElemNum() == 1 && \
+ this->getType() != TYPE_BOOL);
+
+#define DECLAR_BINARY_ALL_TYPE_OP(OP) \
+ Immediate Immediate::operator OP (const Immediate &right) const { \
+ SCALAR_SAME_TYPE_ASSERT(); \
+ switch (this->getType()) { \
+ default: \
+ GBE_ASSERT(0); \
+ case TYPE_S8: return Immediate(*this->data.s8 OP *right.data.s8); \
+ case TYPE_U8: return Immediate(*this->data.u8 OP *right.data.u8); \
+ case TYPE_S16: return Immediate(*this->data.s16 OP *right.data.s16); \
+ case TYPE_U16: return Immediate(*this->data.u16 OP *right.data.u16); \
+ case TYPE_S32: return Immediate(*this->data.s32 OP *right.data.s32); \
+ case TYPE_U32: return Immediate(*this->data.u32 OP *right.data.u32); \
+ case TYPE_S64: return Immediate(*this->data.s64 OP *right.data.s64); \
+ case TYPE_U64: return Immediate(*this->data.u64 OP *right.data.u64); \
+ case TYPE_FLOAT: return Immediate(*this->data.f32 OP *right.data.f32); \
+ case TYPE_DOUBLE: return Immediate(*this->data.f64 OP *right.data.f64); \
+ }\
+ return *this;\
+ }
+
+ DECLAR_BINARY_ALL_TYPE_OP(+)
+ DECLAR_BINARY_ALL_TYPE_OP(-)
+ DECLAR_BINARY_ALL_TYPE_OP(*)
+ DECLAR_BINARY_ALL_TYPE_OP(/)
+
+#undef DECLAR_BINARY_ALL_TYPE_OP
+
+#define DECLAR_BINARY_INT_TYPE_OP(OP) \
+ Immediate Immediate::operator OP (const Immediate &right) const { \
+ SCALAR_SAME_TYPE_ASSERT(); \
+ switch (this->getType()) { \
+ default: \
+ GBE_ASSERT(0); \
+ case TYPE_S8: return Immediate(*this->data.s8 OP *right.data.s8); \
+ case TYPE_U8: return Immediate(*this->data.u8 OP *right.data.u8); \
+ case TYPE_S16: return Immediate(*this->data.s16 OP *right.data.s16); \
+ case TYPE_U16: return Immediate(*this->data.u16 OP *right.data.u16); \
+ case TYPE_S32: return Immediate(*this->data.s32 OP *right.data.s32); \
+ case TYPE_U32: return Immediate(*this->data.u32 OP *right.data.u32); \
+ case TYPE_S64: return Immediate(*this->data.s64 OP *right.data.s64); \
+ case TYPE_U64: return Immediate(*this->data.u64 OP *right.data.u64); \
+ }\
+ return *this;\
+ }
+ DECLAR_BINARY_INT_TYPE_OP(%)
+ DECLAR_BINARY_INT_TYPE_OP(&)
+ DECLAR_BINARY_INT_TYPE_OP(|)
+ DECLAR_BINARY_INT_TYPE_OP(^)
+#undef DECLAR_BINARY_INT_TYPE_OP
+
+
+#define DECLAR_BINARY_ASHIFT_OP(OP) \
+ Immediate Immediate::operator OP (const Immediate &right) const { \
+ GBE_ASSERT(this->getType() > TYPE_BOOL && this->getType() <= TYPE_U64); \
+ int32_t shift = right.getIntegerValue(); \
+ if (shift == 0) \
+ return *this; \
+ else \
+ switch (this->getType()) { \
+ default: \
+ GBE_ASSERT(0); \
+ case TYPE_S8: return Immediate((*this->data.s8 OP shift)); \
+ case TYPE_U8: return Immediate((*this->data.u8 OP shift)); \
+ case TYPE_S16: return Immediate((*this->data.s16 OP shift)); \
+ case TYPE_U16: return Immediate((*this->data.u16 OP shift)); \
+ case TYPE_S32: return Immediate((*this->data.s32 OP shift)); \
+ case TYPE_U32: return Immediate((*this->data.u32 OP shift)); \
+ case TYPE_S64: return Immediate((*this->data.s64 OP shift)); \
+ case TYPE_U64: return Immediate((*this->data.u64 OP shift)); \
+ } \
+ }
+
+ DECLAR_BINARY_ASHIFT_OP(>>)
+ DECLAR_BINARY_ASHIFT_OP(<<)
+
+#undef DECLAR_BINARY_ASHIFT_OP
+ Immediate Immediate::lshr (const Immediate &left, const Immediate &right) {
+ GBE_ASSERT(left.getType() > TYPE_BOOL && left.getType() <= TYPE_U64);
+ int32_t shift = right.getIntegerValue();
+ if (shift == 0)
+ return left;
+ else
+ switch (left.getType()) {
+ default:
+ GBE_ASSERT(0);
+ case TYPE_S8:
+ case TYPE_U8: return Immediate((*left.data.u8 >> shift));
+ case TYPE_S16:
+ case TYPE_U16: return Immediate((*left.data.u16 >> shift));
+ case TYPE_S32:
+ case TYPE_U32: return Immediate((*left.data.u32 >> shift));
+ case TYPE_S64:
+ case TYPE_U64: return Immediate((*left.data.u64 >> shift));
+ }
+ }
+
+ Immediate::Immediate(ImmOpCode op, const Immediate &left, const Immediate &right, Type dstType) {
+ switch (op) {
+ default:
+ GBE_ASSERT(0 && "unsupported imm op\n");
+ case IMM_ADD: *this = left + right; break;
+ case IMM_SUB: *this = left - right; break;
+ case IMM_MUL: *this = left * right; break;
+ case IMM_DIV: *this = left / right; break;
+ case IMM_AND: *this = left & right; break;
+ case IMM_OR: *this = left | right; break;
+ case IMM_XOR: *this = left ^ right; break;
+ case IMM_REM:
+ {
+ if (left.getType() > TYPE_BOOL && left.getType() <= TYPE_U64)
+ *this = left % right;
+ else if (left.getType() == TYPE_FLOAT && right.getType() == TYPE_FLOAT) {
+ *this = Immediate(left);
+ *this->data.f32 = fmodf(left.getFloatValue(), right.getFloatValue());
+ }
+ else if (left.getType() == TYPE_DOUBLE && right.getType() == TYPE_DOUBLE) {
+ *this = Immediate(left);
+ *this->data.f64 += fmod(left.getDoubleValue(), right.getDoubleValue());
+ }
+ else
+ GBE_ASSERT(0);
+ break;
+ }
+ case IMM_LSHR:
+ {
+ if (left.getElemNum() == 1)
+ lshr(left, right);
+ else {
+ GBE_ASSERT(right.getIntegerValue() <= (left.getElemNum() * left.getTypeSize() * 8));
+ GBE_ASSERT(right.getIntegerValue() % (left.getTypeSize() * 8) == 0);
+ copy(left, right.getIntegerValue() / (left.getTypeSize() * 8), left.getElemNum());
+ }
+ break;
+ }
+ case IMM_ASHR:
+ {
+ if (left.getElemNum() == 1)
+ *this = left >> right;
+ else {
+ GBE_ASSERT(0 && "Doesn't support ashr on array constant.");
+ copy(left, right.getIntegerValue() / (left.getTypeSize() * 8), left.getElemNum());
+ }
+ break;
+ }
+ case IMM_SHL:
+ {
+ if (left.getElemNum() == 1)
+ *this = left << right;
+ else {
+ GBE_ASSERT(right.getIntegerValue() <= (left.getElemNum() * left.getTypeSize() * 8));
+ GBE_ASSERT(right.getIntegerValue() % (left.getTypeSize() * 8) == 0);
+ copy(left, -right.getIntegerValue() / (left.getTypeSize() * 8), left.getElemNum());
+ }
+ break;
+ }
+ }
+ // If the dst type is large int, we will not change the imm type to large int.
+ GBE_ASSERT(type == (ImmType)dstType || dstType == TYPE_LARGE_INT);
+ }
+
+ Immediate::Immediate(const vector<const Immediate*> immVec) {
+ if (immVec.size() == 1) {
+ *this = *immVec[0];
+ } else if (!(immVec[0]->isCompType()) && immVec[0]->elemNum == 1) {
+ this->type = immVec[0]->type;
+ this->elemNum = immVec.size();
+ if (immVec[0]->getTypeSize() * immVec.size() < 8)
+ this->data.p = &this->defaultData;
+ else
+ this->data.p = malloc(immVec[0]->getTypeSize() * immVec.size());
+ uint8_t *p = (uint8_t*)this->data.p;
+ for(uint32_t i = 0; i < immVec.size(); i++) {
+ GBE_ASSERT(immVec[i]->type == immVec[0]->type && immVec[i]->elemNum == 1);
+ memcpy(p, immVec[i]->data.p, immVec[i]->getTypeSize());
+ p += immVec[i]->getTypeSize();
+ }
+ } else {
+ this->type = IMM_TYPE_COMP;
+ if (immVec.size() * sizeof(Immediate*) < 8)
+ this->data.p = &this->defaultData;
+ else
+ this->data.p = malloc(immVec.size() * sizeof(Immediate*));
+ this->elemNum = immVec.size();
+ for(uint32_t i = 0; i < immVec.size(); i++)
+ this->data.immVec[i] = immVec[i];
+ }
+ }
+
+
+ // operator = and copy() are only called from constructor functions
+ // which this never hold a memory pointer, we don't need to bother
+ // to check the data.p before assignment.
+ Immediate & Immediate::operator= (const Immediate & other) {
+ if (this != &other) {
+ type = other.type;
+ elemNum = other.elemNum;
+ if (other.data.p != &other.defaultData) {
+ data.p = malloc(other.elemNum * other.getTypeSize());
+ memcpy(data.p, other.data.p, other.elemNum * other.getTypeSize());
+ }
+ else {
+ defaultData = other.defaultData;
+ data.p = &defaultData;
+ }
+ }
+ return *this;
+ }
+
+ void Immediate::copy(const Immediate &other, int32_t offset, uint32_t num) {
+ if (this != &other) {
+ if (other.type == IMM_TYPE_COMP && num == 1) {
+ GBE_ASSERT(offset >= 0 && offset <= (int32_t)other.elemNum);
+ *this = *other.data.immVec[offset];
+ return;
+ }
+ type = other.type;
+ elemNum = num;
+ if (num * other.getTypeSize() < 8)
+ data.p = &defaultData;
+ else
+ data.p = malloc(num * other.getTypeSize());
+ uint8_t* datap = (uint8_t*)data.p;
+ memset(datap, 0, num * other.getTypeSize());
+ if (offset < 0) {
+ datap += (-offset) * other.getTypeSize();
+ num -= num < (uint32_t)(-offset) ? num : (-offset);
+ offset = 0;
+ } else if (offset > 0 && num > 1) {
+ GBE_ASSERT((int32_t)num > offset);
+ num -= offset;
+ }
+ memcpy(datap, (uint8_t*)other.data.p + offset * other.getTypeSize(),
+ num * other.getTypeSize());
+ }
+ }
diff --git a/backend/src/ir/immediate.hpp b/backend/src/ir/immediate.hpp
new file mode 100644
index 0000000..6a5c819
--- /dev/null
+++ b/backend/src/ir/immediate.hpp
@@ -0,0 +1,264 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file Immediate.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_IMMEDIATE_HPP__
+#define __GBE_IR_IMMEDIATE_HPP__
+
+#include <string.h>
+#include "ir/type.hpp"
+#include "sys/platform.hpp"
+
+namespace gbe {
+namespace ir {
+
+ typedef enum {
+ IMM_TRUNC = 0,
+ IMM_BITCAST,
+ IMM_ADD,
+ IMM_SUB,
+ IMM_MUL,
+ IMM_DIV,
+ IMM_REM,
+ IMM_SHL,
+ IMM_ASHR,
+ IMM_LSHR,
+ IMM_AND,
+ IMM_OR,
+ IMM_XOR
+ } ImmOpCode;
+
+ typedef enum {
+ IMM_TYPE_BOOL = TYPE_BOOL,
+ IMM_TYPE_S8 = TYPE_S8,
+ IMM_TYPE_U8 = TYPE_U8,
+ IMM_TYPE_S16 = TYPE_S16,
+ IMM_TYPE_U16 = TYPE_U16,
+ IMM_TYPE_S32 = TYPE_S32,
+ IMM_TYPE_U32 = TYPE_U32,
+ IMM_TYPE_S64 = TYPE_S64,
+ IMM_TYPE_U64 = TYPE_U64,
+ IMM_TYPE_FLOAT = TYPE_FLOAT,
+ IMM_TYPE_DOUBLE = TYPE_DOUBLE,
+ IMM_TYPE_COMP // compond immediate which consist many immediates.
+ } ImmType;
+
+ /*! The value as stored in the instruction */
+ class Immediate
+ {
+ public:
+ INLINE Immediate(void) { }
+
+ INLINE Type getType(void) const {
+ return (Type)type;
+ }
+
+ INLINE bool isCompType(void) const {
+ return type == IMM_TYPE_COMP;
+ }
+
+ INLINE uint32_t getElemNum(void) const {
+ return elemNum;
+ }
+
+ uint32_t getTypeSize(void) const {
+ switch(type) {
+ default:
+ GBE_ASSERT(0 && "Invalid immeidate type.\n");
+ case TYPE_BOOL:
+ case TYPE_S8:
+ case TYPE_U8: return 1;
+ case TYPE_S16:
+ case TYPE_U16: return 2;
+ case TYPE_FLOAT:
+ case TYPE_S32:
+ case TYPE_U32: return 4;
+ case TYPE_DOUBLE:
+ case TYPE_S64:
+ case TYPE_U64: return 8;
+ case IMM_TYPE_COMP: return sizeof(Immediate*);
+ }
+ }
+
+#define DECL_CONSTRUCTOR(TYPE, FIELD, IR_TYPE) \
+ Immediate(TYPE FIELD) { \
+ this->type = (ImmType)IR_TYPE; \
+ this->elemNum = 1; \
+ this->data.p = &defaultData; \
+ defaultData = 0ull; \
+ *this->data.FIELD = FIELD; \
+ }
+
+ DECL_CONSTRUCTOR(bool, b, TYPE_BOOL)
+ DECL_CONSTRUCTOR(int8_t, s8, TYPE_S8)
+ DECL_CONSTRUCTOR(uint8_t, u8, TYPE_U8)
+ DECL_CONSTRUCTOR(int16_t, s16, TYPE_S16)
+ DECL_CONSTRUCTOR(uint16_t, u16, TYPE_S16)
+ DECL_CONSTRUCTOR(int32_t, s32, TYPE_S32)
+ DECL_CONSTRUCTOR(uint32_t, u32, TYPE_S32)
+ DECL_CONSTRUCTOR(int64_t, s64, TYPE_S64)
+ DECL_CONSTRUCTOR(uint64_t, u64, TYPE_S64)
+ DECL_CONSTRUCTOR(float, f32, TYPE_FLOAT)
+ DECL_CONSTRUCTOR(double, f64, TYPE_DOUBLE)
+#undef DECL_CONSTRUCTOR
+
+#define DECL_CONSTRUCTOR(TYPE, FIELD, IR_TYPE, ELEMNUM) \
+ Immediate(TYPE *FIELD, uint32_t ELEMNUM) { \
+ this->type = (ImmType)IR_TYPE; \
+ this->elemNum = ELEMNUM; \
+ if (elemNum * ELEMNUM > 8) \
+ this->data.p = malloc(ELEMNUM * getTypeSize()); \
+ else \
+ this->data.p = &defaultData; \
+ defaultData = 0ull; \
+ memcpy(this->data.FIELD, FIELD, ELEMNUM * getTypeSize()); \
+ }
+
+ DECL_CONSTRUCTOR(bool, b, TYPE_BOOL, elemNum)
+ DECL_CONSTRUCTOR(int8_t, s8, TYPE_S8, elemNum)
+ DECL_CONSTRUCTOR(uint8_t, u8, TYPE_U8, elemNum)
+ DECL_CONSTRUCTOR(int16_t, s16, TYPE_S16, elemNum)
+ DECL_CONSTRUCTOR(uint16_t, u16, TYPE_S16, elemNum)
+ DECL_CONSTRUCTOR(int32_t, s32, TYPE_S32, elemNum)
+ DECL_CONSTRUCTOR(uint32_t, u32, TYPE_S32, elemNum)
+ DECL_CONSTRUCTOR(int64_t, s64, TYPE_S64, elemNum)
+ DECL_CONSTRUCTOR(uint64_t, u64, TYPE_S64, elemNum)
+ DECL_CONSTRUCTOR(float, f32, TYPE_FLOAT, elemNum)
+ DECL_CONSTRUCTOR(double, f64, TYPE_DOUBLE, elemNum)
+#undef DECL_CONSTRUCTOR
+
+ Immediate(const vector<const Immediate*> immVec);
+
+ INLINE int64_t getIntegerValue(void) const {
+ switch (type) {
+ default:
+ GBE_ASSERT(0 && "Invalid immediate type.\n");
+ case TYPE_BOOL: return *data.b;
+ case TYPE_S8: return *data.s8;
+ case TYPE_U8: return *data.u8;
+ case TYPE_S16: return *data.s16;
+ case TYPE_U16: return *data.u16;
+ case TYPE_S32: return *data.s32;
+ case TYPE_U32: return *data.u32;
+ case TYPE_S64: return *data.s64;
+ case TYPE_U64: return *data.u64;
+ }
+ }
+
+ INLINE float getFloatValue(void) const {
+ GBE_ASSERT(type == IMM_TYPE_FLOAT);
+ return *data.f32;
+ }
+
+ INLINE float asFloatValue(void) const {
+ GBE_ASSERT(type == IMM_TYPE_FLOAT || type == IMM_TYPE_U32 || type == IMM_TYPE_S32);
+ return *data.f32;
+ }
+
+ INLINE int64_t asIntegerValue(void) const {
+ GBE_ASSERT(elemNum == 1);
+ return *data.s64;
+ }
+
+ INLINE double getDoubleValue(void) const {
+ GBE_ASSERT(type == IMM_TYPE_DOUBLE);
+ return *data.f64;
+ }
+
+ INLINE Immediate(const Immediate & other) {
+ *this = other;
+ }
+
+ Immediate(ImmOpCode op, const Immediate &other, Type dstType) {
+ if (op == IMM_TRUNC) {
+ copy(other, 0, 1);
+ } else if (op == IMM_BITCAST) {
+ *this = other;
+ type = (ImmType)dstType;
+ }
+ }
+
+ Immediate(ImmOpCode op, const Immediate &left, const Immediate &right, Type dstType);
+
+ ~Immediate() {
+ if (data.p != &defaultData) {
+ free(data.p);
+ data.p = NULL;
+ }
+ }
+
+ private:
+ union {
+ bool *b;
+ int8_t *s8;
+ uint8_t *u8;
+ int16_t *s16;
+ uint16_t *u16;
+ int32_t *s32;
+ uint32_t *u32;
+ int64_t *s64;
+ uint64_t *u64;
+ float *f32;
+ double *f64;
+ const Immediate *immVec[];
+ void *p;
+ } data; //!< Value to store
+ ImmType type; //!< Type of the value
+ uint32_t elemNum; //!< vector imm data type
+ uint64_t defaultData;
+ Immediate & operator= (const Immediate &);
+ Immediate operator+ (const Immediate &) const;
+ Immediate operator- (const Immediate &) const;
+ Immediate operator* (const Immediate &) const;
+ Immediate operator/ (const Immediate &) const;
+ Immediate operator% (const Immediate &) const;
+ Immediate operator& (const Immediate &) const;
+ Immediate operator| (const Immediate &) const;
+ Immediate operator^ (const Immediate &) const;
+ Immediate operator<< (const Immediate &) const;
+ Immediate operator>> (const Immediate &) const;
+ static Immediate lshr (const Immediate &left, const Immediate &right);
+
+
+ void copy(const Immediate &other, int32_t offset, uint32_t num);
+ GBE_CLASS(Immediate);
+ };
+
+ /*! Compare two immediates */
+ INLINE bool operator< (const Immediate &imm0, const Immediate &imm1) {
+ if (imm0.getType() != imm1.getType())
+ return uint32_t(imm0.getType()) < uint32_t(imm1.getType());
+ else if (imm0.getType() == TYPE_FLOAT || imm0.getType() == TYPE_DOUBLE)
+ return imm0.asIntegerValue() < imm1.asIntegerValue();
+ else
+ return imm0.getIntegerValue() < imm1.getIntegerValue();
+ }
+
+ /*! A value is stored in a per-function vector. This is the index to it */
+ TYPE_SAFE(ImmediateIndex, uint16_t)
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_IMMEDIATE_HPP__ */
+
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
new file mode 100644
index 0000000..5fc1535
--- /dev/null
+++ b/backend/src/ir/instruction.cpp
@@ -0,0 +1,1684 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file instruction.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/instruction.hpp"
+#include "ir/function.hpp"
+
+namespace gbe {
+namespace ir {
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Implements the concrete implementations of the instruction classes. We
+ // cast an instruction to an internal class to run the given member function
+ ///////////////////////////////////////////////////////////////////////////
+ namespace internal
+ {
+#define ALIGNED_INSTRUCTION ALIGNED(ALIGNOF(Instruction))
+
+ /*! Policy shared by all the internal instructions */
+ struct BasePolicy {
+ /*! Create an instruction from its internal representation */
+ Instruction convert(void) const {
+ return Instruction(reinterpret_cast<const char *>(&this->opcode));
+ }
+ /*! Output the opcode in the given stream */
+ INLINE void outOpcode(std::ostream &out) const {
+ switch (opcode) {
+#define DECL_INSN(OPCODE, CLASS) case OP_##OPCODE: out << #OPCODE; break;
+#include "instruction.hxx"
+#undef DECL_INSN
+ case OP_INVALID: NOT_SUPPORTED; break;
+ };
+ }
+
+ /*! Instruction opcode */
+ Opcode opcode;
+ };
+
+ /*! For regular n source instructions */
+ template <typename T, uint32_t srcNum>
+ struct NSrcPolicy {
+ INLINE uint32_t getSrcNum(void) const { return srcNum; }
+ INLINE Register getSrc(const Function &fn, uint32_t ID) const {
+ GBE_ASSERTM((int) ID < (int) srcNum, "Out-of-bound source");
+ return static_cast<const T*>(this)->src[ID];
+ }
+ INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
+ GBE_ASSERTM((int) ID < (int) srcNum, "Out-of-bound source");
+ static_cast<T*>(this)->src[ID] = reg;
+ }
+ };
+
+ /*! For regular n destinations instructions */
+ template <typename T, uint32_t dstNum>
+ struct NDstPolicy {
+ INLINE uint32_t getDstNum(void) const { return dstNum; }
+ INLINE Register getDst(const Function &fn, uint32_t ID) const {
+ GBE_ASSERTM((int) ID < (int) dstNum, "Out-of-bound destination");
+ return static_cast<const T*>(this)->dst[ID];
+ }
+ INLINE void setDst(Function &fn, uint32_t ID, Register reg) {
+ GBE_ASSERTM((int) ID < (int) dstNum, "Out-of-bound destination");
+ static_cast<T*>(this)->dst[ID] = reg;
+ }
+ };
+
+ /*! For instructions that use a tuple for source */
+ template <typename T>
+ struct TupleSrcPolicy {
+ INLINE uint32_t getSrcNum(void) const {
+ return static_cast<const T*>(this)->srcNum;
+ }
+ INLINE Register getSrc(const Function &fn, uint32_t ID) const {
+ GBE_ASSERTM(ID < static_cast<const T*>(this)->srcNum, "Out-of-bound source register");
+ return fn.getRegister(static_cast<const T*>(this)->src, ID);
+ }
+ INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
+ GBE_ASSERTM(ID < static_cast<const T*>(this)->srcNum, "Out-of-bound source register");
+ return fn.setRegister(static_cast<T*>(this)->src, ID, reg);
+ }
+ };
+
+ /*! For instructions that use a tuple for destination */
+ template <typename T>
+ struct TupleDstPolicy {
+ INLINE uint32_t getDstNum(void) const {
+ return static_cast<const T*>(this)->dstNum;
+ }
+ INLINE Register getDst(const Function &fn, uint32_t ID) const {
+ GBE_ASSERTM(ID < static_cast<const T*>(this)->dstNum, "Out-of-bound source register");
+ return fn.getRegister(static_cast<const T*>(this)->dst, ID);
+ }
+ INLINE void setDst(Function &fn, uint32_t ID, Register reg) {
+ GBE_ASSERTM(ID < static_cast<const T*>(this)->dstNum, "Out-of-bound source register");
+ return fn.setRegister(static_cast<T*>(this)->dst, ID, reg);
+ }
+ };
+
+ /*! All unary and binary arithmetic instructions */
+ template <uint32_t srcNum> // 1 or 2
+ class ALIGNED_INSTRUCTION NaryInstruction :
+ public BasePolicy,
+ public NSrcPolicy<NaryInstruction<srcNum>, srcNum>,
+ public NDstPolicy<NaryInstruction<1>, 1>
+ {
+ public:
+ INLINE Type getType(void) const { return this->type; }
+ INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ Type type; //!< Type of the instruction
+ Register dst[1]; //!< Index of the register in the register file
+ Register src[srcNum]; //!< Indices of the sources
+ };
+
+ /*! All 1-source arithmetic instructions */
+ class ALIGNED_INSTRUCTION UnaryInstruction : public NaryInstruction<1>
+ {
+ public:
+ UnaryInstruction(Opcode opcode, Type type, Register dst, Register src) {
+ this->opcode = opcode;
+ this->type = type;
+ this->dst[0] = dst;
+ this->src[0] = src;
+ }
+ };
+
+ /*! All 2-source arithmetic instructions */
+ class ALIGNED_INSTRUCTION BinaryInstruction : public NaryInstruction<2>
+ {
+ public:
+ BinaryInstruction(Opcode opcode,
+ Type type,
+ Register dst,
+ Register src0,
+ Register src1) {
+ this->opcode = opcode;
+ this->type = type;
+ this->dst[0] = dst;
+ this->src[0] = src0;
+ this->src[1] = src1;
+ }
+ INLINE bool commutes(void) const {
+ switch (opcode) {
+ case OP_ADD:
+ case OP_ADDSAT:
+ case OP_XOR:
+ case OP_OR:
+ case OP_AND:
+ case OP_MUL:
+ return true;
+ default:
+ return false;
+ }
+ }
+ };
+
+ class ALIGNED_INSTRUCTION TernaryInstruction :
+ public BasePolicy,
+ public NDstPolicy<TernaryInstruction, 1>,
+ public TupleSrcPolicy<TernaryInstruction>
+ {
+ public:
+ TernaryInstruction(Opcode opcode,
+ Type type,
+ Register dst,
+ Tuple src) {
+ this->opcode = opcode;
+ this->type = type;
+ this->dst[0] = dst;
+ this->src = src;
+ }
+ Type getType(void) const { return type; }
+ bool wellFormed(const Function &fn, std::string &whyNot) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ Type type;
+ Register dst[1];
+ Tuple src;
+ static const uint32_t srcNum = 3;
+ };
+
+ /*! Three sources mean we need a tuple to encode it */
+ class ALIGNED_INSTRUCTION SelectInstruction :
+ public BasePolicy,
+ public NDstPolicy<SelectInstruction, 1>,
+ public TupleSrcPolicy<SelectInstruction>
+ {
+ public:
+ SelectInstruction(Type type, Register dst, Tuple src) {
+ this->opcode = OP_SEL;
+ this->type = type;
+ this->dst[0] = dst;
+ this->src = src;
+ }
+ INLINE Type getType(void) const { return this->type; }
+ INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ Type type; //!< Type of the instruction
+ Register dst[1]; //!< Dst is the register index
+ Tuple src; //!< 3 sources do not fit in 8 bytes -> use a tuple
+ static const uint32_t srcNum = 3;
+ };
+
+ /*! Comparison instructions take two sources of the same type and return a
+ * boolean value. Since it is pretty similar to binary instruction, we
+ * steal all the methods from it, except wellFormed (dst register is always
+ * a boolean value)
+ */
+ class ALIGNED_INSTRUCTION CompareInstruction :
+ public NaryInstruction<2>
+ {
+ public:
+ CompareInstruction(Opcode opcode,
+ Type type,
+ Register dst,
+ Register src0,
+ Register src1)
+ {
+ this->opcode = opcode;
+ this->type = type;
+ this->dst[0] = dst;
+ this->src[0] = src0;
+ this->src[1] = src1;
+ }
+ INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+ };
+
+ class ALIGNED_INSTRUCTION BitCastInstruction :
+ public BasePolicy,
+ public TupleSrcPolicy<BitCastInstruction>,
+ public TupleDstPolicy<BitCastInstruction>
+ {
+ public:
+ BitCastInstruction(Type dstType,
+ Type srcType,
+ Tuple dst,
+ Tuple src,
+ uint8_t dstNum,
+ uint8_t srcNum)
+ {
+ this->opcode = OP_BITCAST;
+ this->dst = dst;
+ this->src = src;
+ this->dstFamily = getFamily(dstType);
+ this->srcFamily = getFamily(srcType);
+ GBE_ASSERT(srcNum <= 16 && dstNum <= 16);
+ this->dstNum = dstNum;
+ this->srcNum = srcNum;
+ }
+ INLINE Type getSrcType(void) const { return getType((RegisterFamily)srcFamily); }
+ INLINE Type getDstType(void) const { return getType((RegisterFamily)dstFamily); }
+ INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ uint8_t dstFamily:4; //!< family to cast to
+ uint8_t srcFamily:4; //!< family to cast from
+ Tuple dst;
+ Tuple src;
+ uint8_t dstNum; //!<Dst Number
+ uint8_t srcNum; //!<Src Number
+ };
+
+ class ALIGNED_INSTRUCTION ConvertInstruction :
+ public BasePolicy,
+ public NDstPolicy<ConvertInstruction, 1>,
+ public NSrcPolicy<ConvertInstruction, 1>
+ {
+ public:
+ ConvertInstruction(Opcode opcode,
+ Type dstType,
+ Type srcType,
+ Register dst,
+ Register src)
+ {
+ this->opcode = opcode;
+ this->dst[0] = dst;
+ this->src[0] = src;
+ this->dstType = dstType;
+ this->srcType = srcType;
+ }
+ INLINE Type getSrcType(void) const { return this->srcType; }
+ INLINE Type getDstType(void) const { return this->dstType; }
+ INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ Register dst[1];
+ Register src[1];
+ Type dstType; //!< Type to convert to
+ Type srcType; //!< Type to convert from
+ };
+
+ class ALIGNED_INSTRUCTION AtomicInstruction :
+ public BasePolicy,
+ public TupleSrcPolicy<AtomicInstruction>,
+ public NDstPolicy<AtomicInstruction, 1>
+ {
+ public:
+ AtomicInstruction(AtomicOps atomicOp,
+ Register dst,
+ AddressSpace addrSpace,
+ BTI bti,
+ Tuple src)
+ {
+ this->opcode = OP_ATOMIC;
+ this->atomicOp = atomicOp;
+ this->dst[0] = dst;
+ this->src = src;
+ this->addrSpace = addrSpace;
+ this->bti = bti;
+ srcNum = 2;
+ if((atomicOp == ATOMIC_OP_INC) ||
+ (atomicOp == ATOMIC_OP_DEC))
+ srcNum = 1;
+ if(atomicOp == ATOMIC_OP_CMPXCHG)
+ srcNum = 3;
+ }
+ INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
+ INLINE BTI getBTI(void) const { return bti; }
+ INLINE AtomicOps getAtomicOpcode(void) const { return this->atomicOp; }
+ INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ Register dst[1];
+ Tuple src;
+ AddressSpace addrSpace; //!< Address space
+ BTI bti; //!< bti
+ uint8_t srcNum:2; //!<Source Number
+ AtomicOps atomicOp:6; //!<Source Number
+ };
+
+ class ALIGNED_INSTRUCTION BranchInstruction :
+ public BasePolicy,
+ public NDstPolicy<BranchInstruction, 0>
+ {
+ public:
+ INLINE BranchInstruction(Opcode op, LabelIndex labelIndex, Register predicate) {
+ GBE_ASSERT(op == OP_BRA);
+ this->opcode = op;
+ this->predicate = predicate;
+ this->labelIndex = labelIndex;
+ this->hasPredicate = true;
+ this->hasLabel = true;
+ }
+ INLINE BranchInstruction(Opcode op, LabelIndex labelIndex) {
+ GBE_ASSERT(op == OP_BRA);
+ this->opcode = OP_BRA;
+ this->labelIndex = labelIndex;
+ this->hasPredicate = false;
+ this->hasLabel = true;
+ }
+ INLINE BranchInstruction(Opcode op) {
+ GBE_ASSERT(op == OP_RET);
+ this->opcode = OP_RET;
+ this->hasPredicate = false;
+ this->hasLabel = false;
+ }
+ INLINE LabelIndex getLabelIndex(void) const {
+ GBE_ASSERTM(hasLabel, "No target label for this branch instruction");
+ return labelIndex;
+ }
+ INLINE uint32_t getSrcNum(void) const { return hasPredicate ? 1 : 0; }
+ INLINE Register getSrc(const Function &fn, uint32_t ID) const {
+ GBE_ASSERTM(hasPredicate, "No source for unpredicated branches");
+ GBE_ASSERTM(ID == 0, "Only one source for the branch instruction");
+ return predicate;
+ }
+ INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
+ GBE_ASSERTM(hasPredicate, "No source for unpredicated branches");
+ GBE_ASSERTM(ID == 0, "Only one source for the branch instruction");
+ predicate = reg;
+ }
+ INLINE bool isPredicated(void) const { return hasPredicate; }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ Register predicate; //!< Predication means conditional branch
+ LabelIndex labelIndex; //!< Index of the label the branch targets
+ bool hasPredicate:1; //!< Is it predicated?
+ bool hasLabel:1; //!< Is there any target label?
+ Register dst[0]; //!< No destination
+ };
+
+ class ALIGNED_INSTRUCTION LoadInstruction :
+ public BasePolicy,
+ public NSrcPolicy<LoadInstruction, 1>
+ {
+ public:
+ LoadInstruction(Type type,
+ Tuple dstValues,
+ Register offset,
+ AddressSpace addrSpace,
+ uint32_t valueNum,
+ bool dwAligned,
+ BTI bti)
+ {
+ GBE_ASSERT(valueNum < 128);
+ this->opcode = OP_LOAD;
+ this->type = type;
+ this->offset = offset;
+ this->values = dstValues;
+ this->addrSpace = addrSpace;
+ this->valueNum = valueNum;
+ this->dwAligned = dwAligned ? 1 : 0;
+ this->bti = bti;
+ }
+ INLINE Register getDst(const Function &fn, uint32_t ID) const {
+ GBE_ASSERTM(ID < valueNum, "Out-of-bound source register");
+ return fn.getRegister(values, ID);
+ }
+ INLINE void setDst(Function &fn, uint32_t ID, Register reg) {
+ GBE_ASSERTM(ID < valueNum, "Out-of-bound source register");
+ fn.setRegister(values, ID, reg);
+ }
+ INLINE uint32_t getDstNum(void) const { return valueNum; }
+ INLINE Type getValueType(void) const { return type; }
+ INLINE uint32_t getValueNum(void) const { return valueNum; }
+ INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
+ INLINE BTI getBTI(void) const { return bti; }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ INLINE bool isAligned(void) const { return !!dwAligned; }
+ Type type; //!< Type to store
+ Register src[0]; //!< Address where to load from
+ Register offset; //!< Alias to make it similar to store
+ Tuple values; //!< Values to load
+ AddressSpace addrSpace; //!< Where to load
+ BTI bti;
+ uint8_t valueNum:7; //!< Number of values to load
+ uint8_t dwAligned:1; //!< DWORD aligned is what matters with GEN
+ };
+
+ class ALIGNED_INSTRUCTION StoreInstruction :
+ public BasePolicy, public NDstPolicy<StoreInstruction, 0>
+ {
+ public:
+ StoreInstruction(Type type,
+ Tuple values,
+ Register offset,
+ AddressSpace addrSpace,
+ uint32_t valueNum,
+ bool dwAligned,
+ BTI bti)
+ {
+ GBE_ASSERT(valueNum < 255);
+ this->opcode = OP_STORE;
+ this->type = type;
+ this->offset = offset;
+ this->values = values;
+ this->addrSpace = addrSpace;
+ this->valueNum = valueNum;
+ this->dwAligned = dwAligned ? 1 : 0;
+ this->bti = bti;
+ }
+ INLINE Register getSrc(const Function &fn, uint32_t ID) const {
+ GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
+ if (ID == 0u)
+ return offset;
+ else
+ return fn.getRegister(values, ID - 1);
+ }
+ INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
+ GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
+ if (ID == 0u)
+ offset = reg;
+ else
+ fn.setRegister(values, ID - 1, reg);
+ }
+ INLINE uint32_t getSrcNum(void) const { return valueNum + 1u; }
+ INLINE uint32_t getValueNum(void) const { return valueNum; }
+ INLINE Type getValueType(void) const { return type; }
+ INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
+ INLINE BTI getBTI(void) const { return bti; }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ INLINE bool isAligned(void) const { return !!dwAligned; }
+ Type type; //!< Type to store
+ Register offset; //!< First source is the offset where to store
+ Tuple values; //!< Values to store
+ AddressSpace addrSpace; //!< Where to store
+ BTI bti; //!< Which btis need access
+ uint8_t valueNum:7; //!< Number of values to store
+ uint8_t dwAligned:1; //!< DWORD aligned is what matters with GEN
+ Register dst[0]; //!< No destination
+ };
+
+ class ALIGNED_INSTRUCTION SampleInstruction : // TODO
+ public BasePolicy,
+ public TupleSrcPolicy<SampleInstruction>,
+ public TupleDstPolicy<SampleInstruction>
+ {
+ public:
+ SampleInstruction(uint8_t imageIdx, Tuple dstTuple, Tuple srcTuple, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset) {
+ this->opcode = OP_SAMPLE;
+ this->dst = dstTuple;
+ this->src = srcTuple;
+ this->dstIsFloat = dstIsFloat;
+ this->srcIsFloat = srcIsFloat;
+ this->samplerIdx = sampler;
+ this->imageIdx = imageIdx;
+ this->samplerOffset = samplerOffset;
+ }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << "." << this->getDstType()
+ << "." << this->getSrcType()
+ << " surface id " << (int)this->getImageIndex()
+ << " coord u %" << this->getSrc(fn, 0)
+ << " coord v %" << this->getSrc(fn, 1)
+ << " coord w %" << this->getSrc(fn, 2)
+ << " %" << this->getDst(fn, 0)
+ << " %" << this->getDst(fn, 1)
+ << " %" << this->getDst(fn, 2)
+ << " %" << this->getDst(fn, 3)
+ << " sampler idx " << (int)this->getSamplerIndex();
+ }
+ Tuple src;
+ Tuple dst;
+
+ INLINE const uint8_t getImageIndex(void) const { return this->imageIdx; }
+ INLINE Type getSrcType(void) const { return this->srcIsFloat ? TYPE_FLOAT : TYPE_S32; }
+ INLINE Type getDstType(void) const { return this->dstIsFloat ? TYPE_FLOAT : TYPE_U32; }
+ INLINE const uint8_t getSamplerIndex(void) const { return this->samplerIdx; }
+ INLINE const uint8_t getSamplerOffset(void) const { return this->samplerOffset; }
+ uint8_t srcIsFloat:1;
+ uint8_t dstIsFloat:1;
+ uint8_t samplerIdx:4;
+ uint8_t samplerOffset:2;
+ uint8_t imageIdx;
+ static const uint32_t srcNum = 3;
+ static const uint32_t dstNum = 4;
+ };
+
+ class ALIGNED_INSTRUCTION TypedWriteInstruction : // TODO
+ public BasePolicy,
+ public TupleSrcPolicy<TypedWriteInstruction>,
+ public NDstPolicy<TypedWriteInstruction, 0>
+ {
+ public:
+
+ INLINE TypedWriteInstruction(uint8_t imageIdx, Tuple srcTuple, Type srcType, Type coordType) {
+ this->opcode = OP_TYPED_WRITE;
+ this->src = srcTuple;
+ this->coordType = coordType;
+ this->srcType = srcType;
+ this->imageIdx = imageIdx;
+ }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << "." << this->getSrcType()
+ << " surface id " << (int)this->getImageIndex()
+ << " coord u %" << this->getSrc(fn, 0)
+ << " coord v %" << this->getSrc(fn, 1)
+ << " coord w %" << this->getSrc(fn, 2)
+ << " %" << this->getSrc(fn, 3)
+ << " %" << this->getSrc(fn, 4)
+ << " %" << this->getSrc(fn, 5)
+ << " %" << this->getSrc(fn, 6);
+ }
+
+ Tuple src;
+ uint8_t srcType;
+ uint8_t coordType;
+ uint8_t imageIdx;
+
+ INLINE const uint8_t getImageIndex(void) const { return this->imageIdx; }
+ INLINE Type getSrcType(void) const { return (Type)this->srcType; }
+ INLINE Type getCoordType(void) const { return (Type)this->coordType; }
+ // bti, u, v, w, 4 data elements
+ static const uint32_t srcNum = 7;
+ Register dst[0]; //!< No dest register
+ };
+
+ class ALIGNED_INSTRUCTION GetImageInfoInstruction :
+ public BasePolicy,
+ public NSrcPolicy<GetImageInfoInstruction, 1>,
+ public NDstPolicy<GetImageInfoInstruction, 1>
+ {
+ public:
+ GetImageInfoInstruction( int type,
+ Register dst,
+ uint8_t imageIdx,
+ Register infoReg)
+ {
+ this->opcode = OP_GET_IMAGE_INFO;
+ this->infoType = type;
+ this->dst[0] = dst;
+ this->src[0] = infoReg;
+ this->imageIdx = imageIdx;
+ }
+
+ INLINE uint32_t getInfoType(void) const { return infoType; }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << "." << this->getInfoType()
+ << " %" << this->getDst(fn, 0)
+ << " surface id " << (int)this->getImageIndex()
+ << " info reg %" << this->getSrc(fn, 0);
+ }
+
+ INLINE const uint8_t getImageIndex(void) const { return imageIdx; }
+
+ uint8_t infoType; //!< Type of the requested information.
+ uint8_t imageIdx; //!< surface index.
+ Register src[1]; //!< surface info register.
+ Register dst[1]; //!< dest register to put the information.
+ static const uint32_t dstNum = 1;
+ };
+
+ class ALIGNED_INSTRUCTION LoadImmInstruction :
+ public BasePolicy,
+ public NSrcPolicy<LoadImmInstruction, 0>,
+ public NDstPolicy<LoadImmInstruction, 1>
+ {
+ public:
+ INLINE LoadImmInstruction(Type type, Register dst, ImmediateIndex index)
+ {
+ this->dst[0] = dst;
+ this->opcode = OP_LOADI;
+ this->immediateIndex = index;
+ this->type = type;
+ }
+ INLINE Immediate getImmediate(const Function &fn) const {
+ return fn.getImmediate(immediateIndex);
+ }
+ INLINE Type getType(void) const { return this->type; }
+ bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ Register dst[1]; //!< RegisterData to store into
+ Register src[0]; //!< No source register
+ ImmediateIndex immediateIndex; //!< Index in the vector of immediates
+ Type type; //!< Type of the immediate
+ };
+
+ class ALIGNED_INSTRUCTION SyncInstruction :
+ public BasePolicy,
+ public NSrcPolicy<SyncInstruction, 0>,
+ public NDstPolicy<SyncInstruction, 0>
+ {
+ public:
+ INLINE SyncInstruction(uint32_t parameters) {
+ this->opcode = OP_SYNC;
+ this->parameters = parameters;
+ }
+ INLINE uint32_t getParameters(void) const { return this->parameters; }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ uint32_t parameters;
+ Register dst[0], src[0];
+ };
+
+ class ALIGNED_INSTRUCTION LabelInstruction :
+ public BasePolicy,
+ public NSrcPolicy<LabelInstruction, 0>,
+ public NDstPolicy<LabelInstruction, 0>
+ {
+ public:
+ INLINE LabelInstruction(LabelIndex labelIndex) {
+ this->opcode = OP_LABEL;
+ this->labelIndex = labelIndex;
+ }
+ INLINE LabelIndex getLabelIndex(void) const { return labelIndex; }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ LabelIndex labelIndex; //!< Index of the label
+ Register dst[0], src[0];
+ };
+
+#undef ALIGNED_INSTRUCTION
+
+ /////////////////////////////////////////////////////////////////////////
+ // Implements all the wellFormed methods
+ /////////////////////////////////////////////////////////////////////////
+
+ /*! All Nary instruction registers must be of the same family and properly
+ * defined (i.e. not out-of-bound)
+ */
+ static INLINE bool checkRegisterData(RegisterFamily family,
+ const Register &ID,
+ const Function &fn,
+ std::string &whyNot)
+ {
+ if (UNLIKELY(uint16_t(ID) >= fn.regNum())) {
+ whyNot = "Out-of-bound destination register index";
+ return false;
+ }
+ const RegisterData reg = fn.getRegisterData(ID);
+ if (UNLIKELY(reg.family != family)) {
+ whyNot = "Destination family does not match instruction type";
+ return false;
+ }
+ return true;
+ }
+
+ /*! Special registers are *not* writeable */
+ static INLINE bool checkSpecialRegForWrite(const Register ®,
+ const Function &fn,
+ std::string &whyNot)
+ {
+ if (fn.isSpecialReg(reg) == true && reg != ir::ocl::stackptr) {
+ whyNot = "Non stack pointer special registers are not writeable";
+ return false;
+ }
+ return true;
+ }
+
+ /*! We check that the given type belongs to the provided type family */
+ static INLINE bool checkTypeFamily(const Type &type,
+ const Type *family,
+ uint32_t typeNum,
+ std::string &whyNot)
+ {
+ uint32_t typeID = 0;
+ for (; typeID < typeNum; ++typeID)
+ if (family[typeID] == type)
+ break;
+ if (typeID == typeNum) {
+ whyNot = "Type is not supported by the instruction";
+ return false;
+ }
+ return true;
+ }
+
+#define CHECK_TYPE(TYPE, FAMILY) \
+ do { \
+ if (UNLIKELY(checkTypeFamily(TYPE, FAMILY, FAMILY##Num, whyNot)) == false) \
+ return false; \
+ } while (0)
+
+ static const Type madType[] = {TYPE_FLOAT};
+ static const uint32_t madTypeNum = ARRAY_ELEM_NUM(madType);
+
+ // TODO add support for 64 bits values
+ static const Type allButBool[] = {TYPE_S8, TYPE_U8,
+ TYPE_S16, TYPE_U16,
+ TYPE_S32, TYPE_U32,
+ TYPE_S64, TYPE_U64,
+ TYPE_FLOAT, TYPE_DOUBLE};
+ static const uint32_t allButBoolNum = ARRAY_ELEM_NUM(allButBool);
+
+ // TODO add support for 64 bits values
+ static const Type logicalType[] = {TYPE_S8, TYPE_U8,
+ TYPE_S16, TYPE_U16,
+ TYPE_S32, TYPE_U32,
+ TYPE_S64, TYPE_U64,
+ TYPE_BOOL};
+ static const uint32_t logicalTypeNum = ARRAY_ELEM_NUM(logicalType);
+
+ // Unary and binary instructions share the same rules
+ template <uint32_t srcNum>
+ INLINE bool NaryInstruction<srcNum>::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+ const RegisterFamily family = getFamily(this->type);
+ if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
+ return false;
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
+ if (UNLIKELY(checkRegisterData(family, src[srcID], fn, whyNot) == false))
+ return false;
+ // We actually support logical operations on boolean values for AND, OR,
+ // and XOR
+ switch (this->opcode) {
+ case OP_OR:
+ case OP_XOR:
+ case OP_AND:
+ CHECK_TYPE(this->type, logicalType);
+ break;
+ default:
+ CHECK_TYPE(this->type, allButBool);
+ break;
+ case OP_MOV:
+ break;
+ case OP_POW:
+ case OP_COS:
+ case OP_SIN:
+ case OP_RCP:
+ case OP_ABS:
+ case OP_RSQ:
+ case OP_SQR:
+ case OP_RNDD:
+ case OP_RNDE:
+ case OP_RNDU:
+ case OP_RNDZ:
+ const Type fp = TYPE_FLOAT;
+ if (UNLIKELY(checkTypeFamily(TYPE_FLOAT, &fp, 1, whyNot)) == false)
+ return false;
+ break;
+ }
+ return true;
+ }
+
+ // First source must a boolean. Other must match the destination type
+ INLINE bool SelectInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+ const RegisterFamily family = getFamily(this->type);
+ if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(src + 3u > fn.tupleNum())) {
+ whyNot = "Out-of-bound index for ternary instruction";
+ return false;
+ }
+ const Register regID = fn.getRegister(src, 0);
+ if (UNLIKELY(checkRegisterData(FAMILY_BOOL, regID, fn, whyNot) == false))
+ return false;
+ for (uint32_t srcID = 1; srcID < 3; ++srcID) {
+ const Register regID = fn.getRegister(src, srcID);
+ if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) == false))
+ return false;
+ }
+ CHECK_TYPE(this->type, allButBool);
+ return true;
+ }
+
+ // Pretty similar to binary instruction. Only the destination is of type
+ // boolean
+ INLINE bool CompareInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+ if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(checkRegisterData(FAMILY_BOOL, dst[0], fn, whyNot) == false))
+ return false;
+ const RegisterFamily family = getFamily(this->type);
+ for (uint32_t srcID = 0; srcID < 2; ++srcID)
+ if (UNLIKELY(checkRegisterData(family, src[srcID], fn, whyNot) == false))
+ return false;
+ CHECK_TYPE(this->type, allButBool);
+ return true;
+ }
+
+ // The bit sizes of src and the dst must be identical, and don't support bool now, bool need double check.
+ INLINE bool BitCastInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ if (UNLIKELY(checkSpecialRegForWrite(getDst(fn, dstID), fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(checkRegisterData((RegisterFamily)dstFamily, getDst(fn, dstID), fn, whyNot) == false))
+ return false;
+ }
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ if (UNLIKELY(checkRegisterData((RegisterFamily)srcFamily, getSrc(fn, srcID), fn, whyNot) == false))
+ return false;
+ }
+
+ CHECK_TYPE(getType((RegisterFamily)dstFamily), allButBool);
+ CHECK_TYPE(getType((RegisterFamily)srcFamily), allButBool);
+
+ uint32_t dstBytes = 0, srcBtyes = 0;
+ dstBytes = dstNum * getFamilySize((RegisterFamily)dstFamily);
+ srcBtyes = srcNum * getFamilySize((RegisterFamily)srcFamily);
+
+ if(dstBytes != srcBtyes){
+ whyNot = " The bit sizes of src and the dst is not identical.";
+ return false;
+ }
+
+ return true;
+ }
+
+ // We can convert anything to anything, but types and families must match
+ INLINE bool ConvertInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+ const RegisterFamily dstFamily = getFamily(dstType);
+ const RegisterFamily srcFamily = getFamily(srcType);
+ if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(checkRegisterData(dstFamily, dst[0], fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(checkRegisterData(srcFamily, src[0], fn, whyNot) == false))
+ return false;
+ CHECK_TYPE(this->dstType, allButBool);
+ CHECK_TYPE(this->srcType, allButBool);
+ return true;
+ }
+
+ // We can convert anything to anything, but types and families must match
+ INLINE bool AtomicInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+ if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
+ return false;
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
+ if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID), fn, whyNot) == false))
+ return false;
+
+ return true;
+ }
+
+ INLINE bool TernaryInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+ const RegisterFamily family = getFamily(this->type);
+ if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(src + 3u > fn.tupleNum())) {
+ whyNot = "Out-of-bound index for ternary instruction";
+ return false;
+ }
+ for (uint32_t srcID = 0; srcID < 3; ++srcID) {
+ const Register regID = fn.getRegister(src, srcID);
+ if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) == false))
+ return false;
+ }
+ return true;
+ }
+
+ /*! Loads and stores follow the same restrictions */
+ template <typename T>
+ INLINE bool wellFormedLoadStore(const T &insn, const Function &fn, std::string &whyNot)
+ {
+ if (UNLIKELY(insn.offset >= fn.regNum())) {
+ whyNot = "Out-of-bound offset register index";
+ return false;
+ }
+ if (UNLIKELY(insn.values + insn.valueNum > fn.tupleNum())) {
+ whyNot = "Out-of-bound tuple index";
+ return false;
+ }
+ // Check all registers
+ const RegisterFamily family = getFamily(insn.type);
+ for (uint32_t valueID = 0; valueID < insn.valueNum; ++valueID) {
+ const Register regID = fn.getRegister(insn.values, valueID);
+ if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) == false))
+ return false;
+ }
+ return true;
+ }
+
+ INLINE bool LoadInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+ const uint32_t dstNum = this->getDstNum();
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const Register reg = this->getDst(fn, dstID);
+ const bool isOK = checkSpecialRegForWrite(reg, fn, whyNot);
+ if (UNLIKELY(isOK == false)) return false;
+ }
+ if (UNLIKELY(dstNum > Instruction::MAX_DST_NUM)) {
+ whyNot = "Too many destinations for load instruction";
+ return false;
+ }
+ return wellFormedLoadStore(*this, fn, whyNot);
+ }
+
+ INLINE bool StoreInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+ const uint32_t srcNum = this->getSrcNum();
+ if (UNLIKELY(srcNum > Instruction::MAX_SRC_NUM)) {
+ whyNot = "Too many source for store instruction";
+ return false;
+ }
+ return wellFormedLoadStore(*this, fn, whyNot);
+ }
+
+ // TODO
+ INLINE bool SampleInstruction::wellFormed(const Function &fn, std::string &why) const
+ { return true; }
+ INLINE bool TypedWriteInstruction::wellFormed(const Function &fn, std::string &why) const
+ { return true; }
+ INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
+ { return true; }
+
+
+ // Ensure that types and register family match
+ INLINE bool LoadImmInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+ if (UNLIKELY(immediateIndex >= fn.immediateNum())) {
+ whyNot = "Out-of-bound immediate value index";
+ return false;
+ }
+ const ir::Type immType = fn.getImmediate(immediateIndex).getType();
+ if (UNLIKELY(type != immType)) {
+ whyNot = "Inconsistant type for the immediate value to load";
+ return false;
+ }
+ const RegisterFamily family = getFamily(type);
+ if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+ return false;
+ if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
+ return false;
+ //Support all type IMM, disable check
+ //CHECK_TYPE(this->type, allButBool);
+ return true;
+ }
+
+ INLINE bool SyncInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+ const uint32_t maxParams = SYNC_WORKGROUP_EXEC |
+ SYNC_LOCAL_READ_FENCE |
+ SYNC_LOCAL_WRITE_FENCE |
+ SYNC_GLOBAL_READ_FENCE |
+ SYNC_GLOBAL_WRITE_FENCE;
+ if (UNLIKELY(this->parameters > maxParams)) {
+ whyNot = "Invalid parameters for sync instruction";
+ return false;
+ } else if (UNLIKELY(this->parameters == 0)) {
+ whyNot = "Missing parameters for sync instruction";
+ return false;
+ }
+ return true;
+ }
+
+ // Only a label index is required
+ INLINE bool LabelInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+ if (UNLIKELY(labelIndex >= fn.labelNum())) {
+ whyNot = "Out-of-bound label index";
+ return false;
+ }
+ return true;
+ }
+
+ // The label must exist and the register must of boolean family
+ INLINE bool BranchInstruction::wellFormed(const Function &fn, std::string &whyNot) const {
+ if (hasLabel)
+ if (UNLIKELY(labelIndex >= fn.labelNum())) {
+ whyNot = "Out-of-bound label index";
+ return false;
+ }
+ if (hasPredicate)
+ if (UNLIKELY(checkRegisterData(FAMILY_BOOL, predicate, fn, whyNot) == false))
+ return false;
+ return true;
+ }
+
+#undef CHECK_TYPE
+
+ /////////////////////////////////////////////////////////////////////////
+ // Implements all the output stream methods
+ /////////////////////////////////////////////////////////////////////////
+ template <uint32_t srcNum>
+ INLINE void NaryInstruction<srcNum>::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << "." << this->getType()
+ << " %" << this->getDst(fn, 0);
+ for (uint32_t i = 0; i < srcNum; ++i)
+ out << " %" << this->getSrc(fn, i);
+ }
+
+ template <typename T>
+ static void ternaryOrSelectOut(const T &insn, std::ostream &out, const Function &fn) {
+ insn.outOpcode(out);
+ out << "." << insn.getType()
+ << " %" << insn.getDst(fn, 0)
+ << " %" << insn.getSrc(fn, 0)
+ << " %" << insn.getSrc(fn, 1)
+ << " %" << insn.getSrc(fn, 2);
+ }
+
+ INLINE void SelectInstruction::out(std::ostream &out, const Function &fn) const {
+ ternaryOrSelectOut(*this, out, fn);
+ }
+
+ INLINE void TernaryInstruction::out(std::ostream &out, const Function &fn) const {
+ ternaryOrSelectOut(*this, out, fn);
+ }
+
+ INLINE void AtomicInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << "." << addrSpace;
+ out << " %" << this->getDst(fn, 0);
+ out << " {" << "%" << this->getSrc(fn, 0) << "}";
+ for (uint32_t i = 1; i < srcNum; ++i)
+ out << " %" << this->getSrc(fn, i);
+ out << " bti";
+ for (uint32_t i = 0; i < bti.count; ++i)
+ out << ": " << (int)bti.bti[i];
+ }
+
+
+ INLINE void BitCastInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << "." << this->getDstType()
+ << "." << this->getSrcType();
+ out << " {";
+ for (uint32_t i = 0; i < dstNum; ++i)
+ out << "%" << this->getDst(fn, i) << (i != (dstNum-1u) ? " " : "");
+ out << "}";
+ out << " {";
+ for (uint32_t i = 0; i < srcNum; ++i)
+ out << "%" << this->getSrc(fn, i) << (i != (srcNum-1u) ? " " : "");
+ out << "}";
+ }
+
+
+ INLINE void ConvertInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << "." << this->getDstType()
+ << "." << this->getSrcType()
+ << " %" << this->getDst(fn, 0)
+ << " %" << this->getSrc(fn, 0);
+ }
+
+ INLINE void LoadInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") << "aligned";
+ out << " {";
+ for (uint32_t i = 0; i < valueNum; ++i)
+ out << "%" << this->getDst(fn, i) << (i != (valueNum-1u) ? " " : "");
+ out << "}";
+ out << " %" << this->getSrc(fn, 0);
+ out << " bti";
+ for (uint32_t i = 0; i < bti.count; ++i)
+ out << ": " << (int)bti.bti[i];
+ }
+
+ INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") << "aligned";
+ out << " %" << this->getSrc(fn, 0) << " {";
+ for (uint32_t i = 0; i < valueNum; ++i)
+ out << "%" << this->getSrc(fn, i+1) << (i != (valueNum-1u) ? " " : "");
+ out << "}";
+ out << " bti";
+ for (uint32_t i = 0; i < bti.count; ++i)
+ out << ": " << (int)bti.bti[i];
+ }
+
+ INLINE void LabelInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << " $" << labelIndex;
+ }
+
+ INLINE void BranchInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ if (hasPredicate)
+ out << "<%" << this->getSrc(fn, 0) << ">";
+ if (hasLabel) out << " -> label$" << labelIndex;
+ }
+
+ INLINE void LoadImmInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << "." << type;
+ out << " %" << this->getDst(fn,0) << " ";
+ fn.outImmediate(out, immediateIndex);
+ }
+
+ static const char *syncStr[syncFieldNum] = {
+ "workgroup", "local_read", "local_write", "global_read", "global_write"
+ };
+
+ INLINE void SyncInstruction::out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ for (uint32_t field = 0; field < syncFieldNum; ++field)
+ if (this->parameters & (1 << field))
+ out << "." << syncStr[field];
+ }
+
+
+ } /* namespace internal */
+
+ std::ostream &operator<< (std::ostream &out, AddressSpace addrSpace) {
+ switch (addrSpace) {
+ case MEM_GLOBAL: return out << "global";
+ case MEM_LOCAL: return out << "local";
+ case MEM_CONSTANT: return out << "constant";
+ case MEM_PRIVATE: return out << "private";
+ case IMAGE: return out << "image";
+ case MEM_INVALID: return out << "invalid";
+ };
+ return out;
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Implements the various introspection functions
+ ///////////////////////////////////////////////////////////////////////////
+ template <typename T, typename U> struct HelperIntrospection {
+ enum { value = 0 };
+ };
+ template <typename T> struct HelperIntrospection<T,T> {
+ enum { value = 1 };
+ };
+
+ RegisterData Instruction::getDstData(uint32_t ID) const {
+ const Function &fn = this->getFunction();
+ return fn.getRegisterData(this->getDst(ID));
+ }
+ RegisterData Instruction::getSrcData(uint32_t ID) const {
+ const Function &fn = this->getFunction();
+ return fn.getRegisterData(this->getSrc(ID));
+ }
+
+#define DECL_INSN(OPCODE, CLASS) \
+ case OP_##OPCODE: \
+ return HelperIntrospection<CLASS, RefClass>::value == 1;
+
+#define START_INTROSPECTION(CLASS) \
+ static_assert(sizeof(internal::CLASS) == (sizeof(uint64_t)*2), \
+ "Bad instruction size"); \
+ static_assert(offsetof(internal::CLASS, opcode) == 0, \
+ "Bad opcode offset"); \
+ bool CLASS::isClassOf(const Instruction &insn) { \
+ const Opcode op = insn.getOpcode(); \
+ typedef CLASS RefClass; \
+ switch (op) {
+
+#define END_INTROSPECTION(CLASS) \
+ default: return false; \
+ }; \
+ }
+
+START_INTROSPECTION(UnaryInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(UnaryInstruction)
+
+START_INTROSPECTION(BinaryInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(BinaryInstruction)
+
+START_INTROSPECTION(CompareInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(CompareInstruction)
+
+START_INTROSPECTION(BitCastInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(BitCastInstruction)
+
+START_INTROSPECTION(ConvertInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(ConvertInstruction)
+
+START_INTROSPECTION(AtomicInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(AtomicInstruction)
+
+START_INTROSPECTION(SelectInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(SelectInstruction)
+
+START_INTROSPECTION(TernaryInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(TernaryInstruction)
+
+START_INTROSPECTION(BranchInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(BranchInstruction)
+
+START_INTROSPECTION(SampleInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(SampleInstruction)
+
+START_INTROSPECTION(TypedWriteInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(TypedWriteInstruction)
+
+START_INTROSPECTION(GetImageInfoInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(GetImageInfoInstruction)
+
+START_INTROSPECTION(LoadImmInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(LoadImmInstruction)
+
+START_INTROSPECTION(LoadInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(LoadInstruction)
+
+START_INTROSPECTION(StoreInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(StoreInstruction)
+
+START_INTROSPECTION(SyncInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(SyncInstruction)
+
+START_INTROSPECTION(LabelInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(LabelInstruction)
+
+#undef END_INTROSPECTION
+#undef START_INTROSPECTION
+#undef DECL_INSN
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Implements the function dispatching from public to internal with some
+ // macro horrors
+ ///////////////////////////////////////////////////////////////////////////
+
+#define DECL_INSN(OPCODE, CLASS) \
+ case OP_##OPCODE: return reinterpret_cast<const internal::CLASS*>(this)->CALL;
+
+#define START_FUNCTION(CLASS, RET, PROTOTYPE) \
+ RET CLASS::PROTOTYPE const { \
+ const Opcode op = this->getOpcode(); \
+ switch (op) {
+
+#define END_FUNCTION(CLASS, RET) \
+ case OP_INVALID: return RET(); \
+ }; \
+ return RET(); \
+ }
+
+#define CALL getSrcNum()
+START_FUNCTION(Instruction, uint32_t, getSrcNum(void))
+#include "ir/instruction.hxx"
+END_FUNCTION(Instruction, uint32_t)
+#undef CALL
+
+#define CALL getDstNum()
+START_FUNCTION(Instruction, uint32_t, getDstNum(void))
+#include "ir/instruction.hxx"
+END_FUNCTION(Instruction, uint32_t)
+#undef CALL
+
+#undef DECL_INSN
+
+#define DECL_INSN(OPCODE, CLASS) \
+ case OP_##OPCODE: \
+ { \
+ const Function &fn = this->getFunction(); \
+ return reinterpret_cast<const internal::CLASS*>(this)->CALL; \
+ }
+
+#define CALL wellFormed(fn, whyNot)
+START_FUNCTION(Instruction, bool, wellFormed(std::string &whyNot))
+#include "ir/instruction.hxx"
+END_FUNCTION(Instruction, bool)
+#undef CALL
+
+#define CALL getDst(fn, ID)
+START_FUNCTION(Instruction, Register, getDst(uint32_t ID))
+#include "ir/instruction.hxx"
+END_FUNCTION(Instruction, Register)
+#undef CALL
+
+#define CALL getSrc(fn, ID)
+START_FUNCTION(Instruction, Register, getSrc(uint32_t ID))
+#include "ir/instruction.hxx"
+END_FUNCTION(Instruction, Register)
+#undef CALL
+
+#undef DECL_INSN
+#undef END_FUNCTION
+#undef START_FUNCTION
+
+ void Instruction::setSrc(uint32_t srcID, Register reg) {
+ Function &fn = this->getFunction();
+#if GBE_DEBUG
+ const RegisterData oldData = this->getSrcData(srcID);
+ const RegisterData newData = fn.getRegisterData(reg);
+ GBE_ASSERT(oldData.family == newData.family);
+#endif /* GBE_DEBUG */
+ const Opcode op = this->getOpcode();
+ switch (op) {
+#define DECL_INSN(OP, FAMILY)\
+ case OP_##OP:\
+ reinterpret_cast<internal::FAMILY*>(this)->setSrc(fn, srcID, reg);\
+ break;
+#include "instruction.hxx"
+#undef DECL_INSN
+ case OP_INVALID: NOT_SUPPORTED; break;
+ };
+ }
+
+ void Instruction::setDst(uint32_t dstID, Register reg) {
+ Function &fn = this->getFunction();
+#if GBE_DEBUG
+ const RegisterData oldData = this->getDstData(dstID);
+ const RegisterData newData = fn.getRegisterData(reg);
+ GBE_ASSERT(oldData.family == newData.family);
+#endif /* GBE_DEBUG */
+ const Opcode op = this->getOpcode();
+ switch (op) {
+#define DECL_INSN(OP, FAMILY)\
+ case OP_##OP:\
+ reinterpret_cast<internal::FAMILY*>(this)->setDst(fn, dstID, reg);\
+ break;
+#include "instruction.hxx"
+#undef DECL_INSN
+ case OP_INVALID: NOT_SUPPORTED; break;
+ };
+ }
+
+ const Function &Instruction::getFunction(void) const {
+ const BasicBlock *bb = this->getParent();
+ GBE_ASSERT(bb != NULL);
+ return bb->getParent();
+ }
+ Function &Instruction::getFunction(void) {
+ BasicBlock *bb = this->getParent();
+ GBE_ASSERT(bb != NULL);
+ return bb->getParent();
+ }
+
+ void Instruction::replace(Instruction *other) const {
+ Function &fn = other->getFunction();
+ Instruction *insn = fn.newInstruction(*this);
+ intrusive_list_node *prev = other->prev;
+ insn->parent = other->parent;
+ other->remove();
+ append(insn, prev);
+ }
+
+ void Instruction::remove(void) {
+ Function &fn = this->getFunction();
+ unlink(this);
+ fn.deleteInstruction(this);
+ }
+
+ void Instruction::insert(Instruction *prev, Instruction ** new_ins) {
+ Function &fn = prev->getFunction();
+ Instruction *insn = fn.newInstruction(*this);
+ insn->parent = prev->parent;
+ append(insn, prev);
+ if (new_ins)
+ *new_ins = insn;
+ }
+
+ bool Instruction::hasSideEffect(void) const {
+ return opcode == OP_STORE ||
+ opcode == OP_TYPED_WRITE ||
+ opcode == OP_SYNC ||
+ opcode == OP_ATOMIC;
+ }
+
+#define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \
+ RET CLASS::PROTOTYPE const { \
+ return reinterpret_cast<const internal::CLASS*>(this)->CALL; \
+ }
+
+DECL_MEM_FN(UnaryInstruction, Type, getType(void), getType())
+DECL_MEM_FN(BinaryInstruction, Type, getType(void), getType())
+DECL_MEM_FN(BinaryInstruction, bool, commutes(void), commutes())
+DECL_MEM_FN(SelectInstruction, Type, getType(void), getType())
+DECL_MEM_FN(TernaryInstruction, Type, getType(void), getType())
+DECL_MEM_FN(CompareInstruction, Type, getType(void), getType())
+DECL_MEM_FN(BitCastInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(BitCastInstruction, Type, getDstType(void), getDstType())
+DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
+DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(AtomicInstruction, BTI, getBTI(void), getBTI())
+DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
+DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
+DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
+DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(StoreInstruction, BTI, getBTI(void), getBTI())
+DECL_MEM_FN(StoreInstruction, bool, isAligned(void), isAligned())
+DECL_MEM_FN(LoadInstruction, Type, getValueType(void), getValueType())
+DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
+DECL_MEM_FN(LoadInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(LoadInstruction, BTI, getBTI(void), getBTI())
+DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
+DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
+DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
+DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
+DECL_MEM_FN(BranchInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
+DECL_MEM_FN(SyncInstruction, uint32_t, getParameters(void), getParameters())
+DECL_MEM_FN(SampleInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(SampleInstruction, Type, getDstType(void), getDstType())
+DECL_MEM_FN(SampleInstruction, const uint8_t, getSamplerIndex(void), getSamplerIndex())
+DECL_MEM_FN(SampleInstruction, const uint8_t, getSamplerOffset(void), getSamplerOffset())
+DECL_MEM_FN(SampleInstruction, const uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(TypedWriteInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
+DECL_MEM_FN(TypedWriteInstruction, const uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
+DECL_MEM_FN(GetImageInfoInstruction, const uint8_t, getImageIndex(void), getImageIndex())
+
+#undef DECL_MEM_FN
+
+ Immediate LoadImmInstruction::getImmediate(void) const {
+ const Function &fn = this->getFunction();
+ return reinterpret_cast<const internal::LoadImmInstruction*>(this)->getImmediate(fn);
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Implements the emission functions
+ ///////////////////////////////////////////////////////////////////////////
+
+ // For all unary functions with given opcode
+ Instruction ALU1(Opcode opcode, Type type, Register dst, Register src) {
+ return internal::UnaryInstruction(opcode, type, dst, src).convert();
+ }
+
+ // All unary functions
+#define DECL_EMIT_FUNCTION(NAME) \
+ Instruction NAME(Type type, Register dst, Register src) { \
+ return ALU1(OP_##NAME, type, dst, src);\
+ }
+
+ DECL_EMIT_FUNCTION(MOV)
+ DECL_EMIT_FUNCTION(FBH)
+ DECL_EMIT_FUNCTION(FBL)
+ DECL_EMIT_FUNCTION(COS)
+ DECL_EMIT_FUNCTION(SIN)
+ DECL_EMIT_FUNCTION(LOG)
+ DECL_EMIT_FUNCTION(SQR)
+ DECL_EMIT_FUNCTION(RSQ)
+ DECL_EMIT_FUNCTION(RNDD)
+ DECL_EMIT_FUNCTION(RNDE)
+ DECL_EMIT_FUNCTION(RNDU)
+ DECL_EMIT_FUNCTION(RNDZ)
+
+#undef DECL_EMIT_FUNCTION
+
+ // All binary functions
+#define DECL_EMIT_FUNCTION(NAME) \
+ Instruction NAME(Type type, Register dst, Register src0, Register src1) { \
+ return internal::BinaryInstruction(OP_##NAME, type, dst, src0, src1).convert(); \
+ }
+
+ DECL_EMIT_FUNCTION(POW)
+ DECL_EMIT_FUNCTION(MUL)
+ DECL_EMIT_FUNCTION(ADD)
+ DECL_EMIT_FUNCTION(ADDSAT)
+ DECL_EMIT_FUNCTION(SUB)
+ DECL_EMIT_FUNCTION(SUBSAT)
+ DECL_EMIT_FUNCTION(MUL_HI)
+ DECL_EMIT_FUNCTION(I64_MUL_HI)
+ DECL_EMIT_FUNCTION(UPSAMPLE_SHORT)
+ DECL_EMIT_FUNCTION(UPSAMPLE_INT)
+ DECL_EMIT_FUNCTION(UPSAMPLE_LONG)
+ DECL_EMIT_FUNCTION(DIV)
+ DECL_EMIT_FUNCTION(REM)
+ DECL_EMIT_FUNCTION(SHL)
+ DECL_EMIT_FUNCTION(SHR)
+ DECL_EMIT_FUNCTION(ASR)
+ DECL_EMIT_FUNCTION(BSF)
+ DECL_EMIT_FUNCTION(BSB)
+ DECL_EMIT_FUNCTION(OR)
+ DECL_EMIT_FUNCTION(XOR)
+ DECL_EMIT_FUNCTION(AND)
+ DECL_EMIT_FUNCTION(HADD)
+ DECL_EMIT_FUNCTION(RHADD)
+ DECL_EMIT_FUNCTION(I64HADD)
+ DECL_EMIT_FUNCTION(I64RHADD)
+
+#undef DECL_EMIT_FUNCTION
+
+ // SEL
+ Instruction SEL(Type type, Register dst, Tuple src) {
+ return internal::SelectInstruction(type, dst, src).convert();
+ }
+
+ Instruction I64MADSAT(Type type, Register dst, Tuple src) {
+ return internal::TernaryInstruction(OP_I64MADSAT, type, dst, src).convert();
+ }
+
+ Instruction MAD(Type type, Register dst, Tuple src) {
+ return internal::TernaryInstruction(OP_MAD, type, dst, src).convert();
+ }
+ // All compare functions
+#define DECL_EMIT_FUNCTION(NAME) \
+ Instruction NAME(Type type, Register dst, Register src0, Register src1) { \
+ const internal::CompareInstruction insn(OP_##NAME, type, dst, src0, src1); \
+ return insn.convert(); \
+ }
+
+ DECL_EMIT_FUNCTION(EQ)
+ DECL_EMIT_FUNCTION(NE)
+ DECL_EMIT_FUNCTION(LE)
+ DECL_EMIT_FUNCTION(LT)
+ DECL_EMIT_FUNCTION(GE)
+ DECL_EMIT_FUNCTION(GT)
+ DECL_EMIT_FUNCTION(ORD)
+
+#undef DECL_EMIT_FUNCTION
+
+ // BITCAST
+ Instruction BITCAST(Type dstType, Type srcType, Tuple dst, Tuple src, uint8_t dstNum, uint8_t srcNum) {
+ return internal::BitCastInstruction(dstType, srcType, dst, src, dstNum, srcNum).convert();
+ }
+
+ // CVT
+ Instruction CVT(Type dstType, Type srcType, Register dst, Register src) {
+ return internal::ConvertInstruction(OP_CVT, dstType, srcType, dst, src).convert();
+ }
+
+ // saturated convert
+ Instruction SAT_CVT(Type dstType, Type srcType, Register dst, Register src) {
+ return internal::ConvertInstruction(OP_SAT_CVT, dstType, srcType, dst, src).convert();
+ }
+
+ // CVT
+ Instruction F16TO32(Type dstType, Type srcType, Register dst, Register src) {
+ return internal::ConvertInstruction(OP_F16TO32, dstType, srcType, dst, src).convert();
+ }
+
+ // saturated convert
+ Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src) {
+ return internal::ConvertInstruction(OP_F32TO16, dstType, srcType, dst, src).convert();
+ }
+
+ // For all unary functions with given opcode
+ Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, BTI bti, Tuple src) {
+ return internal::AtomicInstruction(atomicOp, dst, space, bti, src).convert();
+ }
+
+ // BRA
+ Instruction BRA(LabelIndex labelIndex) {
+ return internal::BranchInstruction(OP_BRA, labelIndex).convert();
+ }
+ Instruction BRA(LabelIndex labelIndex, Register pred) {
+ return internal::BranchInstruction(OP_BRA, labelIndex, pred).convert();
+ }
+
+ // RET
+ Instruction RET(void) {
+ return internal::BranchInstruction(OP_RET).convert();
+ }
+
+ // LOADI
+ Instruction LOADI(Type type, Register dst, ImmediateIndex value) {
+ return internal::LoadImmInstruction(type, dst, value).convert();
+ }
+
+ // LOAD and STORE
+#define DECL_EMIT_FUNCTION(NAME, CLASS) \
+ Instruction NAME(Type type, \
+ Tuple tuple, \
+ Register offset, \
+ AddressSpace space, \
+ uint32_t valueNum, \
+ bool dwAligned, \
+ BTI bti) \
+ { \
+ return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,bti).convert(); \
+ }
+
+ DECL_EMIT_FUNCTION(LOAD, LoadInstruction)
+ DECL_EMIT_FUNCTION(STORE, StoreInstruction)
+
+#undef DECL_EMIT_FUNCTION
+
+ // FENCE
+ Instruction SYNC(uint32_t parameters) {
+ return internal::SyncInstruction(parameters).convert();
+ }
+
+ // LABEL
+ Instruction LABEL(LabelIndex labelIndex) {
+ return internal::LabelInstruction(labelIndex).convert();
+ }
+
+ // SAMPLE
+ Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset) {
+ return internal::SampleInstruction(imageIndex, dst, src, dstIsFloat, srcIsFloat, sampler, samplerOffset).convert();
+ }
+
+ Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType) {
+ return internal::TypedWriteInstruction(imageIndex, src, srcType, coordType).convert();
+ }
+
+ Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg) {
+ return internal::GetImageInfoInstruction(infoType, dst, imageIndex, infoReg).convert();
+ }
+
+ std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
+ const Function &fn = insn.getFunction();
+ switch (insn.getOpcode()) {
+#define DECL_INSN(OPCODE, CLASS) \
+ case OP_##OPCODE: \
+ reinterpret_cast<const internal::CLASS&>(insn).out(out, fn); \
+ break;
+#include "instruction.hxx"
+#undef DECL_INSN
+ case OP_INVALID: NOT_SUPPORTED; break;
+ };
+ return out;
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
new file mode 100644
index 0000000..a75a441
--- /dev/null
+++ b/backend/src/ir/instruction.hpp
@@ -0,0 +1,687 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file instruction.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_INSTRUCTION_HPP__
+#define __GBE_IR_INSTRUCTION_HPP__
+
+#include "ir/register.hpp"
+#include "ir/immediate.hpp"
+#include "ir/type.hpp"
+#include "sys/platform.hpp"
+#include "sys/intrusive_list.hpp"
+
+#include <ostream>
+#define MAX_MIXED_POINTER 4
+
+namespace gbe {
+namespace ir {
+ struct BTI {
+ uint8_t bti[MAX_MIXED_POINTER];
+ uint8_t count;
+ BTI() : count(0) {
+ memset(bti, 0, MAX_MIXED_POINTER);
+ }
+ ~BTI() {}
+ };
+
+ /*! All opcodes */
+ enum Opcode : uint8_t {
+#define DECL_INSN(INSN, FAMILY) OP_##INSN,
+#include "ir/instruction.hxx"
+#undef DECL_INSN
+ OP_INVALID
+ };
+
+ /*! Different memory spaces */
+ enum AddressSpace : uint8_t {
+ MEM_GLOBAL = 0, //!< Global memory (a la OCL)
+ MEM_LOCAL, //!< Local memory (thread group memory)
+ MEM_CONSTANT, //!< Immutable global memory
+ MEM_PRIVATE, //!< Per thread private memory
+ IMAGE, //!< For texture image.
+ MEM_INVALID
+ };
+
+ enum AtomicOps {
+ ATOMIC_OP_AND = 1,
+ ATOMIC_OP_OR = 2,
+ ATOMIC_OP_XOR = 3,
+ ATOMIC_OP_XCHG = 4,
+ ATOMIC_OP_INC = 5,
+ ATOMIC_OP_DEC = 6,
+ ATOMIC_OP_ADD = 7,
+ ATOMIC_OP_SUB = 8,
+ ATOMIC_OP_IMAX = 10,
+ ATOMIC_OP_IMIN = 11,
+ ATOMIC_OP_UMAX = 12,
+ ATOMIC_OP_UMIN = 13,
+ ATOMIC_OP_CMPXCHG = 14,
+ ATOMIC_OP_INVALID
+ };
+
+ /* Vote function per hardware thread */
+ enum VotePredicate : uint8_t {
+ VOTE_ALL = 0,
+ VOTE_ANY
+ };
+
+ /*! Output the memory space */
+ std::ostream &operator<< (std::ostream &out, AddressSpace addrSpace);
+
+ /*! A label is identified with an unsigned short */
+ TYPE_SAFE(LabelIndex, uint16_t)
+
+ /*! Function class contains the register file and the register tuple. Any
+ * information related to the registers may therefore require a function
+ */
+ class Function;
+
+ /*! Contains the stream of instructions */
+ class BasicBlock;
+
+ ///////////////////////////////////////////////////////////////////////////
+ /// All public instruction classes as manipulated by all public classes
+ ///////////////////////////////////////////////////////////////////////////
+
+ /*! Stores instruction internal data and opcode */
+ class ALIGNED(sizeof(uint64_t)*2) InstructionBase
+ {
+ public:
+ /*! Initialize the instruction from a 8 bytes stream */
+ INLINE InstructionBase(const char *stream) {
+ opcode = Opcode(stream[0]);
+ for (uint32_t byte = 0; byte < opaqueSize; ++byte)
+ opaque[byte] = stream[byte+1];
+ }
+ /*! Uninitialized instruction */
+ INLINE InstructionBase(void) {}
+ /*! Get the instruction opcode */
+ INLINE Opcode getOpcode(void) const { return opcode; }
+ protected:
+ enum { opaqueSize = sizeof(uint64_t)*2-sizeof(uint8_t) };
+ Opcode opcode; //!< Idendifies the instruction
+ char opaque[opaqueSize]; //!< Remainder of it
+ GBE_CLASS(InstructionBase); //!< Use internal allocators
+ };
+
+ /*! Store the instruction description in 32 bytes */
+ class Instruction : public InstructionBase, public intrusive_list_node
+ {
+ public:
+ /*! Initialize the instruction from a 8 bytes stream */
+ INLINE Instruction(const char *stream) : InstructionBase(stream) {
+ parent = NULL;
+ }
+ /*! Copy the private fields and give it the same parent */
+ INLINE Instruction(const Instruction &other) :
+ InstructionBase(reinterpret_cast<const char*>(&other.opcode)) {
+ parent = other.parent;
+ }
+ private:
+ /*! To be consistant with copy constructor */
+ INLINE Instruction &operator= (const Instruction &other) { return *this; }
+ public:
+ /*! Nothing to do here */
+ INLINE ~Instruction(void) {}
+ /*! Uninitialized instruction */
+ INLINE Instruction(void) {}
+ /*! Get the number of sources for this instruction */
+ uint32_t getSrcNum(void) const;
+ /*! Get the number of destination for this instruction */
+ uint32_t getDstNum(void) const;
+ /*! Get the register index of the given source */
+ Register getSrc(uint32_t ID = 0u) const;
+ /*! Get the register index of the given destination */
+ Register getDst(uint32_t ID = 0u) const;
+ /*! Get the register of the given source */
+ RegisterData getDstData(uint32_t ID = 0u) const;
+ /*! Get the register of the given destination */
+ RegisterData getSrcData(uint32_t ID = 0u) const;
+ /*! Set a register in src srcID */
+ void setSrc(uint32_t srcID, Register reg);
+ /*! Set a register in dst dstID */
+ void setDst(uint32_t dstID, Register reg);
+ /*! Is there any side effect in the memory sub-system? */
+ bool hasSideEffect(void) const;
+ /*! Get / set the parent basic block */
+ BasicBlock *getParent(void) { return parent; }
+ const BasicBlock *getParent(void) const { return parent; }
+ void setParent(BasicBlock *block) { this->parent = block; }
+ /*! Get the function from the parent basic block */
+ const Function &getFunction(void) const;
+ Function &getFunction(void);
+ /*! Check that the instruction is well formed (type properly match,
+ * registers not of bound and so on). If not well formed, provide a reason
+ * in string why
+ */
+ bool wellFormed(std::string &why) const;
+ /*! Replace other by this instruction */
+ void replace(Instruction *other) const;
+ /*! Remove the instruction from the instruction stream */
+ void remove(void);
+ /* Insert the instruction after the previous one. */
+ void insert(Instruction *prev, Instruction ** new_ins = NULL);
+ /*! Indicates if the instruction belongs to instruction type T. Typically, T
+ * can be BinaryInstruction, UnaryInstruction, LoadInstruction and so on
+ */
+ template <typename T> INLINE bool isMemberOf(void) const {
+ return T::isClassOf(*this);
+ }
+ /*! max_src for store instruction (vec16 + addr) */
+ static const uint32_t MAX_SRC_NUM = 17;
+ static const uint32_t MAX_DST_NUM = 16;
+ protected:
+ BasicBlock *parent; //!< The basic block containing the instruction
+ GBE_CLASS(Instruction); //!< Use internal allocators
+ };
+
+ /*! Output the instruction string in the given stream */
+ std::ostream &operator<< (std::ostream &out, const Instruction &proxy);
+
+ /*! Unary instructions are typed. dst and sources share the same type */
+ class UnaryInstruction : public Instruction {
+ public:
+ /*! Get the type manipulated by the instruction */
+ Type getType(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Binary instructions are typed. dst and sources share the same type */
+ class BinaryInstruction : public Instruction {
+ public:
+ /*! Get the type manipulated by the instruction */
+ Type getType(void) const;
+ /*! Commutative instructions can allow better optimizations */
+ bool commutes(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Ternary instructions are typed. dst and sources share the same type */
+ class TernaryInstruction : public Instruction {
+ public:
+ Type getType(void) const;
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Select instructions writes src0 to dst if cond is true. Otherwise, it
+ * writes src1
+ */
+ class SelectInstruction : public Instruction {
+ public:
+ /*! Predicate is in slot 0. So first source to selec is in slot 1 */
+ static const uint32_t src0Index = 1;
+ /*! Second source to select is in slot 2 */
+ static const uint32_t src1Index = 2;
+ /*! Get the predicate of the selection instruction */
+ INLINE Register getPredicate(void) const { return this->getSrc(0); }
+ /*! Get the type of both sources */
+ Type getType(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Compare instructions compare anything from the same type and return a
+ * boolean value
+ */
+ class CompareInstruction : public Instruction {
+ public:
+ /*! Get the type of the source registers */
+ Type getType(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! BitCast instruction converts from one type to another */
+ class BitCastInstruction : public Instruction {
+ public:
+ /*! Get the type of the source */
+ Type getSrcType(void) const;
+ /*! Get the type of the destination */
+ Type getDstType(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Conversion instruction converts from one type to another */
+ class ConvertInstruction : public Instruction {
+ public:
+ /*! Get the type of the source */
+ Type getSrcType(void) const;
+ /*! Get the type of the destination */
+ Type getDstType(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Atomic instruction */
+ class AtomicInstruction : public Instruction {
+ public:
+ /*! Where the address register goes */
+ static const uint32_t addressIndex = 0;
+ /*! Address space that is manipulated here */
+ AddressSpace getAddressSpace(void) const;
+ BTI getBTI(void) const;
+ /*! Return the atomic function code */
+ AtomicOps getAtomicOpcode(void) const;
+ /*! Return the register that contains the addresses */
+ INLINE Register getAddress(void) const { return this->getSrc(addressIndex); }
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Store instruction. First source is the address. Next sources are the
+ * values to store contiguously at the given address
+ */
+ class StoreInstruction : public Instruction {
+ public:
+ /*! Where the address register goes */
+ static const uint32_t addressIndex = 0;
+ /*! Return the types of the values to store */
+ Type getValueType(void) const;
+ /*! Give the number of values the instruction is storing (srcNum-1) */
+ uint32_t getValueNum(void) const;
+ BTI getBTI(void) const;
+ /*! Address space that is manipulated here */
+ AddressSpace getAddressSpace(void) const;
+ /*! DWORD aligned means untyped read for Gen. That is what matters */
+ bool isAligned(void) const;
+ /*! Return the register that contains the addresses */
+ INLINE Register getAddress(void) const { return this->getSrc(addressIndex); }
+ /*! Return the register that contain value valueID */
+ INLINE Register getValue(uint32_t valueID) const {
+ GBE_ASSERT(valueID < this->getValueNum());
+ return this->getSrc(valueID + 1u);
+ }
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Load instruction. The source is simply the address where to get the data.
+ * The multiple destinations are the contiguous values loaded at the given
+ * address
+ */
+ class LoadInstruction : public Instruction {
+ public:
+ /*! Type of the loaded values (ie type of all the destinations) */
+ Type getValueType(void) const;
+ /*! Number of values loaded (ie number of destinations) */
+ uint32_t getValueNum(void) const;
+ /*! Address space that is manipulated here */
+ AddressSpace getAddressSpace(void) const;
+ /*! DWORD aligned means untyped read for Gen. That is what matters */
+ bool isAligned(void) const;
+ /*! Return the register that contains the addresses */
+ INLINE Register getAddress(void) const { return this->getSrc(0u); }
+ BTI getBTI(void) const;
+ /*! Return the register that contain value valueID */
+ INLINE Register getValue(uint32_t valueID) const {
+ return this->getDst(valueID);
+ }
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Load immediate instruction loads an typed immediate value into the given
+ * register. Since double and uint64_t values will not fit into an
+ * instruction, the immediate themselves are stored in the function core.
+ * Contrary to regular load instructions, there is only one destination
+ * possible
+ */
+ class LoadImmInstruction : public Instruction {
+ public:
+ /*! Return the value stored in the instruction */
+ Immediate getImmediate(void) const;
+ /*! Return the type of the stored value */
+ Type getType(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Store data in an texture */
+ class TypedWriteInstruction : public Instruction {
+ public:
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ const uint8_t getImageIndex() const;
+ Type getSrcType(void) const;
+ Type getCoordType(void) const;
+ };
+
+ /*! Load texels from a texture */
+ class SampleInstruction : public Instruction {
+ public:
+ const uint8_t getImageIndex() const;
+ const uint8_t getSamplerIndex(void) const;
+ const uint8_t getSamplerOffset(void) const;
+ Type getSrcType(void) const;
+ Type getDstType(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ typedef union _ImageInfoKey{
+ _ImageInfoKey(uint8_t i, uint8_t t) : index(i), type(t) {};
+ struct {
+ uint8_t index; /*! the allocated image index */
+ uint8_t type; /*! the information type */
+ };
+ uint16_t data;
+ } ImageInfoKey;
+
+ /*! Get image information */
+ class GetImageInfoInstruction : public Instruction {
+ public:
+ enum {
+ WIDTH = 0,
+ HEIGHT = 1,
+ DEPTH = 2,
+ CHANNEL_DATA_TYPE = 3,
+ CHANNEL_ORDER = 4,
+ };
+
+ static INLINE uint32_t getDstNum4Type(int infoType) {
+ switch (infoType) {
+ case WIDTH:
+ case HEIGHT:
+ case DEPTH:
+ case CHANNEL_DATA_TYPE:
+ case CHANNEL_ORDER:
+ return 1;
+ break;
+ default:
+ GBE_ASSERT(0);
+ }
+ return 0;
+ }
+
+ const uint8_t getImageIndex() const;
+ uint32_t getInfoType() const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Branch instruction is the unified way to branch (with or without
+ * predicate)
+ */
+ class BranchInstruction : public Instruction {
+ public:
+ /*! Indicate if the branch is predicated */
+ bool isPredicated(void) const;
+ /*! Return the predicate register (if predicated) */
+ RegisterData getPredicate(void) const {
+ GBE_ASSERTM(this->isPredicated() == true, "Branch is not predicated");
+ return this->getSrcData(0);
+ }
+ /*! Return the predicate register index (if predicated) */
+ Register getPredicateIndex(void) const {
+ GBE_ASSERTM(this->isPredicated() == true, "Branch is not predicated");
+ return this->getSrc(0);
+ }
+ /*! Return the label index pointed by the branch */
+ LabelIndex getLabelIndex(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Label instruction are actual no-op but are referenced by branches as their
+ * targets
+ */
+ class LabelInstruction : public Instruction {
+ public:
+ /*! Return the label index of the instruction */
+ LabelIndex getLabelIndex(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Texture instruction are used for any texture mapping requests */
+ class TextureInstruction : public Instruction {
+ public:
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Mapped to OpenCL (mem_fence, read_mem_fence, write_mem_fence, barrier) */
+ enum {
+ SYNC_WORKGROUP_EXEC = 1<<0,
+ SYNC_LOCAL_READ_FENCE = 1<<1,
+ SYNC_LOCAL_WRITE_FENCE = 1<<2,
+ SYNC_GLOBAL_READ_FENCE = 1<<3,
+ SYNC_GLOBAL_WRITE_FENCE = 1<<4,
+ SYNC_INVALID = 1<<5
+ };
+
+ /*! 5 bits to encode all possible synchronization capablities */
+ static const uint32_t syncFieldNum = 5u;
+
+ /*! When barrier(CLK_LOCAL_MEM_FENCE) is issued */
+ static const uint32_t syncLocalBarrier = SYNC_WORKGROUP_EXEC |SYNC_LOCAL_WRITE_FENCE | SYNC_LOCAL_READ_FENCE;
+
+ /*! When barrier(CLK_GLOBAL_MEM_FENCE) is issued */
+ static const uint32_t syncGlobalBarrier = SYNC_WORKGROUP_EXEC | SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE;
+
+ /*! Sync instructions are used to order loads and stores for a given memory
+ * space and/or to serialize threads at a given point in the program
+ */
+ class SyncInstruction : public Instruction {
+ public:
+ /*! Get the parameters (bitfields) of the sync instructions (see above) */
+ uint32_t getParameters(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
+ /*! Specialize the instruction. Also performs typechecking first based on the
+ * opcode. Crashes if it fails
+ */
+ template <typename T>
+ INLINE T *cast(Instruction *insn) {
+ if(insn->isMemberOf<T>())
+ return reinterpret_cast<T*>(insn);
+ else
+ return NULL;
+ }
+ template <typename T>
+ INLINE const T *cast(const Instruction *insn) {
+ if(insn->isMemberOf<T>())
+ return reinterpret_cast<const T*>(insn);
+ else
+ return NULL;
+ }
+ template <typename T>
+ INLINE T &cast(Instruction &insn) {
+ GBE_ASSERTM(insn.isMemberOf<T>() == true, "Invalid instruction type");
+ return reinterpret_cast<T&>(insn);
+ }
+ template <typename T>
+ INLINE const T &cast(const Instruction &insn) {
+ GBE_ASSERTM(insn.isMemberOf<T>() == true, "Invalid instruction type");
+ return reinterpret_cast<const T&>(insn);
+ }
+
+ /*! Indicates if the given opcode belongs the given instruction family */
+ template <typename T, typename U> struct EqualType {enum {value = false};};
+ template <typename T> struct EqualType<T,T> { enum {value = true};};
+ template <typename T>
+ INLINE bool isOpcodeFrom(Opcode op) {
+ switch (op) {
+#define DECL_INSN(OPCODE, FAMILY) \
+ case OP_##OPCODE: return EqualType<T, FAMILY>::value;
+#include "instruction.hxx"
+#undef DECL_INSN
+ default: NOT_SUPPORTED; return false;
+ }
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ /// All emission functions
+ ///////////////////////////////////////////////////////////////////////////
+
+ /*! alu1.type dst src */
+ Instruction ALU1(Opcode opcode, Type type, Register dst, Register src);
+ /*! mov.type dst src */
+ Instruction MOV(Type type, Register dst, Register src);
+ /*! cos.type dst src */
+ Instruction COS(Type type, Register dst, Register src);
+ /*! sin.type dst src */
+ Instruction SIN(Type type, Register dst, Register src);
+ /*! mul_hi.type dst src */
+ Instruction MUL_HI(Type type, Register dst, Register src0, Register src1);
+ /*! i64_mul_hi.type dst src */
+ Instruction I64_MUL_HI(Type type, Register dst, Register src0, Register src1);
+ /*! i64madsat.type dst src */
+ Instruction I64MADSAT(Type type, Register dst, Tuple src);
+ /*! mad.type dst src */
+ Instruction MAD(Type type, Register dst, Tuple src);
+ /*! upsample_short.type dst src */
+ Instruction UPSAMPLE_SHORT(Type type, Register dst, Register src0, Register src1);
+ /*! upsample_int.type dst src */
+ Instruction UPSAMPLE_INT(Type type, Register dst, Register src0, Register src1);
+ /*! upsample_long.type dst src */
+ Instruction UPSAMPLE_LONG(Type type, Register dst, Register src0, Register src1);
+ /*! fbh.type dst src */
+ Instruction FBH(Type type, Register dst, Register src);
+ /*! fbl.type dst src */
+ Instruction FBL(Type type, Register dst, Register src);
+ /*! hadd.type dst src */
+ Instruction HADD(Type type, Register dst, Register src0, Register src1);
+ /*! rhadd.type dst src */
+ Instruction RHADD(Type type, Register dst, Register src0, Register src1);
+ /*! i64hadd.type dst src */
+ Instruction I64HADD(Type type, Register dst, Register src0, Register src1);
+ /*! i64rhadd.type dst src */
+ Instruction I64RHADD(Type type, Register dst, Register src0, Register src1);
+ /*! tan.type dst src */
+ Instruction RCP(Type type, Register dst, Register src);
+ /*! abs.type dst src */
+ Instruction ABS(Type type, Register dst, Register src);
+ /*! simd_all.type dst src */
+ Instruction SIMD_ALL(Type type, Register dst, Register src);
+ /*! simd_any.type dst src */
+ Instruction SIMD_ANY(Type type, Register dst, Register src);
+ /*! log.type dst src */
+ Instruction LOG(Type type, Register dst, Register src);
+ /*! exp.type dst src */
+ Instruction EXP(Type type, Register dst, Register src);
+ /*! sqr.type dst src */
+ Instruction SQR(Type type, Register dst, Register src);
+ /*! rsq.type dst src */
+ Instruction RSQ(Type type, Register dst, Register src);
+ /*! rndd.type dst src */
+ Instruction RNDD(Type type, Register dst, Register src);
+ /*! rnde.type dst src */
+ Instruction RNDE(Type type, Register dst, Register src);
+ /*! rndu.type dst src */
+ Instruction RNDU(Type type, Register dst, Register src);
+ /*! rndz.type dst src */
+ Instruction RNDZ(Type type, Register dst, Register src);
+ /*! pow.type dst src0 src1 */
+ Instruction POW(Type type, Register dst, Register src0, Register src1);
+ /*! mul.type dst src0 src1 */
+ Instruction MUL(Type type, Register dst, Register src0, Register src1);
+ /*! add.type dst src0 src1 */
+ Instruction ADD(Type type, Register dst, Register src0, Register src1);
+ /*! addsat.type dst src0 src1 */
+ Instruction ADDSAT(Type type, Register dst, Register src0, Register src1);
+ /*! sub.type dst src0 src1 */
+ Instruction SUB(Type type, Register dst, Register src0, Register src1);
+ /*! subsat.type dst src0 src1 */
+ Instruction SUBSAT(Type type, Register dst, Register src0, Register src1);
+ /*! div.type dst src0 src1 */
+ Instruction DIV(Type type, Register dst, Register src0, Register src1);
+ /*! rem.type dst src0 src1 */
+ Instruction REM(Type type, Register dst, Register src0, Register src1);
+ /*! shl.type dst src0 src1 */
+ Instruction SHL(Type type, Register dst, Register src0, Register src1);
+ /*! shr.type dst src0 src1 */
+ Instruction SHR(Type type, Register dst, Register src0, Register src1);
+ /*! asr.type dst src0 src1 */
+ Instruction ASR(Type type, Register dst, Register src0, Register src1);
+ /*! bsf.type dst src0 src1 */
+ Instruction BSF(Type type, Register dst, Register src0, Register src1);
+ /*! bsb.type dst src0 src1 */
+ Instruction BSB(Type type, Register dst, Register src0, Register src1);
+ /*! or.type dst src0 src1 */
+ Instruction OR(Type type, Register dst, Register src0, Register src1);
+ /*! xor.type dst src0 src1 */
+ Instruction XOR(Type type, Register dst, Register src0, Register src1);
+ /*! and.type dst src0 src1 */
+ Instruction AND(Type type, Register dst, Register src0, Register src1);
+ /*! sel.type dst {cond, src0, src1} (== src) */
+ Instruction SEL(Type type, Register dst, Tuple src);
+ /*! eq.type dst src0 src1 */
+ Instruction EQ(Type type, Register dst, Register src0, Register src1);
+ /*! ne.type dst src0 src1 */
+ Instruction NE(Type type, Register dst, Register src0, Register src1);
+ /*! lt.type dst src0 src1 */
+ Instruction LE(Type type, Register dst, Register src0, Register src1);
+ /*! le.type dst src0 src1 */
+ Instruction LT(Type type, Register dst, Register src0, Register src1);
+ /*! gt.type dst src0 src1 */
+ Instruction GE(Type type, Register dst, Register src0, Register src1);
+ /*! ge.type dst src0 src1 */
+ Instruction GT(Type type, Register dst, Register src0, Register src1);
+ /*! ord.type dst src0 src1 */
+ Instruction ORD(Type type, Register dst, Register src0, Register src1);
+ /*! BITCAST.{dstType <- srcType} dst src */
+ Instruction BITCAST(Type dstType, Type srcType, Tuple dst, Tuple src, uint8_t dstNum, uint8_t srcNum);
+ /*! cvt.{dstType <- srcType} dst src */
+ Instruction CVT(Type dstType, Type srcType, Register dst, Register src);
+ /*! sat_cvt.{dstType <- srcType} dst src */
+ Instruction SAT_CVT(Type dstType, Type srcType, Register dst, Register src);
+ /*! F16TO32.{dstType <- srcType} dst src */
+ Instruction F16TO32(Type dstType, Type srcType, Register dst, Register src);
+ /*! F32TO16.{dstType <- srcType} dst src */
+ Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
+ /*! atomic dst addr.space {src1 {src2}} */
+ Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, BTI bti, Tuple src);
+ /*! bra labelIndex */
+ Instruction BRA(LabelIndex labelIndex);
+ /*! (pred) bra labelIndex */
+ Instruction BRA(LabelIndex labelIndex, Register pred);
+ /*! ret */
+ Instruction RET(void);
+ /*! load.type.space {dst1,...,dst_valueNum} offset value */
+ Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
+ /*! store.type.space offset {src1,...,src_valueNum} value */
+ Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
+ /*! loadi.type dst value */
+ Instruction LOADI(Type type, Register dst, ImmediateIndex value);
+ /*! sync.params... (see Sync instruction) */
+ Instruction SYNC(uint32_t parameters);
+ /*! typed write */
+ Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType);
+ /*! sample textures */
+ Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset);
+ /*! get image information , such as width/height/depth/... */
+ Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg);
+ /*! label labelIndex */
+ Instruction LABEL(LabelIndex labelIndex);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_INSTRUCTION_HPP__ */
+
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
new file mode 100644
index 0000000..587517b
--- /dev/null
+++ b/backend/src/ir/instruction.hxx
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file instruction.hxx
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+DECL_INSN(MOV, UnaryInstruction)
+DECL_INSN(COS, UnaryInstruction)
+DECL_INSN(SIN, UnaryInstruction)
+DECL_INSN(LOG, UnaryInstruction)
+DECL_INSN(EXP, UnaryInstruction)
+DECL_INSN(SQR, UnaryInstruction)
+DECL_INSN(RSQ, UnaryInstruction)
+DECL_INSN(RCP, UnaryInstruction)
+DECL_INSN(ABS, UnaryInstruction)
+DECL_INSN(RNDD, UnaryInstruction)
+DECL_INSN(RNDE, UnaryInstruction)
+DECL_INSN(RNDU, UnaryInstruction)
+DECL_INSN(RNDZ, UnaryInstruction)
+DECL_INSN(SIMD_ANY, UnaryInstruction)
+DECL_INSN(SIMD_ALL, UnaryInstruction)
+DECL_INSN(POW, BinaryInstruction)
+DECL_INSN(MUL, BinaryInstruction)
+DECL_INSN(ADD, BinaryInstruction)
+DECL_INSN(ADDSAT, BinaryInstruction)
+DECL_INSN(SUB, BinaryInstruction)
+DECL_INSN(SUBSAT, BinaryInstruction)
+DECL_INSN(DIV, BinaryInstruction)
+DECL_INSN(REM, BinaryInstruction)
+DECL_INSN(SHL, BinaryInstruction)
+DECL_INSN(SHR, BinaryInstruction)
+DECL_INSN(ASR, BinaryInstruction)
+DECL_INSN(BSF, BinaryInstruction)
+DECL_INSN(BSB, BinaryInstruction)
+DECL_INSN(OR, BinaryInstruction)
+DECL_INSN(XOR, BinaryInstruction)
+DECL_INSN(AND, BinaryInstruction)
+DECL_INSN(SEL, SelectInstruction)
+DECL_INSN(EQ, CompareInstruction)
+DECL_INSN(NE, CompareInstruction)
+DECL_INSN(LE, CompareInstruction)
+DECL_INSN(LT, CompareInstruction)
+DECL_INSN(GE, CompareInstruction)
+DECL_INSN(GT, CompareInstruction)
+DECL_INSN(ORD, CompareInstruction)
+DECL_INSN(BITCAST, BitCastInstruction)
+DECL_INSN(CVT, ConvertInstruction)
+DECL_INSN(SAT_CVT, ConvertInstruction)
+DECL_INSN(F16TO32, ConvertInstruction)
+DECL_INSN(F32TO16, ConvertInstruction)
+DECL_INSN(ATOMIC, AtomicInstruction)
+DECL_INSN(BRA, BranchInstruction)
+DECL_INSN(RET, BranchInstruction)
+DECL_INSN(LOADI, LoadImmInstruction)
+DECL_INSN(LOAD, LoadInstruction)
+DECL_INSN(STORE, StoreInstruction)
+DECL_INSN(TYPED_WRITE, TypedWriteInstruction)
+DECL_INSN(SAMPLE, SampleInstruction)
+DECL_INSN(SYNC, SyncInstruction)
+DECL_INSN(LABEL, LabelInstruction)
+DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
+DECL_INSN(MUL_HI, BinaryInstruction)
+DECL_INSN(I64_MUL_HI, BinaryInstruction)
+DECL_INSN(FBH, UnaryInstruction)
+DECL_INSN(FBL, UnaryInstruction)
+DECL_INSN(HADD, BinaryInstruction)
+DECL_INSN(RHADD, BinaryInstruction)
+DECL_INSN(I64HADD, BinaryInstruction)
+DECL_INSN(I64RHADD, BinaryInstruction)
+DECL_INSN(UPSAMPLE_SHORT, BinaryInstruction)
+DECL_INSN(UPSAMPLE_INT, BinaryInstruction)
+DECL_INSN(UPSAMPLE_LONG, BinaryInstruction)
+DECL_INSN(I64MADSAT, TernaryInstruction)
+DECL_INSN(MAD, TernaryInstruction)
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
new file mode 100644
index 0000000..afed476
--- /dev/null
+++ b/backend/src/ir/liveness.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file liveness.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/liveness.hpp"
+#include <sstream>
+
+namespace gbe {
+namespace ir {
+
+ Liveness::Liveness(Function &fn) : fn(fn) {
+ // Initialize UEVar and VarKill for each block
+ fn.foreachBlock([this](const BasicBlock &bb) {
+ this->initBlock(bb);
+ // If the bb has ret instruction, add it to the work list set.
+ const Instruction *lastInsn = bb.getLastInstruction();
+ const ir::Opcode op = lastInsn->getOpcode();
+ struct BlockInfo * info = liveness[&bb];
+ if (op == OP_RET) {
+ workSet.insert(info);
+ info->liveOut.insert(ocl::retVal);
+ }
+ });
+ // Now with iterative analysis, we compute liveout and livein sets
+ this->computeLiveInOut();
+ // extend register (def in loop, use out-of-loop) liveness to the whole loop
+ set<Register> extentRegs;
+ this->computeExtraLiveInOut(extentRegs);
+ // analyze uniform values. The extentRegs contains all the values which is
+ // defined in a loop and use out-of-loop which could not be a uniform. The reason
+ // is that when it reenter the second time, it may active different lanes. So
+ // reenter many times may cause it has different values in different lanes.
+ this->analyzeUniform(&extentRegs);
+ }
+
+ Liveness::~Liveness(void) {
+ for (auto &pair : liveness) GBE_SAFE_DELETE(pair.second);
+ }
+
+ void Liveness::analyzeUniform(set<Register> *extentRegs) {
+ fn.foreachBlock([this, extentRegs](const BasicBlock &bb) {
+ const_cast<BasicBlock&>(bb).foreach([this, extentRegs](const Instruction &insn) {
+ const uint32_t srcNum = insn.getSrcNum();
+ const uint32_t dstNum = insn.getDstNum();
+ bool uniform = true;
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const Register reg = insn.getSrc(srcID);
+ if (!fn.isUniformRegister(reg))
+ uniform = false;
+ }
+ // A destination is a killed value
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const Register reg = insn.getDst(dstID);
+ int opCode = insn.getOpcode();
+ // FIXME, ADDSAT and uniform vector should be supported.
+ if (uniform &&
+ fn.getRegisterFamily(reg) != ir::FAMILY_QWORD &&
+ !insn.getParent()->definedPhiRegs.contains(reg) &&
+ opCode != ir::OP_ATOMIC &&
+ opCode != ir::OP_MUL_HI &&
+ opCode != ir::OP_HADD &&
+ opCode != ir::OP_RHADD &&
+ opCode != ir::OP_ADDSAT &&
+ (dstNum == 1 || insn.getOpcode() != ir::OP_LOAD) &&
+ !extentRegs->contains(reg)
+ )
+ fn.setRegisterUniform(reg, true);
+ }
+ });
+ });
+ }
+
+ void Liveness::initBlock(const BasicBlock &bb) {
+ GBE_ASSERT(liveness.contains(&bb) == false);
+ BlockInfo *info = GBE_NEW(BlockInfo, bb);
+ // Traverse all instructions to handle UEVar and VarKill
+ const_cast<BasicBlock&>(bb).foreach([this, info](const Instruction &insn) {
+ this->initInstruction(*info, insn);
+ });
+ liveness[&bb] = info;
+ }
+
+ void Liveness::initInstruction(BlockInfo &info, const Instruction &insn) {
+ const uint32_t srcNum = insn.getSrcNum();
+ const uint32_t dstNum = insn.getDstNum();
+ // First look for used before killed
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const Register reg = insn.getSrc(srcID);
+ // Not killed -> it is really an upward use
+ if (info.varKill.contains(reg) == false)
+ info.upwardUsed.insert(reg);
+ }
+ // A destination is a killed value
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const Register reg = insn.getDst(dstID);
+ info.varKill.insert(reg);
+ }
+ }
+
+// Use simple backward data flow analysis to solve the liveness problem.
+ void Liveness::computeLiveInOut(void) {
+ while(!workSet.empty()) {
+ auto currInfo = *workSet.begin();
+ workSet.erase(currInfo);
+ for (auto currOutVar : currInfo->liveOut)
+ if (!currInfo->varKill.contains(currOutVar))
+ currInfo->upwardUsed.insert(currOutVar);
+ bool isChanged = false;
+ for (auto prev : currInfo->bb.getPredecessorSet()) {
+ BlockInfo *prevInfo = liveness[prev];
+ for (auto currInVar : currInfo->upwardUsed) {
+ if (!prevInfo->bb.undefPhiRegs.contains(currInVar)) {
+ auto changed = prevInfo->liveOut.insert(currInVar);
+ if (changed.second) isChanged = true;
+ }
+ }
+ if (isChanged )
+ workSet.insert(prevInfo);
+ }
+ };
+#if 0
+ fn.foreachBlock([this](const BasicBlock &bb){
+ printf("label %d:\n", bb.getLabelIndex());
+ BlockInfo *info = liveness[&bb];
+ auto &outVarSet = info->liveOut;
+ auto &inVarSet = info->upwardUsed;
+ printf("\n\tin Lives: ");
+ for (auto inVar : inVarSet) {
+ printf("%d ", inVar);
+ }
+ printf("\n");
+ printf("\tout Lives: ");
+ for (auto outVar : outVarSet) {
+ printf("%d ", outVar);
+ }
+ printf("\n");
+
+ });
+#endif
+ }
+/*
+ As we run in SIMD mode with prediction mask to indicate active lanes.
+ If a vreg is defined in a loop, and there are som uses of the vreg out of the loop,
+ the define point may be run several times under *different* prediction mask.
+ For these kinds of vreg, we must extend the vreg liveness into the whole loop.
+ If we don't do this, it's liveness is killed before the def point inside loop.
+ If the vreg's corresponding physical reg is assigned to other vreg during the
+ killed period, and the instructions before kill point were re-executed with different prediction,
+ the inactive lanes of vreg maybe over-written. Then the out-of-loop use will got wrong data.
+*/
+ void Liveness::computeExtraLiveInOut(set<Register> &extentRegs) {
+ const vector<Loop *> &loops = fn.getLoops();
+ extentRegs.clear();
+ if(loops.size() == 0) return;
+
+ for (auto l : loops) {
+ for (auto x : l->exits) {
+ const BasicBlock &a = fn.getBlock(x.first);
+ const BasicBlock &b = fn.getBlock(x.second);
+ BlockInfo * exiting = liveness[&a];
+ BlockInfo * exit = liveness[&b];
+ std::vector<Register> toExtend;
+
+ if(b.getPredecessorSet().size() > 1) {
+ for (auto p : exit->upwardUsed)
+ toExtend.push_back(p);
+ } else {
+ std::set_intersection(exiting->liveOut.begin(), exiting->liveOut.end(), exit->upwardUsed.begin(), exit->upwardUsed.end(), std::back_inserter(toExtend));
+ }
+ if (toExtend.size() == 0) continue;
+ for(auto r : toExtend)
+ extentRegs.insert(r);
+ for (auto bb : l->bbs) {
+ BlockInfo * bI = liveness[&fn.getBlock(bb)];
+ for(auto r : toExtend) {
+ if(!bI->upwardUsed.contains(r))
+ bI->upwardUsed.insert(r);
+ bI->liveOut.insert(r);
+ }
+ }
+ }
+ }
+#if 0
+ fn.foreachBlock([this](const BasicBlock &bb){
+ printf("label %d:\n", bb.getLabelIndex());
+ BlockInfo *info = liveness[&bb];
+ auto &outVarSet = info->liveOut;
+ auto &inVarSet = info->upwardUsed;
+ printf("\n\tLive Ins: ");
+ for (auto inVar : inVarSet) {
+ printf("%d ", inVar);
+ }
+ printf("\n");
+ printf("\tLive outs: ");
+ for (auto outVar : outVarSet) {
+ printf("%d ", outVar);
+ }
+ printf("\n");
+
+ });
+#endif
+ }
+
+
+ /*! To pretty print the livfeness info */
+ static const uint32_t prettyInsnStrSize = 48;
+ static const uint32_t prettyRegStrSize = 5;
+
+ /*! Describe how the register is used */
+ static const uint32_t USE_NONE = 0;
+ static const uint32_t USE_READ = 1 << 0;
+ static const uint32_t USE_WRITTEN = 1 << 1;
+
+ enum UsePosition {
+ POS_BEFORE = 0,
+ POS_HERE = 1,
+ POS_AFTER = 2
+ };
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/liveness.hpp b/backend/src/ir/liveness.hpp
new file mode 100644
index 0000000..d55e00d
--- /dev/null
+++ b/backend/src/ir/liveness.hpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file liveness.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_LIVENESS_HPP__
+#define __GBE_IR_LIVENESS_HPP__
+
+#include <list>
+#include "sys/map.hpp"
+#include "sys/set.hpp"
+#include "ir/register.hpp"
+#include "ir/function.hpp"
+
+namespace gbe {
+namespace ir {
+
+ // Liveness is computed per function
+ class Function;
+
+ /*! To choose the iteration direction, we either look at predecessors or
+ * successors
+ */
+ enum DataFlowDirection {
+ DF_PRED = 0,
+ DF_SUCC = 1
+ };
+
+ /*! Compute liveness of each register */
+ class Liveness : public NonCopyable
+ {
+ public:
+ Liveness(Function &fn);
+ ~Liveness(void);
+ /*! Set of variables used upwards in the block (before a definition) */
+ typedef set<Register> UEVar;
+ /*! Set of variables alive at the exit of the block */
+ typedef set<Register> LiveOut;
+ /*! Set of variables actually killed in each block */
+ typedef set<Register> VarKill;
+ /*! Per-block info */
+ struct BlockInfo : public NonCopyable {
+ BlockInfo(const BasicBlock &bb) : bb(bb) {}
+ const BasicBlock &bb;
+ INLINE bool inUpwardUsed(Register reg) const {
+ return upwardUsed.contains(reg);
+ }
+ INLINE bool inLiveOut(Register reg) const {
+ return liveOut.contains(reg);
+ }
+ INLINE bool inVarKill(Register reg) const {
+ return varKill.contains(reg);
+ }
+ UEVar upwardUsed;
+ LiveOut liveOut;
+ VarKill varKill;
+ };
+ /*! Gives for each block the variables alive at entry / exit */
+ typedef map<const BasicBlock*, BlockInfo*> Info;
+ /*! Return the complete liveness info */
+ INLINE const Info &getLivenessInfo(void) const { return liveness; }
+ /*! Return the complete block info */
+ INLINE const BlockInfo &getBlockInfo(const BasicBlock *bb) const {
+ auto it = liveness.find(bb);
+ GBE_ASSERT(it != liveness.end() && it->second != NULL);
+ return *it->second;
+ }
+ /*! Get the set of registers alive at the end of the block */
+ const LiveOut &getLiveOut(const BasicBlock *bb) const {
+ const BlockInfo &info = this->getBlockInfo(bb);
+ return info.liveOut;
+ }
+ /*! Get the set of registers alive at the beginning of the block */
+ const UEVar &getLiveIn(const BasicBlock *bb) const {
+ const BlockInfo &info = this->getBlockInfo(bb);
+ return info.upwardUsed;
+ }
+
+ /*! Return the function the liveness was computed on */
+ INLINE const Function &getFunction(void) const { return fn; }
+ /*! Actually do something for each successor / predecessor of *all* blocks */
+ template <DataFlowDirection dir, typename T>
+ void foreach(const T &functor) {
+ // Iterate on all blocks
+ for (const auto &pair : liveness) {
+ BlockInfo &info = *pair.second;
+ const BasicBlock &bb = info.bb;
+ const BlockSet *set = NULL;
+ if (dir == DF_SUCC)
+ set = &bb.getSuccessorSet();
+ else
+ set = &bb.getPredecessorSet();
+ // Iterate over all successors
+ for (auto other : *set) {
+ auto otherInfo = liveness.find(other);
+ GBE_ASSERT(otherInfo != liveness.end() && otherInfo->second != NULL);
+ functor(info, *otherInfo->second);
+ }
+ }
+ }
+ private:
+ /*! Store the liveness of all blocks */
+ Info liveness;
+ /*! Compute the liveness for this function */
+ Function &fn;
+ /*! Initialize UEVar and VarKill per block */
+ void initBlock(const BasicBlock &bb);
+ /*! Initialize UEVar and VarKill per instruction */
+ void initInstruction(BlockInfo &info, const Instruction &insn);
+ /*! Now really compute LiveOut based on UEVar and VarKill */
+ void computeLiveInOut(void);
+ void computeExtraLiveInOut(set<Register> &extentRegs);
+ void analyzeUniform(set<Register> *extentRegs);
+ /*! Set of work list block which has exit(return) instruction */
+ typedef set <struct BlockInfo*> WorkSet;
+ WorkSet workSet;
+
+ /*! Use custom allocators */
+ GBE_CLASS(Liveness);
+
+ };
+
+ /*! Output a nice ASCII reprensation of the liveness */
+ std::ostream &operator<< (std::ostream &out, const Liveness &liveness);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_LIVENESS_HPP__ */
+
diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
new file mode 100644
index 0000000..f71fd72
--- /dev/null
+++ b/backend/src/ir/lowering.cpp
@@ -0,0 +1,396 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file lowering.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "ir/context.hpp"
+#include "ir/value.hpp"
+#include "ir/liveness.hpp"
+#include "sys/set.hpp"
+
+namespace gbe {
+namespace ir {
+
+ /*! Small helper class to lower return instructions */
+ class ContextReturn : public Context
+ {
+ public:
+ /*! Initialize a context dedicated to return instruction lowering */
+ ContextReturn(Unit &unit) : Context(unit) {
+ this->usedLabels = GBE_NEW_NO_ARG(vector<uint8_t>);
+ }
+ /*! Lower the return instruction to gotos for the given function */
+ void lower(const std::string &functionName);
+ };
+
+ void ContextReturn::lower(const std::string &functionName) {
+ if ((this->fn = unit.getFunction(functionName)) == NULL)
+ return;
+
+ // Append a new block at the end of the function with a return instruction:
+ // the only one we are going to have
+ this->bb = &this->fn->getBottomBlock();
+ const LabelIndex index = this->label();
+ this->LABEL(index);
+ const BasicBlock *lastBlock = this->bb;
+ this->RET();
+
+ // Now traverse all instructions and replace all returns by GOTO index
+ fn->foreachInstruction([&](Instruction &insn) {
+ if (insn.getParent() == lastBlock) return; // This is the last block
+ if (insn.getOpcode() != OP_RET) return;
+ const Instruction bra = ir::BRA(index);
+ bra.replace(&insn);
+ });
+ }
+
+ void lowerReturn(Unit &unit, const std::string &functionName) {
+ ContextReturn ctx(unit);
+ ctx.lower(functionName);
+ }
+
+ /*! Characterizes how the argument is used (directly read, indirectly read,
+ * written)
+ */
+ enum ArgUse {
+ ARG_DIRECT_READ = 0,
+ ARG_INDIRECT_READ = 1,
+ ARG_WRITTEN = 2
+ };
+
+ /*! Just to book keep the sequence of instructions that directly load an input
+ * argument
+ */
+ struct LoadAddImm {
+ Instruction *load; //!< Load from the argument
+ Instruction *add; //!< Can be NULL if we only have load(arg)
+ Instruction *loadImm; //!< Can also be NULL
+ uint64_t offset; //!< Offset where to load in the structure
+ uint32_t argID; //!< Associated function argument
+ };
+
+ /*! List of direct loads */
+ typedef vector<LoadAddImm> LoadAddImmSeq;
+
+ /*! Helper class to lower function arguments if required */
+ class FunctionArgumentLowerer : public Context
+ {
+ public:
+ /*! Build the helper structure */
+ FunctionArgumentLowerer(Unit &unit);
+ /*! Free everything we needed */
+ virtual ~FunctionArgumentLowerer(void);
+ /*! Perform all function arguments substitution if needed */
+ void lower(const std::string &name);
+ /*! Lower the given function argument accesses */
+ void lower(uint32_t argID);
+ /*! Build the constant push for the function */
+ void buildConstantPush(void);
+ /*! Inspect the given function argument to see how it is used. If this is
+ * direct loads only, we also output the list of instructions used for each
+ * load
+ */
+ ArgUse getArgUse(uint32_t argID);
+ /*! Recursively look if there is a store in the given use */
+ bool useStore(const ValueDef &def, set<const Instruction*> &visited);
+ /*! Look if the pointer use only load with immediate offsets */
+ bool matchLoadAddImm(uint32_t argID);
+ Liveness *liveness; //!< To compute the function graph
+ FunctionDAG *dag; //!< Contains complete dependency information
+ LoadAddImmSeq seq; //!< All the direct loads
+ };
+
+ INLINE uint64_t getOffsetFromImm(const Immediate &imm) {
+ switch (imm.getType()) {
+ // bit-cast these ones
+ case TYPE_DOUBLE:
+ case TYPE_FLOAT: NOT_SUPPORTED; return 0;
+ case TYPE_S64:
+ case TYPE_U64:
+ case TYPE_U32:
+ case TYPE_U16:
+ case TYPE_U8:
+ // sign extend these ones
+ case TYPE_S32:
+ case TYPE_S16:
+ case TYPE_S8: return imm.getIntegerValue();
+ case TYPE_BOOL:
+ case TYPE_HALF: NOT_SUPPORTED; return 0;
+ default:
+ GBE_ASSERT(0 && "Unsupported imm type.\n");
+ }
+ return 0;
+ }
+
+ bool matchLoad(Instruction *insn,
+ Instruction *add,
+ Instruction *loadImm,
+ uint64_t offset,
+ uint32_t argID,
+ LoadAddImm &loadAddImm)
+ {
+ const Opcode opcode = insn->getOpcode();
+
+ if (opcode == OP_LOAD) {
+ LoadInstruction *load = cast<LoadInstruction>(insn);
+ if (load->getAddressSpace() != MEM_PRIVATE)
+ return false;
+ loadAddImm.load = insn;
+ loadAddImm.add = add;
+ loadAddImm.loadImm = loadImm;
+ loadAddImm.offset = offset;
+ loadAddImm.argID = argID;
+ return true;
+ } else
+ return false;
+ }
+
+
+ FunctionArgumentLowerer::FunctionArgumentLowerer(Unit &unit) :
+ Context(unit), liveness(NULL), dag(NULL) {}
+ FunctionArgumentLowerer::~FunctionArgumentLowerer(void) {
+ GBE_SAFE_DELETE(dag);
+ GBE_SAFE_DELETE(liveness);
+ }
+
+ void FunctionArgumentLowerer::lower(const std::string &functionName) {
+ if ((this->fn = unit.getFunction(functionName)) == NULL)
+ return;
+ GBE_SAFE_DELETE(dag);
+ GBE_SAFE_DELETE(liveness);
+ this->liveness = GBE_NEW(ir::Liveness, *fn);
+ this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
+
+ // Process all structure arguments and find all the direct loads we can
+ // replace
+ const uint32_t argNum = fn->argNum();
+ for (uint32_t argID = 0; argID < argNum; ++argID) {
+ FunctionArgument &arg = fn->getArg(argID);
+ if (arg.type != FunctionArgument::STRUCTURE) continue;
+ this->lower(argID);
+ }
+
+ // Build the constant push description and remove the instruction that
+ // therefore become useless
+ this->buildConstantPush();
+ }
+
+// Remove all the given instructions from the stream (if dead)
+#define REMOVE_INSN(WHICH) \
+ for (const auto &loadAddImm : seq) { \
+ Instruction *WHICH = loadAddImm.WHICH; \
+ if (WHICH == NULL) continue; \
+ const UseSet &useSet = dag->getUse(WHICH, 0); \
+ bool isDead = true; \
+ for (auto use : useSet) { \
+ if (dead.contains(use->getInstruction()) == false) { \
+ isDead = false; \
+ break; \
+ } \
+ } \
+ if (isDead && !dead.contains(WHICH)) { \
+ dead.insert(WHICH); \
+ WHICH->remove(); \
+ } \
+ }
+
+ void FunctionArgumentLowerer::buildConstantPush(void)
+ {
+ if (seq.size() == 0)
+ return;
+
+ // Track instructions we remove to recursively kill them properly
+ set<const Instruction*> dead;
+
+ // The argument location we already pushed (since the same argument location
+ // can be used several times)
+ set<PushLocation> inserted;
+ for (const auto &loadAddImm : seq) {
+ LoadInstruction *load = cast<LoadInstruction>(loadAddImm.load);
+ const uint32_t valueNum = load->getValueNum();
+ bool replaced = false;
+ Instruction *ins_after = load; // the instruction to insert after.
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+ const Type type = load->getValueType();
+ const RegisterFamily family = getFamily(type);
+ const uint32_t size = getFamilySize(family);
+ const uint32_t offset = loadAddImm.offset + valueID * size;
+ const PushLocation argLocation(*fn, loadAddImm.argID, offset);
+ Register pushed;
+ const Register reg = load->getValue(valueID);
+ if (offset != 0) {
+ if(inserted.contains(argLocation)) {
+ pushed = argLocation.getRegister();
+ } else {
+ // pushed register should be uniform register.
+ pushed = fn->newRegister(family, true);
+ this->appendPushedConstant(pushed, argLocation);
+ inserted.insert(argLocation);
+ }
+ } else {
+ pushed = fn->getArg(loadAddImm.argID).reg;
+ }
+
+ // TODO the MOV instruction can be most of the time avoided if the
+ // register is never written. We must however support the register
+ // replacement in the instruction interface to be able to patch all the
+ // instruction that uses "reg"
+ Instruction mov = ir::MOV(type, reg, pushed);
+ mov.insert(ins_after, &ins_after);
+ replaced = true;
+ }
+
+ if (replaced)
+ dead.insert(load);
+ }
+
+ REMOVE_INSN(load)
+ REMOVE_INSN(add)
+ REMOVE_INSN(loadImm)
+ }
+
+#undef REMOVE_INSN
+
+ bool FunctionArgumentLowerer::useStore(const ValueDef &def, set<const Instruction*> &visited)
+ {
+ const UseSet &useSet = dag->getUse(def);
+ for (const auto &use : useSet) {
+ const Instruction *insn = use->getInstruction();
+ const uint32_t srcID = use->getSrcID();
+ const Opcode opcode = insn->getOpcode();
+ if (visited.contains(insn)) continue;
+ visited.insert(insn);
+ if (opcode == OP_STORE && srcID == StoreInstruction::addressIndex)
+ return true;
+ if (insn->isMemberOf<UnaryInstruction>() == false &&
+ insn->isMemberOf<BinaryInstruction>() == false)
+ continue;
+ else {
+ const uint32_t dstNum = insn->getDstNum();
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID)
+ if (this->useStore(ValueDef(insn, dstID), visited) == true)
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool FunctionArgumentLowerer::matchLoadAddImm(uint32_t argID)
+ {
+ const FunctionArgument &arg = fn->getArg(argID);
+ LoadAddImmSeq tmpSeq;
+
+ // Inspect all uses of the function argument pointer
+ const UseSet &useSet = dag->getUse(&arg);
+ for (auto use : useSet) {
+ Instruction *insn = const_cast<Instruction*>(use->getInstruction());
+ const Opcode opcode = insn->getOpcode();
+
+ // load dst arg
+ LoadAddImm loadAddImm;
+ if (matchLoad(insn, NULL, NULL, 0, argID, loadAddImm)) {
+ tmpSeq.push_back(loadAddImm);
+ continue;
+ }
+
+ // add.ptr_type dst ptr other
+ if (opcode != OP_ADD) return false;
+ BinaryInstruction *add = cast<BinaryInstruction>(insn);
+ const Type addType = add->getType();
+ const RegisterFamily family = getFamily(addType);
+ if (family != unit.getPointerFamily()) return false;
+ if (addType == TYPE_FLOAT) return false;
+
+ // step 1 -> check that the other source comes from a load immediate
+ const uint32_t srcID = use->getSrcID();
+ const uint32_t otherID = srcID ^ 1;
+ const DefSet &defSet = dag->getDef(insn, otherID);
+ const uint32_t defNum = defSet.size();
+ if (defNum == 0 || defNum > 1) continue; // undefined or more than one def
+ const ValueDef *otherDef = *defSet.begin();
+ if (otherDef->getType() != ValueDef::DEF_INSN_DST) return false;
+ Instruction *otherInsn = const_cast<Instruction*>(otherDef->getInstruction());
+ if (otherInsn->getOpcode() != OP_LOADI) return false;
+ LoadImmInstruction *loadImm = cast<LoadImmInstruction>(otherInsn);
+ const Immediate imm = loadImm->getImmediate();
+ const uint64_t offset = getOffsetFromImm(imm);
+
+ // step 2 -> check that the results of the add are loads from private
+ // memory
+ const UseSet &addUseSet = dag->getUse(add, 0);
+ for (auto addUse : addUseSet) {
+ Instruction *insn = const_cast<Instruction*>(addUse->getInstruction());
+
+ // We finally find something like load dst arg+imm
+ LoadAddImm loadAddImm;
+ if (matchLoad(insn, add, loadImm, offset, argID, loadAddImm)) {
+ tmpSeq.push_back(loadAddImm);
+ continue;
+ }
+ }
+ }
+
+ // OK, the argument only need direct loads. We can now append all the
+ // direct load definitions we found
+ for (const auto &loadImmSeq : tmpSeq)
+ seq.push_back(loadImmSeq);
+ return true;
+ }
+
+ ArgUse FunctionArgumentLowerer::getArgUse(uint32_t argID)
+ {
+ FunctionArgument &arg = fn->getArg(argID);
+
+ // case 1 - we may store something to the structure argument
+ set<const Instruction*> visited;
+ if (this->useStore(ValueDef(&arg), visited))
+ return ARG_WRITTEN;
+
+ // case 2 - we look for the patterns: LOAD(ptr) or LOAD(ptr+imm)
+ if (this->matchLoadAddImm(argID))
+ return ARG_DIRECT_READ;
+
+ // case 3 - LOAD(ptr+runtime_value)
+ return ARG_INDIRECT_READ;
+ }
+
+ void FunctionArgumentLowerer::lower(uint32_t argID) {
+ IF_DEBUG(const ArgUse argUse = )this->getArgUse(argID);
+#if GBE_DEBUG
+ GBE_ASSERTM(argUse != ARG_WRITTEN,
+ "TODO A store to a structure argument "
+ "(i.e. not a char/short/int/float argument) has been found. "
+ "This is not supported yet");
+ GBE_ASSERTM(argUse != ARG_INDIRECT_READ,
+ "TODO Only direct loads of structure arguments are "
+ "supported now");
+#endif /* GBE_DEBUG */
+ }
+
+ void lowerFunctionArguments(Unit &unit, const std::string &functionName) {
+ FunctionArgumentLowerer lowerer(unit);
+ lowerer.lower(functionName);
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/lowering.hpp b/backend/src/ir/lowering.hpp
new file mode 100644
index 0000000..ba0c87b
--- /dev/null
+++ b/backend/src/ir/lowering.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file lowering.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ * Lower instructions that are not supported properly. Typical example is
+ * handling returns or unsupported vector scatters / gathers
+ */
+
+#ifndef __GBE_IR_LOWERING_HPP__
+#define __GBE_IR_LOWERING_HPP__
+
+namespace gbe {
+namespace ir {
+
+ // Structure to update
+ class Unit;
+
+ /*! Remove all return instructions and replace them to forward branches that
+ * point to the only return instruction in a dedicated basic block and the end
+ * of the function.
+ * Typically this code:
+ *
+ * dst[x] = 1;
+ * if (x > 4) return;
+ * dst[x] = 3;
+ *
+ * will be replaced by:
+ *
+ * dst[x] = 1;
+ * if (x > 4) goto end;
+ * dst[x] = 3;
+ * end:
+ * return;
+ *
+ * There will be only one return at the end of the function. This return will
+ * be simply encoded as a End-of-thread instruction (EOT)
+ */
+ void lowerReturn(Unit &unit, const std::string &functionName);
+
+ /*! Function arguments are a bit tricky since we must implement the proper C
+ * semantic: we can therefore address the function arguments as we want and
+ * we can even modify them. This leads to interesting challenges. We identify
+ * several cases:
+ *
+ * case 1:
+ * int f (__global int *dst, int x[16], int y) {
+ * dst[get_global_id(0)] = x[16] + y;
+ * }
+ * Here x and y will be pushed to registers using the Curbe. No problem, we
+ * can directly used the pushed registers
+ *
+ * case 2:
+ * int f (__global int *dst, int x[16], int y) {
+ * dst[get_global_id(0)] = x[get_local_id(0)] + y;
+ * }
+ * Here x is indirectly accessed. We need to perform a gather from memory. We
+ * can simply gather it from the curbe in memory
+ *
+ * case 3:
+ * int f (__global int *dst, int x[16], int y) {
+ * x[get_local_id(0)] = y + 1;
+ * int *ptr = get_local_id(0) % 2 ? x[0] : x[1];
+ * dst[get_global_id(0)] = *ptr;
+ * }
+ * Here we modify the function argument since it is valid C. Problem is that
+ * we are running in SIMD mode while the data are scalar (in both memory and
+ * registers). In that case, we just spill everything to memory (using the
+ * stack) and reload it from here when needed.
+ */
+ void lowerFunctionArguments(Unit &unit, const std::string &functionName);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_LOWERING_HPP__ */
+
diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
new file mode 100644
index 0000000..9d60402
--- /dev/null
+++ b/backend/src/ir/printf.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file printf.cpp
+ *
+ */
+
+#include <stdarg.h>
+#include "printf.hpp"
+
+namespace gbe
+{
+ namespace ir
+ {
+
+ pthread_mutex_t PrintfSet::lock = PTHREAD_MUTEX_INITIALIZER;
+
+ uint32_t PrintfSet::append(PrintfFmt* fmt, Unit& unit)
+ {
+ fmts.push_back(*fmt);
+
+ for (auto &f : fmts.back()) {
+ if (f.type == PRINTF_SLOT_TYPE_STRING)
+ continue;
+
+ slots.push_back(&f);
+ }
+
+ /* Update the total size of size. */
+ if (slots.size() > 0)
+ sizeOfSize = slots.back()->state->out_buf_sizeof_offset
+ + getPrintfBufferElementSize(slots.size() - 1);
+
+ return (uint32_t)fmts.size();
+ }
+
+ static void generatePrintfFmtString(PrintfState& state, std::string& str)
+ {
+ char num_str[16];
+ str += "%";
+
+ if (state.left_justified) {
+ str += "-";
+ }
+
+ if (state.sign_symbol == 1) {
+ str += "+";
+ } else if (state.sign_symbol == 2) {
+ str += " ";
+ }
+
+ if (state.alter_form) {
+ str += "#";
+ }
+
+ if (state.zero_padding) {
+ str += "0";
+ }
+
+ if (state.min_width >= 0) {
+ snprintf(num_str, 16, "%d", state.min_width);
+ str += num_str;
+ }
+
+ if (state.precision >= 0) {
+ str += ".";
+ snprintf(num_str, 16, "%d", state.precision);
+ str += num_str;
+ }
+
+ switch (state.length_modifier) {
+ case PRINTF_LM_HH:
+ str += "hh";
+ break;
+ case PRINTF_LM_H:
+ str += "h";
+ break;
+ case PRINTF_LM_L:
+ str += "l";
+ break;
+ case PRINTF_LM_HL:
+ str += "";
+ break;
+ default:
+ assert(state.length_modifier == PRINTF_LM_NONE);
+ }
+ }
+
+#define PRINT_SOMETHING(target_ty, conv) do { \
+ if (!vec_i) \
+ pf_str = pf_str + std::string(#conv); \
+ printf(pf_str.c_str(), \
+ ((target_ty *)((char *)buf_addr + slot.state->out_buf_sizeof_offset * \
+ global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) \
+ [(k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i) * vec_num + vec_i]);\
+ } while (0)
+
+
+ void PrintfSet::outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
+ size_t global_wk_sz1, size_t global_wk_sz2)
+ {
+ LockOutput lock;
+ size_t i, j, k;
+ std::string pf_str;
+ int stmt = 0;
+
+ for (auto &pf : fmts) {
+ for (i = 0; i < global_wk_sz0; i++) {
+ for (j = 0; j < global_wk_sz1; j++) {
+ for (k = 0; k < global_wk_sz2; k++) {
+
+ int flag = ((int *)index_addr)[stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2
+ + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i];
+ if (flag) {
+ for (auto &slot : pf) {
+ pf_str = "";
+ int vec_num;
+
+ if (slot.type == PRINTF_SLOT_TYPE_STRING) {
+ printf("%s", slot.str);
+ continue;
+ }
+ assert(slot.type == PRINTF_SLOT_TYPE_STATE);
+
+ generatePrintfFmtString(*slot.state, pf_str);
+
+ vec_num = slot.state->vector_n > 0 ? slot.state->vector_n : 1;
+
+ for (int vec_i = 0; vec_i < vec_num; vec_i++) {
+ if (vec_i)
+ printf(",");
+
+ switch (slot.state->conversion_specifier) {
+ case PRINTF_CONVERSION_D:
+ case PRINTF_CONVERSION_I:
+ PRINT_SOMETHING(int, d);
+ break;
+
+ case PRINTF_CONVERSION_O:
+ PRINT_SOMETHING(int, o);
+ break;
+ case PRINTF_CONVERSION_U:
+ PRINT_SOMETHING(int, u);
+ break;
+ case PRINTF_CONVERSION_X:
+ PRINT_SOMETHING(int, X);
+ break;
+ case PRINTF_CONVERSION_x:
+ PRINT_SOMETHING(int, x);
+ break;
+
+ case PRINTF_CONVERSION_C:
+ PRINT_SOMETHING(char, c);
+ break;
+
+ case PRINTF_CONVERSION_F:
+ PRINT_SOMETHING(float, F);
+ break;
+ case PRINTF_CONVERSION_f:
+ PRINT_SOMETHING(float, f);
+ break;
+ case PRINTF_CONVERSION_E:
+ PRINT_SOMETHING(float, E);
+ break;
+ case PRINTF_CONVERSION_e:
+ PRINT_SOMETHING(float, e);
+ break;
+ case PRINTF_CONVERSION_G:
+ PRINT_SOMETHING(float, G);
+ break;
+ case PRINTF_CONVERSION_g:
+ PRINT_SOMETHING(float, g);
+ break;
+ case PRINTF_CONVERSION_A:
+ PRINT_SOMETHING(float, A);
+ break;
+ case PRINTF_CONVERSION_a:
+ PRINT_SOMETHING(float, a);
+ break;
+ case PRINTF_CONVERSION_P:
+ PRINT_SOMETHING(int, p);
+ break;
+
+ case PRINTF_CONVERSION_S:
+ pf_str = pf_str + "s";
+ printf(pf_str.c_str(), slot.state->str.c_str());
+ break;
+
+ default:
+ assert(0);
+ return;
+ }
+ }
+
+ pf_str = "";
+ }
+ }
+ }
+ }
+ }
+ stmt++;
+ }
+ }
+ } /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp
new file mode 100644
index 0000000..4db7245
--- /dev/null
+++ b/backend/src/ir/printf.hpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file printf.hpp
+ *
+ */
+#ifndef __GBE_IR_PRINTF_HPP__
+#define __GBE_IR_PRINTF_HPP__
+
+#include <string.h>
+#include "sys/map.hpp"
+#include "sys/vector.hpp"
+#include "unit.hpp"
+
+namespace gbe
+{
+ namespace ir
+ {
+
+ /* Things about printf info. */
+ enum {
+ PRINTF_LM_NONE,
+ PRINTF_LM_HH,
+ PRINTF_LM_H,
+ PRINTF_LM_L,
+ PRINTF_LM_HL,
+ };
+
+ enum {
+ PRINTF_CONVERSION_INVALID,
+ PRINTF_CONVERSION_D,
+ PRINTF_CONVERSION_I,
+ PRINTF_CONVERSION_O,
+ PRINTF_CONVERSION_U,
+ PRINTF_CONVERSION_X,
+ PRINTF_CONVERSION_x,
+ PRINTF_CONVERSION_F,
+ PRINTF_CONVERSION_f,
+ PRINTF_CONVERSION_E,
+ PRINTF_CONVERSION_e,
+ PRINTF_CONVERSION_G,
+ PRINTF_CONVERSION_g,
+ PRINTF_CONVERSION_A,
+ PRINTF_CONVERSION_a,
+ PRINTF_CONVERSION_C,
+ PRINTF_CONVERSION_S,
+ PRINTF_CONVERSION_P
+ };
+
+ struct PrintfState {
+ char left_justified;
+ char sign_symbol; //0 for nothing, 1 for sign, 2 for space.
+ char alter_form;
+ char zero_padding;
+ char vector_n;
+ int min_width;
+ int precision;
+ int length_modifier;
+ char conversion_specifier;
+ int out_buf_sizeof_offset; // Should *global_total_size to get the full offset.
+ std::string str; //if %s, the string store here.
+ };
+
+ enum {
+ PRINTF_SLOT_TYPE_NONE,
+ PRINTF_SLOT_TYPE_STRING,
+ PRINTF_SLOT_TYPE_STATE
+ };
+
+ struct PrintfSlot {
+ int type;
+ union {
+ char* str;
+ PrintfState* state;
+ void *ptr;
+ };
+
+ PrintfSlot(void) {
+ type = PRINTF_SLOT_TYPE_NONE;
+ ptr = NULL;
+ }
+
+ PrintfSlot(const char * s) {
+ type = PRINTF_SLOT_TYPE_STRING;
+ int len = strlen(s);
+ str = (char*)malloc((len + 1) * sizeof(char));
+ memcpy(str, s, (len + 1) * sizeof(char));
+ str[len] = 0;
+ }
+
+ PrintfSlot(PrintfState * st) {
+ type = PRINTF_SLOT_TYPE_STATE;
+ state = (PrintfState *)malloc(sizeof(PrintfState));
+ memcpy(state, st, sizeof(PrintfState));
+ }
+
+ PrintfSlot(const PrintfSlot & other) {
+ if (other.type == PRINTF_SLOT_TYPE_STRING) {
+ int len = strlen(other.str);
+ str = (char*)malloc((len + 1) * sizeof(char));
+ memcpy(str, other.str, (len + 1) * sizeof(char));
+ str[len] = 0;
+ type = PRINTF_SLOT_TYPE_STRING;
+ } else if (other.type == PRINTF_SLOT_TYPE_STATE) {
+ type = PRINTF_SLOT_TYPE_STATE;
+ state = (PrintfState *)malloc(sizeof(PrintfState));
+ memcpy(state, other.state, sizeof(PrintfState));
+ } else {
+ type = PRINTF_SLOT_TYPE_NONE;
+ ptr = NULL;
+ }
+ }
+
+ PrintfSlot(PrintfSlot && other) {
+ void *p = other.ptr;
+ type = other.type;
+ other.ptr = ptr;
+ ptr = p;
+ }
+
+ ~PrintfSlot(void) {
+ if (ptr)
+ free(ptr);
+ }
+ };
+
+ class Context;
+
+ class PrintfSet //: public Serializable
+ {
+ public:
+ PrintfSet(const PrintfSet& other) {
+ for (auto &f : other.fmts) {
+ fmts.push_back(f);
+ }
+
+ for (auto &s : other.slots) {
+ slots.push_back(s);
+ }
+
+ sizeOfSize = other.sizeOfSize;
+ btiBuf = other.btiBuf;
+ btiIndexBuf = other.btiIndexBuf;
+ }
+
+ PrintfSet(void) = default;
+
+ struct LockOutput {
+ LockOutput(void) {
+ pthread_mutex_lock(&lock);
+ }
+
+ ~LockOutput(void) {
+ pthread_mutex_unlock(&lock);
+ }
+ };
+
+ typedef vector<PrintfSlot> PrintfFmt;
+ uint32_t append(PrintfFmt* fmt, Unit &unit);
+
+ uint32_t getPrintfNum(void) const {
+ return fmts.size();
+ }
+
+ uint32_t getPrintfSizeOfSize(void) const {
+ return sizeOfSize;
+ }
+
+ void setBufBTI(uint8_t b) { btiBuf = b; }
+ void setIndexBufBTI(uint8_t b) { btiIndexBuf = b; }
+ uint8_t getBufBTI() const { return btiBuf; }
+ uint8_t getIndexBufBTI() const { return btiIndexBuf; }
+
+ uint32_t getPrintfBufferElementSize(uint32_t i) {
+ PrintfSlot* slot = slots[i];
+ int vec_num = 1;
+ if (slot->state->vector_n > 0) {
+ vec_num = slot->state->vector_n;
+ }
+
+ assert(vec_num > 0 && vec_num <= 16);
+
+ switch (slot->state->conversion_specifier) {
+ case PRINTF_CONVERSION_I:
+ case PRINTF_CONVERSION_D:
+ case PRINTF_CONVERSION_O:
+ case PRINTF_CONVERSION_U:
+ case PRINTF_CONVERSION_X:
+ case PRINTF_CONVERSION_x:
+ case PRINTF_CONVERSION_P:
+ /* Char will be aligned to sizeof(int) here. */
+ case PRINTF_CONVERSION_C:
+ return (uint32_t)(sizeof(int) * vec_num);
+ case PRINTF_CONVERSION_E:
+ case PRINTF_CONVERSION_e:
+ case PRINTF_CONVERSION_F:
+ case PRINTF_CONVERSION_f:
+ case PRINTF_CONVERSION_G:
+ case PRINTF_CONVERSION_g:
+ case PRINTF_CONVERSION_A:
+ case PRINTF_CONVERSION_a:
+ return (uint32_t)(sizeof(float) * vec_num);
+ case PRINTF_CONVERSION_S:
+ return (uint32_t)0;
+ default:
+ break;
+ }
+ assert(0);
+ return 0;
+ }
+
+ void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
+ size_t global_wk_sz1, size_t global_wk_sz2);
+
+ private:
+ vector<PrintfFmt> fmts;
+ vector<PrintfSlot*> slots;
+ uint32_t sizeOfSize; // Total sizeof size.
+ friend struct LockOutput;
+ uint8_t btiBuf;
+ uint8_t btiIndexBuf;
+ static pthread_mutex_t lock;
+ GBE_CLASS(PrintfSet);
+ };
+ } /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_PRINTF_HPP__ */
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
new file mode 100644
index 0000000..fc69367
--- /dev/null
+++ b/backend/src/ir/profile.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file profile.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/profile.hpp"
+#include "ir/function.hpp"
+#include "sys/platform.hpp"
+
+namespace gbe {
+namespace ir {
+
+ namespace ocl
+ {
+ const char *specialRegMean[] = {
+ "local_id_0", "local_id_1", "local_id_2",
+ "group_id_0", "group_id_1", "group_id_2",
+ "num_groups_0", "num_groups_1", "num_groups_2",
+ "local_size_0", "local_size_1", "local_size_2",
+ "global_size_0", "global_size_1", "global_size_2",
+ "global_offset_0", "global_offset_1", "global_offset_2",
+ "stack_pointer", "stack_buffer",
+ "block_ip",
+ "barrier_id", "thread_number", "work_dimension",
+ "zero", "one",
+ "retVal", "slm_offset",
+ "printf_buffer_pointer", "printf_index_buffer_pointer",
+ "invalid"
+ };
+
+#if GBE_DEBUG
+#define DECL_NEW_REG(FAMILY, REG, UNIFORM) \
+ r = fn.newRegister(FAMILY_DWORD, UNIFORM); \
+ GBE_ASSERT(r == REG);
+#else
+#define DECL_NEW_REG(FAMILY, REG, UNIFORM) \
+ fn.newRegister(FAMILY_DWORD, UNIFORM);
+#endif /* GBE_DEBUG */
+ static void init(Function &fn) {
+ IF_DEBUG(Register r);
+ DECL_NEW_REG(FAMILY_DWORD, lid0, 0);
+ DECL_NEW_REG(FAMILY_DWORD, lid1, 0);
+ DECL_NEW_REG(FAMILY_DWORD, lid2, 0);
+ DECL_NEW_REG(FAMILY_DWORD, groupid0, 1);
+ DECL_NEW_REG(FAMILY_DWORD, groupid1, 1);
+ DECL_NEW_REG(FAMILY_DWORD, groupid2, 1);
+ DECL_NEW_REG(FAMILY_DWORD, numgroup0, 1);
+ DECL_NEW_REG(FAMILY_DWORD, numgroup1, 1);
+ DECL_NEW_REG(FAMILY_DWORD, numgroup2, 1);
+ DECL_NEW_REG(FAMILY_DWORD, lsize0, 1);
+ DECL_NEW_REG(FAMILY_DWORD, lsize1, 1);
+ DECL_NEW_REG(FAMILY_DWORD, lsize2, 1);
+ DECL_NEW_REG(FAMILY_DWORD, gsize0, 1);
+ DECL_NEW_REG(FAMILY_DWORD, gsize1, 1);
+ DECL_NEW_REG(FAMILY_DWORD, gsize2, 1);
+ DECL_NEW_REG(FAMILY_DWORD, goffset0, 1);
+ DECL_NEW_REG(FAMILY_DWORD, goffset1, 1);
+ DECL_NEW_REG(FAMILY_DWORD, goffset2, 1);
+ DECL_NEW_REG(FAMILY_DWORD, stackptr, 0);
+ DECL_NEW_REG(FAMILY_DWORD, stackbuffer, 1);
+ DECL_NEW_REG(FAMILY_WORD, blockip, 0);
+ DECL_NEW_REG(FAMILY_DWORD, barrierid, 1);
+ DECL_NEW_REG(FAMILY_DWORD, threadn, 1);
+ DECL_NEW_REG(FAMILY_DWORD, workdim, 1);
+ DECL_NEW_REG(FAMILY_DWORD, zero, 1);
+ DECL_NEW_REG(FAMILY_DWORD, one, 1);
+ DECL_NEW_REG(FAMILY_WORD, retVal, 1);
+ DECL_NEW_REG(FAMILY_WORD, slmoffset, 1);
+ DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
+ DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
+ DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
+ }
+#undef DECL_NEW_REG
+
+ } /* namespace ocl */
+
+ void initProfile(Function &fn) {
+ const Profile profile = fn.getProfile();
+ switch (profile) {
+ case PROFILE_C: GBE_ASSERTM(false, "Unsupported profile"); break;
+ case PROFILE_OCL: ocl::init(fn);
+ };
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
+
+
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
new file mode 100644
index 0000000..4e89bdd
--- /dev/null
+++ b/backend/src/ir/profile.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file profile.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_PROFILE_HPP__
+#define __GBE_IR_PROFILE_HPP__
+
+#include "ir/register.hpp"
+
+namespace gbe {
+namespace ir {
+
+ /*! Profile is defined *per-function* and mostly predefined registers */
+ enum Profile : uint32_t {
+ PROFILE_C = 0, // Not used now
+ PROFILE_OCL = 1
+ };
+
+ // Will be pre-initialized based on its profile
+ class Function;
+
+ /*! Registers used for ocl */
+ namespace ocl
+ {
+ static const Register lid0 = Register(0); // get_local_id(0)
+ static const Register lid1 = Register(1); // get_local_id(1)
+ static const Register lid2 = Register(2); // get_local_id(2)
+ static const Register groupid0 = Register(3); // get_group_id(0)
+ static const Register groupid1 = Register(4); // get_group_id(1)
+ static const Register groupid2 = Register(5); // get_group_id(2)
+ static const Register numgroup0 = Register(6); // get_num_groups(0)
+ static const Register numgroup1 = Register(7); // get_num_groups(1)
+ static const Register numgroup2 = Register(8); // get_num_groups(2)
+ static const Register lsize0 = Register(9); // get_local_size(0)
+ static const Register lsize1 = Register(10); // get_local_size(1)
+ static const Register lsize2 = Register(11); // get_local_size(2)
+ static const Register gsize0 = Register(12); // get_global_size(0)
+ static const Register gsize1 = Register(13); // get_global_size(1)
+ static const Register gsize2 = Register(14); // get_global_size(2)
+ static const Register goffset0 = Register(15); // get_global_offset(0)
+ static const Register goffset1 = Register(16); // get_global_offset(1)
+ static const Register goffset2 = Register(17); // get_global_offset(2)
+ static const Register stackptr = Register(18); // stack pointer
+ static const Register stackbuffer = Register(19); // stack buffer base address.
+ static const Register blockip = Register(20); // blockip
+ static const Register barrierid = Register(21);// barrierid
+ static const Register threadn = Register(22); // number of threads
+ static const Register workdim = Register(23); // work dimention.
+ static const Register zero = Register(24); // scalar register holds zero.
+ static const Register one = Register(25); // scalar register holds one.
+ static const Register retVal = Register(26); // helper register to do data flow analysis.
+ static const Register slmoffset = Register(27); // Group's SLM offset in total 64K SLM
+ static const Register printfbptr = Register(28); // printf buffer address .
+ static const Register printfiptr = Register(29); // printf index buffer address.
+ static const Register invalid = Register(30); // used for valid comparation.
+ static const uint32_t regNum = 31; // number of special registers
+ extern const char *specialRegMean[]; // special register name.
+ } /* namespace ocl */
+
+ /*! Initialize the profile of the given function */
+ void initProfile(Function &fn);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_PROFILE_HPP__ */
+
diff --git a/backend/src/ir/register.cpp b/backend/src/ir/register.cpp
new file mode 100644
index 0000000..471bfbd
--- /dev/null
+++ b/backend/src/ir/register.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file register.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/profile.hpp"
+#include "ir/register.hpp"
+
+namespace gbe {
+namespace ir {
+
+ std::ostream &operator<< (std::ostream &out, const RegisterData ®Data)
+ {
+ switch (regData.family) {
+ case FAMILY_BOOL: return out << "bool";
+ case FAMILY_BYTE: return out << "byte";
+ case FAMILY_WORD: return out << "word";
+ case FAMILY_DWORD: return out << "dword";
+ case FAMILY_QWORD: return out << "qword";
+ };
+ return out;
+ }
+
+ std::ostream &operator<< (std::ostream &out, const RegisterFile &file)
+ {
+ out << "## " << file.regNum() << " register"
+ << (file.regNum() ? "s" : "") << " ##" << std::endl;
+ for (uint32_t i = 0; i < file.regNum(); ++i) {
+ const RegisterData reg = file.get(Register(i));
+ out << ".decl." << reg << " %" << i;
+ if (i < ocl::regNum)
+ out << " " << ocl::specialRegMean[i];
+ out << std::endl;
+ }
+ return out;
+ }
+
+ Tuple RegisterFile::appendArrayTuple(const Register *reg, uint32_t regNum) {
+ const Tuple index = Tuple(regTuples.size());
+ for (uint32_t regID = 0; regID < regNum; ++regID) {
+ GBE_ASSERTM(reg[regID] < this->regNum(), "Out-of-bound register");
+ regTuples.push_back(reg[regID]);
+ }
+ return index;
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
new file mode 100644
index 0000000..7bd4f6e
--- /dev/null
+++ b/backend/src/ir/register.hpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file register.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_REGISTER_HPP__
+#define __GBE_IR_REGISTER_HPP__
+
+#include "sys/vector.hpp"
+#include "sys/platform.hpp"
+
+namespace gbe {
+namespace ir {
+
+ /*! Defines the size of the pointers. All the functions from the unit will
+ * use the same pointer size as the unit they belong to
+ */
+ enum PointerSize {
+ POINTER_32_BITS = 32,
+ POINTER_64_BITS = 64
+ };
+
+ /*! Basically provides the size of the register */
+ enum RegisterFamily : uint8_t {
+ FAMILY_BOOL = 0,
+ FAMILY_BYTE = 1,
+ FAMILY_WORD = 2,
+ FAMILY_DWORD = 3,
+ FAMILY_QWORD = 4
+ };
+
+ INLINE char getFamilyName(RegisterFamily family) {
+ static char registerFamilyName[] = {'b', 'B', 'W', 'D', 'Q'};
+ return registerFamilyName[family];
+ }
+
+ INLINE uint32_t getFamilySize(RegisterFamily family) {
+ switch (family) {
+ case FAMILY_BYTE: return 1;
+ case FAMILY_WORD: return 2;
+ case FAMILY_DWORD: return 4;
+ case FAMILY_QWORD: return 8;
+ default: NOT_SUPPORTED;
+ };
+ return 0;
+ }
+
+ /*! A register can be either a byte, a word, a dword or a qword. We store this
+ * value into a register data (which makes the register file)
+ */
+ class RegisterData
+ {
+ public:
+ /*! Build a register. All fields will be immutable */
+ INLINE RegisterData(RegisterFamily family,
+ bool uniform = false) : family(family), uniform(uniform) {}
+ /*! Copy constructor */
+ INLINE RegisterData(const RegisterData &other) : family(other.family), uniform(other.uniform) {}
+ /*! Copy operator */
+ INLINE RegisterData &operator= (const RegisterData &other) {
+ this->family = other.family;
+ this->uniform = other.uniform;
+ return *this;
+ }
+ /*! Nothing really happens here */
+ INLINE ~RegisterData(void) {}
+ RegisterFamily family; //!< Register size or if it is a flag
+ INLINE const bool isUniform() const { return uniform; }
+ INLINE void setUniform(bool uni) { uniform = uni; }
+ private:
+ bool uniform;
+ GBE_CLASS(RegisterData);
+ };
+
+ /*! Output the register file string in the given stream */
+ std::ostream &operator<< (std::ostream &out, const RegisterData ®Data);
+
+ /*! Register is the position of the index of the register data in the register
+ * file. We enforce type safety with this class
+ */
+ TYPE_SAFE(Register, uint16_t)
+ INLINE bool operator< (const Register &r0, const Register &r1) {
+ return r0.value() < r1.value();
+ }
+
+ /*! Tuple is the position of the first register in the tuple vector. We
+ * enforce type safety with this class
+ */
+ TYPE_SAFE(Tuple, uint16_t)
+
+ /*! A register file allocates and destroys registers. Basically, we will have
+ * one register file per function
+ */
+ class RegisterFile
+ {
+ public:
+ /*! Return the index of a newly allocated register */
+ INLINE Register append(RegisterFamily family, bool uniform = false) {
+ GBE_ASSERTM(regNum() < MAX_INDEX,
+ "Too many defined registers (only 65535 are supported)");
+ const uint16_t index = regNum();
+ const RegisterData reg(family, uniform);
+ regs.push_back(reg);
+ return Register(index);
+ }
+ /*! Make a tuple from an array of register */
+ Tuple appendArrayTuple(const Register *reg, uint32_t regNum);
+ /*! Make a tuple and return the index to the first element of the tuple */
+ template <typename First, typename... Rest>
+ INLINE Tuple appendTuple(First first, Rest... rest) {
+ const Tuple index = Tuple(regTuples.size());
+ GBE_ASSERTM(first < regNum(), "Out-of-bound register");
+ regTuples.push_back(first);
+ appendTuple(rest...);
+ return index;
+ }
+ /*! To terminate variadic recursion */
+ INLINE void appendTuple(void) {}
+ /*! Return a copy of the register at index */
+ INLINE RegisterData get(Register index) const { return regs[index]; }
+ /*! Return true if the specified register is uniform type. */
+ INLINE bool isUniform(Register index) { return regs[index].isUniform(); }
+ /*! Set a register to uniform or varying data type*/
+ INLINE void setUniform(Register index, bool uniform) { regs[index].setUniform(uniform); }
+ /*! Get the register index from the tuple */
+ INLINE Register get(Tuple index, uint32_t which) const {
+ return regTuples[uint16_t(index) + which];
+ }
+ /*! Set the register index from the tuple */
+ INLINE void set(Tuple index, uint32_t which, Register reg) {
+ regTuples[uint16_t(index) + which] = reg;
+ }
+ /*! Number of registers in the register file */
+ INLINE uint32_t regNum(void) const { return regs.size(); }
+ /*! Number of tuples in the register file */
+ INLINE uint32_t tupleNum(void) const { return regTuples.size(); }
+ /*! register and tuple indices are short */
+ enum { MAX_INDEX = 0xffff };
+ private:
+ vector<RegisterData> regs; //!< All the registers together
+ vector<Register> regTuples; //!< Tuples are used for many src / dst
+ GBE_CLASS(RegisterFile);
+ };
+
+ /*! Output the register file string in the given stream */
+ std::ostream &operator<< (std::ostream &out, const RegisterFile &file);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_REGISTER_HPP__ */
+
diff --git a/backend/src/ir/sampler.cpp b/backend/src/ir/sampler.cpp
new file mode 100644
index 0000000..7e8355f
--- /dev/null
+++ b/backend/src/ir/sampler.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file sampler.cpp
+ *
+ */
+#include "sampler.hpp"
+#include "context.hpp"
+#include "ocl_common_defines.h"
+
+namespace gbe {
+namespace ir {
+
+#ifdef GBE_COMPILER_AVAILABLE
+ uint8_t SamplerSet::appendReg(uint32_t key, Context *ctx) {
+ uint8_t samplerSlot = samplerMap.size();
+ samplerMap.insert(std::make_pair(key, samplerSlot));
+ return samplerSlot;
+ }
+
+ uint8_t SamplerSet::append(uint32_t samplerValue, Context *ctx)
+ {
+ auto it = samplerMap.find(samplerValue);
+ if (it != samplerMap.end())
+ return it->second;
+ // This register is just used as a key.
+ return appendReg(samplerValue, ctx);
+ }
+
+#define SAMPLER_ID(id) ((id << __CLK_SAMPLER_ARG_BASE) | __CLK_SAMPLER_ARG_KEY_BIT)
+ uint8_t SamplerSet::append(Register samplerReg, Context *ctx)
+ {
+ ir::FunctionArgument *arg = ctx->getFunction().getArg(samplerReg);
+ GBE_ASSERT(arg != NULL);
+
+ // XXX As LLVM 3.2/3.1 doesn't have a new data type for the sampler_t, we have to fix up the argument
+ // type here. Once we switch to the LLVM and use the new data type sampler_t, we can remove this
+ // work around.
+ arg->type = ir::FunctionArgument::SAMPLER;
+ arg->info.typeName = "sampler_t";
+ int32_t id = ctx->getFunction().getArgID(arg);
+ GBE_ASSERT(id < (1 << __CLK_SAMPLER_ARG_BITS));
+
+ auto it = samplerMap.find(SAMPLER_ID(id));
+ if (it != samplerMap.end()) {
+ return it->second;
+ }
+ return appendReg(SAMPLER_ID(id), ctx);
+ }
+#endif
+
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+ /*! Implements the serialization. */
+ size_t SamplerSet::serializeToBin(std::ostream& outs) {
+ size_t ret_size = 0;
+
+ OUT_UPDATE_SZ(magic_begin);
+
+ OUT_UPDATE_SZ(samplerMap.size());
+ for (auto iter : samplerMap) {
+ OUT_UPDATE_SZ(iter.first);
+ OUT_UPDATE_SZ(iter.second);
+ }
+
+ OUT_UPDATE_SZ(magic_end);
+ OUT_UPDATE_SZ(ret_size);
+
+ return ret_size;
+ }
+
+ size_t SamplerSet::deserializeFromBin(std::istream& ins) {
+ size_t total_size = 0;
+ uint32_t magic;
+ size_t sampler_map_sz = 0;
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_begin)
+ return 0;
+
+ IN_UPDATE_SZ(sampler_map_sz);
+ for (size_t i = 0; i < sampler_map_sz; i++) {
+ uint32_t key;
+ uint32_t slot;
+
+ IN_UPDATE_SZ(key);
+ IN_UPDATE_SZ(slot);
+ samplerMap.insert(std::make_pair(key, slot));
+ }
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_end)
+ return 0;
+
+ size_t total_bytes;
+ IN_UPDATE_SZ(total_bytes);
+ if (total_bytes + sizeof(total_size) != total_size)
+ return 0;
+
+ return total_size;
+ }
+
+ void SamplerSet::printStatus(int indent, std::ostream& outs) {
+ using namespace std;
+ string spaces = indent_to_str(indent);
+ string spaces_nl = indent_to_str(indent + 4);
+
+ outs << spaces << "------------ Begin SamplerSet ------------" << "\n";
+
+ outs << spaces_nl << " SamplerSet Map: [index, sampler_reg, sampler_slot]\n";
+ outs << spaces_nl << " samplerMap size: " << samplerMap.size() << "\n";
+
+ for (auto iter : samplerMap) {
+ outs << spaces_nl << " [" << iter.first << ", "
+ << iter.second << "]\n";
+ }
+
+ outs << spaces << "------------- End SamplerSet -------------" << "\n";
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/sampler.hpp b/backend/src/ir/sampler.hpp
new file mode 100644
index 0000000..2b51ce3
--- /dev/null
+++ b/backend/src/ir/sampler.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file sampler.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_SAMPLER_HPP__
+#define __GBE_IR_SAMPLER_HPP__
+
+#include "ir/register.hpp"
+#include "sys/map.hpp"
+
+namespace gbe {
+namespace ir {
+
+ /*! A sampler set is a set of global samplers which are defined as constant global
+ * sampler or defined in the outermost kernel scope variables. According to the spec
+ * all the variable should have a initialized integer value and can't be modified.
+ */
+ class Context;
+
+ class SamplerSet : public Serializable
+ {
+ public:
+ /*! Append the specified sampler and return the allocated offset.
+ * If the speficied sampler is exist, only return the previous offset and
+ * don't append it again. Return -1, if failed.*/
+ uint8_t append(uint32_t clkSamplerValue, Context *ctx);
+ /*! Append a sampler defined in kernel args. */
+ uint8_t append(Register samplerArg, Context *ctx);
+ size_t getDataSize(void) { return samplerMap.size(); }
+ size_t getDataSize(void) const { return samplerMap.size(); }
+ void getData(uint32_t *samplers) const {
+ for(auto &it : samplerMap)
+ samplers[it.second] = it.first;
+ }
+
+ void operator = (const SamplerSet& other) {
+ samplerMap.insert(other.samplerMap.begin(), other.samplerMap.end());
+ }
+
+ bool empty() const { return samplerMap.empty(); }
+
+ SamplerSet(const SamplerSet& other) : samplerMap(other.samplerMap.begin(), other.samplerMap.end()) { }
+ SamplerSet() {}
+
+ static const uint32_t magic_begin = TO_MAGIC('S', 'A', 'M', 'P');
+ static const uint32_t magic_end = TO_MAGIC('P', 'M', 'A', 'S');
+
+ /* format:
+ magic_begin |
+ samplerMap_size |
+ element_1 |
+ ........ |
+ element_n |
+ regMap_size |
+ element_1 |
+ ........ |
+ element_n |
+ magic_end |
+ total_size
+ */
+
+ /*! Implements the serialization. */
+ virtual size_t serializeToBin(std::ostream& outs);
+ virtual size_t deserializeFromBin(std::istream& ins);
+ virtual void printStatus(int indent, std::ostream& outs);
+
+ private:
+ uint8_t appendReg(uint32_t key, Context *ctx);
+ map<uint32_t, uint32_t> samplerMap;
+ GBE_CLASS(SamplerSet);
+ };
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_SAMPLER_HPP__ */
diff --git a/backend/src/ir/type.cpp b/backend/src/ir/type.cpp
new file mode 100644
index 0000000..56f5c12
--- /dev/null
+++ b/backend/src/ir/type.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file instruction.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "ir/type.hpp"
+
+namespace gbe {
+namespace ir {
+ std::ostream &operator<< (std::ostream &out, const Type &type) {
+ switch (type) {
+ case TYPE_BOOL: return out << "bool";
+ case TYPE_S8: return out << "int8";
+ case TYPE_U8: return out << "uint8";
+ case TYPE_S16: return out << "int16";
+ case TYPE_U16: return out << "uin16";
+ case TYPE_S32: return out << "int32";
+ case TYPE_U32: return out << "uin32";
+ case TYPE_S64: return out << "int64";
+ case TYPE_U64: return out << "uin64";
+ case TYPE_HALF: return out << "half";
+ case TYPE_FLOAT: return out << "float";
+ case TYPE_DOUBLE: return out << "double";
+ default :
+ GBE_ASSERT(0 && "Unsupported type\n");
+ };
+ return out;
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/type.hpp b/backend/src/ir/type.hpp
new file mode 100644
index 0000000..8bfbdc8
--- /dev/null
+++ b/backend/src/ir/type.hpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file type.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_TYPE_HPP__
+#define __GBE_IR_TYPE_HPP__
+
+#include "sys/platform.hpp"
+#include "ir/register.hpp"
+
+#include <ostream>
+
+namespace gbe {
+namespace ir {
+
+ /*! All types possibly supported by the instruction */
+ enum Type : uint8_t {
+ TYPE_BOOL = 0, //!< boolean value
+ TYPE_S8, //!< signed 8 bits integer
+ TYPE_U8, //!< unsigned 8 bits integer
+ TYPE_S16, //!< signed 16 bits integer
+ TYPE_U16, //!< unsigned 16 bits integer
+ TYPE_S32, //!< signed 32 bits integer
+ TYPE_U32, //!< unsigned 32 bits integer
+ TYPE_S64, //!< signed 64 bits integer
+ TYPE_U64, //!< unsigned 64 bits integer
+ TYPE_HALF, //!< 16 bits floating point value
+ TYPE_FLOAT, //!< 32 bits floating point value
+ TYPE_DOUBLE, //!< 64 bits floating point value
+ TYPE_LARGE_INT //!< integer larger than 64 bits.
+ };
+
+ /*! Output a string for the type in the given stream */
+ std::ostream &operator<< (std::ostream &out, const Type &type);
+
+ /*! Get the register family for each type */
+ INLINE RegisterFamily getFamily(Type type) {
+ switch (type) {
+ case TYPE_BOOL:
+ return FAMILY_BOOL;
+ case TYPE_S8:
+ case TYPE_U8:
+ return FAMILY_BYTE;
+ case TYPE_S16:
+ case TYPE_U16:
+ case TYPE_HALF:
+ return FAMILY_WORD;
+ case TYPE_S32:
+ case TYPE_U32:
+ case TYPE_FLOAT:
+ return FAMILY_DWORD;
+ case TYPE_S64:
+ case TYPE_U64:
+ case TYPE_DOUBLE:
+ return FAMILY_QWORD;
+ default:
+ return FAMILY_DWORD;
+ };
+ }
+
+ /*! Return a type for each register family */
+ INLINE Type getType(RegisterFamily family) {
+ switch (family) {
+ case FAMILY_BOOL: return TYPE_BOOL;
+ case FAMILY_BYTE: return TYPE_U8;
+ case FAMILY_WORD: return TYPE_U16;
+ case FAMILY_DWORD: return TYPE_U32;
+ case FAMILY_QWORD: return TYPE_U64;
+ };
+ return TYPE_U32;
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_TYPE_HPP__ */
+
diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp
new file mode 100644
index 0000000..4f9d740
--- /dev/null
+++ b/backend/src/ir/unit.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file unit.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/unit.hpp"
+#include "ir/function.hpp"
+
+namespace gbe {
+namespace ir {
+
+ Unit::Unit(PointerSize pointerSize) : pointerSize(pointerSize), valid(true) {}
+ Unit::~Unit(void) {
+ for (const auto &pair : functions) GBE_DELETE(pair.second);
+ }
+ Function *Unit::getFunction(const std::string &name) const {
+ auto it = functions.find(name);
+ if (it == functions.end())
+ return NULL;
+ return it->second;
+ }
+ Function *Unit::newFunction(const std::string &name) {
+ auto it = functions.find(name);
+ if (it != functions.end())
+ return NULL;
+ Function *fn = GBE_NEW(Function, name, *this);
+ functions[name] = fn;
+ return fn;
+ }
+ void Unit::newConstant(const char *data,
+ const std::string &name,
+ uint32_t size,
+ uint32_t alignment)
+ {
+ constantSet.append(data, name, size, alignment);
+ }
+
+ std::ostream &operator<< (std::ostream &out, const Unit &unit) {
+ unit.apply([&out] (const Function &fn) { out << fn << std::endl; });
+ return out;
+ }
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
new file mode 100644
index 0000000..adebd3f
--- /dev/null
+++ b/backend/src/ir/unit.hpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file unit.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_UNIT_HPP__
+#define __GBE_IR_UNIT_HPP__
+
+#include "ir/constant.hpp"
+#include "ir/register.hpp"
+#include "sys/hash_map.hpp"
+#include "sys/map.hpp"
+
+namespace gbe {
+namespace ir {
+
+ // A unit contains a set of functions
+ class Function;
+
+ /*! Complete unit of compilation. It contains a set of functions and a set of
+ * constant the functions may refer to.
+ */
+ class Unit : public NonCopyable
+ {
+ public:
+ typedef hash_map<std::string, Function*> FunctionSet;
+ /*! Create an empty unit */
+ Unit(PointerSize pointerSize = POINTER_32_BITS);
+ /*! Release everything (*including* the function pointers) */
+ ~Unit(void);
+ /*! Get the set of functions defined in the unit */
+ const FunctionSet &getFunctionSet(void) const { return functions; }
+ /*! Retrieve the function by its name */
+ Function *getFunction(const std::string &name) const;
+ /*! Return NULL if the function already exists */
+ Function *newFunction(const std::string &name);
+ /*! Create a new constant in the constant set */
+ void newConstant(const char*, const std::string&, uint32_t size, uint32_t alignment);
+ /*! Apply the given functor on all the functions */
+ template <typename T>
+ INLINE void apply(const T &functor) const {
+ for (const auto &pair : functions) functor(*pair.second);
+ }
+ /*! Return the size of the pointers manipulated */
+ INLINE PointerSize getPointerSize(void) const { return pointerSize; }
+ /*! Return the family of registers that contain pointer */
+ INLINE RegisterFamily getPointerFamily(void) const {
+ if (this->getPointerSize() == POINTER_32_BITS)
+ return FAMILY_DWORD;
+ else
+ return FAMILY_QWORD;
+ }
+ /*! Return the constant set */
+ ConstantSet& getConstantSet(void) { return constantSet; }
+ /*! Return the constant set */
+ const ConstantSet& getConstantSet(void) const { return constantSet; }
+ void setValid(bool value) { valid = value; }
+ bool getValid() { return valid; }
+ private:
+ friend class ContextInterface; //!< Can free modify the unit
+ hash_map<std::string, Function*> functions; //!< All the defined functions
+ ConstantSet constantSet; //!< All the constants defined in the unit
+ PointerSize pointerSize; //!< Size shared by all pointers
+ GBE_CLASS(Unit);
+ bool valid;
+ };
+
+ /*! Output the unit string in the given stream */
+ std::ostream &operator<< (std::ostream &out, const Unit &unit);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_UNIT_HPP__ */
diff --git a/backend/src/ir/value.cpp b/backend/src/ir/value.cpp
new file mode 100644
index 0000000..a055bdf
--- /dev/null
+++ b/backend/src/ir/value.cpp
@@ -0,0 +1,607 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file value.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "ir/value.hpp"
+#include "ir/liveness.hpp"
+
+namespace gbe {
+namespace ir {
+
+ /*! To build the chains (i.e. basically the graph of values), we are going to
+ * iterate on liveout definitions: for each block and for each variable
+ * (ir::Register) alive at the end of the block (in Block::LiveOut), we are
+ * computing the set of all possible value definitions. Using these value
+ * definitions, we will finally transfer these sets to the successors to get
+ * the ud / du chains
+ *
+ * LiveOutSet contains the set of definitions for each basic block
+ */
+ class LiveOutSet
+ {
+ public:
+ LiveOutSet(Liveness &liveness, const FunctionDAG &dag);
+ ~LiveOutSet(void);
+ /*! One set per register */
+ typedef set<ValueDef*> RegDefSet;
+ /*! We have one map of liveout register per block */
+ typedef map<Register, RegDefSet*> BlockDefMap;
+ /*! All the block definitions map in the functions */
+ typedef map<const BasicBlock*, BlockDefMap*> FunctionDefMap;
+ /*! Performs the double look-up to get the set of defs per register */
+ RegDefSet &getDefSet(const BasicBlock *bb, Register reg);
+ /*! Build a UD-chain as the union of the predecessor chains */
+ void makeDefSet(DefSet &udChain, const BasicBlock &bb, Register reg);
+ /*! Fast per register definition set allocation */
+ DECL_POOL(RegDefSet, regDefSetPool);
+ /*! Fast register sets allocation */
+ DECL_POOL(BlockDefMap, blockDefMapPool);
+ FunctionDefMap defMap; //!< All per-block data
+ Liveness &liveness; //!< Contains LiveOut information
+ const FunctionDAG &dag; //!< Structure we are building
+ private:
+ /*! Initialize liveOut with the instruction destination values */
+ void initializeInstructionDef(void);
+ /*! Initialize liveOut with the function argument, special and pushed
+ * registers
+ */
+ void initializeOtherDef(void);
+ /*! Iterate to completely transfer the liveness and get the def sets */
+ void iterateLiveOut(void);
+ /*! Use custom allocators */
+ GBE_CLASS(LiveOutSet);
+ };
+
+ /*! Debug print of the liveout set */
+ std::ostream &operator<< (std::ostream &out, LiveOutSet &set);
+
+ LiveOutSet::LiveOutSet(Liveness &liveness, const FunctionDAG &dag) :
+ liveness(liveness), dag(dag)
+ {
+ this->initializeInstructionDef();
+ this->initializeOtherDef();
+ this->iterateLiveOut();
+ }
+
+ LiveOutSet::RegDefSet &LiveOutSet::getDefSet(const BasicBlock *bb, Register reg)
+ {
+ auto bbIt = defMap.find(bb);
+ GBE_ASSERT(bbIt != defMap.end());
+ auto defIt = bbIt->second->find(reg);
+ GBE_ASSERT(defIt != bbIt->second->end() && defIt->second != NULL);
+ return *defIt->second;
+ }
+
+ void LiveOutSet::makeDefSet(DefSet &udChain, const BasicBlock &bb, Register reg)
+ {
+ // Iterate over all the predecessors
+ const auto &preds = bb.getPredecessorSet();
+ for (const auto &pred : preds) {
+ if (pred->undefPhiRegs.contains(reg))
+ continue;
+ RegDefSet &predDef = this->getDefSet(pred, reg);
+ for (auto def : predDef) udChain.insert(def);
+ }
+
+ // If this is the top block we must take into account both function
+ // arguments and special registers
+ const Function &fn = bb.getParent();
+ if (fn.isEntryBlock(bb) == false) return;
+
+ // Is it a function input?
+ const FunctionArgument *arg = fn.getArg(reg);
+ const PushLocation *pushed = fn.getPushLocation(reg);
+
+ // Is it a pushed register?
+ if (pushed != NULL) {
+ ValueDef *def = const_cast<ValueDef*>(dag.getDefAddress(pushed));
+ udChain.insert(def);
+ }
+ // Is a function argument?
+ else if (arg != NULL) {
+ ValueDef *def = const_cast<ValueDef*>(dag.getDefAddress(arg));
+ udChain.insert(def);
+ }
+ // Is it a special register?
+ else if (fn.isSpecialReg(reg) == true) {
+ ValueDef *def = const_cast<ValueDef*>(dag.getDefAddress(reg));
+ udChain.insert(def);
+ }
+ }
+
+ void LiveOutSet::initializeInstructionDef(void) {
+ const Function &fn = liveness.getFunction();
+
+ // Iterate over each block and initialize the liveOut data
+ fn.foreachBlock([&](const BasicBlock &bb) {
+ GBE_ASSERT(defMap.find(&bb) == defMap.end());
+
+ // Allocate a map of register definitions
+ auto blockDefMap = this->newBlockDefMap();
+ defMap.insert(std::make_pair(&bb, blockDefMap));
+
+ // We only consider liveout registers
+ const auto &info = this->liveness.getBlockInfo(&bb);
+ const auto &liveOut = info.liveOut;
+ for (auto reg : liveOut) {
+ GBE_ASSERT(blockDefMap->find(reg) == blockDefMap->end());
+ auto regDefSet = this->newRegDefSet();
+ blockDefMap->insert(std::make_pair(reg, regDefSet));
+ }
+
+ // Now traverse the blocks backwards and find the definition of each
+ // liveOut register
+ set<Register> defined;
+ for (auto it = --bb.end(); it != bb.end(); --it) {
+ const Instruction &insn = *it;
+ const uint32_t dstNum = insn.getDstNum();
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const Register reg = insn.getDst(dstID);
+ // We only take the most recent definition
+ if (defined.contains(reg) == true) continue;
+ // Not in LiveOut, so does not matter
+ if (info.inLiveOut(reg) == false) continue;
+ defined.insert(reg);
+ // Insert the outgoing definition for this register
+ auto regDefSet = blockDefMap->find(reg);
+ ValueDef *def = const_cast<ValueDef*>(this->dag.getDefAddress(&insn, dstID));
+ GBE_ASSERT(regDefSet != blockDefMap->end() && def != NULL);
+ regDefSet->second->insert(def);
+ }
+ }
+ });
+ }
+
+ void LiveOutSet::initializeOtherDef(void) {
+ const Function &fn = liveness.getFunction();
+ const uint32_t argNum = fn.argNum();
+
+ // The first block must also transfer the function arguments
+ const BasicBlock &top = fn.getTopBlock();
+ const Liveness::BlockInfo &info = this->liveness.getBlockInfo(&top);
+ GBE_ASSERT(defMap.contains(&top) == true);
+ auto blockDefMap = defMap.find(&top)->second;
+
+ // Insert all the values that are not overwritten in the block and alive at
+ // the end of it
+ for (uint32_t argID = 0; argID < argNum; ++argID) {
+ const FunctionArgument &arg = fn.getArg(argID);
+ const Register reg = arg.reg;
+ // Do not transfer dead values
+ if (info.inLiveOut(reg) == false) continue;
+ // If we overwrite it, do not transfer the initial value
+ if (info.inVarKill(reg) == true) continue;
+ ValueDef *def = const_cast<ValueDef*>(this->dag.getDefAddress(&arg));
+ auto it = blockDefMap->find(reg);
+ GBE_ASSERT(it != blockDefMap->end());
+ it->second->insert(def);
+ }
+
+ // Now transfer the special registers that are not over-written
+ const uint32_t firstID = fn.getFirstSpecialReg();
+ const uint32_t specialNum = fn.getSpecialRegNum();
+ for (uint32_t regID = firstID; regID < firstID + specialNum; ++regID) {
+ const Register reg(regID);
+ // Do not transfer dead values
+ if (info.inLiveOut(reg) == false) continue;
+ // If we overwrite it, do not transfer the initial value
+ if (info.inVarKill(reg) == true) continue;
+ ValueDef *def = const_cast<ValueDef*>(this->dag.getDefAddress(reg));
+ auto it = blockDefMap->find(reg);
+ GBE_ASSERT(it != blockDefMap->end());
+ it->second->insert(def);
+ }
+
+ // Finally do the same thing with pushed registers
+ const Function::PushMap &pushMap = fn.getPushMap();
+ for (const auto &pushed : pushMap) {
+ const Register reg = pushed.first;
+ // Do not transfer dead values
+ if (info.inLiveOut(reg) == false) continue;
+ // If we overwrite it, do not transfer the initial value
+ if (info.inVarKill(reg) == true) continue;
+ ValueDef *def = const_cast<ValueDef*>(this->dag.getDefAddress(&pushed.second));
+ auto it = blockDefMap->find(reg);
+ GBE_ASSERT(it != blockDefMap->end());
+ it->second->insert(def);
+ }
+ }
+
+ void LiveOutSet::iterateLiveOut(void) {
+ bool changed = true;
+
+ while (changed) {
+ changed = false;
+
+ // Compute the union of the current liveout definitions with the previous
+ // ones. Do not take into account the killed values though
+ liveness.foreach<DF_PRED>([&](Liveness::BlockInfo &curr,
+ const Liveness::BlockInfo &pred)
+ {
+ const BasicBlock &bb = curr.bb;
+ const BasicBlock &pbb = pred.bb;
+ for (auto reg : curr.liveOut) {
+ if (pred.inLiveOut(reg) == false) continue;
+ if (curr.inVarKill(reg) == true) continue;
+ RegDefSet &currSet = this->getDefSet(&bb, reg);
+ RegDefSet &predSet = this->getDefSet(&pbb, reg);
+
+ // Transfer the values
+ for (auto def : predSet) {
+ if (currSet.contains(def)) continue;
+ changed = true;
+ currSet.insert(def);
+ }
+ }
+ });
+ }
+ }
+
+ LiveOutSet::~LiveOutSet(void) {
+ for (const auto pair : defMap) {
+ BlockDefMap *block = pair.second;
+ for (auto regSet : *block)
+ this->deleteRegDefSet(regSet.second);
+ this->deleteBlockDefMap(block);
+ }
+ }
+
+ std::ostream &operator<< (std::ostream &out, LiveOutSet &set) {
+ for (const auto &pair : set.defMap) {
+ // To recognize the block, just print its instructions
+ out << "Block:" << std::endl;
+ for (const auto &insn : *pair.first) out << insn << std::endl;
+
+ // Iterate over all alive registers to get their definitions
+ const LiveOutSet::BlockDefMap *defMap = pair.second;
+ if (defMap->size() > 0) out << "LiveSet:" << std::endl;
+ for (const auto &pair : *defMap) {
+ const Register reg = pair.first;
+ const LiveOutSet::RegDefSet *set = pair.second;
+ for (auto def : *set) {
+ const ValueDef::Type type = def->getType();
+ if (type == ValueDef::DEF_FN_ARG)
+ out << "%" << reg << ": " << "function input" << std::endl;
+ else if (type == ValueDef::DEF_FN_PUSHED)
+ out << "%" << reg << ": " << "pushed register" << std::endl;
+ else if (type == ValueDef::DEF_SPECIAL_REG)
+ out << "%" << reg << ": " << "special register" << std::endl;
+ else {
+ const Instruction *insn = def->getInstruction();
+ out << "%" << reg << ": " << insn << " " << *insn << std::endl;
+ }
+ }
+ }
+ out << std::endl;
+ }
+ return out;
+ }
+
+ FunctionDAG::FunctionDAG(Liveness &liveness) :
+ fn(liveness.getFunction())
+ {
+ // We first start with empty chains
+ udEmpty = this->newDefSet();
+ duEmpty = this->newUseSet();
+
+ // First create the chains and insert them in their respective maps
+ fn.foreachInstruction([this](const Instruction &insn) {
+ // sources == value uses
+ const uint32_t srcNum = insn.getSrcNum();
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ ValueUse *valueUse = this->newValueUse(&insn, srcID);
+ useName.insert(std::make_pair(*valueUse, valueUse));
+ udGraph.insert(std::make_pair(*valueUse, udEmpty));
+ }
+ // destinations == value defs
+ const uint32_t dstNum = insn.getDstNum();
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ ValueDef *valueDef = this->newValueDef(&insn, dstID);
+ defName.insert(std::make_pair(*valueDef, valueDef));
+ duGraph.insert(std::make_pair(*valueDef, duEmpty));
+ }
+ });
+
+ // Function arguments are also value definitions
+ const uint32_t argNum = fn.argNum();
+ for (uint32_t argID = 0; argID < argNum; ++argID) {
+ const FunctionArgument &arg = fn.getArg(argID);
+ ValueDef *valueDef = this->newValueDef(&arg);
+ defName.insert(std::make_pair(*valueDef, valueDef));
+ duGraph.insert(std::make_pair(*valueDef, duEmpty));
+ }
+
+ // Special registers are also definitions
+ const uint32_t firstID = fn.getFirstSpecialReg();
+ const uint32_t specialNum = fn.getSpecialRegNum();
+ for (uint32_t regID = firstID; regID < firstID + specialNum; ++regID) {
+ const Register reg(regID);
+ ValueDef *valueDef = this->newValueDef(reg);
+ defName.insert(std::make_pair(*valueDef, valueDef));
+ duGraph.insert(std::make_pair(*valueDef, duEmpty));
+ }
+
+ // Pushed registers are also definitions
+ const Function::PushMap &pushMap = fn.getPushMap();
+ for (const auto &pushed : pushMap) {
+ ValueDef *valueDef = this->newValueDef(&pushed.second);
+ defName.insert(std::make_pair(*valueDef, valueDef));
+ duGraph.insert(std::make_pair(*valueDef, duEmpty));
+ }
+
+ // We create the liveOutSet to help us transfer the definitions
+ LiveOutSet liveOutSet(liveness, *this);
+
+ // Build UD chains traversing the blocks top to bottom
+ fn.foreachBlock([&](const BasicBlock &bb) {
+ // Track the allocated chains to be able to reuse them
+ map<Register, DefSet*> allocated;
+ // Some chains may be not used (ie they are dead). We track them to be
+ // able to deallocate them later
+ set<DefSet*> unused;
+
+ // For each instruction build the UD chains
+ const_cast<BasicBlock&>(bb).foreach([&](const Instruction &insn) {
+ // Instruction sources consumes definitions
+ const uint32_t srcNum = insn.getSrcNum();
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const Register src = insn.getSrc(srcID);
+ const ValueUse use(&insn, srcID);
+ auto ud = udGraph.find(use);
+ GBE_ASSERT(ud != udGraph.end());
+
+ // We already allocate the ud chain for this register
+ auto it = allocated.find(src);
+ if (it != allocated.end()) {
+ udGraph.erase(ud);
+ udGraph.insert(std::make_pair(use, it->second));
+ if (unused.contains(it->second))
+ unused.erase(it->second);
+ }
+ // Create a new one from the predecessor chains (upward used value)
+ else {
+ DefSet *udChain = this->newDefSet();
+ liveOutSet.makeDefSet(*udChain, bb, src);
+ allocated.insert(std::make_pair(src, udChain));
+ ud->second = udChain;
+ }
+ }
+
+ // Instruction destinations create new chains
+ const uint32_t dstNum = insn.getDstNum();
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const Register dst = insn.getDst(dstID);
+ ValueDef *def = const_cast<ValueDef*>(this->getDefAddress(&insn, dstID));
+ DefSet *udChain = this->newDefSet();
+ udChain->insert(def);
+ unused.insert(udChain);
+ // Remove the previous definition if any
+ if (allocated.contains(dst) == true)
+ allocated.erase(dst);
+ allocated.insert(std::make_pair(dst, udChain));
+ }
+ });
+
+ // Deallocate unused chains
+ for (auto set : unused) this->deleteDefSet(set);
+ });
+
+ // Build the DU chains from the UD ones
+ fn.foreachInstruction([&](const Instruction &insn) {
+
+ // For each value definition of each source, we push back this use
+ const uint32_t srcNum = insn.getSrcNum();
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ ValueUse *use = const_cast<ValueUse*>(getUseAddress(&insn, srcID));
+
+ // Find all definitions for this source
+ const auto &defs = this->getDef(&insn, srcID);
+ for (auto def : defs) {
+ auto uses = duGraph.find(*def);
+ UseSet *du = uses->second;
+ GBE_ASSERT(uses != duGraph.end());
+ if (du == duEmpty) {
+ duGraph.erase(*def);
+ du = this->newUseSet();
+ duGraph.insert(std::make_pair(*def, du));
+ }
+ du->insert(use);
+ }
+ }
+ });
+
+ // Allocate the set of uses and defs per register
+ const uint32_t regNum = fn.regNum();
+ for (uint32_t regID = 0; regID < regNum; ++regID) {
+ const Register reg(regID);
+ UseSet *useSet = GBE_NEW_NO_ARG(UseSet);
+ DefSet *defSet = GBE_NEW_NO_ARG(DefSet);
+ regUse.insert(std::make_pair(reg, useSet));
+ regDef.insert(std::make_pair(reg, defSet));
+ }
+
+ // Fill use sets (one per register)
+ for (auto &useSet : duGraph) {
+ for (auto use : *useSet.second) {
+ const Register reg = use->getRegister();
+ auto it = regUse.find(reg);
+ GBE_ASSERT(it != regUse.end() && it->second != NULL);
+ it->second->insert(use);
+ }
+ }
+
+ // Fill def sets (one per register)
+ for (auto &defSet : udGraph) {
+ for (auto def : *defSet.second) {
+ const Register reg = def->getRegister();
+ auto it = regDef.find(reg);
+ GBE_ASSERT(it != regDef.end() && it->second != NULL);
+ it->second->insert(def);
+ }
+ }
+ }
+
+/*! Helper to deallocate objects */
+#define PTR_RELEASE(TYPE, VAR) \
+ do { \
+ if (VAR && destroyed.contains(VAR) == false) { \
+ destroyed.insert(VAR); \
+ delete##TYPE(VAR); \
+ } \
+ } while (0)
+
+ FunctionDAG::~FunctionDAG(void) {
+
+ // We track the already destroyed pointers
+ set<void*> destroyed;
+
+ // Release the empty ud-chains and du-chains
+ PTR_RELEASE(DefSet, udEmpty);
+ PTR_RELEASE(UseSet, duEmpty);
+
+ // We free all the ud-chains
+ for (const auto &pair : udGraph) {
+ auto defs = pair.second;
+ if (destroyed.contains(defs)) continue;
+ for (auto def : *defs) PTR_RELEASE(ValueDef, def);
+ PTR_RELEASE(DefSet, defs);
+ }
+
+ // We free all the du-chains
+ for (const auto &pair : duGraph) {
+ auto uses = pair.second;
+ if (destroyed.contains(uses)) continue;
+ for (auto use : *uses) PTR_RELEASE(ValueUse, use);
+ PTR_RELEASE(UseSet, uses);
+ }
+
+ // Release all the use and definition sets per register
+ for (const auto &pair : regUse) GBE_SAFE_DELETE(pair.second);
+ for (const auto &pair : regDef) GBE_SAFE_DELETE(pair.second);
+ }
+#undef PTR_RELEASE
+
+ const UseSet &FunctionDAG::getUse(const ValueDef &def) const {
+ auto it = duGraph.find(def);
+ GBE_ASSERT(it != duGraph.end());
+ return *it->second;
+ }
+ const UseSet &FunctionDAG::getUse(const Instruction *insn, uint32_t dstID) const {
+ return this->getUse(ValueDef(insn, dstID));
+ }
+ const UseSet &FunctionDAG::getUse(const FunctionArgument *arg) const {
+ return this->getUse(ValueDef(arg));
+ }
+ const UseSet &FunctionDAG::getUse(const Register ®) const {
+ return this->getUse(ValueDef(reg));
+ }
+ const DefSet &FunctionDAG::getDef(const ValueUse &use) const {
+ auto it = udGraph.find(use);
+ GBE_ASSERT(it != udGraph.end());
+ return *it->second;
+ }
+ const DefSet &FunctionDAG::getDef(const Instruction *insn, uint32_t srcID) const {
+ return this->getDef(ValueUse(insn, srcID));
+ }
+ const UseSet *FunctionDAG::getRegUse(const Register ®) const {
+ auto it = regUse.find(reg);
+ GBE_ASSERT(it != regUse.end());
+ return it->second;
+ }
+ const DefSet *FunctionDAG::getRegDef(const Register ®) const {
+ auto it = regDef.find(reg);
+ GBE_ASSERT(it != regDef.end());
+ return it->second;
+ }
+
+ const ValueDef *FunctionDAG::getDefAddress(const ValueDef &def) const {
+ auto it = defName.find(def);
+ GBE_ASSERT(it != defName.end() && it->second != NULL);
+ return it->second;
+ }
+ const ValueDef *FunctionDAG::getDefAddress(const PushLocation *pushed) const {
+ return this->getDefAddress(ValueDef(pushed));
+ }
+ const ValueDef *FunctionDAG::getDefAddress(const Instruction *insn, uint32_t dstID) const {
+ return this->getDefAddress(ValueDef(insn, dstID));
+ }
+ const ValueDef *FunctionDAG::getDefAddress(const FunctionArgument *arg) const {
+ return this->getDefAddress(ValueDef(arg));
+ }
+ const ValueDef *FunctionDAG::getDefAddress(const Register ®) const {
+ return this->getDefAddress(ValueDef(reg));
+ }
+ const ValueUse *FunctionDAG::getUseAddress(const Instruction *insn, uint32_t srcID) const {
+ const ValueUse use(insn, srcID);
+ auto it = useName.find(use);
+ GBE_ASSERT(it != useName.end() && it->second != NULL);
+ return it->second;
+ }
+
+ std::ostream &operator<< (std::ostream &out, const FunctionDAG &dag) {
+ const Function &fn = dag.getFunction();
+
+ // Print all uses for the definitions and all definitions for each uses
+ fn.foreachInstruction([&](const Instruction &insn) {
+ out << &insn << ": " << insn << std::endl;
+
+ // Display the set of definition for each destination
+ const uint32_t dstNum = insn.getDstNum();
+ if (dstNum > 0) out << "USES:" << std::endl;
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const Register reg = insn.getDst(dstID);
+ const auto &uses = dag.getUse(&insn, dstID);
+ for (auto use : uses) {
+ const Instruction *other = use->getInstruction();
+ out << " %" << reg << " " << other << ": " << *other << std::endl;
+ }
+ }
+
+ // Display the set of definitions for each source
+ const uint32_t srcNum = insn.getSrcNum();
+ if (srcNum > 0) out << "DEFS:" << std::endl;
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const Register reg = insn.getSrc(srcID);
+ const auto &defs = dag.getDef(&insn, srcID);
+ for (auto def : defs) {
+ if (def->getType() == ValueDef::DEF_FN_PUSHED)
+ out << " %" << reg << " # pushed register" << std::endl;
+ else if (def->getType() == ValueDef::DEF_FN_ARG)
+ out << " %" << reg << " # function argument" << std::endl;
+ else if (def->getType() == ValueDef::DEF_SPECIAL_REG)
+ out << " %" << reg << " # special register" << std::endl;
+ else {
+ const Instruction *other = def->getInstruction();
+ out << " %" << reg << " " << other << ": " << *other << std::endl;
+ }
+ }
+ }
+ out << std::endl;
+ });
+
+ return out;
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/value.hpp b/backend/src/ir/value.hpp
new file mode 100644
index 0000000..47b9048
--- /dev/null
+++ b/backend/src/ir/value.hpp
@@ -0,0 +1,266 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file value.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_VALUE_HPP__
+#define __GBE_IR_VALUE_HPP__
+
+#include "ir/instruction.hpp"
+#include "ir/function.hpp"
+#include "sys/set.hpp"
+#include "sys/map.hpp"
+
+namespace gbe {
+namespace ir {
+
+ // Make UD-Chain and DU-Chain computations faster and easier
+ class Liveness;
+
+ /*! A value definition is a destination of an instruction or a function
+ * argument. Since we support multiple destinations, we also add the
+ * destination ID.
+ */
+ class ValueDef
+ {
+ public:
+ /*! Discriminates the kind of values */
+ enum Type : uint32_t {
+ DEF_FN_ARG = 0,
+ DEF_FN_PUSHED = 1,
+ DEF_INSN_DST = 2,
+ DEF_SPECIAL_REG = 3
+ };
+ /*! Build a value from an instruction destination */
+ explicit ValueDef(const Instruction *insn, uint32_t dstID = 0u) :
+ type(DEF_INSN_DST)
+ {
+ this->data.insn = insn;
+ this->data.dstID = dstID;
+ }
+ /*! Build a value from a function argument */
+ explicit ValueDef(const FunctionArgument *arg) : type(DEF_FN_ARG) {
+ this->data.arg = arg;
+ }
+ /*! Build a value from a pushed register */
+ explicit ValueDef(const PushLocation *pushed) : type(DEF_FN_PUSHED) {
+ this->data.pushed = pushed;
+ }
+ /*! Build a value from a special register */
+ explicit ValueDef(const Register ®) : type(DEF_SPECIAL_REG) {
+ this->data.regID = uint32_t(reg);
+ }
+ /*! Get the type of the value */
+ INLINE Type getType(void) const { return type; }
+ /*! Get the instruction (only if this is a instruction value) */
+ INLINE const Instruction *getInstruction(void) const {
+ GBE_ASSERT(type == DEF_INSN_DST);
+ return data.insn;
+ }
+ /*! Get the destination ID (only if this is a instruction value) */
+ INLINE uint32_t getDstID(void) const {
+ GBE_ASSERT(type == DEF_INSN_DST);
+ return data.dstID;
+ }
+ /*! Get the function input (only if this is a function argument) */
+ INLINE const FunctionArgument *getFunctionArgument(void) const {
+ GBE_ASSERT(type == DEF_FN_ARG);
+ return data.arg;
+ }
+ /*! Get the pushed location */
+ INLINE const PushLocation *getPushLocation(void) const {
+ GBE_ASSERT(type == DEF_FN_PUSHED);
+ return data.pushed;
+ }
+ /*! Get the special register */
+ INLINE Register getSpecialReg(void) const {
+ GBE_ASSERT(type == DEF_SPECIAL_REG);
+ return Register(data.regID);
+ }
+ /*! Retrieve the register associated to the definition */
+ INLINE Register getRegister(void) const {
+ if (type == DEF_SPECIAL_REG)
+ return Register(data.regID);
+ else if (type == DEF_FN_ARG)
+ return data.arg->reg;
+ else if (type == DEF_FN_PUSHED)
+ return data.pushed->getRegister();
+ else
+ return data.insn->getDst(data.dstID);
+ }
+
+ private:
+ /*! Instruction or function argument */
+ union Data {
+ /*! Instruction destination or ... */
+ struct {
+ const Instruction *insn; //<! Instruction itself
+ uint32_t dstID; //<! Which destination we take into account
+ };
+ /*! Pushed value */
+ const PushLocation *pushed;
+ /*! ... function argument or ... */
+ const FunctionArgument *arg;
+ /*! ... special register */
+ uint32_t regID;
+ } data;
+ /*!< Function argument or instruction dst? */
+ Type type;
+ GBE_CLASS(ValueDef); // Use gbe allocators
+ };
+
+ /*! Compare two value definitions (used in maps) */
+ INLINE bool operator< (const ValueDef &def0, const ValueDef &def1) {
+ const ValueDef::Type type0 = def0.getType();
+ const ValueDef::Type type1 = def1.getType();
+ if (type0 != type1) return uint32_t(type0) < uint32_t(type1);
+ if (type0 == ValueDef::DEF_FN_ARG) {
+ const FunctionArgument *in0 = def0.getFunctionArgument();
+ const FunctionArgument *in1 = def1.getFunctionArgument();
+ return uintptr_t(in0) < uintptr_t(in1);
+ } else if (type0 == ValueDef::DEF_FN_PUSHED) {
+ const PushLocation *pushed0 = def0.getPushLocation();
+ const PushLocation *pushed1 = def1.getPushLocation();
+ return uintptr_t(pushed0) < uintptr_t(pushed1);
+ } else if (type0 == ValueDef::DEF_SPECIAL_REG) {
+ const Register reg0 = def0.getSpecialReg();
+ const Register reg1 = def1.getSpecialReg();
+ return uint32_t(reg0) < uint32_t(reg1);
+ } else {
+ const Instruction *insn0 = def0.getInstruction();
+ const Instruction *insn1 = def1.getInstruction();
+ if (insn0 != insn1) return uintptr_t(insn0) < uintptr_t(insn1);
+ const uint32_t dst0 = def0.getDstID();
+ const uint32_t dst1 = def1.getDstID();
+ return dst0 < dst1;
+ }
+ }
+
+ /*! A value use describes a instruction source. This is the place where a
+ * value is used
+ */
+ class ValueUse
+ {
+ public:
+ /*! Build a value use */
+ explicit ValueUse(const Instruction *insn, uint32_t srcID = 0u) :
+ insn(insn), srcID(srcID) {}
+ /*! Get the instruction of the use */
+ const Instruction *getInstruction(void) const { return insn; }
+ /*! Get the source index for this use */
+ uint32_t getSrcID(void) const { return srcID; }
+ /*! Get the register for this use */
+ Register getRegister(void) const { return insn->getSrc(srcID); }
+ private:
+ const Instruction *insn; //!< Instruction where the value is used
+ uint32_t srcID; //!< Index of the source in the instruction
+ GBE_CLASS(ValueUse); // Use gbe allocators
+ };
+
+ /*! Compare two value uses (used in maps) */
+ INLINE bool operator< (const ValueUse &use0, const ValueUse &use1) {
+ const Instruction *insn0 = use0.getInstruction();
+ const Instruction *insn1 = use1.getInstruction();
+ if (insn0 != insn1) return uintptr_t(insn0) < uintptr_t(insn1);
+ const uint32_t src0 = use0.getSrcID();
+ const uint32_t src1 = use1.getSrcID();
+ return src0 < src1;
+ }
+
+ /*! All uses of a definition */
+ typedef set<ValueUse*> UseSet;
+ /*! All possible definitions for a use */
+ typedef set<ValueDef*> DefSet;
+
+ /*! Get the chains (in both directions) for the complete program. This data
+ * structure is unfortunately way too brutal. Using std::sets all over the
+ * place just burns a huge amount of memory. There is work to do to decrease
+ * the memory footprint
+ */
+ class FunctionDAG : public NonCopyable
+ {
+ public:
+ /*! Build the complete DU/UD graphs for the program included in liveness */
+ FunctionDAG(Liveness &liveness);
+ /*! Free all the resources */
+ ~FunctionDAG(void);
+ /*! Get the du-chain for the definition */
+ const UseSet &getUse(const ValueDef &def) const;
+ /*! Get the du-chain for the given instruction and destination */
+ const UseSet &getUse(const Instruction *insn, uint32_t dstID) const;
+ /*! Get the du-chain for the given function input */
+ const UseSet &getUse(const FunctionArgument *arg) const;
+ /*! Get the du-chain for the given pushed location */
+ const UseSet &getUse(const PushLocation *pushed) const;
+ /*! Get the du-chain for the given special register */
+ const UseSet &getUse(const Register ®) const;
+ /*! Get the ud-chain for the given use */
+ const DefSet &getDef(const ValueUse &use) const;
+ /*! Get the ud-chain for the instruction and source */
+ const DefSet &getDef(const Instruction *insn, uint32_t srcID) const;
+ /*! Get the pointer to the definition *as stored in the DAG* */
+ const ValueDef *getDefAddress(const ValueDef &def) const;
+ /*! Get the pointer to the definition *as stored in the DAG* */
+ const ValueDef *getDefAddress(const PushLocation *pushed) const;
+ /*! Get the pointer to the definition *as stored in the DAG* */
+ const ValueDef *getDefAddress(const Instruction *insn, uint32_t dstID) const;
+ /*! Get the pointer to the definition *as stored in the DAG* */
+ const ValueDef *getDefAddress(const FunctionArgument *input) const;
+ /*! Get the pointer to the definition *as stored in the DAG* */
+ const ValueDef *getDefAddress(const Register ®) const;
+ /*! Get the pointer to the use *as stored in the DAG* */
+ const ValueUse *getUseAddress(const Instruction *insn, uint32_t srcID) const;
+ /*! Get the set of all uses for the register */
+ const UseSet *getRegUse(const Register ®) const;
+ /*! Get the set of all definitions for the register */
+ const DefSet *getRegDef(const Register ®) const;
+ /*! Get the function we have the graph for */
+ INLINE const Function &getFunction(void) const { return fn; }
+ /*! The DefSet for each definition use */
+ typedef map<ValueUse, DefSet*> UDGraph;
+ /*! The UseSet for each definition */
+ typedef map<ValueDef, UseSet*> DUGraph;
+ private:
+ UDGraph udGraph; //!< All the UD chains
+ DUGraph duGraph; //!< All the DU chains
+ DefSet *udEmpty; //!< Void use set
+ UseSet *duEmpty; //!< Void def set
+ ValueDef *undefined; //!< Undefined value
+ map<ValueUse, ValueUse*> useName; //!< Get the ValueUse pointer from the value
+ map<ValueDef, ValueDef*> defName; //!< Get the ValueDef pointer from the value
+ map<Register, UseSet*> regUse; //!< All uses of registers
+ map<Register, DefSet*> regDef; //!< All defs of registers
+ DECL_POOL(ValueDef, valueDefPool); //!< Fast ValueDef allocation
+ DECL_POOL(ValueUse, valueUsePool); //!< Fast ValueUse allocation
+ DECL_POOL(DefSet, udChainPool); //!< Fast DefSet allocation
+ DECL_POOL(UseSet, duChainPool); //!< Fast UseSet allocation
+ const Function &fn; //!< Function we are referring to
+ GBE_CLASS(FunctionDAG); // Use internal allocators
+ };
+
+ /*! Pretty print of the function DAG */
+ std::ostream &operator<< (std::ostream &out, const FunctionDAG &dag);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_VALUE_HPP__ */
+
diff --git a/backend/src/llvm/llvm_barrier_nodup.cpp b/backend/src/llvm/llvm_barrier_nodup.cpp
new file mode 100644
index 0000000..791df00
--- /dev/null
+++ b/backend/src/llvm/llvm_barrier_nodup.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_barrier_nodup.cpp
+ *
+ * This pass is to remove or add noduplicate function attribute for barrier functions.
+ * Basically, we want to set NoDuplicate for those __gen_barrier_xxx functions. But if
+ * a sub function calls those barrier functions, the sub function will not be inlined
+ * in llvm's inlining pass. This is what we don't want. As inlining such a function in
+ * the caller is safe, we just don't want it to duplicate the call. So Introduce this
+ * pass to remove the NoDuplicate function attribute before the inlining pass and restore
+ * it after.
+ *
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Attributes.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+ class BarrierNodup : public ModulePass
+ {
+ public:
+ static char ID;
+ BarrierNodup(bool nodup) :
+ ModulePass(ID), nodup(nodup) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+
+ }
+
+ virtual const char *getPassName() const {
+ return "SPIR backend: set barrier no duplicate attr";
+ }
+
+ virtual bool runOnModule(Module &M)
+ {
+ using namespace llvm;
+ bool changed = false;
+ for (auto &F : M) {
+ if (F.getName() == "__gen_ocl_barrier_local_and_global" ||
+ F.getName() == "__gen_ocl_barrier_local" ||
+ F.getName() == "__gen_ocl_barrier_global") {
+ if (nodup) {
+ if (!F.hasFnAttribute(Attribute::NoDuplicate)) {
+ F.addFnAttr(Attribute::NoDuplicate);
+ changed = true;
+ }
+ } else {
+ if (F.hasFnAttribute(Attribute::NoDuplicate)) {
+ auto attrs = F.getAttributes();
+ F.setAttributes(attrs.removeAttribute(M.getContext(),
+ AttributeSet::FunctionIndex,
+ Attribute::NoDuplicate));
+ changed = true;
+ }
+ }
+ }
+ }
+
+ return changed;
+ }
+ private:
+ bool nodup;
+ };
+
+
+ ModulePass *createBarrierNodupPass(bool Nodup) {
+ return new BarrierNodup(Nodup);
+ }
+
+ char BarrierNodup::ID = 0;
+} // end namespace
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
new file mode 100644
index 0000000..6cb3834
--- /dev/null
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -0,0 +1,3628 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file llvm_gen_backend.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/* Transform the LLVM IR code into Gen IR code i.e. our temporary representation
+ * for programs running on Gen.
+ *
+ * Overview
+ * ========
+ *
+ * This code is mostly inspired by the (now defunct and replaced by CppBackend)
+ * CBackend. Basically, there are two ways to transform LLVM code into machine
+ * code (or anything else)
+ * - You write a complete LLVM backend by the book. LLVM proposes a lot of
+ * useful tools to do so. This is obviously the path chosen by all CPU guys
+ * but also by AMD and nVidia which both use the backend infrastructure to
+ * output their own intermediate language. The good point is that you can
+ * reuse a lot of tools (like proper PHI elimination with phi congruence and
+ * global copy propagation a la Chaitin). Bad points are:
+ * 1/ It is a *long* journey to generate anything.
+ * 2/ More importantly, the code is hugely biased towards CPUs. Typically,
+ * the way registers are defined do not fit well Gen register file (which
+ * is really more like a regular piece of memory). Same issue apply for
+ * predicated instructions with mask which is a bit boring to use with
+ * SSA. Indeed, since DAGSelection still manipulates SSA values, anything
+ * predicated requires to insert extra sources
+ * - You write function passes to do the translation yourself. Obviously, you
+ * reinvent the wheel. However, it is easy to do and easier to maintain
+ * (somehow)
+ *
+ * So, the code here just traverses LLVM asm and generates our own ISA. The
+ * generated code is OK even if a global copy propagation pass is still overdue.
+ * Right now, it is pretty straighforward and simplistic in that regard
+ *
+ * About Clang and the ABI / target
+ * ================================
+ *
+ * A major question is: how did we actually generate this LLVM code from OpenCL?
+ * Well, thing is that there is no generic target in LLVM since there are many
+ * dependencies on endianness or ABIs. Fortunately, the ptx (and nvptx for LLVM
+ * 3.2) profile is pretty well adapted to our needs since NV and Gen GPU are
+ * kind of similar, or at least they are similar enough to share the same front
+ * end.
+ *
+ * Problems
+ * ========
+ *
+ * - Several things regarding constants like ConstantExpr are not properly handled.
+ * - ptx front end generates function calls. Since we do not support them yet,
+ * the user needs to force the inlining of all functions. If a function call
+ * is intercepted, we just abort
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#else
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/InlineAsm.h"
+#else
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InlineAsm.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/ConstantsScanner.h"
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#include "llvm/IR/Mangler.h"
+#else
+#include "llvm/Target/Mangler.h"
+#endif
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#if !defined(LLVM_VERSION_MAJOR) || (LLVM_VERSION_MINOR == 1)
+#include "llvm/Target/TargetData.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/DataLayout.h"
+#endif
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
+#else
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#endif
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR <= 2)
+#include "llvm/Support/InstVisitor.h"
+#elif LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/InstVisitor.h"
+#else
+#include "llvm/InstVisitor.h"
+#endif
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/SourceMgr.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "ir/context.hpp"
+#include "ir/unit.hpp"
+#include "ir/liveness.hpp"
+#include "ir/value.hpp"
+#include "sys/set.hpp"
+#include "sys/cvar.hpp"
+#include "backend/program.h"
+#include <sstream>
+
+/* Not defined for LLVM 3.0 */
+#if !defined(LLVM_VERSION_MAJOR)
+#define LLVM_VERSION_MAJOR 3
+#endif /* !defined(LLVM_VERSION_MAJOR) */
+
+#if !defined(LLVM_VERSION_MINOR)
+#define LLVM_VERSION_MINOR 0
+#endif /* !defined(LLVM_VERSION_MINOR) */
+
+#if (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR < 3)
+#error "Only LLVM 3.3 and newer are supported"
+#endif /* (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 4) */
+
+using namespace llvm;
+
+namespace gbe
+{
+ /*! Gen IR manipulates only scalar types */
+ static bool isScalarType(const Type *type)
+ {
+ return type->isFloatTy() ||
+ type->isIntegerTy() ||
+ type->isDoubleTy() ||
+ type->isPointerTy();
+ }
+
+ /*! LLVM IR Type to Gen IR type translation */
+ static ir::Type getType(ir::Context &ctx, const Type *type)
+ {
+ GBE_ASSERT(isScalarType(type));
+ if (type->isFloatTy() == true)
+ return ir::TYPE_FLOAT;
+ if (type->isDoubleTy() == true)
+ return ir::TYPE_DOUBLE;
+ if (type->isPointerTy() == true) {
+ if (ctx.getPointerSize() == ir::POINTER_32_BITS)
+ return ir::TYPE_U32;
+ else
+ return ir::TYPE_U64;
+ }
+ GBE_ASSERT(type->isIntegerTy() == true);
+ if (type == Type::getInt1Ty(type->getContext()))
+ return ir::TYPE_BOOL;
+ if (type == Type::getInt8Ty(type->getContext()))
+ return ir::TYPE_S8;
+ if (type == Type::getInt16Ty(type->getContext()))
+ return ir::TYPE_S16;
+ if (type == Type::getInt32Ty(type->getContext()))
+ return ir::TYPE_S32;
+ if (type == Type::getInt64Ty(type->getContext()))
+ return ir::TYPE_S64;
+ return ir::TYPE_LARGE_INT;
+ }
+
+ /*! LLVM IR Type to Gen IR unsigned type translation */
+ static ir::Type getUnsignedType(ir::Context &ctx, const Type *type)
+ {
+ GBE_ASSERT(type->isIntegerTy() == true);
+ if (type == Type::getInt1Ty(type->getContext()))
+ return ir::TYPE_BOOL;
+ if (type == Type::getInt8Ty(type->getContext()))
+ return ir::TYPE_U8;
+ if (type == Type::getInt16Ty(type->getContext()))
+ return ir::TYPE_U16;
+ if (type == Type::getInt32Ty(type->getContext()))
+ return ir::TYPE_U32;
+ if (type == Type::getInt64Ty(type->getContext()))
+ return ir::TYPE_U64;
+ ctx.getUnit().setValid(false);
+ return ir::TYPE_U64;
+ }
+
+ /*! Type to register family translation */
+ static ir::RegisterFamily getFamily(ir::Context &ctx, const Type *type)
+ {
+ GBE_ASSERT(isScalarType(type) == true);
+ if (type == Type::getInt1Ty(type->getContext()))
+ return ir::FAMILY_BOOL;
+ if (type == Type::getInt8Ty(type->getContext()))
+ return ir::FAMILY_BYTE;
+ if (type == Type::getInt16Ty(type->getContext()))
+ return ir::FAMILY_WORD;
+ if (type == Type::getInt32Ty(type->getContext()) || type->isFloatTy())
+ return ir::FAMILY_DWORD;
+ if (type == Type::getInt64Ty(type->getContext()) || type->isDoubleTy())
+ return ir::FAMILY_QWORD;
+ if (type->isPointerTy())
+ return ctx.getPointerFamily();
+ ctx.getUnit().setValid(false);
+ return ir::FAMILY_BOOL;
+ }
+
+ /*! Get number of element to process dealing either with a vector or a scalar
+ * value
+ */
+ static ir::Type getVectorInfo(ir::Context &ctx, Type *llvmType, Value *value, uint32_t &elemNum, bool useUnsigned = false)
+ {
+ ir::Type type;
+ if (llvmType->isVectorTy() == true) {
+ VectorType *vectorType = cast<VectorType>(llvmType);
+ Type *elementType = vectorType->getElementType();
+ elemNum = vectorType->getNumElements();
+ if (useUnsigned)
+ type = getUnsignedType(ctx, elementType);
+ else
+ type = getType(ctx, elementType);
+ } else {
+ elemNum = 1;
+ if (useUnsigned)
+ type = getUnsignedType(ctx, llvmType);
+ else
+ type = getType(ctx, llvmType);
+ }
+ return type;
+ }
+
+ /*! OCL to Gen-IR address type */
+ static INLINE ir::AddressSpace addressSpaceLLVMToGen(unsigned llvmMemSpace) {
+ switch (llvmMemSpace) {
+ case 0: return ir::MEM_PRIVATE;
+ case 1: return ir::MEM_GLOBAL;
+ case 2: return ir::MEM_CONSTANT;
+ case 3: return ir::MEM_LOCAL;
+ case 4: return ir::IMAGE;
+ }
+ GBE_ASSERT(false);
+ return ir::MEM_GLOBAL;
+ }
+
+ static Constant *extractConstantElem(Constant *CPV, uint32_t index) {
+ ConstantVector *CV = dyn_cast<ConstantVector>(CPV);
+ GBE_ASSERT(CV != NULL);
+#if GBE_DEBUG
+ const uint32_t elemNum = CV->getNumOperands();
+ GBE_ASSERTM(index < elemNum, "Out-of-bound constant vector access");
+#endif /* GBE_DEBUG */
+ CPV = cast<Constant>(CV->getOperand(index));
+ return CPV;
+ }
+
+ /*! Handle the LLVM IR Value to Gen IR register translation. This has 2 roles:
+ * - Split the LLVM vector into several scalar values
+ * - Handle the transparent copies (bitcast or use of intrincics functions
+ * like get_local_id / get_global_id
+ */
+ class RegisterTranslator
+ {
+ public:
+ /*! Indices will be zero for scalar values */
+ typedef std::pair<Value*, uint32_t> ValueIndex;
+ RegisterTranslator(ir::Context &ctx) : ctx(ctx) {}
+
+ /*! Empty the maps */
+ void clear(void) {
+ valueMap.clear();
+ scalarMap.clear();
+ }
+ /*! Some values will not be allocated. For example, a bit-cast destination
+ * like: %fake = bitcast %real or a vector insertion since we do not have
+ * vectors in Gen-IR
+ */
+ void newValueProxy(Value *real,
+ Value *fake,
+ uint32_t realIndex = 0u,
+ uint32_t fakeIndex = 0u) {
+ const ValueIndex key(fake, fakeIndex);
+ const ValueIndex value(real, realIndex);
+ GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
+ valueMap[key] = value;
+ }
+ /*! Mostly used for the preallocated registers (lids, gids) */
+ void newScalarProxy(ir::Register reg, Value *value, uint32_t index = 0u) {
+ const ValueIndex key(value, index);
+ GBE_ASSERT(scalarMap.find(key) == scalarMap.end());
+ scalarMap[key] = reg;
+ }
+ /*! Allocate a new scalar register */
+ ir::Register newScalar(Value *value, Value *key = NULL, uint32_t index = 0u, bool uniform = false)
+ {
+ // we don't allow normal constant, but GlobalValue is a special case,
+ // it needs a register to store its address
+ GBE_ASSERT(! (isa<Constant>(value) && !isa<GlobalValue>(value)));
+ Type *type = value->getType();
+ auto typeID = type->getTypeID();
+ switch (typeID) {
+ case Type::IntegerTyID:
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ case Type::PointerTyID:
+ GBE_ASSERT(index == 0);
+ return this->_newScalar(value, key, type, index, uniform);
+ break;
+ case Type::VectorTyID:
+ {
+ auto vectorType = cast<VectorType>(type);
+ auto elementType = vectorType->getElementType();
+ auto elementTypeID = elementType->getTypeID();
+ if (elementTypeID != Type::IntegerTyID &&
+ elementTypeID != Type::FloatTyID &&
+ elementTypeID != Type::DoubleTyID)
+ GBE_ASSERTM(false, "Vectors of elements are not supported");
+ return this->_newScalar(value, key, elementType, index, uniform);
+ break;
+ }
+ default: NOT_SUPPORTED;
+ };
+ return ir::Register();
+ }
+
+ /*! iterating in the value map to get the final real register */
+ void getRealValue(Value* &value, uint32_t& index) {
+ auto end = valueMap.end();
+ for (;;) {
+ auto it = valueMap.find(std::make_pair(value, index));
+ if (it == end)
+ break;
+ else {
+ value = it->second.first;
+ index = it->second.second;
+ }
+ }
+ }
+
+ /*! Get the register from the given value at given index possibly iterating
+ * in the value map to get the final real register
+ */
+ ir::Register getScalar(Value *value, uint32_t index = 0u) {
+ getRealValue(value, index);
+
+ const auto key = std::make_pair(value, index);
+ GBE_ASSERT(scalarMap.find(key) != scalarMap.end());
+ return scalarMap[key];
+ }
+ /*! Insert a given register at given Value position */
+ void insertRegister(const ir::Register ®, Value *value, uint32_t index) {
+ const auto key = std::make_pair(value, index);
+ GBE_ASSERT(scalarMap.find(key) == scalarMap.end());
+ scalarMap[key] = reg;
+ }
+ /*! Says if the value exists. Otherwise, it is undefined */
+ bool valueExists(Value *value, uint32_t index) {
+ getRealValue(value, index);
+
+ const auto key = std::make_pair(value, index);
+ return scalarMap.find(key) != scalarMap.end();
+ }
+ /*! if it's a undef const value, return true. Otherwise, return false. */
+ bool isUndefConst(Value *value, uint32_t index) {
+ getRealValue(value, index);
+
+ Constant *CPV = dyn_cast<Constant>(value);
+ if(CPV && dyn_cast<ConstantVector>(CPV))
+ CPV = extractConstantElem(CPV, index);
+ return (CPV && (isa<UndefValue>(CPV)));
+ }
+ private:
+ /*! This creates a scalar register for a Value (index is the vector index when
+ * the value is a vector of scalars)
+ */
+ ir::Register _newScalar(Value *value, Value *key, Type *type, uint32_t index, bool uniform) {
+ const ir::RegisterFamily family = getFamily(ctx, type);
+ const ir::Register reg = ctx.reg(family, uniform);
+ key = key == NULL ? value : key;
+ this->insertRegister(reg, key, index);
+ return reg;
+ }
+ /*! Map value to ir::Register */
+ map<ValueIndex, ir::Register> scalarMap;
+ /*! Map values to values when this is only a translation (eq bitcast) */
+ map<ValueIndex, ValueIndex> valueMap;
+ /*! Actually allocates the registers */
+ ir::Context &ctx;
+ };
+
+ /*! Translate LLVM IR code to Gen IR code */
+ class GenWriter : public FunctionPass, public InstVisitor<GenWriter>
+ {
+ /*! Unit to compute */
+ ir::Unit &unit;
+ /*! Helper structure to compute the unit */
+ ir::Context ctx;
+ /*! Make the LLVM-to-Gen translation */
+ RegisterTranslator regTranslator;
+ /*! Map target basic block to its ir::LabelIndex */
+ map<const BasicBlock*, ir::LabelIndex> labelMap;
+ /*! Condition inversion can simplify branch code. We store here all the
+ * compare instructions we need to invert to decrease branch complexity
+ */
+ set<const Value*> conditionSet;
+ map<const Value*, int> globalPointer;
+ /*!
+ * <phi,phiCopy> node information for later optimization
+ */
+ map<const ir::Register, const ir::Register> phiMap;
+ /*! We visit each function twice. Once to allocate the registers and once to
+ * emit the Gen IR instructions
+ */
+ enum Pass {
+ PASS_EMIT_REGISTERS = 0,
+ PASS_EMIT_INSTRUCTIONS = 1
+ } pass;
+
+ typedef enum {
+ CONST_INT,
+ CONST_FLOAT,
+ CONST_DOUBLE
+ } ConstTypeId;
+
+ LoopInfo *LI;
+ const Module *TheModule;
+ int btiBase;
+ public:
+ static char ID;
+ explicit GenWriter(ir::Unit &unit)
+ : FunctionPass(ID),
+ unit(unit),
+ ctx(unit),
+ regTranslator(ctx),
+ LI(0),
+ TheModule(0),
+ btiBase(BTI_RESERVED_NUM)
+ {
+ initializeLoopInfoPass(*PassRegistry::getPassRegistry());
+ pass = PASS_EMIT_REGISTERS;
+ }
+
+ virtual const char *getPassName() const { return "Gen Back-End"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<LoopInfo>();
+ AU.setPreservesAll();
+ }
+
+ virtual bool doInitialization(Module &M);
+ /*! helper function for parsing global constant data */
+ void getConstantData(const Constant * c, void* mem, uint32_t& offset) const;
+ void collectGlobalConstant(void) const;
+ ir::ImmediateIndex processConstantImmIndex(Constant *CPV, int32_t index = 0u);
+ const ir::Immediate &processConstantImm(Constant *CPV, int32_t index = 0u);
+
+ uint32_t incBtiBase() {
+ GBE_ASSERT(btiBase <= BTI_MAX_ID);
+ return btiBase++;
+ }
+
+ bool runOnFunction(Function &F) {
+ // Do not codegen any 'available_externally' functions at all, they have
+ // definitions outside the translation unit.
+ if (F.hasAvailableExternallyLinkage())
+ return false;
+
+ // As we inline all function calls, so skip non-kernel functions
+ bool bKernel = isKernelFunction(F);
+ if(!bKernel) return false;
+
+ LI = &getAnalysis<LoopInfo>();
+ emitFunction(F);
+ phiMap.clear();
+ globalPointer.clear();
+ // Reset for next function
+ btiBase = BTI_RESERVED_NUM;
+ return false;
+ }
+
+ virtual bool doFinalization(Module &M) { return false; }
+ /*! handle global variable register allocation (local, constant space) */
+ void allocateGlobalVariableRegister(Function &F);
+ /*! gather all the loops in the function and add them to ir::Function */
+ void gatherLoopInfo(ir::Function &fn);
+ /*! Emit the complete function code and declaration */
+ void emitFunction(Function &F);
+ /*! Handle input and output function parameters */
+ void emitFunctionPrototype(Function &F);
+ /*! Emit the code for a basic block */
+ void emitBasicBlock(BasicBlock *BB);
+ /*! Each block end may require to emit MOVs for further PHIs */
+ void emitMovForPHI(BasicBlock *curr, BasicBlock *succ);
+ /*! Alocate one or several registers (if vector) for the value */
+ INLINE void newRegister(Value *value, Value *key = NULL, bool uniform = false);
+ /*! get the register for a llvm::Constant */
+ ir::Register getConstantRegister(Constant *c, uint32_t index = 0);
+ /*! get constant pointer */
+ ir::Register getConstantPointerRegister(ConstantExpr *ce, uint32_t index = 0);
+ /*! Return a valid register from an operand (can use LOADI to make one) */
+ INLINE ir::Register getRegister(Value *value, uint32_t index = 0);
+ /*! Create a new immediate from a constant */
+ ir::ImmediateIndex newImmediate(Constant *CPV, uint32_t index = 0);
+ /*! Insert a new label index when this is a scalar value */
+ INLINE void newLabelIndex(const BasicBlock *bb);
+ /*! Inspect the terminator instruction and try to see if we should invert
+ * the value to simplify the code
+ */
+ INLINE void simplifyTerminator(BasicBlock *bb);
+ /*! Helper function to emit loads and stores */
+ template <bool isLoad, typename T> void emitLoadOrStore(T &I);
+ /*! Will try to remove MOVs due to PHI resolution */
+ void removeMOVs(const ir::Liveness &liveness, ir::Function &fn);
+ /*! Optimize phi move based on liveness information */
+ void optimizePhiCopy(ir::Liveness &liveness, ir::Function &fn);
+ /*! Will try to remove redundants LOADI in basic blocks */
+ void removeLOADIs(const ir::Liveness &liveness, ir::Function &fn);
+ /*! To avoid lost copy, we need two values for PHI. This function create a
+ * fake value for the copy (basically ptr+1)
+ */
+ INLINE Value *getPHICopy(Value *PHI);
+ // Currently supported instructions
+#define DECL_VISIT_FN(NAME, TYPE) \
+ void regAllocate##NAME(TYPE &I); \
+ void emit##NAME(TYPE &I); \
+ void visit##NAME(TYPE &I) { \
+ if (pass == PASS_EMIT_INSTRUCTIONS) \
+ emit##NAME(I); \
+ else \
+ regAllocate##NAME(I); \
+ }
+ DECL_VISIT_FN(BinaryOperator, Instruction);
+ DECL_VISIT_FN(CastInst, CastInst);
+ DECL_VISIT_FN(ReturnInst, ReturnInst);
+ DECL_VISIT_FN(LoadInst, LoadInst);
+ DECL_VISIT_FN(StoreInst, StoreInst);
+ DECL_VISIT_FN(CallInst, CallInst);
+ DECL_VISIT_FN(ICmpInst, ICmpInst);
+ DECL_VISIT_FN(FCmpInst, FCmpInst);
+ DECL_VISIT_FN(InsertElement, InsertElementInst);
+ DECL_VISIT_FN(ExtractElement, ExtractElementInst);
+ DECL_VISIT_FN(ShuffleVectorInst, ShuffleVectorInst);
+ DECL_VISIT_FN(SelectInst, SelectInst);
+ DECL_VISIT_FN(BranchInst, BranchInst);
+ DECL_VISIT_FN(PHINode, PHINode);
+ DECL_VISIT_FN(AllocaInst, AllocaInst);
+#undef DECL_VISIT_FN
+
+ // Emit unary instructions from gen native function
+ void emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode);
+ // Emit unary instructions from gen native function
+ void emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode);
+
+ uint8_t appendSampler(CallSite::arg_iterator AI);
+
+ // These instructions are not supported at all
+ void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
+ void visitSwitchInst(SwitchInst &I) {NOT_SUPPORTED;}
+ void visitInvokeInst(InvokeInst &I) {NOT_SUPPORTED;}
+#if LLVM_VERSION_MINOR == 0
+ void visitUnwindInst(UnwindInst &I) {NOT_SUPPORTED;}
+#endif /* __LLVM_30__ */
+ void visitResumeInst(ResumeInst &I) {NOT_SUPPORTED;}
+ void visitInlineAsm(CallInst &I) {NOT_SUPPORTED;}
+ void visitIndirectBrInst(IndirectBrInst &I) {NOT_SUPPORTED;}
+ void visitUnreachableInst(UnreachableInst &I) {NOT_SUPPORTED;}
+ void visitGetElementPtrInst(GetElementPtrInst &I) {NOT_SUPPORTED;}
+ void visitInsertValueInst(InsertValueInst &I) {NOT_SUPPORTED;}
+ void visitExtractValueInst(ExtractValueInst &I) {NOT_SUPPORTED;}
+ template <bool isLoad, typename T> void visitLoadOrStore(T &I);
+
+ INLINE void gatherBTI(Value *pointer, ir::BTI &bti);
+ // batch vec4/8/16 load/store
+ INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
+ Value *llvmValue, const ir::Register ptr,
+ const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti);
+ void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
+ private:
+ ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
+ template <typename T, typename P = T>
+ ir::ImmediateIndex processSeqConstant(ConstantDataSequential *seq,
+ int index, ConstTypeId tid);
+ ir::ImmediateIndex processConstantVector(ConstantVector *cv, int index);
+ };
+
+ char GenWriter::ID = 0;
+ void getSequentialData(const ConstantDataSequential *cda, void *ptr, uint32_t &offset) {
+ StringRef data = cda->getRawDataValues();
+ memcpy((char*)ptr+offset, data.data(), data.size());
+ offset += data.size();
+ return;
+ }
+
+ void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset) const {
+ Type * type = c->getType();
+ Type::TypeID id = type->getTypeID();
+
+ GBE_ASSERT(c);
+ if(isa<UndefValue>(c)) {
+ uint32_t size = getTypeByteSize(unit, type);
+ offset += size;
+ return;
+ } else if(isa<ConstantAggregateZero>(c)) {
+ uint32_t size = getTypeByteSize(unit, type);
+ memset((char*)mem+offset, 0, size);
+ offset += size;
+ return;
+ }
+
+ switch(id) {
+ case Type::TypeID::StructTyID:
+ {
+ const StructType * strTy = cast<StructType>(c->getType());
+ uint32_t size = 0;
+
+ for(uint32_t op=0; op < strTy->getNumElements(); op++)
+ {
+ Type* elementType = strTy->getElementType(op);
+ uint32_t align = 8 * getAlignmentByte(unit, elementType);
+ uint32_t padding = getPadding(size, align);
+ size += padding;
+ size += getTypeBitSize(unit, elementType);
+
+ offset += padding/8;
+ const Constant* sub = cast<Constant>(c->getOperand(op));
+ GBE_ASSERT(sub);
+ getConstantData(sub, mem, offset);
+ }
+ break;
+ }
+ case Type::TypeID::ArrayTyID:
+ {
+ const ConstantDataSequential *cds = dyn_cast<ConstantDataSequential>(c);
+ if(cds)
+ getSequentialData(cds, mem, offset);
+ else {
+ const ConstantArray *ca = dyn_cast<ConstantArray>(c);
+ const ArrayType *arrTy = ca->getType();
+ Type* elemTy = arrTy->getElementType();
+ uint32_t elemSize = getTypeBitSize(unit, elemTy);
+ uint32_t padding = getPadding(elemSize, 8 * getAlignmentByte(unit, elemTy));
+ padding /= 8;
+ uint32_t ops = c->getNumOperands();
+ for(uint32_t op = 0; op < ops; ++op) {
+ Constant * ca = dyn_cast<Constant>(c->getOperand(op));
+ getConstantData(ca, mem, offset);
+ offset += padding;
+ }
+ }
+ break;
+ }
+ case Type::TypeID::VectorTyID:
+ {
+ const ConstantDataSequential *cds = dyn_cast<ConstantDataSequential>(c);
+ const VectorType *vecTy = cast<VectorType>(type);
+ GBE_ASSERT(cds);
+ getSequentialData(cds, mem, offset);
+ if(vecTy->getNumElements() == 3) // OCL spec require align to vec4
+ offset += getTypeByteSize(unit, vecTy->getElementType());
+ break;
+ }
+ case Type::TypeID::IntegerTyID:
+ {
+ const ConstantInt *ci = dyn_cast<ConstantInt>(c);
+ uint32_t size = ci->getBitWidth() / 8;
+ uint64_t data = ci->isNegative() ? ci->getSExtValue() : ci->getZExtValue();
+ memcpy((char*)mem+offset, &data, size);
+ offset += size;
+ break;
+ }
+ case Type::TypeID::FloatTyID:
+ {
+ const ConstantFP *cf = dyn_cast<ConstantFP>(c);
+ *(float *)((char*)mem + offset) = cf->getValueAPF().convertToFloat();
+ offset += sizeof(float);
+ break;
+ }
+ case Type::TypeID::DoubleTyID:
+ {
+ const ConstantFP *cf = dyn_cast<ConstantFP>(c);
+ *(double *)((char*)mem + offset) = cf->getValueAPF().convertToDouble();
+ offset += sizeof(double);
+ break;
+ }
+ default:
+ NOT_IMPLEMENTED;
+ }
+ }
+
+ void GenWriter::collectGlobalConstant(void) const {
+ const Module::GlobalListType &globalList = TheModule->getGlobalList();
+ for(auto i = globalList.begin(); i != globalList.end(); i ++) {
+ const GlobalVariable &v = *i;
+ if(!v.isConstantUsed()) continue;
+ const char *name = v.getName().data();
+ unsigned addrSpace = v.getType()->getAddressSpace();
+ if(addrSpace == ir::AddressSpace::MEM_CONSTANT) {
+ GBE_ASSERT(v.hasInitializer());
+ const Constant *c = v.getInitializer();
+ Type * type = c->getType();
+
+ uint32_t size = getTypeByteSize(unit, type);
+ void* mem = malloc(size);
+ uint32_t offset = 0;
+ getConstantData(c, mem, offset);
+ uint32_t alignment = getAlignmentByte(unit, type);
+ unit.newConstant((char *)mem, name, size, alignment);
+ free(mem);
+ }
+ }
+ }
+
+ bool GenWriter::doInitialization(Module &M) {
+ FunctionPass::doInitialization(M);
+
+ // Initialize
+ TheModule = &M;
+ collectGlobalConstant();
+ return false;
+ }
+
+ #define GET_EFFECT_DATA(_seq, _index, _tid) \
+ ((_tid == CONST_INT) ? _seq->getElementAsInteger(_index) : \
+ ((_tid == CONST_FLOAT) ? _seq->getElementAsFloat(_index) : \
+ _seq->getElementAsDouble(_index)))
+
+ // typename P is for bool only, as c++ set the &vector<bool)vec[0] to void
+ // type. We have to use uint8_t for bool vector.
+ template <typename T, typename P>
+ ir::ImmediateIndex GenWriter::processSeqConstant(ConstantDataSequential *seq,
+ int index, ConstTypeId tid) {
+ if (index >= 0) {
+ const T data = GET_EFFECT_DATA(seq, index, tid);
+ return ctx.newImmediate(data);
+ } else {
+ vector<P> array;
+ for(int i = 0; i < seq->getNumElements(); i++)
+ array.push_back(GET_EFFECT_DATA(seq, i, tid));
+ return ctx.newImmediate((T*)&array[0], array.size());
+ }
+ }
+
+ ir::ImmediateIndex GenWriter::processConstantVector(ConstantVector *cv, int index) {
+ if (index >= 0) {
+ Constant *c = cv->getOperand(index);
+ return processConstantImmIndex(c, -1);
+ } else {
+ vector<ir::ImmediateIndex> immVector;
+ for (uint32_t i = 0; i < cv->getNumOperands(); i++)
+ immVector.push_back(processConstantImmIndex(cv->getOperand(i)));
+ return ctx.newImmediate(immVector);
+ }
+ }
+
+ ir::ImmediateIndex GenWriter::processConstantImmIndexImpl(Constant *CPV, int32_t index)
+ {
+ GBE_ASSERT(dyn_cast<ConstantExpr>(CPV) == NULL);
+
+#if LLVM_VERSION_MINOR > 0
+ ConstantDataSequential *seq = dyn_cast<ConstantDataSequential>(CPV);
+
+ if (seq) {
+ Type *Ty = seq->getElementType();
+ if (Ty == Type::getInt1Ty(CPV->getContext())) {
+ return processSeqConstant<bool, uint8_t>(seq, index, CONST_INT);
+ } else if (Ty == Type::getInt8Ty(CPV->getContext())) {
+ return processSeqConstant<uint8_t>(seq, index, CONST_INT);
+ } else if (Ty == Type::getInt16Ty(CPV->getContext())) {
+ return processSeqConstant<uint16_t>(seq, index, CONST_INT);
+ } else if (Ty == Type::getInt32Ty(CPV->getContext())) {
+ return processSeqConstant<uint32_t>(seq, index, CONST_INT);
+ } else if (Ty == Type::getInt64Ty(CPV->getContext())) {
+ return processSeqConstant<uint64_t>(seq, index, CONST_INT);
+ } else if (Ty == Type::getFloatTy(CPV->getContext())) {
+ return processSeqConstant<float>(seq, index, CONST_FLOAT);
+ } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
+ return processSeqConstant<double>(seq, index, CONST_DOUBLE);
+ }
+ } else
+#endif /* LLVM_VERSION_MINOR > 0 */
+
+ if (dyn_cast<ConstantAggregateZero>(CPV)) {
+ Type* Ty = CPV->getType();
+ if(Ty->isVectorTy())
+ Ty = (cast<VectorType>(Ty))->getElementType();
+ if (Ty == Type::getInt1Ty(CPV->getContext())) {
+ const bool b = 0;
+ return ctx.newImmediate(b);
+ } else if (Ty == Type::getInt8Ty(CPV->getContext())) {
+ const uint8_t u8 = 0;
+ return ctx.newImmediate(u8);
+ } else if (Ty == Type::getInt16Ty(CPV->getContext())) {
+ const uint16_t u16 = 0;
+ return ctx.newImmediate(u16);
+ } else if (Ty == Type::getInt32Ty(CPV->getContext())) {
+ const uint32_t u32 = 0;
+ return ctx.newImmediate(u32);
+ } else if (Ty == Type::getInt64Ty(CPV->getContext())) {
+ const uint64_t u64 = 0;
+ return ctx.newImmediate(u64);
+ } else if (Ty == Type::getFloatTy(CPV->getContext())) {
+ const float f32 = 0;
+ return ctx.newImmediate(f32);
+ } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
+ const double f64 = 0;
+ return ctx.newImmediate(f64);
+ } else {
+ GBE_ASSERTM(false, "Unsupporte aggregate zero type.");
+ return ctx.newImmediate(uint32_t(0));
+ }
+ } else {
+ if (dyn_cast<ConstantVector>(CPV))
+ return processConstantVector(dyn_cast<ConstantVector>(CPV), index);
+ GBE_ASSERTM(dyn_cast<ConstantExpr>(CPV) == NULL, "Unsupported constant expression");
+
+ // Integers
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+ Type* Ty = CI->getType();
+ if (Ty == Type::getInt1Ty(CPV->getContext())) {
+ const bool b = CI->getZExtValue();
+ return ctx.newImmediate(b);
+ } else if (Ty == Type::getInt8Ty(CPV->getContext())) {
+ const uint8_t u8 = CI->getZExtValue();
+ return ctx.newImmediate(u8);
+ } else if (Ty == Type::getInt16Ty(CPV->getContext())) {
+ const uint16_t u16 = CI->getZExtValue();
+ return ctx.newImmediate(u16);
+ } else if (Ty == Type::getInt32Ty(CPV->getContext())) {
+ const uint32_t u32 = CI->getZExtValue();
+ return ctx.newImmediate(u32);
+ } else if (Ty == Type::getInt64Ty(CPV->getContext())) {
+ const uint64_t u64 = CI->getZExtValue();
+ return ctx.newImmediate(u64);
+ } else {
+ if (CI->getValue().getActiveBits() > 64) {
+ ctx.getUnit().setValid(false);
+ return ctx.newImmediate(uint64_t(0));
+ }
+ return ctx.newImmediate(uint64_t(CI->getZExtValue()));
+ }
+ }
+
+ // NULL pointers
+ if(isa<ConstantPointerNull>(CPV)) {
+ return ctx.newImmediate(uint32_t(0));
+ }
+
+ const Type::TypeID typeID = CPV->getType()->getTypeID();
+ if (isa<UndefValue>(CPV)) {
+ Type* Ty = CPV->getType();
+ if (Ty == Type::getInt1Ty(CPV->getContext())) return ctx.newImmediate(false);
+ if (Ty == Type::getInt8Ty(CPV->getContext())) return ctx.newImmediate((uint8_t)0);
+ if (Ty == Type::getInt16Ty(CPV->getContext())) return ctx.newImmediate((uint16_t)0);
+ if (Ty == Type::getInt32Ty(CPV->getContext())) return ctx.newImmediate((uint32_t)0);
+ if (Ty == Type::getInt64Ty(CPV->getContext())) return ctx.newImmediate((uint64_t)0);
+ if (Ty == Type::getFloatTy(CPV->getContext())) return ctx.newImmediate((float)0);
+ if (Ty == Type::getDoubleTy(CPV->getContext())) return ctx.newImmediate((double)0);
+ GBE_ASSERT(0 && "Unsupported undef value type.\n");
+ }
+
+ // Floats and doubles
+ switch (typeID) {
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ {
+ ConstantFP *FPC = cast<ConstantFP>(CPV);
+ GBE_ASSERT(isa<UndefValue>(CPV) == false);
+
+ if (FPC->getType() == Type::getFloatTy(CPV->getContext())) {
+ const float f32 = FPC->getValueAPF().convertToFloat();
+ return ctx.newImmediate(f32);
+ } else {
+ const double f64 = FPC->getValueAPF().convertToDouble();
+ return ctx.newImmediate(f64);
+ }
+ }
+ break;
+ default:
+ GBE_ASSERTM(false, "Unsupported constant type");
+ break;
+ }
+ }
+
+ GBE_ASSERTM(false, "Unsupported constant type");
+ return ctx.newImmediate(uint64_t(0));
+ }
+
+ ir::ImmediateIndex GenWriter::processConstantImmIndex(Constant *CPV, int32_t index) {
+ if (dyn_cast<ConstantExpr>(CPV) == NULL)
+ return processConstantImmIndexImpl(CPV, index);
+
+ if (dyn_cast<ConstantExpr>(CPV)) {
+ ConstantExpr *ce = dyn_cast<ConstantExpr>(CPV);
+ ir::Type type = getType(ctx, ce->getType());
+ switch (ce->getOpcode()) {
+ default:
+ //ce->dump();
+ GBE_ASSERT(0 && "unsupported ce opcode.\n");
+ case Instruction::Trunc:
+ {
+ const ir::ImmediateIndex immIndex = processConstantImmIndex(ce->getOperand(0), -1);
+ return ctx.processImm(ir::IMM_TRUNC, immIndex, type);
+ }
+ case Instruction::BitCast:
+ {
+ const ir::ImmediateIndex immIndex = processConstantImmIndex(ce->getOperand(0), -1);
+ if (type == ir::TYPE_LARGE_INT)
+ return immIndex;
+ return ctx.processImm(ir::IMM_BITCAST, immIndex, type);
+ }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::Shl:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ const ir::ImmediateIndex lhs = processConstantImmIndex(ce->getOperand(0), -1);
+ const ir::ImmediateIndex rhs = processConstantImmIndex(ce->getOperand(1), -1);
+ switch (ce->getOpcode()) {
+ default:
+ //ce->dump();
+ GBE_ASSERTM(0, "Unsupported constant expression.\n");
+ case Instruction::Add:
+ return ctx.processImm(ir::IMM_ADD, lhs, rhs, type);
+ case Instruction::Sub:
+ return ctx.processImm(ir::IMM_SUB, lhs, rhs, type);
+ case Instruction::Mul:
+ return ctx.processImm(ir::IMM_MUL, lhs, rhs, type);
+ case Instruction::SDiv:
+ return ctx.processImm(ir::IMM_DIV, lhs, rhs, type);
+ case Instruction::SRem:
+ return ctx.processImm(ir::IMM_REM, lhs, rhs, type);
+ case Instruction::Shl:
+ return ctx.processImm(ir::IMM_SHL, lhs, rhs, type);
+ case Instruction::AShr:
+ return ctx.processImm(ir::IMM_ASHR, lhs, rhs, type);
+ case Instruction::LShr:
+ return ctx.processImm(ir::IMM_LSHR, lhs, rhs, type);
+ case Instruction::And:
+ return ctx.processImm(ir::IMM_AND, lhs, rhs, type);
+ case Instruction::Or:
+ return ctx.processImm(ir::IMM_OR, lhs, rhs, type);
+ case Instruction::Xor:
+ return ctx.processImm(ir::IMM_XOR, lhs, rhs, type);
+ }
+ }
+ }
+ }
+ GBE_ASSERT(0 && "unsupported constant.\n");
+ return ctx.newImmediate((uint32_t)0);
+ }
+
+ const ir::Immediate &GenWriter::processConstantImm(Constant *CPV, int32_t index) {
+ ir::ImmediateIndex immIndex = processConstantImmIndex(CPV, index);
+ return ctx.getFunction().getImmediate(immIndex);
+ }
+
+ ir::ImmediateIndex GenWriter::newImmediate(Constant *CPV, uint32_t index) {
+ return processConstantImmIndex(CPV, index);
+ }
+
+ void GenWriter::newRegister(Value *value, Value *key, bool uniform) {
+ auto type = value->getType();
+ auto typeID = type->getTypeID();
+ switch (typeID) {
+ case Type::IntegerTyID:
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ case Type::PointerTyID:
+ regTranslator.newScalar(value, key, 0, uniform);
+ break;
+ case Type::VectorTyID:
+ {
+ auto vectorType = cast<VectorType>(type);
+ const uint32_t elemNum = vectorType->getNumElements();
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+ regTranslator.newScalar(value, key, elemID, uniform);
+ break;
+ }
+ default: NOT_SUPPORTED;
+ };
+ }
+
+ ir::Register GenWriter::getConstantPointerRegister(ConstantExpr *expr, uint32_t elemID) {
+ Value* val = expr->getOperand(0);
+
+ if (expr->isCast()) {
+ ir::Register pointer_reg;
+ if(isa<ConstantExpr>(val)) {
+ // try to get the real pointer register, for case like:
+ // store i64 ptrtoint (i8 addrspace(3)* getelementptr inbounds ...
+ // in which ptrtoint and getelementptr are ConstantExpr.
+ pointer_reg = getConstantPointerRegister(dyn_cast<ConstantExpr>(val), elemID);
+ } else {
+ pointer_reg = regTranslator.getScalar(val, elemID);
+ }
+ // if ptrToInt request another type other than 32bit, convert as requested
+ ir::Type dstType = getType(ctx, expr->getType());
+ ir::Type srcType = getType(ctx, val->getType());
+ if(srcType != dstType && dstType != ir::TYPE_S32) {
+ ir::Register tmp = ctx.reg(getFamily(dstType));
+ ctx.CVT(dstType, srcType, tmp, pointer_reg);
+ return tmp;
+ }
+ return pointer_reg;
+ }
+ else if (expr->getOpcode() == Instruction::GetElementPtr) {
+ uint32_t TypeIndex;
+ uint32_t constantOffset = 0;
+
+ Value *pointer = val;
+ CompositeType* CompTy = cast<CompositeType>(pointer->getType());
+ for(uint32_t op=1; op<expr->getNumOperands(); ++op) {
+ uint32_t offset = 0;
+ ConstantInt* ConstOP = dyn_cast<ConstantInt>(expr->getOperand(op));
+ GBE_ASSERT(ConstOP);
+ TypeIndex = ConstOP->getZExtValue();
+ if (op == 1) {
+ if (TypeIndex != 0) {
+ Type *elementType = (cast<PointerType>(pointer->getType()))->getElementType();
+ uint32_t elementSize = getTypeByteSize(unit, elementType);
+ uint32_t align = getAlignmentByte(unit, elementType);
+ elementSize += getPadding(elementSize, align);
+ offset += elementSize * TypeIndex;
+ }
+ } else {
+ for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
+ {
+ Type* elementType = CompTy->getTypeAtIndex(ty_i);
+ uint32_t align = getAlignmentByte(unit, elementType);
+ offset += getPadding(offset, align);
+ offset += getTypeByteSize(unit, elementType);
+ }
+ const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
+ offset += getPadding(offset, align);
+ }
+
+ constantOffset += offset;
+ CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+ }
+
+ ir::Register pointer_reg;
+ if(isa<ConstantExpr>(pointer))
+ pointer_reg = getConstantPointerRegister(dyn_cast<ConstantExpr>(pointer), elemID);
+ else
+ pointer_reg = regTranslator.getScalar(pointer, elemID);
+
+ ir::Register offset_reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
+ ctx.LOADI(ir::Type::TYPE_S32, offset_reg, ctx.newIntegerImmediate(constantOffset, ir::Type::TYPE_S32));
+ ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
+ ctx.ADD(ir::Type::TYPE_S32, reg, pointer_reg, offset_reg);
+ return reg;
+ }
+ else
+ assert(0);
+ }
+
+ ir::Register GenWriter::getConstantRegister(Constant *c, uint32_t elemID) {
+ GBE_ASSERT(c != NULL);
+ if(isa<GlobalValue>(c)) {
+ return regTranslator.getScalar(c, elemID);
+ }
+ if(isa<UndefValue>(c)) {
+ Type* llvmType = c->getType();
+ ir::Type dstType = getType(ctx, llvmType);
+ ir::Register reg = ctx.reg(getFamily(dstType));
+
+ ir::ImmediateIndex immIndex;
+ if(llvmType->isIntegerTy())
+ immIndex = ctx.newIntegerImmediate(0, dstType);
+ else if(llvmType->isFloatTy()) {
+ immIndex = ctx.newFloatImmediate((float)0.0);
+ } else {
+ immIndex = ctx.newDoubleImmediate((double)0.0);
+ }
+ ctx.LOADI(dstType, reg, immIndex);
+ return reg;
+ }
+
+ if(isa<ConstantExpr>(c)) {
+ // Check whether this is a constant drived from a pointer.
+ Constant *itC = c;
+ while(isa<ConstantExpr>(itC))
+ itC = dyn_cast<ConstantExpr>(itC)->getOperand(0);
+ if (itC->getType()->isPointerTy())
+ return getConstantPointerRegister(dyn_cast<ConstantExpr>(c), elemID);
+ }
+
+ const ir::ImmediateIndex immIndex = this->newImmediate(c, elemID);
+ const ir::Immediate imm = ctx.getImmediate(immIndex);
+ const ir::Register reg = ctx.reg(getFamily(imm.getType()));
+ ctx.LOADI(imm.getType(), reg, immIndex);
+ return reg;
+ }
+
+ ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) {
+ //the real value may be constant, so get real value before constant check
+ regTranslator.getRealValue(value, elemID);
+ if(isa<Constant>(value)) {
+ Constant *c = dyn_cast<Constant>(value);
+ return getConstantRegister(c, elemID);
+ } else
+ return regTranslator.getScalar(value, elemID);
+ }
+
+ INLINE Value *GenWriter::getPHICopy(Value *PHI) {
+ const uintptr_t ptr = (uintptr_t) PHI;
+ return (Value*) (ptr+1);
+ }
+
+ void GenWriter::newLabelIndex(const BasicBlock *bb) {
+ if (labelMap.find(bb) == labelMap.end()) {
+ const ir::LabelIndex label = ctx.label();
+ labelMap[bb] = label;
+ }
+ }
+
+ void GenWriter::simplifyTerminator(BasicBlock *bb) {
+ Value *value = --bb->end();
+ BranchInst *I = NULL;
+ if ((I = dyn_cast<BranchInst>(value)) != NULL) {
+ if (I->isConditional() == false)
+ return;
+ // If the "taken" successor is the next block, we try to invert the
+ // branch.
+ BasicBlock *succ = I->getSuccessor(0);
+ if (std::next(Function::iterator(bb)) != Function::iterator(succ))
+ return;
+
+ // More than one use is too complicated: we skip it
+ Value *condition = I->getCondition();
+ if (condition->hasOneUse() == false)
+ return;
+
+ // Right now, we only invert comparison instruction
+ ICmpInst *CI = dyn_cast<ICmpInst>(condition);
+ if (CI != NULL) {
+ GBE_ASSERT(conditionSet.find(CI) == conditionSet.end());
+ conditionSet.insert(CI);
+ return;
+ }
+ }
+ }
+
+ void GenWriter::emitBasicBlock(BasicBlock *BB) {
+ GBE_ASSERT(labelMap.find(BB) != labelMap.end());
+ ctx.LABEL(labelMap[BB]);
+ for (auto II = BB->begin(), E = BB->end(); II != E; ++II) visit(*II);
+ }
+
+ void GenWriter::emitMovForPHI(BasicBlock *curr, BasicBlock *succ) {
+ for (BasicBlock::iterator I = succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ Value *IV = PN->getIncomingValueForBlock(curr);
+ Type *llvmType = PN->getType();
+ const ir::Type type = getType(ctx, llvmType);
+ Value *PHICopy = this->getPHICopy(PN);
+ const ir::Register dst = this->getRegister(PHICopy);
+ if (!isa<UndefValue>(IV)) {
+
+ // Emit the MOV required by the PHI function. We do it simple and do not
+ // try to optimize them. A next data flow analysis pass on the Gen IR
+ // will remove them
+ Constant *CP = dyn_cast<Constant>(IV);
+ if (CP) {
+ GBE_ASSERT(isa<GlobalValue>(CP) == false);
+ ConstantVector *CPV = dyn_cast<ConstantVector>(CP);
+ if (CPV && dyn_cast<ConstantVector>(CPV) &&
+ isa<UndefValue>(extractConstantElem(CPV, 0)))
+ continue;
+ ctx.MOV(type, dst, getRegister(CP));
+ } else if (regTranslator.valueExists(IV,0) || dyn_cast<Constant>(IV)) {
+ const ir::Register src = this->getRegister(IV);
+ ctx.MOV(type, dst, src);
+ }
+ assert(!ctx.getBlock()->undefPhiRegs.contains(dst));
+ ctx.getBlock()->definedPhiRegs.insert(dst);
+ } else {
+ // If this is an undefined value, we don't need emit phi copy here.
+ // But we need to record it. As latter, at liveness's backward analysis,
+ // we don't need to pass the phi value/register to this BB which the phi
+ // value is undefined. Otherwise, the phi value's liveness will be extent
+ // incorrectly and may be extent to the basic block zero which is really bad.
+ ctx.getBlock()->undefPhiRegs.insert(dst);
+ }
+ }
+ }
+
+ void GenWriter::emitFunctionPrototype(Function &F)
+ {
+ GBE_ASSERTM(F.hasStructRetAttr() == false,
+ "Returned value for kernel functions is forbidden");
+
+ // Loop over the kernel metadatas to set the required work group size.
+ NamedMDNode *clKernelMetaDatas = TheModule->getNamedMetadata("opencl.kernels");
+ size_t reqd_wg_sz[3] = {0, 0, 0};
+ size_t hint_wg_sz[3] = {0, 0, 0};
+ ir::FunctionArgument::InfoFromLLVM llvmInfo;
+ MDNode *node = NULL;
+ MDNode *addrSpaceNode = NULL;
+ MDNode *typeNameNode = NULL;
+ MDNode *accessQualNode = NULL;
+ MDNode *typeQualNode = NULL;
+ MDNode *argNameNode = NULL;
+
+ std::string functionAttributes;
+
+ /* First find the meta data belong to this function. */
+ for(uint i = 0; i < clKernelMetaDatas->getNumOperands(); i++) {
+ node = clKernelMetaDatas->getOperand(i);
+ if (node->getOperand(0) == &F) break;
+ node = NULL;
+ }
+
+ /* because "-cl-kernel-arg-info", should always have meta data. */
+ if (!F.arg_empty())
+ assert(node);
+
+
+ for(uint j = 0; j < node->getNumOperands() - 1; j++) {
+ MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
+ if (attrNode == NULL) break;
+ MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
+ if (!attrName) continue;
+
+ if (attrName->getString() == "reqd_work_group_size") {
+ GBE_ASSERT(attrNode->getNumOperands() == 4);
+ ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
+ ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
+ ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
+ GBE_ASSERT(x && y && z);
+ reqd_wg_sz[0] = x->getZExtValue();
+ reqd_wg_sz[1] = y->getZExtValue();
+ reqd_wg_sz[2] = z->getZExtValue();
+ functionAttributes += attrName->getString();
+ std::stringstream param;
+ char buffer[100];
+ param <<"(";
+ param << reqd_wg_sz[0];
+ param << ",";
+ param << reqd_wg_sz[1];
+ param << ",";
+ param << reqd_wg_sz[2];
+ param <<")";
+ param >> buffer;
+ functionAttributes += buffer;
+ functionAttributes += " ";
+ break;
+ } else if (attrName->getString() == "kernel_arg_addr_space") {
+ addrSpaceNode = attrNode;
+ } else if (attrName->getString() == "kernel_arg_access_qual") {
+ accessQualNode = attrNode;
+ } else if (attrName->getString() == "kernel_arg_type") {
+ typeNameNode = attrNode;
+ } else if (attrName->getString() == "kernel_arg_type_qual") {
+ typeQualNode = attrNode;
+ } else if (attrName->getString() == "kernel_arg_name") {
+ argNameNode = attrNode;
+ } else if (attrName->getString() == "vec_type_hint") {
+ GBE_ASSERT(attrNode->getNumOperands() == 3);
+ functionAttributes += attrName->getString();
+ functionAttributes += " ";
+ } else if (attrName->getString() == "work_group_size_hint") {
+ GBE_ASSERT(attrNode->getNumOperands() == 4);
+ ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
+ ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
+ ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
+ GBE_ASSERT(x && y && z);
+ hint_wg_sz[0] = x->getZExtValue();
+ hint_wg_sz[1] = y->getZExtValue();
+ hint_wg_sz[2] = z->getZExtValue();
+ functionAttributes += attrName->getString();
+ std::stringstream param;
+ char buffer[100];
+ param <<"(";
+ param << hint_wg_sz[0];
+ param << ",";
+ param << hint_wg_sz[1];
+ param << ",";
+ param << hint_wg_sz[2];
+ param <<")";
+ param >> buffer;
+ functionAttributes += buffer;
+ functionAttributes += " ";
+ }
+ }
+ ctx.appendSurface(1, ir::ocl::stackbuffer);
+
+ ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
+
+ ctx.getFunction().setFunctionAttributes(functionAttributes);
+ // Loop over the arguments and output registers for them
+ if (!F.arg_empty()) {
+ uint32_t argID = 0;
+ Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+
+ // Insert a new register for each function argument
+#if LLVM_VERSION_MINOR <= 1
+ const AttrListPtr &PAL = F.getAttributes();
+#endif /* LLVM_VERSION_MINOR <= 1 */
+ for (; I != E; ++I, ++argID) {
+ const std::string &argName = I->getName().str();
+ Type *type = I->getType();
+
+ llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
+ llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
+ if (llvmInfo.typeName.find("image") != std::string::npos &&
+ llvmInfo.typeName.find("*") != std::string::npos) {
+ uint32_t start = llvmInfo.typeName.find("image");
+ uint32_t end = llvmInfo.typeName.find("*");
+ llvmInfo.typeName = llvmInfo.typeName.substr(start, end - start);
+ }
+ llvmInfo.accessQual = (cast<MDString>(accessQualNode->getOperand(1 + argID)))->getString();
+ llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(1 + argID)))->getString();
+ llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(1 + argID)))->getString();
+
+ // function arguments are uniform values.
+ this->newRegister(I, NULL, true);
+ // add support for vector argument.
+ if(type->isVectorTy()) {
+ VectorType *vectorType = cast<VectorType>(type);
+ ir::Register reg = getRegister(I, 0);
+ Type *elemType = vectorType->getElementType();
+ const uint32_t elemSize = getTypeByteSize(unit, elemType);
+ const uint32_t elemNum = vectorType->getNumElements();
+ //vector's elemType always scalar type
+ ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, elemNum*elemSize, getAlignmentByte(unit, type), 0);
+
+ ir::Function& fn = ctx.getFunction();
+ for(uint32_t i=1; i < elemNum; i++) {
+ ir::PushLocation argLocation(fn, argID, elemSize*i);
+ reg = getRegister(I, i);
+ ctx.appendPushedConstant(reg, argLocation); //add to push map for reg alloc
+ }
+ continue;
+ }
+
+ GBE_ASSERTM(isScalarType(type) == true,
+ "vector type in the function argument is not supported yet");
+ const ir::Register reg = getRegister(I);
+ if (type->isPointerTy() == false)
+ ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
+ else {
+ PointerType *pointerType = dyn_cast<PointerType>(type);
+ Type *pointed = pointerType->getElementType();
+ // By value structure
+#if LLVM_VERSION_MINOR <= 1
+ if (PAL.paramHasAttr(argID+1, Attribute::ByVal)) {
+#else
+ if (I->hasByValAttr()) {
+#endif /* LLVM_VERSION_MINOR <= 1 */
+ const size_t structSize = getTypeByteSize(unit, pointed);
+ ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, llvmInfo, structSize, getAlignmentByte(unit, type), 0);
+ }
+ // Regular user provided pointer (global, local or constant)
+ else {
+ const uint32_t addr = pointerType->getAddressSpace();
+ const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(addr);
+ const uint32_t ptrSize = getTypeByteSize(unit, type);
+ const uint32_t align = getAlignmentByte(unit, pointed);
+ switch (addrSpace) {
+ case ir::MEM_GLOBAL:
+ globalPointer.insert(std::make_pair(I, btiBase));
+ ctx.appendSurface(btiBase, reg);
+ ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, btiBase);
+ incBtiBase();
+ break;
+ case ir::MEM_LOCAL:
+ ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, llvmInfo, ptrSize, align, 0xfe);
+ ctx.getFunction().setUseSLM(true);
+ break;
+ case ir::MEM_CONSTANT:
+ ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg, llvmInfo, ptrSize, align, 0x2);
+ break;
+ case ir::IMAGE:
+ ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, ptrSize, align, 0x0);
+ ctx.getFunction().getImageSet()->append(reg, &ctx, incBtiBase());
+ break;
+ default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
+ }
+ }
+ }
+ }
+ }
+
+ // When returning a structure, first input register is the pointer to the
+ // structure
+#if GBE_DEBUG
+ const Type *type = F.getReturnType();
+ GBE_ASSERTM(type->isVoidTy() == true,
+ "Returned value for kernel functions is forbidden");
+
+ // Variable number of arguments is not supported
+ FunctionType *FT = cast<FunctionType>(F.getFunctionType());
+ GBE_ASSERT(FT->isVarArg() == false);
+#endif /* GBE_DEBUG */
+ }
+
+ static inline bool isFPIntBitCast(const Instruction &I) {
+ if (!isa<BitCastInst>(I))
+ return false;
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DstTy = I.getType();
+ return (SrcTy->isFloatingPointTy() && DstTy->isIntegerTy()) ||
+ (DstTy->isFloatingPointTy() && SrcTy->isIntegerTy());
+ }
+
+ /*! To track last read and write of the registers */
+ struct RegInfoForMov {
+ ir::Instruction *lastWriteInsn;
+ ir::Instruction *lastReadInsn;
+ uint32_t lastWrite;
+ uint32_t lastRead;
+ };
+
+ /*! Replace register "from" by register "to" in the destination(s) */
+ static void replaceDst(ir::Instruction *insn, ir::Register from, ir::Register to) {
+ const uint32_t dstNum = insn->getDstNum();
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID)
+ if (insn->getDst(dstID) == from)
+ insn->setDst(dstID, to);
+ }
+
+ /*! Replace register "from" by register "to" in the source(s) */
+ static void replaceSrc(ir::Instruction *insn, ir::Register from, ir::Register to) {
+ const uint32_t srcNum = insn->getSrcNum();
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
+ if (insn->getSrc(srcID) == from)
+ insn->setSrc(srcID, to);
+ }
+
+ /*! lastUse maintains data about last uses (reads/writes) for each
+ * ir::Register
+ */
+ static void buildRegInfo(ir::BasicBlock &bb, vector<RegInfoForMov> &lastUse)
+ {
+ // Clear the register usages
+ for (auto &x : lastUse) {
+ x.lastWrite = x.lastRead = 0;
+ x.lastWriteInsn = x.lastReadInsn = NULL;
+ }
+
+ // Find use intervals for all registers (distinguish sources and
+ // destinations)
+ uint32_t insnID = 2;
+ bb.foreach([&](ir::Instruction &insn) {
+ const uint32_t dstNum = insn.getDstNum();
+ const uint32_t srcNum = insn.getSrcNum();
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const ir::Register reg = insn.getSrc(srcID);
+ lastUse[reg].lastRead = insnID;
+ lastUse[reg].lastReadInsn = &insn;
+ }
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const ir::Register reg = insn.getDst(dstID);
+ lastUse[reg].lastWrite = insnID+1;
+ lastUse[reg].lastWriteInsn = &insn;
+ }
+ insnID+=2;
+ });
+ }
+
+ void GenWriter::optimizePhiCopy(ir::Liveness &liveness, ir::Function &fn)
+ {
+ // The overall idea behind is we check whether there is any interference
+ // between phi and phiCopy live range. If there is no point that
+ // phi & phiCopy are both alive, then we can optimize off the move
+ // from phiCopy to phi, and use phiCopy directly instead of phi.
+ using namespace ir;
+ ir::FunctionDAG *dag = new ir::FunctionDAG(liveness);
+
+ for (auto &it : phiMap) {
+ const Register phi = it.first;
+ const Register phiCopy = it.second;
+
+ const ir::DefSet *phiCopyDef = dag->getRegDef(phiCopy);
+ const ir::UseSet *phiUse = dag->getRegUse(phi);
+ const DefSet *phiDef = dag->getRegDef(phi);
+ bool isOpt = true;
+ for (auto &x : *phiCopyDef) {
+ const ir::Instruction * phiCopyDefInsn = x->getInstruction();
+ const ir::BasicBlock *bb = phiCopyDefInsn->getParent();
+ const Liveness::LiveOut &out = liveness.getLiveOut(bb);
+ // phi & phiCopy are both alive at the endpoint of bb,
+ // thus can not be optimized.
+ if (out.contains(phi)) {
+ isOpt = false;
+ break;
+ }
+ // If phi is used in the same BB that define the phiCopy,
+ // we need carefully check the liveness of phi & phiCopy.
+ // Make sure their live ranges do not interfere.
+ bool phiUsedInSameBB = false;
+ for (auto &y : *phiUse) {
+ const ir::Instruction *phiUseInsn = y->getInstruction();
+ const ir::BasicBlock *bb2 = phiUseInsn->getParent();
+ if (bb2 == bb) {
+ phiUsedInSameBB = true;
+ }
+ }
+ // Check phi is not used between phiCopy def point and bb's end point,
+ // which is often referred as 'phi swap issue', just like below:
+ // MOV phiCopy_1, x;
+ // MOV phiCopy_2, phi_1;
+ if (phiUsedInSameBB ) {
+ for (auto it = --bb->end(); it != bb->end() ; --it) {
+ const Instruction &p = *it;
+
+ if (&p == phiCopyDefInsn) break;
+ // we only care MOV here
+ if (p.getSrcNum() == 1 && p.getSrc(0) == phi) {
+ isOpt = false;
+ break;
+ }
+ }
+ }
+ }
+
+ // [MOV phi, phiCopy;] can be removed. So we remove it
+ // and replace phi uses with phiCopy
+ if (isOpt) {
+ for (auto &x : *phiDef) {
+ const_cast<Instruction *>(x->getInstruction())->remove();
+ }
+ for (auto &x : *phiUse) {
+ const Instruction *phiUseInsn = x->getInstruction();
+ replaceSrc(const_cast<Instruction *>(phiUseInsn), phi, phiCopy);
+ }
+ }
+ }
+ delete dag;
+ }
+
+ void GenWriter::removeMOVs(const ir::Liveness &liveness, ir::Function &fn)
+ {
+ // We store the last write and last read for each register
+ const uint32_t regNum = fn.regNum();
+ vector<RegInfoForMov> lastUse;
+ lastUse.resize(regNum);
+
+ // Remove the MOVs per block (local analysis only) Note that we do not try
+ // to remove MOV for variables that outlives the block. So we use liveness
+ // information to figure out which variable is alive
+ fn.foreachBlock([&](ir::BasicBlock &bb)
+ {
+ // We need to know when each register will be read or written
+ buildRegInfo(bb, lastUse);
+
+ // Liveinfo helps us to know if the source outlives the block
+ const ir::Liveness::BlockInfo &info = liveness.getBlockInfo(&bb);
+
+ auto it = --bb.end();
+ if (it->isMemberOf<ir::BranchInstruction>() == true) --it;
+ for (auto it = --bb.end(); it != bb.end();) {
+ ir::Instruction *insn = &*it; it--;
+ const ir::Opcode op = insn->getOpcode();
+ if (op == ir::OP_MOV) {
+ const ir::Register dst = insn->getDst(0);
+ const ir::Register src = insn->getSrc(0);
+ // Outlives the block. We do not do anything
+ if (info.inLiveOut(src))
+ continue;
+ const RegInfoForMov &dstInfo = lastUse[dst];
+ const RegInfoForMov &srcInfo = lastUse[src];
+ // The source is not computed in this block
+ if (srcInfo.lastWrite == 0)
+ continue;
+ // dst is read after src is written. We cannot overwrite dst
+ if (dstInfo.lastRead > srcInfo.lastWrite)
+ continue;
+ // We are good. We first patch the destination then all the sources
+ replaceDst(srcInfo.lastWriteInsn, src, dst);
+ // Then we patch all subsequent uses of the source
+ ir::Instruction *next = static_cast<ir::Instruction*>(srcInfo.lastWriteInsn->next);
+ while (next != insn) {
+ replaceSrc(next, src, dst);
+ next = static_cast<ir::Instruction*>(next->next);
+ }
+ insn->remove();
+ } else if (op == ir::OP_LOADI)
+ continue;
+ else
+ break;
+ }
+ });
+ }
+
+ void GenWriter::removeLOADIs(const ir::Liveness &liveness, ir::Function &fn)
+ {
+ // We store the last write and last read for each register
+ const uint32_t regNum = fn.regNum();
+ vector<RegInfoForMov> lastUse;
+ lastUse.resize(regNum);
+
+ // Traverse all blocks and remove redundant immediates. Do *not* remove
+ // immediates that outlive the block
+ fn.foreachBlock([&](ir::BasicBlock &bb)
+ {
+ // Each immediate that is already loaded in the block
+ map<ir::Immediate, ir::Register> loadedImm;
+
+ // Immediate to immediate translation
+ map<ir::Register, ir::Register> immTranslate;
+
+ // Liveinfo helps us to know if the loaded immediate outlives the block
+ const ir::Liveness::BlockInfo &info = liveness.getBlockInfo(&bb);
+
+ // We need to know when each register will be read or written
+ buildRegInfo(bb, lastUse);
+
+ // Top bottom traversal -> remove useless LOADIs
+ uint32_t insnID = 2;
+ bb.foreach([&](ir::Instruction &insn)
+ {
+ // We either try to remove the LOADI or we will try to use it as a
+ // replacement for the next same LOADIs
+ if (insn.isMemberOf<ir::LoadImmInstruction>()) {
+ ir::LoadImmInstruction &loadImm = cast<ir::LoadImmInstruction>(insn);
+ const ir::Immediate imm = loadImm.getImmediate();
+ const ir::Register dst = loadImm.getDst(0);
+
+ // Not here: cool, we put it in the map if the register is not
+ // overwritten. If it is, we just ignore it for simplicity. Note that
+ // it should not happen with the way we "unSSA" the code
+ auto it = loadedImm.find(imm);
+ auto end = loadedImm.end();
+ if (it == end && lastUse[dst].lastWrite == insnID+1)
+ loadedImm.insert(std::make_pair(imm, dst));
+ // We already pushed the same immediate and we do not outlive the
+ // block. We are good to replace this immediate by the previous one
+ else if (it != end && info.inLiveOut(dst) == false) {
+ immTranslate.insert(std::make_pair(dst, it->second));
+ insn.remove();
+ }
+ }
+ // Traverse all the destinations and sources and perform the
+ // substitutions (if any)
+ else {
+ const uint32_t srcNum = insn.getSrcNum();
+ const uint32_t dstNum = insn.getDstNum();
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const ir::Register src = insn.getSrc(srcID);
+ auto it = immTranslate.find(src);
+ if (it != immTranslate.end())
+ insn.setSrc(srcID, it->second);
+ }
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const ir::Register dst = insn.getDst(dstID);
+ auto it = immTranslate.find(dst);
+ if (it != immTranslate.end())
+ insn.setDst(dstID, it->second);
+ }
+ }
+ insnID += 2;
+ });
+ });
+ }
+
+ BVAR(OCL_OPTIMIZE_PHI_MOVES, true);
+ BVAR(OCL_OPTIMIZE_LOADI, true);
+
+ static const Instruction *getInstructionUseLocal(const Value *v) {
+ // Local variable can only be used in one kernel function. So, if we find
+ // one instruction that use the local variable, simply return.
+ const Instruction *insn = NULL;
+ for(Value::const_use_iterator iter = v->use_begin(); iter != v->use_end(); ++iter) {
+ // After LLVM 3.5, use_iterator points to 'Use' instead of 'User', which is more straightforward.
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+ const User *theUser = *iter;
+#else
+ const User *theUser = iter->getUser();
+#endif
+ if(isa<Instruction>(theUser)) return cast<const Instruction>(theUser);
+ insn = getInstructionUseLocal(theUser);
+ if(insn != NULL) break;
+ }
+ return insn;
+ }
+
+ void GenWriter::allocateGlobalVariableRegister(Function &F)
+ {
+ // Allocate a address register for each global variable
+ const Module::GlobalListType &globalList = TheModule->getGlobalList();
+ size_t j = 0;
+ for(auto i = globalList.begin(); i != globalList.end(); i ++) {
+ const GlobalVariable &v = *i;
+ if(!v.isConstantUsed()) continue;
+
+ ir::AddressSpace addrSpace = addressSpaceLLVMToGen(v.getType()->getAddressSpace());
+ if(addrSpace == ir::MEM_LOCAL) {
+ const Value * val = cast<Value>(&v);
+ const Instruction *insn = getInstructionUseLocal(val);
+ GBE_ASSERT(insn && "Can't find a valid reference instruction for local variable.");
+
+ const BasicBlock * bb = insn->getParent();
+ const Function * func = bb->getParent();
+ if(func != &F) continue;
+
+ ir::Function &f = ctx.getFunction();
+ f.setUseSLM(true);
+ const Constant *c = v.getInitializer();
+ Type *ty = c->getType();
+ uint32_t oldSlm = f.getSLMSize();
+ uint32_t align = 8 * getAlignmentByte(unit, ty);
+ uint32_t padding = getPadding(oldSlm*8, align);
+
+ f.setSLMSize(oldSlm + padding/8 + getTypeByteSize(unit, ty));
+
+ this->newRegister(const_cast<GlobalVariable*>(&v));
+ ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
+ ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(oldSlm + padding/8, ir::TYPE_S32));
+ } else if(addrSpace == ir::MEM_CONSTANT) {
+ GBE_ASSERT(v.hasInitializer());
+ this->newRegister(const_cast<GlobalVariable*>(&v));
+ ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
+ ir::Constant &con = unit.getConstantSet().getConstant(j ++);
+ GBE_ASSERT(con.getName() == v.getName());
+ ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
+ } else {
+ if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
+ ctx.appendSurface(btiBase, ir::ocl::printfbptr);
+ ctx.getFunction().getPrintfSet()->setBufBTI(btiBase);
+ globalPointer.insert(std::make_pair(&v, incBtiBase()));
+ regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v));
+ } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
+ ctx.appendSurface(btiBase, ir::ocl::printfiptr);
+ ctx.getFunction().getPrintfSet()->setIndexBufBTI(btiBase);
+ globalPointer.insert(std::make_pair(&v, incBtiBase()));
+ regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v));
+ } else if(v.getName().str().substr(0, 4) == ".str") {
+ /* When there are multi printf statements in multi kernel fucntions within the same
+ translate unit, if they have the same sting parameter, such as
+ kernel_func1 () {
+ printf("Line is %d\n", line_num1);
+ }
+ kernel_func2 () {
+ printf("Line is %d\n", line_num2);
+ }
+ The Clang will just generate one global string named .strXXX to represent "Line is %d\n"
+ So when translating the kernel_func1, we can not unref that global var, so we will
+ get here. Just ignore it to avoid assert. */
+ } else {
+ GBE_ASSERT(0);
+ }
+ }
+ }
+
+ }
+ static INLINE void findAllLoops(LoopInfo * LI, std::vector<std::pair<Loop*, int>> &lp)
+ {
+ for (Loop::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I) {
+ lp.push_back(std::make_pair(*I, -1));
+ }
+ if (lp.size() == 0) return;
+
+ uint32_t i = 0;
+ do {
+ const std::vector<Loop*> subLoops = lp[i].first->getSubLoops();
+ for(auto sub : subLoops)
+ lp.push_back(std::make_pair(sub, i));
+ i++;
+ } while(i < lp.size());
+ }
+
+ void GenWriter::gatherLoopInfo(ir::Function &fn) {
+ vector<ir::LabelIndex> loopBBs;
+ vector<std::pair<ir::LabelIndex, ir::LabelIndex>> loopExits;
+ std::vector<std::pair<Loop*, int>> lp;
+
+ findAllLoops(LI, lp);
+#if GBE_DEBUG
+ // check two loops' interference
+ for(unsigned int i = 0; i < lp.size(); i++) {
+ SmallVector<Loop::Edge, 8> exitBBs;
+ lp[i].first->getExitEdges(exitBBs);
+
+ const std::vector<BasicBlock*> &inBBs = lp[i].first->getBlocks();
+ std::vector<ir::LabelIndex> bbs1;
+ for(auto x : inBBs) {
+ bbs1.push_back(labelMap[x]);
+ }
+ std::sort(bbs1.begin(), bbs1.end());
+ for(unsigned int j = i+1; j < lp.size(); j++) {
+ if(! lp[i].first->contains(lp[j].first)) {
+ const std::vector<BasicBlock*> &inBBs2 = lp[j].first->getBlocks();
+ std::vector<ir::LabelIndex> bbs2;
+ std::vector<ir::LabelIndex> bbs3;
+
+ for(auto x : inBBs2) {
+ bbs2.push_back(labelMap[x]);
+ }
+
+ std::sort(bbs2.begin(), bbs2.end());
+ std::set_intersection(bbs1.begin(), bbs1.end(), bbs2.begin(), bbs2.end(), std::back_inserter(bbs3));
+ GBE_ASSERT(bbs3.size() < 1);
+ }
+ }
+ }
+#endif
+
+ for (auto loop : lp) {
+ loopBBs.clear();
+ loopExits.clear();
+
+ const std::vector<BasicBlock*> &inBBs = loop.first->getBlocks();
+ for (auto b : inBBs) {
+ GBE_ASSERT(labelMap.find(b) != labelMap.end());
+ loopBBs.push_back(labelMap[b]);
+ }
+
+ SmallVector<Loop::Edge, 8> exitBBs;
+ loop.first->getExitEdges(exitBBs);
+ for(auto b : exitBBs){
+ GBE_ASSERT(labelMap.find(b.first) != labelMap.end());
+ GBE_ASSERT(labelMap.find(b.second) != labelMap.end());
+ loopExits.push_back(std::make_pair(labelMap[b.first], labelMap[b.second]));
+ }
+ fn.addLoop(loopBBs, loopExits);
+ }
+ }
+
+ void GenWriter::emitFunction(Function &F)
+ {
+ switch (F.getCallingConv()) {
+#if LLVM_VERSION_MINOR <= 2
+ case CallingConv::PTX_Device: // we do not emit device function
+ return;
+ case CallingConv::PTX_Kernel:
+#else
+ case CallingConv::C:
+#endif
+ break;
+ default: GBE_ASSERTM(false, "Unsupported calling convention");
+ }
+
+ ctx.startFunction(F.getName());
+ ir::Function &fn = ctx.getFunction();
+ this->regTranslator.clear();
+ this->labelMap.clear();
+ this->emitFunctionPrototype(F);
+
+ this->allocateGlobalVariableRegister(F);
+ // Visit all the instructions and emit the IR registers or the value to
+ // value mapping when a new register is not needed
+ pass = PASS_EMIT_REGISTERS;
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
+ visit(*I);
+
+ // First create all the labels (one per block) ...
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ this->newLabelIndex(BB);
+
+ // Then, for all branch instructions that have conditions, see if we can
+ // simplify the code by inverting condition code
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ this->simplifyTerminator(BB);
+
+ // gather loop info, which is useful for liveness analysis
+ gatherLoopInfo(fn);
+
+ // ... then, emit the instructions for all basic blocks
+ pass = PASS_EMIT_INSTRUCTIONS;
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ emitBasicBlock(BB);
+ ctx.endFunction();
+
+ // Liveness can be shared when we optimized the immediates and the MOVs
+ ir::Liveness liveness(fn);
+
+ if (OCL_OPTIMIZE_LOADI) this->removeLOADIs(liveness, fn);
+ if (OCL_OPTIMIZE_PHI_MOVES) this->optimizePhiCopy(liveness, fn);
+ if (OCL_OPTIMIZE_PHI_MOVES) this->removeMOVs(liveness, fn);
+ }
+
+ void GenWriter::regAllocateReturnInst(ReturnInst &I) {}
+
+ void GenWriter::emitReturnInst(ReturnInst &I) {
+ const ir::Function &fn = ctx.getFunction();
+ GBE_ASSERTM(fn.outputNum() <= 1, "no more than one value can be returned");
+ if (fn.outputNum() == 1 && I.getNumOperands() > 0) {
+ const ir::Register dst = fn.getOutput(0);
+ const ir::Register src = this->getRegister(I.getOperand(0));
+ const ir::RegisterFamily family = fn.getRegisterFamily(dst);
+ ctx.MOV(ir::getType(family), dst, src);
+ }
+ ctx.RET();
+ }
+
+ void GenWriter::regAllocateBinaryOperator(Instruction &I) {
+ this->newRegister(&I);
+ }
+
+ void GenWriter::emitBinaryOperator(Instruction &I) {
+#if GBE_DEBUG
+ GBE_ASSERT(I.getType()->isPointerTy() == false);
+ // We accept logical operations on booleans
+ switch (I.getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ break;
+ default:
+ GBE_ASSERT(I.getType() != Type::getInt1Ty(I.getContext()));
+ }
+#endif /* GBE_DEBUG */
+
+ // Get the element type for a vector
+ const ir::Type type = getType(ctx, I.getType());
+
+ // Emit the instructions in a row
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src0 = this->getRegister(I.getOperand(0));
+ const ir::Register src1 = this->getRegister(I.getOperand(1));
+
+ switch (I.getOpcode()) {
+ case Instruction::Add:
+ case Instruction::FAdd: ctx.ADD(type, dst, src0, src1); break;
+ case Instruction::Sub:
+ case Instruction::FSub: ctx.SUB(type, dst, src0, src1); break;
+ case Instruction::Mul:
+ case Instruction::FMul: ctx.MUL(type, dst, src0, src1); break;
+ case Instruction::URem: ctx.REM(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
+ case Instruction::SRem:
+ case Instruction::FRem: ctx.REM(type, dst, src0, src1); break;
+ case Instruction::UDiv: ctx.DIV(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
+ case Instruction::SDiv:
+ case Instruction::FDiv: ctx.DIV(type, dst, src0, src1); break;
+ case Instruction::And: ctx.AND(type, dst, src0, src1); break;
+ case Instruction::Or: ctx.OR(type, dst, src0, src1); break;
+ case Instruction::Xor: ctx.XOR(type, dst, src0, src1); break;
+ case Instruction::Shl: ctx.SHL(type, dst, src0, src1); break;
+ case Instruction::LShr: ctx.SHR(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
+ case Instruction::AShr: ctx.ASR(type, dst, src0, src1); break;
+ default: NOT_SUPPORTED;
+ }
+ }
+
+ void GenWriter::regAllocateICmpInst(ICmpInst &I) {
+ this->newRegister(&I);
+ }
+
+ static ir::Type makeTypeSigned(const ir::Type &type) {
+ if (type == ir::TYPE_U8) return ir::TYPE_S8;
+ else if (type == ir::TYPE_U16) return ir::TYPE_S16;
+ else if (type == ir::TYPE_U32) return ir::TYPE_S32;
+ else if (type == ir::TYPE_U64) return ir::TYPE_S64;
+ return type;
+ }
+
+ static ir::Type makeTypeUnsigned(const ir::Type &type) {
+ if (type == ir::TYPE_S8) return ir::TYPE_U8;
+ else if (type == ir::TYPE_S16) return ir::TYPE_U16;
+ else if (type == ir::TYPE_S32) return ir::TYPE_U32;
+ else if (type == ir::TYPE_S64) return ir::TYPE_U64;
+ return type;
+ }
+
+ void GenWriter::emitICmpInst(ICmpInst &I) {
+ GBE_ASSERT(I.getOperand(0)->getType() != Type::getInt1Ty(I.getContext()));
+
+ // Get the element type and the number of elements
+ Type *operandType = I.getOperand(0)->getType();
+ const ir::Type type = getType(ctx, operandType);
+ const ir::Type signedType = makeTypeSigned(type);
+ const ir::Type unsignedType = makeTypeUnsigned(type);
+
+ // Emit the instructions in a row
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src0 = this->getRegister(I.getOperand(0));
+ const ir::Register src1 = this->getRegister(I.getOperand(1));
+
+ // We must invert the condition to simplify the branch code
+ if (conditionSet.find(&I) != conditionSet.end()) {
+ switch (I.getPredicate()) {
+ case ICmpInst::ICMP_EQ: ctx.NE(type, dst, src0, src1); break;
+ case ICmpInst::ICMP_NE: ctx.EQ(type, dst, src0, src1); break;
+ case ICmpInst::ICMP_ULE: ctx.GT((unsignedType), dst, src0, src1); break;
+ case ICmpInst::ICMP_SLE: ctx.GT(signedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_UGE: ctx.LT(unsignedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_SGE: ctx.LT(signedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_ULT: ctx.GE(unsignedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_SLT: ctx.GE(signedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_UGT: ctx.LE(unsignedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_SGT: ctx.LE(signedType, dst, src0, src1); break;
+ default: NOT_SUPPORTED;
+ }
+ }
+ // Nothing special to do
+ else {
+ switch (I.getPredicate()) {
+ case ICmpInst::ICMP_EQ: ctx.EQ(type, dst, src0, src1); break;
+ case ICmpInst::ICMP_NE: ctx.NE(type, dst, src0, src1); break;
+ case ICmpInst::ICMP_ULE: ctx.LE((unsignedType), dst, src0, src1); break;
+ case ICmpInst::ICMP_SLE: ctx.LE(signedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_UGE: ctx.GE(unsignedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_SGE: ctx.GE(signedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_ULT: ctx.LT(unsignedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_SLT: ctx.LT(signedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_UGT: ctx.GT(unsignedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_SGT: ctx.GT(signedType, dst, src0, src1); break;
+ default: NOT_SUPPORTED;
+ }
+ }
+ }
+
+ void GenWriter::regAllocateFCmpInst(FCmpInst &I) {
+ this->newRegister(&I);
+ }
+
+ void GenWriter::emitFCmpInst(FCmpInst &I) {
+
+ // Get the element type and the number of elements
+ Type *operandType = I.getOperand(0)->getType();
+ const ir::Type type = getType(ctx, operandType);
+ const ir::Type insnType = getType(ctx, I.getType());
+
+ // Emit the instructions in a row
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src0 = this->getRegister(I.getOperand(0));
+ const ir::Register src1 = this->getRegister(I.getOperand(1));
+ const ir::Register tmp = ctx.reg(getFamily(ctx, I.getType()));
+ Value *cv = ConstantInt::get(I.getType(), 1);
+
+ switch (I.getPredicate()) {
+ case ICmpInst::FCMP_OEQ: ctx.EQ(type, dst, src0, src1); break;
+ case ICmpInst::FCMP_ONE: ctx.NE(type, dst, src0, src1); break;
+ case ICmpInst::FCMP_OLE: ctx.LE(type, dst, src0, src1); break;
+ case ICmpInst::FCMP_OGE: ctx.GE(type, dst, src0, src1); break;
+ case ICmpInst::FCMP_OLT: ctx.LT(type, dst, src0, src1); break;
+ case ICmpInst::FCMP_OGT: ctx.GT(type, dst, src0, src1); break;
+ case ICmpInst::FCMP_ORD:
+ //If there is a constant between src0 and src1, this constant value
+ //must ordered, otherwise, llvm will optimize the instruction to ture.
+ //So discard this constant value, only compare the other src.
+ if(isa<ConstantFP>(I.getOperand(0)))
+ ctx.EQ(type, dst, src1, src1);
+ else if(isa<ConstantFP>(I.getOperand(1)))
+ ctx.EQ(type, dst, src0, src0);
+ else
+ ctx.ORD(type, dst, src0, src1);
+ break;
+ case ICmpInst::FCMP_UNO:
+ if(isa<ConstantFP>(I.getOperand(0)))
+ ctx.NE(type, dst, src1, src1);
+ else if(isa<ConstantFP>(I.getOperand(1)))
+ ctx.NE(type, dst, src0, src0);
+ else {
+ ctx.ORD(type, tmp, src0, src1);
+ ctx.XOR(insnType, dst, tmp, getRegister(cv)); //TODO: Use NOT directly
+ }
+ break;
+ case ICmpInst::FCMP_UEQ:
+ ctx.NE(type, tmp, src0, src1);
+ ctx.XOR(insnType, dst, tmp, getRegister(cv));
+ break;
+ case ICmpInst::FCMP_UGT:
+ ctx.LE(type, tmp, src0, src1);
+ ctx.XOR(insnType, dst, tmp, getRegister(cv));
+ break;
+ case ICmpInst::FCMP_UGE:
+ ctx.LT(type, tmp, src0, src1);
+ ctx.XOR(insnType, dst, tmp, getRegister(cv));
+ break;
+ case ICmpInst::FCMP_ULT:
+ ctx.GE(type, tmp, src0, src1);
+ ctx.XOR(insnType, dst, tmp, getRegister(cv));
+ break;
+ case ICmpInst::FCMP_ULE:
+ ctx.GT(type, tmp, src0, src1);
+ ctx.XOR(insnType, dst, tmp, getRegister(cv));
+ break;
+ case ICmpInst::FCMP_UNE:
+ ctx.EQ(type, tmp, src0, src1);
+ ctx.XOR(insnType, dst, tmp, getRegister(cv));
+ break;
+ case ICmpInst::FCMP_TRUE:
+ ctx.MOV(insnType, dst, getRegister(cv));
+ break;
+ default: NOT_SUPPORTED;
+ }
+ }
+
+ void GenWriter::regAllocateCastInst(CastInst &I) {
+ Value *dstValue = &I;
+ Value *srcValue = I.getOperand(0);
+ const auto op = I.getOpcode();
+
+ switch (op)
+ {
+ // When casting pointer to integers, be aware with integers
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ {
+ Constant *CPV = dyn_cast<Constant>(srcValue);
+ if (CPV == NULL) {
+#if GBE_DEBUG
+ Type *dstType = dstValue->getType();
+ Type *srcType = srcValue->getType();
+ GBE_ASSERT(getTypeByteSize(unit, dstType) == getTypeByteSize(unit, srcType));
+#endif /* GBE_DEBUG */
+ regTranslator.newValueProxy(srcValue, dstValue);
+ } else
+ this->newRegister(dstValue);
+ }
+ break;
+ // Bitcast just forward registers
+ case Instruction::BitCast:
+ {
+ Type *srcType = srcValue->getType();
+ Type *dstType = dstValue->getType();
+
+ if(srcType->isVectorTy() || dstType->isVectorTy())
+ this->newRegister(dstValue);
+ else
+ regTranslator.newValueProxy(srcValue, dstValue);
+ }
+ break;
+ // Various conversion operations -> just allocate registers for them
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::SExt:
+ case Instruction::ZExt:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::Trunc:
+ this->newRegister(&I);
+ break;
+ default: NOT_SUPPORTED;
+ }
+ }
+
+ void GenWriter::emitCastInst(CastInst &I) {
+ switch (I.getOpcode())
+ {
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ {
+ Value *dstValue = &I;
+ Value *srcValue = I.getOperand(0);
+ Constant *CPV = dyn_cast<Constant>(srcValue);
+ if (CPV != NULL) {
+ const ir::ImmediateIndex index = ctx.newImmediate(CPV);
+ const ir::Immediate imm = ctx.getImmediate(index);
+ const ir::Register reg = this->getRegister(dstValue);
+ ctx.LOADI(imm.getType(), reg, index);
+ }
+ }
+ break;
+ case Instruction::BitCast:
+ {
+ Value *srcValue = I.getOperand(0);
+ Value *dstValue = &I;
+ uint32_t srcElemNum = 0, dstElemNum = 0 ;
+ ir::Type srcType = getVectorInfo(ctx, srcValue->getType(), srcValue, srcElemNum);
+ ir::Type dstType = getVectorInfo(ctx, dstValue->getType(), dstValue, dstElemNum);
+ // As long and double are not compatible in register storage
+ // and we do not support double yet, simply put an assert here
+ GBE_ASSERT(!(srcType == ir::TYPE_S64 && dstType == ir::TYPE_DOUBLE));
+ GBE_ASSERT(!(dstType == ir::TYPE_S64 && srcType == ir::TYPE_DOUBLE));
+
+ if(srcElemNum > 1 || dstElemNum > 1) {
+ // Build the tuple data in the vector
+ vector<ir::Register> srcTupleData;
+ vector<ir::Register> dstTupleData;
+ uint32_t elemID = 0;
+ for (elemID = 0; elemID < srcElemNum; ++elemID) {
+ ir::Register reg;
+ reg = this->getRegister(srcValue, elemID);
+ srcTupleData.push_back(reg);
+ }
+ for (elemID = 0; elemID < dstElemNum; ++elemID) {
+ ir::Register reg;
+ reg = this->getRegister(dstValue, elemID);
+ dstTupleData.push_back(reg);
+ }
+
+ const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], srcElemNum);
+ const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], dstElemNum);
+
+ ctx.BITCAST(dstType, srcType, dstTuple, srcTuple, dstElemNum, srcElemNum);
+ }
+ }
+ break; // nothing to emit here
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::SExt:
+ case Instruction::ZExt:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::Trunc:
+ {
+ // Get the element type for a vector
+ Type *llvmDstType = I.getType();
+ Type *llvmSrcType = I.getOperand(0)->getType();
+ ir::Type dstType;
+ if (I.getOpcode() == Instruction::FPToUI)
+ dstType = getUnsignedType(ctx, llvmDstType);
+ else
+ dstType = getType(ctx, llvmDstType);
+ ir::Type srcType;
+ if (I.getOpcode() == Instruction::ZExt || I.getOpcode() == Instruction::UIToFP) {
+ srcType = getUnsignedType(ctx, llvmSrcType);
+ } else {
+ srcType = getType(ctx, llvmSrcType);
+ }
+
+ // We use a select (0,1) not a convert when the destination is a boolean
+ if (srcType == ir::TYPE_BOOL) {
+ const ir::RegisterFamily family = getFamily(dstType);
+ const ir::ImmediateIndex zero = ctx.newIntegerImmediate(0, dstType);
+ ir::ImmediateIndex one;
+ if (I.getOpcode() == Instruction::SExt
+ && (dstType == ir::TYPE_S8 || dstType == ir::TYPE_S16 || dstType == ir::TYPE_S32 || dstType == ir::TYPE_S64))
+ one = ctx.newIntegerImmediate(-1, dstType);
+ else
+ one = ctx.newIntegerImmediate(1, dstType);
+ const ir::Register zeroReg = ctx.reg(family);
+ const ir::Register oneReg = ctx.reg(family);
+ ctx.LOADI(dstType, zeroReg, zero);
+ ctx.LOADI(dstType, oneReg, one);
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src = this->getRegister(I.getOperand(0));
+ ctx.SEL(dstType, dst, src, oneReg, zeroReg);
+ }
+ // Use a convert for the other cases
+ else {
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src = this->getRegister(I.getOperand(0));
+ ctx.CVT(dstType, srcType, dst, src);
+ }
+ }
+ break;
+ default: NOT_SUPPORTED;
+ }
+ }
+
+ /*! Because there are still fake insert/extract instruction for
+ * load/store, so keep empty function here */
+ void GenWriter::regAllocateInsertElement(InsertElementInst &I) {}
+ void GenWriter::emitInsertElement(InsertElementInst &I) {
+ const VectorType *type = dyn_cast<VectorType>(I.getType());
+ GBE_ASSERT(type);
+ const int elemNum = type->getNumElements();
+
+ Value *vec = I.getOperand(0);
+ Value *value = I.getOperand(1);
+ const Value *index = I.getOperand(2);
+ const ConstantInt *c = dyn_cast<ConstantInt>(index);
+ int i = c->getValue().getSExtValue();
+
+ for(int j=0; j<elemNum; j++) {
+ if(i == j)
+ regTranslator.newValueProxy(value, &I, 0, i);
+ else
+ regTranslator.newValueProxy(vec, &I, j, j);
+ }
+ }
+
+ void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {
+ Value *vec = I.getVectorOperand();
+ const Value *index = I.getIndexOperand();
+ const ConstantInt *c = dyn_cast<ConstantInt>(index);
+ GBE_ASSERT(c);
+ int i = c->getValue().getSExtValue();
+ regTranslator.newValueProxy(vec, &I, i, 0);
+ }
+
+ void GenWriter::emitExtractElement(ExtractElementInst &I) {
+ }
+
+ void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {}
+ void GenWriter::emitShuffleVectorInst(ShuffleVectorInst &I) {}
+
+ void GenWriter::regAllocateSelectInst(SelectInst &I) {
+ this->newRegister(&I);
+ }
+
+ void GenWriter::emitSelectInst(SelectInst &I) {
+ // Get the element type for a vector
+ const ir::Type type = getType(ctx, I.getType());
+
+ // Emit the instructions in a row
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register cond = this->getRegister(I.getOperand(0));
+ const ir::Register src0 = this->getRegister(I.getOperand(1));
+ const ir::Register src1 = this->getRegister(I.getOperand(2));
+ ctx.SEL(type, dst, cond, src0, src1);
+ }
+
+ void GenWriter::regAllocatePHINode(PHINode &I) {
+ // Copy 1 for the PHI
+ this->newRegister(&I);
+ // Copy 2 to avoid lost copy issue
+ Value *copy = this->getPHICopy(&I);
+ this->newRegister(&I, copy);
+ }
+
+ void GenWriter::emitPHINode(PHINode &I) {
+ Value *copy = this->getPHICopy(&I);
+ const ir::Type type = getType(ctx, I.getType());
+
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src = this->getRegister(copy);
+ ctx.MOV(type, dst, src);
+ phiMap.insert(std::make_pair(dst, src));
+ }
+
+ void GenWriter::regAllocateBranchInst(BranchInst &I) {}
+
+ void GenWriter::emitBranchInst(BranchInst &I) {
+ // Emit MOVs if required
+ BasicBlock *bb = I.getParent();
+ this->emitMovForPHI(bb, I.getSuccessor(0));
+ if (I.isConditional())
+ this->emitMovForPHI(bb, I.getSuccessor(1));
+
+ // Inconditional branch. Just check that we jump to a block which is not our
+ // successor
+ if (I.isConditional() == false) {
+ BasicBlock *target = I.getSuccessor(0);
+ if (std::next(Function::iterator(bb)) != Function::iterator(target)) {
+ GBE_ASSERT(labelMap.find(target) != labelMap.end());
+ const ir::LabelIndex labelIndex = labelMap[target];
+ ctx.BRA(labelIndex);
+ }
+ }
+ // The LLVM branch has two targets
+ else {
+ BasicBlock *taken = NULL, *nonTaken = NULL;
+ Value *condition = I.getCondition();
+
+ // We may inverted the branch condition to simplify the branching code
+ const bool inverted = conditionSet.find(condition) != conditionSet.end();
+ taken = inverted ? I.getSuccessor(1) : I.getSuccessor(0);
+ nonTaken = inverted ? I.getSuccessor(0) : I.getSuccessor(1);
+
+ // Get both taken label and predicate register
+ GBE_ASSERT(labelMap.find(taken) != labelMap.end());
+ const ir::LabelIndex index = labelMap[taken];
+ const ir::Register reg = this->getRegister(condition);
+ ctx.BRA(index, reg);
+
+ // If non-taken target is the next block, there is nothing to do
+ BasicBlock *bb = I.getParent();
+ if (std::next(Function::iterator(bb)) == Function::iterator(nonTaken))
+ return;
+
+ // This is slightly more complicated here. We need to issue one more
+ // branch for the non-taken condition.
+ GBE_ASSERT(labelMap.find(nonTaken) != labelMap.end());
+ const ir::LabelIndex untakenIndex = ctx.label();
+ ctx.LABEL(untakenIndex);
+ ctx.BRA(labelMap[nonTaken]);
+ }
+ }
+
+ void GenWriter::regAllocateCallInst(CallInst &I) {
+ Value *dst = &I;
+ Value *Callee = I.getCalledValue();
+ GBE_ASSERT(ctx.getFunction().getProfile() == ir::PROFILE_OCL);
+ GBE_ASSERT(isa<InlineAsm>(I.getCalledValue()) == false);
+ GBE_ASSERT(I.hasStructRetAttr() == false);
+
+ // We only support a small number of intrinsics right now
+ if (Function *F = I.getCalledFunction()) {
+ const Intrinsic::ID intrinsicID = (Intrinsic::ID) F->getIntrinsicID();
+ if (intrinsicID != 0) {
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::stacksave:
+ this->newRegister(&I);
+ break;
+ case Intrinsic::stackrestore:
+ break;
+#if LLVM_VERSION_MINOR >= 2
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ break;
+ case Intrinsic::fmuladd:
+ this->newRegister(&I);
+ break;
+#endif /* LLVM_VERSION_MINOR >= 2 */
+ case Intrinsic::debugtrap:
+ case Intrinsic::dbg_value:
+ case Intrinsic::dbg_declare:
+ break;
+ default:
+ GBE_ASSERTM(false, "Unsupported intrinsics");
+ }
+ return;
+ }
+ }
+
+ // Get the name of the called function and handle it
+ const std::string fnName = Callee->getName();
+ auto it = instrinsicMap.map.find(fnName);
+ GBE_ASSERT(it != instrinsicMap.map.end());
+ switch (it->second) {
+ case GEN_OCL_GET_GROUP_ID0:
+ regTranslator.newScalarProxy(ir::ocl::groupid0, dst); break;
+ case GEN_OCL_GET_GROUP_ID1:
+ regTranslator.newScalarProxy(ir::ocl::groupid1, dst); break;
+ case GEN_OCL_GET_GROUP_ID2:
+ regTranslator.newScalarProxy(ir::ocl::groupid2, dst); break;
+ case GEN_OCL_GET_LOCAL_ID0:
+ regTranslator.newScalarProxy(ir::ocl::lid0, dst); break;
+ case GEN_OCL_GET_LOCAL_ID1:
+ regTranslator.newScalarProxy(ir::ocl::lid1, dst); break;
+ case GEN_OCL_GET_LOCAL_ID2:
+ regTranslator.newScalarProxy(ir::ocl::lid2, dst); break;
+ case GEN_OCL_GET_NUM_GROUPS0:
+ regTranslator.newScalarProxy(ir::ocl::numgroup0, dst); break;
+ case GEN_OCL_GET_NUM_GROUPS1:
+ regTranslator.newScalarProxy(ir::ocl::numgroup1, dst); break;
+ case GEN_OCL_GET_NUM_GROUPS2:
+ regTranslator.newScalarProxy(ir::ocl::numgroup2, dst); break;
+ case GEN_OCL_GET_LOCAL_SIZE0:
+ regTranslator.newScalarProxy(ir::ocl::lsize0, dst); break;
+ case GEN_OCL_GET_LOCAL_SIZE1:
+ regTranslator.newScalarProxy(ir::ocl::lsize1, dst); break;
+ case GEN_OCL_GET_LOCAL_SIZE2:
+ regTranslator.newScalarProxy(ir::ocl::lsize2, dst); break;
+ case GEN_OCL_GET_GLOBAL_SIZE0:
+ regTranslator.newScalarProxy(ir::ocl::gsize0, dst); break;
+ case GEN_OCL_GET_GLOBAL_SIZE1:
+ regTranslator.newScalarProxy(ir::ocl::gsize1, dst); break;
+ case GEN_OCL_GET_GLOBAL_SIZE2:
+ regTranslator.newScalarProxy(ir::ocl::gsize2, dst); break;
+ case GEN_OCL_GET_GLOBAL_OFFSET0:
+ regTranslator.newScalarProxy(ir::ocl::goffset0, dst); break;
+ case GEN_OCL_GET_GLOBAL_OFFSET1:
+ regTranslator.newScalarProxy(ir::ocl::goffset1, dst); break;
+ case GEN_OCL_GET_GLOBAL_OFFSET2:
+ regTranslator.newScalarProxy(ir::ocl::goffset2, dst); break;
+ case GEN_OCL_GET_WORK_DIM:
+ regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
+ case GEN_OCL_PRINTF_BUF_ADDR:
+ regTranslator.newScalarProxy(ir::ocl::printfbptr, dst); break;
+ case GEN_OCL_PRINTF_INDEX_BUF_ADDR:
+ regTranslator.newScalarProxy(ir::ocl::printfiptr, dst); break;
+ case GEN_OCL_FBH:
+ case GEN_OCL_FBL:
+ case GEN_OCL_COS:
+ case GEN_OCL_SIN:
+ case GEN_OCL_SQR:
+ case GEN_OCL_RSQ:
+ case GEN_OCL_LOG:
+ case GEN_OCL_EXP:
+ case GEN_OCL_POW:
+ case GEN_OCL_RCP:
+ case GEN_OCL_ABS:
+ case GEN_OCL_FABS:
+ case GEN_OCL_RNDZ:
+ case GEN_OCL_RNDE:
+ case GEN_OCL_RNDU:
+ case GEN_OCL_RNDD:
+ case GEN_OCL_GET_IMAGE_WIDTH:
+ case GEN_OCL_GET_IMAGE_HEIGHT:
+ case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
+ case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
+ case GEN_OCL_GET_IMAGE_DEPTH:
+ case GEN_OCL_ATOMIC_ADD0:
+ case GEN_OCL_ATOMIC_ADD1:
+ case GEN_OCL_ATOMIC_SUB0:
+ case GEN_OCL_ATOMIC_SUB1:
+ case GEN_OCL_ATOMIC_AND0:
+ case GEN_OCL_ATOMIC_AND1:
+ case GEN_OCL_ATOMIC_OR0:
+ case GEN_OCL_ATOMIC_OR1:
+ case GEN_OCL_ATOMIC_XOR0:
+ case GEN_OCL_ATOMIC_XOR1:
+ case GEN_OCL_ATOMIC_XCHG0:
+ case GEN_OCL_ATOMIC_XCHG1:
+ case GEN_OCL_ATOMIC_UMAX0:
+ case GEN_OCL_ATOMIC_UMAX1:
+ case GEN_OCL_ATOMIC_UMIN0:
+ case GEN_OCL_ATOMIC_UMIN1:
+ case GEN_OCL_ATOMIC_IMAX0:
+ case GEN_OCL_ATOMIC_IMAX1:
+ case GEN_OCL_ATOMIC_IMIN0:
+ case GEN_OCL_ATOMIC_IMIN1:
+ case GEN_OCL_ATOMIC_INC0:
+ case GEN_OCL_ATOMIC_INC1:
+ case GEN_OCL_ATOMIC_DEC0:
+ case GEN_OCL_ATOMIC_DEC1:
+ case GEN_OCL_ATOMIC_CMPXCHG0:
+ case GEN_OCL_ATOMIC_CMPXCHG1:
+ // No structure can be returned
+ this->newRegister(&I);
+ break;
+ case GEN_OCL_FORCE_SIMD8:
+ case GEN_OCL_FORCE_SIMD16:
+ case GEN_OCL_LBARRIER:
+ case GEN_OCL_GBARRIER:
+ case GEN_OCL_LGBARRIER:
+ ctx.getFunction().setUseSLM(true);
+ break;
+ case GEN_OCL_WRITE_IMAGE_I_1D:
+ case GEN_OCL_WRITE_IMAGE_UI_1D:
+ case GEN_OCL_WRITE_IMAGE_F_1D:
+ case GEN_OCL_WRITE_IMAGE_I_2D:
+ case GEN_OCL_WRITE_IMAGE_UI_2D:
+ case GEN_OCL_WRITE_IMAGE_F_2D:
+ case GEN_OCL_WRITE_IMAGE_I_3D:
+ case GEN_OCL_WRITE_IMAGE_UI_3D:
+ case GEN_OCL_WRITE_IMAGE_F_3D:
+ break;
+ case GEN_OCL_READ_IMAGE_I_1D:
+ case GEN_OCL_READ_IMAGE_UI_1D:
+ case GEN_OCL_READ_IMAGE_F_1D:
+ case GEN_OCL_READ_IMAGE_I_2D:
+ case GEN_OCL_READ_IMAGE_UI_2D:
+ case GEN_OCL_READ_IMAGE_F_2D:
+ case GEN_OCL_READ_IMAGE_I_3D:
+ case GEN_OCL_READ_IMAGE_UI_3D:
+ case GEN_OCL_READ_IMAGE_F_3D:
+
+ case GEN_OCL_READ_IMAGE_I_1D_I:
+ case GEN_OCL_READ_IMAGE_UI_1D_I:
+ case GEN_OCL_READ_IMAGE_F_1D_I:
+ case GEN_OCL_READ_IMAGE_I_2D_I:
+ case GEN_OCL_READ_IMAGE_UI_2D_I:
+ case GEN_OCL_READ_IMAGE_F_2D_I:
+ case GEN_OCL_READ_IMAGE_I_3D_I:
+ case GEN_OCL_READ_IMAGE_UI_3D_I:
+ case GEN_OCL_READ_IMAGE_F_3D_I:
+ {
+ // dst is a 4 elements vector. We allocate all 4 registers here.
+ uint32_t elemNum;
+ (void)getVectorInfo(ctx, I.getType(), &I, elemNum);
+ GBE_ASSERT(elemNum == 4);
+ this->newRegister(&I);
+ break;
+ }
+ case GEN_OCL_MUL_HI_INT:
+ case GEN_OCL_MUL_HI_UINT:
+ case GEN_OCL_MUL_HI_I64:
+ case GEN_OCL_MUL_HI_UI64:
+ case GEN_OCL_UPSAMPLE_SHORT:
+ case GEN_OCL_UPSAMPLE_INT:
+ case GEN_OCL_UPSAMPLE_LONG:
+ case GEN_OCL_MAD:
+ case GEN_OCL_FMAX:
+ case GEN_OCL_FMIN:
+ case GEN_OCL_SADD_SAT_CHAR:
+ case GEN_OCL_SADD_SAT_SHORT:
+ case GEN_OCL_SADD_SAT_INT:
+ case GEN_OCL_SADD_SAT_LONG:
+ case GEN_OCL_UADD_SAT_CHAR:
+ case GEN_OCL_UADD_SAT_SHORT:
+ case GEN_OCL_UADD_SAT_INT:
+ case GEN_OCL_UADD_SAT_LONG:
+ case GEN_OCL_SSUB_SAT_CHAR:
+ case GEN_OCL_SSUB_SAT_SHORT:
+ case GEN_OCL_SSUB_SAT_INT:
+ case GEN_OCL_SSUB_SAT_LONG:
+ case GEN_OCL_USUB_SAT_CHAR:
+ case GEN_OCL_USUB_SAT_SHORT:
+ case GEN_OCL_USUB_SAT_INT:
+ case GEN_OCL_USUB_SAT_LONG:
+ case GEN_OCL_HADD:
+ case GEN_OCL_RHADD:
+ case GEN_OCL_I64HADD:
+ case GEN_OCL_I64RHADD:
+ case GEN_OCL_I64_MAD_SAT:
+ case GEN_OCL_I64_MAD_SATU:
+ case GEN_OCL_SAT_CONV_U8_TO_I8:
+ case GEN_OCL_SAT_CONV_I16_TO_I8:
+ case GEN_OCL_SAT_CONV_U16_TO_I8:
+ case GEN_OCL_SAT_CONV_I32_TO_I8:
+ case GEN_OCL_SAT_CONV_U32_TO_I8:
+ case GEN_OCL_SAT_CONV_F32_TO_I8:
+ case GEN_OCL_SAT_CONV_I8_TO_U8:
+ case GEN_OCL_SAT_CONV_I16_TO_U8:
+ case GEN_OCL_SAT_CONV_U16_TO_U8:
+ case GEN_OCL_SAT_CONV_I32_TO_U8:
+ case GEN_OCL_SAT_CONV_U32_TO_U8:
+ case GEN_OCL_SAT_CONV_F32_TO_U8:
+ case GEN_OCL_SAT_CONV_U16_TO_I16:
+ case GEN_OCL_SAT_CONV_I32_TO_I16:
+ case GEN_OCL_SAT_CONV_U32_TO_I16:
+ case GEN_OCL_SAT_CONV_F32_TO_I16:
+ case GEN_OCL_SAT_CONV_I16_TO_U16:
+ case GEN_OCL_SAT_CONV_I32_TO_U16:
+ case GEN_OCL_SAT_CONV_U32_TO_U16:
+ case GEN_OCL_SAT_CONV_F32_TO_U16:
+ case GEN_OCL_SAT_CONV_U32_TO_I32:
+ case GEN_OCL_SAT_CONV_F32_TO_I32:
+ case GEN_OCL_SAT_CONV_I32_TO_U32:
+ case GEN_OCL_SAT_CONV_F32_TO_U32:
+ case GEN_OCL_CONV_F16_TO_F32:
+ case GEN_OCL_CONV_F32_TO_F16:
+ case GEN_OCL_SIMD_ANY:
+ case GEN_OCL_SIMD_ALL:
+ this->newRegister(&I);
+ break;
+ case GEN_OCL_PRINTF:
+ break;
+ default:
+ GBE_ASSERTM(false, "Function call are not supported yet");
+ };
+ }
+
+ void GenWriter::emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode) {
+ CallSite::arg_iterator AI = CS.arg_begin();
+#if GBE_DEBUG
+ CallSite::arg_iterator AE = CS.arg_end();
+#endif /* GBE_DEBUG */
+ GBE_ASSERT(AI != AE);
+ const ir::Register src = this->getRegister(*AI);
+ const ir::Register dst = this->getRegister(&I);
+ ctx.ALU1(opcode, ir::TYPE_FLOAT, dst, src);
+ }
+
+ void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode) {
+ CallSite::arg_iterator AI = CS.arg_begin();
+ CallSite::arg_iterator AE = CS.arg_end();
+ GBE_ASSERT(AI != AE);
+
+ unsigned int llvmSpace = (*AI)->getType()->getPointerAddressSpace();
+ const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
+ const ir::Register dst = this->getRegister(&I);
+
+ ir::BTI bti;
+ gatherBTI(*AI, bti);
+ vector<ir::Register> src;
+ uint32_t srcNum = 0;
+ while(AI != AE) {
+ src.push_back(this->getRegister(*(AI++)));
+ srcNum++;
+ }
+ const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
+ ctx.ATOMIC(opcode, dst, addrSpace, bti, srcTuple);
+ }
+
+ /* append a new sampler. should be called before any reference to
+ * a sampler_t value. */
+ uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
+ Constant *CPV = dyn_cast<Constant>(*AI);
+ uint8_t index;
+ if (CPV != NULL)
+ {
+ // This is not a kernel argument sampler, we need to append it to sampler set,
+ // and allocate a sampler slot for it.
+ const ir::Immediate &x = processConstantImm(CPV);
+ GBE_ASSERTM(x.getType() == ir::TYPE_U16 || x.getType() == ir::TYPE_S16, "Invalid sampler type");
+
+ index = ctx.getFunction().getSamplerSet()->append(x.getIntegerValue(), &ctx);
+ } else {
+ const ir::Register samplerReg = this->getRegister(*AI);
+ index = ctx.getFunction().getSamplerSet()->append(samplerReg, &ctx);
+ }
+ return index;
+ }
+
+ void GenWriter::emitCallInst(CallInst &I) {
+ if (Function *F = I.getCalledFunction()) {
+ if (F->getIntrinsicID() != 0) {
+ const ir::Function &fn = ctx.getFunction();
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::stacksave:
+ {
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src = ir::ocl::stackptr;
+ const ir::RegisterFamily family = fn.getRegisterFamily(dst);
+ ctx.MOV(ir::getType(family), dst, src);
+ }
+ break;
+ case Intrinsic::stackrestore:
+ {
+ const ir::Register dst = ir::ocl::stackptr;
+ const ir::Register src = this->getRegister(I.getOperand(0));
+ const ir::RegisterFamily family = fn.getRegisterFamily(dst);
+ ctx.MOV(ir::getType(family), dst, src);
+ }
+ break;
+#if LLVM_VERSION_MINOR >= 2
+ case Intrinsic::fmuladd:
+ {
+ const ir::Register tmp = ctx.reg(ir::FAMILY_DWORD);
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src0 = this->getRegister(I.getOperand(0));
+ const ir::Register src1 = this->getRegister(I.getOperand(1));
+ const ir::Register src2 = this->getRegister(I.getOperand(2));
+ ctx.MUL(ir::TYPE_FLOAT, tmp, src0, src1);
+ ctx.ADD(ir::TYPE_FLOAT, dst, tmp, src2);
+ break;
+ }
+ break;
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ break;
+#endif /* LLVM_VERSION_MINOR >= 2 */
+ case Intrinsic::debugtrap:
+ case Intrinsic::dbg_value:
+ case Intrinsic::dbg_declare:
+ break;
+ default: NOT_IMPLEMENTED;
+ }
+ } else {
+ int image_dim;
+ // Get the name of the called function and handle it
+ Value *Callee = I.getCalledValue();
+ const std::string fnName = Callee->getName();
+ auto it = instrinsicMap.map.find(fnName);
+ GBE_ASSERT(it != instrinsicMap.map.end());
+
+ // Get the function arguments
+ CallSite CS(&I);
+ CallSite::arg_iterator AI = CS.arg_begin();
+#if GBE_DEBUG
+ CallSite::arg_iterator AE = CS.arg_end();
+#endif /* GBE_DEBUG */
+
+ switch (it->second) {
+ case GEN_OCL_POW:
+ {
+ const ir::Register src0 = this->getRegister(*AI); ++AI;
+ const ir::Register src1 = this->getRegister(*AI);
+ const ir::Register dst = this->getRegister(&I);
+ ctx.POW(ir::TYPE_FLOAT, dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
+ case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
+ case GEN_OCL_ABS:
+ {
+ const ir::Register src = this->getRegister(*AI);
+ const ir::Register dst = this->getRegister(&I);
+ ctx.ALU1(ir::OP_ABS, ir::TYPE_S32, dst, src);
+ break;
+ }
+ case GEN_OCL_SIMD_ALL:
+ {
+ const ir::Register src = this->getRegister(*AI);
+ const ir::Register dst = this->getRegister(&I);
+ ctx.ALU1(ir::OP_SIMD_ALL, ir::TYPE_S16, dst, src);
+ break;
+ }
+ case GEN_OCL_SIMD_ANY:
+ {
+ const ir::Register src = this->getRegister(*AI);
+ const ir::Register dst = this->getRegister(&I);
+ ctx.ALU1(ir::OP_SIMD_ANY, ir::TYPE_S16, dst, src);
+ break;
+ }
+ case GEN_OCL_COS: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
+ case GEN_OCL_SIN: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
+ case GEN_OCL_LOG: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
+ case GEN_OCL_EXP: this->emitUnaryCallInst(I,CS,ir::OP_EXP); break;
+ case GEN_OCL_SQR: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
+ case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
+ case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
+ case GEN_OCL_FABS: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
+ case GEN_OCL_RNDZ: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
+ case GEN_OCL_RNDE: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
+ case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
+ case GEN_OCL_RNDD: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
+ case GEN_OCL_FORCE_SIMD8: ctx.setSimdWidth(8); break;
+ case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break;
+ case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
+ case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
+ case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
+ case GEN_OCL_ATOMIC_ADD0:
+ case GEN_OCL_ATOMIC_ADD1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_ADD); break;
+ case GEN_OCL_ATOMIC_SUB0:
+ case GEN_OCL_ATOMIC_SUB1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_SUB); break;
+ case GEN_OCL_ATOMIC_AND0:
+ case GEN_OCL_ATOMIC_AND1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_AND); break;
+ case GEN_OCL_ATOMIC_OR0:
+ case GEN_OCL_ATOMIC_OR1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_OR); break;
+ case GEN_OCL_ATOMIC_XOR0:
+ case GEN_OCL_ATOMIC_XOR1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_XOR); break;
+ case GEN_OCL_ATOMIC_XCHG0:
+ case GEN_OCL_ATOMIC_XCHG1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_XCHG); break;
+ case GEN_OCL_ATOMIC_INC0:
+ case GEN_OCL_ATOMIC_INC1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_INC); break;
+ case GEN_OCL_ATOMIC_DEC0:
+ case GEN_OCL_ATOMIC_DEC1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_DEC); break;
+ case GEN_OCL_ATOMIC_UMIN0:
+ case GEN_OCL_ATOMIC_UMIN1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_UMIN); break;
+ case GEN_OCL_ATOMIC_UMAX0:
+ case GEN_OCL_ATOMIC_UMAX1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_UMAX); break;
+ case GEN_OCL_ATOMIC_IMIN0:
+ case GEN_OCL_ATOMIC_IMIN1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_IMIN); break;
+ case GEN_OCL_ATOMIC_IMAX0:
+ case GEN_OCL_ATOMIC_IMAX1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_IMAX); break;
+ case GEN_OCL_ATOMIC_CMPXCHG0:
+ case GEN_OCL_ATOMIC_CMPXCHG1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_CMPXCHG); break;
+ case GEN_OCL_GET_IMAGE_WIDTH:
+ case GEN_OCL_GET_IMAGE_HEIGHT:
+ case GEN_OCL_GET_IMAGE_DEPTH:
+ case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
+ case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
+ const ir::Register reg = this->getRegister(&I, 0);
+ int infoType = it->second - GEN_OCL_GET_IMAGE_WIDTH;
+ const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
+ ir::ImageInfoKey key(surfaceID, infoType);
+ const ir::Register infoReg = ctx.getFunction().getImageSet()->appendInfo(key, &ctx);
+ ctx.GET_IMAGE_INFO(infoType, reg, surfaceID, infoReg);
+ break;
+ }
+
+ case GEN_OCL_READ_IMAGE_I_1D:
+ case GEN_OCL_READ_IMAGE_UI_1D:
+ case GEN_OCL_READ_IMAGE_F_1D:
+ case GEN_OCL_READ_IMAGE_I_1D_I:
+ case GEN_OCL_READ_IMAGE_UI_1D_I:
+ case GEN_OCL_READ_IMAGE_F_1D_I:
+ image_dim = 1;
+ goto handle_read_image;
+ case GEN_OCL_READ_IMAGE_I_2D:
+ case GEN_OCL_READ_IMAGE_UI_2D:
+ case GEN_OCL_READ_IMAGE_F_2D:
+ case GEN_OCL_READ_IMAGE_I_2D_I:
+ case GEN_OCL_READ_IMAGE_UI_2D_I:
+ case GEN_OCL_READ_IMAGE_F_2D_I:
+ image_dim = 2;
+ goto handle_read_image;
+ case GEN_OCL_READ_IMAGE_I_3D:
+ case GEN_OCL_READ_IMAGE_UI_3D:
+ case GEN_OCL_READ_IMAGE_F_3D:
+ case GEN_OCL_READ_IMAGE_I_3D_I:
+ case GEN_OCL_READ_IMAGE_UI_3D_I:
+ case GEN_OCL_READ_IMAGE_F_3D_I:
+ image_dim = 3;
+handle_read_image:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
+ const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
+ GBE_ASSERT(AI != AE);
+ const uint8_t sampler = this->appendSampler(AI);
+ ++AI;
+
+ ir::Register ucoord;
+ ir::Register vcoord;
+ ir::Register wcoord;
+
+ GBE_ASSERT(AI != AE); ucoord = this->getRegister(*AI); ++AI;
+ if (image_dim > 1) {
+ GBE_ASSERT(AI != AE);
+ vcoord = this->getRegister(*AI);
+ ++AI;
+ } else {
+ vcoord = ir::ocl::invalid;
+ }
+
+ if (image_dim > 2) {
+ GBE_ASSERT(AI != AE);
+ wcoord = this->getRegister(*AI);
+ ++AI;
+ } else {
+ wcoord = ir::ocl::invalid;
+ }
+
+ vector<ir::Register> dstTupleData, srcTupleData;
+ const uint32_t elemNum = 4;
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
+ const ir::Register reg = this->getRegister(&I, elemID);
+ dstTupleData.push_back(reg);
+ }
+ srcTupleData.push_back(ucoord);
+ srcTupleData.push_back(vcoord);
+ srcTupleData.push_back(wcoord);
+ uint8_t samplerOffset = 0;
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+ GBE_ASSERT(AI != AE); Constant *CPV = dyn_cast<Constant>(*AI);
+ assert(CPV);
+ const ir::Immediate &x = processConstantImm(CPV);
+ GBE_ASSERTM(x.getType() == ir::TYPE_U32 || x.getType() == ir::TYPE_S32, "Invalid sampler type");
+ samplerOffset = x.getIntegerValue();
+#endif
+ const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
+ const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 3);
+
+ ir::Type dstType = ir::TYPE_U32;
+
+ switch(it->second) {
+ case GEN_OCL_READ_IMAGE_I_1D:
+ case GEN_OCL_READ_IMAGE_UI_1D:
+ case GEN_OCL_READ_IMAGE_I_2D:
+ case GEN_OCL_READ_IMAGE_UI_2D:
+ case GEN_OCL_READ_IMAGE_I_3D:
+ case GEN_OCL_READ_IMAGE_UI_3D:
+ case GEN_OCL_READ_IMAGE_I_1D_I:
+ case GEN_OCL_READ_IMAGE_UI_1D_I:
+ case GEN_OCL_READ_IMAGE_I_2D_I:
+ case GEN_OCL_READ_IMAGE_UI_2D_I:
+ case GEN_OCL_READ_IMAGE_I_3D_I:
+ case GEN_OCL_READ_IMAGE_UI_3D_I:
+ dstType = ir::TYPE_U32;
+ break;
+ case GEN_OCL_READ_IMAGE_F_1D:
+ case GEN_OCL_READ_IMAGE_F_2D:
+ case GEN_OCL_READ_IMAGE_F_3D:
+ case GEN_OCL_READ_IMAGE_F_1D_I:
+ case GEN_OCL_READ_IMAGE_F_2D_I:
+ case GEN_OCL_READ_IMAGE_F_3D_I:
+ dstType = ir::TYPE_FLOAT;
+ break;
+ default:
+ GBE_ASSERT(0); // never been here.
+ }
+
+ bool isFloatCoord = it->second <= GEN_OCL_READ_IMAGE_F_3D;
+
+ ctx.SAMPLE(surfaceID, dstTuple, srcTuple, dstType == ir::TYPE_FLOAT,
+ isFloatCoord, sampler, samplerOffset);
+ break;
+ }
+
+ case GEN_OCL_WRITE_IMAGE_I_1D:
+ case GEN_OCL_WRITE_IMAGE_UI_1D:
+ case GEN_OCL_WRITE_IMAGE_F_1D:
+ image_dim = 1;
+ goto handle_write_image;
+ case GEN_OCL_WRITE_IMAGE_I_2D:
+ case GEN_OCL_WRITE_IMAGE_UI_2D:
+ case GEN_OCL_WRITE_IMAGE_F_2D:
+ image_dim = 2;
+ goto handle_write_image;
+ case GEN_OCL_WRITE_IMAGE_I_3D:
+ case GEN_OCL_WRITE_IMAGE_UI_3D:
+ case GEN_OCL_WRITE_IMAGE_F_3D:
+ image_dim = 3;
+handle_write_image:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
+ const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
+ ir::Register ucoord, vcoord, wcoord;
+
+ GBE_ASSERT(AI != AE); ucoord = this->getRegister(*AI); ++AI;
+
+ if (image_dim > 1) {
+ GBE_ASSERT(AI != AE);
+ vcoord = this->getRegister(*AI);
+ ++AI;
+ } else
+ vcoord = ir::ocl::invalid;
+
+ if (image_dim > 2) {
+ GBE_ASSERT(AI != AE);
+ wcoord = this->getRegister(*AI);
+ ++AI;
+ } else {
+ wcoord = ir::ocl::invalid;
+ }
+
+ GBE_ASSERT(AI != AE);
+ vector<ir::Register> srcTupleData;
+
+ srcTupleData.push_back(ucoord);
+ srcTupleData.push_back(vcoord);
+ srcTupleData.push_back(wcoord);
+
+ const uint32_t elemNum = 4;
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
+ const ir::Register reg = this->getRegister(*AI, elemID);
+ srcTupleData.push_back(reg);
+ }
+ const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 7);
+
+ ir::Type srcType = ir::TYPE_U32;
+
+ switch(it->second) {
+ case GEN_OCL_WRITE_IMAGE_I_1D:
+ case GEN_OCL_WRITE_IMAGE_UI_1D:
+ case GEN_OCL_WRITE_IMAGE_I_2D:
+ case GEN_OCL_WRITE_IMAGE_UI_2D:
+ case GEN_OCL_WRITE_IMAGE_I_3D:
+ case GEN_OCL_WRITE_IMAGE_UI_3D:
+ srcType = ir::TYPE_U32;
+ break;
+ case GEN_OCL_WRITE_IMAGE_F_1D:
+ case GEN_OCL_WRITE_IMAGE_F_2D:
+ case GEN_OCL_WRITE_IMAGE_F_3D:
+ srcType = ir::TYPE_FLOAT;
+ break;
+ default:
+ GBE_ASSERT(0); // never been here.
+ }
+
+ ctx.TYPED_WRITE(surfaceID, srcTuple, srcType, ir::TYPE_U32);
+ break;
+ }
+ case GEN_OCL_MUL_HI_INT:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.MUL_HI(getType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_MUL_HI_UINT:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.MUL_HI(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_MUL_HI_I64:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.I64_MUL_HI(getType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_MUL_HI_UI64:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.I64_MUL_HI(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_UPSAMPLE_SHORT:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.UPSAMPLE_SHORT(getType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_UPSAMPLE_INT:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.UPSAMPLE_INT(getType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_UPSAMPLE_LONG:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.UPSAMPLE_LONG(getType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_SADD_SAT_CHAR:
+ case GEN_OCL_SADD_SAT_SHORT:
+ case GEN_OCL_SADD_SAT_INT:
+ case GEN_OCL_SADD_SAT_LONG:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.ADDSAT(getType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_UADD_SAT_CHAR:
+ case GEN_OCL_UADD_SAT_SHORT:
+ case GEN_OCL_UADD_SAT_INT:
+ case GEN_OCL_UADD_SAT_LONG:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.ADDSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_SSUB_SAT_CHAR:
+ case GEN_OCL_SSUB_SAT_SHORT:
+ case GEN_OCL_SSUB_SAT_INT:
+ case GEN_OCL_SSUB_SAT_LONG:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.SUBSAT(getType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_USUB_SAT_CHAR:
+ case GEN_OCL_USUB_SAT_SHORT:
+ case GEN_OCL_USUB_SAT_INT:
+ case GEN_OCL_USUB_SAT_LONG:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.SUBSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_I64_MAD_SAT:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.I64MADSAT(getType(ctx, I.getType()), dst, src0, src1, src2);
+ break;
+ }
+ case GEN_OCL_I64_MAD_SATU:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.I64MADSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1, src2);
+ break;
+ }
+ case GEN_OCL_MAD: {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.MAD(getType(ctx, I.getType()), dst, src0, src1, src2);
+ break;
+ }
+ case GEN_OCL_FMAX:
+ case GEN_OCL_FMIN:{
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register cmp = ctx.reg(ir::FAMILY_BOOL);
+ //Becasue cmp's sources are same as sel's source, so cmp instruction and sel
+ //instruction will be merged to one sel_cmp instruction in the gen selection
+ //Add two intruction here for simple.
+ if(it->second == GEN_OCL_FMAX)
+ ctx.GE(getType(ctx, I.getType()), cmp, src0, src1);
+ else
+ ctx.LT(getType(ctx, I.getType()), cmp, src0, src1);
+ ctx.SEL(getType(ctx, I.getType()), dst, cmp, src0, src1);
+ break;
+ }
+ case GEN_OCL_HADD: {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.HADD(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_I64HADD:
+ {
+ GBE_ASSERT(AI != AE);
+ const ir::Register src0 = this->getRegister(*(AI++));
+ GBE_ASSERT(AI != AE);
+ const ir::Register src1 = this->getRegister(*(AI++));
+ const ir::Register dst = this->getRegister(&I);
+ ctx.I64HADD(ir::TYPE_U64, dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_RHADD: {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.RHADD(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
+ case GEN_OCL_I64RHADD:
+ {
+ GBE_ASSERT(AI != AE);
+ const ir::Register src0 = this->getRegister(*(AI++));
+ GBE_ASSERT(AI != AE);
+ const ir::Register src1 = this->getRegister(*(AI++));
+ const ir::Register dst = this->getRegister(&I);
+ ctx.I64RHADD(ir::TYPE_U64, dst, src0, src1);
+ break;
+ }
+#define DEF(DST_TYPE, SRC_TYPE) \
+ { ctx.SAT_CVT(DST_TYPE, SRC_TYPE, getRegister(&I), getRegister(I.getOperand(0))); break; }
+ case GEN_OCL_SAT_CONV_U8_TO_I8:
+ DEF(ir::TYPE_S8, ir::TYPE_U8);
+ case GEN_OCL_SAT_CONV_I16_TO_I8:
+ DEF(ir::TYPE_S8, ir::TYPE_S16);
+ case GEN_OCL_SAT_CONV_U16_TO_I8:
+ DEF(ir::TYPE_S8, ir::TYPE_U16);
+ case GEN_OCL_SAT_CONV_I32_TO_I8:
+ DEF(ir::TYPE_S8, ir::TYPE_S32);
+ case GEN_OCL_SAT_CONV_U32_TO_I8:
+ DEF(ir::TYPE_S8, ir::TYPE_U32);
+ case GEN_OCL_SAT_CONV_F32_TO_I8:
+ DEF(ir::TYPE_S8, ir::TYPE_FLOAT);
+ case GEN_OCL_SAT_CONV_I8_TO_U8:
+ DEF(ir::TYPE_U8, ir::TYPE_S8);
+ case GEN_OCL_SAT_CONV_I16_TO_U8:
+ DEF(ir::TYPE_U8, ir::TYPE_S16);
+ case GEN_OCL_SAT_CONV_U16_TO_U8:
+ DEF(ir::TYPE_U8, ir::TYPE_U16);
+ case GEN_OCL_SAT_CONV_I32_TO_U8:
+ DEF(ir::TYPE_U8, ir::TYPE_S32);
+ case GEN_OCL_SAT_CONV_U32_TO_U8:
+ DEF(ir::TYPE_U8, ir::TYPE_U32);
+ case GEN_OCL_SAT_CONV_F32_TO_U8:
+ DEF(ir::TYPE_U8, ir::TYPE_FLOAT);
+ case GEN_OCL_SAT_CONV_U16_TO_I16:
+ DEF(ir::TYPE_S16, ir::TYPE_U16);
+ case GEN_OCL_SAT_CONV_I32_TO_I16:
+ DEF(ir::TYPE_S16, ir::TYPE_S32);
+ case GEN_OCL_SAT_CONV_U32_TO_I16:
+ DEF(ir::TYPE_S16, ir::TYPE_U32);
+ case GEN_OCL_SAT_CONV_F32_TO_I16:
+ DEF(ir::TYPE_S16, ir::TYPE_FLOAT);
+ case GEN_OCL_SAT_CONV_I16_TO_U16:
+ DEF(ir::TYPE_U16, ir::TYPE_S16);
+ case GEN_OCL_SAT_CONV_I32_TO_U16:
+ DEF(ir::TYPE_U16, ir::TYPE_S32);
+ case GEN_OCL_SAT_CONV_U32_TO_U16:
+ DEF(ir::TYPE_U16, ir::TYPE_U32);
+ case GEN_OCL_SAT_CONV_F32_TO_U16:
+ DEF(ir::TYPE_U16, ir::TYPE_FLOAT);
+ case GEN_OCL_SAT_CONV_U32_TO_I32:
+ DEF(ir::TYPE_S32, ir::TYPE_U32);
+ case GEN_OCL_SAT_CONV_F32_TO_I32:
+ DEF(ir::TYPE_S32, ir::TYPE_FLOAT);
+ case GEN_OCL_SAT_CONV_I32_TO_U32:
+ DEF(ir::TYPE_U32, ir::TYPE_S32);
+ case GEN_OCL_SAT_CONV_F32_TO_U32:
+ DEF(ir::TYPE_U32, ir::TYPE_FLOAT);
+ case GEN_OCL_CONV_F16_TO_F32:
+ ctx.F16TO32(ir::TYPE_FLOAT, ir::TYPE_U16, getRegister(&I), getRegister(I.getOperand(0)));
+ break;
+ case GEN_OCL_CONV_F32_TO_F16:
+ ctx.F32TO16(ir::TYPE_U16, ir::TYPE_FLOAT, getRegister(&I), getRegister(I.getOperand(0)));
+ break;
+#undef DEF
+
+ case GEN_OCL_PRINTF:
+ {
+ ir::PrintfSet::PrintfFmt* fmt = (ir::PrintfSet::PrintfFmt*)getPrintfInfo(&I);
+ ctx.getFunction().getPrintfSet()->append(fmt, unit);
+ assert(fmt);
+ break;
+ }
+ case GEN_OCL_PRINTF_BUF_ADDR:
+ case GEN_OCL_PRINTF_INDEX_BUF_ADDR:
+ default: break;
+ }
+ }
+ }
+ }
+
+ void GenWriter::regAllocateAllocaInst(AllocaInst &I) {
+ this->newRegister(&I);
+ }
+ void GenWriter::emitAllocaInst(AllocaInst &I) {
+ Value *src = I.getOperand(0);
+ Type *elemType = I.getType()->getElementType();
+ ir::ImmediateIndex immIndex;
+ uint32_t elementSize = getTypeByteSize(unit, elemType);
+
+ // Be aware, we manipulate pointers
+ if (ctx.getPointerSize() == ir::POINTER_32_BITS)
+ immIndex = ctx.newImmediate(uint32_t(elementSize));
+ else
+ immIndex = ctx.newImmediate(uint64_t(elementSize));
+
+ // OK, we try to see if we know compile time the size we need to allocate
+ if (I.isArrayAllocation() == true) {
+ Constant *CPV = dyn_cast<Constant>(src);
+ GBE_ASSERT(CPV);
+ const ir::Immediate &imm = processConstantImm(CPV);
+ const uint64_t elemNum = imm.getIntegerValue();
+ elementSize *= elemNum;
+ if (ctx.getPointerSize() == ir::POINTER_32_BITS)
+ immIndex = ctx.newImmediate(uint32_t(ALIGN(elementSize, 4)));
+ else
+ immIndex = ctx.newImmediate(uint64_t(ALIGN(elementSize, 4)));
+ }
+
+ // Now emit the stream of instructions to get the allocated pointer
+ const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register stack = ir::ocl::stackptr;
+ const ir::Register reg = ctx.reg(pointerFamily);
+ const ir::Immediate imm = ctx.getImmediate(immIndex);
+ uint32_t align = getAlignmentByte(unit, elemType);
+ // below code assume align is power of 2
+ GBE_ASSERT(align && (align & (align-1)) == 0);
+
+ // align the stack pointer according to data alignment
+ if(align > 1) {
+ uint32_t prevStackPtr = ctx.getFunction().getStackSize();
+ uint32_t step = ((prevStackPtr + (align - 1)) & ~(align - 1)) - prevStackPtr;
+ if (step != 0) {
+ ir::ImmediateIndex stepImm = ctx.newIntegerImmediate(step, ir::TYPE_U32);
+ ir::Register stepReg = ctx.reg(ctx.getPointerFamily());
+ ctx.LOADI(ir::TYPE_S32, stepReg, stepImm);
+ ctx.ADD(ir::TYPE_U32, stack, stack, stepReg);
+ ctx.getFunction().pushStackSize(step);
+ }
+ }
+ // Set the destination register properly
+ ctx.MOV(imm.getType(), dst, stack);
+
+ ctx.LOADI(imm.getType(), reg, immIndex);
+ ctx.ADD(imm.getType(), stack, stack, reg);
+ ctx.getFunction().pushStackSize(elementSize);
+ }
+
+ static INLINE Value *getLoadOrStoreValue(LoadInst &I) {
+ return &I;
+ }
+ static INLINE Value *getLoadOrStoreValue(StoreInst &I) {
+ return I.getValueOperand();
+ }
+ void GenWriter::regAllocateLoadInst(LoadInst &I) {
+ this->newRegister(&I);
+ }
+ void GenWriter::regAllocateStoreInst(StoreInst &I) {}
+
+ void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
+ Value *llvmValues, const ir::Register ptr,
+ const ir::AddressSpace addrSpace,
+ Type * elemType, bool isLoad, ir::BTI bti) {
+ const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+ uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
+ uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
+ const uint32_t perMsgNum = elemNum / msgNum;
+
+ for (uint32_t msg = 0; msg < msgNum; ++msg) {
+ // Build the tuple data in the vector
+ vector<ir::Register> tupleData; // put registers here
+ for (uint32_t elemID = 0; elemID < perMsgNum; ++elemID) {
+ ir::Register reg;
+ if(regTranslator.isUndefConst(llvmValues, elemID)) {
+ Value *v = Constant::getNullValue(elemType);
+ reg = this->getRegister(v);
+ } else
+ reg = this->getRegister(llvmValues, perMsgNum*msg+elemID);
+
+ tupleData.push_back(reg);
+ }
+ const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], perMsgNum);
+
+ // We may need to update to offset the pointer
+ ir::Register addr;
+ if (msg == 0)
+ addr = ptr;
+ else {
+ const ir::Register offset = ctx.reg(pointerFamily);
+ ir::ImmediateIndex immIndex;
+ ir::Type immType;
+ // each message can read/write 16 byte
+ const int32_t stride = 16;
+ if (pointerFamily == ir::FAMILY_DWORD) {
+ immIndex = ctx.newImmediate(int32_t(msg*stride));
+ immType = ir::TYPE_S32;
+ } else {
+ immIndex = ctx.newImmediate(int64_t(msg*stride));
+ immType = ir::TYPE_S64;
+ }
+
+ addr = ctx.reg(pointerFamily);
+ ctx.LOADI(immType, offset, immIndex);
+ ctx.ADD(immType, addr, ptr, offset);
+ }
+
+ // Emit the instruction
+ if (isLoad)
+ ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true, bti);
+ else
+ ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true, bti);
+ }
+ }
+
+ // The idea behind is to search along the use-def chain, and find out all
+ // possible source of the pointer. Then in later codeGen, we can emit
+ // read/store instructions to these btis gathered.
+ void GenWriter::gatherBTI(Value *pointer, ir::BTI &bti) {
+ typedef map<const Value*, int>::iterator GlobalPtrIter;
+ Value *p;
+ size_t idx = 0;
+ int nBTI = 0;
+ std::vector<Value*> candidates;
+ candidates.push_back(pointer);
+ std::set<Value*> processed;
+
+ while (idx < candidates.size()) {
+ bool isPrivate = false;
+ bool needNewBTI = true;
+ p = candidates[idx];
+
+ while (dyn_cast<User>(p) && !dyn_cast<GlobalVariable>(p)) {
+
+ if (processed.find(p) == processed.end()) {
+ processed.insert(p);
+ } else {
+ // This use-def chain falls into a loop,
+ // it does not introduce a new buffer source.
+ needNewBTI = false;
+ break;
+ }
+
+ if (dyn_cast<SelectInst>(p)) {
+ SelectInst *sel = cast<SelectInst>(p);
+ p = sel->getTrueValue();
+ candidates.push_back(sel->getFalseValue());
+ continue;
+ }
+
+ if (dyn_cast<PHINode>(p)) {
+ PHINode* phi = cast<PHINode>(p);
+ int n = phi->getNumIncomingValues();
+ for (int j = 1; j < n; j++)
+ candidates.push_back(phi->getIncomingValue(j));
+ p = phi->getIncomingValue(0);
+ continue;
+ }
+
+ if (dyn_cast<AllocaInst>(p)) {
+ isPrivate = true;
+ break;
+ }
+ p = cast<User>(p)->getOperand(0);
+ }
+
+ if (needNewBTI == false) {
+ // go to next possible pointer source
+ idx++; continue;
+ }
+
+ uint8_t new_bti = 0;
+ if (isPrivate) {
+ new_bti = BTI_PRIVATE;
+ } else {
+ if(isa<Argument>(p) && dyn_cast<Argument>(p)->hasByValAttr()) {
+ // structure value implementation is not complete now,
+ // they are now treated as push constant, so, the load/store
+ // here is not as meaningful.
+ bti.bti[0] = BTI_PRIVATE;
+ bti.count = 1;
+ break;
+ }
+ Type *ty = p->getType();
+ if(ty->getPointerAddressSpace() == 3) {
+ // __local memory
+ new_bti = 0xfe;
+ } else {
+ // __global memory
+ GlobalPtrIter iter = globalPointer.find(p);
+ GBE_ASSERT(iter != globalPointer.end());
+ new_bti = iter->second;
+ }
+ }
+ // avoid duplicate
+ bool bFound = false;
+ for (int j = 0; j < nBTI; j++) {
+ if (bti.bti[j] == new_bti) {
+ bFound = true; break;
+ }
+ }
+ if (bFound == false) {
+ bti.bti[nBTI++] = new_bti;
+ bti.count = nBTI;
+ }
+ idx++;
+ }
+ GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
+ }
+
+ extern int OCL_SIMD_WIDTH;
+ template <bool isLoad, typename T>
+ INLINE void GenWriter::emitLoadOrStore(T &I)
+ {
+ unsigned int llvmSpace = I.getPointerAddressSpace();
+ Value *llvmPtr = I.getPointerOperand();
+ Value *llvmValues = getLoadOrStoreValue(I);
+ Type *llvmType = llvmValues->getType();
+ const bool dwAligned = (I.getAlignment() % 4) == 0;
+ const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
+ const ir::Register ptr = this->getRegister(llvmPtr);
+ ir::BTI binding;
+ if(addrSpace == ir::MEM_GLOBAL || addrSpace == ir::MEM_PRIVATE) {
+ gatherBTI(llvmPtr, binding);
+ }
+ // Scalar is easy. We neednot build register tuples
+ if (isScalarType(llvmType) == true) {
+ const ir::Type type = getType(ctx, llvmType);
+ const ir::Register values = this->getRegister(llvmValues);
+ if (isLoad)
+ ctx.LOAD(type, ptr, addrSpace, dwAligned, binding, values);
+ else
+ ctx.STORE(type, ptr, addrSpace, dwAligned, binding, values);
+ }
+ // A vector type requires to build a tuple
+ else {
+ VectorType *vectorType = cast<VectorType>(llvmType);
+ Type *elemType = vectorType->getElementType();
+
+ // We follow OCL spec and support 2,3,4,8,16 elements only
+ uint32_t elemNum = vectorType->getNumElements();
+ GBE_ASSERTM(elemNum == 2 || elemNum == 3 || elemNum == 4 || elemNum == 8 || elemNum == 16,
+ "Only vectors of 2,3,4,8 or 16 elements are supported");
+ // Per OPenCL 1.2 spec 6.1.5:
+ // For 3-component vector data types, the size of the data type is 4 * sizeof(component).
+ // And the llvm does cast a type3 data to type4 for load/store instruction,
+ // so a 4 elements vector may only have 3 valid elements. We need to fix it to correct element
+ // count here.
+ if (elemNum == 4 && regTranslator.isUndefConst(llvmValues, 3))
+ elemNum = 3;
+
+ // The code is going to be fairly different from types to types (based on
+ // size of each vector element)
+ const ir::Type type = getType(ctx, elemType);
+ const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+ const ir::RegisterFamily dataFamily = getFamily(type);
+
+ if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT) {
+ // One message is enough here. Nothing special to do
+ if (elemNum <= 4) {
+ // Build the tuple data in the vector
+ vector<ir::Register> tupleData; // put registers here
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
+ ir::Register reg;
+ if(regTranslator.isUndefConst(llvmValues, elemID)) {
+ Value *v = Constant::getNullValue(elemType);
+ reg = this->getRegister(v);
+ } else
+ reg = this->getRegister(llvmValues, elemID);
+
+ tupleData.push_back(reg);
+ }
+ const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
+
+ // Emit the instruction
+ if (isLoad)
+ ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
+ else
+ ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
+ }
+ // Not supported by the hardware. So, we split the message and we use
+ // strided loads and stores
+ else {
+ emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
+ }
+ }
+ else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
+ emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
+ } else {
+ for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
+ if(regTranslator.isUndefConst(llvmValues, elemID))
+ continue;
+
+ const ir::Register reg = this->getRegister(llvmValues, elemID);
+ ir::Register addr;
+ if (elemID == 0)
+ addr = ptr;
+ else {
+ const ir::Register offset = ctx.reg(pointerFamily);
+ ir::ImmediateIndex immIndex;
+ int elemSize = getTypeByteSize(unit, elemType);
+ immIndex = ctx.newImmediate(int32_t(elemID * elemSize));
+ addr = ctx.reg(pointerFamily);
+ ctx.LOADI(ir::TYPE_S32, offset, immIndex);
+ ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
+ }
+ if (isLoad)
+ ctx.LOAD(type, addr, addrSpace, dwAligned, binding, reg);
+ else
+ ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
+ }
+ }
+ }
+ }
+
+ void GenWriter::emitLoadInst(LoadInst &I) {
+ this->emitLoadOrStore<true>(I);
+ }
+
+ void GenWriter::emitStoreInst(StoreInst &I) {
+ this->emitLoadOrStore<false>(I);
+ }
+
+ llvm::FunctionPass *createGenPass(ir::Unit &unit) {
+ return new GenWriter(unit);
+ }
+} /* namespace gbe */
+
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
new file mode 100644
index 0000000..cc5cdad
--- /dev/null
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file llvm_gen_backend.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * Pass generation functions
+ */
+#ifndef __GBE_LLVM_GEN_BACKEND_HPP__
+#define __GBE_LLVM_GEN_BACKEND_HPP__
+
+#include "llvm/Pass.h"
+#include "sys/platform.hpp"
+#include "sys/map.hpp"
+#include "sys/hash_map.hpp"
+#include <algorithm>
+
+// LLVM Type
+namespace llvm { class Type; }
+
+namespace gbe
+{
+ // Final target of the Gen backend
+ namespace ir { class Unit; }
+
+ /*! All intrinsic Gen functions */
+ enum OCLInstrinsic {
+#define DECL_LLVM_GEN_FUNCTION(ID, NAME) GEN_OCL_##ID,
+#include "llvm_gen_ocl_function.hxx"
+#undef DECL_LLVM_GEN_FUNCTION
+ };
+
+ /*! Build the hash map for OCL functions on Gen */
+ struct OCLIntrinsicMap {
+ /*! Build the intrinsic hash map */
+ OCLIntrinsicMap(void) {
+#define DECL_LLVM_GEN_FUNCTION(ID, NAME) \
+ map.insert(std::make_pair(#NAME, GEN_OCL_##ID));
+#include "llvm_gen_ocl_function.hxx"
+#undef DECL_LLVM_GEN_FUNCTION
+ }
+ /*! Sort intrinsics with their names */
+ hash_map<std::string, OCLInstrinsic> map;
+ };
+
+ /*! Sort the OCL Gen instrinsic functions (built on pre-main) */
+ static const OCLIntrinsicMap instrinsicMap;
+
+ /*! Pad the offset */
+ uint32_t getPadding(uint32_t offset, uint32_t align);
+
+ /*! Get the type alignment in bytes */
+ uint32_t getAlignmentByte(const ir::Unit &unit, llvm::Type* Ty);
+
+ /*! Get the type size in bits */
+ uint32_t getTypeBitSize(const ir::Unit &unit, llvm::Type* Ty);
+
+ /*! Get the type size in bytes */
+ uint32_t getTypeByteSize(const ir::Unit &unit, llvm::Type* Ty);
+
+ /*! whether this is a kernel function */
+ bool isKernelFunction(const llvm::Function &f);
+
+ /*! Create a Gen-IR unit */
+ llvm::FunctionPass *createGenPass(ir::Unit &unit);
+
+ /*! Remove the GEP instructions */
+ llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
+
+ /*! Merge load/store if possible */
+ llvm::BasicBlockPass *createLoadStoreOptimizationPass();
+
+ /*! Scalarize all vector op instructions */
+ llvm::FunctionPass* createScalarizePass();
+ /*! Remove/add NoDuplicate function attribute for barrier functions. */
+ llvm::ModulePass* createBarrierNodupPass(bool);
+
+ /*! Convert the Intrinsic call to gen function */
+ llvm::BasicBlockPass *createIntrinsicLoweringPass();
+
+ /*! Passer the printf function call. */
+ llvm::FunctionPass* createPrintfParserPass();
+
+ void* getPrintfInfo(llvm::CallInst* inst);
+} /* namespace gbe */
+
+#endif /* __GBE_LLVM_GEN_BACKEND_HPP__ */
+
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
new file mode 100644
index 0000000..f3ce096
--- /dev/null
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -0,0 +1,196 @@
+DECL_LLVM_GEN_FUNCTION(GET_GROUP_ID0, __gen_ocl_get_group_id0)
+DECL_LLVM_GEN_FUNCTION(GET_GROUP_ID1, __gen_ocl_get_group_id1)
+DECL_LLVM_GEN_FUNCTION(GET_GROUP_ID2, __gen_ocl_get_group_id2)
+DECL_LLVM_GEN_FUNCTION(GET_LOCAL_ID0, __gen_ocl_get_local_id0)
+DECL_LLVM_GEN_FUNCTION(GET_LOCAL_ID1, __gen_ocl_get_local_id1)
+DECL_LLVM_GEN_FUNCTION(GET_LOCAL_ID2, __gen_ocl_get_local_id2)
+DECL_LLVM_GEN_FUNCTION(GET_NUM_GROUPS0, __gen_ocl_get_num_groups0)
+DECL_LLVM_GEN_FUNCTION(GET_NUM_GROUPS1, __gen_ocl_get_num_groups1)
+DECL_LLVM_GEN_FUNCTION(GET_NUM_GROUPS2, __gen_ocl_get_num_groups2)
+DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE0, __gen_ocl_get_local_size0)
+DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE1, __gen_ocl_get_local_size1)
+DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE2, __gen_ocl_get_local_size2)
+DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE0, __gen_ocl_get_global_size0)
+DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE1, __gen_ocl_get_global_size1)
+DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2, __gen_ocl_get_global_size2)
+DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET0, __gen_ocl_get_global_offset0)
+DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET1, __gen_ocl_get_global_offset1)
+DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
+DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
+
+// Math function
+DECL_LLVM_GEN_FUNCTION(FABS, __gen_ocl_fabs)
+DECL_LLVM_GEN_FUNCTION(COS, __gen_ocl_cos)
+DECL_LLVM_GEN_FUNCTION(SIN, __gen_ocl_sin)
+DECL_LLVM_GEN_FUNCTION(SQR, __gen_ocl_sqrt)
+DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
+DECL_LLVM_GEN_FUNCTION(LOG, __gen_ocl_log)
+DECL_LLVM_GEN_FUNCTION(EXP, __gen_ocl_exp)
+DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
+DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
+DECL_LLVM_GEN_FUNCTION(RNDZ, __gen_ocl_rndz)
+DECL_LLVM_GEN_FUNCTION(RNDE, __gen_ocl_rnde)
+DECL_LLVM_GEN_FUNCTION(RNDU, __gen_ocl_rndu)
+DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd)
+DECL_LLVM_GEN_FUNCTION(MAD, __gen_ocl_mad)
+DECL_LLVM_GEN_FUNCTION(FMAX, __gen_ocl_fmax)
+DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
+
+// Barrier function
+DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local)
+DECL_LLVM_GEN_FUNCTION(GBARRIER, __gen_ocl_barrier_global)
+DECL_LLVM_GEN_FUNCTION(LGBARRIER, __gen_ocl_barrier_local_and_global)
+
+// To force SIMD8/16 compilation
+DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8, __gen_ocl_force_simd8)
+DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, __gen_ocl_force_simd16)
+
+// To read_image functions.
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_1D, _Z21__gen_ocl_read_imageijtfj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_1D, _Z22__gen_ocl_read_imageuijtfj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_1D, _Z21__gen_ocl_read_imagefjtfj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_2D, _Z21__gen_ocl_read_imageijtffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_2D, _Z22__gen_ocl_read_imageuijtffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_2D, _Z21__gen_ocl_read_imagefjtffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D, _Z21__gen_ocl_read_imageijtfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D, _Z22__gen_ocl_read_imageuijtfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D, _Z21__gen_ocl_read_imagefjtfffj)
+// work around read image with the LD message. The coords are integer type.
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_1D_I, _Z21__gen_ocl_read_imageijtij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_1D_I, _Z22__gen_ocl_read_imageuijtij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_1D_I, _Z21__gen_ocl_read_imagefjtij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_2D_I, _Z21__gen_ocl_read_imageijtiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_2D_I, _Z22__gen_ocl_read_imageuijtiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_2D_I, _Z21__gen_ocl_read_imagefjtiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D_I, _Z21__gen_ocl_read_imageijtiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D_I, _Z22__gen_ocl_read_imageuijtiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D_I, _Z21__gen_ocl_read_imagefjtiiij)
+
+// To write_image functions.
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I_1D, _Z22__gen_ocl_write_imageijiDv4_i)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI_1D, _Z23__gen_ocl_write_imageuijiDv4_j)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F_1D, _Z22__gen_ocl_write_imagefjiDv4_f)
+
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I_2D, _Z22__gen_ocl_write_imageijiiDv4_i)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI_2D, _Z23__gen_ocl_write_imageuijiiDv4_j)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F_2D, _Z22__gen_ocl_write_imagefjiiDv4_f)
+
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I_3D, _Z22__gen_ocl_write_imageijiiiDv4_i)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI_3D, _Z23__gen_ocl_write_imageuijiiiDv4_j)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F_3D, _Z22__gen_ocl_write_imagefjiiiDv4_f)
+
+// To get image info function
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_WIDTH, __gen_ocl_get_image_width)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_HEIGHT, __gen_ocl_get_image_height)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_DEPTH, __gen_ocl_get_image_depth)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_CHANNEL_DATA_TYPE, __gen_ocl_get_image_channel_data_type)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_CHANNEL_ORDER, __gen_ocl_get_image_channel_order)
+
+// atomic related functions.
+DECL_LLVM_GEN_FUNCTION(ATOMIC_ADD0, _Z20__gen_ocl_atomic_addPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_ADD1, _Z20__gen_ocl_atomic_addPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_SUB0, _Z20__gen_ocl_atomic_subPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_SUB1, _Z20__gen_ocl_atomic_subPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_AND0, _Z20__gen_ocl_atomic_andPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_AND1, _Z20__gen_ocl_atomic_andPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_OR0, _Z19__gen_ocl_atomic_orPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_OR1, _Z19__gen_ocl_atomic_orPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_XOR0, _Z20__gen_ocl_atomic_xorPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_XOR1, _Z20__gen_ocl_atomic_xorPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMIN0, _Z21__gen_ocl_atomic_uminPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMIN1, _Z21__gen_ocl_atomic_uminPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMAX0, _Z21__gen_ocl_atomic_umaxPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMAX1, _Z21__gen_ocl_atomic_umaxPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMIN0, _Z21__gen_ocl_atomic_iminPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMIN1, _Z21__gen_ocl_atomic_iminPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMAX0, _Z21__gen_ocl_atomic_imaxPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMAX1, _Z21__gen_ocl_atomic_imaxPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_XCHG0, _Z21__gen_ocl_atomic_xchgPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_XCHG1, _Z21__gen_ocl_atomic_xchgPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_INC0, _Z20__gen_ocl_atomic_incPU3AS1j)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_INC1, _Z20__gen_ocl_atomic_incPU3AS3j)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_DEC0, _Z20__gen_ocl_atomic_decPU3AS1j)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_DEC1, _Z20__gen_ocl_atomic_decPU3AS3j)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_CMPXCHG0, _Z24__gen_ocl_atomic_cmpxchgPU3AS1jjj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_CMPXCHG1, _Z24__gen_ocl_atomic_cmpxchgPU3AS3jjj)
+
+// saturation related functions.
+DECL_LLVM_GEN_FUNCTION(SADD_SAT_CHAR, _Z12ocl_sadd_satcc)
+DECL_LLVM_GEN_FUNCTION(SADD_SAT_SHORT, _Z12ocl_sadd_satss)
+DECL_LLVM_GEN_FUNCTION(SADD_SAT_INT, _Z12ocl_sadd_satii)
+DECL_LLVM_GEN_FUNCTION(SADD_SAT_LONG, _Z12ocl_sadd_satll)
+DECL_LLVM_GEN_FUNCTION(UADD_SAT_CHAR, _Z12ocl_uadd_sathh)
+DECL_LLVM_GEN_FUNCTION(UADD_SAT_SHORT, _Z12ocl_uadd_sattt)
+DECL_LLVM_GEN_FUNCTION(UADD_SAT_INT, _Z12ocl_uadd_satjj)
+DECL_LLVM_GEN_FUNCTION(UADD_SAT_LONG, _Z12ocl_uadd_satmm)
+
+DECL_LLVM_GEN_FUNCTION(SSUB_SAT_CHAR, _Z12ocl_ssub_satcc)
+DECL_LLVM_GEN_FUNCTION(SSUB_SAT_SHORT, _Z12ocl_ssub_satss)
+DECL_LLVM_GEN_FUNCTION(SSUB_SAT_INT, _Z12ocl_ssub_satii)
+DECL_LLVM_GEN_FUNCTION(SSUB_SAT_LONG, _Z12ocl_ssub_satll)
+DECL_LLVM_GEN_FUNCTION(USUB_SAT_CHAR, _Z12ocl_usub_sathh)
+DECL_LLVM_GEN_FUNCTION(USUB_SAT_SHORT, _Z12ocl_usub_sattt)
+DECL_LLVM_GEN_FUNCTION(USUB_SAT_INT, _Z12ocl_usub_satjj)
+DECL_LLVM_GEN_FUNCTION(USUB_SAT_LONG, _Z12ocl_usub_satmm)
+
+DECL_LLVM_GEN_FUNCTION(I64_MAD_SAT, _Z17__gen_ocl_mad_satlll)
+DECL_LLVM_GEN_FUNCTION(I64_MAD_SATU, _Z17__gen_ocl_mad_satmmm)
+
+// integer built-in functions
+DECL_LLVM_GEN_FUNCTION(MUL_HI_INT, _Z16__gen_ocl_mul_hiii)
+DECL_LLVM_GEN_FUNCTION(MUL_HI_UINT, _Z16__gen_ocl_mul_hijj)
+DECL_LLVM_GEN_FUNCTION(MUL_HI_I64, _Z16__gen_ocl_mul_hill)
+DECL_LLVM_GEN_FUNCTION(MUL_HI_UI64, _Z16__gen_ocl_mul_himm)
+DECL_LLVM_GEN_FUNCTION(FBH, __gen_ocl_fbh)
+DECL_LLVM_GEN_FUNCTION(FBL, __gen_ocl_fbl)
+DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_abs)
+DECL_LLVM_GEN_FUNCTION(HADD, _Z14__gen_ocl_haddjj)
+DECL_LLVM_GEN_FUNCTION(RHADD, _Z15__gen_ocl_rhaddjj)
+DECL_LLVM_GEN_FUNCTION(I64HADD, _Z14__gen_ocl_haddmm)
+DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm)
+DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
+DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
+DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
+
+// saturate convert
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U8_TO_I8, _Z16convert_char_sath)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I16_TO_I8, _Z16convert_char_sats)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U16_TO_I8, _Z16convert_char_satt)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_I8, _Z16convert_char_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_I8, _Z16convert_char_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_I8, _Z16convert_char_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I8_TO_U8, _Z17convert_uchar_satc)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I16_TO_U8, _Z17convert_uchar_sats)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U16_TO_U8, _Z17convert_uchar_satt)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_U8, _Z17convert_uchar_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_U8, _Z17convert_uchar_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_U8, _Z17convert_uchar_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U16_TO_I16, _Z17convert_short_satt)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_I16, _Z17convert_short_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_I16, _Z17convert_short_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_I16, _Z17convert_short_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I16_TO_U16, _Z18convert_ushort_sats)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_U16, _Z18convert_ushort_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_U16, _Z18convert_ushort_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_U16, _Z18convert_ushort_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_I32, _Z15convert_int_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_I32, _Z15convert_int_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_U32, _Z16convert_uint_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_U32, _Z16convert_uint_satf)
+
+DECL_LLVM_GEN_FUNCTION(CONV_F16_TO_F32, __gen_ocl_f16to32)
+DECL_LLVM_GEN_FUNCTION(CONV_F32_TO_F16, __gen_ocl_f32to16)
+
+// SIMD level function for internal usage
+DECL_LLVM_GEN_FUNCTION(SIMD_ANY, __gen_ocl_simd_any)
+DECL_LLVM_GEN_FUNCTION(SIMD_ALL, __gen_ocl_simd_all)
+
+// printf function
+DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
+DECL_LLVM_GEN_FUNCTION(PRINTF_BUF_ADDR, __gen_ocl_printf_get_buf_addr)
+DECL_LLVM_GEN_FUNCTION(PRINTF_INDEX_BUF_ADDR, __gen_ocl_printf_get_index_buf_addr)
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
new file mode 100644
index 0000000..7d04318
--- /dev/null
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_intrinisc_lowering.cpp
+ * \author Yang Rong <rong.r.yang at intel.com>
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+ class InstrinsicLowering : public BasicBlockPass
+ {
+ public:
+ static char ID;
+ InstrinsicLowering() :
+ BasicBlockPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+
+ }
+
+ virtual const char *getPassName() const {
+ return "SPIR backend: lowering instrinsics";
+ }
+ static char convertSpaceToName(Value *val) {
+ const uint32_t space = val->getType()->getPointerAddressSpace();
+ switch(space) {
+ case 0:
+ return 'p';
+ case 1:
+ return 'g';
+ case 3:
+ return 'l';
+ default:
+ assert("Non support address space");
+ return '\0';
+ }
+ }
+ static CallInst *replaceCallWith(const char *NewFn, CallInst *CI,
+ Value **ArgBegin, Value **ArgEnd,
+ Type *RetTy)
+ {
+ // If we haven't already looked up this function, check to see if the
+ // program already contains a function with this name.
+ Module *M = CI->getParent()->getParent()->getParent();
+ // Get or insert the definition now.
+ std::vector<Type *> ParamTys;
+ for (Value** I = ArgBegin; I != ArgEnd; ++I)
+ ParamTys.push_back((*I)->getType());
+ Constant* FCache = M->getOrInsertFunction(NewFn,
+ FunctionType::get(RetTy, ParamTys, false));
+
+ IRBuilder<> Builder(CI->getParent(), CI);
+ SmallVector<Value *, 8> Args(ArgBegin, ArgEnd);
+ CallInst *NewCI = Builder.CreateCall(FCache, Args);
+ NewCI->setName(CI->getName());
+ if (!CI->use_empty())
+ CI->replaceAllUsesWith(NewCI);
+ CI->eraseFromParent();
+ return NewCI;
+ }
+ virtual bool runOnBasicBlock(BasicBlock &BB)
+ {
+ bool changedBlock = false;
+ Module *M = BB.getParent()->getParent();
+
+ DataLayout TD(M);
+ LLVMContext &Context = BB.getContext();
+ for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
+ Instruction *Inst = DI++;
+ CallInst* CI = dyn_cast<CallInst>(Inst);
+ if(CI == NULL)
+ continue;
+
+ IRBuilder<> Builder(&BB, CI);
+ // only support memcpy and memset
+ if (Function *F = CI->getCalledFunction()) {
+ const Intrinsic::ID intrinsicID = (Intrinsic::ID) F->getIntrinsicID();
+ if (intrinsicID == 0)
+ continue;
+ switch (intrinsicID) {
+ case Intrinsic::memcpy: {
+ Type *IntPtr = TD.getIntPtrType(Context);
+ Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+ /* isSigned */ false);
+ Value *Ops[3];
+ Ops[0] = CI->getArgOperand(0);
+ Ops[1] = CI->getArgOperand(1);
+ Ops[2] = Size;
+ char name[16] = "__gen_memcpy_xx";
+ name[13] = convertSpaceToName(Ops[0]);
+ name[14] = convertSpaceToName(Ops[1]);
+ replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
+ break;
+ }
+ case Intrinsic::memset: {
+ Value *Op0 = CI->getArgOperand(0);
+ Value *val = Builder.CreateIntCast(CI->getArgOperand(1), IntegerType::getInt8Ty(Context),
+ /* isSigned */ false);
+ Type *IntPtr = TD.getIntPtrType(Op0->getType());
+ Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+ /* isSigned */ false);
+ Value *Ops[3];
+ Ops[0] = Op0;
+ // Extend the amount to i32.
+ Ops[1] = val;
+ Ops[2] = Size;
+ char name[16] = "__gen_memset_x";
+ name[13] = convertSpaceToName(Ops[0]);
+ replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
+ break;
+ }
+ default:
+ continue;
+ }
+ }
+ }
+ return changedBlock;
+ }
+ };
+
+ char InstrinsicLowering::ID = 0;
+
+ BasicBlockPass *createIntrinsicLoweringPass() {
+ return new InstrinsicLowering();
+ }
+} // end namespace
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
new file mode 100644
index 0000000..4bfc7f6
--- /dev/null
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -0,0 +1,272 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Ruiling, Song <ruiling.song at intel.com>
+ *
+ * The Idea is that: As GEN support at most 4 successive DWORD load/store,
+ * then merge successive load/store that are compatible is beneficial.
+ * The method of checking whether two load/store is compatible are borrowed
+ * from Vectorize passes in llvm.
+ */
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+
+#include "llvm/Config/llvm-config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+
+using namespace llvm;
+namespace gbe {
+ class GenLoadStoreOptimization : public BasicBlockPass {
+
+ public:
+ static char ID;
+ ScalarEvolution *SE;
+ const DataLayout *TD;
+ GenLoadStoreOptimization() : BasicBlockPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<ScalarEvolution>();
+ AU.addPreserved<ScalarEvolution>();
+ AU.setPreservesCFG();
+ }
+
+ virtual bool runOnBasicBlock(BasicBlock &BB) {
+ SE = &getAnalysis<ScalarEvolution>();
+ #if LLVM_VERSION_MINOR >= 5
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ TD = DLP ? &DLP->getDataLayout() : nullptr;
+ #else
+ TD = getAnalysisIfAvailable<DataLayout>();
+ #endif
+ return optimizeLoadStore(BB);
+ }
+ Type *getValueType(Value *insn);
+ Value *getPointerOperand(Value *I);
+ unsigned getAddressSpace(Value *I);
+ bool isSimpleLoadStore(Value *I);
+ bool optimizeLoadStore(BasicBlock &BB);
+
+ bool isLoadStoreCompatible(Value *A, Value *B);
+ void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+ void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+ BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
+ SmallVector<Instruction*, 4> &merged,
+ BasicBlock::iterator &start,
+ unsigned maxLimit,
+ bool isLoad);
+
+ virtual const char *getPassName() const {
+ return "Merge compatible Load/stores for Gen";
+ }
+ };
+
+ char GenLoadStoreOptimization::ID = 0;
+
+ Value *GenLoadStoreOptimization::getPointerOperand(Value *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
+ return NULL;
+ }
+ unsigned GenLoadStoreOptimization::getAddressSpace(Value *I) {
+ if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
+ if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
+ return -1;
+ }
+ bool GenLoadStoreOptimization::isSimpleLoadStore(Value *I) {
+ if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->isSimple();
+ if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->isSimple();
+ return false;
+ }
+ Type *GenLoadStoreOptimization::getValueType(Value *insn) {
+ if(LoadInst *ld = dyn_cast<LoadInst>(insn)) return ld->getType();
+ if(StoreInst *st = dyn_cast<StoreInst>(insn)) return st->getValueOperand()->getType();
+
+ return NULL;
+ }
+
+ bool GenLoadStoreOptimization::isLoadStoreCompatible(Value *A, Value *B) {
+ Value *ptrA = getPointerOperand(A);
+ Value *ptrB = getPointerOperand(B);
+ unsigned ASA = getAddressSpace(A);
+ unsigned ASB = getAddressSpace(B);
+
+ // Check that the address spaces match and that the pointers are valid.
+ if (!ptrA || !ptrB || (ASA != ASB)) return false;
+
+ if(!isSimpleLoadStore(A) || !isSimpleLoadStore(B)) return false;
+ // Check that A and B are of the same type.
+ if (ptrA->getType() != ptrB->getType()) return false;
+
+ // Calculate the distance.
+ const SCEV *ptrSCEVA = SE->getSCEV(ptrA);
+ const SCEV *ptrSCEVB = SE->getSCEV(ptrB);
+ const SCEV *offsetSCEV = SE->getMinusSCEV(ptrSCEVA, ptrSCEVB);
+ const SCEVConstant *constOffSCEV = dyn_cast<SCEVConstant>(offsetSCEV);
+
+ // Non constant distance.
+ if (!constOffSCEV) return false;
+
+ int64_t offset = constOffSCEV->getValue()->getSExtValue();
+ Type *Ty = cast<PointerType>(ptrA->getType())->getElementType();
+ // The Instructions are connsecutive if the size of the first load/store is
+ // the same as the offset.
+ int64_t sz = TD->getTypeStoreSize(Ty);
+ return ((-offset) == sz);
+ }
+
+ void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+ IRBuilder<> Builder(&BB);
+
+ unsigned size = merged.size();
+ SmallVector<Value *, 4> values;
+ for(unsigned i = 0; i < size; i++) {
+ values.push_back(merged[i]);
+ }
+ LoadInst *ld = cast<LoadInst>(merged[0]);
+ unsigned align = ld->getAlignment();
+ unsigned addrSpace = ld->getPointerAddressSpace();
+ // insert before first load
+ Builder.SetInsertPoint(ld);
+ VectorType *vecTy = VectorType::get(ld->getType(), size);
+ Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
+ PointerType::get(vecTy, addrSpace));
+ LoadInst *vecValue = Builder.CreateLoad(vecPtr);
+ vecValue->setAlignment(align);
+
+ for (unsigned i = 0; i < size; ++i) {
+ Value *S = Builder.CreateExtractElement(vecValue, Builder.getInt32(i));
+ values[i]->replaceAllUsesWith(S);
+ }
+ }
+
+ BasicBlock::iterator
+ GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
+ SmallVector<Instruction*, 4> &merged,
+ BasicBlock::iterator &start,
+ unsigned maxLimit,
+ bool isLoad) {
+
+ BasicBlock::iterator stepForward = start;
+ if(!isSimpleLoadStore(start)) return stepForward;
+
+ merged.push_back(start);
+
+ BasicBlock::iterator E = BB.end();
+ BasicBlock::iterator J = ++start;
+
+ for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
+ if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
+ if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
+ merged.push_back(J);
+ stepForward = ++J;
+ }
+ } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) {
+ // simple stop to keep read/write order
+ break;
+ }
+
+ if(merged.size() >= 4) break;
+ }
+ return stepForward;
+ }
+
+ void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+ IRBuilder<> Builder(&BB);
+
+ unsigned size = merged.size();
+ SmallVector<Value *, 4> values;
+ for(unsigned i = 0; i < size; i++) {
+ values.push_back(cast<StoreInst>(merged[i])->getValueOperand());
+ }
+ StoreInst *st = cast<StoreInst>(merged[0]);
+ unsigned addrSpace = st->getPointerAddressSpace();
+
+ unsigned align = st->getAlignment();
+ // insert before the last store
+ Builder.SetInsertPoint(merged[size-1]);
+
+ Type *dataTy = st->getValueOperand()->getType();
+ VectorType *vecTy = VectorType::get(dataTy, size);
+ Value * parent = UndefValue::get(vecTy);
+ for(unsigned i = 0; i < size; i++) {
+ parent = Builder.CreateInsertElement(parent, values[i], ConstantInt::get(IntegerType::get(st->getContext(), 32), i));
+ }
+
+ Value *newPtr = Builder.CreateBitCast(st->getPointerOperand(), PointerType::get(vecTy, addrSpace));
+ StoreInst *newST = Builder.CreateStore(parent, newPtr);
+ newST->setAlignment(align);
+ }
+
+ bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
+ bool changed = false;
+ SmallVector<Instruction*, 4> merged;
+ for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;++BBI) {
+ if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
+ bool isLoad = isa<LoadInst>(*BBI) ? true: false;
+ Type *ty = getValueType(BBI);
+ if(ty->isVectorTy()) continue;
+ // we only support DWORD data type merge
+ if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
+ BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
+ if(merged.size() > 1) {
+ if(isLoad)
+ mergeLoad(BB, merged);
+ else
+ mergeStore(BB, merged);
+ // remove merged insn
+ int size = merged.size();
+ for(int i = 0; i < size; i++)
+ merged[i]->eraseFromParent();
+ changed = true;
+ }
+ merged.clear();
+ }
+ }
+ return changed;
+ }
+
+ BasicBlockPass *createLoadStoreOptimizationPass() {
+ return new GenLoadStoreOptimization();
+ }
+};
+
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
new file mode 100644
index 0000000..1a38a0c
--- /dev/null
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -0,0 +1,399 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ * Heldge RHodin <alice.rhodin at alice-dsl.net>
+ */
+
+/**
+ * \file llvm_passes.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ * \author Heldge RHodin <alice.rhodin at alice-dsl.net>
+ */
+
+/* THIS CODE IS DERIVED FROM GPL LLVM PTX BACKEND. CODE IS HERE:
+ * http://sourceforge.net/scm/?type=git&group_id=319085
+ * Note that however, the original author, Heldge Rhodin, granted me (Benjamin
+ * Segovia) the right to use another license for it (MIT here)
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#else
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/InlineAsm.h"
+#else
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InlineAsm.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/ConstantsScanner.h"
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#include "llvm/IR/Mangler.h"
+#else
+#include "llvm/Target/Mangler.h"
+#endif
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#if !defined(LLVM_VERSION_MAJOR) || (LLVM_VERSION_MINOR == 1)
+#include "llvm/Target/TargetData.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/DataLayout.h"
+#endif
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR <= 2)
+#include "llvm/Support/InstVisitor.h"
+#elif LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/InstVisitor.h"
+#else
+#include "llvm/InstVisitor.h"
+#endif
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/SourceMgr.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "ir/unit.hpp"
+#include "sys/map.hpp"
+
+using namespace llvm;
+
+namespace gbe
+{
+ bool isKernelFunction(const llvm::Function &F) {
+ const Module *module = F.getParent();
+ const Module::NamedMDListType& globalMD = module->getNamedMDList();
+ bool bKernel = false;
+ for(auto i = globalMD.begin(); i != globalMD.end(); i++) {
+ const NamedMDNode &md = *i;
+ if(strcmp(md.getName().data(), "opencl.kernels") != 0) continue;
+ uint32_t ops = md.getNumOperands();
+ for(uint32_t x = 0; x < ops; x++) {
+ MDNode* node = md.getOperand(x);
+ Value * op = node->getOperand(0);
+ if(op == &F) bKernel = true;
+ }
+ }
+ return bKernel;
+ }
+
+ uint32_t getPadding(uint32_t offset, uint32_t align) {
+ return (align - (offset % align)) % align;
+ }
+
+ uint32_t getAlignmentByte(const ir::Unit &unit, Type* Ty)
+ {
+ switch (Ty->getTypeID()) {
+ case Type::VoidTyID: NOT_SUPPORTED;
+ case Type::VectorTyID:
+ {
+ const VectorType* VecTy = cast<VectorType>(Ty);
+ uint32_t elemNum = VecTy->getNumElements();
+ if (elemNum == 3) elemNum = 4; // OCL spec
+ return elemNum * getTypeByteSize(unit, VecTy->getElementType());
+ }
+ case Type::PointerTyID:
+ case Type::IntegerTyID:
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ case Type::HalfTyID:
+ return getTypeBitSize(unit, Ty)/8;
+ case Type::ArrayTyID:
+ return getAlignmentByte(unit, cast<ArrayType>(Ty)->getElementType());
+ case Type::StructTyID:
+ {
+ const StructType* StrTy = cast<StructType>(Ty);
+ uint32_t maxa = 0;
+ for(uint32_t subtype = 0; subtype < StrTy->getNumElements(); subtype++)
+ {
+ maxa = std::max(getAlignmentByte(unit, StrTy->getElementType(subtype)), maxa);
+ }
+ return maxa;
+ }
+ default: NOT_SUPPORTED;
+ }
+ return 0u;
+ }
+
+ uint32_t getTypeBitSize(const ir::Unit &unit, Type* Ty)
+ {
+ switch (Ty->getTypeID()) {
+ case Type::VoidTyID: NOT_SUPPORTED;
+ case Type::PointerTyID: return unit.getPointerSize();
+ case Type::IntegerTyID:
+ {
+ // use S16 to represent SLM bool variables.
+ int bitWidth = cast<IntegerType>(Ty)->getBitWidth();
+ return (bitWidth == 1) ? 16 : bitWidth;
+ }
+ case Type::HalfTyID: return 16;
+ case Type::FloatTyID: return 32;
+ case Type::DoubleTyID: return 64;
+ case Type::VectorTyID:
+ {
+ const VectorType* VecTy = cast<VectorType>(Ty);
+ uint32_t numElem = VecTy->getNumElements();
+ if(numElem == 3) numElem = 4; // OCL spec
+ return numElem * getTypeBitSize(unit, VecTy->getElementType());
+ }
+ case Type::ArrayTyID:
+ {
+ const ArrayType* ArrTy = cast<ArrayType>(Ty);
+ Type* elementType = ArrTy->getElementType();
+ uint32_t size_element = getTypeBitSize(unit, elementType);
+ uint32_t size = ArrTy->getNumElements() * size_element;
+ uint32_t align = 8 * getAlignmentByte(unit, elementType);
+ size += (ArrTy->getNumElements()-1) * getPadding(size_element, align);
+ return size;
+ }
+ case Type::StructTyID:
+ {
+ const StructType* StrTy = cast<StructType>(Ty);
+ uint32_t size = 0;
+ for(uint32_t subtype=0; subtype < StrTy->getNumElements(); subtype++)
+ {
+ Type* elementType = StrTy->getElementType(subtype);
+ uint32_t align = 8 * getAlignmentByte(unit, elementType);
+ size += getPadding(size, align);
+ size += getTypeBitSize(unit, elementType);
+ }
+ return size;
+ }
+ default: NOT_SUPPORTED;
+ }
+ return 0u;
+ }
+
+ uint32_t getTypeByteSize(const ir::Unit &unit, Type* Ty)
+ {
+ uint32_t size_bit = getTypeBitSize(unit, Ty);
+ assert((size_bit%8==0) && "no multiple of 8");
+ return size_bit/8;
+ }
+
+ class GenRemoveGEPPasss : public BasicBlockPass
+ {
+
+ public:
+ static char ID;
+#define FORMER_VERSION 0
+#if FORMER_VERSION
+ GenRemoveGEPPasss(map<const Value *, const Value *>&
+ parentCompositePointer)
+ : BasicBlockPass(ID),
+ parentPointers(parentCompositePointer) {}
+ map<const Value *, const Value *>& parentPointers;
+#else
+ GenRemoveGEPPasss(const ir::Unit &unit) :
+ BasicBlockPass(ID),
+ unit(unit) {}
+ const ir::Unit &unit;
+#endif
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ }
+
+ virtual const char *getPassName() const {
+ return "SPIR backend: insert special spir instructions";
+ }
+
+ bool simplifyGEPInstructions(GetElementPtrInst* GEPInst);
+
+ virtual bool runOnBasicBlock(BasicBlock &BB)
+ {
+ bool changedBlock = false;
+ iplist<Instruction>::iterator I = BB.getInstList().begin();
+ for (auto nextI = I, E = --BB.getInstList().end(); I != E; I = nextI) {
+ iplist<Instruction>::iterator I = nextI++;
+ if(GetElementPtrInst* gep = dyn_cast<GetElementPtrInst>(&*I))
+ changedBlock = (simplifyGEPInstructions(gep) || changedBlock);
+ }
+ return changedBlock;
+ }
+ };
+
+ char GenRemoveGEPPasss::ID = 0;
+
+ bool GenRemoveGEPPasss::simplifyGEPInstructions(GetElementPtrInst* GEPInst)
+ {
+ const uint32_t ptrSize = unit.getPointerSize();
+ Value* parentPointer = GEPInst->getOperand(0);
+#if FORMER_VERSION
+ Value* topParent = parentPointer;
+#endif
+ CompositeType* CompTy = cast<CompositeType>(parentPointer->getType());
+
+ Value* currentAddrInst =
+ new PtrToIntInst(parentPointer, IntegerType::get(GEPInst->getContext(), ptrSize), "", GEPInst);
+
+ uint32_t constantOffset = 0;
+
+ for(uint32_t op=1; op<GEPInst->getNumOperands(); ++op)
+ {
+ uint32_t TypeIndex;
+ //we have a constant struct/array acces
+ if(ConstantInt* ConstOP = dyn_cast<ConstantInt>(GEPInst->getOperand(op)))
+ {
+ uint32_t offset = 0;
+ TypeIndex = ConstOP->getZExtValue();
+ if (op == 1) {
+ if (TypeIndex != 0) {
+ Type *elementType = (cast<PointerType>(parentPointer->getType()))->getElementType();
+ uint32_t elementSize = getTypeByteSize(unit, elementType);
+ uint32_t align = getAlignmentByte(unit, elementType);
+ elementSize += getPadding(elementSize, align);
+ offset += elementSize * TypeIndex;
+ }
+ } else {
+ for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
+ {
+ Type* elementType = CompTy->getTypeAtIndex(ty_i);
+ uint32_t align = getAlignmentByte(unit, elementType);
+ offset += getPadding(offset, align);
+ offset += getTypeByteSize(unit, elementType);
+ }
+
+ //add getPaddingding for accessed type
+ const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
+ offset += getPadding(offset, align);
+ }
+
+ constantOffset += offset;
+ }
+ // none constant index (=> only array/verctor allowed)
+ else
+ {
+ // we only have array/vectors here,
+ // therefore all elements have the same size
+ TypeIndex = 0;
+
+ Type* elementType = CompTy->getTypeAtIndex(TypeIndex);
+ uint32_t size = getTypeByteSize(unit, elementType);
+
+ //add padding
+ uint32_t align = getAlignmentByte(unit, elementType);
+ size += getPadding(size, align);
+
+ Constant* newConstSize =
+ ConstantInt::get(IntegerType::get(GEPInst->getContext(), ptrSize), size);
+
+ Value *operand = GEPInst->getOperand(op);
+
+ //HACK TODO: Inserted by type replacement.. this code could break something????
+ if(getTypeByteSize(unit, operand->getType())>4)
+ {
+ GBE_ASSERTM(false, "CHECK IT");
+ operand->dump();
+
+ //previous instruction is sext or zext instr. ignore it
+ CastInst *cast = dyn_cast<CastInst>(operand);
+ if(cast && (isa<ZExtInst>(operand) || isa<SExtInst>(operand)))
+ {
+ //hope that CastInst is a s/zext
+ operand = cast->getOperand(0);
+ }
+ else
+ {
+ //trunctate
+ operand =
+ new TruncInst(operand,
+ IntegerType::get(GEPInst->getContext(),
+ ptrSize),
+ "", GEPInst);
+ }
+ }
+
+ BinaryOperator* tmpMul =
+ BinaryOperator::Create(Instruction::Mul, newConstSize, operand,
+ "", GEPInst);
+ currentAddrInst =
+ BinaryOperator::Create(Instruction::Add, currentAddrInst, tmpMul,
+ "", GEPInst);
+ }
+
+ //step down in type hirachy
+ CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+ }
+
+ //insert addition of new offset before GEPInst
+ Constant* newConstOffset =
+ ConstantInt::get(IntegerType::get(GEPInst->getContext(),
+ ptrSize),
+ constantOffset);
+ currentAddrInst =
+ BinaryOperator::Create(Instruction::Add, currentAddrInst,
+ newConstOffset, "", GEPInst);
+
+ //convert offset to ptr type (nop)
+ IntToPtrInst* intToPtrInst =
+ new IntToPtrInst(currentAddrInst,GEPInst->getType(),"", GEPInst);
+
+ //replace uses of the GEP instruction with the newly calculated pointer
+ GEPInst->replaceAllUsesWith(intToPtrInst);
+ GEPInst->dropAllReferences();
+ GEPInst->eraseFromParent();
+
+#if FORMER_VERSION
+ //insert new pointer into parent list
+ while(parentPointers.find(topParent)!=parentPointers.end())
+ topParent = parentPointers.find(topParent)->second;
+ parentPointers[intToPtrInst] = topParent;
+#endif
+
+ return true;
+ }
+
+ BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit) {
+ return new GenRemoveGEPPasss(unit);
+ }
+} /* namespace gbe */
+
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
new file mode 100644
index 0000000..00e1ef8
--- /dev/null
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -0,0 +1,851 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_printf_parser.cpp
+ *
+ * When there are printf functions existing, we have something to do here.
+ * Because the GPU's feature, it is relatively hard to parse and caculate the
+ * printf's format string. OpenCL 1.2 restrict the format string to be a
+ * constant string and can be decided at compiling time. So we add a pass here
+ * to parse the format string and check whether the parameters is valid.
+ * If all are valid, we will generate the according instruction to store the
+ * parameter content into the printf buffer. And if something is invalid, a
+ * warning is generated and the printf instruction is skipped in order to avoid
+ * GPU error. We also keep the relationship between the printf format and printf
+ * content in GPU's printf buffer here, and use the system's C standard printf to
+ * print the content after kernel executed.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
+#else
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#endif
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Attributes.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+#include "ir/printf.hpp"
+
+using namespace llvm;
+
+namespace gbe
+{
+ using namespace ir;
+
+ /* Return the conversion_specifier if succeed, -1 if failed. */
+ static char __parse_printf_state(char *begin, char *end, char** rend, PrintfState * state)
+ {
+ const char *fmt;
+ state->left_justified = 0;
+ state->sign_symbol = 0; //0 for nothing, 1 for sign, 2 for space.
+ state->alter_form = 0;
+ state->zero_padding = 0;
+ state->vector_n = 0;
+ state->min_width = -1;
+ state->precision = -1;
+ state->length_modifier = 0;
+ state->conversion_specifier = PRINTF_CONVERSION_INVALID;
+ state->out_buf_sizeof_offset = -1;
+
+ fmt = begin;
+
+ if (*fmt != '%')
+ return -1;
+
+#define FMT_PLUS_PLUS do { \
+ if (fmt + 1 <= end) fmt++; \
+ else { \
+ printf("Error, line: %d, fmt > end\n", __LINE__); \
+ return -1; \
+ } \
+ } while(0)
+
+ FMT_PLUS_PLUS;
+
+ // parse the flags.
+ while (*fmt == '-' || *fmt == '+' || *fmt == ' ' || *fmt == '#' || *fmt == '0')
+ switch (*fmt) {
+ case '-':
+ /* The result of the conversion is left-justified within the field. */
+ state->left_justified = 1;
+ FMT_PLUS_PLUS;
+ break;
+ case '+':
+ /* The result of a signed conversion always begins with a plus or minus sign. */
+ state->sign_symbol = 1;
+ FMT_PLUS_PLUS;
+ break;
+ case ' ':
+ /* If the first character of a signed conversion is not a sign, or if a signed
+ conversion results in no characters, a space is prefixed to the result.
+ If the space and + flags both appear,the space flag is ignored. */
+ if (state->sign_symbol == 0) state->sign_symbol = 2;
+ FMT_PLUS_PLUS;
+ break;
+ case '#':
+ /*The result is converted to an alternative form. */
+ state->alter_form = 1;
+ FMT_PLUS_PLUS;
+ break;
+ case '0':
+ if (!state->left_justified) state->zero_padding = 1;
+ FMT_PLUS_PLUS;
+ break;
+ default:
+ break;
+ }
+
+ // The minimum field width
+ while ((*fmt >= '0') && (*fmt <= '9')) {
+ if (state->min_width < 0)
+ state->min_width = 0;
+ state->min_width = state->min_width * 10 + (*fmt - '0');
+ FMT_PLUS_PLUS;
+ }
+
+ // The precision
+ if (*fmt == '.') {
+ FMT_PLUS_PLUS;
+ state->precision = 0;
+ while (*fmt >= '0' && *fmt <= '9') {
+ state->precision = state->precision * 10 + (*fmt - '0');
+ FMT_PLUS_PLUS;
+ }
+ }
+
+ // handle the vector specifier.
+ if (*fmt == 'v') {
+ FMT_PLUS_PLUS;
+ switch (*fmt) {
+ case '2':
+ case '3':
+ case '4':
+ case '8':
+ state->vector_n = *fmt - '0';
+ FMT_PLUS_PLUS;
+ break;
+ case '1':
+ FMT_PLUS_PLUS;
+ if (*fmt == '6') {
+ state->vector_n = 16;
+ FMT_PLUS_PLUS;
+ } else
+ return -1;
+ break;
+ default:
+ //Wrong vector, error.
+ return -1;
+ }
+ }
+
+ // length modifiers
+ if (*fmt == 'h') {
+ FMT_PLUS_PLUS;
+ if (*fmt == 'h') { //hh
+ state->length_modifier = PRINTF_LM_HH;
+ FMT_PLUS_PLUS;
+ } else if (*fmt == 'l') { //hl
+ state->length_modifier = PRINTF_LM_HL;
+ FMT_PLUS_PLUS;
+ } else { //h
+ state->length_modifier = PRINTF_LM_H;
+ }
+ } else if (*fmt == 'l') {
+ state->length_modifier = PRINTF_LM_L;
+ FMT_PLUS_PLUS;
+ }
+
+#define CONVERSION_SPEC_AND_RET(XXX, xxx) \
+ case XXX: \
+ state->conversion_specifier = PRINTF_CONVERSION_##xxx; \
+ FMT_PLUS_PLUS; \
+ *rend = (char *)fmt; \
+ return XXX; \
+ break;
+
+ // conversion specifiers
+ switch (*fmt) {
+ CONVERSION_SPEC_AND_RET('d', D)
+ CONVERSION_SPEC_AND_RET('i', I)
+ CONVERSION_SPEC_AND_RET('o', O)
+ CONVERSION_SPEC_AND_RET('u', U)
+ CONVERSION_SPEC_AND_RET('x', x)
+ CONVERSION_SPEC_AND_RET('X', X)
+ CONVERSION_SPEC_AND_RET('f', f)
+ CONVERSION_SPEC_AND_RET('F', F)
+ CONVERSION_SPEC_AND_RET('e', e)
+ CONVERSION_SPEC_AND_RET('E', E)
+ CONVERSION_SPEC_AND_RET('g', g)
+ CONVERSION_SPEC_AND_RET('G', G)
+ CONVERSION_SPEC_AND_RET('a', a)
+ CONVERSION_SPEC_AND_RET('A', A)
+ CONVERSION_SPEC_AND_RET('c', C)
+ CONVERSION_SPEC_AND_RET('s', S)
+ CONVERSION_SPEC_AND_RET('p', P)
+
+ // %% has been handled
+
+ default:
+ return -1;
+ }
+ }
+
+ static PrintfSet::PrintfFmt* parser_printf_fmt(char* format, int& num)
+ {
+ char* begin;
+ char* end;
+ char* p;
+ char ret_char;
+ char* rend;
+ PrintfState state;
+ PrintfSet::PrintfFmt* printf_fmt = new PrintfSet::PrintfFmt();
+
+ p = format;
+ begin = format;
+ end = format + strlen(format);
+
+ /* Now parse it. */
+ while (*begin) {
+ p = begin;
+
+again:
+ while (p < end && *p != '%') {
+ p++;
+ }
+ if (p < end && p + 1 == end) { // String with % at end.
+ printf("string end with %%\n");
+ goto error;
+ }
+ if (*(p + 1) == '%') { // %%
+ p += 2;
+ goto again;
+ }
+
+ if (p != begin) {
+ std::string s = std::string(begin, size_t(p - begin));
+ printf_fmt->push_back(PrintfSlot(s.c_str()));
+ }
+
+ if (p == end) // finish
+ break;
+
+ /* Now parse the % start conversion_specifier. */
+ ret_char = __parse_printf_state(p, end, &rend, &state);
+ if (ret_char < 0)
+ goto error;
+
+ printf_fmt->push_back(&state);
+ num++;
+
+ if (rend == end)
+ break;
+
+ begin = rend;
+ }
+
+#if 0
+ {
+ int j = 0;
+ for (auto &s : *printf_fmt) {
+ j++;
+ if (s.type == PRINTF_SLOT_TYPE_STATE) {
+ fprintf(stderr, "---- %d ---: state : \n", j);
+ fprintf(stderr, " left_justified : %d\n", s.state->left_justified);
+ fprintf(stderr, " sign_symbol: %d\n", s.state->sign_symbol);
+ fprintf(stderr, " alter_form : %d\n", s.state->alter_form);
+ fprintf(stderr, " zero_padding : %d\n", s.state->zero_padding);
+ fprintf(stderr, " vector_n : %d\n", s.state->vector_n);
+ fprintf(stderr, " min_width : %d\n", s.state->min_width);
+ fprintf(stderr, " precision : %d\n", s.state->precision);
+ fprintf(stderr, " length_modifier : %d\n", s.state->length_modifier);
+ fprintf(stderr, " conversion_specifier : %d\n", s.state->conversion_specifier);
+ } else if (s.type == PRINTF_SLOT_TYPE_STRING) {
+ fprintf(stderr, "---- %d ---: string : %s\n", j, s.str);
+ }
+ }
+ }
+#endif
+
+ return printf_fmt;
+
+error:
+ printf("error format string.\n");
+ delete printf_fmt;
+ return NULL;
+ }
+
+ class PrintfParser : public FunctionPass
+ {
+ public:
+ static char ID;
+ typedef std::pair<Instruction*, bool> PrintfInst;
+ std::vector<PrintfInst> deadprintfs;
+ Module* module;
+ IRBuilder<>* builder;
+ Type* intTy;
+ Value* pbuf_ptr;
+ Value* index_buf_ptr;
+ int out_buf_sizeof_offset;
+ static map<CallInst*, PrintfSet::PrintfFmt*> printfs;
+ int printf_num;
+
+ PrintfParser(void) : FunctionPass(ID)
+ {
+ module = NULL;
+ builder = NULL;
+ intTy = NULL;
+ out_buf_sizeof_offset = 0;
+ printfs.clear();
+ pbuf_ptr = NULL;
+ index_buf_ptr = NULL;
+ printf_num = 0;
+ }
+
+ ~PrintfParser(void)
+ {
+ for (auto &s : printfs) {
+ delete s.second;
+ s.second = NULL;
+ }
+ printfs.clear();
+ }
+
+
+ bool parseOnePrintfInstruction(CallInst *& call);
+ bool generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& dst_type, int& sizeof_size);
+
+ virtual const char *getPassName() const
+ {
+ return "Printf Parser";
+ }
+
+ virtual bool runOnFunction(llvm::Function &F);
+ };
+
+ bool PrintfParser::parseOnePrintfInstruction(CallInst *& call)
+ {
+ CallSite CS(call);
+ CallSite::arg_iterator CI_FMT = CS.arg_begin();
+ int param_num = 0;
+
+ llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(*CI_FMT);
+ llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
+ if (!arg0_ptr) {
+ return false;
+ }
+
+ ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(arg0_ptr->getOperand(0));
+ if (!fmt_arg || !fmt_arg->isCString()) {
+ return false;
+ }
+
+ std::string fmt = fmt_arg->getAsCString();
+
+ PrintfSet::PrintfFmt* printf_fmt = NULL;
+
+ if (!(printf_fmt = parser_printf_fmt((char *)fmt.c_str(), param_num))) {//at lease print something
+ return false;
+ }
+
+ /* iff parameter more than %, error. */
+ /* str_fmt arg0 arg1 ... NULL */
+ if (param_num + 2 < static_cast<int>(call->getNumOperands())) {
+ delete printf_fmt;
+ return false;
+ }
+
+ /* FIXME: Because the OpenCL language do not support va macro, and we do not want
+ to introduce the va_list, va_start and va_end into our code, we just simulate
+ the function calls to caculate the offset caculation here. */
+#define BUILD_CALL_INST(name) \
+ CallInst* name = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction( \
+ "__gen_ocl_get_"#name, \
+ IntegerType::getInt32Ty(module->getContext()), \
+ NULL)))
+
+ BUILD_CALL_INST(group_id2);
+ BUILD_CALL_INST(group_id1);
+ BUILD_CALL_INST(group_id0);
+ BUILD_CALL_INST(global_size2);
+ BUILD_CALL_INST(global_size1);
+ BUILD_CALL_INST(global_size0);
+ BUILD_CALL_INST(local_id2);
+ BUILD_CALL_INST(local_id1);
+ BUILD_CALL_INST(local_id0);
+ BUILD_CALL_INST(local_size2);
+ BUILD_CALL_INST(local_size1);
+ BUILD_CALL_INST(local_size0);
+
+#undef BUILD_CALL_INST
+
+ Value* op0 = NULL;
+ Value* val = NULL;
+ /* calculate offset for later usage.
+ offset = ((local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
+ + (local_id1 + local_size1 * group_id1) * global_size0
+ + (local_id0 + local_size0 * group_id0)) * sizeof(type) */
+
+ // local_size2 * group_id2
+ val = builder->CreateMul(local_size2, group_id2);
+ // local_id2 + local_size2 * group_id2
+ val = builder->CreateAdd(local_id2, val);
+ // global_size1 * global_size0
+ op0 = builder->CreateMul(global_size1, global_size0);
+ // (local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
+ Value* offset1 = builder->CreateMul(val, op0);
+ // local_size1 * group_id1
+ val = builder->CreateMul(local_size1, group_id1);
+ // local_id1 + local_size1 * group_id1
+ val = builder->CreateAdd(local_id1, val);
+ // (local_id1 + local_size1 * group_id1) * global_size_0
+ Value* offset2 = builder->CreateMul(val, global_size0);
+ // local_size0 * group_id0
+ val = builder->CreateMul(local_size0, group_id0);
+ // local_id0 + local_size0 * group_id0
+ val = builder->CreateAdd(local_id0, val);
+ // The total sum
+ val = builder->CreateAdd(val, offset1);
+ Value* offset = builder->CreateAdd(val, offset2);
+
+ /////////////////////////////////////////////////////
+ /* calculate index address.
+ index_addr = (index_offset + offset )* sizeof(int) + index_buf_ptr
+ index_offset = global_size2 * global_size1 * global_size0 * printf_num */
+
+ // global_size2 * global_size1
+ op0 = builder->CreateMul(global_size2, global_size1);
+ // global_size2 * global_size1 * global_size0
+ Value* glXg2Xg3 = builder->CreateMul(op0, global_size0);
+ Value* index_offset = builder->CreateMul(glXg2Xg3, ConstantInt::get(intTy, printf_num));
+ // index_offset + offset
+ op0 = builder->CreateAdd(index_offset, offset);
+ // (index_offset + offset)* sizeof(int)
+ op0 = builder->CreateMul(op0, ConstantInt::get(intTy, sizeof(int)));
+ // Final index address = index_buf_ptr + (index_offset + offset)* sizeof(int)
+ op0 = builder->CreateAdd(index_buf_ptr, op0);
+ Value* index_addr = builder->CreateIntToPtr(op0, Type::getInt32PtrTy(module->getContext(), 1));
+ builder->CreateStore(ConstantInt::get(intTy, 1), index_addr);// The flag
+
+ int i = 1;
+ Value* data_addr = NULL;
+ for (auto &s : *printf_fmt) {
+ if (s.type == PRINTF_SLOT_TYPE_STRING)
+ continue;
+
+ assert(i < static_cast<int>(call->getNumOperands()) - 1);
+
+ Value *out_arg = call->getOperand(i);
+ Type *dst_type = NULL;
+ int sizeof_size = 0;
+ if (!generateOneParameterInst(s, out_arg, dst_type, sizeof_size)) {
+ printf("Printf: %d, parameter %d may have no result because some error\n",
+ printf_num, i - 1);
+ i++;
+ continue;
+ }
+
+ s.state->out_buf_sizeof_offset = out_buf_sizeof_offset;
+ if (!sizeof_size) {
+ i++;
+ continue;
+ }
+
+ assert(dst_type);
+
+ /////////////////////////////////////////////////////
+ /* Calculate the data address.
+ data_addr = data_offset + pbuf_ptr + offset * sizeof(specify)
+ data_offset = global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset
+
+ //global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset */
+ op0 = builder->CreateMul(glXg2Xg3, ConstantInt::get(intTy, out_buf_sizeof_offset));
+ //offset * sizeof(specify)
+ val = builder->CreateMul(offset, ConstantInt::get(intTy, sizeof_size));
+ //data_offset + pbuf_ptr
+ op0 = builder->CreateAdd(pbuf_ptr, op0);
+ op0 = builder->CreateAdd(op0, val);
+ data_addr = builder->CreateIntToPtr(op0, dst_type);
+ builder->CreateStore(out_arg, data_addr);
+
+ out_buf_sizeof_offset += ((sizeof_size + 3) / 4) * 4;
+ i++;
+ }
+
+ CallInst* printf_inst = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
+ "__gen_ocl_printf", Type::getVoidTy(module->getContext()),
+ NULL)));
+ assert(printfs[printf_inst] == NULL);
+ printfs[printf_inst] = printf_fmt;
+ printf_num++;
+ return true;
+ }
+
+ bool PrintfParser::runOnFunction(llvm::Function &F)
+ {
+ bool changed = false;
+ switch (F.getCallingConv()) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+ case CallingConv::PTX_Device:
+ return false;
+ case CallingConv::PTX_Kernel:
+#else
+ case CallingConv::C:
+#endif
+ break;
+ default:
+ GBE_ASSERTM(false, "Unsupported calling convention");
+ }
+
+ module = F.getParent();
+ intTy = IntegerType::get(module->getContext(), 32);
+
+ // As we inline all function calls, so skip non-kernel functions
+ bool bKernel = isKernelFunction(F);
+ if(!bKernel) return false;
+
+ builder = new IRBuilder<>(module->getContext());
+
+ /* Iter the function and find printf. */
+ for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
+ for (BasicBlock::iterator instI = B->begin(),
+ instE = B->end(); instI != instE; ++instI) {
+
+ llvm::CallInst* call = dyn_cast<llvm::CallInst>(instI);
+ if (!call) {
+ continue;
+ }
+
+ if (call->getCalledFunction()->getIntrinsicID() != 0)
+ continue;
+
+ Value *Callee = call->getCalledValue();
+ const std::string fnName = Callee->getName();
+
+ if (fnName != "__gen_ocl_printf_stub")
+ continue;
+
+ changed = true;
+
+ builder->SetInsertPoint(call);
+
+ if (!pbuf_ptr) {
+ /* alloc a new buffer ptr to collect the print output. */
+ Type *ptrTy = Type::getInt32PtrTy(module->getContext());
+ llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_buf"), ptrTy);
+ pbuf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
+ }
+ if (!index_buf_ptr) {
+ Type *ptrTy = Type::getInt32PtrTy(module->getContext());
+ llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_index_buf"), ptrTy);
+ index_buf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
+ }
+
+ deadprintfs.push_back(PrintfInst(cast<Instruction>(call),parseOnePrintfInstruction(call)));
+ }
+ }
+
+ /* Replace the instruction's operand if using printf's return value. */
+ for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
+ for (BasicBlock::iterator instI = B->begin(),
+ instE = B->end(); instI != instE; ++instI) {
+
+ for (unsigned i = 0; i < instI->getNumOperands(); i++) {
+ for (auto &prf : deadprintfs) {
+ if (instI->getOperand(i) == prf.first) {
+
+ if (prf.second == true) {
+ instI->setOperand(i, ConstantInt::get(intTy, 0));
+ } else {
+ instI->setOperand(i, ConstantInt::get(intTy, -1));
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /* Kill the dead printf instructions. */
+ for (auto &prf : deadprintfs) {
+ prf.first->dropAllReferences();
+ if (prf.first->use_empty())
+ prf.first->eraseFromParent();
+ }
+
+ deadprintfs.clear();
+ delete builder;
+
+ return changed;
+ }
+
+ bool PrintfParser::generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& dst_type, int& sizeof_size)
+ {
+ assert(slot.type == PRINTF_SLOT_TYPE_STATE);
+ assert(builder);
+
+ /* Check whether the arg match the format specifer. If needed, some
+ conversion need to be applied. */
+ switch (arg->getType()->getTypeID()) {
+ case Type::IntegerTyID: {
+ bool sign = false;
+ switch (slot.state->conversion_specifier) {
+ case PRINTF_CONVERSION_I:
+ case PRINTF_CONVERSION_D:
+ sign = true;
+ case PRINTF_CONVERSION_O:
+ case PRINTF_CONVERSION_U:
+ case PRINTF_CONVERSION_x:
+ case PRINTF_CONVERSION_X:
+ /* If the bits change, we need to consider the signed. */
+ if (arg->getType() != Type::getInt32Ty(module->getContext())) {
+ arg = builder->CreateIntCast(arg, Type::getInt32Ty(module->getContext()), sign);
+ }
+
+ /* Int to Int, just store. */
+ dst_type = Type::getInt32PtrTy(module->getContext(), 1);
+ sizeof_size = sizeof(int);
+ return true;
+
+ case PRINTF_CONVERSION_C:
+ /* Int to Char, add a conversion. */
+ arg = builder->CreateIntCast(arg, Type::getInt8Ty(module->getContext()), false);
+ dst_type = Type::getInt8PtrTy(module->getContext(), 1);
+ sizeof_size = sizeof(char);
+ return true;
+
+ case PRINTF_CONVERSION_F:
+ case PRINTF_CONVERSION_f:
+ case PRINTF_CONVERSION_E:
+ case PRINTF_CONVERSION_e:
+ case PRINTF_CONVERSION_G:
+ case PRINTF_CONVERSION_g:
+ case PRINTF_CONVERSION_A:
+ case PRINTF_CONVERSION_a:
+ printf("Warning: Have a float paramter for %%d like specifier, take care of it\n");
+ arg = builder->CreateSIToFP(arg, Type::getFloatTy(module->getContext()));
+ dst_type = Type::getFloatPtrTy(module->getContext(), 1);
+ sizeof_size = sizeof(float);
+ return true;
+
+ case PRINTF_CONVERSION_S:
+ /* Here, the case is printf("xxx%s", 0); we should output the null. */
+ sizeof_size = 0;
+ slot.state->str = "(null)";
+ return true;
+
+ default:
+ return false;
+ }
+
+ break;
+ }
+
+ case Type::DoubleTyID:
+ case Type::FloatTyID: {
+ /* Because the printf is a variable parameter function, it does not have the
+ function prototype, so the compiler will always promote the arg to the
+ longest precise type for float. So here, we can always find it is double. */
+ switch (slot.state->conversion_specifier) {
+ case PRINTF_CONVERSION_I:
+ case PRINTF_CONVERSION_D:
+ /* Float to Int, add a conversion. */
+ printf("Warning: Have a int paramter for %%f like specifier, take care of it\n");
+ arg = builder->CreateFPToSI(arg, Type::getInt32Ty(module->getContext()));
+ dst_type = Type::getInt32PtrTy(module->getContext(), 1);
+ sizeof_size = sizeof(int);
+ return true;
+
+ case PRINTF_CONVERSION_O:
+ case PRINTF_CONVERSION_U:
+ case PRINTF_CONVERSION_x:
+ case PRINTF_CONVERSION_X:
+ /* Float to uint, add a conversion. */
+ printf("Warning: Have a uint paramter for %%f like specifier, take care of it\n");
+ arg = builder->CreateFPToUI(arg, Type::getInt32Ty(module->getContext()));
+ dst_type = Type::getInt32PtrTy(module->getContext(), 1);
+ sizeof_size = sizeof(int);
+ return true;
+
+ case PRINTF_CONVERSION_F:
+ case PRINTF_CONVERSION_f:
+ case PRINTF_CONVERSION_E:
+ case PRINTF_CONVERSION_e:
+ case PRINTF_CONVERSION_G:
+ case PRINTF_CONVERSION_g:
+ case PRINTF_CONVERSION_A:
+ case PRINTF_CONVERSION_a:
+ arg = builder->CreateFPCast(arg, Type::getFloatTy(module->getContext()));
+ dst_type = Type::getFloatPtrTy(module->getContext(), 1);
+ sizeof_size = sizeof(float);
+ return true;
+
+ default:
+ return false;
+ }
+
+ break;
+ }
+
+ /* %p and %s */
+ case Type::PointerTyID:
+ switch (slot.state->conversion_specifier) {
+ case PRINTF_CONVERSION_S: {
+ llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(arg);
+ llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
+ if (!arg0_ptr) {
+ return false;
+ }
+
+ ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(arg0_ptr->getOperand(0));
+ if (!fmt_arg || !fmt_arg->isCString()) {
+ return false;
+ }
+ sizeof_size = 0;
+ slot.state->str = fmt_arg->getAsCString();
+ return true;
+ }
+ case PRINTF_CONVERSION_P: {
+ arg = builder->CreatePtrToInt(arg, Type::getInt32Ty(module->getContext()));
+ dst_type = arg->getType()->getPointerTo(1);
+ sizeof_size = sizeof(int);
+ return true;
+ }
+ default:
+ return false;
+ }
+
+ break;
+
+ case Type::VectorTyID: {
+ Type* vect_type = arg->getType();
+ Type* elt_type = vect_type->getVectorElementType();
+ int vec_num = vect_type->getVectorNumElements();
+ bool sign = false;
+
+ if (vec_num != slot.state->vector_n) {
+ return false;
+ }
+
+ switch (slot.state->conversion_specifier) {
+ case PRINTF_CONVERSION_I:
+ case PRINTF_CONVERSION_D:
+ sign = true;
+ case PRINTF_CONVERSION_O:
+ case PRINTF_CONVERSION_U:
+ case PRINTF_CONVERSION_x:
+ case PRINTF_CONVERSION_X:
+ if (elt_type->getTypeID() != Type::IntegerTyID)
+ return false;
+
+ /* If the bits change, we need to consider the signed. */
+ if (elt_type != Type::getInt32Ty(elt_type->getContext())) {
+ Value *II = NULL;
+ for (int i = 0; i < vec_num; i++) {
+ Value *vec = II ? II : UndefValue::get(VectorType::get(Type::getInt32Ty(elt_type->getContext()), vec_num));
+ Value *cv = ConstantInt::get(Type::getInt32Ty(elt_type->getContext()), i);
+ Value *org = builder->CreateExtractElement(arg, cv);
+ Value *cvt = builder->CreateIntCast(org, Type::getInt32Ty(module->getContext()), sign);
+ II = builder->CreateInsertElement(vec, cvt, cv);
+ }
+ arg = II;
+ }
+
+ dst_type = arg->getType()->getPointerTo(1);
+ sizeof_size = sizeof(int) * vec_num;
+ return true;
+
+ case PRINTF_CONVERSION_F:
+ case PRINTF_CONVERSION_f:
+ case PRINTF_CONVERSION_E:
+ case PRINTF_CONVERSION_e:
+ case PRINTF_CONVERSION_G:
+ case PRINTF_CONVERSION_g:
+ case PRINTF_CONVERSION_A:
+ case PRINTF_CONVERSION_a:
+ if (elt_type->getTypeID() != Type::DoubleTyID && elt_type->getTypeID() != Type::FloatTyID)
+ return false;
+
+ if (elt_type->getTypeID() != Type::FloatTyID) {
+ Value *II = NULL;
+ for (int i = 0; i < vec_num; i++) {
+ Value *vec = II ? II : UndefValue::get(VectorType::get(Type::getFloatTy(elt_type->getContext()), vec_num));
+ Value *cv = ConstantInt::get(Type::getInt32Ty(elt_type->getContext()), i);
+ Value *org = builder->CreateExtractElement(arg, cv);
+ Value* cvt = builder->CreateFPCast(org, Type::getFloatTy(module->getContext()));
+ II = builder->CreateInsertElement(vec, cvt, cv);
+ }
+ arg = II;
+ }
+ }
+ dst_type = arg->getType()->getPointerTo(1);
+ sizeof_size = sizeof(int) * vec_num;
+ return true;
+ }
+
+ default:
+ return false;
+ }
+
+ return false;
+ }
+
+ map<CallInst*, PrintfSet::PrintfFmt*> PrintfParser::printfs;
+
+ void* getPrintfInfo(CallInst* inst)
+ {
+ if (PrintfParser::printfs[inst])
+ return (void*)PrintfParser::printfs[inst];
+ return NULL;
+ }
+
+ FunctionPass* createPrintfParserPass()
+ {
+ return new PrintfParser();
+ }
+ char PrintfParser::ID = 0;
+
+} // end namespace
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
new file mode 100644
index 0000000..3e48fbf
--- /dev/null
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -0,0 +1,878 @@
+/**
+ * \file llvm_scalarize.cpp
+ *
+ * This file is derived from:
+ * https://code.google.com/p/lunarglass/source/browse/trunk/Core/Passes/Transforms/Scalarize.cpp?r=903
+ */
+
+//===- Scalarize.cpp - Scalarize LunarGLASS IR ----------------------------===//
+//
+// LunarGLASS: An Open Modular Shader Compiler Architecture
+// Copyright (C) 2010-2014 LunarG, Inc.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following
+// disclaimer in the documentation and/or other materials provided
+// with the distribution.
+//
+// Neither the name of LunarG Inc. nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+// COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//===----------------------------------------------------------------------===//
+//
+// Author: Michael Ilseman, LunarG
+//
+//===----------------------------------------------------------------------===//
+//
+// Scalarize the IR.
+// * Loads of uniforms become multiple loadComponent calls
+//
+// * Reads/writes become read/writeComponent calls
+//
+// * Component-wise operations become multiple ops over each component
+//
+// * Texture call become recomponsed texture calls
+//
+// * Vector ops disappear, with their users referring to the scalarized
+// * components
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/llvm-config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
+#else
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#endif
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+using namespace llvm;
+
+namespace gbe {
+
+ struct VectorValues {
+ VectorValues() : vals()
+ { }
+
+ void setComponent(int c, llvm::Value* val)
+ {
+ assert(c >= 0 && c < 16 && "Out of bounds component");
+ vals[c] = val;
+ }
+ llvm::Value* getComponent(int c)
+ {
+ assert(c >= 0 && c < 16 && "Out of bounds component");
+ assert(vals[c] && "Requesting non-existing component");
+ return vals[c];
+ }
+
+ // {Value* x, Value* y, Value* z, Value* w}
+ llvm::Value* vals[16];
+ };
+
+ class Scalarize : public FunctionPass {
+
+ public:
+ // Standard pass stuff
+ static char ID;
+
+ Scalarize() : FunctionPass(ID)
+ {
+ initializeLoopInfoPass(*PassRegistry::getPassRegistry());
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+ initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
+#else
+ initializeDominatorTreePass(*PassRegistry::getPassRegistry());
+#endif
+ }
+
+ virtual bool runOnFunction(Function&);
+ void print(raw_ostream&, const Module* = 0) const;
+ virtual void getAnalysisUsage(AnalysisUsage&) const;
+
+ protected:
+ // An instruction is valid post-scalarization iff it is fully scalar or it
+ // is a gla_loadn
+ bool isValid(const Instruction*);
+
+ // Take an instruction that produces a vector, and scalarize it
+ bool scalarize(Instruction*);
+ bool scalarizePerComponent(Instruction*);
+ bool scalarizeBitCast(BitCastInst *);
+ bool scalarizeFuncCall(CallInst *);
+ bool scalarizeLoad(LoadInst*);
+ bool scalarizeStore(StoreInst*);
+ //bool scalarizeIntrinsic(IntrinsicInst*);
+ bool scalarizeExtract(ExtractElementInst*);
+ bool scalarizeInsert(InsertElementInst*);
+ bool scalarizeShuffleVector(ShuffleVectorInst*);
+ bool scalarizePHI(PHINode*);
+ void scalarizeArgs(Function& F);
+ // ...
+
+ // Helpers to make the actual multiple scalar calls, one per
+ // component. Updates the given VectorValues's components with the new
+ // Values.
+ void makeScalarizedCalls(Function*, ArrayRef<Value*>, int numComponents, VectorValues&);
+
+ void makePerComponentScalarizedCalls(Instruction*, ArrayRef<Value*>);
+
+ // Makes a scalar form of the given instruction: replaces the operands
+ // and chooses a correct return type
+ Instruction* createScalarInstruction(Instruction* inst, ArrayRef<Value*>);
+
+ // Gather the specified components in the given values. Returns the
+ // component if the given value is a vector, or the scalar itself.
+ void gatherComponents(int component, ArrayRef<Value*> args, SmallVectorImpl<Value*>& componentArgs);
+
+ // Get the assigned component for that value. If the value is a scalar,
+ // returns the scalar. If it's a constant, returns that component. If
+ // it's an instruction, returns the vectorValues of that instruction for
+ // that component
+ Value* getComponent(int component, Value*);
+
+ // Used for assertion purposes. Whether we can get the component out with
+ // a getComponent call
+ bool canGetComponent(Value*);
+
+ // Used for assertion purposes. Whether for every operand we can get
+ // components with a getComponent call
+ bool canGetComponentArgs(User*);
+
+ // Delete the instruction in the deadList
+ void dce();
+
+
+ int GetConstantInt(const Value* value);
+ bool IsPerComponentOp(const Instruction* inst);
+ bool IsPerComponentOp(const Value* value);
+
+ //these function used to add extract and insert instructions when load/store etc.
+ void extractFromVector(Value* insn);
+ Value* InsertToVector(Value* insn, Value* vecValue);
+
+ Type* GetBasicType(Value* value) {
+ return GetBasicType(value->getType());
+ }
+
+ Type* GetBasicType(Type* type) {
+ switch(type->getTypeID()) {
+ case Type::VectorTyID:
+ case Type::ArrayTyID:
+ return GetBasicType(type->getContainedType(0));
+ default:
+ break;
+ }
+ return type;
+ }
+
+ int GetComponentCount(const Type* type) {
+ if (type->getTypeID() == Type::VectorTyID)
+ return llvm::dyn_cast<VectorType>(type)->getNumElements();
+ else
+ return 1;
+ }
+
+ int GetComponentCount(const Value* value) {
+ return GetComponentCount(value->getType());
+ }
+
+ /* set to insert new instructions after the specified instruction.*/
+ void setAppendPoint(Instruction *insn) {
+ BasicBlock::iterator next(insn);
+ builder->SetInsertPoint(++next);
+ }
+
+ DenseMap<Value*, VectorValues> vectorVals;
+ Module* module;
+ IRBuilder<>* builder;
+
+ Type* intTy;
+ Type* floatTy;
+
+ std::vector<Instruction*> deadList;
+
+ // List of vector phis that were not completely scalarized because some
+ // of their operands hadn't before been visited (i.e. loop variant
+ // variables)
+ SmallVector<PHINode*, 16> incompletePhis;
+ };
+
+ Value* Scalarize::getComponent(int component, Value* v)
+ {
+ assert(canGetComponent(v) && "getComponent called on unhandled vector");
+
+ if (v->getType()->isVectorTy()) {
+ if (ConstantDataVector* c = dyn_cast<ConstantDataVector>(v)) {
+ return c->getElementAsConstant(component);
+ } else if (ConstantVector* c = dyn_cast<ConstantVector>(v)) {
+ return c->getOperand(component);
+ } else if (isa<ConstantAggregateZero>(v)) {
+ return Constant::getNullValue(GetBasicType(v));
+ } else if (isa<UndefValue>(v)) {
+ return UndefValue::get(GetBasicType(v));
+ } else {
+ return vectorVals[v].getComponent(component);
+ }
+ } else {
+ return v;
+ }
+ }
+
+ bool IsPerComponentOp(const llvm::Value* value)
+ {
+ const llvm::Instruction* inst = llvm::dyn_cast<const llvm::Instruction>(value);
+ return inst && IsPerComponentOp(inst);
+ }
+
+ bool Scalarize::IsPerComponentOp(const Instruction* inst)
+ {
+ //if (const IntrinsicInst* intr = dyn_cast<const IntrinsicInst>(inst))
+ // return IsPerComponentOp(intr);
+
+ if (inst->isTerminator())
+ return false;
+
+ switch (inst->getOpcode()) {
+
+ // Cast ops are only per-component if they cast back to the same vector
+ // width
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ return GetComponentCount(inst->getOperand(0)) == GetComponentCount(inst);
+
+ // Vector ops
+ case Instruction::InsertElement:
+ case Instruction::ExtractElement:
+ case Instruction::ShuffleVector:
+
+ // Ways of accessing/loading/storing vectors
+ case Instruction::ExtractValue:
+ case Instruction::InsertValue:
+
+ // Memory ops
+ case Instruction::Alloca:
+ case Instruction::Load:
+ case Instruction::Store:
+ case Instruction::GetElementPtr:
+ // Phis are a little special. We consider them not to be per-component
+ // because the mechanism of choice is a single value (what path we took to
+ // get here), and doesn't choose per-component (as select would). The caller
+ // should know to handle phis specially
+ case Instruction::PHI:
+ // Call insts, conservatively are no per-component
+ case Instruction::Call:
+ // Misc
+ case Instruction::LandingPad: //--- 3.0
+ case Instruction::VAArg:
+ return false;
+ } // end of switch (inst->getOpcode())
+
+ return true;
+ }
+ int Scalarize::GetConstantInt(const Value* value)
+ {
+ const ConstantInt *constantInt = dyn_cast<ConstantInt>(value);
+
+ // this might still be a constant expression, rather than a numeric constant,
+ // e.g., expression with undef's in it, so it was not folded
+ if (! constantInt)
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("non-simple constant");
+
+ return constantInt->getValue().getSExtValue();
+ }
+ bool Scalarize::canGetComponent(Value* v)
+ {
+ if (v->getType()->isVectorTy()) {
+ if (isa<ConstantDataVector>(v) || isa<ConstantVector>(v) || isa<ConstantAggregateZero>(v) || isa<UndefValue>(v)) {
+ return true;
+ } else {
+ assert((isa<Instruction>(v) || isa<Argument>(v)) && "Non-constant non-instuction?");
+ return vectorVals.count(v);
+ }
+ } else {
+ return true;
+ }
+ }
+
+ bool Scalarize::canGetComponentArgs(User* u)
+ {
+ if (PHINode* phi = dyn_cast<PHINode>(u)) {
+ for (unsigned int i = 0; i < phi->getNumIncomingValues(); ++i)
+ if (!canGetComponent(phi->getIncomingValue(i)))
+ return false;
+ } else {
+ for (User::op_iterator i = u->op_begin(), e = u->op_end(); i != e; ++i)
+ if (!canGetComponent(*i))
+ return false;
+ }
+ return true;
+ }
+
+ void Scalarize::gatherComponents(int component, ArrayRef<Value*> args, SmallVectorImpl<Value*>& componentArgs)
+ {
+ componentArgs.clear();
+ for (ArrayRef<Value*>::iterator i = args.begin(), e = args.end(); i != e; ++i)
+ componentArgs.push_back(getComponent(component, *i));
+ }
+
+ Instruction* Scalarize::createScalarInstruction(Instruction* inst, ArrayRef<Value*> args)
+ {
+ // TODO: Refine the below into one large switch
+
+ unsigned op = inst->getOpcode();
+ if (inst->isCast()) {
+ assert(args.size() == 1 && "incorrect number of arguments for cast op");
+ return CastInst::Create((Instruction::CastOps)op, args[0], GetBasicType(inst));
+ }
+
+ if (inst->isBinaryOp()) {
+ assert(args.size() == 2 && "incorrect number of arguments for binary op");
+ return BinaryOperator::Create((Instruction::BinaryOps)op, args[0], args[1]);
+ }
+
+ if (PHINode* phi = dyn_cast<PHINode>(inst)) {
+ PHINode* res = PHINode::Create(GetBasicType(inst), phi->getNumIncomingValues());
+
+ // Loop over pairs of operands: [Value*, BasicBlock*]
+ for (unsigned int i = 0; i < args.size(); i++) {
+ BasicBlock* bb = phi->getIncomingBlock(i); //dyn_cast<BasicBlock>(args[i+1]);
+ //assert(bb && "Non-basic block incoming block?");
+ res->addIncoming(args[i], bb);
+ }
+
+ return res;
+ }
+
+ if (CmpInst* cmpInst = dyn_cast<CmpInst>(inst)) {
+ assert(args.size() == 2 && "incorrect number of arguments for comparison");
+ return CmpInst::Create(cmpInst->getOpcode(), cmpInst->getPredicate(), args[0], args[1]);
+ }
+
+ if (isa<SelectInst>(inst)) {
+ assert(args.size() == 3 && "incorrect number of arguments for select");
+ return SelectInst::Create(args[0], args[1], args[2]);
+ }
+
+ if (IntrinsicInst* intr = dyn_cast<IntrinsicInst>(inst)) {
+ if (! IsPerComponentOp(inst))
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Scalarize instruction on a non-per-component intrinsic");
+
+ // TODO: Assumption is that all per-component intrinsics have all their
+ // arguments be overloadable. Need to find some way to assert on this
+ // assumption. This is due to how getDeclaration operates; it only takes
+ // a list of types that fit overloadable slots.
+ SmallVector<Type*, 8> tys(1, GetBasicType(inst->getType()));
+ // Call instructions have the decl as a last argument, so skip it
+ for (ArrayRef<Value*>::iterator i = args.begin(), e = args.end() - 1; i != e; ++i) {
+ tys.push_back(GetBasicType((*i)->getType()));
+ }
+
+ Function* f = Intrinsic::getDeclaration(module, intr->getIntrinsicID(), tys);
+ return CallInst::Create(f, args);
+ }
+
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unsupported instruction: ", inst->getOpcode(),
+ // inst->getOpcodeName());
+ return 0;
+
+ }
+
+
+ void Scalarize::makeScalarizedCalls(Function* f, ArrayRef<Value*> args, int count, VectorValues& vVals)
+ {
+ assert(count > 0 && count <= 16 && "invalid number of vector components");
+ for (int i = 0; i < count; ++i) {
+ Value* res;
+ SmallVector<Value*, 8> callArgs(args.begin(), args.end());
+ callArgs.push_back(ConstantInt::get(intTy, i));
+
+ res = builder->CreateCall(f, callArgs);
+ vVals.setComponent(i, res);
+ }
+ }
+
+ void Scalarize::makePerComponentScalarizedCalls(Instruction* inst, ArrayRef<Value*> args)
+ {
+ int count = GetComponentCount(inst);
+ assert(count > 0 && count <= 16 && "invalid number of vector components");
+ assert((inst->getNumOperands() == args.size() || isa<PHINode>(inst))
+ && "not enough arguments passed for instruction");
+
+ VectorValues& vVals = vectorVals[inst];
+
+ for (int i = 0; i < count; ++i) {
+ // Set this component of each arg
+ SmallVector<Value*, 8> callArgs(args.size(), 0);
+ gatherComponents(i, args, callArgs);
+
+ Instruction* res = createScalarInstruction(inst, callArgs);
+
+ vVals.setComponent(i, res);
+ builder->Insert(res);
+ }
+ }
+
+ bool Scalarize::isValid(const Instruction* inst)
+ {
+ // The result
+ if (inst->getType()->isVectorTy())
+ return false;
+
+ // The arguments
+ for (Instruction::const_op_iterator i = inst->op_begin(), e = inst->op_end(); i != e; ++i) {
+ const Value* v = (*i);
+ assert(v);
+ if (v->getType()->isVectorTy())
+ return false;
+ }
+
+ return true;
+ }
+
+ bool Scalarize::scalarize(Instruction* inst)
+ {
+ if (isValid(inst))
+ return false;
+
+ assert(! vectorVals.count(inst) && "We've already scalarized this somehow?");
+ assert((canGetComponentArgs(inst) || isa<PHINode>(inst)) &&
+ "Scalarizing an op whose arguments haven't been scalarized ");
+ builder->SetInsertPoint(inst);
+
+ if (IsPerComponentOp(inst))
+ return scalarizePerComponent(inst);
+
+ //not Per Component bitcast, for example <2 * i8> -> i16, handle it in backend
+ if (BitCastInst* bt = dyn_cast<BitCastInst>(inst))
+ return scalarizeBitCast(bt);
+
+ if (LoadInst* ld = dyn_cast<LoadInst>(inst))
+ return scalarizeLoad(ld);
+
+ if (CallInst* call = dyn_cast<CallInst>(inst))
+ return scalarizeFuncCall(call);
+
+ if (ExtractElementInst* extr = dyn_cast<ExtractElementInst>(inst))
+ return scalarizeExtract(extr);
+
+ if (InsertElementInst* ins = dyn_cast<InsertElementInst>(inst))
+ return scalarizeInsert(ins);
+
+ if (ShuffleVectorInst* sv = dyn_cast<ShuffleVectorInst>(inst))
+ return scalarizeShuffleVector(sv);
+
+ if (PHINode* phi = dyn_cast<PHINode>(inst))
+ return scalarizePHI(phi);
+
+ if (isa<ExtractValueInst>(inst) || isa<InsertValueInst>(inst))
+ // TODO: need to come up with a struct/array model for scalarization
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Scalarizing struct/array ops");
+
+ if (StoreInst* st = dyn_cast<StoreInst>(inst))
+ return scalarizeStore(st);
+
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unhandled instruction ", inst->getOpcode(), inst->getOpcodeName());
+ return false;
+ }
+
+ bool Scalarize::scalarizeShuffleVector(ShuffleVectorInst* sv)
+ {
+ // %res = shuffleVector <n x ty> %foo, <n x ty> bar, <n x i32> <...>
+ // ==> nothing (just make a new VectorValues with the new components)
+ VectorValues& vVals = vectorVals[sv];
+
+ int size = GetComponentCount(sv);
+ int srcSize = GetComponentCount(sv->getOperand(0)->getType());
+
+ for (int i = 0; i < size; ++i) {
+ int select = sv->getMaskValue(i);
+
+ if (select < 0) {
+ vVals.setComponent(i, UndefValue::get(GetBasicType(sv->getOperand(0))));
+ continue;
+ }
+
+ // Otherwise look up the corresponding component from the correct
+ // source.
+ Value* selectee;
+ if (select < srcSize) {
+ selectee = sv->getOperand(0);
+ } else {
+ // Choose from the second operand
+ select -= srcSize;
+ selectee = sv->getOperand(1);
+ }
+
+ vVals.setComponent(i, getComponent(select, selectee));
+ }
+
+ return true;
+ }
+
+ bool Scalarize::scalarizePerComponent(Instruction* inst)
+ {
+ // dst = op <n x ty> %foo, <n x ty> %bar
+ // ==> dstx = op ty %foox, ty %barx
+ // dsty = op ty %fooy, ty %bary
+ // ...
+
+ SmallVector<Value*, 16> args(inst->op_begin(), inst->op_end());
+
+ makePerComponentScalarizedCalls(inst, args);
+
+ return true;
+ }
+
+ bool Scalarize::scalarizePHI(PHINode* phi)
+ {
+ // dst = phi <n x ty> [ %foo, %bb1 ], [ %bar, %bb2], ...
+ // ==> dstx = phi ty [ %foox, %bb1 ], [ %barx, %bb2], ...
+ // dsty = phi ty [ %fooy, %bb1 ], [ %bary, %bb2], ...
+
+ // If the scalar values are all known up-front, then just make the full
+ // phinode now. If they are not yet known (phinode for a loop variant
+ // variable), then deferr the arguments until later
+
+ if (canGetComponentArgs(phi)) {
+ SmallVector<Value*, 8> args(phi->op_begin(), phi->op_end());
+ makePerComponentScalarizedCalls(phi, args);
+ } else {
+ makePerComponentScalarizedCalls(phi, ArrayRef<Value*>());
+ incompletePhis.push_back(phi);
+ }
+
+ return true;
+ }
+
+ void Scalarize::extractFromVector(Value* insn) {
+ VectorValues& vVals = vectorVals[insn];
+
+ for (int i = 0; i < GetComponentCount(insn); ++i) {
+ Value *cv = ConstantInt::get(intTy, i);
+ Value *EI = builder->CreateExtractElement(insn, cv);
+ vVals.setComponent(i, EI);
+ }
+ }
+
+ Value* Scalarize::InsertToVector(Value * insn, Value* vecValue) {
+ //VectorValues& vVals = vectorVals[writeValue];
+
+ //add fake insert instructions to avoid removed
+ Value *II = NULL;
+ for (int i = 0; i < GetComponentCount(vecValue); ++i) {
+ Value *vec = II ? II : UndefValue::get(vecValue->getType());
+ Value *cv = ConstantInt::get(intTy, i);
+ II = builder->CreateInsertElement(vec, getComponent(i, vecValue), cv);
+ }
+
+ return II;
+ }
+
+ bool Scalarize::scalarizeFuncCall(CallInst* call) {
+ if (Function *F = call->getCalledFunction()) {
+ if (F->getIntrinsicID() != 0) { //Intrinsic functions
+ NOT_IMPLEMENTED;
+ } else {
+ Value *Callee = call->getCalledValue();
+ const std::string fnName = Callee->getName();
+ auto it = instrinsicMap.map.find(fnName);
+ GBE_ASSERT(it != instrinsicMap.map.end());
+
+ // Get the function arguments
+ CallSite CS(call);
+ CallSite::arg_iterator CI = CS.arg_begin() + 2;
+
+ switch (it->second) {
+ default: break;
+ case GEN_OCL_READ_IMAGE_I_1D:
+ case GEN_OCL_READ_IMAGE_UI_1D:
+ case GEN_OCL_READ_IMAGE_F_1D:
+ case GEN_OCL_READ_IMAGE_I_2D:
+ case GEN_OCL_READ_IMAGE_UI_2D:
+ case GEN_OCL_READ_IMAGE_F_2D:
+ case GEN_OCL_READ_IMAGE_I_3D:
+ case GEN_OCL_READ_IMAGE_UI_3D:
+ case GEN_OCL_READ_IMAGE_F_3D:
+
+ case GEN_OCL_READ_IMAGE_I_1D_I:
+ case GEN_OCL_READ_IMAGE_UI_1D_I:
+ case GEN_OCL_READ_IMAGE_F_1D_I:
+ case GEN_OCL_READ_IMAGE_I_2D_I:
+ case GEN_OCL_READ_IMAGE_UI_2D_I:
+ case GEN_OCL_READ_IMAGE_F_2D_I:
+ case GEN_OCL_READ_IMAGE_I_3D_I:
+ case GEN_OCL_READ_IMAGE_UI_3D_I:
+ case GEN_OCL_READ_IMAGE_F_3D_I:
+ case GEN_OCL_GET_IMAGE_WIDTH:
+ case GEN_OCL_GET_IMAGE_HEIGHT:
+ {
+ setAppendPoint(call);
+ extractFromVector(call);
+ break;
+ }
+ case GEN_OCL_WRITE_IMAGE_I_3D:
+ case GEN_OCL_WRITE_IMAGE_UI_3D:
+ case GEN_OCL_WRITE_IMAGE_F_3D:
+ CI++;
+ case GEN_OCL_WRITE_IMAGE_I_2D:
+ case GEN_OCL_WRITE_IMAGE_UI_2D:
+ case GEN_OCL_WRITE_IMAGE_F_2D:
+ CI++;
+ case GEN_OCL_WRITE_IMAGE_I_1D:
+ case GEN_OCL_WRITE_IMAGE_UI_1D:
+ case GEN_OCL_WRITE_IMAGE_F_1D:
+ {
+ *CI = InsertToVector(call, *CI);
+ break;
+ }
+ }
+ }
+ }
+ return false;
+ }
+
+ bool Scalarize::scalarizeBitCast(BitCastInst* bt)
+ {
+ if(bt->getOperand(0)->getType()->isVectorTy())
+ bt->setOperand(0, InsertToVector(bt, bt->getOperand(0)));
+ if(bt->getType()->isVectorTy()) {
+ setAppendPoint(bt);
+ extractFromVector(bt);
+ }
+ return false;
+ }
+
+ bool Scalarize::scalarizeLoad(LoadInst* ld)
+ {
+ setAppendPoint(ld);
+ extractFromVector(ld);
+ return false;
+ }
+
+ bool Scalarize::scalarizeStore(StoreInst* st) {
+ st->setOperand(0, InsertToVector(st, st->getValueOperand()));
+ return false;
+ }
+
+ bool Scalarize::scalarizeExtract(ExtractElementInst* extr)
+ {
+ // %res = extractelement <n X ty> %foo, %i
+ // ==> nothing (just use %foo's %ith component instead of %res)
+
+ if (! isa<Constant>(extr->getOperand(1))) {
+ // TODO: Variably referenced components. Probably handle/emulate through
+ // a series of selects.
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Variably referenced vector components");
+ }
+ //if (isa<Argument>(extr->getOperand(0)))
+ // return false;
+ int component = GetConstantInt(extr->getOperand(1));
+ Value* v = getComponent(component, extr->getOperand(0));
+ if(extr == v)
+ return false;
+ extr->replaceAllUsesWith(v);
+
+ return true;
+ }
+
+ bool Scalarize::scalarizeInsert(InsertElementInst* ins)
+ {
+ // %res = insertValue <n x ty> %foo, %i
+ // ==> nothing (just make a new VectorValues with the new component)
+
+ if (! isa<Constant>(ins->getOperand(2))) {
+ // TODO: Variably referenced components. Probably handle/emulate through
+ // a series of selects.
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Variably referenced vector components");
+ }
+
+ int component = GetConstantInt(ins->getOperand(2));
+
+ VectorValues& vVals = vectorVals[ins];
+ for (int i = 0; i < GetComponentCount(ins); ++i) {
+ vVals.setComponent(i, i == component ? ins->getOperand(1)
+ : getComponent(i, ins->getOperand(0)));
+ }
+
+ return true;
+ }
+
+ void Scalarize::scalarizeArgs(Function& F) {
+ if (F.arg_empty())
+ return;
+ ReversePostOrderTraversal<Function*> rpot(&F);
+ BasicBlock::iterator instI = (*rpot.begin())->begin();
+ builder->SetInsertPoint(instI);
+
+ Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+
+ for (; I != E; ++I) {
+ Type *type = I->getType();
+
+ if(type->isVectorTy())
+ extractFromVector(I);
+ }
+ return;
+ }
+
+ bool Scalarize::runOnFunction(Function& F)
+ {
+ switch (F.getCallingConv()) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+ case CallingConv::PTX_Device:
+ return false;
+ case CallingConv::PTX_Kernel:
+#else
+ case CallingConv::C:
+#endif
+ break;
+ default: GBE_ASSERTM(false, "Unsupported calling convention");
+ }
+
+ // As we inline all function calls, so skip non-kernel functions
+ bool bKernel = isKernelFunction(F);
+ if(!bKernel) return false;
+
+ bool changed = false;
+ module = F.getParent();
+ intTy = IntegerType::get(module->getContext(), 32);
+ floatTy = Type::getFloatTy(module->getContext());
+ builder = new IRBuilder<>(module->getContext());
+
+ scalarizeArgs(F);
+ typedef ReversePostOrderTraversal<Function*> RPOTType;
+ RPOTType rpot(&F);
+ for (RPOTType::rpo_iterator bbI = rpot.begin(), bbE = rpot.end(); bbI != bbE; ++bbI) {
+ for (BasicBlock::iterator instI = (*bbI)->begin(), instE = (*bbI)->end(); instI != instE; ++instI) {
+ bool scalarized = scalarize(instI);
+ if (scalarized) {
+ changed = true;
+ // TODO: uncomment when done
+ deadList.push_back(instI);
+ }
+ }
+ }
+
+ // Fill in the incomplete phis
+ for (SmallVectorImpl<PHINode*>::iterator phiI = incompletePhis.begin(), phiE = incompletePhis.end();
+ phiI != phiE; ++phiI) {
+ assert(canGetComponentArgs(*phiI) && "Phi's operands never scalarized");
+ // Fill in each component of this phi
+ VectorValues& vVals = vectorVals[*phiI];
+ for (int c = 0; c < GetComponentCount(*phiI); ++c) {
+ PHINode* compPhi = dyn_cast<PHINode>(vVals.getComponent(c));
+ assert(compPhi && "Vector phi got scalarized to non-phis?");
+
+ // Loop over pairs of operands: [Value*, BasicBlock*]
+ for (unsigned int i = 0; i < (*phiI)->getNumOperands(); i++) {
+ BasicBlock* bb = (*phiI)->getIncomingBlock(i);
+ assert(bb && "Non-basic block incoming block?");
+ compPhi->addIncoming(getComponent(c, (*phiI)->getOperand(i)), bb);
+ }
+ }
+ }
+
+ dce();
+
+ incompletePhis.clear();
+ vectorVals.clear();
+
+ delete builder;
+ builder = 0;
+
+ return changed;
+ }
+
+ void Scalarize::dce()
+ {
+ //two passes delete for some phinode
+ for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
+ (*i)->dropAllReferences();
+ if((*i)->use_empty()) {
+ (*i)->eraseFromParent();
+ (*i) = NULL;
+ }
+ }
+ for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
+ if((*i) && (*i)->getParent())
+ (*i)->eraseFromParent();
+ }
+ deadList.clear();
+ }
+
+ void Scalarize::getAnalysisUsage(AnalysisUsage& AU) const
+ {
+ }
+
+ void Scalarize::print(raw_ostream&, const Module*) const
+ {
+ return;
+ }
+ FunctionPass* createScalarizePass()
+ {
+ return new Scalarize();
+ }
+ char Scalarize::ID = 0;
+
+} // end namespace
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
new file mode 100644
index 0000000..84ba383
--- /dev/null
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file llvm_to_gen.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/DataLayout.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/ADT/Triple.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/Support/IRReader.h"
+#else
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/SourceMgr.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Verifier.h"
+#else
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#endif
+
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/llvm_gen_backend.hpp"
+#include "llvm/llvm_to_gen.hpp"
+#include "sys/cvar.hpp"
+#include "sys/platform.hpp"
+
+#include <clang/CodeGen/CodeGenAction.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <memory>
+
+namespace gbe
+{
+ BVAR(OCL_OUTPUT_LLVM, false);
+ BVAR(OCL_OUTPUT_CFG, false);
+ BVAR(OCL_OUTPUT_CFG_ONLY, false);
+ BVAR(OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS, false);
+ using namespace llvm;
+
+ void runFuntionPass(Module &mod, TargetLibraryInfo *libraryInfo, const DataLayout &DL)
+ {
+ FunctionPassManager FPM(&mod);
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+ FPM.add(new DataLayoutPass(DL));
+#else
+ FPM.add(new DataLayout(DL));
+#endif
+
+ // XXX remove the verifier pass to workaround a non-fatal error.
+ // add this pass cause the Clang abort with the following error message:
+ // "Global is external, but doesn't have external or weak linkage"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+ //FPM.add(createVerifierPass(true));
+#else
+ //FPM.add(createVerifierPass());
+#endif
+ FPM.add(new TargetLibraryInfo(*libraryInfo));
+ FPM.add(createTypeBasedAliasAnalysisPass());
+ FPM.add(createBasicAliasAnalysisPass());
+ FPM.add(createCFGSimplificationPass());
+ FPM.add(createSROAPass());
+ FPM.add(createEarlyCSEPass());
+ FPM.add(createLowerExpectIntrinsicPass());
+
+ FPM.doInitialization();
+ for (Module::iterator I = mod.begin(),
+ E = mod.end(); I != E; ++I)
+ if (!I->isDeclaration())
+ FPM.run(*I);
+ FPM.doFinalization();
+ }
+
+ void runModulePass(Module &mod, TargetLibraryInfo *libraryInfo, const DataLayout &DL, int optLevel)
+ {
+ llvm::PassManager MPM;
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+ MPM.add(new DataLayoutPass(DL));
+#else
+ MPM.add(new DataLayout(DL));
+#endif
+ MPM.add(new TargetLibraryInfo(*libraryInfo));
+ MPM.add(createTypeBasedAliasAnalysisPass());
+ MPM.add(createBasicAliasAnalysisPass());
+ MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
+
+ MPM.add(createIPSCCPPass()); // IP SCCP
+ MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
+
+ MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
+ MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
+ MPM.add(createPruneEHPass()); // Remove dead EH info
+ MPM.add(createBarrierNodupPass(false)); // remove noduplicate fnAttr before inlining.
+ MPM.add(createFunctionInliningPass(200000));
+ MPM.add(createBarrierNodupPass(true)); // restore noduplicate fnAttr after inlining.
+ MPM.add(createFunctionAttrsPass()); // Set readonly/readnone attrs
+
+ //MPM.add(createScalarReplAggregatesPass(64, true, -1, -1, 64))
+ if(optLevel > 0)
+ MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+ MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
+ MPM.add(createJumpThreadingPass()); // Thread jumps.
+ MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createInstructionCombiningPass()); // Combine silly seq's
+
+ MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createReassociatePass()); // Reassociate expressions
+ MPM.add(createLoopRotatePass()); // Rotate Loop
+ MPM.add(createLICMPass()); // Hoist loop invariants
+ MPM.add(createLoopUnswitchPass(true));
+ MPM.add(createInstructionCombiningPass());
+ MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
+ MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
+ MPM.add(createLoopDeletionPass()); // Delete dead loops
+ MPM.add(createLoopUnrollPass()); // Unroll small loops
+ if(optLevel > 0)
+ MPM.add(createGVNPass()); // Remove redundancies
+ MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset
+ MPM.add(createSCCPPass()); // Constant prop with SCCP
+
+ // Run instcombine after redundancy elimination to exploit opportunities
+ // opened up by them.
+ MPM.add(createInstructionCombiningPass());
+ MPM.add(createJumpThreadingPass()); // Thread jumps
+ MPM.add(createCorrelatedValuePropagationPass());
+ MPM.add(createDeadStoreEliminationPass()); // Delete dead stores
+ MPM.add(createAggressiveDCEPass()); // Delete dead instructions
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createInstructionCombiningPass()); // Clean up after everything.
+ MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
+ if(optLevel > 0) {
+ MPM.add(createGlobalDCEPass()); // Remove dead fns and globals.
+ MPM.add(createConstantMergePass()); // Merge dup global constants
+ }
+
+ MPM.run(mod);
+ }
+
+ bool llvmToGen(ir::Unit &unit, const char *fileName,const void* module, int optLevel)
+ {
+ std::string errInfo;
+ std::unique_ptr<llvm::raw_fd_ostream> o = NULL;
+ if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS || OCL_OUTPUT_LLVM)
+ o = std::unique_ptr<llvm::raw_fd_ostream>(new llvm::raw_fd_ostream(fileno(stdout), false));
+
+ // Get the module from its file
+ llvm::SMDiagnostic Err;
+ std::auto_ptr<Module> M;
+ if(fileName){
+ // only when module is null, Get the global LLVM context
+ llvm::LLVMContext& c = llvm::getGlobalContext();
+ M.reset(ParseIRFile(fileName, Err, c));
+ if (M.get() == 0) return false;
+ }
+ Module &mod = (module!=NULL)?*(llvm::Module*)module:*M.get();
+ DataLayout DL(&mod);
+
+ Triple TargetTriple(mod.getTargetTriple());
+ TargetLibraryInfo *libraryInfo = new TargetLibraryInfo(TargetTriple);
+ libraryInfo->disableAllFunctions();
+
+ runFuntionPass(mod, libraryInfo, DL);
+ runModulePass(mod, libraryInfo, DL, optLevel);
+
+ llvm::PassManager passes;
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+ passes.add(new DataLayoutPass(DL));
+#else
+ passes.add(new DataLayout(DL));
+#endif
+ // Print the code before further optimizations
+ if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+ passes.add(createPrintModulePass(*o));
+#else
+ passes.add(createPrintModulePass(&*o));
+#endif
+ passes.add(createIntrinsicLoweringPass());
+ passes.add(createFunctionInliningPass(200000));
+ passes.add(createScalarReplAggregatesPass(64, true, -1, -1, 64));
+ passes.add(createLoadStoreOptimizationPass());
+ passes.add(createRemoveGEPPass(unit));
+ passes.add(createConstantPropagationPass());
+ passes.add(createLowerSwitchPass());
+ passes.add(createPromoteMemoryToRegisterPass());
+ if(optLevel > 0)
+ passes.add(createGVNPass()); // Remove redundancies
+ passes.add(createPrintfParserPass());
+ passes.add(createScalarizePass()); // Expand all vector ops
+ passes.add(createDeadInstEliminationPass()); // Remove simplified instructions
+ passes.add(createCFGSimplificationPass()); // Merge & remove BBs
+ passes.add(createScalarizePass()); // Expand all vector ops
+
+ if(OCL_OUTPUT_CFG)
+ passes.add(createCFGPrinterPass());
+ if(OCL_OUTPUT_CFG_ONLY)
+ passes.add(createCFGOnlyPrinterPass());
+ passes.add(createGenPass(unit));
+
+ // Print the code extra optimization passes
+ if (OCL_OUTPUT_LLVM)
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+ passes.add(createPrintModulePass(*o));
+#else
+ passes.add(createPrintModulePass(&*o));
+#endif
+ passes.run(mod);
+ return true;
+ }
+} /* namespace gbe */
diff --git a/backend/src/llvm/llvm_to_gen.hpp b/backend/src/llvm/llvm_to_gen.hpp
new file mode 100644
index 0000000..41e3477
--- /dev/null
+++ b/backend/src/llvm/llvm_to_gen.hpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file llvm_to_gen.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_LLVM_TO_GEN_HPP__
+#define __GBE_IR_LLVM_TO_GEN_HPP__
+
+namespace gbe {
+ namespace ir {
+ // The code is output into an IR unit
+ class Unit;
+ } /* namespace ir */
+
+ /*! Convert the LLVM IR code to a GEN IR code,
+ optLevel 0 equal to clang -O1 and 1 equal to clang -O2*/
+ bool llvmToGen(ir::Unit &unit, const char *fileName, const void* module, int optLevel);
+
+} /* namespace gbe */
+
+#endif /* __GBE_IR_LLVM_TO_GEN_HPP__ */
+
diff --git a/backend/src/ocl_as.h b/backend/src/ocl_as.h
new file mode 100644
index 0000000..692e892
--- /dev/null
+++ b/backend/src/ocl_as.h
@@ -0,0 +1,3086 @@
+// This file is autogenerated by gen_as.sh.
+// Don't modify it manually.
+union _type_cast_1_b {
+ char _char;
+ uchar _uchar;
+};
+
+INLINE OVERLOADABLE uchar as_uchar(char v) {
+ union _type_cast_1_b u;
+ u._char = v;
+ return u._uchar;
+}
+
+INLINE OVERLOADABLE char as_char(uchar v) {
+ union _type_cast_1_b u;
+ u._uchar = v;
+ return u._char;
+}
+
+union _type_cast_2_b {
+ short _short;
+ ushort _ushort;
+ char2 _char2;
+ uchar2 _uchar2;
+};
+
+INLINE OVERLOADABLE ushort as_ushort(short v) {
+ union _type_cast_2_b u;
+ u._short = v;
+ return u._ushort;
+}
+
+INLINE OVERLOADABLE char2 as_char2(short v) {
+ union _type_cast_2_b u;
+ u._short = v;
+ return u._char2;
+}
+
+INLINE OVERLOADABLE uchar2 as_uchar2(short v) {
+ union _type_cast_2_b u;
+ u._short = v;
+ return u._uchar2;
+}
+
+INLINE OVERLOADABLE short as_short(ushort v) {
+ union _type_cast_2_b u;
+ u._ushort = v;
+ return u._short;
+}
+
+INLINE OVERLOADABLE char2 as_char2(ushort v) {
+ union _type_cast_2_b u;
+ u._ushort = v;
+ return u._char2;
+}
+
+INLINE OVERLOADABLE uchar2 as_uchar2(ushort v) {
+ union _type_cast_2_b u;
+ u._ushort = v;
+ return u._uchar2;
+}
+
+INLINE OVERLOADABLE short as_short(char2 v) {
+ union _type_cast_2_b u;
+ u._char2 = v;
+ return u._short;
+}
+
+INLINE OVERLOADABLE ushort as_ushort(char2 v) {
+ union _type_cast_2_b u;
+ u._char2 = v;
+ return u._ushort;
+}
+
+INLINE OVERLOADABLE uchar2 as_uchar2(char2 v) {
+ union _type_cast_2_b u;
+ u._char2 = v;
+ return u._uchar2;
+}
+
+INLINE OVERLOADABLE short as_short(uchar2 v) {
+ union _type_cast_2_b u;
+ u._uchar2 = v;
+ return u._short;
+}
+
+INLINE OVERLOADABLE ushort as_ushort(uchar2 v) {
+ union _type_cast_2_b u;
+ u._uchar2 = v;
+ return u._ushort;
+}
+
+INLINE OVERLOADABLE char2 as_char2(uchar2 v) {
+ union _type_cast_2_b u;
+ u._uchar2 = v;
+ return u._char2;
+}
+
+union _type_cast_4_b {
+ int _int;
+ uint _uint;
+ short2 _short2;
+ ushort2 _ushort2;
+ char3 _char3;
+ char4 _char4;
+ uchar3 _uchar3;
+ uchar4 _uchar4;
+ float _float;
+};
+
+INLINE OVERLOADABLE uint as_uint(int v) {
+ union _type_cast_4_b u;
+ u._int = v;
+ return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(int v) {
+ union _type_cast_4_b u;
+ u._int = v;
+ return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(int v) {
+ union _type_cast_4_b u;
+ u._int = v;
+ return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(int v) {
+ union _type_cast_4_b u;
+ u._int = v;
+ return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(int v) {
+ union _type_cast_4_b u;
+ u._int = v;
+ return u._char4;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(int v) {
+ union _type_cast_4_b u;
+ u._int = v;
+ return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(int v) {
+ union _type_cast_4_b u;
+ u._int = v;
+ return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(int v) {
+ union _type_cast_4_b u;
+ u._int = v;
+ return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(uint v) {
+ union _type_cast_4_b u;
+ u._uint = v;
+ return u._int;
+}
+
+INLINE OVERLOADABLE short2 as_short2(uint v) {
+ union _type_cast_4_b u;
+ u._uint = v;
+ return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(uint v) {
+ union _type_cast_4_b u;
+ u._uint = v;
+ return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(uint v) {
+ union _type_cast_4_b u;
+ u._uint = v;
+ return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(uint v) {
+ union _type_cast_4_b u;
+ u._uint = v;
+ return u._char4;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(uint v) {
+ union _type_cast_4_b u;
+ u._uint = v;
+ return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(uint v) {
+ union _type_cast_4_b u;
+ u._uint = v;
+ return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(uint v) {
+ union _type_cast_4_b u;
+ u._uint = v;
+ return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(short2 v) {
+ union _type_cast_4_b u;
+ u._short2 = v;
+ return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(short2 v) {
+ union _type_cast_4_b u;
+ u._short2 = v;
+ return u._uint;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(short2 v) {
+ union _type_cast_4_b u;
+ u._short2 = v;
+ return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(short2 v) {
+ union _type_cast_4_b u;
+ u._short2 = v;
+ return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(short2 v) {
+ union _type_cast_4_b u;
+ u._short2 = v;
+ return u._char4;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(short2 v) {
+ union _type_cast_4_b u;
+ u._short2 = v;
+ return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(short2 v) {
+ union _type_cast_4_b u;
+ u._short2 = v;
+ return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(short2 v) {
+ union _type_cast_4_b u;
+ u._short2 = v;
+ return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(ushort2 v) {
+ union _type_cast_4_b u;
+ u._ushort2 = v;
+ return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(ushort2 v) {
+ union _type_cast_4_b u;
+ u._ushort2 = v;
+ return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(ushort2 v) {
+ union _type_cast_4_b u;
+ u._ushort2 = v;
+ return u._short2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(ushort2 v) {
+ union _type_cast_4_b u;
+ u._ushort2 = v;
+ return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(ushort2 v) {
+ union _type_cast_4_b u;
+ u._ushort2 = v;
+ return u._char4;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(ushort2 v) {
+ union _type_cast_4_b u;
+ u._ushort2 = v;
+ return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(ushort2 v) {
+ union _type_cast_4_b u;
+ u._ushort2 = v;
+ return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(ushort2 v) {
+ union _type_cast_4_b u;
+ u._ushort2 = v;
+ return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(char3 v) {
+ union _type_cast_4_b u;
+ u._char3 = v;
+ return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(char3 v) {
+ union _type_cast_4_b u;
+ u._char3 = v;
+ return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(char3 v) {
+ union _type_cast_4_b u;
+ u._char3 = v;
+ return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(char3 v) {
+ union _type_cast_4_b u;
+ u._char3 = v;
+ return u._ushort2;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(char3 v) {
+ union _type_cast_4_b u;
+ u._char3 = v;
+ return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(char3 v) {
+ union _type_cast_4_b u;
+ u._char3 = v;
+ return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(char3 v) {
+ union _type_cast_4_b u;
+ u._char3 = v;
+ return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(char4 v) {
+ union _type_cast_4_b u;
+ u._char4 = v;
+ return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(char4 v) {
+ union _type_cast_4_b u;
+ u._char4 = v;
+ return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(char4 v) {
+ union _type_cast_4_b u;
+ u._char4 = v;
+ return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(char4 v) {
+ union _type_cast_4_b u;
+ u._char4 = v;
+ return u._ushort2;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(char4 v) {
+ union _type_cast_4_b u;
+ u._char4 = v;
+ return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(char4 v) {
+ union _type_cast_4_b u;
+ u._char4 = v;
+ return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(char4 v) {
+ union _type_cast_4_b u;
+ u._char4 = v;
+ return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(uchar3 v) {
+ union _type_cast_4_b u;
+ u._uchar3 = v;
+ return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(uchar3 v) {
+ union _type_cast_4_b u;
+ u._uchar3 = v;
+ return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(uchar3 v) {
+ union _type_cast_4_b u;
+ u._uchar3 = v;
+ return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(uchar3 v) {
+ union _type_cast_4_b u;
+ u._uchar3 = v;
+ return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(uchar3 v) {
+ union _type_cast_4_b u;
+ u._uchar3 = v;
+ return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(uchar3 v) {
+ union _type_cast_4_b u;
+ u._uchar3 = v;
+ return u._char4;
+}
+
+INLINE OVERLOADABLE float as_float(uchar3 v) {
+ union _type_cast_4_b u;
+ u._uchar3 = v;
+ return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(uchar4 v) {
+ union _type_cast_4_b u;
+ u._uchar4 = v;
+ return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(uchar4 v) {
+ union _type_cast_4_b u;
+ u._uchar4 = v;
+ return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(uchar4 v) {
+ union _type_cast_4_b u;
+ u._uchar4 = v;
+ return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(uchar4 v) {
+ union _type_cast_4_b u;
+ u._uchar4 = v;
+ return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(uchar4 v) {
+ union _type_cast_4_b u;
+ u._uchar4 = v;
+ return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(uchar4 v) {
+ union _type_cast_4_b u;
+ u._uchar4 = v;
+ return u._char4;
+}
+
+INLINE OVERLOADABLE float as_float(uchar4 v) {
+ union _type_cast_4_b u;
+ u._uchar4 = v;
+ return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(float v) {
+ union _type_cast_4_b u;
+ u._float = v;
+ return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(float v) {
+ union _type_cast_4_b u;
+ u._float = v;
+ return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(float v) {
+ union _type_cast_4_b u;
+ u._float = v;
+ return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(float v) {
+ union _type_cast_4_b u;
+ u._float = v;
+ return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(float v) {
+ union _type_cast_4_b u;
+ u._float = v;
+ return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(float v) {
+ union _type_cast_4_b u;
+ u._float = v;
+ return u._char4;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(float v) {
+ union _type_cast_4_b u;
+ u._float = v;
+ return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(float v) {
+ union _type_cast_4_b u;
+ u._float = v;
+ return u._uchar4;
+}
+
+union _type_cast_8_b {
+ long _long;
+ ulong _ulong;
+ int2 _int2;
+ uint2 _uint2;
+ short3 _short3;
+ short4 _short4;
+ ushort3 _ushort3;
+ ushort4 _ushort4;
+ char8 _char8;
+ uchar8 _uchar8;
+ double _double;
+ float2 _float2;
+};
+
+INLINE OVERLOADABLE ulong as_ulong(long v) {
+ union _type_cast_8_b u;
+ u._long = v;
+ return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(long v) {
+ union _type_cast_8_b u;
+ u._long = v;
+ return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(long v) {
+ union _type_cast_8_b u;
+ u._long = v;
+ return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(long v) {
+ union _type_cast_8_b u;
+ u._long = v;
+ return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(long v) {
+ union _type_cast_8_b u;
+ u._long = v;
+ return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(long v) {
+ union _type_cast_8_b u;
+ u._long = v;
+ return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(long v) {
+ union _type_cast_8_b u;
+ u._long = v;
+ return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(long v) {
+ union _type_cast_8_b u;
+ u._long = v;
+ return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(long v) {
+ union _type_cast_8_b u;
+ u._long = v;
+ return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(long v) {
+ union _type_cast_8_b u;
+ u._long = v;
+ return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(long v) {
+ union _type_cast_8_b u;
+ u._long = v;
+ return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(ulong v) {
+ union _type_cast_8_b u;
+ u._ulong = v;
+ return u._long;
+}
+
+INLINE OVERLOADABLE int2 as_int2(ulong v) {
+ union _type_cast_8_b u;
+ u._ulong = v;
+ return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(ulong v) {
+ union _type_cast_8_b u;
+ u._ulong = v;
+ return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(ulong v) {
+ union _type_cast_8_b u;
+ u._ulong = v;
+ return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(ulong v) {
+ union _type_cast_8_b u;
+ u._ulong = v;
+ return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(ulong v) {
+ union _type_cast_8_b u;
+ u._ulong = v;
+ return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(ulong v) {
+ union _type_cast_8_b u;
+ u._ulong = v;
+ return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(ulong v) {
+ union _type_cast_8_b u;
+ u._ulong = v;
+ return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(ulong v) {
+ union _type_cast_8_b u;
+ u._ulong = v;
+ return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(ulong v) {
+ union _type_cast_8_b u;
+ u._ulong = v;
+ return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(ulong v) {
+ union _type_cast_8_b u;
+ u._ulong = v;
+ return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(int2 v) {
+ union _type_cast_8_b u;
+ u._int2 = v;
+ return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(int2 v) {
+ union _type_cast_8_b u;
+ u._int2 = v;
+ return u._ulong;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(int2 v) {
+ union _type_cast_8_b u;
+ u._int2 = v;
+ return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(int2 v) {
+ union _type_cast_8_b u;
+ u._int2 = v;
+ return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(int2 v) {
+ union _type_cast_8_b u;
+ u._int2 = v;
+ return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(int2 v) {
+ union _type_cast_8_b u;
+ u._int2 = v;
+ return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(int2 v) {
+ union _type_cast_8_b u;
+ u._int2 = v;
+ return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(int2 v) {
+ union _type_cast_8_b u;
+ u._int2 = v;
+ return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(int2 v) {
+ union _type_cast_8_b u;
+ u._int2 = v;
+ return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(int2 v) {
+ union _type_cast_8_b u;
+ u._int2 = v;
+ return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(int2 v) {
+ union _type_cast_8_b u;
+ u._int2 = v;
+ return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(uint2 v) {
+ union _type_cast_8_b u;
+ u._uint2 = v;
+ return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(uint2 v) {
+ union _type_cast_8_b u;
+ u._uint2 = v;
+ return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(uint2 v) {
+ union _type_cast_8_b u;
+ u._uint2 = v;
+ return u._int2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(uint2 v) {
+ union _type_cast_8_b u;
+ u._uint2 = v;
+ return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(uint2 v) {
+ union _type_cast_8_b u;
+ u._uint2 = v;
+ return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(uint2 v) {
+ union _type_cast_8_b u;
+ u._uint2 = v;
+ return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(uint2 v) {
+ union _type_cast_8_b u;
+ u._uint2 = v;
+ return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(uint2 v) {
+ union _type_cast_8_b u;
+ u._uint2 = v;
+ return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(uint2 v) {
+ union _type_cast_8_b u;
+ u._uint2 = v;
+ return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(uint2 v) {
+ union _type_cast_8_b u;
+ u._uint2 = v;
+ return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(uint2 v) {
+ union _type_cast_8_b u;
+ u._uint2 = v;
+ return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(short3 v) {
+ union _type_cast_8_b u;
+ u._short3 = v;
+ return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(short3 v) {
+ union _type_cast_8_b u;
+ u._short3 = v;
+ return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(short3 v) {
+ union _type_cast_8_b u;
+ u._short3 = v;
+ return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(short3 v) {
+ union _type_cast_8_b u;
+ u._short3 = v;
+ return u._uint2;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(short3 v) {
+ union _type_cast_8_b u;
+ u._short3 = v;
+ return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(short3 v) {
+ union _type_cast_8_b u;
+ u._short3 = v;
+ return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(short3 v) {
+ union _type_cast_8_b u;
+ u._short3 = v;
+ return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(short3 v) {
+ union _type_cast_8_b u;
+ u._short3 = v;
+ return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(short3 v) {
+ union _type_cast_8_b u;
+ u._short3 = v;
+ return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(short3 v) {
+ union _type_cast_8_b u;
+ u._short3 = v;
+ return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(short4 v) {
+ union _type_cast_8_b u;
+ u._short4 = v;
+ return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(short4 v) {
+ union _type_cast_8_b u;
+ u._short4 = v;
+ return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(short4 v) {
+ union _type_cast_8_b u;
+ u._short4 = v;
+ return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(short4 v) {
+ union _type_cast_8_b u;
+ u._short4 = v;
+ return u._uint2;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(short4 v) {
+ union _type_cast_8_b u;
+ u._short4 = v;
+ return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(short4 v) {
+ union _type_cast_8_b u;
+ u._short4 = v;
+ return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(short4 v) {
+ union _type_cast_8_b u;
+ u._short4 = v;
+ return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(short4 v) {
+ union _type_cast_8_b u;
+ u._short4 = v;
+ return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(short4 v) {
+ union _type_cast_8_b u;
+ u._short4 = v;
+ return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(short4 v) {
+ union _type_cast_8_b u;
+ u._short4 = v;
+ return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(ushort3 v) {
+ union _type_cast_8_b u;
+ u._ushort3 = v;
+ return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(ushort3 v) {
+ union _type_cast_8_b u;
+ u._ushort3 = v;
+ return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(ushort3 v) {
+ union _type_cast_8_b u;
+ u._ushort3 = v;
+ return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(ushort3 v) {
+ union _type_cast_8_b u;
+ u._ushort3 = v;
+ return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(ushort3 v) {
+ union _type_cast_8_b u;
+ u._ushort3 = v;
+ return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(ushort3 v) {
+ union _type_cast_8_b u;
+ u._ushort3 = v;
+ return u._short4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(ushort3 v) {
+ union _type_cast_8_b u;
+ u._ushort3 = v;
+ return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(ushort3 v) {
+ union _type_cast_8_b u;
+ u._ushort3 = v;
+ return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(ushort3 v) {
+ union _type_cast_8_b u;
+ u._ushort3 = v;
+ return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(ushort3 v) {
+ union _type_cast_8_b u;
+ u._ushort3 = v;
+ return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(ushort4 v) {
+ union _type_cast_8_b u;
+ u._ushort4 = v;
+ return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(ushort4 v) {
+ union _type_cast_8_b u;
+ u._ushort4 = v;
+ return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(ushort4 v) {
+ union _type_cast_8_b u;
+ u._ushort4 = v;
+ return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(ushort4 v) {
+ union _type_cast_8_b u;
+ u._ushort4 = v;
+ return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(ushort4 v) {
+ union _type_cast_8_b u;
+ u._ushort4 = v;
+ return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(ushort4 v) {
+ union _type_cast_8_b u;
+ u._ushort4 = v;
+ return u._short4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(ushort4 v) {
+ union _type_cast_8_b u;
+ u._ushort4 = v;
+ return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(ushort4 v) {
+ union _type_cast_8_b u;
+ u._ushort4 = v;
+ return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(ushort4 v) {
+ union _type_cast_8_b u;
+ u._ushort4 = v;
+ return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(ushort4 v) {
+ union _type_cast_8_b u;
+ u._ushort4 = v;
+ return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(char8 v) {
+ union _type_cast_8_b u;
+ u._char8 = v;
+ return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(char8 v) {
+ union _type_cast_8_b u;
+ u._char8 = v;
+ return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(char8 v) {
+ union _type_cast_8_b u;
+ u._char8 = v;
+ return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(char8 v) {
+ union _type_cast_8_b u;
+ u._char8 = v;
+ return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(char8 v) {
+ union _type_cast_8_b u;
+ u._char8 = v;
+ return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(char8 v) {
+ union _type_cast_8_b u;
+ u._char8 = v;
+ return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(char8 v) {
+ union _type_cast_8_b u;
+ u._char8 = v;
+ return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(char8 v) {
+ union _type_cast_8_b u;
+ u._char8 = v;
+ return u._ushort4;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(char8 v) {
+ union _type_cast_8_b u;
+ u._char8 = v;
+ return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(char8 v) {
+ union _type_cast_8_b u;
+ u._char8 = v;
+ return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(char8 v) {
+ union _type_cast_8_b u;
+ u._char8 = v;
+ return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(uchar8 v) {
+ union _type_cast_8_b u;
+ u._uchar8 = v;
+ return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(uchar8 v) {
+ union _type_cast_8_b u;
+ u._uchar8 = v;
+ return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(uchar8 v) {
+ union _type_cast_8_b u;
+ u._uchar8 = v;
+ return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(uchar8 v) {
+ union _type_cast_8_b u;
+ u._uchar8 = v;
+ return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(uchar8 v) {
+ union _type_cast_8_b u;
+ u._uchar8 = v;
+ return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(uchar8 v) {
+ union _type_cast_8_b u;
+ u._uchar8 = v;
+ return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(uchar8 v) {
+ union _type_cast_8_b u;
+ u._uchar8 = v;
+ return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(uchar8 v) {
+ union _type_cast_8_b u;
+ u._uchar8 = v;
+ return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(uchar8 v) {
+ union _type_cast_8_b u;
+ u._uchar8 = v;
+ return u._char8;
+}
+
+INLINE OVERLOADABLE double as_double(uchar8 v) {
+ union _type_cast_8_b u;
+ u._uchar8 = v;
+ return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(uchar8 v) {
+ union _type_cast_8_b u;
+ u._uchar8 = v;
+ return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(double v) {
+ union _type_cast_8_b u;
+ u._double = v;
+ return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(double v) {
+ union _type_cast_8_b u;
+ u._double = v;
+ return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(double v) {
+ union _type_cast_8_b u;
+ u._double = v;
+ return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(double v) {
+ union _type_cast_8_b u;
+ u._double = v;
+ return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(double v) {
+ union _type_cast_8_b u;
+ u._double = v;
+ return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(double v) {
+ union _type_cast_8_b u;
+ u._double = v;
+ return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(double v) {
+ union _type_cast_8_b u;
+ u._double = v;
+ return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(double v) {
+ union _type_cast_8_b u;
+ u._double = v;
+ return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(double v) {
+ union _type_cast_8_b u;
+ u._double = v;
+ return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(double v) {
+ union _type_cast_8_b u;
+ u._double = v;
+ return u._uchar8;
+}
+
+INLINE OVERLOADABLE float2 as_float2(double v) {
+ union _type_cast_8_b u;
+ u._double = v;
+ return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(float2 v) {
+ union _type_cast_8_b u;
+ u._float2 = v;
+ return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(float2 v) {
+ union _type_cast_8_b u;
+ u._float2 = v;
+ return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(float2 v) {
+ union _type_cast_8_b u;
+ u._float2 = v;
+ return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(float2 v) {
+ union _type_cast_8_b u;
+ u._float2 = v;
+ return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(float2 v) {
+ union _type_cast_8_b u;
+ u._float2 = v;
+ return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(float2 v) {
+ union _type_cast_8_b u;
+ u._float2 = v;
+ return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(float2 v) {
+ union _type_cast_8_b u;
+ u._float2 = v;
+ return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(float2 v) {
+ union _type_cast_8_b u;
+ u._float2 = v;
+ return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(float2 v) {
+ union _type_cast_8_b u;
+ u._float2 = v;
+ return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(float2 v) {
+ union _type_cast_8_b u;
+ u._float2 = v;
+ return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(float2 v) {
+ union _type_cast_8_b u;
+ u._float2 = v;
+ return u._double;
+}
+
+union _type_cast_16_b {
+ long2 _long2;
+ ulong2 _ulong2;
+ int3 _int3;
+ int4 _int4;
+ uint3 _uint3;
+ uint4 _uint4;
+ short8 _short8;
+ ushort8 _ushort8;
+ char16 _char16;
+ uchar16 _uchar16;
+ double2 _double2;
+ float3 _float3;
+ float4 _float4;
+};
+
+INLINE OVERLOADABLE ulong2 as_ulong2(long2 v) {
+ union _type_cast_16_b u;
+ u._long2 = v;
+ return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(long2 v) {
+ union _type_cast_16_b u;
+ u._long2 = v;
+ return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(long2 v) {
+ union _type_cast_16_b u;
+ u._long2 = v;
+ return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(long2 v) {
+ union _type_cast_16_b u;
+ u._long2 = v;
+ return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(long2 v) {
+ union _type_cast_16_b u;
+ u._long2 = v;
+ return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(long2 v) {
+ union _type_cast_16_b u;
+ u._long2 = v;
+ return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(long2 v) {
+ union _type_cast_16_b u;
+ u._long2 = v;
+ return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(long2 v) {
+ union _type_cast_16_b u;
+ u._long2 = v;
+ return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(long2 v) {
+ union _type_cast_16_b u;
+ u._long2 = v;
+ return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(long2 v) {
+ union _type_cast_16_b u;
+ u._long2 = v;
+ return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(long2 v) {
+ union _type_cast_16_b u;
+ u._long2 = v;
+ return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(long2 v) {
+ union _type_cast_16_b u;
+ u._long2 = v;
+ return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(ulong2 v) {
+ union _type_cast_16_b u;
+ u._ulong2 = v;
+ return u._long2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(ulong2 v) {
+ union _type_cast_16_b u;
+ u._ulong2 = v;
+ return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(ulong2 v) {
+ union _type_cast_16_b u;
+ u._ulong2 = v;
+ return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(ulong2 v) {
+ union _type_cast_16_b u;
+ u._ulong2 = v;
+ return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(ulong2 v) {
+ union _type_cast_16_b u;
+ u._ulong2 = v;
+ return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(ulong2 v) {
+ union _type_cast_16_b u;
+ u._ulong2 = v;
+ return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(ulong2 v) {
+ union _type_cast_16_b u;
+ u._ulong2 = v;
+ return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(ulong2 v) {
+ union _type_cast_16_b u;
+ u._ulong2 = v;
+ return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(ulong2 v) {
+ union _type_cast_16_b u;
+ u._ulong2 = v;
+ return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(ulong2 v) {
+ union _type_cast_16_b u;
+ u._ulong2 = v;
+ return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(ulong2 v) {
+ union _type_cast_16_b u;
+ u._ulong2 = v;
+ return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(ulong2 v) {
+ union _type_cast_16_b u;
+ u._ulong2 = v;
+ return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(int3 v) {
+ union _type_cast_16_b u;
+ u._int3 = v;
+ return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(int3 v) {
+ union _type_cast_16_b u;
+ u._int3 = v;
+ return u._ulong2;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(int3 v) {
+ union _type_cast_16_b u;
+ u._int3 = v;
+ return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(int3 v) {
+ union _type_cast_16_b u;
+ u._int3 = v;
+ return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(int3 v) {
+ union _type_cast_16_b u;
+ u._int3 = v;
+ return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(int3 v) {
+ union _type_cast_16_b u;
+ u._int3 = v;
+ return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(int3 v) {
+ union _type_cast_16_b u;
+ u._int3 = v;
+ return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(int3 v) {
+ union _type_cast_16_b u;
+ u._int3 = v;
+ return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(int3 v) {
+ union _type_cast_16_b u;
+ u._int3 = v;
+ return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(int3 v) {
+ union _type_cast_16_b u;
+ u._int3 = v;
+ return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(int3 v) {
+ union _type_cast_16_b u;
+ u._int3 = v;
+ return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(int4 v) {
+ union _type_cast_16_b u;
+ u._int4 = v;
+ return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(int4 v) {
+ union _type_cast_16_b u;
+ u._int4 = v;
+ return u._ulong2;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(int4 v) {
+ union _type_cast_16_b u;
+ u._int4 = v;
+ return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(int4 v) {
+ union _type_cast_16_b u;
+ u._int4 = v;
+ return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(int4 v) {
+ union _type_cast_16_b u;
+ u._int4 = v;
+ return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(int4 v) {
+ union _type_cast_16_b u;
+ u._int4 = v;
+ return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(int4 v) {
+ union _type_cast_16_b u;
+ u._int4 = v;
+ return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(int4 v) {
+ union _type_cast_16_b u;
+ u._int4 = v;
+ return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(int4 v) {
+ union _type_cast_16_b u;
+ u._int4 = v;
+ return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(int4 v) {
+ union _type_cast_16_b u;
+ u._int4 = v;
+ return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(int4 v) {
+ union _type_cast_16_b u;
+ u._int4 = v;
+ return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(uint3 v) {
+ union _type_cast_16_b u;
+ u._uint3 = v;
+ return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(uint3 v) {
+ union _type_cast_16_b u;
+ u._uint3 = v;
+ return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(uint3 v) {
+ union _type_cast_16_b u;
+ u._uint3 = v;
+ return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(uint3 v) {
+ union _type_cast_16_b u;
+ u._uint3 = v;
+ return u._int4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(uint3 v) {
+ union _type_cast_16_b u;
+ u._uint3 = v;
+ return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(uint3 v) {
+ union _type_cast_16_b u;
+ u._uint3 = v;
+ return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(uint3 v) {
+ union _type_cast_16_b u;
+ u._uint3 = v;
+ return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(uint3 v) {
+ union _type_cast_16_b u;
+ u._uint3 = v;
+ return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(uint3 v) {
+ union _type_cast_16_b u;
+ u._uint3 = v;
+ return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(uint3 v) {
+ union _type_cast_16_b u;
+ u._uint3 = v;
+ return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(uint3 v) {
+ union _type_cast_16_b u;
+ u._uint3 = v;
+ return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(uint4 v) {
+ union _type_cast_16_b u;
+ u._uint4 = v;
+ return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(uint4 v) {
+ union _type_cast_16_b u;
+ u._uint4 = v;
+ return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(uint4 v) {
+ union _type_cast_16_b u;
+ u._uint4 = v;
+ return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(uint4 v) {
+ union _type_cast_16_b u;
+ u._uint4 = v;
+ return u._int4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(uint4 v) {
+ union _type_cast_16_b u;
+ u._uint4 = v;
+ return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(uint4 v) {
+ union _type_cast_16_b u;
+ u._uint4 = v;
+ return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(uint4 v) {
+ union _type_cast_16_b u;
+ u._uint4 = v;
+ return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(uint4 v) {
+ union _type_cast_16_b u;
+ u._uint4 = v;
+ return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(uint4 v) {
+ union _type_cast_16_b u;
+ u._uint4 = v;
+ return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(uint4 v) {
+ union _type_cast_16_b u;
+ u._uint4 = v;
+ return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(uint4 v) {
+ union _type_cast_16_b u;
+ u._uint4 = v;
+ return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(short8 v) {
+ union _type_cast_16_b u;
+ u._short8 = v;
+ return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(short8 v) {
+ union _type_cast_16_b u;
+ u._short8 = v;
+ return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(short8 v) {
+ union _type_cast_16_b u;
+ u._short8 = v;
+ return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(short8 v) {
+ union _type_cast_16_b u;
+ u._short8 = v;
+ return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(short8 v) {
+ union _type_cast_16_b u;
+ u._short8 = v;
+ return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(short8 v) {
+ union _type_cast_16_b u;
+ u._short8 = v;
+ return u._uint4;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(short8 v) {
+ union _type_cast_16_b u;
+ u._short8 = v;
+ return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(short8 v) {
+ union _type_cast_16_b u;
+ u._short8 = v;
+ return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(short8 v) {
+ union _type_cast_16_b u;
+ u._short8 = v;
+ return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(short8 v) {
+ union _type_cast_16_b u;
+ u._short8 = v;
+ return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(short8 v) {
+ union _type_cast_16_b u;
+ u._short8 = v;
+ return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(short8 v) {
+ union _type_cast_16_b u;
+ u._short8 = v;
+ return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(ushort8 v) {
+ union _type_cast_16_b u;
+ u._ushort8 = v;
+ return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(ushort8 v) {
+ union _type_cast_16_b u;
+ u._ushort8 = v;
+ return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(ushort8 v) {
+ union _type_cast_16_b u;
+ u._ushort8 = v;
+ return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(ushort8 v) {
+ union _type_cast_16_b u;
+ u._ushort8 = v;
+ return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(ushort8 v) {
+ union _type_cast_16_b u;
+ u._ushort8 = v;
+ return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(ushort8 v) {
+ union _type_cast_16_b u;
+ u._ushort8 = v;
+ return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(ushort8 v) {
+ union _type_cast_16_b u;
+ u._ushort8 = v;
+ return u._short8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(ushort8 v) {
+ union _type_cast_16_b u;
+ u._ushort8 = v;
+ return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(ushort8 v) {
+ union _type_cast_16_b u;
+ u._ushort8 = v;
+ return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(ushort8 v) {
+ union _type_cast_16_b u;
+ u._ushort8 = v;
+ return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(ushort8 v) {
+ union _type_cast_16_b u;
+ u._ushort8 = v;
+ return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(ushort8 v) {
+ union _type_cast_16_b u;
+ u._ushort8 = v;
+ return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(char16 v) {
+ union _type_cast_16_b u;
+ u._char16 = v;
+ return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(char16 v) {
+ union _type_cast_16_b u;
+ u._char16 = v;
+ return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(char16 v) {
+ union _type_cast_16_b u;
+ u._char16 = v;
+ return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(char16 v) {
+ union _type_cast_16_b u;
+ u._char16 = v;
+ return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(char16 v) {
+ union _type_cast_16_b u;
+ u._char16 = v;
+ return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(char16 v) {
+ union _type_cast_16_b u;
+ u._char16 = v;
+ return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(char16 v) {
+ union _type_cast_16_b u;
+ u._char16 = v;
+ return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(char16 v) {
+ union _type_cast_16_b u;
+ u._char16 = v;
+ return u._ushort8;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(char16 v) {
+ union _type_cast_16_b u;
+ u._char16 = v;
+ return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(char16 v) {
+ union _type_cast_16_b u;
+ u._char16 = v;
+ return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(char16 v) {
+ union _type_cast_16_b u;
+ u._char16 = v;
+ return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(char16 v) {
+ union _type_cast_16_b u;
+ u._char16 = v;
+ return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(uchar16 v) {
+ union _type_cast_16_b u;
+ u._uchar16 = v;
+ return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(uchar16 v) {
+ union _type_cast_16_b u;
+ u._uchar16 = v;
+ return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(uchar16 v) {
+ union _type_cast_16_b u;
+ u._uchar16 = v;
+ return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(uchar16 v) {
+ union _type_cast_16_b u;
+ u._uchar16 = v;
+ return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(uchar16 v) {
+ union _type_cast_16_b u;
+ u._uchar16 = v;
+ return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(uchar16 v) {
+ union _type_cast_16_b u;
+ u._uchar16 = v;
+ return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(uchar16 v) {
+ union _type_cast_16_b u;
+ u._uchar16 = v;
+ return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(uchar16 v) {
+ union _type_cast_16_b u;
+ u._uchar16 = v;
+ return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(uchar16 v) {
+ union _type_cast_16_b u;
+ u._uchar16 = v;
+ return u._char16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(uchar16 v) {
+ union _type_cast_16_b u;
+ u._uchar16 = v;
+ return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(uchar16 v) {
+ union _type_cast_16_b u;
+ u._uchar16 = v;
+ return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(uchar16 v) {
+ union _type_cast_16_b u;
+ u._uchar16 = v;
+ return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(double2 v) {
+ union _type_cast_16_b u;
+ u._double2 = v;
+ return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(double2 v) {
+ union _type_cast_16_b u;
+ u._double2 = v;
+ return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(double2 v) {
+ union _type_cast_16_b u;
+ u._double2 = v;
+ return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(double2 v) {
+ union _type_cast_16_b u;
+ u._double2 = v;
+ return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(double2 v) {
+ union _type_cast_16_b u;
+ u._double2 = v;
+ return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(double2 v) {
+ union _type_cast_16_b u;
+ u._double2 = v;
+ return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(double2 v) {
+ union _type_cast_16_b u;
+ u._double2 = v;
+ return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(double2 v) {
+ union _type_cast_16_b u;
+ u._double2 = v;
+ return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(double2 v) {
+ union _type_cast_16_b u;
+ u._double2 = v;
+ return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(double2 v) {
+ union _type_cast_16_b u;
+ u._double2 = v;
+ return u._uchar16;
+}
+
+INLINE OVERLOADABLE float3 as_float3(double2 v) {
+ union _type_cast_16_b u;
+ u._double2 = v;
+ return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(double2 v) {
+ union _type_cast_16_b u;
+ u._double2 = v;
+ return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(float3 v) {
+ union _type_cast_16_b u;
+ u._float3 = v;
+ return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(float3 v) {
+ union _type_cast_16_b u;
+ u._float3 = v;
+ return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(float3 v) {
+ union _type_cast_16_b u;
+ u._float3 = v;
+ return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(float3 v) {
+ union _type_cast_16_b u;
+ u._float3 = v;
+ return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(float3 v) {
+ union _type_cast_16_b u;
+ u._float3 = v;
+ return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(float3 v) {
+ union _type_cast_16_b u;
+ u._float3 = v;
+ return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(float3 v) {
+ union _type_cast_16_b u;
+ u._float3 = v;
+ return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(float3 v) {
+ union _type_cast_16_b u;
+ u._float3 = v;
+ return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(float3 v) {
+ union _type_cast_16_b u;
+ u._float3 = v;
+ return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(float3 v) {
+ union _type_cast_16_b u;
+ u._float3 = v;
+ return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(float3 v) {
+ union _type_cast_16_b u;
+ u._float3 = v;
+ return u._double2;
+}
+
+INLINE OVERLOADABLE long2 as_long2(float4 v) {
+ union _type_cast_16_b u;
+ u._float4 = v;
+ return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(float4 v) {
+ union _type_cast_16_b u;
+ u._float4 = v;
+ return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(float4 v) {
+ union _type_cast_16_b u;
+ u._float4 = v;
+ return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(float4 v) {
+ union _type_cast_16_b u;
+ u._float4 = v;
+ return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(float4 v) {
+ union _type_cast_16_b u;
+ u._float4 = v;
+ return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(float4 v) {
+ union _type_cast_16_b u;
+ u._float4 = v;
+ return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(float4 v) {
+ union _type_cast_16_b u;
+ u._float4 = v;
+ return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(float4 v) {
+ union _type_cast_16_b u;
+ u._float4 = v;
+ return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(float4 v) {
+ union _type_cast_16_b u;
+ u._float4 = v;
+ return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(float4 v) {
+ union _type_cast_16_b u;
+ u._float4 = v;
+ return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(float4 v) {
+ union _type_cast_16_b u;
+ u._float4 = v;
+ return u._double2;
+}
+
+union _type_cast_32_b {
+ long3 _long3;
+ long4 _long4;
+ ulong3 _ulong3;
+ ulong4 _ulong4;
+ int8 _int8;
+ uint8 _uint8;
+ short16 _short16;
+ ushort16 _ushort16;
+ double3 _double3;
+ double4 _double4;
+ float8 _float8;
+};
+
+INLINE OVERLOADABLE ulong3 as_ulong3(long3 v) {
+ union _type_cast_32_b u;
+ u._long3 = v;
+ return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(long3 v) {
+ union _type_cast_32_b u;
+ u._long3 = v;
+ return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(long3 v) {
+ union _type_cast_32_b u;
+ u._long3 = v;
+ return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(long3 v) {
+ union _type_cast_32_b u;
+ u._long3 = v;
+ return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(long3 v) {
+ union _type_cast_32_b u;
+ u._long3 = v;
+ return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(long3 v) {
+ union _type_cast_32_b u;
+ u._long3 = v;
+ return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(long3 v) {
+ union _type_cast_32_b u;
+ u._long3 = v;
+ return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(long3 v) {
+ union _type_cast_32_b u;
+ u._long3 = v;
+ return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(long3 v) {
+ union _type_cast_32_b u;
+ u._long3 = v;
+ return u._float8;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(long4 v) {
+ union _type_cast_32_b u;
+ u._long4 = v;
+ return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(long4 v) {
+ union _type_cast_32_b u;
+ u._long4 = v;
+ return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(long4 v) {
+ union _type_cast_32_b u;
+ u._long4 = v;
+ return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(long4 v) {
+ union _type_cast_32_b u;
+ u._long4 = v;
+ return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(long4 v) {
+ union _type_cast_32_b u;
+ u._long4 = v;
+ return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(long4 v) {
+ union _type_cast_32_b u;
+ u._long4 = v;
+ return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(long4 v) {
+ union _type_cast_32_b u;
+ u._long4 = v;
+ return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(long4 v) {
+ union _type_cast_32_b u;
+ u._long4 = v;
+ return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(long4 v) {
+ union _type_cast_32_b u;
+ u._long4 = v;
+ return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(ulong3 v) {
+ union _type_cast_32_b u;
+ u._ulong3 = v;
+ return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(ulong3 v) {
+ union _type_cast_32_b u;
+ u._ulong3 = v;
+ return u._long4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(ulong3 v) {
+ union _type_cast_32_b u;
+ u._ulong3 = v;
+ return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(ulong3 v) {
+ union _type_cast_32_b u;
+ u._ulong3 = v;
+ return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(ulong3 v) {
+ union _type_cast_32_b u;
+ u._ulong3 = v;
+ return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(ulong3 v) {
+ union _type_cast_32_b u;
+ u._ulong3 = v;
+ return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(ulong3 v) {
+ union _type_cast_32_b u;
+ u._ulong3 = v;
+ return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(ulong3 v) {
+ union _type_cast_32_b u;
+ u._ulong3 = v;
+ return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(ulong3 v) {
+ union _type_cast_32_b u;
+ u._ulong3 = v;
+ return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(ulong4 v) {
+ union _type_cast_32_b u;
+ u._ulong4 = v;
+ return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(ulong4 v) {
+ union _type_cast_32_b u;
+ u._ulong4 = v;
+ return u._long4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(ulong4 v) {
+ union _type_cast_32_b u;
+ u._ulong4 = v;
+ return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(ulong4 v) {
+ union _type_cast_32_b u;
+ u._ulong4 = v;
+ return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(ulong4 v) {
+ union _type_cast_32_b u;
+ u._ulong4 = v;
+ return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(ulong4 v) {
+ union _type_cast_32_b u;
+ u._ulong4 = v;
+ return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(ulong4 v) {
+ union _type_cast_32_b u;
+ u._ulong4 = v;
+ return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(ulong4 v) {
+ union _type_cast_32_b u;
+ u._ulong4 = v;
+ return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(ulong4 v) {
+ union _type_cast_32_b u;
+ u._ulong4 = v;
+ return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(int8 v) {
+ union _type_cast_32_b u;
+ u._int8 = v;
+ return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(int8 v) {
+ union _type_cast_32_b u;
+ u._int8 = v;
+ return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(int8 v) {
+ union _type_cast_32_b u;
+ u._int8 = v;
+ return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(int8 v) {
+ union _type_cast_32_b u;
+ u._int8 = v;
+ return u._ulong4;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(int8 v) {
+ union _type_cast_32_b u;
+ u._int8 = v;
+ return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(int8 v) {
+ union _type_cast_32_b u;
+ u._int8 = v;
+ return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(int8 v) {
+ union _type_cast_32_b u;
+ u._int8 = v;
+ return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(int8 v) {
+ union _type_cast_32_b u;
+ u._int8 = v;
+ return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(int8 v) {
+ union _type_cast_32_b u;
+ u._int8 = v;
+ return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(int8 v) {
+ union _type_cast_32_b u;
+ u._int8 = v;
+ return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(uint8 v) {
+ union _type_cast_32_b u;
+ u._uint8 = v;
+ return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(uint8 v) {
+ union _type_cast_32_b u;
+ u._uint8 = v;
+ return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(uint8 v) {
+ union _type_cast_32_b u;
+ u._uint8 = v;
+ return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(uint8 v) {
+ union _type_cast_32_b u;
+ u._uint8 = v;
+ return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(uint8 v) {
+ union _type_cast_32_b u;
+ u._uint8 = v;
+ return u._int8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(uint8 v) {
+ union _type_cast_32_b u;
+ u._uint8 = v;
+ return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(uint8 v) {
+ union _type_cast_32_b u;
+ u._uint8 = v;
+ return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(uint8 v) {
+ union _type_cast_32_b u;
+ u._uint8 = v;
+ return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(uint8 v) {
+ union _type_cast_32_b u;
+ u._uint8 = v;
+ return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(uint8 v) {
+ union _type_cast_32_b u;
+ u._uint8 = v;
+ return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(short16 v) {
+ union _type_cast_32_b u;
+ u._short16 = v;
+ return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(short16 v) {
+ union _type_cast_32_b u;
+ u._short16 = v;
+ return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(short16 v) {
+ union _type_cast_32_b u;
+ u._short16 = v;
+ return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(short16 v) {
+ union _type_cast_32_b u;
+ u._short16 = v;
+ return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(short16 v) {
+ union _type_cast_32_b u;
+ u._short16 = v;
+ return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(short16 v) {
+ union _type_cast_32_b u;
+ u._short16 = v;
+ return u._uint8;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(short16 v) {
+ union _type_cast_32_b u;
+ u._short16 = v;
+ return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(short16 v) {
+ union _type_cast_32_b u;
+ u._short16 = v;
+ return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(short16 v) {
+ union _type_cast_32_b u;
+ u._short16 = v;
+ return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(short16 v) {
+ union _type_cast_32_b u;
+ u._short16 = v;
+ return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(ushort16 v) {
+ union _type_cast_32_b u;
+ u._ushort16 = v;
+ return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(ushort16 v) {
+ union _type_cast_32_b u;
+ u._ushort16 = v;
+ return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(ushort16 v) {
+ union _type_cast_32_b u;
+ u._ushort16 = v;
+ return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(ushort16 v) {
+ union _type_cast_32_b u;
+ u._ushort16 = v;
+ return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(ushort16 v) {
+ union _type_cast_32_b u;
+ u._ushort16 = v;
+ return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(ushort16 v) {
+ union _type_cast_32_b u;
+ u._ushort16 = v;
+ return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(ushort16 v) {
+ union _type_cast_32_b u;
+ u._ushort16 = v;
+ return u._short16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(ushort16 v) {
+ union _type_cast_32_b u;
+ u._ushort16 = v;
+ return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(ushort16 v) {
+ union _type_cast_32_b u;
+ u._ushort16 = v;
+ return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(ushort16 v) {
+ union _type_cast_32_b u;
+ u._ushort16 = v;
+ return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(double3 v) {
+ union _type_cast_32_b u;
+ u._double3 = v;
+ return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(double3 v) {
+ union _type_cast_32_b u;
+ u._double3 = v;
+ return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(double3 v) {
+ union _type_cast_32_b u;
+ u._double3 = v;
+ return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(double3 v) {
+ union _type_cast_32_b u;
+ u._double3 = v;
+ return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(double3 v) {
+ union _type_cast_32_b u;
+ u._double3 = v;
+ return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(double3 v) {
+ union _type_cast_32_b u;
+ u._double3 = v;
+ return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(double3 v) {
+ union _type_cast_32_b u;
+ u._double3 = v;
+ return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(double3 v) {
+ union _type_cast_32_b u;
+ u._double3 = v;
+ return u._ushort16;
+}
+
+INLINE OVERLOADABLE float8 as_float8(double3 v) {
+ union _type_cast_32_b u;
+ u._double3 = v;
+ return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(double4 v) {
+ union _type_cast_32_b u;
+ u._double4 = v;
+ return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(double4 v) {
+ union _type_cast_32_b u;
+ u._double4 = v;
+ return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(double4 v) {
+ union _type_cast_32_b u;
+ u._double4 = v;
+ return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(double4 v) {
+ union _type_cast_32_b u;
+ u._double4 = v;
+ return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(double4 v) {
+ union _type_cast_32_b u;
+ u._double4 = v;
+ return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(double4 v) {
+ union _type_cast_32_b u;
+ u._double4 = v;
+ return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(double4 v) {
+ union _type_cast_32_b u;
+ u._double4 = v;
+ return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(double4 v) {
+ union _type_cast_32_b u;
+ u._double4 = v;
+ return u._ushort16;
+}
+
+INLINE OVERLOADABLE float8 as_float8(double4 v) {
+ union _type_cast_32_b u;
+ u._double4 = v;
+ return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(float8 v) {
+ union _type_cast_32_b u;
+ u._float8 = v;
+ return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(float8 v) {
+ union _type_cast_32_b u;
+ u._float8 = v;
+ return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(float8 v) {
+ union _type_cast_32_b u;
+ u._float8 = v;
+ return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(float8 v) {
+ union _type_cast_32_b u;
+ u._float8 = v;
+ return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(float8 v) {
+ union _type_cast_32_b u;
+ u._float8 = v;
+ return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(float8 v) {
+ union _type_cast_32_b u;
+ u._float8 = v;
+ return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(float8 v) {
+ union _type_cast_32_b u;
+ u._float8 = v;
+ return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(float8 v) {
+ union _type_cast_32_b u;
+ u._float8 = v;
+ return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(float8 v) {
+ union _type_cast_32_b u;
+ u._float8 = v;
+ return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(float8 v) {
+ union _type_cast_32_b u;
+ u._float8 = v;
+ return u._double4;
+}
+
+union _type_cast_64_b {
+ long8 _long8;
+ ulong8 _ulong8;
+ int16 _int16;
+ uint16 _uint16;
+ double8 _double8;
+ float16 _float16;
+};
+
+INLINE OVERLOADABLE ulong8 as_ulong8(long8 v) {
+ union _type_cast_64_b u;
+ u._long8 = v;
+ return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(long8 v) {
+ union _type_cast_64_b u;
+ u._long8 = v;
+ return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(long8 v) {
+ union _type_cast_64_b u;
+ u._long8 = v;
+ return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(long8 v) {
+ union _type_cast_64_b u;
+ u._long8 = v;
+ return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(long8 v) {
+ union _type_cast_64_b u;
+ u._long8 = v;
+ return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(ulong8 v) {
+ union _type_cast_64_b u;
+ u._ulong8 = v;
+ return u._long8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(ulong8 v) {
+ union _type_cast_64_b u;
+ u._ulong8 = v;
+ return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(ulong8 v) {
+ union _type_cast_64_b u;
+ u._ulong8 = v;
+ return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(ulong8 v) {
+ union _type_cast_64_b u;
+ u._ulong8 = v;
+ return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(ulong8 v) {
+ union _type_cast_64_b u;
+ u._ulong8 = v;
+ return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(int16 v) {
+ union _type_cast_64_b u;
+ u._int16 = v;
+ return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(int16 v) {
+ union _type_cast_64_b u;
+ u._int16 = v;
+ return u._ulong8;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(int16 v) {
+ union _type_cast_64_b u;
+ u._int16 = v;
+ return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(int16 v) {
+ union _type_cast_64_b u;
+ u._int16 = v;
+ return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(int16 v) {
+ union _type_cast_64_b u;
+ u._int16 = v;
+ return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(uint16 v) {
+ union _type_cast_64_b u;
+ u._uint16 = v;
+ return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(uint16 v) {
+ union _type_cast_64_b u;
+ u._uint16 = v;
+ return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(uint16 v) {
+ union _type_cast_64_b u;
+ u._uint16 = v;
+ return u._int16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(uint16 v) {
+ union _type_cast_64_b u;
+ u._uint16 = v;
+ return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(uint16 v) {
+ union _type_cast_64_b u;
+ u._uint16 = v;
+ return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(double8 v) {
+ union _type_cast_64_b u;
+ u._double8 = v;
+ return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(double8 v) {
+ union _type_cast_64_b u;
+ u._double8 = v;
+ return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(double8 v) {
+ union _type_cast_64_b u;
+ u._double8 = v;
+ return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(double8 v) {
+ union _type_cast_64_b u;
+ u._double8 = v;
+ return u._uint16;
+}
+
+INLINE OVERLOADABLE float16 as_float16(double8 v) {
+ union _type_cast_64_b u;
+ u._double8 = v;
+ return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(float16 v) {
+ union _type_cast_64_b u;
+ u._float16 = v;
+ return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(float16 v) {
+ union _type_cast_64_b u;
+ u._float16 = v;
+ return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(float16 v) {
+ union _type_cast_64_b u;
+ u._float16 = v;
+ return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(float16 v) {
+ union _type_cast_64_b u;
+ u._float16 = v;
+ return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(float16 v) {
+ union _type_cast_64_b u;
+ u._float16 = v;
+ return u._double8;
+}
+
+union _type_cast_128_b {
+ long16 _long16;
+ ulong16 _ulong16;
+ double16 _double16;
+};
+
+INLINE OVERLOADABLE ulong16 as_ulong16(long16 v) {
+ union _type_cast_128_b u;
+ u._long16 = v;
+ return u._ulong16;
+}
+
+INLINE OVERLOADABLE double16 as_double16(long16 v) {
+ union _type_cast_128_b u;
+ u._long16 = v;
+ return u._double16;
+}
+
+INLINE OVERLOADABLE long16 as_long16(ulong16 v) {
+ union _type_cast_128_b u;
+ u._ulong16 = v;
+ return u._long16;
+}
+
+INLINE OVERLOADABLE double16 as_double16(ulong16 v) {
+ union _type_cast_128_b u;
+ u._ulong16 = v;
+ return u._double16;
+}
+
+INLINE OVERLOADABLE long16 as_long16(double16 v) {
+ union _type_cast_128_b u;
+ u._double16 = v;
+ return u._long16;
+}
+
+INLINE OVERLOADABLE ulong16 as_ulong16(double16 v) {
+ union _type_cast_128_b u;
+ u._double16 = v;
+ return u._ulong16;
+}
+
diff --git a/backend/src/ocl_barrier.ll b/backend/src/ocl_barrier.ll
new file mode 100644
index 0000000..4e55fcb
--- /dev/null
+++ b/backend/src/ocl_barrier.ll
@@ -0,0 +1,39 @@
+;XXX FIXME as llvm can't use macros, we hardcoded 3, 1, 2
+;here, we may need to use a more grace way to handle this type
+;of values latter.
+;#define CLK_LOCAL_MEM_FENCE (1 << 0)
+;#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+declare i32 @_get_local_mem_fence() nounwind alwaysinline
+declare i32 @_get_global_mem_fence() nounwind alwaysinline
+declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline noduplicate
+
+define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
+ %1 = icmp eq i32 %flags, 3
+ br i1 %1, label %barrier_local_global, label %barrier_local_check
+
+barrier_local_global:
+ call void @__gen_ocl_barrier_local_and_global()
+ br label %done
+
+barrier_local_check:
+ %2 = icmp eq i32 %flags, 1
+ br i1 %2, label %barrier_local, label %barrier_global_check
+
+barrier_local:
+ call void @__gen_ocl_barrier_local()
+ br label %done
+
+barrier_global_check:
+ %3 = icmp eq i32 %flags, 2
+ br i1 %3, label %barrier_global, label %done
+
+barrier_global:
+ call void @__gen_ocl_barrier_global()
+ br label %done
+
+done:
+ ret void
+}
diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h
new file mode 100644
index 0000000..52f5365
--- /dev/null
+++ b/backend/src/ocl_common_defines.h
@@ -0,0 +1,126 @@
+// This file includes defines that are common to both kernel code and
+// the NVPTX back-end.
+#ifndef __OCL_COMMON_DEFINES__
+#define __OCL_COMMON_DEFINES__
+//
+// Common defines for Image intrinsics
+// Channel order
+#define CLK_HAS_ALPHA(color) (color == CLK_A || color == CLK_RA || color == CLK_RGBA || color == CLK_BGRA || color == CLK_ARGB)
+enum {
+ CLK_R = 0x10B0,
+ CLK_A = 0x10B1,
+ CLK_RG = 0x10B2,
+ CLK_RA = 0x10B3,
+ CLK_RGB = 0x10B4,
+ CLK_RGBA = 0x10B5,
+ CLK_BGRA = 0x10B6,
+ CLK_ARGB = 0x10B7,
+
+#if (__NV_CL_C_VERSION == __NV_CL_C_VERSION_1_0)
+ CLK_xRGB = 0x10B7,
+#endif
+
+ CLK_INTENSITY = 0x10B8,
+ CLK_LUMINANCE = 0x10B9
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+ ,
+ CLK_Rx = 0x10BA,
+ CLK_RGx = 0x10BB,
+ CLK_RGBx = 0x10BC
+#endif
+};
+
+
+typedef enum clk_channel_type {
+ // valid formats for float return types
+ CLK_SNORM_INT8 = 0x10D0, // four channel RGBA unorm8
+ CLK_SNORM_INT16 = 0x10D1, // four channel RGBA unorm16
+ CLK_UNORM_INT8 = 0x10D2, // four channel RGBA unorm8
+ CLK_UNORM_INT16 = 0x10D3, // four channel RGBA unorm16
+ CLK_HALF_FLOAT = 0x10DD, // four channel RGBA half
+ CLK_FLOAT = 0x10DE, // four channel RGBA float
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+ CLK_UNORM_SHORT_565 = 0x10D4,
+ CLK_UNORM_SHORT_555 = 0x10D5,
+ CLK_UNORM_INT_101010 = 0x10D6,
+#endif
+
+ // valid only for integer return types
+ CLK_SIGNED_INT8 = 0x10D7,
+ CLK_SIGNED_INT16 = 0x10D8,
+ CLK_SIGNED_INT32 = 0x10D9,
+ CLK_UNSIGNED_INT8 = 0x10DA,
+ CLK_UNSIGNED_INT16 = 0x10DB,
+ CLK_UNSIGNED_INT32 = 0x10DC,
+
+ // CI SPI for CPU
+ __CLK_UNORM_INT8888 , // four channel ARGB unorm8
+ __CLK_UNORM_INT8888R, // four channel BGRA unorm8
+
+ __CLK_VALID_IMAGE_TYPE_COUNT,
+ __CLK_INVALID_IMAGE_TYPE = __CLK_VALID_IMAGE_TYPE_COUNT,
+ __CLK_VALID_IMAGE_TYPE_MASK_BITS = 4, // number of bits required to
+ // represent any image type
+ __CLK_VALID_IMAGE_TYPE_MASK = ( 1 << __CLK_VALID_IMAGE_TYPE_MASK_BITS ) - 1
+}clk_channel_type;
+
+typedef enum clk_sampler_type {
+ __CLK_ADDRESS_BASE = 0,
+ CLK_ADDRESS_NONE = (0 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_CLAMP = (1 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_CLAMP_TO_EDGE = (2 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_REPEAT = (3 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_MIRROR = (4 << __CLK_ADDRESS_BASE),
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+ CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR,
+#endif
+ __CLK_ADDRESS_MASK = (CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
+ CLK_ADDRESS_CLAMP_TO_EDGE |
+ CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR),
+ __CLK_ADDRESS_BITS = 3, // number of bits required to
+ // represent address info
+
+ __CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS,
+ CLK_NORMALIZED_COORDS_FALSE = 0,
+ CLK_NORMALIZED_COORDS_TRUE = (1 << __CLK_NORMALIZED_BASE),
+ __CLK_NORMALIZED_MASK = (CLK_NORMALIZED_COORDS_FALSE |
+ CLK_NORMALIZED_COORDS_TRUE),
+ __CLK_NORMALIZED_BITS = 1, // number of bits required to
+ // represent normalization
+ __CLK_FILTER_BASE = (__CLK_NORMALIZED_BASE + __CLK_NORMALIZED_BITS),
+ CLK_FILTER_NEAREST = (0 << __CLK_FILTER_BASE),
+ CLK_FILTER_LINEAR = (1 << __CLK_FILTER_BASE),
+ CLK_FILTER_ANISOTROPIC = (2 << __CLK_FILTER_BASE),
+ __CLK_FILTER_MASK = (CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
+ CLK_FILTER_ANISOTROPIC),
+ __CLK_FILTER_BITS = 2, // number of bits required to
+ // represent address info
+
+ __CLK_MIP_BASE = (__CLK_FILTER_BASE + __CLK_FILTER_BITS),
+ CLK_MIP_NEAREST = (0 << __CLK_MIP_BASE),
+ CLK_MIP_LINEAR = (1 << __CLK_MIP_BASE),
+ CLK_MIP_ANISOTROPIC = (2 << __CLK_MIP_BASE),
+ __CLK_MIP_MASK = (CLK_MIP_NEAREST | CLK_MIP_LINEAR |
+ CLK_MIP_ANISOTROPIC),
+ __CLK_MIP_BITS = 2,
+
+ __CLK_SAMPLER_BITS = (__CLK_MIP_BASE + __CLK_MIP_BITS),
+ __CLK_SAMPLER_MASK = (__CLK_MIP_MASK | __CLK_FILTER_MASK |
+ __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK),
+
+ __CLK_SAMPLER_ARG_BASE = (__CLK_MIP_BASE + __CLK_SAMPLER_BITS),
+ __CLK_SAMPLER_ARG_BITS = 8,
+ __CLK_SAMPLER_ARG_MASK = (((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE),
+ __CLK_SAMPLER_ARG_KEY_BIT = (1 << (__CLK_SAMPLER_ARG_BASE + __CLK_SAMPLER_ARG_BITS)),
+ __CLK_SAMPLER_ARG_KEY_BITS = 1,
+
+} clk_sampler_type;
+
+// Memory synchronization
+#define CLK_LOCAL_MEM_FENCE (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+#endif /* __OCL_COMMON_DEFINES__ */
\ No newline at end of file
diff --git a/backend/src/ocl_convert.h b/backend/src/ocl_convert.h
new file mode 100644
index 0000000..8326768
--- /dev/null
+++ b/backend/src/ocl_convert.h
@@ -0,0 +1,17415 @@
+// This file is autogenerated by gen_convert.sh.
+// Don't modify it manually.
+INLINE OVERLOADABLE long convert_long(long v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(long v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(long v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(long v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(long v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(long v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(long v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(long v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(long v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(long v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(ulong v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(ulong v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(ulong v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(ulong v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(ulong v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(ulong v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(ulong v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(ulong v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(ulong v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(ulong v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(int v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(int v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(int v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(int v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(int v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(int v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(int v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(int v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(int v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(int v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(uint v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(uint v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(uint v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(uint v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(uint v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(uint v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(uint v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(uint v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(uint v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(uint v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(short v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(short v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(short v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(short v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(short v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(short v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(short v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(short v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(short v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(short v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(ushort v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(ushort v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(ushort v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(ushort v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(ushort v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(ushort v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(ushort v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(ushort v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(ushort v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(ushort v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(char v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(char v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(char v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(char v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(char v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(char v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(char v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(char v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(char v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(char v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(uchar v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(uchar v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(uchar v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(uchar v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(uchar v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(uchar v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(uchar v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(uchar v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(uchar v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(uchar v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(double v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(double v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(double v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(double v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(double v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(double v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(double v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(double v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(double v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(double v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(float v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(float v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(float v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(float v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(float v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(float v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(float v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(float v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(float v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(float v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long2 convert_long2(long2 v) { return v; }
+INLINE OVERLOADABLE ulong2 convert_ulong2(long2 v) {
+ return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(long2 v) {
+ return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(long2 v) {
+ return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(long2 v) {
+ return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(long2 v) {
+ return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(long2 v) {
+ return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(long2 v) {
+ return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(long2 v) {
+ return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(long2 v) {
+ return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(ulong2 v) {
+ return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(ulong2 v) { return v; }
+INLINE OVERLOADABLE int2 convert_int2(ulong2 v) {
+ return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(ulong2 v) {
+ return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(ulong2 v) {
+ return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(ulong2 v) {
+ return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(ulong2 v) {
+ return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(ulong2 v) {
+ return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(ulong2 v) {
+ return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(ulong2 v) {
+ return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(int2 v) {
+ return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(int2 v) {
+ return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(int2 v) { return v; }
+INLINE OVERLOADABLE uint2 convert_uint2(int2 v) {
+ return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(int2 v) {
+ return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(int2 v) {
+ return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(int2 v) {
+ return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(int2 v) {
+ return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(int2 v) {
+ return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(int2 v) {
+ return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(uint2 v) {
+ return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(uint2 v) {
+ return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(uint2 v) {
+ return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(uint2 v) { return v; }
+INLINE OVERLOADABLE short2 convert_short2(uint2 v) {
+ return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(uint2 v) {
+ return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(uint2 v) {
+ return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(uint2 v) {
+ return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(uint2 v) {
+ return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(uint2 v) {
+ return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(short2 v) {
+ return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(short2 v) {
+ return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(short2 v) {
+ return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(short2 v) {
+ return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(short2 v) { return v; }
+INLINE OVERLOADABLE ushort2 convert_ushort2(short2 v) {
+ return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(short2 v) {
+ return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(short2 v) {
+ return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(short2 v) {
+ return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(short2 v) {
+ return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(ushort2 v) {
+ return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(ushort2 v) {
+ return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(ushort2 v) {
+ return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(ushort2 v) {
+ return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(ushort2 v) {
+ return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(ushort2 v) { return v; }
+INLINE OVERLOADABLE char2 convert_char2(ushort2 v) {
+ return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(ushort2 v) {
+ return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(ushort2 v) {
+ return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(ushort2 v) {
+ return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(char2 v) {
+ return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(char2 v) {
+ return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(char2 v) {
+ return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(char2 v) {
+ return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(char2 v) {
+ return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(char2 v) {
+ return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(char2 v) { return v; }
+INLINE OVERLOADABLE uchar2 convert_uchar2(char2 v) {
+ return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(char2 v) {
+ return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(char2 v) {
+ return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(uchar2 v) {
+ return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(uchar2 v) {
+ return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(uchar2 v) {
+ return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(uchar2 v) {
+ return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(uchar2 v) {
+ return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(uchar2 v) {
+ return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(uchar2 v) {
+ return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(uchar2 v) { return v; }
+INLINE OVERLOADABLE double2 convert_double2(uchar2 v) {
+ return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(uchar2 v) {
+ return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(double2 v) {
+ return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(double2 v) {
+ return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(double2 v) {
+ return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(double2 v) {
+ return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(double2 v) {
+ return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(double2 v) {
+ return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(double2 v) {
+ return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(double2 v) {
+ return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(double2 v) { return v; }
+INLINE OVERLOADABLE float2 convert_float2(double2 v) {
+ return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(float2 v) {
+ return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(float2 v) {
+ return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(float2 v) {
+ return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(float2 v) {
+ return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(float2 v) {
+ return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(float2 v) {
+ return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(float2 v) {
+ return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(float2 v) {
+ return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(float2 v) {
+ return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(float2 v) { return v; }
+INLINE OVERLOADABLE long3 convert_long3(long3 v) { return v; }
+INLINE OVERLOADABLE ulong3 convert_ulong3(long3 v) {
+ return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(long3 v) {
+ return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(long3 v) {
+ return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(long3 v) {
+ return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(long3 v) {
+ return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(long3 v) {
+ return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(long3 v) {
+ return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(long3 v) {
+ return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(long3 v) {
+ return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(ulong3 v) {
+ return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(ulong3 v) { return v; }
+INLINE OVERLOADABLE int3 convert_int3(ulong3 v) {
+ return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(ulong3 v) {
+ return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(ulong3 v) {
+ return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(ulong3 v) {
+ return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(ulong3 v) {
+ return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(ulong3 v) {
+ return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(ulong3 v) {
+ return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(ulong3 v) {
+ return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(int3 v) {
+ return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(int3 v) {
+ return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(int3 v) { return v; }
+INLINE OVERLOADABLE uint3 convert_uint3(int3 v) {
+ return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(int3 v) {
+ return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(int3 v) {
+ return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(int3 v) {
+ return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(int3 v) {
+ return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(int3 v) {
+ return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(int3 v) {
+ return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(uint3 v) {
+ return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(uint3 v) {
+ return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(uint3 v) {
+ return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(uint3 v) { return v; }
+INLINE OVERLOADABLE short3 convert_short3(uint3 v) {
+ return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(uint3 v) {
+ return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(uint3 v) {
+ return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(uint3 v) {
+ return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(uint3 v) {
+ return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(uint3 v) {
+ return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(short3 v) {
+ return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(short3 v) {
+ return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(short3 v) {
+ return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(short3 v) {
+ return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(short3 v) { return v; }
+INLINE OVERLOADABLE ushort3 convert_ushort3(short3 v) {
+ return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(short3 v) {
+ return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(short3 v) {
+ return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(short3 v) {
+ return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(short3 v) {
+ return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(ushort3 v) {
+ return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(ushort3 v) {
+ return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(ushort3 v) {
+ return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(ushort3 v) {
+ return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(ushort3 v) {
+ return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(ushort3 v) { return v; }
+INLINE OVERLOADABLE char3 convert_char3(ushort3 v) {
+ return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(ushort3 v) {
+ return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(ushort3 v) {
+ return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(ushort3 v) {
+ return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(char3 v) {
+ return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(char3 v) {
+ return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(char3 v) {
+ return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(char3 v) {
+ return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(char3 v) {
+ return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(char3 v) {
+ return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(char3 v) { return v; }
+INLINE OVERLOADABLE uchar3 convert_uchar3(char3 v) {
+ return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(char3 v) {
+ return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(char3 v) {
+ return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(uchar3 v) {
+ return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(uchar3 v) {
+ return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(uchar3 v) {
+ return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(uchar3 v) {
+ return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(uchar3 v) {
+ return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(uchar3 v) {
+ return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(uchar3 v) {
+ return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(uchar3 v) { return v; }
+INLINE OVERLOADABLE double3 convert_double3(uchar3 v) {
+ return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(uchar3 v) {
+ return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(double3 v) {
+ return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(double3 v) {
+ return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(double3 v) {
+ return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(double3 v) {
+ return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(double3 v) {
+ return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(double3 v) {
+ return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(double3 v) {
+ return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(double3 v) {
+ return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(double3 v) { return v; }
+INLINE OVERLOADABLE float3 convert_float3(double3 v) {
+ return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(float3 v) {
+ return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(float3 v) {
+ return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(float3 v) {
+ return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(float3 v) {
+ return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(float3 v) {
+ return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(float3 v) {
+ return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(float3 v) {
+ return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(float3 v) {
+ return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(float3 v) {
+ return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(float3 v) { return v; }
+INLINE OVERLOADABLE long4 convert_long4(long4 v) { return v; }
+INLINE OVERLOADABLE ulong4 convert_ulong4(long4 v) {
+ return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(long4 v) {
+ return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(long4 v) {
+ return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(long4 v) {
+ return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(long4 v) {
+ return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(long4 v) {
+ return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(long4 v) {
+ return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(long4 v) {
+ return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(long4 v) {
+ return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(ulong4 v) {
+ return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(ulong4 v) { return v; }
+INLINE OVERLOADABLE int4 convert_int4(ulong4 v) {
+ return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(ulong4 v) {
+ return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(ulong4 v) {
+ return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(ulong4 v) {
+ return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(ulong4 v) {
+ return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(ulong4 v) {
+ return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(ulong4 v) {
+ return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(ulong4 v) {
+ return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(int4 v) {
+ return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(int4 v) {
+ return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(int4 v) { return v; }
+INLINE OVERLOADABLE uint4 convert_uint4(int4 v) {
+ return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(int4 v) {
+ return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(int4 v) {
+ return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(int4 v) {
+ return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(int4 v) {
+ return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(int4 v) {
+ return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(int4 v) {
+ return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(uint4 v) {
+ return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(uint4 v) {
+ return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(uint4 v) {
+ return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(uint4 v) { return v; }
+INLINE OVERLOADABLE short4 convert_short4(uint4 v) {
+ return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(uint4 v) {
+ return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(uint4 v) {
+ return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(uint4 v) {
+ return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(uint4 v) {
+ return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(uint4 v) {
+ return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(short4 v) {
+ return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(short4 v) {
+ return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(short4 v) {
+ return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(short4 v) {
+ return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(short4 v) { return v; }
+INLINE OVERLOADABLE ushort4 convert_ushort4(short4 v) {
+ return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(short4 v) {
+ return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(short4 v) {
+ return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(short4 v) {
+ return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(short4 v) {
+ return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(ushort4 v) {
+ return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(ushort4 v) {
+ return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(ushort4 v) {
+ return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(ushort4 v) {
+ return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(ushort4 v) {
+ return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(ushort4 v) { return v; }
+INLINE OVERLOADABLE char4 convert_char4(ushort4 v) {
+ return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(ushort4 v) {
+ return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(ushort4 v) {
+ return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(ushort4 v) {
+ return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(char4 v) {
+ return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(char4 v) {
+ return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(char4 v) {
+ return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(char4 v) {
+ return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(char4 v) {
+ return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(char4 v) {
+ return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(char4 v) { return v; }
+INLINE OVERLOADABLE uchar4 convert_uchar4(char4 v) {
+ return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(char4 v) {
+ return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(char4 v) {
+ return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(uchar4 v) {
+ return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(uchar4 v) {
+ return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(uchar4 v) {
+ return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(uchar4 v) {
+ return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(uchar4 v) {
+ return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(uchar4 v) {
+ return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(uchar4 v) {
+ return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(uchar4 v) { return v; }
+INLINE OVERLOADABLE double4 convert_double4(uchar4 v) {
+ return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(uchar4 v) {
+ return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(double4 v) {
+ return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(double4 v) {
+ return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(double4 v) {
+ return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(double4 v) {
+ return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(double4 v) {
+ return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(double4 v) {
+ return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(double4 v) {
+ return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(double4 v) {
+ return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(double4 v) { return v; }
+INLINE OVERLOADABLE float4 convert_float4(double4 v) {
+ return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(float4 v) {
+ return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(float4 v) {
+ return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(float4 v) {
+ return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(float4 v) {
+ return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(float4 v) {
+ return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(float4 v) {
+ return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(float4 v) {
+ return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(float4 v) {
+ return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(float4 v) {
+ return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(float4 v) { return v; }
+INLINE OVERLOADABLE long8 convert_long8(long8 v) { return v; }
+INLINE OVERLOADABLE ulong8 convert_ulong8(long8 v) {
+ return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(long8 v) {
+ return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(long8 v) {
+ return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(long8 v) {
+ return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(long8 v) {
+ return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(long8 v) {
+ return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(long8 v) {
+ return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(long8 v) {
+ return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(long8 v) {
+ return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(ulong8 v) {
+ return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(ulong8 v) { return v; }
+INLINE OVERLOADABLE int8 convert_int8(ulong8 v) {
+ return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(ulong8 v) {
+ return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(ulong8 v) {
+ return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(ulong8 v) {
+ return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(ulong8 v) {
+ return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(ulong8 v) {
+ return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(ulong8 v) {
+ return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(ulong8 v) {
+ return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(int8 v) {
+ return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(int8 v) {
+ return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(int8 v) { return v; }
+INLINE OVERLOADABLE uint8 convert_uint8(int8 v) {
+ return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(int8 v) {
+ return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(int8 v) {
+ return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(int8 v) {
+ return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(int8 v) {
+ return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(int8 v) {
+ return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(int8 v) {
+ return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(uint8 v) {
+ return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(uint8 v) {
+ return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(uint8 v) {
+ return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(uint8 v) { return v; }
+INLINE OVERLOADABLE short8 convert_short8(uint8 v) {
+ return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(uint8 v) {
+ return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(uint8 v) {
+ return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(uint8 v) {
+ return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(uint8 v) {
+ return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(uint8 v) {
+ return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(short8 v) {
+ return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(short8 v) {
+ return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(short8 v) {
+ return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(short8 v) {
+ return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(short8 v) { return v; }
+INLINE OVERLOADABLE ushort8 convert_ushort8(short8 v) {
+ return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(short8 v) {
+ return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(short8 v) {
+ return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(short8 v) {
+ return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(short8 v) {
+ return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(ushort8 v) {
+ return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(ushort8 v) {
+ return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(ushort8 v) {
+ return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(ushort8 v) {
+ return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(ushort8 v) {
+ return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(ushort8 v) { return v; }
+INLINE OVERLOADABLE char8 convert_char8(ushort8 v) {
+ return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(ushort8 v) {
+ return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(ushort8 v) {
+ return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(ushort8 v) {
+ return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(char8 v) {
+ return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(char8 v) {
+ return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(char8 v) {
+ return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(char8 v) {
+ return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(char8 v) {
+ return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(char8 v) {
+ return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(char8 v) { return v; }
+INLINE OVERLOADABLE uchar8 convert_uchar8(char8 v) {
+ return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(char8 v) {
+ return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(char8 v) {
+ return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(uchar8 v) {
+ return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(uchar8 v) {
+ return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(uchar8 v) {
+ return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(uchar8 v) {
+ return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(uchar8 v) {
+ return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(uchar8 v) {
+ return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(uchar8 v) {
+ return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(uchar8 v) { return v; }
+INLINE OVERLOADABLE double8 convert_double8(uchar8 v) {
+ return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(uchar8 v) {
+ return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(double8 v) {
+ return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(double8 v) {
+ return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(double8 v) {
+ return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(double8 v) {
+ return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(double8 v) {
+ return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(double8 v) {
+ return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(double8 v) {
+ return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(double8 v) {
+ return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(double8 v) { return v; }
+INLINE OVERLOADABLE float8 convert_float8(double8 v) {
+ return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(float8 v) {
+ return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(float8 v) {
+ return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(float8 v) {
+ return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(float8 v) {
+ return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(float8 v) {
+ return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(float8 v) {
+ return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(float8 v) {
+ return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(float8 v) {
+ return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(float8 v) {
+ return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(float8 v) { return v; }
+INLINE OVERLOADABLE long16 convert_long16(long16 v) { return v; }
+INLINE OVERLOADABLE ulong16 convert_ulong16(long16 v) {
+ return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(long16 v) {
+ return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(long16 v) {
+ return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(long16 v) {
+ return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(long16 v) {
+ return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(long16 v) {
+ return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(long16 v) {
+ return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(long16 v) {
+ return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(long16 v) {
+ return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(ulong16 v) {
+ return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(ulong16 v) { return v; }
+INLINE OVERLOADABLE int16 convert_int16(ulong16 v) {
+ return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(ulong16 v) {
+ return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(ulong16 v) {
+ return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(ulong16 v) {
+ return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(ulong16 v) {
+ return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(ulong16 v) {
+ return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(ulong16 v) {
+ return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(ulong16 v) {
+ return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(int16 v) {
+ return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(int16 v) {
+ return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(int16 v) { return v; }
+INLINE OVERLOADABLE uint16 convert_uint16(int16 v) {
+ return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(int16 v) {
+ return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(int16 v) {
+ return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(int16 v) {
+ return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(int16 v) {
+ return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(int16 v) {
+ return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(int16 v) {
+ return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(uint16 v) {
+ return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(uint16 v) {
+ return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(uint16 v) {
+ return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(uint16 v) { return v; }
+INLINE OVERLOADABLE short16 convert_short16(uint16 v) {
+ return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(uint16 v) {
+ return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(uint16 v) {
+ return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(uint16 v) {
+ return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(uint16 v) {
+ return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(uint16 v) {
+ return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(short16 v) {
+ return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(short16 v) {
+ return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(short16 v) {
+ return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(short16 v) {
+ return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(short16 v) { return v; }
+INLINE OVERLOADABLE ushort16 convert_ushort16(short16 v) {
+ return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(short16 v) {
+ return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(short16 v) {
+ return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(short16 v) {
+ return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(short16 v) {
+ return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(ushort16 v) {
+ return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(ushort16 v) {
+ return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(ushort16 v) {
+ return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(ushort16 v) {
+ return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(ushort16 v) {
+ return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(ushort16 v) { return v; }
+INLINE OVERLOADABLE char16 convert_char16(ushort16 v) {
+ return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(ushort16 v) {
+ return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(ushort16 v) {
+ return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(ushort16 v) {
+ return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(char16 v) {
+ return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(char16 v) {
+ return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(char16 v) {
+ return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(char16 v) {
+ return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(char16 v) {
+ return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(char16 v) {
+ return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(char16 v) { return v; }
+INLINE OVERLOADABLE uchar16 convert_uchar16(char16 v) {
+ return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(char16 v) {
+ return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(char16 v) {
+ return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(uchar16 v) {
+ return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(uchar16 v) {
+ return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(uchar16 v) {
+ return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(uchar16 v) {
+ return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(uchar16 v) {
+ return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(uchar16 v) {
+ return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(uchar16 v) {
+ return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(uchar16 v) { return v; }
+INLINE OVERLOADABLE double16 convert_double16(uchar16 v) {
+ return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(uchar16 v) {
+ return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(double16 v) {
+ return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(double16 v) {
+ return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(double16 v) {
+ return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(double16 v) {
+ return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(double16 v) {
+ return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(double16 v) {
+ return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(double16 v) {
+ return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(double16 v) {
+ return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(double16 v) { return v; }
+INLINE OVERLOADABLE float16 convert_float16(double16 v) {
+ return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(float16 v) {
+ return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(float16 v) {
+ return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(float16 v) {
+ return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(float16 v) {
+ return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(float16 v) {
+ return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(float16 v) {
+ return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(float16 v) {
+ return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(float16 v) {
+ return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(float16 v) {
+ return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(float16 v) { return v; }
+
+#define DEF(DSTTYPE, SRCTYPE) \
+ OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);
+DEF(char, uchar);
+DEF(char, short);
+DEF(char, ushort);
+DEF(char, int);
+DEF(char, uint);
+DEF(char, float);
+DEF(uchar, char);
+DEF(uchar, short);
+DEF(uchar, ushort);
+DEF(uchar, int);
+DEF(uchar, uint);
+DEF(uchar, float);
+DEF(short, ushort);
+DEF(short, int);
+DEF(short, uint);
+DEF(short, float);
+DEF(ushort, short);
+DEF(ushort, int);
+DEF(ushort, uint);
+DEF(ushort, float);
+DEF(int, uint);
+DEF(int, float);
+DEF(uint, int);
+DEF(uint, float);
+#undef DEF
+
+#define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
+ INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+ return x >= MAX ? (DSTTYPE)MAX : x <= MIN ? (DSTTYPE)MIN : x; \
+ }
+DEF(char, long, -128, 127);
+DEF(uchar, long, 0, 255);
+DEF(short, long, -32768, 32767);
+DEF(ushort, long, 0, 65535);
+DEF(int, long, -0x7fffffff-1, 0x7fffffff);
+DEF(uint, long, 0, 0xffffffffu);
+DEF(long, float, -9.223372036854776e+18f, 9.223372036854776e+18f);
+DEF(ulong, float, 0, 1.8446744073709552e+19f);
+#undef DEF
+
+#define DEF(DSTTYPE, SRCTYPE, MAX) \
+ INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+ return x >= MAX ? (DSTTYPE)MAX : x; \
+ }
+DEF(char, ulong, 127);
+DEF(uchar, ulong, 255);
+DEF(short, ulong, 32767);
+DEF(ushort, ulong, 65535);
+DEF(int, ulong, 0x7fffffff);
+DEF(uint, ulong, 0xffffffffu);
+#undef DEF
+
+INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
+ ulong MAX = 0x7ffffffffffffffful;
+ return x >= MAX ? MAX : x;
+}
+
+#define DEF(DSTTYPE, SRCTYPE) \
+ INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+ return x <= 0 ? 0 : x; \
+ }
+DEF(ushort, char);
+DEF(uint, char);
+DEF(uint, short);
+DEF(ulong, char);
+DEF(ulong, short);
+DEF(ulong, int);
+DEF(ulong, long);
+#undef DEF
+
+#define DEF(DSTTYPE, SRCTYPE) \
+ INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+ return x; \
+ }
+DEF(char, char);
+DEF(uchar, uchar);
+DEF(short, char);
+DEF(short, uchar);
+DEF(short, short);
+DEF(ushort, uchar);
+DEF(ushort, ushort);
+DEF(int, char);
+DEF(int, uchar);
+DEF(int, short);
+DEF(int, ushort);
+DEF(int, int);
+DEF(uint, uchar);
+DEF(uint, ushort);
+DEF(uint, uint);
+DEF(long, char);
+DEF(long, uchar);
+DEF(long, short);
+DEF(long, ushort);
+DEF(long, int);
+DEF(long, uint);
+DEF(long, long);
+DEF(ulong, uchar);
+DEF(ulong, ushort);
+DEF(ulong, uint);
+DEF(ulong, ulong);
+#undef DEF
+
+INLINE OVERLOADABLE long2 convert_long2_sat(long2 v) {
+ return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(long2 v) {
+ return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(long2 v) {
+ return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(long2 v) {
+ return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(long2 v) {
+ return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(long2 v) {
+ return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(long2 v) {
+ return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(long2 v) {
+ return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(ulong2 v) {
+ return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(ulong2 v) {
+ return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(ulong2 v) {
+ return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(ulong2 v) {
+ return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(ulong2 v) {
+ return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(ulong2 v) {
+ return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(ulong2 v) {
+ return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(ulong2 v) {
+ return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(int2 v) {
+ return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(int2 v) {
+ return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(int2 v) {
+ return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(int2 v) {
+ return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(int2 v) {
+ return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(int2 v) {
+ return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(int2 v) {
+ return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(int2 v) {
+ return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(uint2 v) {
+ return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(uint2 v) {
+ return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(uint2 v) {
+ return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(uint2 v) {
+ return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(uint2 v) {
+ return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(uint2 v) {
+ return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(uint2 v) {
+ return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(uint2 v) {
+ return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(short2 v) {
+ return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(short2 v) {
+ return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(short2 v) {
+ return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(short2 v) {
+ return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(short2 v) {
+ return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(short2 v) {
+ return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(short2 v) {
+ return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(short2 v) {
+ return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(ushort2 v) {
+ return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(ushort2 v) {
+ return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(ushort2 v) {
+ return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(ushort2 v) {
+ return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(ushort2 v) {
+ return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(ushort2 v) {
+ return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(ushort2 v) {
+ return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(ushort2 v) {
+ return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(char2 v) {
+ return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(char2 v) {
+ return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(char2 v) {
+ return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(char2 v) {
+ return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(char2 v) {
+ return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(char2 v) {
+ return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(char2 v) {
+ return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(char2 v) {
+ return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(uchar2 v) {
+ return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(uchar2 v) {
+ return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(uchar2 v) {
+ return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(uchar2 v) {
+ return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(uchar2 v) {
+ return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(uchar2 v) {
+ return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(uchar2 v) {
+ return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(uchar2 v) {
+ return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(float2 v) {
+ return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(float2 v) {
+ return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(float2 v) {
+ return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(float2 v) {
+ return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(float2 v) {
+ return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(float2 v) {
+ return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(float2 v) {
+ return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(float2 v) {
+ return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(long3 v) {
+ return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(long3 v) {
+ return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(long3 v) {
+ return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(long3 v) {
+ return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(long3 v) {
+ return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(long3 v) {
+ return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(long3 v) {
+ return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(long3 v) {
+ return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(ulong3 v) {
+ return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(ulong3 v) {
+ return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(ulong3 v) {
+ return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(ulong3 v) {
+ return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(ulong3 v) {
+ return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(ulong3 v) {
+ return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(ulong3 v) {
+ return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(ulong3 v) {
+ return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(int3 v) {
+ return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(int3 v) {
+ return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(int3 v) {
+ return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(int3 v) {
+ return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(int3 v) {
+ return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(int3 v) {
+ return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(int3 v) {
+ return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(int3 v) {
+ return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(uint3 v) {
+ return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(uint3 v) {
+ return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(uint3 v) {
+ return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(uint3 v) {
+ return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(uint3 v) {
+ return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(uint3 v) {
+ return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(uint3 v) {
+ return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(uint3 v) {
+ return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(short3 v) {
+ return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(short3 v) {
+ return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(short3 v) {
+ return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(short3 v) {
+ return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(short3 v) {
+ return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(short3 v) {
+ return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(short3 v) {
+ return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(short3 v) {
+ return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(ushort3 v) {
+ return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(ushort3 v) {
+ return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(ushort3 v) {
+ return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(ushort3 v) {
+ return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(ushort3 v) {
+ return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(ushort3 v) {
+ return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(ushort3 v) {
+ return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(ushort3 v) {
+ return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(char3 v) {
+ return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(char3 v) {
+ return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(char3 v) {
+ return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(char3 v) {
+ return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(char3 v) {
+ return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(char3 v) {
+ return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(char3 v) {
+ return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(char3 v) {
+ return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(uchar3 v) {
+ return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(uchar3 v) {
+ return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(uchar3 v) {
+ return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(uchar3 v) {
+ return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(uchar3 v) {
+ return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(uchar3 v) {
+ return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(uchar3 v) {
+ return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(uchar3 v) {
+ return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(float3 v) {
+ return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(float3 v) {
+ return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(float3 v) {
+ return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(float3 v) {
+ return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(float3 v) {
+ return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(float3 v) {
+ return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(float3 v) {
+ return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(float3 v) {
+ return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(long4 v) {
+ return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(long4 v) {
+ return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(long4 v) {
+ return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(long4 v) {
+ return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(long4 v) {
+ return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(long4 v) {
+ return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(long4 v) {
+ return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(long4 v) {
+ return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(ulong4 v) {
+ return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(ulong4 v) {
+ return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(ulong4 v) {
+ return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(ulong4 v) {
+ return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(ulong4 v) {
+ return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(ulong4 v) {
+ return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(ulong4 v) {
+ return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(ulong4 v) {
+ return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(int4 v) {
+ return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(int4 v) {
+ return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(int4 v) {
+ return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(int4 v) {
+ return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(int4 v) {
+ return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(int4 v) {
+ return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(int4 v) {
+ return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(int4 v) {
+ return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(uint4 v) {
+ return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(uint4 v) {
+ return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(uint4 v) {
+ return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(uint4 v) {
+ return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(uint4 v) {
+ return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(uint4 v) {
+ return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(uint4 v) {
+ return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(uint4 v) {
+ return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(short4 v) {
+ return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(short4 v) {
+ return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(short4 v) {
+ return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(short4 v) {
+ return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(short4 v) {
+ return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(short4 v) {
+ return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(short4 v) {
+ return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(short4 v) {
+ return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(ushort4 v) {
+ return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(ushort4 v) {
+ return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(ushort4 v) {
+ return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(ushort4 v) {
+ return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(ushort4 v) {
+ return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(ushort4 v) {
+ return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(ushort4 v) {
+ return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(ushort4 v) {
+ return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(char4 v) {
+ return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(char4 v) {
+ return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(char4 v) {
+ return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(char4 v) {
+ return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(char4 v) {
+ return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(char4 v) {
+ return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(char4 v) {
+ return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(char4 v) {
+ return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(uchar4 v) {
+ return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(uchar4 v) {
+ return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(uchar4 v) {
+ return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(uchar4 v) {
+ return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(uchar4 v) {
+ return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(uchar4 v) {
+ return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(uchar4 v) {
+ return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(uchar4 v) {
+ return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(float4 v) {
+ return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(float4 v) {
+ return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(float4 v) {
+ return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(float4 v) {
+ return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(float4 v) {
+ return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(float4 v) {
+ return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(float4 v) {
+ return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(float4 v) {
+ return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(long8 v) {
+ return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(long8 v) {
+ return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(long8 v) {
+ return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(long8 v) {
+ return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(long8 v) {
+ return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(long8 v) {
+ return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(long8 v) {
+ return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(long8 v) {
+ return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(ulong8 v) {
+ return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(ulong8 v) {
+ return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(ulong8 v) {
+ return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(ulong8 v) {
+ return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(ulong8 v) {
+ return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(ulong8 v) {
+ return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(ulong8 v) {
+ return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(ulong8 v) {
+ return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(int8 v) {
+ return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(int8 v) {
+ return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(int8 v) {
+ return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(int8 v) {
+ return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(int8 v) {
+ return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(int8 v) {
+ return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(int8 v) {
+ return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(int8 v) {
+ return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(uint8 v) {
+ return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(uint8 v) {
+ return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(uint8 v) {
+ return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(uint8 v) {
+ return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(uint8 v) {
+ return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(uint8 v) {
+ return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(uint8 v) {
+ return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(uint8 v) {
+ return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(short8 v) {
+ return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(short8 v) {
+ return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(short8 v) {
+ return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(short8 v) {
+ return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(short8 v) {
+ return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(short8 v) {
+ return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(short8 v) {
+ return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(short8 v) {
+ return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(ushort8 v) {
+ return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(ushort8 v) {
+ return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(ushort8 v) {
+ return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(ushort8 v) {
+ return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(ushort8 v) {
+ return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(ushort8 v) {
+ return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(ushort8 v) {
+ return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(ushort8 v) {
+ return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(char8 v) {
+ return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(char8 v) {
+ return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(char8 v) {
+ return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(char8 v) {
+ return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(char8 v) {
+ return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(char8 v) {
+ return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(char8 v) {
+ return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(char8 v) {
+ return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(uchar8 v) {
+ return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(uchar8 v) {
+ return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(uchar8 v) {
+ return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(uchar8 v) {
+ return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(uchar8 v) {
+ return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(uchar8 v) {
+ return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(uchar8 v) {
+ return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(uchar8 v) {
+ return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(float8 v) {
+ return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(float8 v) {
+ return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(float8 v) {
+ return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(float8 v) {
+ return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(float8 v) {
+ return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(float8 v) {
+ return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(float8 v) {
+ return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(float8 v) {
+ return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(long16 v) {
+ return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(long16 v) {
+ return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(long16 v) {
+ return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(long16 v) {
+ return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(long16 v) {
+ return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(long16 v) {
+ return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(long16 v) {
+ return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(long16 v) {
+ return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(ulong16 v) {
+ return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(ulong16 v) {
+ return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(ulong16 v) {
+ return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(ulong16 v) {
+ return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(ulong16 v) {
+ return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(ulong16 v) {
+ return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(ulong16 v) {
+ return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(ulong16 v) {
+ return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(int16 v) {
+ return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(int16 v) {
+ return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(int16 v) {
+ return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(int16 v) {
+ return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(int16 v) {
+ return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(int16 v) {
+ return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(int16 v) {
+ return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(int16 v) {
+ return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(uint16 v) {
+ return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(uint16 v) {
+ return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(uint16 v) {
+ return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(uint16 v) {
+ return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(uint16 v) {
+ return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(uint16 v) {
+ return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(uint16 v) {
+ return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(uint16 v) {
+ return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(short16 v) {
+ return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(short16 v) {
+ return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(short16 v) {
+ return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(short16 v) {
+ return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(short16 v) {
+ return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(short16 v) {
+ return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(short16 v) {
+ return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(short16 v) {
+ return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(ushort16 v) {
+ return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(ushort16 v) {
+ return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(ushort16 v) {
+ return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(ushort16 v) {
+ return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(ushort16 v) {
+ return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(ushort16 v) {
+ return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(ushort16 v) {
+ return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(ushort16 v) {
+ return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(char16 v) {
+ return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(char16 v) {
+ return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(char16 v) {
+ return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(char16 v) {
+ return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(char16 v) {
+ return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(char16 v) {
+ return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(char16 v) {
+ return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(char16 v) {
+ return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(uchar16 v) {
+ return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(uchar16 v) {
+ return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(uchar16 v) {
+ return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(uchar16 v) {
+ return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(uchar16 v) {
+ return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(uchar16 v) {
+ return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(uchar16 v) {
+ return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(uchar16 v) {
+ return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(float16 v) {
+ return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(float16 v) {
+ return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(float16 v) {
+ return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(float16 v) {
+ return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(float16 v) {
+ return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(float16 v) {
+ return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(float16 v) {
+ return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(float16 v) {
+ return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+
+float __gen_ocl_rndz(float x);
+float __gen_ocl_rnde(float x);
+float __gen_ocl_rndu(float x);
+float __gen_ocl_rndd(float x);
+INLINE_OVERLOADABLE float __convert_float_rtz(long x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ long l = u.f;
+ if((l > x && x > 0) || x >= 0x7fffffc000000000 ||
+ (l < x && x < 0)) {
+ u.u -= 1;
+ }
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(long x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ long l = u.f; //can not use u.f < x
+ if(l < x && x < 0x7fffffc000000000) {
+ if(x > 0)
+ u.u = u.u + 1;
+ else
+ u.u = u.u - 1;
+ }
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(long x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ long l = u.f; //avoid overflow
+ if(l > x || x >= 0x7fffffc000000000) {
+ if(x > 0)
+ u.u = u.u - 1;
+ else
+ u.u = u.u + 1;
+ }
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(ulong x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ ulong l = u.f;
+ if(l > x || x >= 0xffffff8000000000)
+ u.u -= 1;
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(ulong x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ ulong l = u.f; //can not use u.f < x
+ if(l < x && x < 0xffffff8000000000)
+ u.u = u.u + 1;
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(ulong x)
+{
+ return __convert_float_rtz(x);
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(int x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ long i = u.f;
+ if((i > x && x > 0) ||
+ (i < x && x < 0)) {
+ u.u -= 1;
+ }
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(int x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ int i = u.f;
+ if(i < x) {
+ if(x > 0)
+ u.u += 1;
+ else
+ u.u -= 1;
+ }
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(int x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ long i = u.f; //avoid overflow
+ if(i > x) {
+ if(x > 0)
+ u.u = u.u - 1;
+ else
+ u.u = u.u + 1;
+ }
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(uint x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ ulong i = u.f;
+ if(i > x)
+ u.u -= 1;
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(uint x)
+{
+ union {
+ uint u;
+ float f;
+ } u;
+ u.f = x;
+ uint i = u.f;
+ if(i < x)
+ u.u += 1;
+ return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(uint x)
+{
+ return __convert_float_rtz(x);
+}
+
+INLINE_OVERLOADABLE long convert_long_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(long x)
+{ return __convert_float_rtz(x); }
+INLINE_OVERLOADABLE float convert_float_rtp(long x)
+{ return __convert_float_rtp(x); }
+INLINE_OVERLOADABLE float convert_float_rtn(long x)
+{ return __convert_float_rtn(x); }
+INLINE_OVERLOADABLE long convert_long_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(ulong x)
+{ return __convert_float_rtz(x); }
+INLINE_OVERLOADABLE float convert_float_rtp(ulong x)
+{ return __convert_float_rtp(x); }
+INLINE_OVERLOADABLE float convert_float_rtn(ulong x)
+{ return __convert_float_rtn(x); }
+INLINE_OVERLOADABLE long convert_long_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(int x)
+{ return __convert_float_rtz(x); }
+INLINE_OVERLOADABLE float convert_float_rtp(int x)
+{ return __convert_float_rtp(x); }
+INLINE_OVERLOADABLE float convert_float_rtn(int x)
+{ return __convert_float_rtn(x); }
+INLINE_OVERLOADABLE long convert_long_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(uint x)
+{ return __convert_float_rtz(x); }
+INLINE_OVERLOADABLE float convert_float_rtp(uint x)
+{ return __convert_float_rtp(x); }
+INLINE_OVERLOADABLE float convert_float_rtn(uint x)
+{ return __convert_float_rtn(x); }
+INLINE_OVERLOADABLE long convert_long_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE long convert_long_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE long convert_long_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE long convert_long_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE int convert_int_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE int convert_int_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE int convert_int_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE int convert_int_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE uint convert_uint_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE uint convert_uint_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE uint convert_uint_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE uint convert_uint_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE short convert_short_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE short convert_short_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE short convert_short_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE short convert_short_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE char convert_char_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE char convert_char_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE char convert_char_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE char convert_char_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE float convert_float_rte(float x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(float x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(float x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(float x)
+{ return x; }
+INLINE OVERLOADABLE long2 convert_long2_rte(long2 v) {
+ return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(long2 v) {
+ return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(long2 v) {
+ return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(long2 v) {
+ return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(long2 v) {
+ return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(long2 v) {
+ return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(long2 v) {
+ return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(long2 v) {
+ return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(long2 v) {
+ return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(long2 v) {
+ return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(long2 v) {
+ return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(long2 v) {
+ return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(long2 v) {
+ return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(long2 v) {
+ return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(long2 v) {
+ return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(long2 v) {
+ return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(long2 v) {
+ return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(long2 v) {
+ return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(long2 v) {
+ return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(long2 v) {
+ return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(long2 v) {
+ return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(long2 v) {
+ return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(long2 v) {
+ return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(long2 v) {
+ return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(long2 v) {
+ return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(long2 v) {
+ return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(long2 v) {
+ return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(long2 v) {
+ return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(long2 v) {
+ return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(long2 v) {
+ return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(long2 v) {
+ return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(long2 v) {
+ return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(long2 v) {
+ return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(long2 v) {
+ return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(long2 v) {
+ return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(long2 v) {
+ return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(ulong2 v) {
+ return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(ulong2 v) {
+ return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(ulong2 v) {
+ return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(ulong2 v) {
+ return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(ulong2 v) {
+ return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(ulong2 v) {
+ return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(ulong2 v) {
+ return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(ulong2 v) {
+ return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(ulong2 v) {
+ return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(ulong2 v) {
+ return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(ulong2 v) {
+ return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(ulong2 v) {
+ return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(ulong2 v) {
+ return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(ulong2 v) {
+ return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(ulong2 v) {
+ return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(ulong2 v) {
+ return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(ulong2 v) {
+ return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(ulong2 v) {
+ return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(ulong2 v) {
+ return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(ulong2 v) {
+ return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(ulong2 v) {
+ return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(ulong2 v) {
+ return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(ulong2 v) {
+ return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(ulong2 v) {
+ return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(ulong2 v) {
+ return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(ulong2 v) {
+ return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(ulong2 v) {
+ return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(ulong2 v) {
+ return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(ulong2 v) {
+ return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(ulong2 v) {
+ return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(ulong2 v) {
+ return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(ulong2 v) {
+ return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(ulong2 v) {
+ return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(ulong2 v) {
+ return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(ulong2 v) {
+ return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(ulong2 v) {
+ return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(int2 v) {
+ return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(int2 v) {
+ return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(int2 v) {
+ return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(int2 v) {
+ return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(int2 v) {
+ return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(int2 v) {
+ return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(int2 v) {
+ return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(int2 v) {
+ return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(int2 v) {
+ return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(int2 v) {
+ return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(int2 v) {
+ return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(int2 v) {
+ return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(int2 v) {
+ return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(int2 v) {
+ return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(int2 v) {
+ return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(int2 v) {
+ return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(int2 v) {
+ return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(int2 v) {
+ return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(int2 v) {
+ return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(int2 v) {
+ return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(int2 v) {
+ return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(int2 v) {
+ return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(int2 v) {
+ return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(int2 v) {
+ return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(int2 v) {
+ return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(int2 v) {
+ return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(int2 v) {
+ return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(int2 v) {
+ return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(int2 v) {
+ return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(int2 v) {
+ return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(int2 v) {
+ return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(int2 v) {
+ return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(int2 v) {
+ return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(int2 v) {
+ return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(int2 v) {
+ return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(int2 v) {
+ return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(uint2 v) {
+ return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(uint2 v) {
+ return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(uint2 v) {
+ return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(uint2 v) {
+ return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(uint2 v) {
+ return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(uint2 v) {
+ return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(uint2 v) {
+ return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(uint2 v) {
+ return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(uint2 v) {
+ return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(uint2 v) {
+ return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(uint2 v) {
+ return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(uint2 v) {
+ return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(uint2 v) {
+ return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(uint2 v) {
+ return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(uint2 v) {
+ return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(uint2 v) {
+ return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(uint2 v) {
+ return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(uint2 v) {
+ return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(uint2 v) {
+ return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(uint2 v) {
+ return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(uint2 v) {
+ return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(uint2 v) {
+ return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(uint2 v) {
+ return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(uint2 v) {
+ return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(uint2 v) {
+ return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(uint2 v) {
+ return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(uint2 v) {
+ return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(uint2 v) {
+ return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(uint2 v) {
+ return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(uint2 v) {
+ return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(uint2 v) {
+ return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(uint2 v) {
+ return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(uint2 v) {
+ return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(uint2 v) {
+ return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(uint2 v) {
+ return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(uint2 v) {
+ return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(short2 v) {
+ return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(short2 v) {
+ return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(short2 v) {
+ return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(short2 v) {
+ return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(short2 v) {
+ return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(short2 v) {
+ return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(short2 v) {
+ return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(short2 v) {
+ return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(short2 v) {
+ return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(short2 v) {
+ return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(short2 v) {
+ return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(short2 v) {
+ return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(short2 v) {
+ return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(short2 v) {
+ return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(short2 v) {
+ return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(short2 v) {
+ return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(short2 v) {
+ return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(short2 v) {
+ return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(short2 v) {
+ return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(short2 v) {
+ return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(short2 v) {
+ return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(short2 v) {
+ return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(short2 v) {
+ return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(short2 v) {
+ return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(short2 v) {
+ return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(short2 v) {
+ return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(short2 v) {
+ return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(short2 v) {
+ return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(short2 v) {
+ return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(short2 v) {
+ return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(short2 v) {
+ return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(short2 v) {
+ return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(short2 v) {
+ return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(short2 v) {
+ return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(short2 v) {
+ return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(short2 v) {
+ return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(ushort2 v) {
+ return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(ushort2 v) {
+ return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(ushort2 v) {
+ return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(ushort2 v) {
+ return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(ushort2 v) {
+ return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(ushort2 v) {
+ return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(ushort2 v) {
+ return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(ushort2 v) {
+ return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(ushort2 v) {
+ return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(ushort2 v) {
+ return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(ushort2 v) {
+ return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(ushort2 v) {
+ return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(ushort2 v) {
+ return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(ushort2 v) {
+ return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(ushort2 v) {
+ return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(ushort2 v) {
+ return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(ushort2 v) {
+ return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(ushort2 v) {
+ return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(ushort2 v) {
+ return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(ushort2 v) {
+ return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(ushort2 v) {
+ return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(ushort2 v) {
+ return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(ushort2 v) {
+ return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(ushort2 v) {
+ return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(ushort2 v) {
+ return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(ushort2 v) {
+ return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(ushort2 v) {
+ return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(ushort2 v) {
+ return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(ushort2 v) {
+ return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(ushort2 v) {
+ return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(ushort2 v) {
+ return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(ushort2 v) {
+ return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(ushort2 v) {
+ return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(ushort2 v) {
+ return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(ushort2 v) {
+ return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(ushort2 v) {
+ return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(char2 v) {
+ return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(char2 v) {
+ return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(char2 v) {
+ return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(char2 v) {
+ return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(char2 v) {
+ return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(char2 v) {
+ return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(char2 v) {
+ return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(char2 v) {
+ return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(char2 v) {
+ return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(char2 v) {
+ return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(char2 v) {
+ return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(char2 v) {
+ return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(char2 v) {
+ return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(char2 v) {
+ return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(char2 v) {
+ return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(char2 v) {
+ return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(char2 v) {
+ return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(char2 v) {
+ return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(char2 v) {
+ return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(char2 v) {
+ return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(char2 v) {
+ return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(char2 v) {
+ return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(char2 v) {
+ return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(char2 v) {
+ return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(char2 v) {
+ return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(char2 v) {
+ return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(char2 v) {
+ return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(char2 v) {
+ return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(char2 v) {
+ return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(char2 v) {
+ return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(char2 v) {
+ return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(char2 v) {
+ return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(char2 v) {
+ return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(char2 v) {
+ return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(char2 v) {
+ return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(char2 v) {
+ return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(uchar2 v) {
+ return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(uchar2 v) {
+ return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(uchar2 v) {
+ return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(uchar2 v) {
+ return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(uchar2 v) {
+ return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(uchar2 v) {
+ return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(uchar2 v) {
+ return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(uchar2 v) {
+ return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(uchar2 v) {
+ return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(uchar2 v) {
+ return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(uchar2 v) {
+ return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(uchar2 v) {
+ return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(uchar2 v) {
+ return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(uchar2 v) {
+ return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(uchar2 v) {
+ return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(uchar2 v) {
+ return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(uchar2 v) {
+ return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(uchar2 v) {
+ return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(uchar2 v) {
+ return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(uchar2 v) {
+ return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(uchar2 v) {
+ return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(uchar2 v) {
+ return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(uchar2 v) {
+ return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(uchar2 v) {
+ return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(uchar2 v) {
+ return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(uchar2 v) {
+ return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(uchar2 v) {
+ return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(uchar2 v) {
+ return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(uchar2 v) {
+ return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(uchar2 v) {
+ return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(uchar2 v) {
+ return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(uchar2 v) {
+ return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(uchar2 v) {
+ return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(uchar2 v) {
+ return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(uchar2 v) {
+ return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(uchar2 v) {
+ return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(float2 v) {
+ return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(float2 v) {
+ return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(float2 v) {
+ return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(float2 v) {
+ return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(float2 v) {
+ return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(float2 v) {
+ return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(float2 v) {
+ return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(float2 v) {
+ return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(float2 v) {
+ return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(float2 v) {
+ return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(float2 v) {
+ return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(float2 v) {
+ return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(float2 v) {
+ return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(float2 v) {
+ return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(float2 v) {
+ return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(float2 v) {
+ return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(float2 v) {
+ return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(float2 v) {
+ return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(float2 v) {
+ return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(float2 v) {
+ return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(float2 v) {
+ return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(float2 v) {
+ return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(float2 v) {
+ return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(float2 v) {
+ return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(float2 v) {
+ return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(float2 v) {
+ return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(float2 v) {
+ return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(float2 v) {
+ return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(float2 v) {
+ return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(float2 v) {
+ return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(float2 v) {
+ return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(float2 v) {
+ return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(float2 v) {
+ return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(float2 v) {
+ return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(float2 v) {
+ return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(float2 v) {
+ return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(long3 v) {
+ return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(long3 v) {
+ return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(long3 v) {
+ return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(long3 v) {
+ return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(long3 v) {
+ return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(long3 v) {
+ return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(long3 v) {
+ return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(long3 v) {
+ return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(long3 v) {
+ return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(long3 v) {
+ return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(long3 v) {
+ return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(long3 v) {
+ return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(long3 v) {
+ return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(long3 v) {
+ return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(long3 v) {
+ return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(long3 v) {
+ return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(long3 v) {
+ return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(long3 v) {
+ return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(long3 v) {
+ return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(long3 v) {
+ return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(long3 v) {
+ return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(long3 v) {
+ return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(long3 v) {
+ return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(long3 v) {
+ return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(long3 v) {
+ return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(long3 v) {
+ return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(long3 v) {
+ return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(long3 v) {
+ return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(long3 v) {
+ return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(long3 v) {
+ return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(long3 v) {
+ return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(long3 v) {
+ return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(long3 v) {
+ return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(long3 v) {
+ return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(long3 v) {
+ return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(long3 v) {
+ return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(ulong3 v) {
+ return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(ulong3 v) {
+ return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(ulong3 v) {
+ return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(ulong3 v) {
+ return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(ulong3 v) {
+ return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(ulong3 v) {
+ return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(ulong3 v) {
+ return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(ulong3 v) {
+ return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(ulong3 v) {
+ return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(ulong3 v) {
+ return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(ulong3 v) {
+ return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(ulong3 v) {
+ return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(ulong3 v) {
+ return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(ulong3 v) {
+ return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(ulong3 v) {
+ return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(ulong3 v) {
+ return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(ulong3 v) {
+ return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(ulong3 v) {
+ return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(ulong3 v) {
+ return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(ulong3 v) {
+ return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(ulong3 v) {
+ return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(ulong3 v) {
+ return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(ulong3 v) {
+ return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(ulong3 v) {
+ return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(ulong3 v) {
+ return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(ulong3 v) {
+ return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(ulong3 v) {
+ return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(ulong3 v) {
+ return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(ulong3 v) {
+ return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(ulong3 v) {
+ return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(ulong3 v) {
+ return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(ulong3 v) {
+ return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(ulong3 v) {
+ return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(ulong3 v) {
+ return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(ulong3 v) {
+ return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(ulong3 v) {
+ return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(int3 v) {
+ return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(int3 v) {
+ return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(int3 v) {
+ return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(int3 v) {
+ return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(int3 v) {
+ return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(int3 v) {
+ return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(int3 v) {
+ return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(int3 v) {
+ return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(int3 v) {
+ return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(int3 v) {
+ return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(int3 v) {
+ return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(int3 v) {
+ return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(int3 v) {
+ return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(int3 v) {
+ return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(int3 v) {
+ return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(int3 v) {
+ return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(int3 v) {
+ return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(int3 v) {
+ return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(int3 v) {
+ return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(int3 v) {
+ return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(int3 v) {
+ return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(int3 v) {
+ return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(int3 v) {
+ return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(int3 v) {
+ return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(int3 v) {
+ return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(int3 v) {
+ return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(int3 v) {
+ return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(int3 v) {
+ return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(int3 v) {
+ return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(int3 v) {
+ return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(int3 v) {
+ return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(int3 v) {
+ return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(int3 v) {
+ return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(int3 v) {
+ return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(int3 v) {
+ return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(int3 v) {
+ return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(uint3 v) {
+ return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(uint3 v) {
+ return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(uint3 v) {
+ return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(uint3 v) {
+ return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(uint3 v) {
+ return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(uint3 v) {
+ return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(uint3 v) {
+ return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(uint3 v) {
+ return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(uint3 v) {
+ return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(uint3 v) {
+ return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(uint3 v) {
+ return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(uint3 v) {
+ return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(uint3 v) {
+ return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(uint3 v) {
+ return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(uint3 v) {
+ return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(uint3 v) {
+ return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(uint3 v) {
+ return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(uint3 v) {
+ return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(uint3 v) {
+ return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(uint3 v) {
+ return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(uint3 v) {
+ return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(uint3 v) {
+ return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(uint3 v) {
+ return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(uint3 v) {
+ return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(uint3 v) {
+ return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(uint3 v) {
+ return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(uint3 v) {
+ return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(uint3 v) {
+ return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(uint3 v) {
+ return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(uint3 v) {
+ return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(uint3 v) {
+ return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(uint3 v) {
+ return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(uint3 v) {
+ return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(uint3 v) {
+ return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(uint3 v) {
+ return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(uint3 v) {
+ return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(short3 v) {
+ return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(short3 v) {
+ return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(short3 v) {
+ return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(short3 v) {
+ return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(short3 v) {
+ return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(short3 v) {
+ return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(short3 v) {
+ return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(short3 v) {
+ return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(short3 v) {
+ return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(short3 v) {
+ return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(short3 v) {
+ return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(short3 v) {
+ return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(short3 v) {
+ return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(short3 v) {
+ return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(short3 v) {
+ return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(short3 v) {
+ return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(short3 v) {
+ return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(short3 v) {
+ return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(short3 v) {
+ return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(short3 v) {
+ return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(short3 v) {
+ return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(short3 v) {
+ return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(short3 v) {
+ return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(short3 v) {
+ return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(short3 v) {
+ return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(short3 v) {
+ return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(short3 v) {
+ return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(short3 v) {
+ return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(short3 v) {
+ return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(short3 v) {
+ return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(short3 v) {
+ return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(short3 v) {
+ return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(short3 v) {
+ return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(short3 v) {
+ return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(short3 v) {
+ return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(short3 v) {
+ return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(ushort3 v) {
+ return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(ushort3 v) {
+ return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(ushort3 v) {
+ return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(ushort3 v) {
+ return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(ushort3 v) {
+ return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(ushort3 v) {
+ return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(ushort3 v) {
+ return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(ushort3 v) {
+ return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(ushort3 v) {
+ return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(ushort3 v) {
+ return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(ushort3 v) {
+ return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(ushort3 v) {
+ return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(ushort3 v) {
+ return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(ushort3 v) {
+ return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(ushort3 v) {
+ return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(ushort3 v) {
+ return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(ushort3 v) {
+ return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(ushort3 v) {
+ return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(ushort3 v) {
+ return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(ushort3 v) {
+ return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(ushort3 v) {
+ return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(ushort3 v) {
+ return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(ushort3 v) {
+ return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(ushort3 v) {
+ return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(ushort3 v) {
+ return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(ushort3 v) {
+ return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(ushort3 v) {
+ return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(ushort3 v) {
+ return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(ushort3 v) {
+ return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(ushort3 v) {
+ return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(ushort3 v) {
+ return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(ushort3 v) {
+ return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(ushort3 v) {
+ return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(ushort3 v) {
+ return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(ushort3 v) {
+ return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(ushort3 v) {
+ return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(char3 v) {
+ return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(char3 v) {
+ return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(char3 v) {
+ return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(char3 v) {
+ return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(char3 v) {
+ return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(char3 v) {
+ return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(char3 v) {
+ return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(char3 v) {
+ return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(char3 v) {
+ return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(char3 v) {
+ return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(char3 v) {
+ return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(char3 v) {
+ return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(char3 v) {
+ return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(char3 v) {
+ return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(char3 v) {
+ return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(char3 v) {
+ return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(char3 v) {
+ return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(char3 v) {
+ return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(char3 v) {
+ return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(char3 v) {
+ return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(char3 v) {
+ return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(char3 v) {
+ return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(char3 v) {
+ return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(char3 v) {
+ return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(char3 v) {
+ return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(char3 v) {
+ return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(char3 v) {
+ return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(char3 v) {
+ return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(char3 v) {
+ return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(char3 v) {
+ return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(char3 v) {
+ return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(char3 v) {
+ return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(char3 v) {
+ return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(char3 v) {
+ return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(char3 v) {
+ return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(char3 v) {
+ return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(uchar3 v) {
+ return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(uchar3 v) {
+ return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(uchar3 v) {
+ return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(uchar3 v) {
+ return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(uchar3 v) {
+ return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(uchar3 v) {
+ return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(uchar3 v) {
+ return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(uchar3 v) {
+ return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(uchar3 v) {
+ return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(uchar3 v) {
+ return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(uchar3 v) {
+ return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(uchar3 v) {
+ return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(uchar3 v) {
+ return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(uchar3 v) {
+ return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(uchar3 v) {
+ return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(uchar3 v) {
+ return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(uchar3 v) {
+ return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(uchar3 v) {
+ return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(uchar3 v) {
+ return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(uchar3 v) {
+ return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(uchar3 v) {
+ return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(uchar3 v) {
+ return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(uchar3 v) {
+ return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(uchar3 v) {
+ return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(uchar3 v) {
+ return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(uchar3 v) {
+ return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(uchar3 v) {
+ return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(uchar3 v) {
+ return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(uchar3 v) {
+ return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(uchar3 v) {
+ return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(uchar3 v) {
+ return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(uchar3 v) {
+ return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(uchar3 v) {
+ return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(uchar3 v) {
+ return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(uchar3 v) {
+ return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(uchar3 v) {
+ return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(float3 v) {
+ return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(float3 v) {
+ return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(float3 v) {
+ return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(float3 v) {
+ return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(float3 v) {
+ return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(float3 v) {
+ return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(float3 v) {
+ return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(float3 v) {
+ return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(float3 v) {
+ return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(float3 v) {
+ return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(float3 v) {
+ return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(float3 v) {
+ return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(float3 v) {
+ return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(float3 v) {
+ return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(float3 v) {
+ return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(float3 v) {
+ return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(float3 v) {
+ return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(float3 v) {
+ return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(float3 v) {
+ return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(float3 v) {
+ return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(float3 v) {
+ return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(float3 v) {
+ return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(float3 v) {
+ return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(float3 v) {
+ return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(float3 v) {
+ return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(float3 v) {
+ return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(float3 v) {
+ return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(float3 v) {
+ return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(float3 v) {
+ return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(float3 v) {
+ return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(float3 v) {
+ return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(float3 v) {
+ return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(float3 v) {
+ return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(float3 v) {
+ return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(float3 v) {
+ return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(float3 v) {
+ return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(long4 v) {
+ return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(long4 v) {
+ return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(long4 v) {
+ return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(long4 v) {
+ return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(long4 v) {
+ return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(long4 v) {
+ return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(long4 v) {
+ return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(long4 v) {
+ return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(long4 v) {
+ return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(long4 v) {
+ return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(long4 v) {
+ return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(long4 v) {
+ return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(long4 v) {
+ return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(long4 v) {
+ return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(long4 v) {
+ return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(long4 v) {
+ return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(long4 v) {
+ return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(long4 v) {
+ return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(long4 v) {
+ return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(long4 v) {
+ return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(long4 v) {
+ return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(long4 v) {
+ return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(long4 v) {
+ return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(long4 v) {
+ return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(long4 v) {
+ return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(long4 v) {
+ return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(long4 v) {
+ return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(long4 v) {
+ return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(long4 v) {
+ return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(long4 v) {
+ return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(long4 v) {
+ return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(long4 v) {
+ return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(long4 v) {
+ return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(long4 v) {
+ return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(long4 v) {
+ return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(long4 v) {
+ return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(ulong4 v) {
+ return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(ulong4 v) {
+ return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(ulong4 v) {
+ return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(ulong4 v) {
+ return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(ulong4 v) {
+ return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(ulong4 v) {
+ return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(ulong4 v) {
+ return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(ulong4 v) {
+ return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(ulong4 v) {
+ return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(ulong4 v) {
+ return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(ulong4 v) {
+ return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(ulong4 v) {
+ return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(ulong4 v) {
+ return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(ulong4 v) {
+ return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(ulong4 v) {
+ return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(ulong4 v) {
+ return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(ulong4 v) {
+ return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(ulong4 v) {
+ return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(ulong4 v) {
+ return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(ulong4 v) {
+ return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(ulong4 v) {
+ return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(ulong4 v) {
+ return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(ulong4 v) {
+ return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(ulong4 v) {
+ return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(ulong4 v) {
+ return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(ulong4 v) {
+ return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(ulong4 v) {
+ return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(ulong4 v) {
+ return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(ulong4 v) {
+ return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(ulong4 v) {
+ return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(ulong4 v) {
+ return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(ulong4 v) {
+ return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(ulong4 v) {
+ return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(ulong4 v) {
+ return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(ulong4 v) {
+ return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(ulong4 v) {
+ return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(int4 v) {
+ return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(int4 v) {
+ return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(int4 v) {
+ return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(int4 v) {
+ return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(int4 v) {
+ return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(int4 v) {
+ return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(int4 v) {
+ return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(int4 v) {
+ return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(int4 v) {
+ return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(int4 v) {
+ return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(int4 v) {
+ return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(int4 v) {
+ return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(int4 v) {
+ return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(int4 v) {
+ return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(int4 v) {
+ return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(int4 v) {
+ return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(int4 v) {
+ return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(int4 v) {
+ return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(int4 v) {
+ return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(int4 v) {
+ return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(int4 v) {
+ return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(int4 v) {
+ return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(int4 v) {
+ return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(int4 v) {
+ return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(int4 v) {
+ return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(int4 v) {
+ return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(int4 v) {
+ return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(int4 v) {
+ return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(int4 v) {
+ return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(int4 v) {
+ return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(int4 v) {
+ return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(int4 v) {
+ return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(int4 v) {
+ return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(int4 v) {
+ return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(int4 v) {
+ return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(int4 v) {
+ return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(uint4 v) {
+ return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(uint4 v) {
+ return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(uint4 v) {
+ return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(uint4 v) {
+ return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(uint4 v) {
+ return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(uint4 v) {
+ return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(uint4 v) {
+ return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(uint4 v) {
+ return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(uint4 v) {
+ return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(uint4 v) {
+ return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(uint4 v) {
+ return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(uint4 v) {
+ return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(uint4 v) {
+ return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(uint4 v) {
+ return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(uint4 v) {
+ return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(uint4 v) {
+ return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(uint4 v) {
+ return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(uint4 v) {
+ return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(uint4 v) {
+ return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(uint4 v) {
+ return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(uint4 v) {
+ return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(uint4 v) {
+ return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(uint4 v) {
+ return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(uint4 v) {
+ return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(uint4 v) {
+ return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(uint4 v) {
+ return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(uint4 v) {
+ return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(uint4 v) {
+ return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(uint4 v) {
+ return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(uint4 v) {
+ return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(uint4 v) {
+ return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(uint4 v) {
+ return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(uint4 v) {
+ return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(uint4 v) {
+ return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(uint4 v) {
+ return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(uint4 v) {
+ return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(short4 v) {
+ return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(short4 v) {
+ return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(short4 v) {
+ return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(short4 v) {
+ return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(short4 v) {
+ return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(short4 v) {
+ return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(short4 v) {
+ return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(short4 v) {
+ return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(short4 v) {
+ return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(short4 v) {
+ return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(short4 v) {
+ return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(short4 v) {
+ return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(short4 v) {
+ return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(short4 v) {
+ return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(short4 v) {
+ return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(short4 v) {
+ return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(short4 v) {
+ return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(short4 v) {
+ return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(short4 v) {
+ return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(short4 v) {
+ return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(short4 v) {
+ return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(short4 v) {
+ return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(short4 v) {
+ return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(short4 v) {
+ return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(short4 v) {
+ return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(short4 v) {
+ return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(short4 v) {
+ return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(short4 v) {
+ return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(short4 v) {
+ return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(short4 v) {
+ return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(short4 v) {
+ return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(short4 v) {
+ return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(short4 v) {
+ return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(short4 v) {
+ return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(short4 v) {
+ return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(short4 v) {
+ return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(ushort4 v) {
+ return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(ushort4 v) {
+ return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(ushort4 v) {
+ return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(ushort4 v) {
+ return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(ushort4 v) {
+ return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(ushort4 v) {
+ return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(ushort4 v) {
+ return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(ushort4 v) {
+ return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(ushort4 v) {
+ return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(ushort4 v) {
+ return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(ushort4 v) {
+ return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(ushort4 v) {
+ return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(ushort4 v) {
+ return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(ushort4 v) {
+ return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(ushort4 v) {
+ return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(ushort4 v) {
+ return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(ushort4 v) {
+ return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(ushort4 v) {
+ return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(ushort4 v) {
+ return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(ushort4 v) {
+ return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(ushort4 v) {
+ return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(ushort4 v) {
+ return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(ushort4 v) {
+ return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(ushort4 v) {
+ return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(ushort4 v) {
+ return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(ushort4 v) {
+ return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(ushort4 v) {
+ return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(ushort4 v) {
+ return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(ushort4 v) {
+ return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(ushort4 v) {
+ return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(ushort4 v) {
+ return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(ushort4 v) {
+ return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(ushort4 v) {
+ return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(ushort4 v) {
+ return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(ushort4 v) {
+ return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(ushort4 v) {
+ return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(char4 v) {
+ return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(char4 v) {
+ return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(char4 v) {
+ return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(char4 v) {
+ return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(char4 v) {
+ return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(char4 v) {
+ return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(char4 v) {
+ return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(char4 v) {
+ return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(char4 v) {
+ return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(char4 v) {
+ return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(char4 v) {
+ return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(char4 v) {
+ return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(char4 v) {
+ return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(char4 v) {
+ return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(char4 v) {
+ return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(char4 v) {
+ return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(char4 v) {
+ return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(char4 v) {
+ return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(char4 v) {
+ return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(char4 v) {
+ return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(char4 v) {
+ return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(char4 v) {
+ return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(char4 v) {
+ return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(char4 v) {
+ return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(char4 v) {
+ return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(char4 v) {
+ return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(char4 v) {
+ return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(char4 v) {
+ return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(char4 v) {
+ return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(char4 v) {
+ return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(char4 v) {
+ return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(char4 v) {
+ return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(char4 v) {
+ return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(char4 v) {
+ return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(char4 v) {
+ return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(char4 v) {
+ return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(uchar4 v) {
+ return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(uchar4 v) {
+ return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(uchar4 v) {
+ return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(uchar4 v) {
+ return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(uchar4 v) {
+ return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(uchar4 v) {
+ return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(uchar4 v) {
+ return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(uchar4 v) {
+ return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(uchar4 v) {
+ return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(uchar4 v) {
+ return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(uchar4 v) {
+ return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(uchar4 v) {
+ return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(uchar4 v) {
+ return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(uchar4 v) {
+ return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(uchar4 v) {
+ return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(uchar4 v) {
+ return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(uchar4 v) {
+ return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(uchar4 v) {
+ return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(uchar4 v) {
+ return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(uchar4 v) {
+ return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(uchar4 v) {
+ return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(uchar4 v) {
+ return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(uchar4 v) {
+ return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(uchar4 v) {
+ return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(uchar4 v) {
+ return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(uchar4 v) {
+ return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(uchar4 v) {
+ return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(uchar4 v) {
+ return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(uchar4 v) {
+ return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(uchar4 v) {
+ return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(uchar4 v) {
+ return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(uchar4 v) {
+ return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(uchar4 v) {
+ return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(uchar4 v) {
+ return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(uchar4 v) {
+ return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(uchar4 v) {
+ return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(float4 v) {
+ return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(float4 v) {
+ return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(float4 v) {
+ return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(float4 v) {
+ return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(float4 v) {
+ return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(float4 v) {
+ return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(float4 v) {
+ return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(float4 v) {
+ return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(float4 v) {
+ return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(float4 v) {
+ return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(float4 v) {
+ return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(float4 v) {
+ return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(float4 v) {
+ return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(float4 v) {
+ return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(float4 v) {
+ return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(float4 v) {
+ return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(float4 v) {
+ return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(float4 v) {
+ return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(float4 v) {
+ return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(float4 v) {
+ return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(float4 v) {
+ return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(float4 v) {
+ return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(float4 v) {
+ return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(float4 v) {
+ return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(float4 v) {
+ return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(float4 v) {
+ return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(float4 v) {
+ return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(float4 v) {
+ return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(float4 v) {
+ return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(float4 v) {
+ return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(float4 v) {
+ return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(float4 v) {
+ return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(float4 v) {
+ return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(float4 v) {
+ return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(float4 v) {
+ return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(float4 v) {
+ return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(long8 v) {
+ return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(long8 v) {
+ return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(long8 v) {
+ return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(long8 v) {
+ return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(long8 v) {
+ return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(long8 v) {
+ return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(long8 v) {
+ return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(long8 v) {
+ return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(long8 v) {
+ return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(long8 v) {
+ return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(long8 v) {
+ return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(long8 v) {
+ return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(long8 v) {
+ return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(long8 v) {
+ return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(long8 v) {
+ return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(long8 v) {
+ return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(long8 v) {
+ return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(long8 v) {
+ return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(long8 v) {
+ return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(long8 v) {
+ return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(long8 v) {
+ return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(long8 v) {
+ return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(long8 v) {
+ return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(long8 v) {
+ return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(long8 v) {
+ return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(long8 v) {
+ return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(long8 v) {
+ return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(long8 v) {
+ return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(long8 v) {
+ return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(long8 v) {
+ return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(long8 v) {
+ return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(long8 v) {
+ return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(long8 v) {
+ return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(long8 v) {
+ return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(long8 v) {
+ return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(long8 v) {
+ return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(ulong8 v) {
+ return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(ulong8 v) {
+ return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(ulong8 v) {
+ return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(ulong8 v) {
+ return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(ulong8 v) {
+ return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(ulong8 v) {
+ return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(ulong8 v) {
+ return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(ulong8 v) {
+ return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(ulong8 v) {
+ return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(ulong8 v) {
+ return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(ulong8 v) {
+ return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(ulong8 v) {
+ return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(ulong8 v) {
+ return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(ulong8 v) {
+ return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(ulong8 v) {
+ return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(ulong8 v) {
+ return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(ulong8 v) {
+ return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(ulong8 v) {
+ return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(ulong8 v) {
+ return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(ulong8 v) {
+ return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(ulong8 v) {
+ return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(ulong8 v) {
+ return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(ulong8 v) {
+ return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(ulong8 v) {
+ return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(ulong8 v) {
+ return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(ulong8 v) {
+ return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(ulong8 v) {
+ return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(ulong8 v) {
+ return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(ulong8 v) {
+ return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(ulong8 v) {
+ return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(ulong8 v) {
+ return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(ulong8 v) {
+ return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(ulong8 v) {
+ return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(ulong8 v) {
+ return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(ulong8 v) {
+ return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(ulong8 v) {
+ return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(int8 v) {
+ return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(int8 v) {
+ return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(int8 v) {
+ return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(int8 v) {
+ return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(int8 v) {
+ return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(int8 v) {
+ return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(int8 v) {
+ return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(int8 v) {
+ return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(int8 v) {
+ return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(int8 v) {
+ return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(int8 v) {
+ return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(int8 v) {
+ return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(int8 v) {
+ return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(int8 v) {
+ return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(int8 v) {
+ return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(int8 v) {
+ return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(int8 v) {
+ return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(int8 v) {
+ return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(int8 v) {
+ return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(int8 v) {
+ return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(int8 v) {
+ return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(int8 v) {
+ return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(int8 v) {
+ return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(int8 v) {
+ return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(int8 v) {
+ return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(int8 v) {
+ return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(int8 v) {
+ return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(int8 v) {
+ return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(int8 v) {
+ return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(int8 v) {
+ return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(int8 v) {
+ return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(int8 v) {
+ return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(int8 v) {
+ return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(int8 v) {
+ return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(int8 v) {
+ return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(int8 v) {
+ return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(uint8 v) {
+ return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(uint8 v) {
+ return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(uint8 v) {
+ return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(uint8 v) {
+ return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(uint8 v) {
+ return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(uint8 v) {
+ return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(uint8 v) {
+ return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(uint8 v) {
+ return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(uint8 v) {
+ return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(uint8 v) {
+ return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(uint8 v) {
+ return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(uint8 v) {
+ return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(uint8 v) {
+ return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(uint8 v) {
+ return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(uint8 v) {
+ return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(uint8 v) {
+ return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(uint8 v) {
+ return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(uint8 v) {
+ return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(uint8 v) {
+ return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(uint8 v) {
+ return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(uint8 v) {
+ return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(uint8 v) {
+ return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(uint8 v) {
+ return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(uint8 v) {
+ return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(uint8 v) {
+ return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(uint8 v) {
+ return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(uint8 v) {
+ return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(uint8 v) {
+ return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(uint8 v) {
+ return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(uint8 v) {
+ return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(uint8 v) {
+ return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(uint8 v) {
+ return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(uint8 v) {
+ return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(uint8 v) {
+ return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(uint8 v) {
+ return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(uint8 v) {
+ return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(short8 v) {
+ return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(short8 v) {
+ return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(short8 v) {
+ return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(short8 v) {
+ return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(short8 v) {
+ return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(short8 v) {
+ return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(short8 v) {
+ return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(short8 v) {
+ return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(short8 v) {
+ return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(short8 v) {
+ return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(short8 v) {
+ return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(short8 v) {
+ return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(short8 v) {
+ return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(short8 v) {
+ return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(short8 v) {
+ return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(short8 v) {
+ return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(short8 v) {
+ return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(short8 v) {
+ return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(short8 v) {
+ return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(short8 v) {
+ return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(short8 v) {
+ return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(short8 v) {
+ return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(short8 v) {
+ return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(short8 v) {
+ return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(short8 v) {
+ return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(short8 v) {
+ return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(short8 v) {
+ return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(short8 v) {
+ return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(short8 v) {
+ return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(short8 v) {
+ return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(short8 v) {
+ return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(short8 v) {
+ return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(short8 v) {
+ return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(short8 v) {
+ return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(short8 v) {
+ return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(short8 v) {
+ return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(ushort8 v) {
+ return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(ushort8 v) {
+ return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(ushort8 v) {
+ return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(ushort8 v) {
+ return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(ushort8 v) {
+ return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(ushort8 v) {
+ return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(ushort8 v) {
+ return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(ushort8 v) {
+ return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(ushort8 v) {
+ return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(ushort8 v) {
+ return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(ushort8 v) {
+ return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(ushort8 v) {
+ return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(ushort8 v) {
+ return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(ushort8 v) {
+ return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(ushort8 v) {
+ return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(ushort8 v) {
+ return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(ushort8 v) {
+ return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(ushort8 v) {
+ return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(ushort8 v) {
+ return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(ushort8 v) {
+ return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(ushort8 v) {
+ return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(ushort8 v) {
+ return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(ushort8 v) {
+ return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(ushort8 v) {
+ return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(ushort8 v) {
+ return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(ushort8 v) {
+ return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(ushort8 v) {
+ return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(ushort8 v) {
+ return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(ushort8 v) {
+ return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(ushort8 v) {
+ return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(ushort8 v) {
+ return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(ushort8 v) {
+ return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(ushort8 v) {
+ return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(ushort8 v) {
+ return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(ushort8 v) {
+ return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(ushort8 v) {
+ return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(char8 v) {
+ return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(char8 v) {
+ return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(char8 v) {
+ return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(char8 v) {
+ return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(char8 v) {
+ return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(char8 v) {
+ return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(char8 v) {
+ return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(char8 v) {
+ return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(char8 v) {
+ return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(char8 v) {
+ return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(char8 v) {
+ return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(char8 v) {
+ return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(char8 v) {
+ return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(char8 v) {
+ return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(char8 v) {
+ return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(char8 v) {
+ return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(char8 v) {
+ return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(char8 v) {
+ return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(char8 v) {
+ return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(char8 v) {
+ return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(char8 v) {
+ return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(char8 v) {
+ return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(char8 v) {
+ return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(char8 v) {
+ return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(char8 v) {
+ return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(char8 v) {
+ return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(char8 v) {
+ return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(char8 v) {
+ return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(char8 v) {
+ return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(char8 v) {
+ return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(char8 v) {
+ return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(char8 v) {
+ return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(char8 v) {
+ return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(char8 v) {
+ return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(char8 v) {
+ return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(char8 v) {
+ return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(uchar8 v) {
+ return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(uchar8 v) {
+ return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(uchar8 v) {
+ return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(uchar8 v) {
+ return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(uchar8 v) {
+ return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(uchar8 v) {
+ return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(uchar8 v) {
+ return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(uchar8 v) {
+ return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(uchar8 v) {
+ return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(uchar8 v) {
+ return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(uchar8 v) {
+ return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(uchar8 v) {
+ return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(uchar8 v) {
+ return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(uchar8 v) {
+ return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(uchar8 v) {
+ return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(uchar8 v) {
+ return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(uchar8 v) {
+ return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(uchar8 v) {
+ return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(uchar8 v) {
+ return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(uchar8 v) {
+ return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(uchar8 v) {
+ return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(uchar8 v) {
+ return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(uchar8 v) {
+ return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(uchar8 v) {
+ return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(uchar8 v) {
+ return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(uchar8 v) {
+ return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(uchar8 v) {
+ return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(uchar8 v) {
+ return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(uchar8 v) {
+ return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(uchar8 v) {
+ return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(uchar8 v) {
+ return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(uchar8 v) {
+ return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(uchar8 v) {
+ return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(uchar8 v) {
+ return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(uchar8 v) {
+ return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(uchar8 v) {
+ return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(float8 v) {
+ return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(float8 v) {
+ return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(float8 v) {
+ return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(float8 v) {
+ return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(float8 v) {
+ return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(float8 v) {
+ return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(float8 v) {
+ return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(float8 v) {
+ return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(float8 v) {
+ return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(float8 v) {
+ return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(float8 v) {
+ return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(float8 v) {
+ return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(float8 v) {
+ return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(float8 v) {
+ return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(float8 v) {
+ return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(float8 v) {
+ return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(float8 v) {
+ return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(float8 v) {
+ return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(float8 v) {
+ return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(float8 v) {
+ return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(float8 v) {
+ return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(float8 v) {
+ return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(float8 v) {
+ return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(float8 v) {
+ return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(float8 v) {
+ return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(float8 v) {
+ return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(float8 v) {
+ return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(float8 v) {
+ return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(float8 v) {
+ return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(float8 v) {
+ return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(float8 v) {
+ return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(float8 v) {
+ return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(float8 v) {
+ return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(float8 v) {
+ return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(float8 v) {
+ return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(float8 v) {
+ return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(long16 v) {
+ return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(long16 v) {
+ return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(long16 v) {
+ return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(long16 v) {
+ return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(long16 v) {
+ return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(long16 v) {
+ return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(long16 v) {
+ return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(long16 v) {
+ return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(long16 v) {
+ return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(long16 v) {
+ return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(long16 v) {
+ return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(long16 v) {
+ return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(long16 v) {
+ return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(long16 v) {
+ return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(long16 v) {
+ return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(long16 v) {
+ return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(long16 v) {
+ return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(long16 v) {
+ return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(long16 v) {
+ return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(long16 v) {
+ return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(long16 v) {
+ return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(long16 v) {
+ return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(long16 v) {
+ return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(long16 v) {
+ return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(long16 v) {
+ return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(long16 v) {
+ return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(long16 v) {
+ return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(long16 v) {
+ return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(long16 v) {
+ return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(long16 v) {
+ return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(long16 v) {
+ return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(long16 v) {
+ return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(long16 v) {
+ return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(long16 v) {
+ return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(long16 v) {
+ return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(long16 v) {
+ return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(ulong16 v) {
+ return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(ulong16 v) {
+ return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(ulong16 v) {
+ return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(ulong16 v) {
+ return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(ulong16 v) {
+ return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(ulong16 v) {
+ return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(ulong16 v) {
+ return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(ulong16 v) {
+ return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(ulong16 v) {
+ return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(ulong16 v) {
+ return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(ulong16 v) {
+ return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(ulong16 v) {
+ return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(ulong16 v) {
+ return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(ulong16 v) {
+ return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(ulong16 v) {
+ return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(ulong16 v) {
+ return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(ulong16 v) {
+ return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(ulong16 v) {
+ return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(ulong16 v) {
+ return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(ulong16 v) {
+ return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(ulong16 v) {
+ return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(ulong16 v) {
+ return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(ulong16 v) {
+ return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(ulong16 v) {
+ return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(ulong16 v) {
+ return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(ulong16 v) {
+ return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(ulong16 v) {
+ return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(ulong16 v) {
+ return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(ulong16 v) {
+ return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(ulong16 v) {
+ return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(ulong16 v) {
+ return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(ulong16 v) {
+ return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(ulong16 v) {
+ return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(ulong16 v) {
+ return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(ulong16 v) {
+ return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(ulong16 v) {
+ return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(int16 v) {
+ return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(int16 v) {
+ return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(int16 v) {
+ return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(int16 v) {
+ return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(int16 v) {
+ return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(int16 v) {
+ return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(int16 v) {
+ return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(int16 v) {
+ return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(int16 v) {
+ return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(int16 v) {
+ return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(int16 v) {
+ return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(int16 v) {
+ return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(int16 v) {
+ return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(int16 v) {
+ return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(int16 v) {
+ return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(int16 v) {
+ return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(int16 v) {
+ return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(int16 v) {
+ return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(int16 v) {
+ return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(int16 v) {
+ return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(int16 v) {
+ return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(int16 v) {
+ return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(int16 v) {
+ return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(int16 v) {
+ return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(int16 v) {
+ return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(int16 v) {
+ return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(int16 v) {
+ return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(int16 v) {
+ return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(int16 v) {
+ return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(int16 v) {
+ return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(int16 v) {
+ return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(int16 v) {
+ return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(int16 v) {
+ return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(int16 v) {
+ return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(int16 v) {
+ return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(int16 v) {
+ return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(uint16 v) {
+ return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(uint16 v) {
+ return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(uint16 v) {
+ return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(uint16 v) {
+ return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(uint16 v) {
+ return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(uint16 v) {
+ return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(uint16 v) {
+ return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(uint16 v) {
+ return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(uint16 v) {
+ return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(uint16 v) {
+ return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(uint16 v) {
+ return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(uint16 v) {
+ return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(uint16 v) {
+ return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(uint16 v) {
+ return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(uint16 v) {
+ return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(uint16 v) {
+ return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(uint16 v) {
+ return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(uint16 v) {
+ return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(uint16 v) {
+ return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(uint16 v) {
+ return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(uint16 v) {
+ return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(uint16 v) {
+ return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(uint16 v) {
+ return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(uint16 v) {
+ return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(uint16 v) {
+ return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(uint16 v) {
+ return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(uint16 v) {
+ return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(uint16 v) {
+ return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(uint16 v) {
+ return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(uint16 v) {
+ return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(uint16 v) {
+ return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(uint16 v) {
+ return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(uint16 v) {
+ return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(uint16 v) {
+ return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(uint16 v) {
+ return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(uint16 v) {
+ return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(short16 v) {
+ return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(short16 v) {
+ return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(short16 v) {
+ return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(short16 v) {
+ return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(short16 v) {
+ return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(short16 v) {
+ return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(short16 v) {
+ return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(short16 v) {
+ return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(short16 v) {
+ return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(short16 v) {
+ return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(short16 v) {
+ return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(short16 v) {
+ return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(short16 v) {
+ return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(short16 v) {
+ return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(short16 v) {
+ return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(short16 v) {
+ return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(short16 v) {
+ return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(short16 v) {
+ return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(short16 v) {
+ return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(short16 v) {
+ return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(short16 v) {
+ return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(short16 v) {
+ return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(short16 v) {
+ return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(short16 v) {
+ return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(short16 v) {
+ return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(short16 v) {
+ return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(short16 v) {
+ return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(short16 v) {
+ return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(short16 v) {
+ return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(short16 v) {
+ return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(short16 v) {
+ return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(short16 v) {
+ return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(short16 v) {
+ return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(short16 v) {
+ return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(short16 v) {
+ return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(short16 v) {
+ return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(ushort16 v) {
+ return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(ushort16 v) {
+ return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(ushort16 v) {
+ return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(ushort16 v) {
+ return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(ushort16 v) {
+ return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(ushort16 v) {
+ return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(ushort16 v) {
+ return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(ushort16 v) {
+ return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(ushort16 v) {
+ return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(ushort16 v) {
+ return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(ushort16 v) {
+ return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(ushort16 v) {
+ return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(ushort16 v) {
+ return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(ushort16 v) {
+ return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(ushort16 v) {
+ return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(ushort16 v) {
+ return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(ushort16 v) {
+ return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(ushort16 v) {
+ return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(ushort16 v) {
+ return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(ushort16 v) {
+ return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(ushort16 v) {
+ return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(ushort16 v) {
+ return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(ushort16 v) {
+ return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(ushort16 v) {
+ return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(ushort16 v) {
+ return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(ushort16 v) {
+ return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(ushort16 v) {
+ return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(ushort16 v) {
+ return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(ushort16 v) {
+ return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(ushort16 v) {
+ return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(ushort16 v) {
+ return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(ushort16 v) {
+ return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(ushort16 v) {
+ return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(ushort16 v) {
+ return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(ushort16 v) {
+ return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(ushort16 v) {
+ return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(char16 v) {
+ return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(char16 v) {
+ return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(char16 v) {
+ return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(char16 v) {
+ return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(char16 v) {
+ return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(char16 v) {
+ return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(char16 v) {
+ return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(char16 v) {
+ return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(char16 v) {
+ return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(char16 v) {
+ return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(char16 v) {
+ return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(char16 v) {
+ return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(char16 v) {
+ return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(char16 v) {
+ return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(char16 v) {
+ return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(char16 v) {
+ return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(char16 v) {
+ return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(char16 v) {
+ return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(char16 v) {
+ return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(char16 v) {
+ return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(char16 v) {
+ return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(char16 v) {
+ return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(char16 v) {
+ return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(char16 v) {
+ return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(char16 v) {
+ return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(char16 v) {
+ return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(char16 v) {
+ return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(char16 v) {
+ return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(char16 v) {
+ return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(char16 v) {
+ return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(char16 v) {
+ return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(char16 v) {
+ return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(char16 v) {
+ return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(char16 v) {
+ return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(char16 v) {
+ return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(char16 v) {
+ return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(uchar16 v) {
+ return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(uchar16 v) {
+ return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(uchar16 v) {
+ return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(uchar16 v) {
+ return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(uchar16 v) {
+ return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(uchar16 v) {
+ return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(uchar16 v) {
+ return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(uchar16 v) {
+ return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(uchar16 v) {
+ return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(uchar16 v) {
+ return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(uchar16 v) {
+ return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(uchar16 v) {
+ return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(uchar16 v) {
+ return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(uchar16 v) {
+ return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(uchar16 v) {
+ return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(uchar16 v) {
+ return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(uchar16 v) {
+ return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(uchar16 v) {
+ return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(uchar16 v) {
+ return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(uchar16 v) {
+ return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(uchar16 v) {
+ return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(uchar16 v) {
+ return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(uchar16 v) {
+ return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(uchar16 v) {
+ return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(uchar16 v) {
+ return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(uchar16 v) {
+ return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(uchar16 v) {
+ return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(uchar16 v) {
+ return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(uchar16 v) {
+ return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(uchar16 v) {
+ return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(uchar16 v) {
+ return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(uchar16 v) {
+ return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(uchar16 v) {
+ return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(uchar16 v) {
+ return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(uchar16 v) {
+ return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(uchar16 v) {
+ return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(float16 v) {
+ return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(float16 v) {
+ return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(float16 v) {
+ return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(float16 v) {
+ return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(float16 v) {
+ return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(float16 v) {
+ return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(float16 v) {
+ return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(float16 v) {
+ return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(float16 v) {
+ return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(float16 v) {
+ return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(float16 v) {
+ return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(float16 v) {
+ return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(float16 v) {
+ return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(float16 v) {
+ return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(float16 v) {
+ return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(float16 v) {
+ return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(float16 v) {
+ return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(float16 v) {
+ return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(float16 v) {
+ return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(float16 v) {
+ return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(float16 v) {
+ return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(float16 v) {
+ return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(float16 v) {
+ return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(float16 v) {
+ return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(float16 v) {
+ return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(float16 v) {
+ return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(float16 v) {
+ return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(float16 v) {
+ return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(float16 v) {
+ return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(float16 v) {
+ return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(float16 v) {
+ return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(float16 v) {
+ return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(float16 v) {
+ return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(float16 v) {
+ return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(float16 v) {
+ return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(float16 v) {
+ return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE_OVERLOADABLE long convert_long_sat_rte(long x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(long x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(long x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(long x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(long x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(long x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(long x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(long x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(long x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(long x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(long x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(long x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(long x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(long x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(long x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(long x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(long x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(long x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(long x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(long x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(long x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(long x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(long x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(long x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(long x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(long x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(long x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(long x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(long x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(long x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(long x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(long x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(ulong x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(ulong x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(ulong x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(ulong x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(ulong x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(ulong x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(ulong x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(ulong x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(ulong x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(ulong x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(ulong x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(ulong x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(ulong x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(ulong x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(ulong x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(ulong x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(ulong x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(ulong x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(ulong x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(ulong x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(ulong x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(ulong x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(ulong x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(ulong x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(ulong x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(ulong x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(ulong x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(ulong x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(ulong x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(ulong x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(ulong x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(ulong x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(int x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(int x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(int x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(int x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(int x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(int x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(int x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(int x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(int x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(int x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(int x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(int x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(int x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(int x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(int x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(int x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(int x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(int x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(int x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(int x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(int x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(int x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(int x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(int x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(int x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(int x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(int x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(int x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(int x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(int x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(int x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(int x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(uint x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(uint x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(uint x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(uint x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(uint x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(uint x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(uint x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(uint x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(uint x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(uint x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(uint x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(uint x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(uint x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(uint x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(uint x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(uint x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(uint x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(uint x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(uint x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(uint x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(uint x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(uint x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(uint x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(uint x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(uint x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(uint x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(uint x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(uint x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(uint x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(uint x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(uint x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(uint x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(short x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(short x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(short x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(short x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(short x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(short x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(short x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(short x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(short x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(short x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(short x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(short x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(short x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(short x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(short x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(short x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(short x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(short x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(short x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(short x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(short x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(short x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(short x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(short x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(short x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(short x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(short x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(short x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(short x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(short x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(short x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(short x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(ushort x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(ushort x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(ushort x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(ushort x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(ushort x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(ushort x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(ushort x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(ushort x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(ushort x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(ushort x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(ushort x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(ushort x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(ushort x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(ushort x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(ushort x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(ushort x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(ushort x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(ushort x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(ushort x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(ushort x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(ushort x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(ushort x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(ushort x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(ushort x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(ushort x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(ushort x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(ushort x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(ushort x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(ushort x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(ushort x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(ushort x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(ushort x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(char x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(char x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(char x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(char x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(char x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(char x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(char x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(char x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(char x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(char x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(char x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(char x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(char x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(char x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(char x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(char x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(char x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(char x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(char x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(char x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(char x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(char x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(char x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(char x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(char x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(char x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(char x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(char x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(char x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(char x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(char x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(char x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(uchar x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(uchar x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(uchar x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(uchar x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(uchar x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(uchar x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(uchar x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(uchar x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(uchar x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(uchar x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(uchar x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(uchar x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(uchar x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(uchar x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(uchar x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(uchar x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(uchar x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(uchar x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(uchar x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(uchar x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(uchar x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(uchar x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(uchar x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(uchar x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(uchar x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(uchar x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(uchar x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(uchar x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(uchar x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(uchar x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(uchar x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(uchar x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(float x)
+{ return convert_long_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(float x)
+{ return convert_long_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(float x)
+{ return convert_long_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(float x)
+{ return convert_long_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(float x)
+{ return convert_ulong_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(float x)
+{ return convert_ulong_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(float x)
+{ return convert_ulong_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(float x)
+{ return convert_ulong_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(float x)
+{ return convert_int_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(float x)
+{ return convert_int_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(float x)
+{ return convert_int_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(float x)
+{ return convert_int_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(float x)
+{ return convert_uint_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(float x)
+{ return convert_uint_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(float x)
+{ return convert_uint_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(float x)
+{ return convert_uint_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(float x)
+{ return convert_short_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(float x)
+{ return convert_short_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(float x)
+{ return convert_short_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(float x)
+{ return convert_short_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(float x)
+{ return convert_ushort_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(float x)
+{ return convert_ushort_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(float x)
+{ return convert_ushort_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(float x)
+{ return convert_ushort_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(float x)
+{ return convert_char_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(float x)
+{ return convert_char_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(float x)
+{ return convert_char_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(float x)
+{ return convert_char_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(float x)
+{ return convert_uchar_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(float x)
+{ return convert_uchar_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(float x)
+{ return convert_uchar_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(float x)
+{ return convert_uchar_sat(__gen_ocl_rndd(x)); }
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(long2 v) {
+ return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(long2 v) {
+ return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(long2 v) {
+ return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(long2 v) {
+ return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(long2 v) {
+ return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(long2 v) {
+ return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(long2 v) {
+ return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(long2 v) {
+ return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(long2 v) {
+ return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(long2 v) {
+ return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(long2 v) {
+ return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(long2 v) {
+ return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(long2 v) {
+ return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(long2 v) {
+ return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(long2 v) {
+ return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(long2 v) {
+ return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(long2 v) {
+ return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(long2 v) {
+ return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(long2 v) {
+ return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(long2 v) {
+ return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(long2 v) {
+ return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(long2 v) {
+ return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(long2 v) {
+ return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(long2 v) {
+ return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(long2 v) {
+ return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(long2 v) {
+ return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(long2 v) {
+ return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(long2 v) {
+ return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(long2 v) {
+ return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(long2 v) {
+ return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(long2 v) {
+ return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(long2 v) {
+ return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(ulong2 v) {
+ return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(ulong2 v) {
+ return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(ulong2 v) {
+ return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(ulong2 v) {
+ return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(ulong2 v) {
+ return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(ulong2 v) {
+ return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(ulong2 v) {
+ return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(ulong2 v) {
+ return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(ulong2 v) {
+ return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(ulong2 v) {
+ return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(ulong2 v) {
+ return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(ulong2 v) {
+ return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(ulong2 v) {
+ return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(ulong2 v) {
+ return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(ulong2 v) {
+ return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(ulong2 v) {
+ return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(ulong2 v) {
+ return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(ulong2 v) {
+ return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(ulong2 v) {
+ return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(ulong2 v) {
+ return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(ulong2 v) {
+ return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(ulong2 v) {
+ return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(ulong2 v) {
+ return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(ulong2 v) {
+ return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(ulong2 v) {
+ return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(ulong2 v) {
+ return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(ulong2 v) {
+ return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(ulong2 v) {
+ return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(ulong2 v) {
+ return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(ulong2 v) {
+ return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(ulong2 v) {
+ return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(ulong2 v) {
+ return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(int2 v) {
+ return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(int2 v) {
+ return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(int2 v) {
+ return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(int2 v) {
+ return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(int2 v) {
+ return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(int2 v) {
+ return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(int2 v) {
+ return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(int2 v) {
+ return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(int2 v) {
+ return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(int2 v) {
+ return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(int2 v) {
+ return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(int2 v) {
+ return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(int2 v) {
+ return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(int2 v) {
+ return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(int2 v) {
+ return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(int2 v) {
+ return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(int2 v) {
+ return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(int2 v) {
+ return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(int2 v) {
+ return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(int2 v) {
+ return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(int2 v) {
+ return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(int2 v) {
+ return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(int2 v) {
+ return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(int2 v) {
+ return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(int2 v) {
+ return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(int2 v) {
+ return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(int2 v) {
+ return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(int2 v) {
+ return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(int2 v) {
+ return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(int2 v) {
+ return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(int2 v) {
+ return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(int2 v) {
+ return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(uint2 v) {
+ return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(uint2 v) {
+ return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(uint2 v) {
+ return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(uint2 v) {
+ return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(uint2 v) {
+ return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(uint2 v) {
+ return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(uint2 v) {
+ return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(uint2 v) {
+ return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(uint2 v) {
+ return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(uint2 v) {
+ return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(uint2 v) {
+ return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(uint2 v) {
+ return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(uint2 v) {
+ return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(uint2 v) {
+ return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(uint2 v) {
+ return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(uint2 v) {
+ return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(uint2 v) {
+ return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(uint2 v) {
+ return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(uint2 v) {
+ return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(uint2 v) {
+ return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(uint2 v) {
+ return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(uint2 v) {
+ return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(uint2 v) {
+ return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(uint2 v) {
+ return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(uint2 v) {
+ return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(uint2 v) {
+ return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(uint2 v) {
+ return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(uint2 v) {
+ return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(uint2 v) {
+ return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(uint2 v) {
+ return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(uint2 v) {
+ return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(uint2 v) {
+ return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(short2 v) {
+ return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(short2 v) {
+ return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(short2 v) {
+ return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(short2 v) {
+ return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(short2 v) {
+ return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(short2 v) {
+ return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(short2 v) {
+ return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(short2 v) {
+ return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(short2 v) {
+ return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(short2 v) {
+ return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(short2 v) {
+ return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(short2 v) {
+ return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(short2 v) {
+ return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(short2 v) {
+ return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(short2 v) {
+ return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(short2 v) {
+ return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(short2 v) {
+ return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(short2 v) {
+ return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(short2 v) {
+ return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(short2 v) {
+ return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(short2 v) {
+ return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(short2 v) {
+ return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(short2 v) {
+ return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(short2 v) {
+ return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(short2 v) {
+ return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(short2 v) {
+ return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(short2 v) {
+ return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(short2 v) {
+ return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(short2 v) {
+ return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(short2 v) {
+ return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(short2 v) {
+ return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(short2 v) {
+ return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(ushort2 v) {
+ return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(ushort2 v) {
+ return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(ushort2 v) {
+ return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(ushort2 v) {
+ return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(ushort2 v) {
+ return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(ushort2 v) {
+ return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(ushort2 v) {
+ return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(ushort2 v) {
+ return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(ushort2 v) {
+ return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(ushort2 v) {
+ return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(ushort2 v) {
+ return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(ushort2 v) {
+ return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(ushort2 v) {
+ return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(ushort2 v) {
+ return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(ushort2 v) {
+ return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(ushort2 v) {
+ return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(ushort2 v) {
+ return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(ushort2 v) {
+ return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(ushort2 v) {
+ return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(ushort2 v) {
+ return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(ushort2 v) {
+ return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(ushort2 v) {
+ return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(ushort2 v) {
+ return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(ushort2 v) {
+ return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(ushort2 v) {
+ return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(ushort2 v) {
+ return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(ushort2 v) {
+ return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(ushort2 v) {
+ return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(ushort2 v) {
+ return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(ushort2 v) {
+ return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(ushort2 v) {
+ return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(ushort2 v) {
+ return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(char2 v) {
+ return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(char2 v) {
+ return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(char2 v) {
+ return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(char2 v) {
+ return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(char2 v) {
+ return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(char2 v) {
+ return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(char2 v) {
+ return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(char2 v) {
+ return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(char2 v) {
+ return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(char2 v) {
+ return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(char2 v) {
+ return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(char2 v) {
+ return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(char2 v) {
+ return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(char2 v) {
+ return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(char2 v) {
+ return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(char2 v) {
+ return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(char2 v) {
+ return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(char2 v) {
+ return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(char2 v) {
+ return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(char2 v) {
+ return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(char2 v) {
+ return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(char2 v) {
+ return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(char2 v) {
+ return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(char2 v) {
+ return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(char2 v) {
+ return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(char2 v) {
+ return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(char2 v) {
+ return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(char2 v) {
+ return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(char2 v) {
+ return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(char2 v) {
+ return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(char2 v) {
+ return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(char2 v) {
+ return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(uchar2 v) {
+ return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(uchar2 v) {
+ return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(uchar2 v) {
+ return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(uchar2 v) {
+ return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(uchar2 v) {
+ return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(uchar2 v) {
+ return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(uchar2 v) {
+ return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(uchar2 v) {
+ return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(uchar2 v) {
+ return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(uchar2 v) {
+ return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(uchar2 v) {
+ return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(uchar2 v) {
+ return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(uchar2 v) {
+ return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(uchar2 v) {
+ return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(uchar2 v) {
+ return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(uchar2 v) {
+ return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(uchar2 v) {
+ return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(uchar2 v) {
+ return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(uchar2 v) {
+ return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(uchar2 v) {
+ return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(uchar2 v) {
+ return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(uchar2 v) {
+ return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(uchar2 v) {
+ return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(uchar2 v) {
+ return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(uchar2 v) {
+ return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(uchar2 v) {
+ return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(uchar2 v) {
+ return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(uchar2 v) {
+ return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(uchar2 v) {
+ return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(uchar2 v) {
+ return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(uchar2 v) {
+ return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(uchar2 v) {
+ return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(float2 v) {
+ return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(float2 v) {
+ return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(float2 v) {
+ return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(float2 v) {
+ return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(float2 v) {
+ return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(float2 v) {
+ return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(float2 v) {
+ return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(float2 v) {
+ return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(float2 v) {
+ return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(float2 v) {
+ return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(float2 v) {
+ return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(float2 v) {
+ return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(float2 v) {
+ return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(float2 v) {
+ return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(float2 v) {
+ return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(float2 v) {
+ return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(float2 v) {
+ return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(float2 v) {
+ return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(float2 v) {
+ return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(float2 v) {
+ return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(float2 v) {
+ return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(float2 v) {
+ return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(float2 v) {
+ return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(float2 v) {
+ return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(float2 v) {
+ return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(float2 v) {
+ return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(float2 v) {
+ return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(float2 v) {
+ return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(float2 v) {
+ return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(float2 v) {
+ return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(float2 v) {
+ return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(float2 v) {
+ return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(long3 v) {
+ return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(long3 v) {
+ return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(long3 v) {
+ return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(long3 v) {
+ return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(long3 v) {
+ return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(long3 v) {
+ return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(long3 v) {
+ return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(long3 v) {
+ return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(long3 v) {
+ return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(long3 v) {
+ return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(long3 v) {
+ return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(long3 v) {
+ return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(long3 v) {
+ return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(long3 v) {
+ return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(long3 v) {
+ return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(long3 v) {
+ return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(long3 v) {
+ return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(long3 v) {
+ return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(long3 v) {
+ return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(long3 v) {
+ return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(long3 v) {
+ return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(long3 v) {
+ return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(long3 v) {
+ return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(long3 v) {
+ return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(long3 v) {
+ return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(long3 v) {
+ return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(long3 v) {
+ return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(long3 v) {
+ return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(long3 v) {
+ return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(long3 v) {
+ return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(long3 v) {
+ return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(long3 v) {
+ return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(ulong3 v) {
+ return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(ulong3 v) {
+ return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(ulong3 v) {
+ return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(ulong3 v) {
+ return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(ulong3 v) {
+ return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(ulong3 v) {
+ return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(ulong3 v) {
+ return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(ulong3 v) {
+ return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(ulong3 v) {
+ return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(ulong3 v) {
+ return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(ulong3 v) {
+ return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(ulong3 v) {
+ return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(ulong3 v) {
+ return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(ulong3 v) {
+ return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(ulong3 v) {
+ return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(ulong3 v) {
+ return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(ulong3 v) {
+ return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(ulong3 v) {
+ return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(ulong3 v) {
+ return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(ulong3 v) {
+ return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(ulong3 v) {
+ return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(ulong3 v) {
+ return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(ulong3 v) {
+ return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(ulong3 v) {
+ return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(ulong3 v) {
+ return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(ulong3 v) {
+ return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(ulong3 v) {
+ return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(ulong3 v) {
+ return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(ulong3 v) {
+ return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(ulong3 v) {
+ return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(ulong3 v) {
+ return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(ulong3 v) {
+ return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(int3 v) {
+ return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(int3 v) {
+ return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(int3 v) {
+ return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(int3 v) {
+ return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(int3 v) {
+ return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(int3 v) {
+ return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(int3 v) {
+ return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(int3 v) {
+ return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(int3 v) {
+ return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(int3 v) {
+ return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(int3 v) {
+ return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(int3 v) {
+ return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(int3 v) {
+ return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(int3 v) {
+ return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(int3 v) {
+ return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(int3 v) {
+ return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(int3 v) {
+ return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(int3 v) {
+ return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(int3 v) {
+ return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(int3 v) {
+ return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(int3 v) {
+ return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(int3 v) {
+ return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(int3 v) {
+ return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(int3 v) {
+ return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(int3 v) {
+ return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(int3 v) {
+ return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(int3 v) {
+ return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(int3 v) {
+ return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(int3 v) {
+ return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(int3 v) {
+ return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(int3 v) {
+ return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(int3 v) {
+ return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(uint3 v) {
+ return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(uint3 v) {
+ return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(uint3 v) {
+ return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(uint3 v) {
+ return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(uint3 v) {
+ return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(uint3 v) {
+ return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(uint3 v) {
+ return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(uint3 v) {
+ return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(uint3 v) {
+ return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(uint3 v) {
+ return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(uint3 v) {
+ return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(uint3 v) {
+ return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(uint3 v) {
+ return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(uint3 v) {
+ return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(uint3 v) {
+ return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(uint3 v) {
+ return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(uint3 v) {
+ return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(uint3 v) {
+ return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(uint3 v) {
+ return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(uint3 v) {
+ return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(uint3 v) {
+ return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(uint3 v) {
+ return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(uint3 v) {
+ return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(uint3 v) {
+ return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(uint3 v) {
+ return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(uint3 v) {
+ return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(uint3 v) {
+ return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(uint3 v) {
+ return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(uint3 v) {
+ return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(uint3 v) {
+ return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(uint3 v) {
+ return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(uint3 v) {
+ return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(short3 v) {
+ return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(short3 v) {
+ return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(short3 v) {
+ return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(short3 v) {
+ return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(short3 v) {
+ return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(short3 v) {
+ return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(short3 v) {
+ return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(short3 v) {
+ return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(short3 v) {
+ return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(short3 v) {
+ return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(short3 v) {
+ return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(short3 v) {
+ return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(short3 v) {
+ return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(short3 v) {
+ return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(short3 v) {
+ return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(short3 v) {
+ return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(short3 v) {
+ return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(short3 v) {
+ return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(short3 v) {
+ return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(short3 v) {
+ return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(short3 v) {
+ return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(short3 v) {
+ return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(short3 v) {
+ return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(short3 v) {
+ return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(short3 v) {
+ return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(short3 v) {
+ return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(short3 v) {
+ return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(short3 v) {
+ return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(short3 v) {
+ return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(short3 v) {
+ return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(short3 v) {
+ return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(short3 v) {
+ return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(ushort3 v) {
+ return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(ushort3 v) {
+ return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(ushort3 v) {
+ return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(ushort3 v) {
+ return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(ushort3 v) {
+ return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(ushort3 v) {
+ return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(ushort3 v) {
+ return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(ushort3 v) {
+ return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(ushort3 v) {
+ return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(ushort3 v) {
+ return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(ushort3 v) {
+ return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(ushort3 v) {
+ return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(ushort3 v) {
+ return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(ushort3 v) {
+ return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(ushort3 v) {
+ return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(ushort3 v) {
+ return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(ushort3 v) {
+ return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(ushort3 v) {
+ return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(ushort3 v) {
+ return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(ushort3 v) {
+ return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(ushort3 v) {
+ return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(ushort3 v) {
+ return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(ushort3 v) {
+ return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(ushort3 v) {
+ return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(ushort3 v) {
+ return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(ushort3 v) {
+ return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(ushort3 v) {
+ return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(ushort3 v) {
+ return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(ushort3 v) {
+ return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(ushort3 v) {
+ return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(ushort3 v) {
+ return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(ushort3 v) {
+ return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(char3 v) {
+ return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(char3 v) {
+ return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(char3 v) {
+ return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(char3 v) {
+ return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(char3 v) {
+ return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(char3 v) {
+ return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(char3 v) {
+ return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(char3 v) {
+ return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(char3 v) {
+ return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(char3 v) {
+ return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(char3 v) {
+ return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(char3 v) {
+ return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(char3 v) {
+ return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(char3 v) {
+ return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(char3 v) {
+ return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(char3 v) {
+ return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(char3 v) {
+ return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(char3 v) {
+ return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(char3 v) {
+ return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(char3 v) {
+ return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(char3 v) {
+ return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(char3 v) {
+ return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(char3 v) {
+ return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(char3 v) {
+ return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(char3 v) {
+ return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(char3 v) {
+ return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(char3 v) {
+ return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(char3 v) {
+ return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(char3 v) {
+ return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(char3 v) {
+ return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(char3 v) {
+ return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(char3 v) {
+ return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(uchar3 v) {
+ return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(uchar3 v) {
+ return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(uchar3 v) {
+ return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(uchar3 v) {
+ return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(uchar3 v) {
+ return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(uchar3 v) {
+ return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(uchar3 v) {
+ return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(uchar3 v) {
+ return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(uchar3 v) {
+ return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(uchar3 v) {
+ return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(uchar3 v) {
+ return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(uchar3 v) {
+ return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(uchar3 v) {
+ return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(uchar3 v) {
+ return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(uchar3 v) {
+ return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(uchar3 v) {
+ return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(uchar3 v) {
+ return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(uchar3 v) {
+ return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(uchar3 v) {
+ return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(uchar3 v) {
+ return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(uchar3 v) {
+ return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(uchar3 v) {
+ return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(uchar3 v) {
+ return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(uchar3 v) {
+ return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(uchar3 v) {
+ return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(uchar3 v) {
+ return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(uchar3 v) {
+ return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(uchar3 v) {
+ return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(uchar3 v) {
+ return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(uchar3 v) {
+ return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(uchar3 v) {
+ return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(uchar3 v) {
+ return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(float3 v) {
+ return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(float3 v) {
+ return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(float3 v) {
+ return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(float3 v) {
+ return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(float3 v) {
+ return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(float3 v) {
+ return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(float3 v) {
+ return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(float3 v) {
+ return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(float3 v) {
+ return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(float3 v) {
+ return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(float3 v) {
+ return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(float3 v) {
+ return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(float3 v) {
+ return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(float3 v) {
+ return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(float3 v) {
+ return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(float3 v) {
+ return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(float3 v) {
+ return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(float3 v) {
+ return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(float3 v) {
+ return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(float3 v) {
+ return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(float3 v) {
+ return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(float3 v) {
+ return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(float3 v) {
+ return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(float3 v) {
+ return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(float3 v) {
+ return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(float3 v) {
+ return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(float3 v) {
+ return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(float3 v) {
+ return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(float3 v) {
+ return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(float3 v) {
+ return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(float3 v) {
+ return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(float3 v) {
+ return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(long4 v) {
+ return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(long4 v) {
+ return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(long4 v) {
+ return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(long4 v) {
+ return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(long4 v) {
+ return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(long4 v) {
+ return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(long4 v) {
+ return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(long4 v) {
+ return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(long4 v) {
+ return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(long4 v) {
+ return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(long4 v) {
+ return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(long4 v) {
+ return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(long4 v) {
+ return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(long4 v) {
+ return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(long4 v) {
+ return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(long4 v) {
+ return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(long4 v) {
+ return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(long4 v) {
+ return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(long4 v) {
+ return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(long4 v) {
+ return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(long4 v) {
+ return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(long4 v) {
+ return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(long4 v) {
+ return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(long4 v) {
+ return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(long4 v) {
+ return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(long4 v) {
+ return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(long4 v) {
+ return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(long4 v) {
+ return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(long4 v) {
+ return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(long4 v) {
+ return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(long4 v) {
+ return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(long4 v) {
+ return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(ulong4 v) {
+ return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(ulong4 v) {
+ return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(ulong4 v) {
+ return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(ulong4 v) {
+ return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(ulong4 v) {
+ return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(ulong4 v) {
+ return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(ulong4 v) {
+ return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(ulong4 v) {
+ return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(ulong4 v) {
+ return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(ulong4 v) {
+ return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(ulong4 v) {
+ return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(ulong4 v) {
+ return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(ulong4 v) {
+ return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(ulong4 v) {
+ return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(ulong4 v) {
+ return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(ulong4 v) {
+ return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(ulong4 v) {
+ return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(ulong4 v) {
+ return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(ulong4 v) {
+ return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(ulong4 v) {
+ return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(ulong4 v) {
+ return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(ulong4 v) {
+ return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(ulong4 v) {
+ return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(ulong4 v) {
+ return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(ulong4 v) {
+ return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(ulong4 v) {
+ return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(ulong4 v) {
+ return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(ulong4 v) {
+ return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(ulong4 v) {
+ return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(ulong4 v) {
+ return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(ulong4 v) {
+ return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(ulong4 v) {
+ return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(int4 v) {
+ return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(int4 v) {
+ return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(int4 v) {
+ return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(int4 v) {
+ return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(int4 v) {
+ return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(int4 v) {
+ return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(int4 v) {
+ return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(int4 v) {
+ return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(int4 v) {
+ return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(int4 v) {
+ return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(int4 v) {
+ return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(int4 v) {
+ return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(int4 v) {
+ return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(int4 v) {
+ return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(int4 v) {
+ return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(int4 v) {
+ return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(int4 v) {
+ return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(int4 v) {
+ return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(int4 v) {
+ return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(int4 v) {
+ return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(int4 v) {
+ return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(int4 v) {
+ return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(int4 v) {
+ return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(int4 v) {
+ return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(int4 v) {
+ return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(int4 v) {
+ return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(int4 v) {
+ return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(int4 v) {
+ return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(int4 v) {
+ return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(int4 v) {
+ return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(int4 v) {
+ return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(int4 v) {
+ return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(uint4 v) {
+ return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(uint4 v) {
+ return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(uint4 v) {
+ return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(uint4 v) {
+ return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(uint4 v) {
+ return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(uint4 v) {
+ return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(uint4 v) {
+ return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(uint4 v) {
+ return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(uint4 v) {
+ return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(uint4 v) {
+ return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(uint4 v) {
+ return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(uint4 v) {
+ return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(uint4 v) {
+ return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(uint4 v) {
+ return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(uint4 v) {
+ return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(uint4 v) {
+ return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(uint4 v) {
+ return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(uint4 v) {
+ return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(uint4 v) {
+ return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(uint4 v) {
+ return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(uint4 v) {
+ return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(uint4 v) {
+ return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(uint4 v) {
+ return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(uint4 v) {
+ return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(uint4 v) {
+ return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(uint4 v) {
+ return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(uint4 v) {
+ return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(uint4 v) {
+ return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(uint4 v) {
+ return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(uint4 v) {
+ return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(uint4 v) {
+ return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(uint4 v) {
+ return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(short4 v) {
+ return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(short4 v) {
+ return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(short4 v) {
+ return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(short4 v) {
+ return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(short4 v) {
+ return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(short4 v) {
+ return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(short4 v) {
+ return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(short4 v) {
+ return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(short4 v) {
+ return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(short4 v) {
+ return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(short4 v) {
+ return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(short4 v) {
+ return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(short4 v) {
+ return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(short4 v) {
+ return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(short4 v) {
+ return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(short4 v) {
+ return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(short4 v) {
+ return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(short4 v) {
+ return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(short4 v) {
+ return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(short4 v) {
+ return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(short4 v) {
+ return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(short4 v) {
+ return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(short4 v) {
+ return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(short4 v) {
+ return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(short4 v) {
+ return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(short4 v) {
+ return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(short4 v) {
+ return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(short4 v) {
+ return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(short4 v) {
+ return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(short4 v) {
+ return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(short4 v) {
+ return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(short4 v) {
+ return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(ushort4 v) {
+ return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(ushort4 v) {
+ return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(ushort4 v) {
+ return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(ushort4 v) {
+ return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(ushort4 v) {
+ return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(ushort4 v) {
+ return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(ushort4 v) {
+ return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(ushort4 v) {
+ return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(ushort4 v) {
+ return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(ushort4 v) {
+ return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(ushort4 v) {
+ return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(ushort4 v) {
+ return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(ushort4 v) {
+ return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(ushort4 v) {
+ return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(ushort4 v) {
+ return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(ushort4 v) {
+ return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(ushort4 v) {
+ return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(ushort4 v) {
+ return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(ushort4 v) {
+ return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(ushort4 v) {
+ return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(ushort4 v) {
+ return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(ushort4 v) {
+ return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(ushort4 v) {
+ return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(ushort4 v) {
+ return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(ushort4 v) {
+ return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(ushort4 v) {
+ return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(ushort4 v) {
+ return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(ushort4 v) {
+ return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(ushort4 v) {
+ return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(ushort4 v) {
+ return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(ushort4 v) {
+ return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(ushort4 v) {
+ return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(char4 v) {
+ return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(char4 v) {
+ return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(char4 v) {
+ return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(char4 v) {
+ return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(char4 v) {
+ return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(char4 v) {
+ return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(char4 v) {
+ return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(char4 v) {
+ return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(char4 v) {
+ return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(char4 v) {
+ return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(char4 v) {
+ return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(char4 v) {
+ return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(char4 v) {
+ return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(char4 v) {
+ return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(char4 v) {
+ return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(char4 v) {
+ return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(char4 v) {
+ return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(char4 v) {
+ return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(char4 v) {
+ return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(char4 v) {
+ return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(char4 v) {
+ return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(char4 v) {
+ return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(char4 v) {
+ return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(char4 v) {
+ return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(char4 v) {
+ return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(char4 v) {
+ return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(char4 v) {
+ return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(char4 v) {
+ return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(char4 v) {
+ return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(char4 v) {
+ return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(char4 v) {
+ return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(char4 v) {
+ return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(uchar4 v) {
+ return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(uchar4 v) {
+ return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(uchar4 v) {
+ return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(uchar4 v) {
+ return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(uchar4 v) {
+ return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(uchar4 v) {
+ return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(uchar4 v) {
+ return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(uchar4 v) {
+ return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(uchar4 v) {
+ return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(uchar4 v) {
+ return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(uchar4 v) {
+ return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(uchar4 v) {
+ return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(uchar4 v) {
+ return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(uchar4 v) {
+ return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(uchar4 v) {
+ return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(uchar4 v) {
+ return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(uchar4 v) {
+ return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(uchar4 v) {
+ return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(uchar4 v) {
+ return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(uchar4 v) {
+ return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(uchar4 v) {
+ return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(uchar4 v) {
+ return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(uchar4 v) {
+ return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(uchar4 v) {
+ return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(uchar4 v) {
+ return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(uchar4 v) {
+ return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(uchar4 v) {
+ return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(uchar4 v) {
+ return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(uchar4 v) {
+ return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(uchar4 v) {
+ return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(uchar4 v) {
+ return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(uchar4 v) {
+ return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(float4 v) {
+ return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(float4 v) {
+ return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(float4 v) {
+ return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(float4 v) {
+ return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(float4 v) {
+ return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(float4 v) {
+ return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(float4 v) {
+ return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(float4 v) {
+ return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(float4 v) {
+ return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(float4 v) {
+ return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(float4 v) {
+ return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(float4 v) {
+ return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(float4 v) {
+ return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(float4 v) {
+ return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(float4 v) {
+ return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(float4 v) {
+ return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(float4 v) {
+ return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(float4 v) {
+ return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(float4 v) {
+ return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(float4 v) {
+ return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(float4 v) {
+ return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(float4 v) {
+ return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(float4 v) {
+ return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(float4 v) {
+ return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(float4 v) {
+ return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(float4 v) {
+ return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(float4 v) {
+ return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(float4 v) {
+ return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(float4 v) {
+ return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(float4 v) {
+ return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(float4 v) {
+ return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(float4 v) {
+ return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(long8 v) {
+ return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(long8 v) {
+ return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(long8 v) {
+ return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(long8 v) {
+ return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(long8 v) {
+ return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(long8 v) {
+ return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(long8 v) {
+ return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(long8 v) {
+ return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(long8 v) {
+ return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(long8 v) {
+ return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(long8 v) {
+ return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(long8 v) {
+ return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(long8 v) {
+ return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(long8 v) {
+ return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(long8 v) {
+ return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(long8 v) {
+ return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(long8 v) {
+ return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(long8 v) {
+ return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(long8 v) {
+ return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(long8 v) {
+ return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(long8 v) {
+ return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(long8 v) {
+ return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(long8 v) {
+ return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(long8 v) {
+ return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(long8 v) {
+ return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(long8 v) {
+ return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(long8 v) {
+ return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(long8 v) {
+ return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(long8 v) {
+ return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(long8 v) {
+ return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(long8 v) {
+ return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(long8 v) {
+ return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(ulong8 v) {
+ return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(ulong8 v) {
+ return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(ulong8 v) {
+ return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(ulong8 v) {
+ return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(ulong8 v) {
+ return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(ulong8 v) {
+ return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(ulong8 v) {
+ return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(ulong8 v) {
+ return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(ulong8 v) {
+ return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(ulong8 v) {
+ return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(ulong8 v) {
+ return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(ulong8 v) {
+ return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(ulong8 v) {
+ return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(ulong8 v) {
+ return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(ulong8 v) {
+ return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(ulong8 v) {
+ return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(ulong8 v) {
+ return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(ulong8 v) {
+ return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(ulong8 v) {
+ return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(ulong8 v) {
+ return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(ulong8 v) {
+ return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(ulong8 v) {
+ return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(ulong8 v) {
+ return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(ulong8 v) {
+ return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(ulong8 v) {
+ return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(ulong8 v) {
+ return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(ulong8 v) {
+ return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(ulong8 v) {
+ return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(ulong8 v) {
+ return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(ulong8 v) {
+ return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(ulong8 v) {
+ return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(ulong8 v) {
+ return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(int8 v) {
+ return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(int8 v) {
+ return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(int8 v) {
+ return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(int8 v) {
+ return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(int8 v) {
+ return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(int8 v) {
+ return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(int8 v) {
+ return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(int8 v) {
+ return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(int8 v) {
+ return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(int8 v) {
+ return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(int8 v) {
+ return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(int8 v) {
+ return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(int8 v) {
+ return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(int8 v) {
+ return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(int8 v) {
+ return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(int8 v) {
+ return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(int8 v) {
+ return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(int8 v) {
+ return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(int8 v) {
+ return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(int8 v) {
+ return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(int8 v) {
+ return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(int8 v) {
+ return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(int8 v) {
+ return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(int8 v) {
+ return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(int8 v) {
+ return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(int8 v) {
+ return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(int8 v) {
+ return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(int8 v) {
+ return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(int8 v) {
+ return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(int8 v) {
+ return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(int8 v) {
+ return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(int8 v) {
+ return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(uint8 v) {
+ return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(uint8 v) {
+ return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(uint8 v) {
+ return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(uint8 v) {
+ return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(uint8 v) {
+ return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(uint8 v) {
+ return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(uint8 v) {
+ return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(uint8 v) {
+ return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(uint8 v) {
+ return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(uint8 v) {
+ return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(uint8 v) {
+ return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(uint8 v) {
+ return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(uint8 v) {
+ return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(uint8 v) {
+ return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(uint8 v) {
+ return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(uint8 v) {
+ return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(uint8 v) {
+ return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(uint8 v) {
+ return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(uint8 v) {
+ return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(uint8 v) {
+ return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(uint8 v) {
+ return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(uint8 v) {
+ return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(uint8 v) {
+ return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(uint8 v) {
+ return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(uint8 v) {
+ return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(uint8 v) {
+ return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(uint8 v) {
+ return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(uint8 v) {
+ return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(uint8 v) {
+ return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(uint8 v) {
+ return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(uint8 v) {
+ return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(uint8 v) {
+ return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(short8 v) {
+ return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(short8 v) {
+ return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(short8 v) {
+ return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(short8 v) {
+ return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(short8 v) {
+ return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(short8 v) {
+ return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(short8 v) {
+ return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(short8 v) {
+ return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(short8 v) {
+ return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(short8 v) {
+ return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(short8 v) {
+ return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(short8 v) {
+ return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(short8 v) {
+ return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(short8 v) {
+ return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(short8 v) {
+ return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(short8 v) {
+ return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(short8 v) {
+ return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(short8 v) {
+ return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(short8 v) {
+ return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(short8 v) {
+ return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(short8 v) {
+ return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(short8 v) {
+ return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(short8 v) {
+ return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(short8 v) {
+ return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(short8 v) {
+ return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(short8 v) {
+ return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(short8 v) {
+ return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(short8 v) {
+ return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(short8 v) {
+ return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(short8 v) {
+ return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(short8 v) {
+ return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(short8 v) {
+ return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(ushort8 v) {
+ return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(ushort8 v) {
+ return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(ushort8 v) {
+ return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(ushort8 v) {
+ return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(ushort8 v) {
+ return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(ushort8 v) {
+ return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(ushort8 v) {
+ return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(ushort8 v) {
+ return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(ushort8 v) {
+ return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(ushort8 v) {
+ return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(ushort8 v) {
+ return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(ushort8 v) {
+ return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(ushort8 v) {
+ return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(ushort8 v) {
+ return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(ushort8 v) {
+ return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(ushort8 v) {
+ return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(ushort8 v) {
+ return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(ushort8 v) {
+ return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(ushort8 v) {
+ return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(ushort8 v) {
+ return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(ushort8 v) {
+ return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(ushort8 v) {
+ return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(ushort8 v) {
+ return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(ushort8 v) {
+ return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(ushort8 v) {
+ return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(ushort8 v) {
+ return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(ushort8 v) {
+ return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(ushort8 v) {
+ return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(ushort8 v) {
+ return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(ushort8 v) {
+ return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(ushort8 v) {
+ return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(ushort8 v) {
+ return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(char8 v) {
+ return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(char8 v) {
+ return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(char8 v) {
+ return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(char8 v) {
+ return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(char8 v) {
+ return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(char8 v) {
+ return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(char8 v) {
+ return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(char8 v) {
+ return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(char8 v) {
+ return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(char8 v) {
+ return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(char8 v) {
+ return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(char8 v) {
+ return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(char8 v) {
+ return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(char8 v) {
+ return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(char8 v) {
+ return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(char8 v) {
+ return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(char8 v) {
+ return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(char8 v) {
+ return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(char8 v) {
+ return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(char8 v) {
+ return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(char8 v) {
+ return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(char8 v) {
+ return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(char8 v) {
+ return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(char8 v) {
+ return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(char8 v) {
+ return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(char8 v) {
+ return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(char8 v) {
+ return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(char8 v) {
+ return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(char8 v) {
+ return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(char8 v) {
+ return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(char8 v) {
+ return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(char8 v) {
+ return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(uchar8 v) {
+ return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(uchar8 v) {
+ return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(uchar8 v) {
+ return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(uchar8 v) {
+ return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(uchar8 v) {
+ return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(uchar8 v) {
+ return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(uchar8 v) {
+ return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(uchar8 v) {
+ return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(uchar8 v) {
+ return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(uchar8 v) {
+ return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(uchar8 v) {
+ return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(uchar8 v) {
+ return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(uchar8 v) {
+ return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(uchar8 v) {
+ return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(uchar8 v) {
+ return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(uchar8 v) {
+ return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(uchar8 v) {
+ return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(uchar8 v) {
+ return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(uchar8 v) {
+ return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(uchar8 v) {
+ return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(uchar8 v) {
+ return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(uchar8 v) {
+ return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(uchar8 v) {
+ return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(uchar8 v) {
+ return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(uchar8 v) {
+ return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(uchar8 v) {
+ return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(uchar8 v) {
+ return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(uchar8 v) {
+ return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(uchar8 v) {
+ return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(uchar8 v) {
+ return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(uchar8 v) {
+ return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(uchar8 v) {
+ return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(float8 v) {
+ return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(float8 v) {
+ return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(float8 v) {
+ return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(float8 v) {
+ return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(float8 v) {
+ return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(float8 v) {
+ return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(float8 v) {
+ return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(float8 v) {
+ return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(float8 v) {
+ return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(float8 v) {
+ return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(float8 v) {
+ return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(float8 v) {
+ return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(float8 v) {
+ return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(float8 v) {
+ return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(float8 v) {
+ return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(float8 v) {
+ return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(float8 v) {
+ return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(float8 v) {
+ return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(float8 v) {
+ return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(float8 v) {
+ return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(float8 v) {
+ return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(float8 v) {
+ return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(float8 v) {
+ return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(float8 v) {
+ return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(float8 v) {
+ return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(float8 v) {
+ return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(float8 v) {
+ return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(float8 v) {
+ return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(float8 v) {
+ return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(float8 v) {
+ return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(float8 v) {
+ return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(float8 v) {
+ return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(long16 v) {
+ return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(long16 v) {
+ return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(long16 v) {
+ return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(long16 v) {
+ return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(long16 v) {
+ return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(long16 v) {
+ return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(long16 v) {
+ return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(long16 v) {
+ return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(long16 v) {
+ return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(long16 v) {
+ return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(long16 v) {
+ return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(long16 v) {
+ return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(long16 v) {
+ return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(long16 v) {
+ return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(long16 v) {
+ return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(long16 v) {
+ return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(long16 v) {
+ return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(long16 v) {
+ return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(long16 v) {
+ return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(long16 v) {
+ return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(long16 v) {
+ return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(long16 v) {
+ return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(long16 v) {
+ return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(long16 v) {
+ return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(long16 v) {
+ return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(long16 v) {
+ return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(long16 v) {
+ return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(long16 v) {
+ return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(long16 v) {
+ return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(long16 v) {
+ return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(long16 v) {
+ return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(long16 v) {
+ return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(ulong16 v) {
+ return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(ulong16 v) {
+ return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(ulong16 v) {
+ return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(ulong16 v) {
+ return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(ulong16 v) {
+ return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(ulong16 v) {
+ return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(ulong16 v) {
+ return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(ulong16 v) {
+ return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(ulong16 v) {
+ return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(ulong16 v) {
+ return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(ulong16 v) {
+ return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(ulong16 v) {
+ return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(ulong16 v) {
+ return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(ulong16 v) {
+ return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(ulong16 v) {
+ return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(ulong16 v) {
+ return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(ulong16 v) {
+ return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(ulong16 v) {
+ return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(ulong16 v) {
+ return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(ulong16 v) {
+ return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(ulong16 v) {
+ return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(ulong16 v) {
+ return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(ulong16 v) {
+ return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(ulong16 v) {
+ return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(ulong16 v) {
+ return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(ulong16 v) {
+ return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(ulong16 v) {
+ return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(ulong16 v) {
+ return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(ulong16 v) {
+ return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(ulong16 v) {
+ return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(ulong16 v) {
+ return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(ulong16 v) {
+ return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(int16 v) {
+ return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(int16 v) {
+ return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(int16 v) {
+ return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(int16 v) {
+ return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(int16 v) {
+ return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(int16 v) {
+ return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(int16 v) {
+ return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(int16 v) {
+ return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(int16 v) {
+ return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(int16 v) {
+ return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(int16 v) {
+ return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(int16 v) {
+ return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(int16 v) {
+ return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(int16 v) {
+ return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(int16 v) {
+ return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(int16 v) {
+ return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(int16 v) {
+ return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(int16 v) {
+ return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(int16 v) {
+ return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(int16 v) {
+ return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(int16 v) {
+ return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(int16 v) {
+ return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(int16 v) {
+ return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(int16 v) {
+ return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(int16 v) {
+ return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(int16 v) {
+ return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(int16 v) {
+ return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(int16 v) {
+ return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(int16 v) {
+ return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(int16 v) {
+ return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(int16 v) {
+ return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(int16 v) {
+ return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(uint16 v) {
+ return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(uint16 v) {
+ return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(uint16 v) {
+ return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(uint16 v) {
+ return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(uint16 v) {
+ return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(uint16 v) {
+ return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(uint16 v) {
+ return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(uint16 v) {
+ return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(uint16 v) {
+ return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(uint16 v) {
+ return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(uint16 v) {
+ return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(uint16 v) {
+ return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(uint16 v) {
+ return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(uint16 v) {
+ return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(uint16 v) {
+ return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(uint16 v) {
+ return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(uint16 v) {
+ return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(uint16 v) {
+ return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(uint16 v) {
+ return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(uint16 v) {
+ return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(uint16 v) {
+ return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(uint16 v) {
+ return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(uint16 v) {
+ return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(uint16 v) {
+ return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(uint16 v) {
+ return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(uint16 v) {
+ return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(uint16 v) {
+ return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(uint16 v) {
+ return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(uint16 v) {
+ return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(uint16 v) {
+ return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(uint16 v) {
+ return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(uint16 v) {
+ return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(short16 v) {
+ return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(short16 v) {
+ return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(short16 v) {
+ return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(short16 v) {
+ return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(short16 v) {
+ return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(short16 v) {
+ return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(short16 v) {
+ return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(short16 v) {
+ return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(short16 v) {
+ return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(short16 v) {
+ return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(short16 v) {
+ return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(short16 v) {
+ return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(short16 v) {
+ return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(short16 v) {
+ return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(short16 v) {
+ return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(short16 v) {
+ return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(short16 v) {
+ return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(short16 v) {
+ return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(short16 v) {
+ return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(short16 v) {
+ return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(short16 v) {
+ return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(short16 v) {
+ return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(short16 v) {
+ return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(short16 v) {
+ return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(short16 v) {
+ return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(short16 v) {
+ return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(short16 v) {
+ return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(short16 v) {
+ return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(short16 v) {
+ return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(short16 v) {
+ return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(short16 v) {
+ return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(short16 v) {
+ return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(ushort16 v) {
+ return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(ushort16 v) {
+ return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(ushort16 v) {
+ return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(ushort16 v) {
+ return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(ushort16 v) {
+ return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(ushort16 v) {
+ return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(ushort16 v) {
+ return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(ushort16 v) {
+ return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(ushort16 v) {
+ return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(ushort16 v) {
+ return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(ushort16 v) {
+ return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(ushort16 v) {
+ return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(ushort16 v) {
+ return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(ushort16 v) {
+ return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(ushort16 v) {
+ return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(ushort16 v) {
+ return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(ushort16 v) {
+ return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(ushort16 v) {
+ return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(ushort16 v) {
+ return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(ushort16 v) {
+ return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(ushort16 v) {
+ return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(ushort16 v) {
+ return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(ushort16 v) {
+ return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(ushort16 v) {
+ return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(ushort16 v) {
+ return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(ushort16 v) {
+ return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(ushort16 v) {
+ return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(ushort16 v) {
+ return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(ushort16 v) {
+ return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(ushort16 v) {
+ return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(ushort16 v) {
+ return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(ushort16 v) {
+ return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(char16 v) {
+ return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(char16 v) {
+ return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(char16 v) {
+ return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(char16 v) {
+ return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(char16 v) {
+ return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(char16 v) {
+ return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(char16 v) {
+ return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(char16 v) {
+ return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(char16 v) {
+ return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(char16 v) {
+ return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(char16 v) {
+ return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(char16 v) {
+ return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(char16 v) {
+ return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(char16 v) {
+ return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(char16 v) {
+ return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(char16 v) {
+ return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(char16 v) {
+ return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(char16 v) {
+ return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(char16 v) {
+ return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(char16 v) {
+ return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(char16 v) {
+ return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(char16 v) {
+ return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(char16 v) {
+ return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(char16 v) {
+ return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(char16 v) {
+ return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(char16 v) {
+ return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(char16 v) {
+ return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(char16 v) {
+ return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(char16 v) {
+ return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(char16 v) {
+ return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(char16 v) {
+ return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(char16 v) {
+ return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(uchar16 v) {
+ return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(uchar16 v) {
+ return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(uchar16 v) {
+ return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(uchar16 v) {
+ return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(uchar16 v) {
+ return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(uchar16 v) {
+ return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(uchar16 v) {
+ return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(uchar16 v) {
+ return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(uchar16 v) {
+ return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(uchar16 v) {
+ return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(uchar16 v) {
+ return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(uchar16 v) {
+ return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(uchar16 v) {
+ return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(uchar16 v) {
+ return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(uchar16 v) {
+ return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(uchar16 v) {
+ return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(uchar16 v) {
+ return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(uchar16 v) {
+ return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(uchar16 v) {
+ return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(uchar16 v) {
+ return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(uchar16 v) {
+ return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(uchar16 v) {
+ return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(uchar16 v) {
+ return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(uchar16 v) {
+ return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(uchar16 v) {
+ return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(uchar16 v) {
+ return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(uchar16 v) {
+ return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(uchar16 v) {
+ return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(uchar16 v) {
+ return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(uchar16 v) {
+ return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(uchar16 v) {
+ return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(uchar16 v) {
+ return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(float16 v) {
+ return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(float16 v) {
+ return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(float16 v) {
+ return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(float16 v) {
+ return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(float16 v) {
+ return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(float16 v) {
+ return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(float16 v) {
+ return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(float16 v) {
+ return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(float16 v) {
+ return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(float16 v) {
+ return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(float16 v) {
+ return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(float16 v) {
+ return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(float16 v) {
+ return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(float16 v) {
+ return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(float16 v) {
+ return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(float16 v) {
+ return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(float16 v) {
+ return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(float16 v) {
+ return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(float16 v) {
+ return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(float16 v) {
+ return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(float16 v) {
+ return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(float16 v) {
+ return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(float16 v) {
+ return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(float16 v) {
+ return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(float16 v) {
+ return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(float16 v) {
+ return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(float16 v) {
+ return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(float16 v) {
+ return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(float16 v) {
+ return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(float16 v) {
+ return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(float16 v) {
+ return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(float16 v) {
+ return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
diff --git a/backend/src/ocl_memcpy.ll b/backend/src/ocl_memcpy.ll
new file mode 100644
index 0000000..476033e
--- /dev/null
+++ b/backend/src/ocl_memcpy.ll
@@ -0,0 +1,336 @@
+;The memcpy's source code.
+; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
+; size_t index = 0;
+; while((index + 4) >= size) {
+; *((uint *)(dst + index)) = *((uint *)(src + index));
+; index += 4;
+; }
+; while(index < size) {
+; dst[index] = src[index];
+; index++;
+; }
+; }
+
+define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ %cmp = icmp ult i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %1 = load i32 addrspace(1)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ store i32 %1, i32 addrspace(1)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+ %3 = load i8 addrspace(1)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ %cmp = icmp ult i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+ %1 = load i32 addrspace(0)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ store i32 %1, i32 addrspace(1)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+ %3 = load i8 addrspace(0)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ %cmp = icmp ult i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+ %1 = load i32 addrspace(3)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ store i32 %1, i32 addrspace(1)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+ %3 = load i8 addrspace(3)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ %cmp = icmp ult i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %1 = load i32 addrspace(1)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+ store i32 %1, i32 addrspace(0)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+ %3 = load i8 addrspace(1)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ %cmp = icmp ult i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+ %1 = load i32 addrspace(0)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+ store i32 %1, i32 addrspace(0)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+ %3 = load i8 addrspace(0)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ %cmp = icmp ult i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+ %1 = load i32 addrspace(3)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+ store i32 %1, i32 addrspace(0)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+ %3 = load i8 addrspace(3)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ %cmp = icmp ult i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %1 = load i32 addrspace(1)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+ store i32 %1, i32 addrspace(3)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+ %3 = load i8 addrspace(1)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ %cmp = icmp ult i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+ %1 = load i32 addrspace(0)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+ store i32 %1, i32 addrspace(3)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+ %3 = load i8 addrspace(0)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
+
+define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ %cmp = icmp ult i32 %add, %size
+ br i1 %cmp, label %while.cond3, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+ %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+ %1 = load i32 addrspace(3)* %0, align 4
+ %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+ %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+ store i32 %1, i32 addrspace(3)* %2, align 4
+ br label %while.cond
+
+while.cond3: ; preds = %while.cond, %while.body5
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+ %cmp4 = icmp ult i32 %index.1, %size
+ br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5: ; preds = %while.cond3
+ %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+ %3 = load i8 addrspace(3)* %arrayidx, align 1
+ %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+ store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond3
+
+while.end7: ; preds = %while.cond3
+ ret void
+}
diff --git a/backend/src/ocl_memset.ll b/backend/src/ocl_memset.ll
new file mode 100644
index 0000000..addf9f5
--- /dev/null
+++ b/backend/src/ocl_memset.ll
@@ -0,0 +1,127 @@
+;The memset's source code.
+; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
+; size_t index = 0;
+; uint v = (val << 24) | (val << 16) | (val << 8) | val;
+; while((index + 4) >= size) {
+; *((uint *)(dst + index)) = v;
+; index += 4;
+; }
+; while(index < size) {
+; dst[index] = val;
+; index++;
+; }
+; }
+
+define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+ %conv = zext i8 %val to i32
+ %shl = shl nuw i32 %conv, 24
+ %shl2 = shl nuw nsw i32 %conv, 16
+ %or = or i32 %shl, %shl2
+ %shl4 = shl nuw nsw i32 %conv, 8
+ %or5 = or i32 %or, %shl4
+ %or7 = or i32 %or5, %conv
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ %cmp = icmp ult i32 %add, %size
+ br i1 %cmp, label %while.cond10, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8* %dst, i32 %index.0
+ %0 = bitcast i8* %add.ptr to i32*
+ store i32 %or7, i32* %0, align 4
+ br label %while.cond
+
+while.cond10: ; preds = %while.cond, %while.body13
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+ %cmp11 = icmp ult i32 %index.1, %size
+ br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13: ; preds = %while.cond10
+ %arrayidx = getelementptr inbounds i8* %dst, i32 %index.1
+ store i8 %val, i8* %arrayidx, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond10
+
+while.end14: ; preds = %while.cond10
+ ret void
+}
+
+define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+ %conv = zext i8 %val to i32
+ %shl = shl nuw i32 %conv, 24
+ %shl2 = shl nuw nsw i32 %conv, 16
+ %or = or i32 %shl, %shl2
+ %shl4 = shl nuw nsw i32 %conv, 8
+ %or5 = or i32 %or, %shl4
+ %or7 = or i32 %or5, %conv
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ %cmp = icmp ult i32 %add, %size
+ br i1 %cmp, label %while.cond10, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+ %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ store i32 %or7, i32 addrspace(1)* %0, align 4
+ br label %while.cond
+
+while.cond10: ; preds = %while.cond, %while.body13
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+ %cmp11 = icmp ult i32 %index.1, %size
+ br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13: ; preds = %while.cond10
+ %arrayidx = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+ store i8 %val, i8 addrspace(1)* %arrayidx, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond10
+
+while.end14: ; preds = %while.cond10
+ ret void
+}
+
+define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+ %conv = zext i8 %val to i32
+ %shl = shl nuw i32 %conv, 24
+ %shl2 = shl nuw nsw i32 %conv, 16
+ %or = or i32 %shl, %shl2
+ %shl4 = shl nuw nsw i32 %conv, 8
+ %or5 = or i32 %or, %shl4
+ %or7 = or i32 %or5, %conv
+ br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+ %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+ %add = add i32 %index.0, 4
+ %cmp = icmp ult i32 %add, %size
+ br i1 %cmp, label %while.cond10, label %while.body
+
+while.body: ; preds = %while.cond
+ %add.ptr = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+ %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+ store i32 %or7, i32 addrspace(3)* %0, align 4
+ br label %while.cond
+
+while.cond10: ; preds = %while.cond, %while.body13
+ %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+ %cmp11 = icmp ult i32 %index.1, %size
+ br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13: ; preds = %while.cond10
+ %arrayidx = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+ store i8 %val, i8 addrspace(3)* %arrayidx, align 1
+ %inc = add i32 %index.1, 1
+ br label %while.cond10
+
+while.end14: ; preds = %while.cond10
+ ret void
+}
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
new file mode 100755
index 0000000..f648a8c
--- /dev/null
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -0,0 +1,5160 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GEN_OCL_STDLIB_H__
+#define __GEN_OCL_STDLIB_H__
+
+#define INLINE inline __attribute__((always_inline))
+#define OVERLOADABLE __attribute__((overloadable))
+#define PURE __attribute__((pure))
+#define CONST __attribute__((const))
+#define INLINE_OVERLOADABLE inline __attribute__((overloadable,always_inline))
+// FIXME, clang's opencl FE doesn't support static.
+#define static
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL built-in scalar data types
+/////////////////////////////////////////////////////////////////////////////
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+typedef __typeof__(sizeof(int)) size_t;
+typedef __typeof__((int *)0-(int *)0) ptrdiff_t;
+typedef signed int intptr_t;
+typedef unsigned int uintptr_t;
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL address space
+/////////////////////////////////////////////////////////////////////////////
+// These are built-ins in LLVM 3.3.
+#if 100*__clang_major__ + __clang_minor__ <= 302
+#define __private __attribute__((address_space(0)))
+#define __global __attribute__((address_space(1)))
+#define __constant __attribute__((address_space(2)))
+#define __local __attribute__((address_space(3)))
+#define global __global
+#define local __local
+#define constant __constant
+#define private __private
+#endif
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL built-in vector data types
+/////////////////////////////////////////////////////////////////////////////
+#define DEF(type) typedef type type##2 __attribute__((ext_vector_type(2)));\
+ typedef type type##3 __attribute__((ext_vector_type(3)));\
+ typedef type type##4 __attribute__((ext_vector_type(4)));\
+ typedef type type##8 __attribute__((ext_vector_type(8)));\
+ typedef type type##16 __attribute__((ext_vector_type(16)));
+DEF(char);
+DEF(uchar);
+DEF(short);
+DEF(ushort);
+DEF(int);
+DEF(uint);
+DEF(long);
+DEF(ulong);
+DEF(float);
+DEF(double);
+#undef DEF
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL other built-in data types
+/////////////////////////////////////////////////////////////////////////////
+// FIXME:
+// This is a transitional hack to bypass the LLVM 3.3 built-in types.
+// See the Khronos SPIR specification for handling of these types.
+#define __texture __attribute__((address_space(4)))
+struct _image1d_t;
+typedef __texture struct _image1d_t* __image1d_t;
+struct _image1d_buffer_t;
+typedef __texture struct _image1d_buffer_t* __image1d_buffer_t;
+struct _image1d_array_t;
+typedef __texture struct _image1d_array_t* __image1d_array_t;
+struct _image2d_t;
+typedef __texture struct _image2d_t* __image2d_t;
+struct _image2d_array_t;
+typedef __texture struct _image2d_array_t* __image2d_array_t;
+struct _image3d_t;
+typedef __texture struct _image3d_t* __image3d_t;
+typedef const ushort __sampler_t;
+typedef size_t __event_t;
+#define image1d_t __image1d_t
+#define image1d_buffer_t __image1d_buffer_t
+#define image1d_array_t __image1d_array_t
+#define image2d_t __image2d_t
+#define image2d_array_t __image2d_array_t
+#define image3d_t __image3d_t
+#define sampler_t __sampler_t
+#define event_t __event_t
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL conversions & type casting
+/////////////////////////////////////////////////////////////////////////////
+
+// ##BEGIN_AS##
+
+// ##END_AS##
+
+// ##BEGIN_CONVERT##
+
+// ##END_CONVERT##
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL preprocessor directives & macros
+/////////////////////////////////////////////////////////////////////////////
+#define __OPENCL_VERSION__ 120
+#define __CL_VERSION_1_0__ 100
+#define __CL_VERSION_1_1__ 110
+#define __CL_VERSION_1_2__ 120
+
+#define __ENDIAN_LITTLE__ 1
+#define __IMAGE_SUPPORT__ 1
+#define __kernel_exec(X, TYPE) __kernel __attribute__((work_group_size_hint(X,1,1))) \
+ __attribute__((vec_type_hint(TYPE)))
+#define kernel_exec(X, TYPE) __kernel_exec(X, TYPE)
+#define cl_khr_global_int32_base_atomics
+#define cl_khr_global_int32_extended_atomics
+#define cl_khr_local_int32_base_atomics
+#define cl_khr_local_int32_extended_atomics
+#define cl_khr_byte_addressable_store
+#define cl_khr_icd
+#define cl_khr_gl_sharing
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL floating-point macros and pragmas
+/////////////////////////////////////////////////////////////////////////////
+#define FLT_DIG 6
+#define FLT_MANT_DIG 24
+#define FLT_MAX_10_EXP +38
+#define FLT_MAX_EXP +128
+#define FLT_MIN_10_EXP -37
+#define FLT_MIN_EXP -125
+#define FLT_RADIX 2
+#define FLT_ONE 1.0000000000e+00 /* 0x3F800000 */
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_EPSILON 0x1.0p-23f
+
+#define MAXFLOAT 3.40282347e38F
+INLINE_OVERLOADABLE float __ocl_inff(void) {
+ union { uint u; float f; } u;
+ u.u = 0x7F800000;
+ return u.f;
+}
+INLINE_OVERLOADABLE float __ocl_nanf(void) {
+ union { uint u; float f; } u;
+ u.u = 0x7F800001;
+ return u.f;
+}
+typedef union
+{
+ float value;
+ uint word;
+} float_shape_type;
+
+/* Get a 32 bit int from a float. */
+#ifndef GEN_OCL_GET_FLOAT_WORD
+# define GEN_OCL_GET_FLOAT_WORD(i,d) \
+do { \
+ float_shape_type gf_u; \
+ gf_u.value = (d); \
+ (i) = gf_u.word; \
+} while (0)
+#endif
+/* Set a float from a 32 bit int. */
+#ifndef GEN_OCL_SET_FLOAT_WORD
+# define GEN_OCL_SET_FLOAT_WORD(d,i) \
+do { \
+ float_shape_type sf_u; \
+ sf_u.word = (i); \
+ (d) = sf_u.value; \
+} while (0)
+#endif
+
+INLINE_OVERLOADABLE int __ocl_finitef (float x){
+ unsigned ix;
+ GEN_OCL_GET_FLOAT_WORD (ix, x);
+ return (ix & 0x7fffffff) < 0x7f800000;
+}
+
+#define HUGE_VALF (__ocl_inff())
+#define INFINITY (__ocl_inff())
+#define NAN (__ocl_nanf())
+#define M_E_F 2.718281828459045F
+#define M_LOG2E_F 1.4426950408889634F
+#define M_LOG10E_F 0.43429448190325176F
+#define M_LN2_F 0.6931471805599453F
+#define M_LN10_F 2.302585092994046F
+#define M_PI_F 3.141592653589793F
+#define M_PI_2_F 1.5707963267948966F
+#define M_PI_4_F 0.7853981633974483F
+#define M_1_PI_F 0.3183098861837907F
+#define M_2_PI_F 0.6366197723675814F
+#define M_2_SQRTPI_F 1.1283791670955126F
+#define M_SQRT2_F 1.4142135623730951F
+#define M_SQRT1_2_F 0.7071067811865476F
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL integer built-in macros
+/////////////////////////////////////////////////////////////////////////////
+#define CHAR_BIT 8
+#define CHAR_MAX SCHAR_MAX
+#define CHAR_MIN SCHAR_MIN
+#define INT_MAX 2147483647
+#define INT_MIN (-2147483647 - 1)
+#define LONG_MAX 0x7fffffffffffffffL
+#define LONG_MIN (-0x7fffffffffffffffL - 1)
+#define SCHAR_MAX 127
+#define SCHAR_MIN (-127 - 1)
+#define SHRT_MAX 32767
+#define SHRT_MIN (-32767 - 1)
+#define UCHAR_MAX 255
+#define USHRT_MAX 65535
+#define UINT_MAX 0xffffffff
+#define ULONG_MAX 0xffffffffffffffffUL
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL relational built-in functions
+/////////////////////////////////////////////////////////////////////////////
+
+int INLINE_OVERLOADABLE isequal(float x, float y) { return x == y; }
+int INLINE_OVERLOADABLE isnotequal(float x, float y) { return x != y; }
+int INLINE_OVERLOADABLE isgreater(float x, float y) { return x > y; }
+int INLINE_OVERLOADABLE isgreaterequal(float x, float y) { return x >= y; }
+int INLINE_OVERLOADABLE isless(float x, float y) { return x < y; }
+int INLINE_OVERLOADABLE islessequal(float x, float y) { return x <= y; }
+int INLINE_OVERLOADABLE islessgreater(float x, float y) { return (x < y) || (x > y); }
+
+#define SDEF(TYPE) \
+OVERLOADABLE TYPE ocl_sadd_sat(TYPE x, TYPE y); \
+OVERLOADABLE TYPE ocl_ssub_sat(TYPE x, TYPE y); \
+INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_sadd_sat(x, y); } \
+INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_ssub_sat(x, y); }
+SDEF(char);
+SDEF(short);
+#undef SDEF
+OVERLOADABLE int ocl_sadd_sat(int x, int y);
+INLINE_OVERLOADABLE int add_sat(int x, int y) { return ocl_sadd_sat(x, y); }
+OVERLOADABLE int ocl_ssub_sat(int x, int y);
+INLINE_OVERLOADABLE int sub_sat(int x, int y) {
+ return (y == 0x80000000u) ? (ocl_sadd_sat(ocl_sadd_sat(0x7fffffff, x), 1)) : ocl_ssub_sat(x, y);
+}
+OVERLOADABLE long ocl_sadd_sat(long x, long y);
+INLINE_OVERLOADABLE long add_sat(long x, long y) {
+ union {long l; uint i[2];} ux, uy;
+ ux.l = x;
+ uy.l = y;
+ if((ux.i[1] ^ uy.i[1]) & 0x80000000u)
+ return x + y;
+ return ocl_sadd_sat(x, y);
+}
+OVERLOADABLE long ocl_ssub_sat(long x, long y);
+INLINE_OVERLOADABLE long sub_sat(long x, long y) {
+ union {long l; uint i[2];} ux, uy;
+ ux.l = x;
+ uy.l = y;
+ if((ux.i[1] ^ uy.i[1]) & 0x80000000u)
+ return ocl_ssub_sat(x, y);
+ return x - y;
+}
+#define UDEF(TYPE) \
+OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y); \
+OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y); \
+INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_uadd_sat(x, y); } \
+INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_usub_sat(x, y); }
+UDEF(uchar);
+UDEF(ushort);
+UDEF(uint);
+UDEF(ulong);
+#undef UDEF
+
+INLINE_OVERLOADABLE int isfinite(float x) {
+ union { uint u; float f; } u;
+ u.f = x;
+ return (u.u & 0x7FFFFFFF) < 0x7F800000;
+}
+INLINE_OVERLOADABLE int isinf(float x) {
+ union { uint u; float f; } u;
+ u.f = x;
+ return (u.u & 0x7FFFFFFF) == 0x7F800000;
+}
+INLINE_OVERLOADABLE int isnan(float x) {
+ return x != x;
+}
+INLINE_OVERLOADABLE int isnormal(float x) {
+ union { uint u; float f; } u;
+ u.f = x;
+ u.u &= 0x7FFFFFFF;
+ return (u.u < 0x7F800000) && (u.u >= 0x800000);
+}
+INLINE_OVERLOADABLE int isordered(float x, float y) { return isequal(x, x) && isequal(y, y); }
+INLINE_OVERLOADABLE int isunordered(float x, float y) { return isnan(x) || isnan(y); }
+INLINE_OVERLOADABLE int signbit(float x) {
+ union { uint u; float f; } u;
+ u.f = x;
+ return u.u >> 31;
+}
+
+#define DEC1(type) INLINE_OVERLOADABLE int any(type a) { return a<0; }
+#define DEC2(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0; }
+#define DEC3(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0; }
+#define DEC4(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0; }
+#define DEC8(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0 || a.s4<0 || a.s5<0 || a.s6<0 || a.s7<0; }
+#define DEC16(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0 || a.s4<0 || a.s5<0 || a.s6<0 || a.s7<0 || a.s8<0 || a.s9<0 || a.sA<0 || a.sB<0 || a.sC<0 || a.sD<0 || a.sE<0 || a.sF<0; }
+DEC1(char);
+DEC1(short);
+DEC1(int);
+DEC1(long);
+#define DEC(n) DEC##n(char##n); DEC##n(short##n); DEC##n(int##n); DEC##n(long##n);
+DEC(2);
+DEC(3);
+DEC(4);
+DEC(8);
+DEC(16);
+#undef DEC
+#undef DEC1
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+#define DEC1(type) INLINE_OVERLOADABLE int all(type a) { return a<0; }
+#define DEC2(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0; }
+#define DEC3(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0; }
+#define DEC4(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0; }
+#define DEC8(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0 && a.s4<0 && a.s5<0 && a.s6<0 && a.s7<0; }
+#define DEC16(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0 && a.s4<0 && a.s5<0 && a.s6<0 && a.s7<0 && a.s8<0 && a.s9<0 && a.sA<0 && a.sB<0 && a.sC<0 && a.sD<0 && a.sE<0 && a.sF<0; }
+DEC1(char);
+DEC1(short);
+DEC1(int);
+DEC1(long);
+#define DEC(n) DEC##n(char##n); DEC##n(short##n); DEC##n(int##n); DEC##n(long##n);
+DEC(2);
+DEC(3);
+DEC(4);
+DEC(8);
+DEC(16);
+#undef DEC
+#undef DEC1
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+#define DEF(type) INLINE_OVERLOADABLE type bitselect(type a, type b, type c) { return (a & ~c) | (b & c); }
+DEF(char); DEF(uchar); DEF(short); DEF(ushort); DEF(int); DEF(uint)
+DEF(long); DEF(ulong)
+#undef DEF
+INLINE_OVERLOADABLE float bitselect(float a, float b, float c) {
+ return as_float(bitselect(as_int(a), as_int(b), as_int(c)));
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Integer built-in functions
+/////////////////////////////////////////////////////////////////////////////
+PURE CONST uint __gen_ocl_fbh(uint);
+PURE CONST uint __gen_ocl_fbl(uint);
+
+INLINE_OVERLOADABLE char clz(char x) {
+ if (x < 0)
+ return 0;
+ if (x == 0)
+ return 8;
+ return __gen_ocl_fbh(x) - 24;
+}
+
+INLINE_OVERLOADABLE uchar clz(uchar x) {
+ if (x == 0)
+ return 8;
+ return __gen_ocl_fbh(x) - 24;
+}
+
+INLINE_OVERLOADABLE short clz(short x) {
+ if (x < 0)
+ return 0;
+ if (x == 0)
+ return 16;
+ return __gen_ocl_fbh(x) - 16;
+}
+
+INLINE_OVERLOADABLE ushort clz(ushort x) {
+ if (x == 0)
+ return 16;
+ return __gen_ocl_fbh(x) - 16;
+}
+
+INLINE_OVERLOADABLE int clz(int x) {
+ if (x < 0)
+ return 0;
+ if (x == 0)
+ return 32;
+ return __gen_ocl_fbh(x);
+}
+
+INLINE_OVERLOADABLE uint clz(uint x) {
+ if (x == 0)
+ return 32;
+ return __gen_ocl_fbh(x);
+}
+
+INLINE_OVERLOADABLE long clz(long x) {
+ union { int i[2]; long x; } u;
+ u.x = x;
+ if (u.i[1] & 0x80000000u)
+ return 0;
+ if (u.i[1] == 0 && u.i[0] == 0)
+ return 64;
+ uint v = clz(u.i[1]);
+ if(v == 32)
+ v += clz(u.i[0]);
+ return v;
+}
+
+INLINE_OVERLOADABLE ulong clz(ulong x) {
+ if (x == 0)
+ return 64;
+ union { uint i[2]; ulong x; } u;
+ u.x = x;
+ uint v = clz(u.i[1]);
+ if(v == 32)
+ v += clz(u.i[0]);
+ return v;
+}
+
+OVERLOADABLE int __gen_ocl_mul_hi(int x, int y);
+OVERLOADABLE uint __gen_ocl_mul_hi(uint x, uint y);
+OVERLOADABLE long __gen_ocl_mul_hi(long x, long y);
+OVERLOADABLE ulong __gen_ocl_mul_hi(ulong x, ulong y);
+INLINE_OVERLOADABLE char mul_hi(char x, char y) { return (x * y) >> 8; }
+INLINE_OVERLOADABLE uchar mul_hi(uchar x, uchar y) { return (x * y) >> 8; }
+INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
+INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return (x * y) >> 16; }
+INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
+INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
+INLINE_OVERLOADABLE long mul_hi(long x, long y) {
+ return __gen_ocl_mul_hi(x, y);
+}
+INLINE_OVERLOADABLE ulong mul_hi(ulong x, ulong y) {
+ return __gen_ocl_mul_hi(x, y);
+}
+
+#define DEF(type) INLINE_OVERLOADABLE type mad_hi(type a, type b, type c) { return mul_hi(a, b) + c; }
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(long)
+DEF(ulong)
+#undef DEF
+
+INLINE_OVERLOADABLE int mul24(int a, int b) { return ((a << 8) >> 8) * ((b << 8) >> 8); }
+INLINE_OVERLOADABLE uint mul24(uint a, uint b) { return (a & 0xFFFFFF) * (b & 0xFFFFFF); }
+
+INLINE_OVERLOADABLE int mad24(int a, int b, int c) { return mul24(a, b) + c; }
+INLINE_OVERLOADABLE uint mad24(uint a, uint b, uint c) { return mul24(a, b) + c; }
+
+INLINE_OVERLOADABLE char mad_sat(char a, char b, char c) {
+ int x = (int)a * (int)b + (int)c;
+ if (x > 127)
+ x = 127;
+ if (x < -128)
+ x = -128;
+ return x;
+}
+
+INLINE_OVERLOADABLE uchar mad_sat(uchar a, uchar b, uchar c) {
+ uint x = (uint)a * (uint)b + (uint)c;
+ if (x > 255)
+ x = 255;
+ return x;
+}
+
+INLINE_OVERLOADABLE short mad_sat(short a, short b, short c) {
+ int x = (int)a * (int)b + (int)c;
+ if (x > 32767)
+ x = 32767;
+ if (x < -32768)
+ x = -32768;
+ return x;
+}
+
+INLINE_OVERLOADABLE ushort mad_sat(ushort a, ushort b, ushort c) {
+ uint x = (uint)a * (uint)b + (uint)c;
+ if (x > 65535)
+ x = 65535;
+ return x;
+}
+
+INLINE_OVERLOADABLE int mad_sat(int a, int b, int c) {
+ long x = (long)a * (long)b + (long)c;
+ if (x > 0x7FFFFFFF)
+ x = 0x7FFFFFFF;
+ else if (x < -0x7FFFFFFF-1)
+ x = -0x7FFFFFFF-1;
+ return (int)x;
+}
+
+INLINE_OVERLOADABLE uint mad_sat(uint a, uint b, uint c) {
+ ulong x = (ulong)a * (ulong)b + (ulong)c;
+ if (x > 0xFFFFFFFFu)
+ x = 0xFFFFFFFFu;
+ return (uint)x;
+}
+
+OVERLOADABLE long __gen_ocl_mad_sat(long a, long b, long c);
+OVERLOADABLE ulong __gen_ocl_mad_sat(ulong a, ulong b, ulong c);
+
+INLINE_OVERLOADABLE long mad_sat(long a, long b, long c) {
+ return __gen_ocl_mad_sat(a, b, c);
+}
+
+INLINE_OVERLOADABLE ulong mad_sat(ulong a, ulong b, ulong c) {
+ return __gen_ocl_mad_sat(a, b, c);
+}
+
+INLINE_OVERLOADABLE uchar __rotate_left(uchar x, uchar y) { return (x << y) | (x >> (8 - y)); }
+INLINE_OVERLOADABLE char __rotate_left(char x, char y) { return __rotate_left((uchar)x, (uchar)y); }
+INLINE_OVERLOADABLE ushort __rotate_left(ushort x, ushort y) { return (x << y) | (x >> (16 - y)); }
+INLINE_OVERLOADABLE short __rotate_left(short x, short y) { return __rotate_left((ushort)x, (ushort)y); }
+INLINE_OVERLOADABLE uint __rotate_left(uint x, uint y) { return (x << y) | (x >> (32 - y)); }
+INLINE_OVERLOADABLE int __rotate_left(int x, int y) { return __rotate_left((uint)x, (uint)y); }
+INLINE_OVERLOADABLE ulong __rotate_left(ulong x, ulong y) { return (x << y) | (x >> (64 - y)); }
+INLINE_OVERLOADABLE long __rotate_left(long x, long y) { return __rotate_left((ulong)x, (ulong)y); }
+#define DEF(type, m) INLINE_OVERLOADABLE type rotate(type x, type y) { return __rotate_left(x, (type)(y & m)); }
+DEF(char, 7)
+DEF(uchar, 7)
+DEF(short, 15)
+DEF(ushort, 15)
+DEF(int, 31)
+DEF(uint, 31)
+DEF(long, 63)
+DEF(ulong, 63)
+#undef DEF
+
+OVERLOADABLE short __gen_ocl_upsample(short hi, short lo);
+OVERLOADABLE int __gen_ocl_upsample(int hi, int lo);
+OVERLOADABLE long __gen_ocl_upsample(long hi, long lo);
+INLINE_OVERLOADABLE short upsample(char hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
+INLINE_OVERLOADABLE ushort upsample(uchar hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
+INLINE_OVERLOADABLE int upsample(short hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
+INLINE_OVERLOADABLE uint upsample(ushort hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
+INLINE_OVERLOADABLE long upsample(int hi, uint lo) {
+ return __gen_ocl_upsample((long)hi, (long)lo);
+}
+INLINE_OVERLOADABLE ulong upsample(uint hi, uint lo) {
+ return __gen_ocl_upsample((long)hi, (long)lo);
+}
+
+OVERLOADABLE uint __gen_ocl_hadd(uint x, uint y);
+OVERLOADABLE uint __gen_ocl_rhadd(uint x, uint y);
+#define DEC DEF(char); DEF(uchar); DEF(short); DEF(ushort)
+#define DEF(type) INLINE_OVERLOADABLE type hadd(type x, type y) { return (x + y) >> 1; }
+DEC
+#undef DEF
+#define DEF(type) INLINE_OVERLOADABLE type rhadd(type x, type y) { return (x + y + 1) >> 1; }
+DEC
+#undef DEF
+#undef DEC
+INLINE_OVERLOADABLE int hadd(int x, int y) {
+ return (x < 0 && y > 0) || (x > 0 && y < 0) ?
+ ((x + y) >> 1) :
+ __gen_ocl_hadd((uint)x, (uint)y);
+}
+INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
+INLINE_OVERLOADABLE int rhadd(int x, int y) {
+ return (x < 0 && y > 0) || (x > 0 && y < 0) ?
+ ((x + y + 1) >> 1) :
+ __gen_ocl_rhadd((uint)x, (uint)y);
+ }
+INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
+OVERLOADABLE ulong __gen_ocl_hadd(ulong x, ulong y);
+OVERLOADABLE ulong __gen_ocl_rhadd(ulong x, ulong y);
+INLINE_OVERLOADABLE long hadd(long x, long y) {
+ return (x < 0 && y > 0) || (x > 0 && y < 0) ?
+ ((x + y) >> 1) :
+ __gen_ocl_hadd((ulong)x, (ulong)y);
+}
+INLINE_OVERLOADABLE ulong hadd(ulong x, ulong y) {
+ return __gen_ocl_hadd(x, y);
+}
+INLINE_OVERLOADABLE long rhadd(long x, long y) {
+ return (x < 0 && y > 0) || (x > 0 && y < 0) ?
+ ((x + y + 1) >> 1) :
+ __gen_ocl_rhadd((ulong)x, (ulong)y);
+}
+INLINE_OVERLOADABLE ulong rhadd(ulong x, ulong y) {
+ return __gen_ocl_rhadd(x, y);
+}
+
+int __gen_ocl_abs(int x);
+#define DEC(TYPE) INLINE_OVERLOADABLE u##TYPE abs(TYPE x) { return (u##TYPE) __gen_ocl_abs(x); }
+DEC(int)
+DEC(short)
+DEC(char)
+#undef DEC
+INLINE_OVERLOADABLE ulong abs(long x) { return x < 0 ? -x : x; }
+/* For unsigned types, do nothing. */
+#define DEC(TYPE) INLINE_OVERLOADABLE TYPE abs(TYPE x) { return x; }
+DEC(uint)
+DEC(ushort)
+DEC(uchar)
+DEC(ulong)
+#undef DEC
+
+/* Char and short type abs diff */
+/* promote char and short to int and will be no module overflow */
+#define DEC(TYPE, UTYPE) INLINE_OVERLOADABLE UTYPE abs_diff(TYPE x, TYPE y) \
+ { return (UTYPE) (abs((int)x - (int)y)); }
+DEC(char, uchar)
+DEC(uchar, uchar)
+DEC(short, ushort)
+DEC(ushort, ushort)
+#undef DEC
+
+INLINE_OVERLOADABLE uint abs_diff (uint x, uint y) {
+ /* same signed will never overflow. */
+ return y > x ? (y -x) : (x - y);
+}
+
+INLINE_OVERLOADABLE uint abs_diff (int x, int y) {
+ /* same signed will never module overflow. */
+ if ((x >= 0 && y >= 0) || (x <= 0 && y <= 0))
+ return abs(x - y);
+
+ return (abs(x) + abs(y));
+}
+
+INLINE_OVERLOADABLE ulong abs_diff (long x, long y) {
+ if ((x >= 0 && y >= 0) || (x <= 0 && y <= 0))
+ return abs(x - y);
+ return abs(x) + abs(y);
+}
+INLINE_OVERLOADABLE ulong abs_diff (ulong x, ulong y) {
+ return y > x ? (y - x) : (x - y);
+}
+
+
+/////////////////////////////////////////////////////////////////////////////
+// SIMD level function
+/////////////////////////////////////////////////////////////////////////////
+short __gen_ocl_simd_any(short);
+short __gen_ocl_simd_all(short);
+
+
+/////////////////////////////////////////////////////////////////////////////
+// Work Items functions (see 6.11.1 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+
+PURE CONST uint __gen_ocl_get_work_dim(void);
+INLINE uint get_work_dim(void) {
+ return __gen_ocl_get_work_dim();
+}
+
+#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
+PURE CONST unsigned int __gen_ocl_##NAME##0(void); \
+PURE CONST unsigned int __gen_ocl_##NAME##1(void); \
+PURE CONST unsigned int __gen_ocl_##NAME##2(void);
+DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
+DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
+#undef DECL_INTERNAL_WORK_ITEM_FN
+
+#define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET) \
+INLINE unsigned NAME(unsigned int dim) { \
+ if (dim == 0) return __gen_ocl_##NAME##0(); \
+ else if (dim == 1) return __gen_ocl_##NAME##1(); \
+ else if (dim == 2) return __gen_ocl_##NAME##2(); \
+ else return OTHER_RET; \
+}
+
+DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_local_id, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_local_size, 1)
+DECL_PUBLIC_WORK_ITEM_FN(get_global_size, 1)
+DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
+#undef DECL_PUBLIC_WORK_ITEM_FN
+
+INLINE uint get_global_id(uint dim) {
+ return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Math Functions (see 6.11.2 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+PURE CONST float __gen_ocl_fabs(float x);
+PURE CONST float __gen_ocl_sin(float x);
+PURE CONST float __gen_ocl_cos(float x);
+PURE CONST float __gen_ocl_sqrt(float x);
+PURE CONST float __gen_ocl_rsqrt(float x);
+PURE CONST float __gen_ocl_log(float x);
+PURE CONST float __gen_ocl_exp(float x);
+PURE CONST float __gen_ocl_pow(float x, float y);
+PURE CONST float __gen_ocl_rcp(float x);
+PURE CONST float __gen_ocl_rndz(float x);
+PURE CONST float __gen_ocl_rnde(float x);
+PURE CONST float __gen_ocl_rndu(float x);
+PURE CONST float __gen_ocl_rndd(float x);
+INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
+ union { unsigned u; float f; } ux, uy;
+ ux.f = x;
+ uy.f = y;
+ ux.u = (ux.u & 0x7fffffff) | (uy.u & 0x80000000u);
+ return ux.f;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_log(float x) {
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ union { unsigned int i; float f; } u;
+ const float
+ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
+ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
+ two25 = 3.355443200e+07, /* 0x4c000000 */
+ Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
+ Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
+ Lg3 = 2.8571429849e-01, /* 3E924925 */
+ Lg4 = 2.2222198546e-01, /* 3E638E29 */
+ Lg5 = 1.8183572590e-01, /* 3E3A3325 */
+ Lg6 = 1.5313838422e-01, /* 3E1CD04F */
+ Lg7 = 1.4798198640e-01; /* 3E178897 */
+
+ const float zero = 0.0;
+ float hfsq,f,s,z,R,w,t1,t2,dk;
+ int k,ix,i,j;
+
+ u.f = x; ix = u.i;
+ k=0;
+ if (ix < 0x00800000) { /* x < 2**-126 */
+ if ((ix&0x7fffffff)==0)
+ return -two25/zero; /* log(+-0)=-inf */
+ if (ix<0) return (x-x)/zero; /* log(-#) = NaN */
+ return -INFINITY; /* Gen does not support subnormal number now */
+ //k -= 25; x *= two25; /* subnormal number, scale up x */
+ //u.f = x; ix = u.i;
+ }
+ if (ix >= 0x7f800000) return x+x;
+ k += (ix>>23)-127;
+ ix &= 0x007fffff;
+ i = (ix+(0x95f64<<3))&0x800000;
+ u.i = ix|(i^0x3f800000); x = u.f;
+ k += (i>>23);
+ f = x-(float)1.0;
+ if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */
+ if(f==zero) {
+ if(k==0) return zero;
+ else {
+ dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
+ }
+ }
+ R = f*f*((float)0.5-(float)0.33333333333333333*f);
+ if(k==0)
+ return f-R;
+ else {
+ dk=(float)k; return dk*ln2_hi-((R-dk*ln2_lo)-f);
+ }
+ }
+ s = f/((float)2.0+f);
+ dk = (float)k;
+ z = s*s;
+ i = ix-(0x6147a<<3);
+ w = z*z;
+ j = (0x6b851<<3)-ix;
+ t1= w*(Lg2+w*(Lg4+w*Lg6));
+ t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+ i |= j;
+ R = t2+t1;
+ if(i>0) {
+ hfsq=(float)0.5*f*f;
+ if(k==0) return f-(hfsq-s*(hfsq+R)); else
+ return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
+ } else {
+ if(k==0) return f-s*(f-R); else
+ return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
+ }
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_log10(float x) {
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ union {float f; unsigned i; }u;
+ const float
+ zero = 0.0,
+ two25 = 3.3554432000e+07, /* 0x4c000000 */
+ ivln10 = 4.3429449201e-01, /* 0x3ede5bd9 */
+ log10_2hi = 3.0102920532e-01, /* 0x3e9a2080 */
+ log10_2lo = 7.9034151668e-07; /* 0x355427db */
+
+ float y,z;
+ int i,k,hx;
+
+ u.f = x; hx = u.i;
+ k=0;
+ if (hx < 0x00800000) { /* x < 2**-126 */
+ if ((hx&0x7fffffff)==0)
+ return -two25/zero; /* log(+-0)=-inf */
+ if (hx<0) return NAN; /* log(-#) = NaN */
+ return -INFINITY; /* Gen does not support subnormal now */
+ //k -= 25; x *= two25; /* subnormal number, scale up x */
+ //u.f = x; hx = u.i;
+ }
+ if (hx >= 0x7f800000) return x+x;
+ k += (hx>>23)-127;
+ i = ((unsigned)k&0x80000000)>>31;
+ hx = (hx&0x007fffff)|((0x7f-i)<<23);
+ y = (float)(k+i);
+ u.i = hx; x = u.f;
+ z = y*log10_2lo + ivln10*__gen_ocl_internal_log(x);
+ return z+y*log10_2hi;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_log2(float x) {
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * adapted for log2 by Ulrich Drepper <drepper at cygnus.com>
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ const float zero = 0.0,
+ ln2 = 0.69314718055994530942,
+ two25 = 3.355443200e+07, /** 0x4c000000 */
+ Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
+ Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
+ Lg3 = 2.8571429849e-01, /** 3E924925 */
+ Lg4 = 2.2222198546e-01, /** 3E638E29 */
+ Lg5 = 1.8183572590e-01, /** 3E3A3325 */
+ Lg6 = 1.5313838422e-01, /** 3E1CD04F */
+ Lg7 = 1.4798198640e-01; /** 3E178897 */
+
+ float hfsq,f,s,z,R,w,t1,t2,dk;
+ int k,ix,i,j;
+
+ union {float f; int i; }u;//GET_FLOAT_WORD(ix,x);
+ u.f = x; ix = u.i;
+
+ k=0;
+ if (ix < 0x00800000) { /** x < 2**-126 */
+ if ((ix&0x7fffffff)==0)
+ return -two25/(x-x); /** log(+-0)=-inf */
+
+ if (ix<0) return (x-x)/(x-x); /** log(-#) = NaN */
+ return -INFINITY;
+ k -= 25; x *= two25; /** subnormal number, scale up x */
+ u.f = x; ix = u.i; //GET_FLOAT_WORD(ix,x);
+ }
+
+ if (ix >= 0x7f800000) return x+x;
+
+ k += (ix>>23)-127;
+ ix &= 0x007fffff;
+ i = (ix+(0x95f64<<3))&0x800000;
+
+ u.i = ix|(i^0x3f800000); x = u.f;//SET_FLOAT_WORD(x,ix|(i^0x3f800000)); /** normalize x or x/2 */
+ k += (i>>23);
+ dk = (float)k;
+ f = x-(float)1.0;
+
+ if((0x007fffff&(15+ix))<16) { /** |f| < 2**-20 */
+ if(f==zero) return dk;
+
+ R = f*f*((float)0.5-(float)0.33333333333333333*f);
+ return dk-(R-f)/ln2;
+ }
+
+ s = f/((float)2.0+f);
+ z = s*s;
+ i = ix-(0x6147a<<3);
+ w = z*z;
+ j = (0x6b851<<3)-ix;
+ t1= w*(Lg2+w*(Lg4+w*Lg6));
+ t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+ i |= j;
+ R = t2+t1;
+
+ if(i>0) {
+ hfsq=(float)0.5*f*f;
+ return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
+ } else {
+ return dk-((s*(f-R))-f)/ln2;
+ }
+}
+
+INLINE float __gen_ocl_scalbnf (float x, int n){
+ /* copy from fdlibm */
+ float two25 = 3.355443200e+07, /* 0x4c000000 */
+ twom25 = 2.9802322388e-08, /* 0x33000000 */
+ huge = 1.0e+30,
+ tiny = 1.0e-30;
+ int k,ix;
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ k = (ix&0x7f800000)>>23; /* extract exponent */
+ if (k==0) { /* 0 or subnormal x */
+ if ((ix&0x7fffffff)==0) return x; /* +-0 */
+ x *= two25;
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ k = ((ix&0x7f800000)>>23) - 25;
+ }
+ if (k==0xff) return x+x; /* NaN or Inf */
+ if (n< -50000)
+ return tiny*__gen_ocl_internal_copysign(tiny,x); /*underflow*/
+ if (n> 50000 || k+n > 0xfe)
+ return huge*__gen_ocl_internal_copysign(huge,x); /* overflow */
+ /* Now k and n are bounded we know that k = k+n does not overflow. */
+ k = k+n;
+ if (k > 0) { /* normal result */
+ GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+ return x;
+ }
+ if (k <= -25)
+ return tiny*__gen_ocl_internal_copysign(tiny,x); /*underflow*/
+ k += 25; /* subnormal result */
+ GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+ return x*twom25;
+}
+
+
+
+__constant const float PIo2[] = {
+ 1.5703125000e+00, /* 0x3fc90000 */
+ 4.5776367188e-04, /* 0x39f00000 */
+ 2.5987625122e-05, /* 0x37da0000 */
+ 7.5437128544e-08, /* 0x33a20000 */
+ 6.0026650317e-11, /* 0x2e840000 */
+ 7.3896444519e-13, /* 0x2b500000 */
+ 5.3845816694e-15, /* 0x27c20000 */
+ 5.6378512969e-18, /* 0x22d00000 */
+ 8.3009228831e-20, /* 0x1fc40000 */
+ 3.2756352257e-22, /* 0x1bc60000 */
+ 6.3331015649e-25, /* 0x17440000 */
+};
+
+INLINE int __kernel_rem_pio2f(float *x, float *y, int e0, int nx, int prec, const __constant int *ipio2)
+{
+ /* copied from fdlibm */
+const float
+zero = 0.0,
+one = 1.0,
+two8 = 2.5600000000e+02, /* 0x43800000 */
+twon8 = 3.9062500000e-03; /* 0x3b800000 */
+
+ int init_jk[3]; /* initial value for jk */
+ int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
+ float z,fw,f[20],fq[20],q[20];
+ init_jk[0] = 4; init_jk[1] = 7; init_jk[2] = 9;
+ /* initialize jk*/
+ jk = init_jk[prec];
+ jp = jk;
+
+ /* determine jx,jv,q0, note that 3>q0 */
+ jx = nx-1;
+ jv = (e0-3)/8; if(jv<0) jv=0;
+ q0 = e0-8*(jv+1);
+
+ /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
+ j = jv-jx; m = jx+jk;
+ for(i=0;i<=m;i++,j++) f[i] = (j<0)? zero : (float) ipio2[j];
+
+ /* compute q[0],q[1],...q[jk] */
+ for (i=0;i<=jk;i++) {
+ for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
+ }
+
+ jz = jk;
+recompute:
+ /* distill q[] into iq[] reversingly */
+ for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
+ fw = (float)((int)(twon8* z));
+ iq[i] = (int)(z-two8*fw);
+ z = q[j-1]+fw;
+ }
+
+ /* compute n */
+ z = __gen_ocl_scalbnf(z,q0); /* actual value of z */
+ z -= (float)8.0*__gen_ocl_internal_floor(z*(float)0.125); /* trim off integer >= 8 */
+ n = (int) z;
+ z -= (float)n;
+ ih = 0;
+ if(q0>0) { /* need iq[jz-1] to determine n */
+ i = (iq[jz-1]>>(8-q0)); n += i;
+ iq[jz-1] -= i<<(8-q0);
+ ih = iq[jz-1]>>(7-q0);
+ }
+ else if(q0==0) ih = iq[jz-1]>>8;
+ else if(z>=(float)0.5) ih=2;
+
+ if(ih>0) { /* q > 0.5 */
+ n += 1; carry = 0;
+ for(i=0;i<jz ;i++) { /* compute 1-q */
+ j = iq[i];
+ if(carry==0) {
+ if(j!=0) {
+ carry = 1; iq[i] = 0x100- j;
+ }
+ } else iq[i] = 0xff - j;
+ }
+ if(q0>0) { /* rare case: chance is 1 in 12 */
+ switch(q0) {
+ case 1:
+ iq[jz-1] &= 0x7f; break;
+ case 2:
+ iq[jz-1] &= 0x3f; break;
+ }
+ }
+ if(ih==2) {
+ z = one - z;
+ if(carry!=0) z -= __gen_ocl_scalbnf(one,q0);
+ }
+ }
+
+ /* check if recomputation is needed */
+ if(z==zero) {
+ j = 0;
+ for (i=jz-1;i>=jk;i--) j |= iq[i];
+ if(j==0) { /* need recomputation */
+ for(k=1;iq[jk-k]==0;k++); /* k = no. of terms needed */
+
+ for(i=jz+1;i<=jz+k;i++) { /* add q[jz+1] to q[jz+k] */
+ f[jx+i] = (float) ipio2[jv+i];
+ for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
+ q[i] = fw;
+ }
+ jz += k;
+ goto recompute;
+ }
+ }
+
+ /* chop off zero terms */
+ if(z==(float)0.0) {
+ jz -= 1; q0 -= 8;
+ while(iq[jz]==0) { jz--; q0-=8;}
+ } else { /* break z into 8-bit if necessary */
+ z = __gen_ocl_scalbnf(z,-q0);
+ if(z>=two8) {
+ fw = (float)((int)(twon8*z));
+ iq[jz] = (int)(z-two8*fw);
+ jz += 1; q0 += 8;
+ iq[jz] = (int) fw;
+ } else iq[jz] = (int) z ;
+ }
+
+ /* convert integer "bit" chunk to floating-point value */
+ fw = __gen_ocl_scalbnf(one,q0);
+ for(i=jz;i>=0;i--) {
+ q[i] = fw*(float)iq[i]; fw*=twon8;
+ }
+
+ /* compute PIo2[0,...,jp]*q[jz,...,0] */
+ for(i=jz;i>=0;i--) {
+ for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
+ fq[jz-i] = fw;
+ }
+
+ /* compress fq[] into y[] */
+ switch(prec) {
+ case 0:
+ fw = 0.0;
+ for (i=jz;i>=0;i--) fw += fq[i];
+ y[0] = (ih==0)? fw: -fw;
+ break;
+ case 1:
+ case 2:
+ fw = 0.0;
+ for (i=jz;i>=0;i--) fw += fq[i];
+ y[0] = (ih==0)? fw: -fw;
+ fw = fq[0]-fw;
+ for (i=1;i<=jz;i++) fw += fq[i];
+ y[1] = (ih==0)? fw: -fw;
+ break;
+ case 3: /* painful */
+ for (i=jz;i>0;i--) {
+ fw = fq[i-1]+fq[i];
+ fq[i] += fq[i-1]-fw;
+ fq[i-1] = fw;
+ }
+ for (i=jz;i>1;i--) {
+ fw = fq[i-1]+fq[i];
+ fq[i] += fq[i-1]-fw;
+ fq[i-1] = fw;
+ }
+ for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
+ if(ih==0) {
+ y[0] = fq[0]; y[1] = fq[1]; y[2] = fw;
+ } else {
+ y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
+ }
+ }
+ return n&7;
+
+}
+__constant const int npio2_hw[32] = {
+0x3fc90f00, 0x40490f00, 0x4096cb00, 0x40c90f00, 0x40fb5300, 0x4116cb00,
+0x412fed00, 0x41490f00, 0x41623100, 0x417b5300, 0x418a3a00, 0x4196cb00,
+0x41a35c00, 0x41afed00, 0x41bc7e00, 0x41c90f00, 0x41d5a000, 0x41e23100,
+0x41eec200, 0x41fb5300, 0x4203f200, 0x420a3a00, 0x42108300, 0x4216cb00,
+0x421d1400, 0x42235c00, 0x4229a500, 0x422fed00, 0x42363600, 0x423c7e00,
+0x4242c700, 0x42490f00
+};
+
+__constant const int two_over_pi[22*9] = {
+0xA2, 0xF9, 0x83, 0x6E, 0x4E, 0x44, 0x15, 0x29, 0xFC,
+0x27, 0x57, 0xD1, 0xF5, 0x34, 0xDD, 0xC0, 0xDB, 0x62,
+0x95, 0x99, 0x3C, 0x43, 0x90, 0x41, 0xFE, 0x51, 0x63,
+0xAB, 0xDE, 0xBB, 0xC5, 0x61, 0xB7, 0x24, 0x6E, 0x3A,
+0x42, 0x4D, 0xD2, 0xE0, 0x06, 0x49, 0x2E, 0xEA, 0x09,
+0xD1, 0x92, 0x1C, 0xFE, 0x1D, 0xEB, 0x1C, 0xB1, 0x29,
+0xA7, 0x3E, 0xE8, 0x82, 0x35, 0xF5, 0x2E, 0xBB, 0x44,
+0x84, 0xE9, 0x9C, 0x70, 0x26, 0xB4, 0x5F, 0x7E, 0x41,
+0x39, 0x91, 0xD6, 0x39, 0x83, 0x53, 0x39, 0xF4, 0x9C,
+0x84, 0x5F, 0x8B, 0xBD, 0xF9, 0x28, 0x3B, 0x1F, 0xF8,
+0x97, 0xFF, 0xDE, 0x05, 0x98, 0x0F, 0xEF, 0x2F, 0x11,
+0x8B, 0x5A, 0x0A, 0x6D, 0x1F, 0x6D, 0x36, 0x7E, 0xCF,
+0x27, 0xCB, 0x09, 0xB7, 0x4F, 0x46, 0x3F, 0x66, 0x9E,
+0x5F, 0xEA, 0x2D, 0x75, 0x27, 0xBA, 0xC7, 0xEB, 0xE5,
+0xF1, 0x7B, 0x3D, 0x07, 0x39, 0xF7, 0x8A, 0x52, 0x92,
+0xEA, 0x6B, 0xFB, 0x5F, 0xB1, 0x1F, 0x8D, 0x5D, 0x08,
+0x56, 0x03, 0x30, 0x46, 0xFC, 0x7B, 0x6B, 0xAB, 0xF0,
+0xCF, 0xBC, 0x20, 0x9A, 0xF4, 0x36, 0x1D, 0xA9, 0xE3,
+0x91, 0x61, 0x5E, 0xE6, 0x1B, 0x08, 0x65, 0x99, 0x85,
+0x5F, 0x14, 0xA0, 0x68, 0x40, 0x8D, 0xFF, 0xD8, 0x80,
+0x4D, 0x73, 0x27, 0x31, 0x06, 0x06, 0x15, 0x56, 0xCA,
+0x73, 0xA8, 0xC9, 0x60, 0xE2, 0x7B, 0xC0, 0x8C, 0x6B,
+};
+
+
+
+INLINE int __ieee754_rem_pio2f(float x, float *y) {
+ /* copied from fdlibm */
+ float z,w,t,r,fn;
+ float tx[3];
+
+const float half_value = 5.0000000e-1;
+const float zero = 0.0000000000;
+const float two8 = 2.5600000000e+02;
+const float invpio2 = 6.3661980629e-01;
+const float pio2_1 = 1.5707855225e+00;
+const float pio2_1t = 1.0804334124e-05;
+const float pio2_2 = 1.0804273188e-05;
+const float pio2_2t = 6.0770999344e-11;
+const float pio2_3 = 6.0770943833e-11;
+const float pio2_3t = 6.1232342629e-17;
+ int e0,i,j,nx,n,ix,hx;
+
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff;
+ if(ix<=0x3f490fd8) /* |x| ~<= pi/4 , no need for reduction */
+ {y[0] = x; y[1] = 0; return 0;}
+ if(ix<0x4016cbe4) { /* |x| < 3pi/4, special case with n=+-1 */
+ if(hx>0) {
+ z = x - pio2_1;
+ if((ix&0xfffffff0)!=0x3fc90fd0) { /* 24+24 bit pi OK */
+ y[0] = z - pio2_1t;
+ y[1] = (z-y[0])-pio2_1t;
+ } else { /* near pi/2, use 24+24+24 bit pi */
+ z -= pio2_2;
+ y[0] = z - pio2_2t;
+ y[1] = (z-y[0])-pio2_2t;
+ }
+ return 1;
+ } else { /* negative x */
+ z = x + pio2_1;
+ if((ix&0xfffffff0)!=0x3fc90fd0) { /* 24+24 bit pi OK */
+ y[0] = z + pio2_1t;
+ y[1] = (z-y[0])+pio2_1t;
+ } else { /* near pi/2, use 24+24+24 bit pi */
+ z += pio2_2;
+ y[0] = z + pio2_2t;
+ y[1] = (z-y[0])+pio2_2t;
+ }
+ return -1;
+ }
+ }
+ if(ix<=0x43490f80) { /* |x| ~<= 2^7*(pi/2), medium size */
+ t = __gen_ocl_fabs(x);
+ n = (int) (t*invpio2+half_value);
+ fn = (float)n;
+ r = t-fn*pio2_1;
+ w = fn*pio2_1t; /* 1st round good to 40 bit */
+ if(n<32&&(ix&0xffffff00)!=npio2_hw[n-1]) {
+ y[0] = r-w; /* quick check no cancellation */
+ } else {
+ uint high;
+ j = ix>>23;
+ y[0] = r-w;
+ GEN_OCL_GET_FLOAT_WORD(high,y[0]);
+ i = j-((high>>23)&0xff);
+ if(i>8) { /* 2nd iteration needed, good to 57 */
+ t = r;
+ w = fn*pio2_2;
+ r = t-w;
+ w = fn*pio2_2t-((t-r)-w);
+ y[0] = r-w;
+ GEN_OCL_GET_FLOAT_WORD(high,y[0]);
+ i = j-((high>>23)&0xff);
+ if(i>25) { /* 3rd iteration need, 74 bits acc */
+ t = r; /* will cover all possible cases */
+ w = fn*pio2_3;
+ r = t-w;
+ w = fn*pio2_3t-((t-r)-w);
+ y[0] = r-w;
+ }
+ }
+ }
+ y[1] = (r-y[0])-w;
+ if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+ else return n;
+ }
+ /*
+ * all other (large) arguments
+ */
+ if(ix>=0x7f800000) { /* x is inf or NaN */
+ y[0]=y[1]=x-x; return 0;
+ }
+ /* set z = scalbn(|x|,ilogb(x)-7) */
+ e0 = (ix>>23)-134; /* e0 = ilogb(z)-7; */
+ GEN_OCL_SET_FLOAT_WORD(z, ix - ((int)(e0<<23)));
+ for(i=0;i<2;i++) {
+ tx[i] = (float)((int)(z));
+ z = (z-tx[i])*two8;
+ }
+ tx[2] = z;
+ nx = 3;
+ while(tx[nx-1]==zero) nx--; /* skip zero term */
+ n = __kernel_rem_pio2f(tx,y,e0,nx,2,two_over_pi);
+ if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+ return n;
+}
+
+INLINE_OVERLOADABLE float __kernel_sinf(float x, float y, int iy)
+{
+ /* copied from fdlibm */
+const float
+half_value = 5.0000000000e-01,/* 0x3f000000 */
+S1 = -1.6666667163e-01, /* 0xbe2aaaab */
+S2 = 8.3333337680e-03, /* 0x3c088889 */
+S3 = -1.9841270114e-04, /* 0xb9500d01 */
+S4 = 2.7557314297e-06, /* 0x3638ef1b */
+S5 = -2.5050759689e-08, /* 0xb2d72f34 */
+S6 = 1.5896910177e-10; /* 0x2f2ec9d3 */
+ float z,r,v;
+ int ix;
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ ix &= 0x7fffffff; /* high word of x */
+ if(ix<0x32000000) /* |x| < 2**-27 */
+ {if((int)x==0) return x;} /* generate inexact */
+ z = x*x;
+ v = z*x;
+ r = S2+z*(S3+z*(S4+z*(S5+z*S6)));
+ if(iy==0) return x+v*(S1+z*r);
+ else return x-((z*(half_value*y-v*r)-y)-v*S1);
+}
+
+INLINE float __kernel_cosf(float x, float y)
+{
+ /* copied from fdlibm */
+ const float
+ one = 1.0000000000e+00, /* 0x3f800000 */
+ C1 = 4.1666667908e-02, /* 0x3d2aaaab */
+ C2 = -1.3888889225e-03, /* 0xbab60b61 */
+ C3 = 2.4801587642e-05, /* 0x37d00d01 */
+ C4 = -2.7557314297e-07, /* 0xb493f27c */
+ C5 = 2.0875723372e-09, /* 0x310f74f6 */
+ C6 = -1.1359647598e-11; /* 0xad47d74e */
+ const float pio2_hi = 0x1.92p0, pio2_mid = 0x1.fb4p-12, pio2_low = 0x1.4442d2p-24;
+ float a,hz,z,r,qx;
+ int ix;
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ ix &= 0x7fffffff; /* ix = |x|'s high word*/
+ if(ix<0x32000000) { /* if x < 2**27 */
+ if(((int)x)==0) return one; /* generate inexact */
+ }
+
+ if(x < 0.0f) { x= -x; y = -y; }
+ if(ix > 0x3f490fdb) { /* |x|>pi/4*/
+ return -__kernel_sinf(x-pio2_hi-pio2_mid-pio2_low, y, 1);
+ }
+ z = x*x;
+ r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
+ if(ix < 0x3e99999a) /* if |x| < 0.3 */
+ return one - ((float)0.5*z - (z*r - x*y));
+ else {
+ GEN_OCL_SET_FLOAT_WORD(qx,ix-0x01000000); /* x/4 */
+ hz = (float)0.5*z-qx;
+ a = one-qx;
+ return a - (hz - (z*r-x*y));
+ }
+}
+
+INLINE_OVERLOADABLE float sin(float x) {
+ /* copied from fdlibm */
+ float y[2],z=0.0;
+ int n, ix;
+
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+
+ /* |x| ~< pi/4 */
+ ix &= 0x7fffffff;
+ if(ix <= 0x3f490fd8) return __kernel_sinf(x,z,0);
+
+ /* sin(Inf or NaN) is NaN */
+ else if (ix>=0x7f800000) return x-x;
+
+ /* argument reduction needed */
+ else {
+ n = __ieee754_rem_pio2f(x,y);
+ switch(n&3) {
+ case 0: return __kernel_sinf(y[0],y[1],1);
+ case 1: return __kernel_cosf(y[0],y[1]);
+ case 2: return -__kernel_sinf(y[0],y[1],1);
+ default:
+ return -__kernel_cosf(y[0],y[1]);
+ }
+ }
+}
+INLINE_OVERLOADABLE float cos(float x) {
+ /* copied from fdlibm */
+ float y[2],z=0.0;
+ int n, ix;
+
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+
+ /* |x| ~< pi/4 */
+ ix &= 0x7fffffff;
+ if(ix <= 0x3f490fd8) return __kernel_cosf(x,z);
+
+ /* cos(Inf or NaN) is NaN */
+ else if (ix>=0x7f800000) return x-x;
+
+ /* argument reduction needed */
+ else {
+ n = __ieee754_rem_pio2f(x,y);
+ switch(n&3) {
+ case 0: return __kernel_cosf(y[0],y[1]);
+ case 1: return -__kernel_sinf(y[0],y[1],1);
+ case 2: return -__kernel_cosf(y[0],y[1]);
+ default:
+ return __kernel_sinf(y[0],y[1],1);
+ }
+ }
+}
+
+INLINE float __kernel_tanf(float x, float y, int iy)
+{
+ /* copied from fdlibm */
+ float z,r,v,w,s;
+ int ix,hx;
+ const float
+ one = 1.0000000000e+00, /* 0x3f800000 */
+ pio4 = 7.8539812565e-01, /* 0x3f490fda */
+ pio4lo= 3.7748947079e-08; /* 0x33222168 */
+ float T[13];// = {
+ T[0] = 3.3333334327e-01; /* 0x3eaaaaab */
+ T[1] = 1.3333334029e-01; /* 0x3e088889 */
+ T[2] = 5.3968254477e-02; /* 0x3d5d0dd1 */
+ T[3] = 2.1869488060e-02; /* 0x3cb327a4 */
+ T[4] = 8.8632395491e-03; /* 0x3c11371f */
+ T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
+ T[6] = 1.4562094584e-03; /* 0x3abede48 */
+ T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
+ T[8] = 2.4646313977e-04; /* 0x398137b9 */
+ T[9] = 7.8179444245e-05; /* 0x38a3f445 */
+ T[10] = 7.1407252108e-05; /* 0x3895c07a */
+ T[11] = -1.8558637748e-05; /* 0xb79bae5f */
+ T[12] = 2.5907305826e-05; /* 0x37d95384 */
+
+
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff; /* high word of |x| */
+ if(ix<0x31800000) /* x < 2**-28 */
+ {if((int)x==0) { /* generate inexact */
+ if((ix|(iy+1))==0) return one/__gen_ocl_fabs(x);
+ else return (iy==1)? x: -one/x;
+ }
+ }
+ if(ix>=0x3f2ca140) { /* |x|>=0.6744 */
+ if(hx<0) {x = -x; y = -y;}
+
+
+ z = pio4-x;
+ w = pio4lo-y;
+ x = z+w; y = 0.0;
+ }
+ z = x*x;
+ w = z*z;
+ /* Break x^5*(T[1]+x^2*T[2]+...) into
+ * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
+ * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
+ */
+ r = T[1]+w*(T[3]+w*(T[5]+w*(T[7]+w*(T[9]+w*T[11]))));
+ v = z*(T[2]+w*(T[4]+w*(T[6]+w*(T[8]+w*(T[10]+w*T[12])))));
+ s = z*x;
+ r = y + z*(s*(r+v)+y);
+ r += T[0]*s;
+ w = x+r;
+ if(ix>=0x3f2ca140) {
+ v = (float)iy;
+ return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
+ }
+ if(iy==1) return w;
+ else { /* if allow error up to 2 ulp
+ simply return -1.0/(x+r) here */
+ /* compute -1.0/(x+r) accurately */
+ float a,t;
+ int i;
+ z = w;
+ GEN_OCL_GET_FLOAT_WORD(i,z);
+ GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000);
+ v = r-(z - x); /* z+v = r+x */
+ t = a = -(float)1.0/w; /* a = -1.0/w */
+ GEN_OCL_GET_FLOAT_WORD(i,t);
+ GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000);
+ s = (float)1.0+t*z;
+ return t+a*(s+t*v);
+ }
+}
+
+INLINE_OVERLOADABLE float tan(float x)
+{
+ /* copied from fdlibm */
+ const float pio2_hi = 0x1.92p-0, pio2_mid = 0x1.fb4p-12, pio2_low = 0x1.4442d2p-24;
+ const float pio4 = 7.8539812565e-01;
+ float y[2],z=0.0;
+ int n, ix;
+
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+
+ /* |x| ~< pi/4 */
+ ix &= 0x7fffffff;
+ if(ix <= 0x3f490fda) return __kernel_tanf(x,z,1);
+
+ /* tan(Inf or NaN) is NaN */
+ else if (ix>=0x7f800000) return x-x; /* NaN */
+
+ /* argument reduction needed */
+ else {
+ n = __ieee754_rem_pio2f(x,y);
+
+ x = y[0];
+ float m = y[1];
+ int iy = 1-((n&1)<<1);
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ float sign = 1.0f;
+ if(ix < 0) {
+ x = -x; m = -m;
+ sign = -1.0f;
+ }
+
+ if(x > pio4) {/* reduce x to less than pi/4 through (pi/2-x) */
+ float t = __kernel_tanf(pio2_hi-x+pio2_mid+pio2_low, -m, 1);
+ if(iy == -1) return sign*(-t); else return sign*1/t;
+ } else
+ return __kernel_tanf(y[0],y[1],1-((n&1)<<1)); /* 1 -- n even
+ -1 -- n odd */
+ }
+}
+
+INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
+ int ix;
+ if(isinf(x) || isnan(x)) { return NAN; }
+ if(x < 0.0f) { x = -x; }
+ GEN_OCL_GET_FLOAT_WORD(ix, x);
+ if(x> 0x1.0p24) return 1.0f;
+ float m = __gen_ocl_internal_floor(x);
+ ix = (int)m;
+ m = x-m;
+ if((ix&0x1) != 0) m+=1.0f;
+ ix = __gen_ocl_internal_floor(m*4.0f);
+
+ switch(ix) {
+ case 0:
+ return __kernel_cosf(m*M_PI_F, 0.0f);
+ case 1:
+ case 2:
+ return __kernel_sinf((0.5f-m)*M_PI_F, 0.0f, 0);
+ case 3:
+ case 4:
+ return -__kernel_cosf((m-1.0f)*M_PI_F, 0.0f);
+ case 5:
+ case 6:
+ return __kernel_sinf((m-1.5f)*M_PI_F, 0.0f, 0);
+ default:
+ return __kernel_cosf((2.0f-m)*M_PI_F, 0.0f);
+ }
+}
+INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+ float sign = 1.0f;
+ int ix;
+ if(isinf(x)) return NAN;
+ if(x < 0.0f) { x = -x; sign = -1.0f; }
+ GEN_OCL_GET_FLOAT_WORD(ix, x);
+ if(x> 0x1.0p24) return 0.0f;
+ float m = __gen_ocl_internal_floor(x);
+ ix = (int)m;
+ m = x-m;
+ if((ix&0x1) != 0) m+=1.0f;
+ ix = __gen_ocl_internal_floor(m*4.0f);
+
+ switch(ix) {
+ case 0:
+ return sign*__kernel_sinf(m*M_PI_F, 0.0f, 0);
+ case 1:
+ case 2:
+ return sign*__kernel_cosf((m-0.5f)*M_PI_F, 0.0f);
+ case 3:
+ case 4:
+ return -sign*__kernel_sinf((m-1.0f)*M_PI_F, 0.0f, 0);
+ case 5:
+ case 6:
+ return -sign*__kernel_cosf((m-1.5f)*M_PI_F, 0.0f);
+ default:
+ return -sign*__kernel_sinf((2.0f-m)*M_PI_F, 0.0f, 0);
+ }
+
+}
+INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
+INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
+INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
+INLINE_OVERLOADABLE float native_log(float x) {
+ return native_log2(x) * 0.6931472002f;
+}
+INLINE_OVERLOADABLE float tgamma(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ float pi = 3.1415927410e+00,
+ a0 = 7.7215664089e-02,
+ a1 = 3.2246702909e-01,
+ a2 = 6.7352302372e-02,
+ a3 = 2.0580807701e-02,
+ a4 = 7.3855509982e-03,
+ a5 = 2.8905137442e-03,
+ a6 = 1.1927076848e-03,
+ a7 = 5.1006977446e-04,
+ a8 = 2.2086278477e-04,
+ a9 = 1.0801156895e-04,
+ a10 = 2.5214456400e-05,
+ a11 = 4.4864096708e-05,
+ tc = 1.4616321325e+00,
+ tf = -1.2148628384e-01,
+ tt = 6.6971006518e-09,
+ t0 = 4.8383611441e-01,
+ t1 = -1.4758771658e-01,
+ t2 = 6.4624942839e-02,
+ t3 = -3.2788541168e-02,
+ t4 = 1.7970675603e-02,
+ t5 = -1.0314224288e-02,
+ t6 = 6.1005386524e-03,
+ t7 = -3.6845202558e-03,
+ t8 = 2.2596477065e-03,
+ t9 = -1.4034647029e-03,
+ t10 = 8.8108185446e-04,
+ t11 = -5.3859531181e-04,
+ t12 = 3.1563205994e-04,
+ t13 = -3.1275415677e-04,
+ t14 = 3.3552918467e-04,
+ u0 = -7.7215664089e-02,
+ u1 = 6.3282704353e-01,
+ u2 = 1.4549225569e+00,
+ u3 = 9.7771751881e-01,
+ u4 = 2.2896373272e-01,
+ u5 = 1.3381091878e-02,
+ v1 = 2.4559779167e+00,
+ v2 = 2.1284897327e+00,
+ v3 = 7.6928514242e-01,
+ v4 = 1.0422264785e-01,
+ v5 = 3.2170924824e-03,
+ s0 = -7.7215664089e-02,
+ s1 = 2.1498242021e-01,
+ s2 = 3.2577878237e-01,
+ s3 = 1.4635047317e-01,
+ s4 = 2.6642270386e-02,
+ s5 = 1.8402845599e-03,
+ s6 = 3.1947532989e-05,
+ r1 = 1.3920053244e+00,
+ r2 = 7.2193557024e-01,
+ r3 = 1.7193385959e-01,
+ r4 = 1.8645919859e-02,
+ r5 = 7.7794247773e-04,
+ r6 = 7.3266842264e-06,
+ w0 = 4.1893854737e-01,
+ w1 = 8.3333335817e-02,
+ w2 = -2.7777778450e-03,
+ w3 = 7.9365057172e-04,
+ w4 = -5.9518753551e-04,
+ w5 = 8.3633989561e-04,
+ w6 = -1.6309292987e-03;
+ float t, y, z, nadj, p, p1, p2, p3, q, r, w;
+ int i, hx, ix;
+ nadj = 0;
+ hx = *(int *) (&x);
+ ix = hx & 0x7fffffff;
+ if (ix >= 0x7f800000)
+ return x * x;
+ if (ix == 0)
+ return INFINITY;
+ if (ix < 0x1c800000) {
+ if (hx < 0) {
+ return - native_log(-x);
+ } else
+ return - native_log(x);
+ }
+ if (hx < 0) {
+ if (ix >= 0x4b000000)
+ return INFINITY;
+ t = __gen_ocl_internal_sinpi(x);
+ if (__gen_ocl_fabs(t) < 1e-8f)
+ return INFINITY;
+ nadj = native_log(M_PI_F / __gen_ocl_fabs(t * x));
+ x = -x;
+ }
+
+ if (ix == 0x3f800000 || ix == 0x40000000)
+ r = 0;
+ else if (ix < 0x40000000) {
+ if (ix <= 0x3f666666) {
+ r = - native_log(x);
+ if (ix >= 0x3f3b4a20) {
+ y = 1 - x;
+ i = 0;
+ } else if (ix >= 0x3e6d3308) {
+ y = x - (tc - 1);
+ i = 1;
+ } else {
+ y = x;
+ i = 2;
+ }
+ } else {
+ r = 0;
+ if (ix >= 0x3fdda618) {
+ y = 2 - x;
+ i = 0;
+ } else if (ix >= 0x3F9da620) {
+ y = x - tc;
+ i = 1;
+ } else {
+ y = x - 1;
+ i = 2;
+ }
+ }
+ switch (i) {
+ case 0:
+ z = y * y;
+ p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+ p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+ p = y * p1 + p2;
+ r += (p - .5f * y);
+ break;
+ case 1:
+ z = y * y;
+ w = z * y;
+ p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
+ p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+ p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+ p = z * p1 - (tt - w * (p2 + y * p3));
+ r += (tf + p);
+ break;
+ case 2:
+ p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+ p2 = 1 + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+ r += (-.5f * y + p1 / p2);
+ }
+ } else if (ix < 0x41000000) {
+ i = x;
+ t = 0;
+ y = x - i;
+ p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
+ q = 1 + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+ r = .5f * y + p / q;
+ z = 1;
+ switch (i) {
+ case 7:
+ z *= (y + 6.f);
+ case 6:
+ z *= (y + 5.f);
+ case 5:
+ z *= (y + 4.f);
+ case 4:
+ z *= (y + 3.f);
+ case 3:
+ z *= (y + 2.f);
+ r += native_log(z);
+ break;
+ }
+ } else if (ix < 0x5c800000) {
+ t = native_log(x);
+ z = 1 / x;
+ y = z * z;
+ w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+ r = (x - .5f) * (t - 1) + w;
+ } else
+ r = x * (native_log(x) - 1);
+ if (hx < 0)
+ r = nadj - r;
+ return r;
+}
+
+INLINE_OVERLOADABLE float lgamma(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ const float
+ zero= 0.,
+ one = 1.0000000000e+00,
+ pi = 3.1415927410e+00,
+ a0 = 7.7215664089e-02,
+ a1 = 3.2246702909e-01,
+ a2 = 6.7352302372e-02,
+ a3 = 2.0580807701e-02,
+ a4 = 7.3855509982e-03,
+ a5 = 2.8905137442e-03,
+ a6 = 1.1927076848e-03,
+ a7 = 5.1006977446e-04,
+ a8 = 2.2086278477e-04,
+ a9 = 1.0801156895e-04,
+ a10 = 2.5214456400e-05,
+ a11 = 4.4864096708e-05,
+ tc = 1.4616321325e+00,
+ tf = -1.2148628384e-01,
+ tt = 6.6971006518e-09,
+ t0 = 4.8383611441e-01,
+ t1 = -1.4758771658e-01,
+ t2 = 6.4624942839e-02,
+ t3 = -3.2788541168e-02,
+ t4 = 1.7970675603e-02,
+ t5 = -1.0314224288e-02,
+ t6 = 6.1005386524e-03,
+ t7 = -3.6845202558e-03,
+ t8 = 2.2596477065e-03,
+ t9 = -1.4034647029e-03,
+ t10 = 8.8108185446e-04,
+ t11 = -5.3859531181e-04,
+ t12 = 3.1563205994e-04,
+ t13 = -3.1275415677e-04,
+ t14 = 3.3552918467e-04,
+ u0 = -7.7215664089e-02,
+ u1 = 6.3282704353e-01,
+ u2 = 1.4549225569e+00,
+ u3 = 9.7771751881e-01,
+ u4 = 2.2896373272e-01,
+ u5 = 1.3381091878e-02,
+ v1 = 2.4559779167e+00,
+ v2 = 2.1284897327e+00,
+ v3 = 7.6928514242e-01,
+ v4 = 1.0422264785e-01,
+ v5 = 3.2170924824e-03,
+ s0 = -7.7215664089e-02,
+ s1 = 2.1498242021e-01,
+ s2 = 3.2577878237e-01,
+ s3 = 1.4635047317e-01,
+ s4 = 2.6642270386e-02,
+ s5 = 1.8402845599e-03,
+ s6 = 3.1947532989e-05,
+ r1 = 1.3920053244e+00,
+ r2 = 7.2193557024e-01,
+ r3 = 1.7193385959e-01,
+ r4 = 1.8645919859e-02,
+ r5 = 7.7794247773e-04,
+ r6 = 7.3266842264e-06,
+ w0 = 4.1893854737e-01,
+ w1 = 8.3333335817e-02,
+ w2 = -2.7777778450e-03,
+ w3 = 7.9365057172e-04,
+ w4 = -5.9518753551e-04,
+ w5 = 8.3633989561e-04,
+ w6 = -1.6309292987e-03;
+ float t, y, z, nadj, p, p1, p2, p3, q, r, w;
+ int i, hx, ix;
+ nadj = 0;
+ hx = *(int *)&x;
+ ix = hx & 0x7fffffff;
+ if (ix >= 0x7f800000)
+ return x * x;
+ if (ix == 0)
+ return ((x + one) / zero);
+ if (ix < 0x1c800000) {
+ if (hx < 0) {
+ return -native_log(-x);
+ } else
+ return -native_log(x);
+ }
+ if (hx < 0) {
+ if (ix >= 0x4b000000)
+ return ((-x) / zero);
+ t = __gen_ocl_internal_sinpi(x);
+ if (t == zero)
+ return ((-x) / zero);
+ nadj = native_log(pi / __gen_ocl_fabs(t * x));
+ x = -x;
+ }
+ if (ix == 0x3f800000 || ix == 0x40000000)
+ r = 0;
+ else if (ix < 0x40000000) {
+ if (ix <= 0x3f666666) {
+ r = -native_log(x);
+ if (ix >= 0x3f3b4a20) {
+ y = one - x;
+ i = 0;
+ } else if (ix >= 0x3e6d3308) {
+ y = x - (tc - one);
+ i = 1;
+ } else {
+ y = x;
+ i = 2;
+ }
+ } else {
+ r = zero;
+ if (ix >= 0x3fdda618) {
+ y = (float) 2.0 - x;
+ i = 0;
+ }
+ else if (ix >= 0x3F9da620) {
+ y = x - tc;
+ i = 1;
+ }
+ else {
+ y = x - one;
+ i = 2;
+ }
+ }
+ switch (i) {
+ case 0:
+ z = y * y;
+ p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+ p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+ p = y * p1 + p2;
+ r += (p - (float) 0.5 * y);
+ break;
+ case 1:
+ z = y * y;
+ w = z * y;
+ p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
+ p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+ p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+ p = z * p1 - (tt - w * (p2 + y * p3));
+ r += (tf + p);
+ break;
+ case 2:
+ p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+ p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+ r += (-(float) 0.5 * y + p1 / p2);
+ }
+ } else if (ix < 0x41000000) {
+ i = (int) x;
+ t = zero;
+ y = x - (float) i;
+ p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
+ q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+ r = .5f * y + p / q;
+ z = one;
+ switch (i) {
+ case 7:
+ z *= (y + (float) 6.0);
+ case 6:
+ z *= (y + (float) 5.0);
+ case 5:
+ z *= (y + (float) 4.0);
+ case 4:
+ z *= (y + (float) 3.0);
+ case 3:
+ z *= (y + (float) 2.0);
+ r += native_log(z);
+ break;
+ }
+
+ } else if (ix < 0x5c800000) {
+ t = native_log(x);
+ z = one / x;
+ y = z * z;
+ w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+ r = (x - .5f) * (t - one) + w;
+ } else
+ r = x * (native_log(x) - one);
+ if (hx < 0)
+ r = nadj - r;
+ return r;
+}
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+#define BODY \
+ const float \
+ zero= 0., \
+ one = 1.0000000000e+00, \
+ pi = 3.1415927410e+00, \
+ a0 = 7.7215664089e-02, \
+ a1 = 3.2246702909e-01, \
+ a2 = 6.7352302372e-02, \
+ a3 = 2.0580807701e-02, \
+ a4 = 7.3855509982e-03, \
+ a5 = 2.8905137442e-03, \
+ a6 = 1.1927076848e-03, \
+ a7 = 5.1006977446e-04, \
+ a8 = 2.2086278477e-04, \
+ a9 = 1.0801156895e-04, \
+ a10 = 2.5214456400e-05, \
+ a11 = 4.4864096708e-05, \
+ tc = 1.4616321325e+00, \
+ tf = -1.2148628384e-01, \
+ tt = 6.6971006518e-09, \
+ t0 = 4.8383611441e-01, \
+ t1 = -1.4758771658e-01, \
+ t2 = 6.4624942839e-02, \
+ t3 = -3.2788541168e-02, \
+ t4 = 1.7970675603e-02, \
+ t5 = -1.0314224288e-02, \
+ t6 = 6.1005386524e-03, \
+ t7 = -3.6845202558e-03, \
+ t8 = 2.2596477065e-03, \
+ t9 = -1.4034647029e-03, \
+ t10 = 8.8108185446e-04, \
+ t11 = -5.3859531181e-04, \
+ t12 = 3.1563205994e-04, \
+ t13 = -3.1275415677e-04, \
+ t14 = 3.3552918467e-04, \
+ u0 = -7.7215664089e-02, \
+ u1 = 6.3282704353e-01, \
+ u2 = 1.4549225569e+00, \
+ u3 = 9.7771751881e-01, \
+ u4 = 2.2896373272e-01, \
+ u5 = 1.3381091878e-02, \
+ v1 = 2.4559779167e+00, \
+ v2 = 2.1284897327e+00, \
+ v3 = 7.6928514242e-01, \
+ v4 = 1.0422264785e-01, \
+ v5 = 3.2170924824e-03, \
+ s0 = -7.7215664089e-02, \
+ s1 = 2.1498242021e-01, \
+ s2 = 3.2577878237e-01, \
+ s3 = 1.4635047317e-01, \
+ s4 = 2.6642270386e-02, \
+ s5 = 1.8402845599e-03, \
+ s6 = 3.1947532989e-05, \
+ r1 = 1.3920053244e+00, \
+ r2 = 7.2193557024e-01, \
+ r3 = 1.7193385959e-01, \
+ r4 = 1.8645919859e-02, \
+ r5 = 7.7794247773e-04, \
+ r6 = 7.3266842264e-06, \
+ w0 = 4.1893854737e-01, \
+ w1 = 8.3333335817e-02, \
+ w2 = -2.7777778450e-03, \
+ w3 = 7.9365057172e-04, \
+ w4 = -5.9518753551e-04, \
+ w5 = 8.3633989561e-04, \
+ w6 = -1.6309292987e-03; \
+ float t, y, z, nadj, p, p1, p2, p3, q, r, w; \
+ int i, hx, ix; \
+ nadj = 0; \
+ hx = *(int *)&x; \
+ *signgamp = 1; \
+ ix = hx & 0x7fffffff; \
+ if (ix >= 0x7f800000) \
+ return x * x; \
+ if (ix == 0) \
+ return ((x + one) / zero); \
+ if (ix < 0x1c800000) { \
+ if (hx < 0) { \
+ *signgamp = -1; \
+ return -native_log(-x); \
+ } else \
+ return -native_log(x); \
+ } \
+ if (hx < 0) { \
+ if (ix >= 0x4b000000) \
+ return ((-x) / zero); \
+ t = __gen_ocl_internal_sinpi(x); \
+ if (t == zero) \
+ return ((-x) / zero); \
+ nadj = native_log(pi / __gen_ocl_fabs(t * x)); \
+ if (t < zero) \
+ *signgamp = -1; \
+ x = -x; \
+ } \
+ if (ix == 0x3f800000 || ix == 0x40000000) \
+ r = 0; \
+ else if (ix < 0x40000000) { \
+ if (ix <= 0x3f666666) { \
+ r = -native_log(x); \
+ if (ix >= 0x3f3b4a20) { \
+ y = one - x; \
+ i = 0; \
+ } else if (ix >= 0x3e6d3308) { \
+ y = x - (tc - one); \
+ i = 1; \
+ } else { \
+ y = x; \
+ i = 2; \
+ } \
+ } else { \
+ r = zero; \
+ if (ix >= 0x3fdda618) { \
+ y = (float) 2.0 - x; \
+ i = 0; \
+ } \
+ else if (ix >= 0x3F9da620) { \
+ y = x - tc; \
+ i = 1; \
+ } \
+ else { \
+ y = x - one; \
+ i = 2; \
+ } \
+ } \
+ switch (i) { \
+ case 0: \
+ z = y * y; \
+ p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); \
+ p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); \
+ p = y * p1 + p2; \
+ r += (p - (float) 0.5 * y); \
+ break; \
+ case 1: \
+ z = y * y; \
+ w = z * y; \
+ p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); \
+ p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); \
+ p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); \
+ p = z * p1 - (tt - w * (p2 + y * p3)); \
+ r += (tf + p); \
+ break; \
+ case 2: \
+ p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); \
+ p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); \
+ r += (-(float) 0.5 * y + p1 / p2); \
+ } \
+ } else if (ix < 0x41000000) { \
+ i = (int) x; \
+ t = zero; \
+ y = x - (float) i; \
+ p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); \
+ q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); \
+ r = .5f * y + p / q; \
+ z = one; \
+ switch (i) { \
+ case 7: \
+ z *= (y + (float) 6.0); \
+ case 6: \
+ z *= (y + (float) 5.0); \
+ case 5: \
+ z *= (y + (float) 4.0); \
+ case 4: \
+ z *= (y + (float) 3.0); \
+ case 3: \
+ z *= (y + (float) 2.0); \
+ r += native_log(z); \
+ break; \
+ } \
+ \
+ } else if (ix < 0x5c800000) { \
+ t = native_log(x); \
+ z = one / x; \
+ y = z * z; \
+ w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); \
+ r = (x - .5f) * (t - one) + w; \
+ } else \
+ r = x * (native_log(x) - one); \
+ if (hx < 0) \
+ r = nadj - r; \
+ return r;
+INLINE_OVERLOADABLE float lgamma_r(float x, global int *signgamp) { BODY; }
+INLINE_OVERLOADABLE float lgamma_r(float x, local int *signgamp) { BODY; }
+INLINE_OVERLOADABLE float lgamma_r(float x, private int *signgamp) { BODY; }
+#undef BODY
+
+INLINE_OVERLOADABLE float native_log10(float x) {
+ return native_log2(x) * 0.3010299956f;
+}
+INLINE_OVERLOADABLE float log1p(float x) {
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ const float
+ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
+ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
+ two25 = 3.355443200e+07, /* 0x4c000000 */
+ Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
+ Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
+ Lp3 = 2.8571429849e-01, /* 3E924925 */
+ Lp4 = 2.2222198546e-01, /* 3E638E29 */
+ Lp5 = 1.8183572590e-01, /* 3E3A3325 */
+ Lp6 = 1.5313838422e-01, /* 3E1CD04F */
+ Lp7 = 1.4798198640e-01; /* 3E178897 */
+ const float zero = 0.0;
+ float hfsq,f,c,s,z,R,u;
+ int k,hx,hu,ax;
+ union {float f; unsigned i;} un;
+ un.f = x; hx = un.i;
+ ax = hx&0x7fffffff;
+
+ k = 1;
+ if (hx < 0x3ed413d7) { /* x < 0.41422 */
+ if(ax>=0x3f800000) { /* x <= -1.0 */
+ if(x==(float)-1.0) return -two25/zero; /* log1p(-1)=+inf */
+ else return (x-x)/(x-x); /* log1p(x<-1)=NaN */
+ }
+ if(ax<0x31000000) { /* |x| < 2**-29 */
+ if(two25+x>zero /* raise inexact */
+ &&ax<0x24800000) /* |x| < 2**-54 */
+ return x;
+ else
+ return x - x*x*(float)0.5;
+ }
+ if(hx>0||hx<=((int)0xbe95f61f)) {
+ k=0;f=x;hu=1;} /* -0.2929<x<0.41422 */
+ }
+ if (hx >= 0x7f800000) return x+x;
+ if(k!=0) {
+ if(hx<0x5a000000) {
+ u = (float)1.0+x;
+
+ un.f = u; hu = un.i;
+ k = (hu>>23)-127;
+ /* correction term */
+ c = (k>0)? (float)1.0-(u-x):x-(u-(float)1.0);
+ c /= u;
+ } else {
+ u = x;
+ un.f = u; hu = un.i;
+ k = (hu>>23)-127;
+ c = 0;
+ }
+ hu &= 0x007fffff;
+ if(hu<0x3504f7) {
+ un.i = hu|0x3f800000; u = un.f;/* normalize u */
+ } else {
+ k += 1;
+ un.i = hu|0x3f000000; u = un.f; /* normalize u/2 */
+ hu = (0x00800000-hu)>>2;
+ }
+ f = u-(float)1.0;
+ }
+ hfsq=(float)0.5*f*f;
+ if(hu==0) { /* |f| < 2**-20 */
+ if(f==zero) { if(k==0) return zero;
+ else {c += k*ln2_lo; return k*ln2_hi+c;} }
+ R = hfsq*((float)1.0-(float)0.66666666666666666*f);
+ if(k==0) return f-R; else
+ return k*ln2_hi-((R-(k*ln2_lo+c))-f);
+ }
+ s = f/((float)2.0+f);
+ z = s*s;
+ R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
+ if(k==0) return f-(hfsq-s*(hfsq+R)); else
+ return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
+
+}
+INLINE_OVERLOADABLE float logb(float x) {
+union {float f; unsigned i;} u;
+ u.f = x;
+ int e = ((u.i & 0x7f800000) >> 23);
+ if(e == 0) {
+ /* sub normal or +/-0 */
+ return -INFINITY;
+ } else if(e == 0xff) {
+ /* inf & nan */
+ return x*x;
+ } else {
+ return (float)(e-127);
+ }
+}
+#define FP_ILOGB0 (-0x7FFFFFFF-1)
+#define FP_ILOGBNAN FP_ILOGB0
+INLINE_OVERLOADABLE int ilogb(float x) {
+ union { int i; float f; } u;
+ if (isnan(x))
+ return FP_ILOGBNAN;
+ if (isinf(x))
+ return 0x7FFFFFFF;
+ u.f = x;
+ u.i &= 0x7fffffff;
+ if (u.i == 0)
+ return FP_ILOGB0;
+ if (u.i >= 0x800000)
+ return (u.i >> 23) - 127;
+ int r = -126;
+ int a = u.i & 0x7FFFFF;
+ while(a < 0x800000) {
+ a <<= 1;
+ r --;
+ }
+ return r;
+}
+INLINE_OVERLOADABLE float nan(uint code) {
+ return NAN;
+}
+INLINE_OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
+INLINE_OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
+INLINE_OVERLOADABLE float native_tan(float x) {
+ return native_sin(x) / native_cos(x);
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
+ float sign = 1.0f;
+ int ix;
+ if(isinf(x)) return NAN;
+ if(x < 0.0f) { x = -x; sign = -1.0f; }
+ GEN_OCL_GET_FLOAT_WORD(ix, x);
+ if(x> 0x1.0p24) return 0.0f;
+ float m = __gen_ocl_internal_floor(x);
+ ix = (int)m;
+ m = x-m;
+ int n = __gen_ocl_internal_floor(m*4.0f);
+ if(m == 0.5f) {
+ return (ix&0x1) == 0 ? sign*INFINITY : sign*-INFINITY;
+ }
+ if(m == 0.0f) {
+ return (ix&0x1) == 0 ? 0.0f : -0.0f;
+ }
+
+ switch(n) {
+ case 0:
+ return sign * __kernel_tanf(m*M_PI_F, 0.0f, 1);
+ case 1:
+ return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
+ case 2:
+ return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
+ default:
+ return sign * -1.0f*__kernel_tanf((1.0f-m)*M_PI_F, 0.0f, 1);
+ }
+}
+INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); }
+INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); }
+INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
+ /* copied from fdlibm */
+ const unsigned
+ B1 = 709958130, /* B1 = (84+2/3-0.03306235651)*2**23 */
+ B2 = 642849266; /* B2 = (76+2/3-0.03306235651)*2**23 */
+
+ const float
+ C = 5.4285717010e-01, /* 19/35 = 0x3f0af8b0 */
+ D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
+ E = 1.4142856598e+00, /* 99/70 = 0x3fb50750 */
+ F = 1.6071428061e+00, /* 45/28 = 0x3fcdb6db */
+ G = 3.5714286566e-01; /* 5/14 = 0x3eb6db6e */
+
+ float r,s,t, w;
+ int hx;
+ uint sign;
+ uint high;
+
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ sign=hx&0x80000000; /* sign= sign(x) */
+ hx ^=sign;
+ if(hx>=0x7f800000) return(x+x); /* cbrt(NaN,INF) is itself */
+ if(hx==0)
+ return(x); /* cbrt(0) is itself */
+
+ GEN_OCL_SET_FLOAT_WORD(x,hx); /* x <- |x| */
+ /* rough cbrt to 5 bits */
+ if(hx<0x00800000) /* subnormal number */
+ {
+ //SET_FLOAT_WORD(t,0x4b800000); /* set t= 2**24 */
+ //t*=x; GET_FLOAT_WORD(high,t); SET_FLOAT_WORD(t,high/3+B2);
+ t = (sign = 0) ? 0.0f : -0.0f;
+ return t;
+ }
+ else
+ GEN_OCL_SET_FLOAT_WORD(t,hx/3+B1);
+
+
+ /* new cbrt to 23 bits */
+ r=t*t/x;
+ s=C+r*t;
+ t*=G+F/(s+E+D/s);
+ /* one step newton iteration to 53 bits with error less than 0.667 ulps */
+ s=t*t; /* t*t is exact */
+ r=x/s;
+ w=t+t;
+ r=(r-t)/(w+r); /* r-s is exact */
+ t=t+t*r;
+
+ /* retore the sign bit */
+ GEN_OCL_GET_FLOAT_WORD(high,t);
+ GEN_OCL_SET_FLOAT_WORD(t,high|sign);
+ return(t);
+}
+
+#define BODY \
+ *cosval = cos(x); \
+ return sin(x);
+INLINE_OVERLOADABLE float sincos(float x, global float *cosval) { BODY; }
+INLINE_OVERLOADABLE float sincos(float x, local float *cosval) { BODY; }
+INLINE_OVERLOADABLE float sincos(float x, private float *cosval) { BODY; }
+#undef BODY
+
+INLINE float __gen_ocl_asin_util(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ float
+ pS0 = 1.66666666666666657415e-01,
+ pS1 = -3.25565818622400915405e-01,
+ pS2 = 2.01212532134862925881e-01,
+ pS3 = -4.00555345006794114027e-02,
+ pS4 = 7.91534994289814532176e-04,
+ pS5 = 3.47933107596021167570e-05,
+ qS1 = -2.40339491173441421878e+00,
+ qS2 = 2.02094576023350569471e+00,
+ qS3 = -6.88283971605453293030e-01,
+ qS4 = 7.70381505559019352791e-02;
+
+ float t = x*x;
+ float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*(pS4+t*pS5)))));
+ float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
+ float w = p / q;
+ return x + x*w;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) {
+ uint ix;
+ union { uint i; float f; } u;
+ u.f = x;
+ ix = u.i & 0x7fffffff;
+ if(ix == 0x3f800000) {
+ return x * M_PI_2_F; /* asin(|1|)=+-pi/2 with inexact */
+ }
+ if(ix > 0x3f800000) { /* |x|>= 1 */
+ return NAN; /* asin(|x|>1) is NaN */
+ }
+
+ if(ix < 0x32000000) { /* if |x| < 2**-27 */
+ if(HUGE_VALF + x > FLT_ONE) return x; /* return x with inexact if x!=0*/
+ }
+
+ if(x < -0.5) {
+ return 2 * __gen_ocl_asin_util(native_sqrt((1+x) / 2)) - M_PI_2_F;
+ } else if(x > 0.5) {
+ return M_PI_2_F - 2 * __gen_ocl_asin_util(native_sqrt((1-x) / 2));
+ } else {
+ return __gen_ocl_asin_util(x);
+ }
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
+ return __gen_ocl_internal_asin(x) / M_PI_F;
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_acos(float x) {
+ if(x > 0.5)
+ return 2 * __gen_ocl_asin_util(native_sqrt((1-x)/2));
+ else
+ return M_PI_2_F - __gen_ocl_internal_asin(x);
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
+ return __gen_ocl_internal_acos(x) / M_PI_F;
+}
+__constant float atanhi[4] = {
+ 4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
+ 7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
+ 9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
+ 1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
+};
+__constant float atanlo[4] = {
+ 5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
+ 3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
+ 3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
+ 7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
+};
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
+ /* copied from fdlibm */
+ float aT[11];
+ aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
+ aT[1] = -2.0000000298e-01; /* 0xbe4ccccd */
+ aT[2] = 1.4285714924e-01; /* 0x3e124925 */
+ aT[3] = -1.1111110449e-01; /* 0xbde38e38 */
+ aT[4] = 9.0908870101e-02; /* 0x3dba2e6e */
+ aT[5] = -7.6918758452e-02; /* 0xbd9d8795 */
+ aT[6] = 6.6610731184e-02; /* 0x3d886b35 */
+ aT[7] = -5.8335702866e-02; /* 0xbd6ef16b */
+ aT[8] = 4.9768779427e-02; /* 0x3d4bda59 */
+ aT[9] = -3.6531571299e-02; /* 0xbd15a221 */
+ aT[10] = 1.6285819933e-02; /* 0x3c8569d7 */
+ const float one = 1.0, huge = 1.0e30;
+
+ float w,s1,s2,z;
+ int ix,hx,id;
+
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff;
+ if(ix>=0x50800000) { /* if |x| >= 2^34 */
+ if(ix>0x7f800000)
+ return x+x; /* NaN */
+ if(hx>0) return atanhi[3]+atanlo[3];
+ else return -atanhi[3]-atanlo[3];
+ } if (ix < 0x3ee00000) { /* |x| < 0.4375 */
+ if (ix < 0x31000000) { /* |x| < 2^-29 */
+ if(huge+x>one) return x; /* raise inexact */
+ }
+ id = -1;
+ } else {
+ x = __gen_ocl_fabs(x);
+ if (ix < 0x3f980000) { /* |x| < 1.1875 */
+ if (ix < 0x3f300000) { /* 7/16 <=|x|<11/16 */
+ id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
+ } else { /* 11/16<=|x|< 19/16 */
+ id = 1; x = (x-one)/(x+one);
+ }
+ } else {
+ if (ix < 0x401c0000) { /* |x| < 2.4375 */
+ id = 2; x = (x-(float)1.5)/(one+(float)1.5*x);
+ } else { /* 2.4375 <= |x| < 2^66 */
+ id = 3; x = -(float)1.0/x;
+ }
+ }}
+ /* end of argument reduction */
+ z = x*x;
+ w = z*z;
+ /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
+ s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*(aT[6]+w*(aT[8]+w*aT[10])))));
+ s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]+w*(aT[7]+w*aT[9]))));
+ if (id<0) return x - x*(s1+s2);
+ else {
+ z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
+ return (hx<0)? -z:z;
+ }
+
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
+ return __gen_ocl_internal_atan(x) / M_PI_F;
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
+ return M_2_SQRTPI_F * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 10 - __gen_ocl_pow(x, 7) / 42 + __gen_ocl_pow(x, 9) / 216);
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
+ return 1 - __gen_ocl_internal_erf(x);
+}
+
+// XXX work-around PTX profile
+#define sqrt native_sqrt
+INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
+ /* copied from fdlibm */
+ float z;
+ int k,m,hx,hy,ix,iy;
+ const float
+ tiny = 1.0e-30,
+ zero = 0.0,
+ pi_o_4 = 7.8539818525e-01, /* 0x3f490fdb */
+ pi_o_2 = 1.5707963705e+00, /* 0x3fc90fdb */
+ pi = 3.1415927410e+00, /* 0x40490fdb */
+ pi_lo = -8.7422776573e-08; /* 0xb3bbbd2e */
+
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff;
+ GEN_OCL_GET_FLOAT_WORD(hy,y);
+ iy = hy&0x7fffffff;
+
+ if((ix>0x7f800000)||
+ (iy>0x7f800000)) /* x or y is NaN */
+ return x+y;
+ if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y); /* x=1.0 */
+ m = ((hy>>31)&1)|((hx>>30)&2); /* 2*sign(x)+sign(y) */
+
+ /* when y = 0 */
+ if(iy==0) {
+ switch(m) {
+ case 0:
+ case 1: return y; /* atan(+-0,+anything)=+-0 */
+ case 2: return pi+tiny;/* atan(+0,-anything) = pi */
+ case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
+ }
+ }
+ /* when x = 0 */
+ if(ix==0) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
+
+ /* both are denorms. Gen does not support denorm, so we convert to normal float number*/
+ if(ix <= 0x7fffff && iy <= 0x7fffff) {
+ x = (float)(ix) * (1.0f - ((hx>>30) & 0x2));
+ y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
+ }
+
+ /* when x is INF */
+ if(ix==0x7f800000) {
+ if(iy==0x7f800000) {
+ switch(m) {
+ case 0: return pi_o_4+tiny;/* atan(+INF,+INF) */
+ case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
+ case 2: return (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
+ case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
+ }
+ } else {
+ switch(m) {
+ case 0: return zero ; /* atan(+...,+INF) */
+ case 1: return -zero ; /* atan(-...,+INF) */
+ case 2: return pi+tiny ; /* atan(+...,-INF) */
+ case 3: return -pi-tiny ; /* atan(-...,-INF) */
+ }
+ }
+ }
+ /* when y is INF */
+ if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
+
+ /* compute y/x */
+ k = (iy-ix)>>23;
+ if(k > 60) z=pi_o_2+(float)0.5*pi_lo; /* |y/x| > 2**60 */
+ else if(hx<0&&k<-60) z=0.0; /* |y|/x < -2**60 */
+ else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
+ switch (m) {
+ case 0: return z ; /* atan(+,+) */
+ case 1: {
+ uint zh;
+ GEN_OCL_GET_FLOAT_WORD(zh,z);
+ GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000);
+ }
+ return z ; /* atan(-,+) */
+ case 2: return pi-(z-pi_lo);/* atan(+,-) */
+ default: /* case 3 */
+ return (z-pi_lo)-pi;/* atan(-,-) */
+ }
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
+ uint ix = as_uint(x), iy = as_uint(y),
+ pos_zero = 0, neg_zero = 0x80000000u,
+ pos_inf = 0x7f800000, neg_inf = 0xff800000u;
+ if(iy == pos_zero) {
+ if(ix == pos_zero)
+ return 0;
+ if(ix == neg_zero)
+ return 1;
+ if(x < 0)
+ return 1;
+ if(x > 0)
+ return 0;
+ }
+ if(iy == neg_zero) {
+ if(ix == pos_zero)
+ return -0.f;
+ if(ix == neg_zero)
+ return -1;
+ if(x < 0)
+ return -1;
+ if(x > 0)
+ return -0.f;
+ }
+ if((ix & 0x7fffffff) == 0) {
+ if(y < 0)
+ return -.5f;
+ if(y > 0)
+ return .5f;
+ }
+ if(ix == pos_inf) {
+ if(y > 0 && iy != pos_inf)
+ return 0;
+ if(y < 0 && iy != neg_inf)
+ return -0.f;
+ }
+ if(ix == neg_inf) {
+ if(y > 0 && iy != pos_inf)
+ return 1;
+ if(y < 0 && iy != neg_inf)
+ return -1;
+ }
+ if(iy == pos_inf) {
+ if(ix == pos_inf)
+ return 0.25f;
+ if(ix == neg_inf)
+ return 0.75f;
+ if(x >= 0 || x <= 0)
+ return 0.5f;
+ }
+ if(iy == neg_inf) {
+ if(ix == pos_inf)
+ return -0.25f;
+ if(ix == neg_inf)
+ return -0.75f;
+ if(x >= 0 || x <= 0)
+ return -0.5f;
+ }
+ return __gen_ocl_internal_atan2(y, x) / M_PI_F;
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x) { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) {
+ float y = __gen_ocl_rndz(x);
+ if (__gen_ocl_fabs(x - y) >= 0.5f)
+ y += __gen_ocl_internal_copysign(1.f, x);
+ return y;
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x) { return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
+ return __gen_ocl_rnde(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x) {
+ //use native instruction when it has enough precision
+ if (x > -0x1.6p1 && x < 0x1.6p1)
+ {
+ return native_exp(x);
+ }
+
+ float o_threshold = 8.8721679688e+01, /* 0x42b17180 */
+ u_threshold = -1.0397208405e+02, /* 0xc2cff1b5 */
+ twom100 = 7.8886090522e-31, /* 2**-100=0x0d800000 */
+ ivln2 = 1.4426950216e+00; /* 0x3fb8aa3b =1/ln2 */
+ float y,hi=0.0,lo=0.0,t;
+ int k=0,xsb;
+ unsigned hx;
+ float ln2HI_0 = 6.9313812256e-01; /* 0x3f317180 */
+ float ln2HI_1 = -6.9313812256e-01; /* 0xbf317180 */
+ float ln2LO_0 = 9.0580006145e-06; /* 0x3717f7d1 */
+ float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
+ float half_0 = 0.5;
+ float half_1 = -0.5;
+
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ xsb = (hx>>31)&1; /* sign bit of x */
+ hx &= 0x7fffffff; /* high word of |x| */
+
+ /* filter out non-finite argument */
+ if(hx >= 0x42b17218) { /* if |x|>=88.721... */
+ // native_exp already handled this
+ return native_exp(x);
+ }
+
+ /* argument reduction */
+ if(hx > 0x3eb17218) { /* if |x| > 0.5 ln2 */
+ if(hx < 0x3F851592) { /* and |x| < 1.5 ln2 */
+ hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0);
+ lo= xsb == 1? ln2LO_1 : ln2LO_0;
+ k = 1-xsb-xsb;
+ } else {
+ float tmp = xsb == 1 ? half_1 : half_0;
+ k = ivln2*x+tmp;
+ t = k;
+ hi = x - t*ln2HI_0; /* t*ln2HI is exact here */
+ lo = t*ln2LO_0;
+ }
+ x = hi - lo;
+ }
+
+ y = native_exp(x);
+ if(k >= -125) {
+ unsigned hy;
+ GEN_OCL_GET_FLOAT_WORD(hy,y);
+ GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23)); /* add k to y's exponent */
+ return y;
+ } else {
+ unsigned hy;
+ GEN_OCL_GET_FLOAT_WORD(hy,y);
+ GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
+ return y*twom100;
+ }
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
+ //return x-y*__gen_ocl_rndz(x/y);
+ float one = 1.0;
+ float Zero[2];
+ int n,hx,hy,hz,ix,iy,sx,i;
+ Zero[0] = 0.0;
+ Zero[1] = -0.0;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ GEN_OCL_GET_FLOAT_WORD(hy,y);
+ sx = hx&0x80000000; /* sign of x */
+ hx ^=sx; /* |x| */
+ hy &= 0x7fffffff; /* |y| */
+ /* purge off exception values */
+ if(hy==0||(hx>=0x7f800000)|| /* y=0,or x not finite */
+ (hy>0x7f800000)) /* or y is NaN */
+ return (x*y)/(x*y);
+ if(hx<hy) return x; /* |x|<|y| return x */
+ if(hx==hy)
+ return Zero[(unsigned)sx>>31]; /* |x|=|y| return x*0*/
+
+ /* determine ix = ilogb(x) */
+ if(hx<0x00800000) { /* subnormal x */
+ for (ix = -126,i=(hx<<8); i>0; i<<=1) ix -=1;
+ } else ix = (hx>>23)-127;
+
+ /* determine iy = ilogb(y) */
+ if(hy<0x00800000) { /* subnormal y */
+ for (iy = -126,i=(hy<<8); i>=0; i<<=1) iy -=1;
+ } else iy = (hy>>23)-127;
+
+ /* set up {hx,lx}, {hy,ly} and align y to x */
+ if(ix >= -126)
+ hx = 0x00800000|(0x007fffff&hx);
+ else { /* subnormal x, shift x to normal */
+ n = -126-ix;
+ hx = hx<<n;
+ }
+ if(iy >= -126)
+ hy = 0x00800000|(0x007fffff&hy);
+ else { /* subnormal y, shift y to normal */
+ n = -126-iy;
+ hy = hy<<n;
+ }
+ /* fix point fmod */
+ n = ix - iy;
+ while(n--) {
+ hz=hx-hy;
+ if(hz<0){hx = hx+hx;}
+ else {
+ if(hz==0) /* return sign(x)*0 */
+ return Zero[(unsigned)sx>>31];
+ hx = hz+hz;
+ }
+ }
+ hz=hx-hy;
+ if(hz>=0) {hx=hz;}
+
+ /* convert back to floating value and restore the sign */
+ if(hx==0) /* return sign(x)*0 */
+ return Zero[(unsigned)sx>>31];
+ while(hx<0x00800000) { /* normalize x */
+ hx = hx+hx;
+ iy -= 1;
+ }
+ if(iy>= -126) { /* normalize output */
+ hx = ((hx-0x00800000)|((iy+127)<<23));
+ GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+ } else { /* subnormal output */
+ n = -126 - iy;
+ hx >>= n;
+ GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+ x *= one; /* create necessary signal */
+ }
+ return x; /* exact output */
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+ //return __gen_ocl_pow(M_E_F, x) - 1;
+ float Q1 = -3.3333335072e-02, /* 0xbd088889 */
+ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
+ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
+ Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
+ Q3 = -7.9365076090e-05, /* 0xb8a670cd */
+ Q4 = 4.0082177293e-06, /* 0x36867e54 */
+ Q5 = -2.0109921195e-07, /* 0xb457edbb */
+ huge = 1.0e30,
+ tiny = 1.0e-30,
+ ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+ one = 1.0,
+ o_threshold= 8.8721679688e+01; /* 0x42b17180 */
+ float y,hi,lo,c,t,e,hxs,hfx,r1;
+ int k,xsb;
+ int hx;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ xsb = hx&0x80000000;
+ /* sign bit of x */
+ //if(xsb==0)
+ //y=x;
+ //else
+ //y= -x; /* y = |x| */
+ y = __gen_ocl_internal_fabs(x);
+ hx &= 0x7fffffff; /* high word of |x| */
+ /* filter out huge and non-finite argument */
+ if(hx >= 0x4195b844) { /* if |x|>=27*ln2 */
+ if(hx >= 0x42b17218) { /* if |x|>=88.721... */
+ if(hx>0x7f800000)
+ return x+x; /* NaN */
+ if(hx==0x7f800000)
+ return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
+ if(x > o_threshold)
+ return huge*huge; /* overflow */
+ }
+ if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
+ if(x+tiny<(float)0.0) /* raise inexact */
+ return tiny-one; /* return -1 */
+ }
+ }
+ /* argument reduction */
+ if(hx > 0x3eb17218) {/* if |x| > 0.5 ln2 */
+ if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
+ if(xsb==0){
+ hi = x - ln2_hi; lo = ln2_lo; k = 1;
+ } else {
+ hi = x + ln2_hi; lo = -ln2_lo; k = -1;
+ }
+ } else {
+ k = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
+ t = k;
+ hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
+ lo = t*ln2_lo;
+ }
+ x = hi - lo;
+ c = (hi-x)-lo;
+ } else if(hx < 0x33000000) { /* when |x|<2**-25, return x */
+ //t = huge+x; /* return x with inexact flags when x!=0 */
+ //return x - (t-(huge+x));
+ return x;
+ } else k = 0;
+ /* x is now in primary range */
+ hfx = (float)0.5*x;
+ hxs = x*hfx;
+ r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+ t = (float)3.0-r1*hfx;
+ e = hxs*((r1-t)/((float)6.0 - x*t));
+ if(k==0)
+ return x - (x*e-hxs); /* c is 0 */
+ else{
+ e = (x*(e-c)-c);
+ e -= hxs;
+ if(k== -1)return (float)0.5*(x-e)-(float)0.5;
+ if(k==1){
+ if(x < (float)-0.25)
+ return -(float)2.0*(e-(x+(float)0.5));
+ else
+ return (one+(float)2.0*(x-e));
+ }
+ if (k <= -2 || k>56) { /* suffice to return exp(x)-1 */
+ int i;
+ y = one-(e-x);
+ GEN_OCL_GET_FLOAT_WORD(i,y);
+ GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ return y-one;
+ }
+ t = one;
+ if(k<23) {
+ int i;
+ GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
+ y = t-(e-x);
+ GEN_OCL_GET_FLOAT_WORD(i,y);
+ GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ } else {
+ int i;
+ GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23)); /* 2^-k */
+ y = x-(e+t);
+ y += one;
+ GEN_OCL_GET_FLOAT_WORD(i,y);
+ GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
+ }
+ }
+ return y;
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+ //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+ float one = 1.0,
+ ln2 = 6.9314718246e-01;/* 0x3f317218 */
+ float t;
+ int hx;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ if(hx<0x3f800000) { /* x < 1 */
+ return (x-x)/(x-x);
+ } else if(hx >=0x4d800000) { /* x > 2**28 */
+ if(hx >=0x7f800000) {/* x is inf of NaN */
+ return x+x;
+ } else
+ return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
+ } else if (hx==0x3f800000) {
+ return 0.0; /* acosh(1) = 0 */
+ } else if (hx > 0x40000000) { /* 2**28 > x > 2 */
+ t=x*x;
+ return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));
+ } else { /* 1<x<2 */
+ t = x-one;
+ return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
+ }
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x){
+ //return native_log(x + native_sqrt(x * x + 1));
+ float one = 1.0000000000e+00, /* 0x3F800000 */
+ ln2 = 6.9314718246e-01, /* 0x3f317218 */
+ huge= 1.0000000000e+30;
+ float w;
+ int hx,ix;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff;
+ if(ix< 0x38000000) { /* |x|<2**-14 */
+ if(huge+x>one) return x; /* return x inexact except 0 */
+ }
+ if(ix>0x47000000) {/* |x| > 2**14 */
+ if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
+ w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
+ } else {
+ float xa = __gen_ocl_internal_fabs(x);
+ if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
+ w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
+ } else { /* 2.0 > |x| > 2**-14 */
+ float t = xa*xa;
+ w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
+ }
+ }
+ return __gen_ocl_internal_copysign(w, x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x){
+ //return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+ float one = 1.0,
+ shuge = 1.0e37;
+ float t,w,h;
+ int ix,jx;
+ GEN_OCL_GET_FLOAT_WORD(jx,x);
+ ix = jx&0x7fffffff;
+ /* x is INF or NaN */
+ if(ix>=0x7f800000) return x+x;
+ h = 0.5;
+ if (jx<0) h = -h;
+ /* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1))) */
+ if (ix < 0x41b00000) { /* |x|<22 */
+ if (ix<0x31800000) /* |x|<2**-28 */
+ if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
+ t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
+ if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one));
+ return h*(t+t/(t+one));
+ }
+ /* |x| in [22, log(maxdouble)] return 0.5*exp(|x|) */
+ if (ix < 0x42b17180) return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
+ /* |x| in [log(maxdouble), overflowthresold] */
+ if (ix<=0x42b2d4fc) {
+ w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x));
+ t = h*w;
+ return t*w;
+ }
+ /* |x| > overflowthresold, sinh(x) overflow */
+ return x*shuge;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
+ //float y = native_exp(-2 * x);
+ //return (1 - y) / (1 + y);
+ float one=1.0, two=2.0, tiny = 1.0e-30;
+ float t,z;
+ int jx,ix;
+ GEN_OCL_GET_FLOAT_WORD(jx,x);
+ ix = jx&0x7fffffff;
+ /* x is INF or NaN */
+ if(ix>=0x7f800000) {
+ if (jx>=0)
+ return one/x+one; /* tanh(+-inf)=+-1 */
+ else
+ return one/x-one; /* tanh(NaN) = NaN */
+ }
+
+ if (ix < 0x41b00000) { /* |x|<22 */
+ if (ix == 0)
+ return x; /* x == +-0 */
+ if (ix<0x24000000) /* |x|<2**-55 */
+ return x*(one+x); /* tanh(small) = small */
+ if (ix>=0x3f800000) { /* |x|>=1 */
+ t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
+ z = one - two/(t+two);
+ } else {
+ t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
+ z= -t/(t+two);
+ }
+ } else { /* |x| > 22, return +-1 */
+ z = one - tiny; /* raised inexact flag */
+ }
+ return (jx>=0)? z: -z;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
+ //return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+ float halF = 0.5,
+ huge = 1.0e+30,
+ tiny = 1.0e-30,
+ one = 1.0;
+ float t,w;
+ int ix;
+ GEN_OCL_GET_FLOAT_WORD(ix,x);
+ ix &= 0x7fffffff;
+ /* |x| in [0,22] */
+ if (ix < 0x41b00000) {
+ /* |x| in [0,0.5*ln2], return 1+expm1(|x|)^2/(2*exp(|x|)) */
+ if(ix<0x3eb17218) {
+ t = __gen_ocl_internal_expm1(__gen_ocl_fabs(x));
+ w = one+t;
+ if (ix<0x24000000) return w; /* cosh(tiny) = 1 */
+ return one+(t*t)/(w+w);
+ }
+ /* |x| in [0.5*ln2,22], return (exp(|x|)+1/exp(|x|)/2; */
+ t = __gen_ocl_internal_exp(__gen_ocl_fabs(x));
+ return halF*t+halF/t;
+ }
+ /* |x| in [22, log(maxdouble)] return half*exp(|x|) */
+ if (ix < 0x42b17180) return halF*__gen_ocl_internal_exp(__gen_ocl_fabs(x));
+ /* |x| in [log(maxdouble), overflowthresold] */
+ if (ix<=0x42b2d4fc) {
+ w = __gen_ocl_internal_exp(halF*__gen_ocl_fabs(x));
+ t = halF*w;
+ return t*w;
+ }
+ /* x is INF or NaN */
+ if(ix>=0x7f800000) return x*x;
+ /* |x| > overflowthresold, cosh(x) overflow */
+ return huge*huge;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
+ //return x-y*__gen_ocl_rnde(x/y);
+ float zero = 0.0;
+ int hx,hp;
+ unsigned sx;
+ float p_half;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ GEN_OCL_GET_FLOAT_WORD(hp,p);
+ sx = hx&0x80000000;
+ hp &= 0x7fffffff;
+ hx &= 0x7fffffff;
+ /* purge off exception values */
+ if(hp==0) return (x*p)/(x*p); /* p = 0 */
+ if((hx>=0x7f800000)|| /* x not finite */
+ ((hp>0x7f800000))) /* p is NaN */
+ return (x*p)/(x*p);
+ if (hp<=0x7effffff) x = __gen_ocl_internal_fmod(x,p+p); /* now x < 2p */
+ if ((hx-hp)==0) return zero*x;
+ x = __gen_ocl_fabs(x);
+ p = __gen_ocl_fabs(p);
+ if (hp<0x01000000) {
+ if(x+x>p) {
+ x-=p;
+ if(x+x>=p) x -= p;
+ }
+ } else {
+ p_half = (float)0.5*p;
+ if(x>p_half) {
+ x-=p;
+ if(x>=p_half) x -= p;
+ }
+ }
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ GEN_OCL_SET_FLOAT_WORD(x,hx^sx);
+ return x;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
+ if(!__ocl_finitef(x)||x==(float)0.0) return x;
+ x = __gen_ocl_scalbnf(x,n);
+ return x;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
+ //return 0.5f * native_sqrt((1 + x) / (1 - x));
+ float xa = __gen_ocl_fabs (x);
+ float t;
+ if (isless (xa, 0.5f)){
+ if (xa < 0x1.0p-28f) return x;
+ t = xa + xa;
+ t = 0.5f * log1p (t + t * xa / (1.0f - xa));
+ } else if (isless (xa, 1.0f)){
+ t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
+ } else{
+ if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
+ return x / 0.0f;
+ }
+ return __gen_ocl_internal_copysign(t, x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_exp10(float x){
+ float px, qx,ans;
+ short n;
+ int i;
+ float*p;
+ float MAXL10 = 38.230809449325611792;
+ float LOG210 = 3.32192809488736234787e0;
+ float LG102A = 3.00781250000000000000E-1;
+ float LG102B = 2.48745663981195213739E-4;
+ float P[6];
+ P[0] = 2.063216740311022E-001;
+ P[1] = 5.420251702225484E-001;
+ P[2] = 1.171292686296281E+000;
+ P[3] = 2.034649854009453E+000;
+ P[4] = 2.650948748208892E+000;
+ P[5] = 2.302585167056758E+000;
+ if( isinf(x))
+ return INFINITY;
+
+ if( x < -MAXL10 )return 0.0;
+ /* The following is necessary because range reduction blows up: */
+ if( x == 0 )return 1.0;
+
+ /* Express 10**x = 10**g 2**n
+ * = 10**g 10**( n log10(2) )
+ * = 10**( g + n log10(2) )
+ */
+ px = x * LOG210;
+ qx = __gen_ocl_internal_floor( px + 0.5 );
+ n = qx;
+ x -= qx * LG102A;
+ x -= qx * LG102B;
+
+ /* rational approximation for exponential
+ * of the fractional part:
+ * 10**x - 1 = 2x P(x**2)/( Q(x**2) - P(x**2) )
+ */
+ p = P;
+ ans = *p++;
+ i = 5;
+ do{
+ ans = ans * x + *p++;
+ }
+ while( --i );
+ px = 1.0 + x * ans;
+
+ /* multiply by power of 2 */
+ x = __gen_ocl_internal_ldexp( px, n );
+ return x;
+}
+
+// TODO use llvm intrinsics definitions
+#define cospi __gen_ocl_internal_cospi
+#define cosh __gen_ocl_internal_cosh
+#define acos __gen_ocl_internal_acos
+#define acospi __gen_ocl_internal_acospi
+#define acosh __gen_ocl_internal_acosh
+#define sinpi __gen_ocl_internal_sinpi
+#define sinh __gen_ocl_internal_sinh
+#define asin __gen_ocl_internal_asin
+#define asinpi __gen_ocl_internal_asinpi
+#define asinh __gen_ocl_internal_asinh
+#define tanpi __gen_ocl_internal_tanpi
+#define tanh __gen_ocl_internal_tanh
+#define atan __gen_ocl_internal_atan
+#define atan2 __gen_ocl_internal_atan2
+#define atan2pi __gen_ocl_internal_atan2pi
+#define atanpi __gen_ocl_internal_atanpi
+#define atanh __gen_ocl_internal_atanh
+#define pow powr
+#define cbrt __gen_ocl_internal_cbrt
+#define rint __gen_ocl_internal_rint
+#define copysign __gen_ocl_internal_copysign
+#define erf __gen_ocl_internal_erf
+#define erfc __gen_ocl_internal_erfc
+#define fmod __gen_ocl_internal_fmod
+#define remainder __gen_ocl_internal_remainder
+#define ldexp __gen_ocl_internal_ldexp
+PURE CONST float __gen_ocl_mad(float a, float b, float c);
+PURE CONST float __gen_ocl_fmax(float a, float b);
+PURE CONST float __gen_ocl_fmin(float a, float b);
+INLINE_OVERLOADABLE float mad(float a, float b, float c) {
+ return __gen_ocl_mad(a, b, c);
+}
+
+#define DEF(TYPE1, TYPE2) \
+ INLINE_OVERLOADABLE TYPE1 select(TYPE1 src0, TYPE1 src1, TYPE2 cond) { \
+ return cond ? src1 : src0; \
+ }
+DEF(char, char)
+DEF(char, uchar)
+DEF(uchar, char)
+DEF(uchar, uchar)
+DEF(short, short)
+DEF(short, ushort)
+DEF(ushort, short)
+DEF(ushort, ushort)
+DEF(int, int)
+DEF(int, uint)
+DEF(uint, int)
+DEF(uint, uint)
+DEF(long, long)
+DEF(long, ulong)
+DEF(ulong, long)
+DEF(ulong, ulong)
+DEF(float, int)
+DEF(float, uint)
+#undef DEF
+
+/////////////////////////////////////////////////////////////////////////////
+// Common Functions (see 6.11.4 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+INLINE_OVERLOADABLE float step(float edge, float x) {
+ return x < edge ? 0.0 : 1.0;
+}
+
+#define DECL_MIN_MAX_CLAMP(TYPE) \
+INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
+ return a > b ? a : b; \
+} \
+INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
+ return a < b ? a : b; \
+} \
+INLINE_OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
+ return max(min(v, u), l); \
+}
+DECL_MIN_MAX_CLAMP(int)
+DECL_MIN_MAX_CLAMP(short)
+DECL_MIN_MAX_CLAMP(char)
+DECL_MIN_MAX_CLAMP(uint)
+DECL_MIN_MAX_CLAMP(unsigned short)
+DECL_MIN_MAX_CLAMP(unsigned char)
+DECL_MIN_MAX_CLAMP(long)
+DECL_MIN_MAX_CLAMP(ulong)
+#undef DECL_MIN_MAX_CLAMP
+INLINE_OVERLOADABLE float max(float a, float b) {
+ return __gen_ocl_fmax(a, b);
+}
+INLINE_OVERLOADABLE float min(float a, float b) {
+ return __gen_ocl_fmin(a, b);
+}
+INLINE_OVERLOADABLE float clamp(float v, float l, float u) {
+ return max(min(v, u), l);
+}
+
+#define BODY \
+ if (isnan(x) || isinf(x)) { \
+ *exp = 0; \
+ return x; \
+ } \
+ uint u = as_uint(x); \
+ uint a = u & 0x7FFFFFFFu; \
+ if (a == 0) { \
+ *exp = 0; \
+ return x; \
+ } \
+ if (a >= 0x800000) { \
+ *exp = (a >> 23) - 126; \
+ return as_float((u & (0x807FFFFFu)) | 0x3F000000); \
+ } \
+ int e = -126; \
+ while (a < 0x400000) { \
+ e --; \
+ a <<= 1; \
+ } \
+ a <<= 1; \
+ *exp = e; \
+ return as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
+INLINE_OVERLOADABLE float frexp(float x, global int *exp) { BODY; }
+INLINE_OVERLOADABLE float frexp(float x, local int *exp) { BODY; }
+INLINE_OVERLOADABLE float frexp(float x, private int *exp) { BODY; }
+#undef BODY
+
+INLINE_OVERLOADABLE float nextafter(float x, float y) {
+ int hx, hy, ix, iy;
+ hx = as_int(x);
+ hy = as_int(y);
+ ix = hx & 0x7fffffff;
+ iy = hy & 0x7fffffff;
+ if(ix>0x7f800000 || iy>0x7f800000)
+ return x+y;
+ if(hx == hy)
+ return y;
+ if(ix == 0) {
+ if(iy == 0)
+ return y;
+ else
+ return as_float((hy&0x80000000) | 1);
+ }
+ if(hx >= 0) {
+ if(hx > hy) {
+ hx -= 1;
+ } else {
+ hx += 1;
+ }
+ } else {
+ if(hy >= 0 || hx > hy){
+ hx -= 1;
+ } else {
+ hx += 1;
+ }
+ }
+ return as_float(hx);
+}
+
+#define BODY \
+ uint hx = as_uint(x), ix = hx & 0x7FFFFFFF; \
+ if (ix > 0x7F800000) { \
+ *i = nan(0u); \
+ return nan(0u); \
+ } \
+ if (ix == 0x7F800000) { \
+ *i = x; \
+ return as_float(hx & 0x80000000u); \
+ } \
+ *i = __gen_ocl_rndz(x); \
+ return x - *i;
+INLINE_OVERLOADABLE float modf(float x, global float *i) { BODY; }
+INLINE_OVERLOADABLE float modf(float x, local float *i) { BODY; }
+INLINE_OVERLOADABLE float modf(float x, private float *i) { BODY; }
+#undef BODY
+INLINE_OVERLOADABLE float degrees(float radians) { return (180 / M_PI_F) * radians; }
+INLINE_OVERLOADABLE float radians(float degrees) { return (M_PI_F / 180) * degrees; }
+
+INLINE_OVERLOADABLE float smoothstep(float e0, float e1, float x) {
+ x = clamp((x - e0) / (e1 - e0), 0.f, 1.f);
+ return x * x * (3 - 2 * x);
+}
+
+INLINE_OVERLOADABLE float sign(float x) {
+ if(x > 0)
+ return 1;
+ if(x < 0)
+ return -1;
+ if(x == -0.f)
+ return -0.f;
+ return 0.f;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
+ float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+ return a > b ? x : b > a ? y : max(x, y);
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
+ float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+ return a < b ? x : b < a ? y : min(x, y);
+}
+INLINE_OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
+INLINE_OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
+ if(isnan(x))
+ return x;
+ if(isnan(y))
+ return y;
+ return x > y ? (x - y) : +0.f;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
+ float z,ax,z_h,z_l,p_h,p_l;
+ float y1,t1,t2,r,s,sn,t,u,v,w;
+ int i,j,k,yisint,n;
+ int hx,hy,ix,iy,is;
+ float bp[2],dp_h[2],dp_l[2],
+ zero = 0.0,
+ one = 1.0,
+ two = 2.0,
+ two24 = 16777216.0, /* 0x4b800000 */
+ huge = 1.0e30,
+ tiny = 1.0e-30,
+ /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
+ L1 = 6.0000002384e-01, /* 0x3f19999a */
+ L2 = 4.2857143283e-01, /* 0x3edb6db7 */
+ L3 = 3.3333334327e-01, /* 0x3eaaaaab */
+ L4 = 2.7272811532e-01, /* 0x3e8ba305 */
+ L5 = 2.3066075146e-01, /* 0x3e6c3255 */
+ L6 = 2.0697501302e-01, /* 0x3e53f142 */
+ P1 = 1.6666667163e-01, /* 0x3e2aaaab */
+ P2 = -2.7777778450e-03, /* 0xbb360b61 */
+ P3 = 6.6137559770e-05, /* 0x388ab355 */
+ P4 = -1.6533901999e-06, /* 0xb5ddea0e */
+ P5 = 4.1381369442e-08, /* 0x3331bb4c */
+ lg2 = 6.9314718246e-01, /* 0x3f317218 */
+ lg2_h = 6.93145752e-01, /* 0x3f317200 */
+ lg2_l = 1.42860654e-06, /* 0x35bfbe8c */
+ ovt = 4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
+ cp = 9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
+ cp_h = 9.6179199219e-01, /* 0x3f763800 =head of cp */
+ cp_l = 4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
+ ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+ ivln2_h = 1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
+ ivln2_l = 7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
+ bp[0] = 1.0,bp[1] = 1.5,
+ dp_h[0] = 0.0,dp_h[1] = 5.84960938e-01,
+ dp_l[0] = 0.0,dp_l[1] = 1.56322085e-06;
+ GEN_OCL_GET_FLOAT_WORD(hx,x);
+ GEN_OCL_GET_FLOAT_WORD(hy,y);
+ ix = hx&0x7fffffff; iy = hy&0x7fffffff;
+ if (ix < 0x00800000) { /* x < 2**-126 */
+ ix = 0;/* Gen does not support subnormal number now */
+ }
+ if (iy < 0x00800000) { /* y < 2**-126 */
+ iy = 0;/* Gen does not support subnormal number now */
+ }
+ /* y==zero: x**0 = 1 */
+ if(iy==0) return one;
+ if(hx==0x3f800000) return one;
+ /* +-NaN return x+y */
+ if(ix > 0x7f800000 || iy > 0x7f800000)
+ return (x+0.0f)+y+(0.0f);
+ /* determine if y is an odd int when x < 0
+ * yisint = 0 ... y is not an integer
+ * yisint = 1 ... y is an odd int
+ * yisint = 2 ... y is an even int
+ */
+ yisint = 0;
+ if(hx<0) {
+ if(iy>=0x4b800000) yisint = 2; /* even integer y */
+ else if(iy>=0x3f800000) {
+ k = (iy>>23)-0x7f; /* exponent */
+ j = iy>>(23-k);
+ if((j<<(23-k))==iy) yisint = 2-(j&1);
+ }
+ }
+ /* special value of y */
+ if (iy==0x7f800000) { /* y is +-inf */
+ if (ix==0x3f800000)
+ //return y - y; /* inf**+-1 is NaN */
+ return one;
+ else if (ix > 0x3f800000)/* (|x|>1)**+-inf = inf,0 */
+ return (hy>=0)? y: zero;
+ else /* (|x|<1)**-,+inf = inf,0 */
+ return (hy<0)?-y: zero;
+ }
+ if(iy==0x3f800000) { /* y is +-1 */
+ if(hy<0) return one/x; else return x;
+ }
+ if(hy==0x40000000) return x*x; /* y is 2 */
+ if(hy==0x3f000000) { /* y is 0.5 */
+ if(hx>=0)return __gen_ocl_sqrt(x);
+ }
+
+ ax = __gen_ocl_fabs(x);
+ /* special value of x */
+ if(ix==0x7f800000||ix==0||ix==0x3f800000){
+ z = ax; /*x is +-0,+-inf,+-1*/
+ if(hy<0) z = one/z; /* z = (1/|x|) */
+ if(hx<0) {
+ if(((ix-0x3f800000)|yisint)==0) {
+ z = (z-z)/(z-z); /* (-1)**non-int is NaN */
+ } else if(yisint==1)
+ z = -z; /* (x<0)**odd = -(|x|**odd) */
+ }
+ return z;
+ }
+ n = ((uint)hx>>31)-1;
+
+ /* (x<0)**(non-int) is NaN */
+ if((n|yisint)==0) return (x-x)/(x-x);
+
+ sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
+ if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
+
+ /* |y| is huge */
+ if(iy>0x4d000000) { /* if |y| > 2**27 */
+ /* over/underflow if x is not close to one */
+ if(ix<0x3f7ffff8) return (hy<0)? sn*huge*huge:sn*tiny*tiny;
+ if(ix>0x3f800007) return (hy>0)? sn*huge*huge:sn*tiny*tiny;
+ /* now |1-x| is tiny <= 2**-20, suffice to compute
+ log(x) by x-x^2/2+x^3/3-x^4/4 */
+ t = ax-1; /* t has 20 trailing zeros */
+ w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
+ u = ivln2_h*t; /* ivln2_h has 16 sig. bits */
+ v = t*ivln2_l-w*ivln2;
+ t1 = u+v;
+ GEN_OCL_GET_FLOAT_WORD(is,t1);
+ GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+ t2 = v-(t1-u);
+ } else {
+ float s2,s_h,s_l,t_h,t_l;
+ n = 0;
+ /* take care subnormal number */
+ //if(ix<0x00800000)
+ //{ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
+ n += ((ix)>>23)-0x7f;
+ j = ix&0x007fffff;
+ /* determine interval */
+ ix = j|0x3f800000; /* normalize ix */
+ if(j<=0x1cc471) k=0; /* |x|<sqrt(3/2) */
+ else if(j<0x5db3d7) k=1; /* |x|<sqrt(3) */
+ else {k=0;n+=1;ix -= 0x00800000;}
+ GEN_OCL_SET_FLOAT_WORD(ax,ix);
+
+ /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+ u = ax-bp[k]; /* bp[0]=1.0, bp[1]=1.5 */
+ v = one/(ax+bp[k]);
+ s = u*v;
+ s_h = s;
+ GEN_OCL_GET_FLOAT_WORD(is,s_h);
+ GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+ /* t_h=ax+bp[k] High */
+ is = ((ix>>1)&0xfffff000)|0x20000000;
+ GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
+ t_l = ax - (t_h-bp[k]);
+ s_l = v*((u-s_h*t_h)-s_h*t_l);
+ /* compute log(ax) */
+ s2 = s*s;
+ r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
+ r += s_l*(s_h+s);
+ s2 = s_h*s_h;
+ t_h = 3.0f+s2+r;
+ GEN_OCL_GET_FLOAT_WORD(is,t_h);
+ GEN_OCL_SET_FLOAT_WORD(t_h,is&0xfffff000);
+ t_l = r-((t_h-3.0f)-s2);
+ /* u+v = s*(1+...) */
+ u = s_h*t_h;
+ v = s_l*t_h+t_l*s;
+ /* 2/(3log2)*(s+...) */
+ p_h = u+v;
+ GEN_OCL_GET_FLOAT_WORD(is,p_h);
+ GEN_OCL_SET_FLOAT_WORD(p_h,is&0xfffff000);
+ p_l = v-(p_h-u);
+ z_h = cp_h*p_h; /* cp_h+cp_l = 2/(3*log2) */
+ z_l = cp_l*p_h+p_l*cp+dp_l[k];
+ /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+ t = (float)n;
+ t1 = (((z_h+z_l)+dp_h[k])+t);
+ GEN_OCL_GET_FLOAT_WORD(is,t1);
+ GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+ t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+ }
+
+ /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
+ GEN_OCL_GET_FLOAT_WORD(is,y);
+ GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
+ p_l = (y-y1)*t1+y*t2;
+ p_h = y1*t1;
+ z = p_l+p_h;
+ GEN_OCL_GET_FLOAT_WORD(j,z);
+ if (j>0x43000000) /* if z > 128 */
+ return sn*huge*huge; /* overflow */
+ else if (j==0x43000000) { /* if z == 128 */
+ if(p_l+ovt>z-p_h) return sn*huge*huge; /* overflow */
+ }
+ else if ((j&0x7fffffff)>0x43160000) /* z <= -150 */
+ return sn*tiny*tiny; /* underflow */
+ else if (j==0xc3160000){ /* z == -150 */
+ if(p_l<=z-p_h) return sn*tiny*tiny; /* underflow */
+ }
+
+ /*
+ * compute 2**(p_h+p_l)
+ */
+ i = j&0x7fffffff;
+ k = (i>>23)-0x7f;
+ n = 0;
+ if(i>0x3f000000) { /* if |z| > 0.5, set n = [z+0.5] */
+ n = j+(0x00800000>>(k+1));
+ k = ((n&0x7fffffff)>>23)-0x7f; /* new k for n */
+ GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
+ n = ((n&0x007fffff)|0x00800000)>>(23-k);
+ if(j<0) n = -n;
+ p_h -= t;
+ }
+ t = p_l+p_h;
+ GEN_OCL_GET_FLOAT_WORD(is,t);
+ GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
+ u = t*lg2_h;
+ v = (p_l-(t-p_h))*lg2+t*lg2_l;
+ z = u+v;
+ w = v-(z-u);
+ t = z*z;
+ t1 = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+ r = (z*t1)/(t1-two)-(w+z*w);
+ z = one-(r-z);
+ GEN_OCL_GET_FLOAT_WORD(j,z);
+ j += (n<<23);
+ if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n); /* subnormal output */
+ else GEN_OCL_SET_FLOAT_WORD(z,j);
+ return sn*z;
+}
+
+
+INLINE_OVERLOADABLE float hypot(float x, float y) {
+ //return __gen_ocl_sqrt(x*x + y*y);
+ float a,b,an,bn,cn;
+ int e;
+ if (isfinite (x) && isfinite (y)){ /* Determine absolute values. */
+ x = __gen_ocl_fabs (x);
+ y = __gen_ocl_fabs (y);
+ /* Find the bigger and the smaller one. */
+ a = max(x,y);
+ b = min(x,y);
+ /* Now 0 <= b <= a. */
+ /* Write a = an * 2^e, b = bn * 2^e with 0 <= bn <= an < 1. */
+ an = frexp (a, &e);
+ bn = ldexp (b, - e);
+ /* Through the normalization, no unneeded overflow or underflow will occur here. */
+ cn = __gen_ocl_sqrt (an * an + bn * bn);
+ return ldexp (cn, e);
+ }else{
+ if (isinf (x) || isinf (y)) /* x or y is infinite. Return +Infinity. */
+ return INFINITY;
+ else /* x or y is NaN. Return NaN. */
+ return x + y;
+ }
+}
+
+#define BODY \
+ if (isnan(x)) { \
+ *p = x; \
+ return x; \
+ } \
+ *p = __gen_ocl_internal_floor(x); \
+ if (isinf(x)) { \
+ return x > 0 ? +0. : -0.; \
+ } \
+ return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
+INLINE_OVERLOADABLE float fract(float x, global float *p) { BODY; }
+INLINE_OVERLOADABLE float fract(float x, local float *p) { BODY; }
+INLINE_OVERLOADABLE float fract(float x, private float *p) { BODY; }
+#undef BODY
+
+#define BODY \
+ float Zero[2]; \
+ int n,hx,hy,hz,ix,iy,sx,i,sy; \
+ uint q,sxy; \
+ Zero[0] = 0.0;Zero[1] = -0.0; \
+ GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_GET_FLOAT_WORD(hy,y); \
+ sxy = (hx ^ hy) & 0x80000000;sx = hx&0x80000000;sy = hy&0x80000000; \
+ hx ^=sx; hy &= 0x7fffffff; \
+ if (hx < 0x00800000)hx = 0;if (hy < 0x00800000)hy = 0; \
+ if(hy==0||hx>=0x7f800000||hy>0x7f800000){ \
+ *quo = 0;return NAN; \
+ } \
+ if( hy == 0x7F800000 || hx == 0 ) { \
+ *quo = 0;return x; \
+ } \
+ if( hx == hy ) { \
+ *quo = (x == y) ? 1 : -1; \
+ return sx ? -0.0 : 0.0; \
+ } \
+ if(hx<hy) { \
+ q = 0; \
+ goto fixup; \
+ } else if(hx==hy) { \
+ *quo = (sxy ? -1 : 1); \
+ return Zero[(uint)sx>>31]; \
+ } \
+ ix = (hx>>23)-127; \
+ iy = (hy>>23)-127; \
+ hx = 0x00800000|(0x007fffff&hx); \
+ hy = 0x00800000|(0x007fffff&hy); \
+ n = ix - iy; \
+ q = 0; \
+ while(n--) { \
+ hz=hx-hy; \
+ if(hz<0) hx = hx << 1; \
+ else {hx = hz << 1; q++;} \
+ q <<= 1; \
+ } \
+ hz=hx-hy; \
+ if(hz>=0) {hx=hz;q++;} \
+ if(hx==0) { \
+ q &= 0x0000007f; \
+ *quo = (sxy ? -q : q); \
+ return Zero[(uint)sx>>31]; \
+ } \
+ while(hx<0x00800000) { \
+ hx <<= 1;iy -= 1; \
+ } \
+ if(iy>= -126) { \
+ hx = ((hx-0x00800000)|((iy+127)<<23)); \
+ } else {\
+ n = -126 - iy; \
+ hx >>= n; \
+ } \
+fixup: \
+ GEN_OCL_SET_FLOAT_WORD(x,hx); \
+ if(hx<0x00800000){ \
+ GEN_OCL_GET_FLOAT_WORD(hy,y); \
+ hy &= 0x7fffffff; \
+ if(hx+hx > hy ||(hx+hx==hy && (q & 1)))q++; \
+ x = 0; \
+ }else{ \
+ y = __gen_ocl_fabs(y); \
+ if (y < 0x1p-125f) { \
+ if (x+x>y || (x+x==y && (q & 1))) { \
+ q++;x-=y; \
+ } \
+ }else if (x>0.5f*y || (x==0.5f*y && (q & 1))) { \
+ q++;x-=y; \
+ } \
+ GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_SET_FLOAT_WORD(x,hx^sx); \
+ } \
+ int sign = sx==sy?0:1; \
+ q &= 0x0000007f; \
+ *quo = (sign ? -q : q); \
+ return x;
+
+INLINE_OVERLOADABLE float remquo(float x, float y, global int *quo) {
+ BODY;
+}
+INLINE_OVERLOADABLE float remquo(float x, float y, local int *quo) { BODY; }
+INLINE_OVERLOADABLE float remquo(float x, float y, private int *quo) { BODY; }
+#undef BODY
+INLINE_OVERLOADABLE float native_divide(float x, float y) { return x/y; }
+INLINE_OVERLOADABLE float pown(float x, int n) {
+ if (x == 0 && n == 0)
+ return 1;
+ return powr(x, n);
+}
+
+INLINE_OVERLOADABLE float internal_rootn(float x, int n, const bool isFastpath)
+{
+ float ax,re;
+ int sign = 0;
+ if( n == 0 )return NAN;
+ //rootn ( x, n ) returns a NaN for x < 0 and n is even.
+ if( x < 0 && 0 == (n&1) )
+ return NAN;
+ if( x == 0.0 ){
+ switch( n & 0x80000001 ){
+ //rootn ( +-0, n ) is +0 for even n > 0.
+ case 0:
+ return 0.0f;
+ //rootn ( +-0, n ) is +-0 for odd n > 0.
+ case 1:
+ return x;
+ //rootn ( +-0, n ) is +inf for even n < 0.
+ case 0x80000000:
+ return INFINITY;
+
+ //rootn ( +-0, n ) is +-inf for odd n < 0.
+ case 0x80000001:
+ return __gen_ocl_internal_copysign(INFINITY, x);
+ }
+ }
+ ax = __gen_ocl_fabs(x);
+ if(x <0.0f && (n&1))
+ sign = 1;
+ if (isFastpath)
+ re = __gen_ocl_pow(ax,1.f/n);
+ else
+ re = __gen_ocl_internal_pow(ax,1.f/n);
+ if(sign)
+ re = -re;
+ return re;
+}
+
+INLINE_OVERLOADABLE float rootn(float x, int n) {
+ return internal_rootn(x, n, 0);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Geometric functions (see 6.11.5 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+INLINE_OVERLOADABLE float dot(float p0, float p1) {
+ return p0 * p1;
+}
+INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) {
+ return p0.x * p1.x + p0.y * p1.y;
+}
+INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) {
+ return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z;
+}
+INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) {
+ return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z + p0.w * p1.w;
+}
+INLINE_OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
+#define BODY \
+ if(m == 0) \
+ return 0; \
+ if(isinf(m)) \
+ return INFINITY; \
+ if(m < 1) \
+ m = 1; \
+ x /= m; \
+ return m * sqrt(dot(x,x));
+INLINE_OVERLOADABLE float length(float2 x) {
+ float m = max(__gen_ocl_fabs(x.s0), __gen_ocl_fabs(x.s1));
+ BODY;
+}
+INLINE_OVERLOADABLE float length(float3 x) {
+ float m = max(__gen_ocl_fabs(x.s0), max(__gen_ocl_fabs(x.s1), __gen_ocl_fabs(x.s2)));
+ BODY;
+}
+INLINE_OVERLOADABLE float length(float4 x) {
+ float m = max(__gen_ocl_fabs(x.s0), max(__gen_ocl_fabs(x.s1), max(__gen_ocl_fabs(x.s2), __gen_ocl_fabs(x.s3))));
+ BODY;
+}
+#undef BODY
+INLINE_OVERLOADABLE float distance(float x, float y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
+INLINE_OVERLOADABLE float normalize(float x) {
+ union { float f; unsigned u; } u;
+ u.f = x;
+ if(u.u == 0)
+ return 0.f;
+ if(isnan(x))
+ return NAN;
+ return u.u < 0x7fffffff ? 1.f : -1.f;
+}
+INLINE_OVERLOADABLE float2 normalize(float2 x) {
+ float m = length(x);
+ if(m == 0)
+ return 0;
+ return x / m;
+}
+INLINE_OVERLOADABLE float3 normalize(float3 x) {
+ float m = length(x);
+ if(m == 0)
+ return 0;
+ return x / m;
+}
+INLINE_OVERLOADABLE float4 normalize(float4 x) {
+ float m = length(x);
+ if(m == 0)
+ return 0;
+ return x / m;
+}
+
+INLINE_OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_normalize(float x) { return x > 0 ? 1.f : (x < 0 ? -1.f : 0.f); }
+INLINE_OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }
+
+INLINE_OVERLOADABLE float3 cross(float3 v0, float3 v1) {
+ return v0.yzx*v1.zxy-v0.zxy*v1.yzx;
+}
+INLINE_OVERLOADABLE float4 cross(float4 v0, float4 v1) {
+ return (float4)(v0.yzx*v1.zxy-v0.zxy*v1.yzx, 0.f);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Vector loads and stores
+/////////////////////////////////////////////////////////////////////////////
+
+// These loads and stores will use untyped reads and writes, so we can just
+// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.
+// Well we do not care, we do not activate TBAA in the compiler
+#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
+INLINE_OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+ return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+} \
+INLINE_OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
+ *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
+}
+
+#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \
+INLINE_OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+ return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+}
+
+#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+ *(p + 3 * offset) = v.s0; \
+ *(p + 3 * offset + 1) = v.s1; \
+ *(p + 3 * offset + 2) = v.s2; \
+} \
+INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
+}
+
+#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
+}
+
+#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
+ DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
+ DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \
+ DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \
+ DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RW_ALL(TYPE) \
+ DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \
+ DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \
+ DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
+ DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
+
+#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##2)(*(p+2*offset), *(p+2*offset+1)); \
+} \
+INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##3)(*(p+3*offset), *(p+3*offset+1), *(p+3*offset+2)); \
+} \
+INLINE_OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##4)(vload2(2*offset, p), vload2(2*offset, p+2)); \
+} \
+INLINE_OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##8)(vload4(2*offset, p), vload4(2*offset, p+4)); \
+} \
+INLINE_OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##16)(vload8(2*offset, p), vload8(2*offset, p+8)); \
+}
+
+#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) {\
+ *(p + 2 * offset) = v.s0; \
+ *(p + 2 * offset + 1) = v.s1; \
+} \
+INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+ *(p + 3 * offset) = v.s0; \
+ *(p + 3 * offset + 1) = v.s1; \
+ *(p + 3 * offset + 2) = v.s2; \
+} \
+INLINE_OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \
+ vstore2(v.lo, 2*offset, p); \
+ vstore2(v.hi, 2*offset, p+2); \
+} \
+INLINE_OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \
+ vstore4(v.lo, 2*offset, p); \
+ vstore4(v.hi, 2*offset, p+4); \
+} \
+INLINE_OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
+ vstore8(v.lo, 2*offset, p); \
+ vstore8(v.hi, 2*offset, p+8); \
+}
+
+#define DECL_BYTE_RW_ALL(TYPE) \
+ DECL_BYTE_RD_SPACE(TYPE, __global) \
+ DECL_BYTE_RD_SPACE(TYPE, __local) \
+ DECL_BYTE_RD_SPACE(TYPE, __private) \
+ DECL_BYTE_RD_SPACE(TYPE, __constant) \
+ DECL_BYTE_WR_SPACE(TYPE, __global) \
+ DECL_BYTE_WR_SPACE(TYPE, __local) \
+ DECL_BYTE_WR_SPACE(TYPE, __private)
+
+DECL_BYTE_RW_ALL(char)
+DECL_BYTE_RW_ALL(uchar)
+DECL_BYTE_RW_ALL(short)
+DECL_BYTE_RW_ALL(ushort)
+DECL_UNTYPED_RW_ALL(int)
+DECL_UNTYPED_RW_ALL(uint)
+DECL_UNTYPED_RW_ALL(long)
+DECL_UNTYPED_RW_ALL(ulong)
+DECL_UNTYPED_RW_ALL(float)
+DECL_UNTYPED_RW_ALL(double)
+
+#undef DECL_UNTYPED_RW_ALL
+#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RD_ALL_SPACE
+#undef DECL_UNTYPED_RW_SPACE_N
+#undef DECL_UNTYPED_RD_SPACE_N
+#undef DECL_UNTYPED_V3_SPACE
+#undef DECL_UNTYPED_RDV3_SPACE
+#undef DECL_BYTE_RD_SPACE
+#undef DECL_BYTE_WR_SPACE
+#undef DECL_BYTE_RW_ALL
+
+PURE CONST float __gen_ocl_f16to32(short h);
+PURE CONST short __gen_ocl_f32to16(float f);
+
+INLINE_OVERLOADABLE short f32to16_rtp(float f) {
+ short s = __gen_ocl_f32to16(f);
+ float con = __gen_ocl_f16to32(s);
+ //if(isinf(con)) return s;
+ if (f > con)
+ return s - signbit(f) * 2 + 1;
+ else
+ return s;
+}
+
+INLINE_OVERLOADABLE short f32to16_rtn(float f) {
+ short s = __gen_ocl_f32to16(f);
+ float con = __gen_ocl_f16to32(s);
+ //if(isinf(con)) return s;
+ if (con > f)
+ return s + signbit(f) * 2 - 1;
+ else
+ return s;
+}
+
+INLINE_OVERLOADABLE short f32to16_rtz(float f) {
+ short s = __gen_ocl_f32to16(f);
+ float con = __gen_ocl_f16to32(s);
+ //if(isinf(con)) return s;
+ if (((con > f) && !signbit(f)) ||
+ ((con < f) && signbit(f)))
+ return s - 1;
+ else
+ return s;
+}
+
+#define DECL_HALF_LD_SPACE(SPACE) \
+INLINE_OVERLOADABLE float vload_half(size_t offset, const SPACE half *p) { \
+ return __gen_ocl_f16to32(*(SPACE short *)(p + offset)); \
+} \
+INLINE_OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p) { \
+ return (float2)(vload_half(offset*2, p), \
+ vload_half(offset*2 + 1, p)); \
+} \
+INLINE_OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p) { \
+ return (float3)(vload_half(offset*3, p), \
+ vload_half(offset*3 + 1, p), \
+ vload_half(offset*3 + 2, p)); \
+} \
+INLINE_OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p) { \
+ return (float3)(vload_half(offset*4, p), \
+ vload_half(offset*4 + 1, p), \
+ vload_half(offset*4 + 2, p)); \
+} \
+INLINE_OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p) { \
+ return (float4)(vload_half2(offset*2, p), \
+ vload_half2(offset*2 + 1, p)); \
+} \
+INLINE_OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p) { \
+ return (float8)(vload_half4(offset*2, p), \
+ vload_half4(offset*2 + 1, p)); \
+} \
+INLINE_OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p) { \
+ return (float16)(vload_half8(offset*2, p), \
+ vload_half8(offset*2 + 1, p)); \
+}
+
+#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
+INLINE_OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p) { \
+ *(SPACE short *)(p + offset) = FUNC(data); \
+} \
+INLINE_OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p) { \
+ vstore_half##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+ vstore_half##ROUND(data.lo, offset*2, p); \
+ vstore_half##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+ vstore_half2##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+ vstore_half##ROUND(data.s0, offset*3, p); \
+ vstore_half##ROUND(data.s1, offset*3 + 1, p); \
+ vstore_half##ROUND(data.s2, offset*3 + 2, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+ vstore_half##ROUND(data.s0, offset*4, p); \
+ vstore_half##ROUND(data.s1, offset*4 + 1, p); \
+ vstore_half##ROUND(data.s2, offset*4 + 2, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+ vstore_half2##ROUND(data.lo, offset*2, p); \
+ vstore_half2##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+ vstore_half4##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+ vstore_half4##ROUND(data.lo, offset*2, p); \
+ vstore_half4##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+ vstore_half8##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+ vstore_half8##ROUND(data.lo, offset*2, p); \
+ vstore_half8##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+ vstore_half16##ROUND(data, offset, p); \
+}
+
+#define DECL_HALF_ST_SPACE(SPACE) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, , __gen_ocl_f32to16) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, __gen_ocl_f32to16) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, f32to16_rtz) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, f32to16_rtp) \
+ DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, f32to16_rtn) \
+
+DECL_HALF_LD_SPACE(__global)
+DECL_HALF_LD_SPACE(__local)
+DECL_HALF_LD_SPACE(__constant)
+DECL_HALF_LD_SPACE(__private)
+
+DECL_HALF_ST_SPACE(__global)
+DECL_HALF_ST_SPACE(__local)
+DECL_HALF_ST_SPACE(__private)
+
+//#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_HALF_LD_SPACE
+#undef DECL_HALF_ST_SPACE
+#undef DECL_HALF_ST_SPACE_ROUND
+
+#define vloada_half vload_half
+#define vloada_half2 vload_half2
+#define vloada_half4 vload_half4
+#define vloada_half8 vload_half8
+#define vloada_half16 vload_half16
+
+// XXX workaround ptx profile
+#define fabs __gen_ocl_internal_fabs
+#define trunc __gen_ocl_internal_trunc
+#define round __gen_ocl_internal_round
+#define floor __gen_ocl_internal_floor
+#define ceil __gen_ocl_internal_ceil
+#define log __gen_ocl_internal_log
+#define log2 __gen_ocl_internal_log2
+#define log10 __gen_ocl_internal_log10
+#define exp __gen_ocl_internal_exp
+#define exp2 native_exp2
+#define exp10 __gen_ocl_internal_exp10
+#define expm1 __gen_ocl_internal_expm1
+#define fmin __gen_ocl_internal_fmin
+#define fmax __gen_ocl_internal_fmax
+#define fma mad
+#define fdim __gen_ocl_internal_fdim
+#define maxmag __gen_ocl_internal_maxmag
+#define minmag __gen_ocl_internal_minmag
+
+/////////////////////////////////////////////////////////////////////////////
+// Miscellaneous Vector Functions (see 6.11.12 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+#define DEC2(TYPE, XTYPE, MASKTYPE) \
+ INLINE_OVERLOADABLE TYPE##2 shuffle(XTYPE x, MASKTYPE##2 mask) { \
+ TYPE##2 y; \
+ y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
+ y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
+ return y; \
+ }
+
+#define DEC4(TYPE, XTYPE, MASKTYPE) \
+ INLINE_OVERLOADABLE TYPE##4 shuffle(XTYPE x, MASKTYPE##4 mask) { \
+ TYPE##4 y; \
+ y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
+ y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
+ y.s2 = ((TYPE *) &x)[mask.s2 & (vec_step(x) - 1)]; \
+ y.s3 = ((TYPE *) &x)[mask.s3 & (vec_step(x) - 1)]; \
+ return y; \
+ }
+
+#define DEC8(TYPE, XTYPE, MASKTYPE) \
+ INLINE_OVERLOADABLE TYPE##8 shuffle(XTYPE x, MASKTYPE##8 mask) { \
+ TYPE##8 y; \
+ y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
+ y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
+ y.s2 = ((TYPE *) &x)[mask.s2 & (vec_step(x) - 1)]; \
+ y.s3 = ((TYPE *) &x)[mask.s3 & (vec_step(x) - 1)]; \
+ y.s4 = ((TYPE *) &x)[mask.s4 & (vec_step(x) - 1)]; \
+ y.s5 = ((TYPE *) &x)[mask.s5 & (vec_step(x) - 1)]; \
+ y.s6 = ((TYPE *) &x)[mask.s6 & (vec_step(x) - 1)]; \
+ y.s7 = ((TYPE *) &x)[mask.s7 & (vec_step(x) - 1)]; \
+ return y; \
+ }
+
+#define DEC16(TYPE, XTYPE, MASKTYPE) \
+ INLINE_OVERLOADABLE TYPE##16 shuffle(XTYPE x, MASKTYPE##16 mask) { \
+ TYPE##16 y; \
+ y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
+ y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
+ y.s2 = ((TYPE *) &x)[mask.s2 & (vec_step(x) - 1)]; \
+ y.s3 = ((TYPE *) &x)[mask.s3 & (vec_step(x) - 1)]; \
+ y.s4 = ((TYPE *) &x)[mask.s4 & (vec_step(x) - 1)]; \
+ y.s5 = ((TYPE *) &x)[mask.s5 & (vec_step(x) - 1)]; \
+ y.s6 = ((TYPE *) &x)[mask.s6 & (vec_step(x) - 1)]; \
+ y.s7 = ((TYPE *) &x)[mask.s7 & (vec_step(x) - 1)]; \
+ y.s8 = ((TYPE *) &x)[mask.s8 & (vec_step(x) - 1)]; \
+ y.s9 = ((TYPE *) &x)[mask.s9 & (vec_step(x) - 1)]; \
+ y.sa = ((TYPE *) &x)[mask.sa & (vec_step(x) - 1)]; \
+ y.sb = ((TYPE *) &x)[mask.sb & (vec_step(x) - 1)]; \
+ y.sc = ((TYPE *) &x)[mask.sc & (vec_step(x) - 1)]; \
+ y.sd = ((TYPE *) &x)[mask.sd & (vec_step(x) - 1)]; \
+ y.se = ((TYPE *) &x)[mask.se & (vec_step(x) - 1)]; \
+ y.sf = ((TYPE *) &x)[mask.sf & (vec_step(x) - 1)]; \
+ return y; \
+ }
+
+#define DEFMASK(TYPE, MASKTYPE) \
+ DEC2(TYPE, TYPE##2, MASKTYPE); DEC2(TYPE, TYPE##4, MASKTYPE); DEC2(TYPE, TYPE##8, MASKTYPE); DEC2(TYPE, TYPE##16, MASKTYPE) \
+ DEC4(TYPE, TYPE##2, MASKTYPE); DEC4(TYPE, TYPE##4, MASKTYPE); DEC4(TYPE, TYPE##8, MASKTYPE); DEC4(TYPE, TYPE##16, MASKTYPE) \
+ DEC8(TYPE, TYPE##2, MASKTYPE); DEC8(TYPE, TYPE##4, MASKTYPE); DEC8(TYPE, TYPE##8, MASKTYPE); DEC8(TYPE, TYPE##16, MASKTYPE) \
+ DEC16(TYPE, TYPE##2, MASKTYPE); DEC16(TYPE, TYPE##4, MASKTYPE); DEC16(TYPE, TYPE##8, MASKTYPE); DEC16(TYPE, TYPE##16, MASKTYPE)
+
+#define DEF(TYPE) \
+ DEFMASK(TYPE, uchar) \
+ DEFMASK(TYPE, ushort) \
+ DEFMASK(TYPE, uint) \
+ DEFMASK(TYPE, ulong)
+
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+DEF(long)
+DEF(ulong)
+#undef DEF
+#undef DEFMASK
+#undef DEC2
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+#define DEC2(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+ INLINE_OVERLOADABLE TYPE##2 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##2 mask) { \
+ return shuffle((TEMPTYPE)(x, y), mask); \
+ }
+
+#define DEC2X(TYPE, MASKTYPE) \
+ INLINE_OVERLOADABLE TYPE##2 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##2 mask) { \
+ TYPE##2 z; \
+ z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+ z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+ return z; \
+ }
+
+#define DEC4(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+ INLINE_OVERLOADABLE TYPE##4 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##4 mask) { \
+ return shuffle((TEMPTYPE)(x, y), mask); \
+ }
+
+#define DEC4X(TYPE, MASKTYPE) \
+ INLINE_OVERLOADABLE TYPE##4 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##4 mask) { \
+ TYPE##4 z; \
+ z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+ z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+ z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
+ z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+ return z; \
+ }
+
+#define DEC8(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+ INLINE_OVERLOADABLE TYPE##8 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##8 mask) { \
+ return shuffle((TEMPTYPE)(x, y), mask); \
+ }
+
+#define DEC8X(TYPE, MASKTYPE) \
+ INLINE_OVERLOADABLE TYPE##8 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##8 mask) { \
+ TYPE##8 z; \
+ z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+ z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+ z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
+ z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+ z.s4 = mask.s4 < 16 ? ((TYPE *)&x)[mask.s4] : ((TYPE *)&y)[mask.s4 & 15]; \
+ z.s5 = mask.s5 < 16 ? ((TYPE *)&x)[mask.s5] : ((TYPE *)&y)[mask.s5 & 15]; \
+ z.s6 = mask.s6 < 16 ? ((TYPE *)&x)[mask.s6] : ((TYPE *)&y)[mask.s6 & 15]; \
+ z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
+ return z; \
+ }
+
+#define DEC16(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+ INLINE_OVERLOADABLE TYPE##16 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##16 mask) { \
+ return shuffle((TEMPTYPE)(x, y), mask); \
+ }
+
+#define DEC16X(TYPE, MASKTYPE) \
+ INLINE_OVERLOADABLE TYPE##16 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##16 mask) { \
+ TYPE##16 z; \
+ z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+ z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+ z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
+ z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+ z.s4 = mask.s4 < 16 ? ((TYPE *)&x)[mask.s4] : ((TYPE *)&y)[mask.s4 & 15]; \
+ z.s5 = mask.s5 < 16 ? ((TYPE *)&x)[mask.s5] : ((TYPE *)&y)[mask.s5 & 15]; \
+ z.s6 = mask.s6 < 16 ? ((TYPE *)&x)[mask.s6] : ((TYPE *)&y)[mask.s6 & 15]; \
+ z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
+ z.s8 = mask.s8 < 16 ? ((TYPE *)&x)[mask.s8] : ((TYPE *)&y)[mask.s8 & 15]; \
+ z.s9 = mask.s9 < 16 ? ((TYPE *)&x)[mask.s9] : ((TYPE *)&y)[mask.s9 & 15]; \
+ z.sa = mask.sa < 16 ? ((TYPE *)&x)[mask.sa] : ((TYPE *)&y)[mask.sa & 15]; \
+ z.sb = mask.sb < 16 ? ((TYPE *)&x)[mask.sb] : ((TYPE *)&y)[mask.sb & 15]; \
+ z.sc = mask.sc < 16 ? ((TYPE *)&x)[mask.sc] : ((TYPE *)&y)[mask.sc & 15]; \
+ z.sd = mask.sd < 16 ? ((TYPE *)&x)[mask.sd] : ((TYPE *)&y)[mask.sd & 15]; \
+ z.se = mask.se < 16 ? ((TYPE *)&x)[mask.se] : ((TYPE *)&y)[mask.se & 15]; \
+ z.sf = mask.sf < 16 ? ((TYPE *)&x)[mask.sf] : ((TYPE *)&y)[mask.sf & 15]; \
+ return z; \
+ }
+
+#define DEFMASK(TYPE, MASKTYPE) \
+ DEC2(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+ DEC2(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+ DEC2(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+ DEC2X(TYPE, MASKTYPE) \
+ DEC4(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+ DEC4(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+ DEC4(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+ DEC4X(TYPE, MASKTYPE) \
+ DEC8(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+ DEC8(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+ DEC8(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+ DEC8X(TYPE, MASKTYPE) \
+ DEC16(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+ DEC16(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+ DEC16(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+ DEC16X(TYPE, MASKTYPE)
+
+#define DEF(TYPE) \
+ DEFMASK(TYPE, uchar) \
+ DEFMASK(TYPE, ushort) \
+ DEFMASK(TYPE, uint) \
+ DEFMASK(TYPE, ulong)
+
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+DEF(long)
+DEF(ulong)
+#undef DEF
+#undef DEFMASK
+#undef DEC2
+#undef DEC2X
+#undef DEC4
+#undef DEC4X
+#undef DEC8
+#undef DEC8X
+#undef DEC16
+#undef DEC16X
+/////////////////////////////////////////////////////////////////////////////
+// Synchronization functions
+/////////////////////////////////////////////////////////////////////////////
+#define CLK_LOCAL_MEM_FENCE (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+void __gen_ocl_barrier_local(void);
+void __gen_ocl_barrier_global(void);
+void __gen_ocl_barrier_local_and_global(void);
+
+typedef uint cl_mem_fence_flags;
+void barrier(cl_mem_fence_flags flags);
+
+INLINE void mem_fence(cl_mem_fence_flags flags) {
+}
+INLINE void read_mem_fence(cl_mem_fence_flags flags) {
+}
+INLINE void write_mem_fence(cl_mem_fence_flags flags) {
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Async Copies and prefetch
+/////////////////////////////////////////////////////////////////////////////
+#define BODY(SRC_STRIDE, DST_STRIDE) \
+ uint size = get_local_size(2) * get_local_size(1) * get_local_size(0); \
+ uint count = num / size; \
+ uint offset = get_local_id(2) * get_local_size(1) + get_local_id(1); \
+ offset = offset * get_local_size(0) + get_local_id(0); \
+ for(uint i=0; i<count; i+=1) { \
+ *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+ offset += size; \
+ } \
+ if(offset < num) \
+ *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+ return 0;
+
+#define DEFN(TYPE) \
+INLINE_OVERLOADABLE event_t async_work_group_copy (local TYPE *dst, const global TYPE *src, \
+ size_t num, event_t event) { \
+ BODY(1, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_copy (global TYPE *dst, const local TYPE *src, \
+ size_t num, event_t event) { \
+ BODY(1, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_strided_copy (local TYPE *dst, const global TYPE *src, \
+ size_t num, size_t src_stride, event_t event) { \
+ BODY(src_stride, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_strided_copy (global TYPE *dst, const local TYPE *src, \
+ size_t num, size_t dst_stride, event_t event) { \
+ BODY(1, dst_stride); \
+}
+#define DEF(TYPE) \
+ DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16);
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(long)
+DEF(ulong)
+DEF(float)
+DEF(double)
+#undef BODY
+#undef DEFN
+#undef DEF
+
+INLINE void wait_group_events (int num_events, event_t *event_list) {
+ barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+}
+
+#define DEFN(TYPE) \
+INLINE_OVERLOADABLE void prefetch(const global TYPE *p, size_t num) { }
+#define DEF(TYPE) \
+DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16)
+DEF(char);
+DEF(uchar);
+DEF(short);
+DEF(ushort);
+DEF(int);
+DEF(uint);
+DEF(long);
+DEF(ulong);
+DEF(float);
+#undef DEFN
+#undef DEF
+
+/////////////////////////////////////////////////////////////////////////////
+// Atomic functions
+/////////////////////////////////////////////////////////////////////////////
+OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__global uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__local uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX) \
+ INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+ return (TYPE)__gen_ocl_##PREFIX##NAME((SPACE uint *)p, val); \
+ }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX)
+
+#define DECL_ATOMIC_OP(NAME) \
+ DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_) \
+ DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
+
+DECL_ATOMIC_OP(add)
+DECL_ATOMIC_OP(sub)
+DECL_ATOMIC_OP(and)
+DECL_ATOMIC_OP(or)
+DECL_ATOMIC_OP(xor)
+DECL_ATOMIC_OP(xchg)
+DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
+DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
+
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX) \
+ INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+ return as_float(__gen_ocl_##PREFIX##NAME((SPACE uint *)p, as_uint(val))); \
+ }
+DECL_ATOMIC_OP_SPACE(xchg, float, __global, atomic_)
+DECL_ATOMIC_OP_SPACE(xchg, float, __local, atomic_)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
+ INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p) { \
+ return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p); \
+ }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+ DECL_ATOMIC_OP_TYPE(NAME, uint) \
+ DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(inc)
+DECL_ATOMIC_OP(dec)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
+ INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE cmp, TYPE val) { \
+ return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p, (uint)cmp, (uint)val); \
+ }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+ DECL_ATOMIC_OP_TYPE(NAME, uint) \
+ DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(cmpxchg)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+// XXX for conformance test
+// The following atom_xxx api is on OpenCL spec 1.0.
+// But the conformance test suite will test them anyway.
+#define atom_add atomic_add
+#define atom_sub atomic_sub
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+#define atom_xchg atomic_xchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+
+/////////////////////////////////////////////////////////////////////////////
+// Force the compilation to SIMD8 or SIMD16
+/////////////////////////////////////////////////////////////////////////////
+
+int __gen_ocl_force_simd8(void);
+int __gen_ocl_force_simd16(void);
+
+#define NULL ((void*)0)
+
+// ##BEGIN_COMMON_DEFINES##
+// ##END_COMMON_DEFINES##
+
+/////////////////////////////////////////////////////////////////////////////
+// Image access functions
+/////////////////////////////////////////////////////////////////////////////
+
+// 1D read
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
+
+// 2D & 1D Array read
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+
+// 3D & 2D Array read
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+
+// 1D write
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, float4 color);
+
+// 2D & 1D Array write
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, float4 color);
+
+// 3D & 2D Array write
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int w, int4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color);
+
+int __gen_ocl_get_image_width(uint surface_id);
+int __gen_ocl_get_image_height(uint surface_id);
+int __gen_ocl_get_image_channel_data_type(uint surface_id);
+int __gen_ocl_get_image_channel_order(uint surface_id);
+int __gen_ocl_get_image_depth(uint surface_id);
+/* The printf function. */
+/* From LLVM 3.4, c string are all in constant address space */
+#if 100*__clang_major__ + __clang_minor__ < 304
+int __gen_ocl_printf_stub(const char * format, ...);
+#else
+int __gen_ocl_printf_stub(constant char * format, ...);
+#endif
+#define printf __gen_ocl_printf_stub
+
+// 2D 3D Image Common Macro
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+#define GEN_FIX_1 1
+#else
+#define GEN_FIX_1 0
+#endif
+
+#define GET_IMAGE(cl_image, surface_id) \
+ uint surface_id = (uint)cl_image
+INLINE_OVERLOADABLE float __gen_compute_array_index(const float index, image1d_array_t image)
+{
+ GET_IMAGE(image, surface_id);
+ float array_size = __gen_ocl_get_image_depth(surface_id);
+ return clamp(rint(index), 0.f, array_size - 1.f);
+}
+
+INLINE_OVERLOADABLE float __gen_compute_array_index(float index, image2d_array_t image)
+{
+ GET_IMAGE(image, surface_id);
+ float array_size = __gen_ocl_get_image_depth(surface_id);
+ return clamp(rint(index), 0.f, array_size - 1.f);
+}
+
+INLINE_OVERLOADABLE int __gen_compute_array_index(int index, image1d_array_t image)
+{
+ GET_IMAGE(image, surface_id);
+ int array_size = __gen_ocl_get_image_depth(surface_id);
+ return clamp(index, 0, array_size - 1);
+}
+
+INLINE_OVERLOADABLE int __gen_compute_array_index(int index, image2d_array_t image)
+{
+ GET_IMAGE(image, surface_id);
+ int array_size = __gen_ocl_get_image_depth(surface_id);
+ return clamp(index, 0, array_size - 1);
+}
+
+#define DECL_READ_IMAGE0(int_clamping_fix, \
+ image_type, type, suffix, coord_type, n) \
+ INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, \
+ const sampler_t sampler, \
+ coord_type coord) \
+ { \
+ GET_IMAGE(cl_image, surface_id); \
+ GET_IMAGE_ARRAY_SIZE(cl_image, coord, int, ai); \
+ if (int_clamping_fix && \
+ ((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) && \
+ ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) \
+ return __gen_ocl_read_image ##suffix( \
+ EXPEND_READ_COORD(surface_id, sampler, coord)); \
+ return __gen_ocl_read_image ##suffix( \
+ EXPEND_READ_COORDF(surface_id, sampler, coord), 0); \
+ }
+
+#define DECL_READ_IMAGE1(float_coord_rounding_fix, int_clamping_fix, \
+ image_type, type, suffix, coord_type, n) \
+ INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, \
+ const sampler_t sampler, \
+ coord_type coord) \
+ { \
+ GET_IMAGE(cl_image, surface_id); \
+ GET_IMAGE_ARRAY_SIZE(cl_image, coord, float, ai) \
+ coord_type tmpCoord = coord; \
+ if (float_coord_rounding_fix | int_clamping_fix) { \
+ if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) \
+ && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) { \
+ if (float_coord_rounding_fix \
+ && ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0)) { \
+ FIXUP_FLOAT_COORD(tmpCoord); \
+ } \
+ if (int_clamping_fix) { \
+ coord_type intCoord; \
+ if (sampler & CLK_NORMALIZED_COORDS_TRUE) { \
+ DENORMALIZE_COORD(surface_id, intCoord, tmpCoord); \
+ } else \
+ intCoord = tmpCoord; \
+ return __gen_ocl_read_image ##suffix( \
+ EXPEND_READ_COORDI(surface_id, sampler, intCoord));\
+ } \
+ } \
+ } \
+ return __gen_ocl_read_image ##suffix( \
+ EXPEND_READ_COORDF(surface_id, sampler, tmpCoord), 0);\
+ }
+
+#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type, n) \
+ INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, \
+ coord_type coord) \
+ { \
+ GET_IMAGE(cl_image, surface_id); \
+ GET_IMAGE_ARRAY_SIZE(cl_image, coord, int, ai) \
+ return __gen_ocl_read_image ##suffix( \
+ EXPEND_READ_COORDF(surface_id, \
+ CLK_NORMALIZED_COORDS_FALSE \
+ | CLK_ADDRESS_NONE \
+ | CLK_FILTER_NEAREST, (float)coord), 0); \
+ }
+
+#define DECL_WRITE_IMAGE(image_type, type, suffix, coord_type) \
+ INLINE_OVERLOADABLE void write_image ##suffix(image_type cl_image, coord_type coord, type color)\
+ {\
+ GET_IMAGE(cl_image, surface_id);\
+ __gen_ocl_write_image ##suffix(EXPEND_WRITE_COORD(surface_id, coord, color));\
+ }
+
+#define DECL_IMAGE_INFO_COMMON(image_type) \
+ INLINE_OVERLOADABLE int get_image_channel_data_type(image_type image)\
+ { \
+ GET_IMAGE(image, surface_id);\
+ return __gen_ocl_get_image_channel_data_type(surface_id); \
+ }\
+ INLINE_OVERLOADABLE int get_image_channel_order(image_type image)\
+ { \
+ GET_IMAGE(image, surface_id);\
+ return __gen_ocl_get_image_channel_order(surface_id); \
+ } \
+ INLINE_OVERLOADABLE int get_image_width(image_type image) \
+ { \
+ GET_IMAGE(image, surface_id); \
+ return __gen_ocl_get_image_width(surface_id); \
+ }
+
+// 1D
+#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix) \
+ DECL_READ_IMAGE0(int_clamping_fix, image_type, type, suffix, int, 1) \
+ DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float, 1) \
+ DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int, 1) \
+ DECL_WRITE_IMAGE(image_type, type, suffix, int) \
+ DECL_WRITE_IMAGE(image_type, type, suffix, float)
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord < 0 ? -1 : coord), 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord = srcCoord * __gen_ocl_get_image_width(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord, color
+#define GET_IMAGE_ARRAY_SIZE(a,b,c,d)
+
+#define FIXUP_FLOAT_COORD(tmpCoord) \
+ { \
+ if (tmpCoord < 0 && tmpCoord > -0x1p-20f) \
+ tmpCoord += -0x1p-9; \
+ }
+
+DECL_IMAGE(GEN_FIX_1, image1d_t, int4, i)
+DECL_IMAGE(GEN_FIX_1, image1d_t, uint4, ui)
+DECL_IMAGE(0, image1d_t, float4, f)
+DECL_IMAGE(GEN_FIX_1, image1d_buffer_t, int4, i)
+DECL_IMAGE(GEN_FIX_1, image1d_buffer_t, uint4, ui)
+DECL_IMAGE(0, image1d_buffer_t, float4, f)
+
+// 1D Info
+DECL_IMAGE_INFO_COMMON(image1d_t)
+DECL_IMAGE_INFO_COMMON(image1d_buffer_t)
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef DECL_IMAGE
+// End of 1D
+
+#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n) \
+ DECL_READ_IMAGE0(int_clamping_fix, image_type, type, suffix, int ##n, n) \
+ DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n, n) \
+ DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n, n) \
+ DECL_WRITE_IMAGE(image_type, type, suffix, int ## n) \
+ DECL_WRITE_IMAGE(image_type, type, suffix, float ## n)
+// 2D
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), \
+ (int)(coord.s1 < 0 ? -1 : coord.s1), 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+ dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color
+
+#define FIXUP_FLOAT_COORD(tmpCoord) \
+ { \
+ if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f) \
+ tmpCoord.s0 += -0x1p-9; \
+ if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f) \
+ tmpCoord.s1 += -0x1p-9f; \
+ }
+
+DECL_IMAGE(GEN_FIX_1, image2d_t, int4, i, 2)
+DECL_IMAGE(GEN_FIX_1, image2d_t, uint4, ui, 2)
+DECL_IMAGE(0, image2d_t, float4, f, 2)
+
+// 1D Array
+#undef GET_IMAGE_ARRAY_SIZE
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, (int)0, ai, 2
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)ai
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), 0, (int)ai, 2
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, __gen_compute_array_index(coord.s1, cl_image), color
+#define GET_IMAGE_ARRAY_SIZE(image, coord, coord_type, ai) \
+ coord_type ai = __gen_compute_array_index(coord.s1, image);
+
+#define FIXUP_FLOAT_COORD(tmpCoord) \
+ { \
+ if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f) \
+ tmpCoord.s0 += -0x1p-9; \
+ }
+
+DECL_IMAGE(GEN_FIX_1, image1d_array_t, int4, i, 2)
+DECL_IMAGE(GEN_FIX_1, image1d_array_t, uint4, ui, 2)
+DECL_IMAGE(0, image1d_array_t, float4, f, 2)
+
+// 2D Info
+DECL_IMAGE_INFO_COMMON(image2d_t)
+INLINE_OVERLOADABLE int get_image_height(image2d_t image)
+{
+ GET_IMAGE(image, surface_id);
+ return __gen_ocl_get_image_height(surface_id);
+}
+INLINE_OVERLOADABLE int2 get_image_dim(image2d_t image)
+{
+ return (int2){get_image_width(image), get_image_height(image)};
+}
+
+// 1D Array info
+DECL_IMAGE_INFO_COMMON(image1d_array_t)
+INLINE_OVERLOADABLE size_t get_image_array_size(image1d_array_t image)
+{
+ GET_IMAGE(image, surface_id);
+ return __gen_ocl_get_image_depth(surface_id);
+}
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDI
+#undef EXPEND_READ_COORDF
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef GET_IMAGE_ARRAY_SIZE
+// End of 2D and 1D Array
+
+// 3D
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, coord.s2, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1, (float)coord.s2
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
+ (int)(coord.s1 < 0 ? -1 : coord.s1), (int)(coord.s2 < 0 ? -1 : coord.s2), 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+ dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id); \
+ dstCoord.z = srcCoord.z * __gen_ocl_get_image_depth(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, coord.s2, color
+
+#define FIXUP_FLOAT_COORD(tmpCoord) \
+ { \
+ if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20) \
+ tmpCoord.s0 += -0x1p-9; \
+ if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20) \
+ tmpCoord.s1 += -0x1p-9; \
+ if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20) \
+ tmpCoord.s2 += -0x1p-9; \
+ }
+#define GET_IMAGE_ARRAY_SIZE(a,b,c,d)
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 4)
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 4)
+DECL_IMAGE(0, image3d_t, float4, f, 4)
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 3)
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 3)
+DECL_IMAGE(0, image3d_t, float4, f, 3)
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef GET_IMAGE_ARRAY_SIZE
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, ai, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1, (float)ai
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
+ (int)(coord.s1 < 0 ? -1 : coord.s1), (int)ai, 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+ dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, __gen_compute_array_index(coord.s2, cl_image), color
+
+#define FIXUP_FLOAT_COORD(tmpCoord) \
+ { \
+ if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20) \
+ tmpCoord.s0 += -0x1p-9; \
+ if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20) \
+ tmpCoord.s1 += -0x1p-9; \
+ }
+#define GET_IMAGE_ARRAY_SIZE(image, coord, coord_type, ai) \
+ coord_type ai = __gen_compute_array_index(coord.s2, image);
+
+// 2D Array
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, int4, i, 4)
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, uint4, ui, 4)
+DECL_IMAGE(0, image2d_array_t, float4, f, 4)
+
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, int4, i, 3)
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, uint4, ui, 3)
+DECL_IMAGE(0, image2d_array_t, float4, f, 3)
+
+// 3D Info
+DECL_IMAGE_INFO_COMMON(image3d_t)
+INLINE_OVERLOADABLE int get_image_height(image3d_t image)
+{
+ GET_IMAGE(image, surface_id);
+ return __gen_ocl_get_image_height(surface_id);
+}
+INLINE_OVERLOADABLE int get_image_depth(image3d_t image)
+{
+ GET_IMAGE(image, surface_id);
+ return __gen_ocl_get_image_depth(surface_id);
+}
+INLINE_OVERLOADABLE int4 get_image_dim(image3d_t image)
+{
+ return (int4){get_image_width(image), get_image_height(image), get_image_depth(image), 0};
+}
+
+// 2D Array Info
+DECL_IMAGE_INFO_COMMON(image2d_array_t)
+INLINE_OVERLOADABLE int get_image_height(image2d_array_t image)
+{
+ GET_IMAGE(image, surface_id);
+ return __gen_ocl_get_image_height(surface_id);
+}
+INLINE_OVERLOADABLE int2 get_image_dim(image2d_array_t image)
+{
+ return (int2){get_image_width(image), get_image_height(image)};
+}
+INLINE_OVERLOADABLE size_t get_image_array_size(image2d_array_t image)
+{
+ GET_IMAGE(image, surface_id);
+ return __gen_ocl_get_image_depth(surface_id);
+}
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef GET_IMAGE_ARRAY_SIZE
+// End of 3D and 2D Array
+
+#undef DECL_IMAGE
+#undef DECL_READ_IMAGE
+#undef DECL_READ_IMAGE_NOSAMPLER
+#undef DECL_WRITE_IMAGE
+#undef GEN_FIX_1
+// End of Image
+
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_acosh (float x)
+{
+ return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_asinh (float x)
+{
+ return native_log(x + native_sqrt(x * x + 1));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_atanh (float x)
+{
+ return 0.5f * native_log((1 + x) / (1 - x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cbrt (float x)
+{
+ return __gen_ocl_pow(x, 0.3333333333f);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cos (float x)
+{
+ return native_cos(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x)
+{
+ return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x)
+{
+ return __gen_ocl_cos(x * M_PI_F);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x)
+{
+ return native_exp(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_exp10 (float x)
+{
+ return native_exp10(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_expm1 (float x)
+{
+ return __gen_ocl_pow(M_E_F, x) - 1;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_fmod (float x, float y)
+{
+ return x-y*__gen_ocl_rndz(x/y);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_hypot (float x, float y)
+{
+ return __gen_ocl_sqrt(x*x + y*y);
+}
+
+INLINE_OVERLOADABLE int __gen_ocl_internal_fastpath_ilogb (float x)
+{
+ return __gen_ocl_rndd(native_log2(x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_ldexp (float x, int n)
+{
+ return __gen_ocl_pow(2, n) * x;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log (float x)
+{
+ return native_log(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log2 (float x)
+{
+ return native_log2(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log10 (float x)
+{
+ return native_log10(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log1p (float x)
+{
+ return native_log(x + 1);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_logb (float x)
+{
+ return __gen_ocl_rndd(native_log2(x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_remainder (float x, float y)
+{
+ return x-y*__gen_ocl_rnde(x/y);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_rootn(float x, int n)
+{
+ return internal_rootn(x, n, 1);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sin (float x)
+{
+ return native_sin(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __global float *cosval)
+{
+ *cosval = native_cos(x);
+ return native_sin(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __local float *cosval)
+{
+ *cosval = native_cos(x);
+ return native_sin(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __private float *cosval)
+{
+ *cosval = native_cos(x);
+ return native_sin(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sinh (float x)
+{
+ return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sinpi (float x)
+{
+ return __gen_ocl_sin(x * M_PI_F);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_tan (float x)
+{
+ return native_tan(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_tanh (float x)
+{
+ float y = native_exp(-2 * x);
+ return (1 - y) / (1 + y);
+}
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : disable
+
+#undef DECL_IMAGE
+#undef DECL_READ_IMAGE
+#undef DECL_READ_IMAGE_NOSAMPLER
+#undef DECL_WRITE_IMAGE
+
+#undef GET_IMAGE
+// ##BEGIN_VECTOR##
+// ##END_VECTOR##
+
+#undef INLINE_OVERLOADABLE
+#undef PURE
+#undef CONST
+#undef OVERLOADABLE
+#undef INLINE
+
+#endif /* __GEN_OCL_STDLIB_H__ */
diff --git a/backend/src/sys/alloc.cpp b/backend/src/sys/alloc.cpp
new file mode 100644
index 0000000..2db95c9
--- /dev/null
+++ b/backend/src/sys/alloc.cpp
@@ -0,0 +1,359 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file alloc.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * Provides facilities to track allocations and pre-initialize memory at
+ * memory allocation and memory free time
+ */
+#include "sys/alloc.hpp"
+#include "sys/atomic.hpp"
+#include "sys/mutex.hpp"
+
+#if GBE_DEBUG_MEMORY
+#include <tr1/unordered_map>
+#include <cstring>
+#endif /* GBE_DEBUG_MEMORY */
+
+#if defined(__ICC__)
+#include <stdint.h>
+#endif /* __ICC__ */
+#include <map>
+#include <vector>
+#include <iomanip>
+
+////////////////////////////////////////////////////////////////////////////////
+/// Memory debugger
+////////////////////////////////////////////////////////////////////////////////
+
+#if GBE_DEBUG_MEMORY
+namespace gbe
+{
+ /*! Store each allocation data */
+ struct AllocData {
+ INLINE AllocData(void) {}
+ INLINE AllocData(int fileName_, int functionName_, int line_, intptr_t alloc_) :
+ fileName(fileName_), functionName(functionName_), line(line_), alloc(alloc_) {}
+ int fileName, functionName, line;
+ intptr_t alloc;
+ };
+
+ /*! Store allocation information */
+ struct MemDebugger {
+ MemDebugger(void) : unfreedNum(0), allocNum(0) {}
+ ~MemDebugger(void) { this->dumpAlloc(); }
+ void* insertAlloc(void *ptr, const char *file, const char *function, int line);
+ void removeAlloc(void *ptr);
+ void dumpAlloc(void);
+ void dumpData(const AllocData &data);
+ /*! Count the still unfreed allocations */
+ volatile intptr_t unfreedNum;
+ /*! Total number of allocations done */
+ volatile intptr_t allocNum;
+ /*! Sorts the file name and function name strings */
+ std::tr1::unordered_map<const char*, int> staticStringMap;
+ /*! Each element contains the actual string */
+ std::vector<const char*> staticStringVector;
+ std::map<uintptr_t, AllocData> allocMap;
+ /*! Protect the memory debugger accesses */
+ MutexSys mutex;
+ };
+
+ void* MemDebugger::insertAlloc(void *ptr, const char *file, const char *function, int line)
+ {
+ if (ptr == NULL) return ptr;
+ Lock<MutexSys> lock(mutex);
+ const uintptr_t iptr = (uintptr_t) ptr;
+ if (UNLIKELY(allocMap.find(iptr) != allocMap.end())) {
+ this->dumpData(allocMap.find(iptr)->second);
+ FATAL("Pointer already in map");
+ }
+ const auto fileIt = staticStringMap.find(file);
+ const auto functionIt = staticStringMap.find(function);
+ int fileName, functionName;
+ if (fileIt == staticStringMap.end()) {
+ staticStringVector.push_back(file);
+ staticStringMap[file] = fileName = int(staticStringVector.size()) - 1;
+ } else
+ fileName = staticStringMap[file];
+ if (functionIt == staticStringMap.end()) {
+ staticStringVector.push_back(function);
+ staticStringMap[function] = functionName = int(staticStringVector.size()) - 1;
+ } else
+ functionName = staticStringMap[function];
+ allocMap[iptr] = AllocData(fileName, functionName, line, allocNum);
+ unfreedNum++;
+ allocNum++;
+ return ptr;
+ }
+
+ void MemDebugger::removeAlloc(void *ptr)
+ {
+ if (ptr == NULL) return;
+ Lock<MutexSys> lock(mutex);
+ const uintptr_t iptr = (uintptr_t) ptr;
+ FATAL_IF(allocMap.find(iptr) == allocMap.end(), "Pointer not referenced");
+ allocMap.erase(iptr);
+ unfreedNum--;
+ }
+
+ void MemDebugger::dumpData(const AllocData &data) {
+ std::cerr << "ALLOC " << data.alloc << ": " <<
+ "file " << staticStringVector[data.fileName] << ", " <<
+ "function " << staticStringVector[data.functionName] << ", " <<
+ "line " << data.line << std::endl;
+ }
+
+ void MemDebugger::dumpAlloc(void) {
+ std::cerr << "MemDebugger: Unfreed number: " << unfreedNum << std::endl;
+ for (const auto &alloc : allocMap) this->dumpData(alloc.second);
+ std::cerr << "MemDebugger: " << staticStringVector.size()
+ << " allocated static strings" << std::endl;
+ }
+
+ /*! The user can deactivate the memory initialization */
+ static bool memoryInitializationEnabled = true;
+
+ /*! Declare C like interface functions here */
+ static MemDebugger *memDebugger = NULL;
+
+ /*! Monitor maximum memory requirement in the compiler */
+ static MutexSys *sizeMutex = NULL;
+ static bool isMutexInitializing = true;
+ static size_t memDebuggerCurrSize(0u);
+ static size_t memDebuggerMaxSize(0u);
+ static void SizeMutexDeallocate(void) { if (sizeMutex) delete sizeMutex; }
+ static void SizeMutexAllocate(void) {
+ if (sizeMutex == NULL && isMutexInitializing == false) {
+ isMutexInitializing = true;
+ sizeMutex = new MutexSys;
+ atexit(SizeMutexDeallocate);
+ }
+ }
+
+ /*! Stop the memory debugger */
+ static void MemDebuggerEnd(void) {
+ MemDebugger *_debug = memDebugger;
+ memDebugger = NULL;
+ std::cout << "Maximum memory consumption: "
+ << std::setprecision(2) << std::fixed
+ << float(memDebuggerMaxSize) / 1024. << "KB" << std::endl;
+ delete _debug;
+ GBE_ASSERT(memDebuggerCurrSize == 0);
+ }
+
+ /*! Bring up the debugger at pre-main */
+ static struct ForceMemDebugger {
+ ForceMemDebugger(void) {
+ doesnotmatter = GBE_NEW(int);
+ GBE_DELETE(doesnotmatter);
+ }
+ int *doesnotmatter;
+ } forceMemDebugger;
+
+ /*! Start the memory debugger */
+ static void MemDebuggerStart(void) {
+ if (memDebugger == NULL) {
+ atexit(MemDebuggerEnd);
+ memDebugger = new MemDebugger;
+ }
+ }
+
+ void* MemDebuggerInsertAlloc(void *ptr, const char *file, const char *function, int line) {
+ if (memDebugger == NULL) MemDebuggerStart();
+ return memDebugger->insertAlloc(ptr, file, function, line);
+ }
+ void MemDebuggerRemoveAlloc(void *ptr) {
+ if (memDebugger == NULL) MemDebuggerStart();
+ memDebugger->removeAlloc(ptr);
+ }
+ void MemDebuggerDumpAlloc(void) {
+ if (memDebugger == NULL) MemDebuggerStart();
+ memDebugger->dumpAlloc();
+ }
+ void MemDebuggerEnableMemoryInitialization(bool enabled) {
+ memoryInitializationEnabled = enabled;
+ }
+ void MemDebuggerInitializeMem(void *mem, size_t sz) {
+ if (memoryInitializationEnabled) std::memset(mem, 0xcd, sz);
+ }
+} /* namespace gbe */
+
+#endif /* GBE_DEBUG_MEMORY */
+
+namespace gbe
+{
+#if GBE_DEBUG_MEMORY
+ void* memAlloc(size_t size) {
+ void *ptr = std::malloc(size + sizeof(size_t));
+ *(size_t *) ptr = size;
+ MemDebuggerInitializeMem((char*) ptr + sizeof(size_t), size);
+ SizeMutexAllocate();
+ if (sizeMutex) sizeMutex->lock();
+ memDebuggerCurrSize += size;
+ memDebuggerMaxSize = std::max(memDebuggerCurrSize, memDebuggerMaxSize);
+ if (sizeMutex) sizeMutex->unlock();
+ return (char *) ptr + sizeof(size_t);
+ }
+ void memFree(void *ptr) {
+ if (ptr != NULL) {
+ char *toFree = (char*) ptr - sizeof(size_t);
+ const size_t size = *(size_t *) toFree;
+ MemDebuggerInitializeMem(ptr, size);
+ SizeMutexAllocate();
+ if (sizeMutex) sizeMutex->lock();
+ memDebuggerCurrSize -= size;
+ if (sizeMutex) sizeMutex->unlock();
+ std::free(toFree);
+ }
+ }
+#else
+ void* memAlloc(size_t size) { return std::malloc(size); }
+ void memFree(void *ptr) { if (ptr != NULL) std::free(ptr); }
+#endif /* GBE_DEBUG_MEMORY */
+
+} /* namespace gbe */
+
+#if GBE_DEBUG_MEMORY
+
+namespace gbe
+{
+ void* alignedMalloc(size_t size, size_t align) {
+ void* mem = malloc(size+align+sizeof(uintptr_t) + sizeof(void*));
+ FATAL_IF (!mem && size, "memory allocation failed");
+ char* aligned = (char*) mem + sizeof(uintptr_t) + sizeof(void*);
+ aligned += align - ((uintptr_t)aligned & (align - 1));
+ ((void**)aligned)[-1] = mem;
+ ((uintptr_t*)aligned)[-2] = uintptr_t(size);
+ MemDebuggerInitializeMem(aligned, size);
+ SizeMutexAllocate();
+ if (sizeMutex) sizeMutex->lock();
+ memDebuggerCurrSize += size;
+ memDebuggerMaxSize = std::max(memDebuggerCurrSize, memDebuggerMaxSize);
+ if (sizeMutex) sizeMutex->unlock();
+ return aligned;
+ }
+
+ void alignedFree(void* ptr) {
+ if (ptr) {
+ const size_t size = ((uintptr_t*)ptr)[-2];
+ MemDebuggerInitializeMem(ptr, size);
+ free(((void**)ptr)[-1]);
+ SizeMutexAllocate();
+ if (sizeMutex) sizeMutex->lock();
+ memDebuggerCurrSize -= size;
+ if (sizeMutex) sizeMutex->unlock();
+ }
+ }
+} /* namespace gbe */
+
+#else /* GBE_DEBUG_MEMORY */
+
+////////////////////////////////////////////////////////////////////////////////
+/// Linux Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__LINUX__) || defined(__GLIBC__)
+
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <malloc.h>
+#include <iostream>
+
+namespace gbe
+{
+ void* alignedMalloc(size_t size, size_t align) {
+ void* ptr = memalign(align,size);
+ FATAL_IF (!ptr && size, "memory allocation failed");
+ MemDebuggerInitializeMem(ptr, size);
+ return ptr;
+ }
+
+ void alignedFree(void *ptr) { if (ptr) std::free(ptr); }
+} /* namespace gbe */
+
+#else
+#error "Unsupported platform"
+#endif /* __LINUX__ */
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Linear allocator
+////////////////////////////////////////////////////////////////////////////////
+
+namespace gbe
+{
+ LinearAllocator::Segment::Segment(size_t size) :
+ size(size), offset(0u), data(alignedMalloc(size, CACHE_LINE)), next(NULL){}
+
+ LinearAllocator::Segment::~Segment(void) {
+ alignedFree(data);
+ if (this->next) GBE_DELETE(this->next);
+ }
+
+ LinearAllocator::LinearAllocator(size_t minSize, size_t maxSize) :
+ maxSize(std::max(maxSize, size_t(CACHE_LINE)))
+ {
+ this->curr = GBE_NEW(LinearAllocator::Segment, std::max(minSize, size_t(1)));
+ }
+
+ LinearAllocator::~LinearAllocator(void) {
+ if (this->curr) GBE_DELETE(this->curr);
+ }
+
+ void *LinearAllocator::allocate(size_t size)
+ {
+#if GBE_DEBUG_SPECIAL_ALLOCATOR
+ if (ptr) GBE_ALIGNED_MALLOC(size, sizeof(void*));
+#else
+ // Try to use the current segment. This is the most likely condition here
+ this->curr->offset = ALIGN(this->curr->offset, sizeof(void*));
+ if (this->curr->offset + size <= this->curr->size) {
+ char *ptr = (char*) curr->data + this->curr->offset;
+ this->curr->offset += size;
+ return (void*) ptr;
+ }
+
+ // Well not really a use case in this code base
+ if (UNLIKELY(size > maxSize)) {
+ // This is really bad since we do two allocations
+ Segment *unfortunate = GBE_NEW(Segment, size);
+ GBE_ASSERT(this->curr);
+ Segment *next = this->curr->next;
+ this->curr->next = unfortunate;
+ unfortunate->next = next;
+ return unfortunate->data;
+ }
+
+ // OK. We need a new segment
+ const size_t segmentSize = std::max(size, 2*this->curr->size);
+ Segment *next = GBE_NEW(Segment, segmentSize);
+ next->next = curr;
+ this->curr = next;
+ char *ptr = (char*) curr->data;
+ this->curr->offset += size;
+ return ptr;
+#endif
+ }
+
+} /* namespace gbe */
+
diff --git a/backend/src/sys/alloc.hpp b/backend/src/sys/alloc.hpp
new file mode 100644
index 0000000..8fcb3a7
--- /dev/null
+++ b/backend/src/sys/alloc.hpp
@@ -0,0 +1,342 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file alloc.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_ALLOC_HPP__
+#define __GBE_ALLOC_HPP__
+
+#include "sys/platform.hpp"
+#include "sys/assert.hpp"
+#include <algorithm>
+#include <limits>
+
+namespace gbe
+{
+ /*! regular allocation */
+ void* memAlloc(size_t size);
+ void memFree(void *ptr);
+
+ /*! Aligned allocation */
+ void* alignedMalloc(size_t size, size_t align = 64);
+ void alignedFree(void* ptr);
+
+ /*! Monitor memory allocations */
+#if GBE_DEBUG_MEMORY
+ void* MemDebuggerInsertAlloc(void*, const char*, const char*, int);
+ void MemDebuggerRemoveAlloc(void *ptr);
+ void MemDebuggerDumpAlloc(void);
+ void MemDebuggerInitializeMem(void *mem, size_t sz);
+ void MemDebuggerEnableMemoryInitialization(bool enabled);
+#else
+ INLINE void* MemDebuggerInsertAlloc(void *ptr, const char*, const char*, int) {return ptr;}
+ INLINE void MemDebuggerRemoveAlloc(void *ptr) {}
+ INLINE void MemDebuggerDumpAlloc(void) {}
+ INLINE void MemDebuggerInitializeMem(void *mem, size_t sz) {}
+ INLINE void MemDebuggerEnableMemoryInitialization(bool enabled) {}
+#endif /* GBE_DEBUG_MEMORY */
+
+ /*! Properly handle the allocated type */
+ template <typename T>
+ T* _MemDebuggerInsertAlloc(T *ptr, const char *file, const char *function, int line) {
+ MemDebuggerInsertAlloc(ptr, file, function, line);
+ return ptr;
+ }
+} /* namespace gbe */
+
+/*! Declare a class with custom allocators */
+#define GBE_CLASS(TYPE) \
+ GBE_STRUCT(TYPE) \
+private:
+
+/*! Declare a structure with custom allocators */
+#define GBE_STRUCT(TYPE) \
+public: \
+ void* operator new(size_t size) { \
+ return gbe::alignedMalloc(size, GBE_DEFAULT_ALIGNMENT); \
+ } \
+ void* operator new[](size_t size) { \
+ return gbe::alignedMalloc(size, GBE_DEFAULT_ALIGNMENT); \
+ } \
+ void* operator new(size_t size, void *p) { return p; } \
+ void* operator new[](size_t size, void *p) { return p; } \
+ void operator delete(void* ptr) { return gbe::alignedFree(ptr); } \
+ void operator delete[](void* ptr) { return gbe::alignedFree(ptr); }
+
+/*! Macros to handle allocation position */
+#define GBE_NEW(T,...) \
+ gbe::_MemDebuggerInsertAlloc(new T(__VA_ARGS__), __FILE__, __FUNCTION__, __LINE__)
+
+#define GBE_NEW_NO_ARG(T) \
+ gbe::_MemDebuggerInsertAlloc(new T, __FILE__, __FUNCTION__, __LINE__)
+
+#define GBE_NEW_ARRAY(T,N,...) \
+ gbe::_MemDebuggerInsertAlloc(new T[N](__VA_ARGS__), __FILE__, __FUNCTION__, __LINE__)
+
+#define GBE_NEW_ARRAY_NO_ARG(T,N)\
+ gbe::_MemDebuggerInsertAlloc(new T[N], __FILE__, __FUNCTION__, __LINE__)
+
+#define GBE_NEW_P(T,X,...) \
+ gbe::_MemDebuggerInsertAlloc(new (X) T(__VA_ARGS__), __FILE__, __FUNCTION__, __LINE__)
+
+#define GBE_DELETE(X) \
+ do { gbe::MemDebuggerRemoveAlloc(X); delete X; } while (0)
+
+#define GBE_DELETE_ARRAY(X) \
+ do { gbe::MemDebuggerRemoveAlloc(X); delete[] X; } while (0)
+
+#define GBE_MALLOC(SZ) \
+ gbe::MemDebuggerInsertAlloc(gbe::memAlloc(SZ),__FILE__, __FUNCTION__, __LINE__)
+
+#define GBE_FREE(X) \
+ do { gbe::MemDebuggerRemoveAlloc(X); gbe::memFree(X); } while (0)
+
+#define GBE_ALIGNED_FREE(X) \
+ do { gbe::MemDebuggerRemoveAlloc(X); gbe::alignedFree(X); } while (0)
+
+#define GBE_ALIGNED_MALLOC(SZ,ALIGN) \
+ gbe::MemDebuggerInsertAlloc(gbe::alignedMalloc(SZ,ALIGN),__FILE__, __FUNCTION__, __LINE__)
+
+namespace gbe
+{
+ /*! STL compliant allocator to intercept all memory allocations */
+ template<typename T>
+ class Allocator {
+ public:
+ typedef T value_type;
+ typedef value_type* pointer;
+ typedef const value_type* const_pointer;
+ typedef value_type& reference;
+ typedef const value_type& const_reference;
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+ typedef typename std::allocator<void>::const_pointer void_allocator_ptr;
+ template<typename U>
+ struct rebind { typedef Allocator<U> other; };
+
+ INLINE Allocator(void) {}
+ INLINE ~Allocator(void) {}
+ INLINE Allocator(Allocator const&) {}
+ template<typename U>
+ INLINE Allocator(Allocator<U> const&) {}
+ INLINE pointer address(reference r) { return &r; }
+ INLINE const_pointer address(const_reference r) { return &r; }
+ INLINE pointer allocate(size_type n, void_allocator_ptr = 0) {
+ if (ALIGNOF(T) > sizeof(uintptr_t))
+ return (pointer) GBE_ALIGNED_MALLOC(n*sizeof(T), ALIGNOF(T));
+ else
+ return (pointer) GBE_MALLOC(n * sizeof(T));
+ }
+ INLINE void deallocate(pointer p, size_type) {
+ if (ALIGNOF(T) > sizeof(uintptr_t))
+ GBE_ALIGNED_FREE(p);
+ else
+ GBE_FREE(p);
+ }
+ INLINE size_type max_size(void) const {
+ return std::numeric_limits<size_type>::max() / sizeof(T);
+ }
+ INLINE void construct(pointer p, const T& t = T()) { ::new(p) T(t); }
+ INLINE void destroy(pointer p) { p->~T(); }
+ INLINE bool operator==(Allocator const&) { return true; }
+ INLINE bool operator!=(Allocator const& a) { return !operator==(a); }
+ };
+
+// Deactivate fast allocators
+#ifndef GBE_DEBUG_SPECIAL_ALLOCATOR
+#define GBE_DEBUG_SPECIAL_ALLOCATOR 0
+#endif
+
+ /*! A growing pool never gives memory to the system but chain free elements
+ * together such as deallocation can be quickly done
+ */
+ template <typename T>
+ class GrowingPool
+ {
+ public:
+ GrowingPool(uint32_t elemNum = 1) :
+ curr(GBE_NEW(GrowingPoolElem, elemNum <= 1 ? 1 : elemNum)),
+ free(NULL), full(NULL), freeList(NULL) {}
+ ~GrowingPool(void) {
+ GBE_SAFE_DELETE(curr);
+ GBE_SAFE_DELETE(free);
+ GBE_SAFE_DELETE(full);
+ }
+ void *allocate(void) {
+#if GBE_DEBUG_SPECIAL_ALLOCATOR
+ return GBE_ALIGNED_MALLOC(sizeof(T), ALIGNOF(T));
+#else
+ // Pick up an element from the free list
+ if (this->freeList != NULL) {
+ void *data = (void*) freeList;
+ this->freeList = *(void**) freeList;
+ return data;
+ }
+
+ // Pick up an element from the current block (if not full)
+ if (this->curr->allocated < this->curr->maxElemNum) {
+ void *data = (T*) curr->data + curr->allocated++;
+ return data;
+ }
+
+ // Block is full
+ this->curr->next = this->full;
+ this->full = this->curr;
+
+ // Try to pick up a free block
+ if (this->free) this->getFreeBlock();
+
+ // No free block we must allocate a new one
+ else
+ this->curr = GBE_NEW(GrowingPoolElem, 2 * this->curr->maxElemNum);
+
+ void *data = (T*) curr->data + curr->allocated++;
+ return data;
+#endif /* GBE_DEBUG_SPECIAL_ALLOCATOR */
+ }
+ void deallocate(void *t) {
+ if (t == NULL) return;
+#if GBE_DEBUG_SPECIAL_ALLOCATOR
+ GBE_ALIGNED_FREE(t);
+#else
+ *(void**) t = this->freeList;
+ this->freeList = t;
+#endif /* GBE_DEBUG_SPECIAL_ALLOCATOR */
+ }
+ void rewind(void) {
+#if GBE_DEBUG_SPECIAL_ALLOCATOR == 0
+ // All free elements return to their blocks
+ this->freeList = NULL;
+
+ // Put back current block in full list
+ if (this->curr) {
+ this->curr->next = this->full;
+ this->full = this->curr;
+ this->curr = NULL;
+ }
+
+ // Reverse the chain list and mark all blocks as empty
+ while (this->full) {
+ GrowingPoolElem *next = this->full->next;
+ this->full->allocated = 0;
+ this->full->next = this->free;
+ this->free = this->full;
+ this->full = next;
+ }
+
+ // Provide a valid current block
+ this->getFreeBlock();
+#endif /* GBE_DEBUG_SPECIAL_ALLOCATOR */
+ }
+ private:
+ /*! Pick-up a free block */
+ INLINE void getFreeBlock(void) {
+ GBE_ASSERT(this->free);
+ this->curr = this->free;
+ this->free = this->free->next;
+ this->curr->next = NULL;
+ }
+ /*! Chunk of elements to allocate */
+ class GrowingPoolElem
+ {
+ friend class GrowingPool;
+ GrowingPoolElem(size_t elemNum) {
+ const size_t sz = std::max(sizeof(T), sizeof(void*));
+ this->data = (T*) GBE_ALIGNED_MALLOC(elemNum * sz, ALIGNOF(T));
+ this->next = NULL;
+ this->maxElemNum = elemNum;
+ this->allocated = 0;
+ }
+ ~GrowingPoolElem(void) {
+ GBE_ALIGNED_FREE(this->data);
+ if (this->next) GBE_DELETE(this->next);
+ }
+ T *data;
+ GrowingPoolElem *next;
+ size_t allocated, maxElemNum;
+ };
+ GrowingPoolElem *curr; //!< To get new element from
+ GrowingPoolElem *free; //!< Blocks that can be reused (after rewind)
+ GrowingPoolElem *full; //!< Blocks fully used
+ void *freeList; //!< Elements that have been deallocated
+ GBE_CLASS(GrowingPool);
+ };
+
+/*! Helper macros to build and destroy objects with a growing pool */
+#define DECL_POOL(TYPE, POOL) \
+ GrowingPool<TYPE> POOL; \
+ template <typename... Args> \
+ TYPE *new##TYPE(Args&&... args) { \
+ return new (POOL.allocate()) TYPE(args...); \
+ } \
+ void delete##TYPE(TYPE *ptr) { \
+ ptr->~TYPE(); \
+ POOL.deallocate(ptr); \
+ }
+
+ /*! A linear allocator just grows and does not reuse freed memory. It can
+ * however allocate objects of any size
+ */
+ class LinearAllocator
+ {
+ public:
+ /*! Initiate the linear allocator (one segment is allocated) */
+ LinearAllocator(size_t minSize = CACHE_LINE, size_t maxSize = 64*KB);
+ /*! Free up everything */
+ ~LinearAllocator(void);
+ /*! Allocate size bytes */
+ void *allocate(size_t size);
+ /*! Nothing here */
+ INLINE void deallocate(void *ptr) {
+#if GBE_DEBUG_SPECIAL_ALLOCATOR
+ if (ptr) GBE_ALIGNED_FREE(ptr);
+#endif /* GBE_DEBUG_SPECIAL_ALLOCATOR */
+ }
+ private:
+ /*! Helds an allocated segment of memory */
+ struct Segment {
+ /*! Allocate a new segment */
+ Segment(size_t size);
+ /*! Destroy the segment and the next ones */
+ ~Segment(void);
+ /* Size of the segment */
+ size_t size;
+ /*! Offset to the next free bytes (if any left) */
+ size_t offset;
+ /*! Pointer to valid data */
+ void *data;
+ /*! Pointer to the next segment */
+ Segment *next;
+ /*! Use internal allocator */
+ GBE_STRUCT(Segment);
+ };
+ /*! Points to the current segment we can allocate from */
+ Segment *curr;
+ /*! Maximum segment size */
+ size_t maxSize;
+ /*! Use internal allocator */
+ GBE_CLASS(LinearAllocator);
+ };
+
+} /* namespace gbe */
+
+#endif /* __GBE_ALLOC_HPP__ */
+
diff --git a/backend/src/sys/assert.cpp b/backend/src/sys/assert.cpp
new file mode 100644
index 0000000..52178a1
--- /dev/null
+++ b/backend/src/sys/assert.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file assert.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#if GBE_COMPILE_UTESTS
+
+#include "sys/assert.hpp"
+#include "sys/exception.hpp"
+#include "sys/cvar.hpp"
+#include <cassert>
+#include <cstdlib>
+
+namespace gbe
+{
+ BVAR(OCL_BREAK_POINT_IN_ASSERTION, false);
+ BVAR(OCL_ABORT_IN_ASSERTION, false);
+
+ void onFailedAssertion(const char *msg, const char *file, const char *fn, int line)
+ {
+ char lineString[256];
+ sprintf(lineString, "%i", line);
+ assert(msg != NULL && file != NULL && fn != NULL);
+ const std::string str = "Compiler error: "
+ + std::string(msg) + "\n at file "
+ + std::string(file)
+ + ", function " + std::string(fn)
+ + ", line " + std::string(lineString);
+ if (OCL_BREAK_POINT_IN_ASSERTION)
+ DEBUGBREAK();
+ if (OCL_ABORT_IN_ASSERTION) {
+ assert(false);
+ exit(-1);
+ }
+ throw Exception(str);
+ }
+} /* namespace gbe */
+
+#else
+
+#include "sys/assert.hpp"
+#include "sys/exception.hpp"
+#include "sys/platform.hpp"
+#include <cstdio>
+#include <cstdlib>
+#include <unistd.h>
+
+namespace gbe
+{
+ void onFailedAssertion(const char *msg, const char *file, const char *fn, int32_t line)
+ {
+ assert(msg != NULL && file != NULL && fn != NULL);
+ fprintf(stderr, "ASSERTION FAILED: %s\n"
+ " at file %s, function %s, line %i\n",
+ msg, file, fn, line);
+ fflush(stdout);
+ DEBUGBREAK();
+ _exit(-1);
+ }
+} /* namespace gbe */
+
+#endif /* GBE_COMPILE_UTESTS */
+
diff --git a/backend/src/sys/assert.hpp b/backend/src/sys/assert.hpp
new file mode 100644
index 0000000..553e391
--- /dev/null
+++ b/backend/src/sys/assert.hpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file assert.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_ASSERT_HPP__
+#define __GBE_ASSERT_HPP__
+
+namespace gbe
+{
+ /*! To ensure that condition truth. Optional message is supported */
+ void onFailedAssertion(const char *msg, const char *file, const char *fn, int line);
+} /* namespace gbe */
+
+#endif /* __GBE_ASSERT_HPP__ */
+
diff --git a/backend/src/sys/atomic.hpp b/backend/src/sys/atomic.hpp
new file mode 100644
index 0000000..3684ae9
--- /dev/null
+++ b/backend/src/sys/atomic.hpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __GBE_ATOMIC_HPP__
+#define __GBE_ATOMIC_HPP__
+
+#include "sys/intrinsics.hpp"
+
+namespace gbe
+{
+ template <typename T>
+ struct AtomicInternal {
+ protected:
+ AtomicInternal(const AtomicInternal&); // don't implement
+ AtomicInternal& operator= (const AtomicInternal&); // don't implement
+
+ public:
+ INLINE AtomicInternal(void) {}
+ INLINE AtomicInternal(T data) : data(data) {}
+ INLINE AtomicInternal& operator =(const T input) { data = input; return *this; }
+ INLINE operator T() const { return data; }
+ INLINE void storeRelease(T x) { __store_release(&data, x); }
+ public:
+ INLINE friend T operator+= (AtomicInternal& value, T input) { return atomic_add(&value.data, input) + input; }
+ INLINE friend T operator++ (AtomicInternal& value) { return atomic_add(&value.data, 1) + 1; }
+ INLINE friend T operator-- (AtomicInternal& value) { return atomic_add(&value.data, -1) - 1; }
+ INLINE friend T operator++ (AtomicInternal& value, int) { return atomic_add(&value.data, 1); }
+ INLINE friend T operator-- (AtomicInternal& value, int) { return atomic_add(&value.data, -1); }
+ INLINE friend T cmpxchg (AtomicInternal& value, const T v, const T c) { return atomic_cmpxchg(&value.data,v,c); }
+
+ private:
+ volatile T data;
+ GBE_STRUCT(AtomicInternal);
+ };
+
+ typedef AtomicInternal<atomic32_t> Atomic32;
+ typedef AtomicInternal<atomic_t> Atomic;
+}
+
+#endif /* __GBE_ATOMIC_HPP__ */
+
diff --git a/backend/src/sys/cvar.cpp b/backend/src/sys/cvar.cpp
new file mode 100644
index 0000000..1ee2c98
--- /dev/null
+++ b/backend/src/sys/cvar.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file cvar.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "sys/cvar.hpp"
+#include <cstdio>
+
+namespace gbe
+{
+
+ CVarInit::CVarInit(const char *name, int32_t *addr, int32_t imin, int32_t i, int32_t imax) :
+ varType(CVarInit::INTEGER)
+ {
+ this->i.min = imin;
+ this->i.max = imax;
+ const char *env = getenv(name);
+ if (env != NULL) {
+ sscanf(env, "%i", &i);
+ i = std::min(imax, std::max(imin, i));
+ }
+ *addr = i;
+ }
+
+ CVarInit::CVarInit(const char *name, float *addr, float fmin, float f, float fmax) :
+ varType(CVarInit::FLOAT)
+ {
+ this->f.min = fmin;
+ this->f.max = fmax;
+ const char *env = getenv(name);
+ if (env != NULL) {
+ sscanf(env, "%f", &f);
+ f = std::min(fmax, std::max(fmin, f));
+ }
+ *addr = f;
+ }
+
+ CVarInit::CVarInit(const char *name, std::string *str, const std::string &v) :
+ varType(CVarInit::STRING)
+ {
+ const char *env = getenv(name);
+ *str = env != NULL ? env : v;
+ }
+
+} /* namespace gbe */
+
diff --git a/backend/src/sys/cvar.hpp b/backend/src/sys/cvar.hpp
new file mode 100644
index 0000000..7350a3e
--- /dev/null
+++ b/backend/src/sys/cvar.hpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file cvar.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * Quake like console variable system. Just use the environment variables from
+ * the console to change their value
+ */
+
+#ifndef __GBE_CVAR_HPP__
+#define __GBE_CVAR_HPP__
+
+#include "sys/platform.hpp"
+
+namespace gbe
+{
+ /*! A CVar is either a float, an integer or a string value. CVarInit is only
+ * here to set the global variable in pre-main
+ */
+ class CVarInit
+ {
+ public:
+ enum {
+ STRING = 0,
+ INTEGER = 1,
+ FLOAT = 2
+ };
+ /*! Build a CVar from an integer environment variable */
+ explicit CVarInit(const char *name, int32_t *addr, int32_t imin, int32_t i, int32_t imax);
+ /*! Build a CVar from a float environment variable */
+ explicit CVarInit(const char *name, float *addr, float fmin, float f, float fmax);
+ /*! Build a CVar from a string environment variable */
+ explicit CVarInit(const char *name, std::string *str, const std::string &v);
+ int varType; //!< STRING, INTEGER or FLOAT
+ std::string *str; //!< string variable
+ union {
+ struct { int32_t min, *curr, max; } i; //!< integer variables with bounds
+ struct { float min, *curr, max; } f; //!< float variables with bounds
+ };
+ };
+} /* namespace gbe */
+
+/*! Declare an integer console variable */
+#define IVAR(NAME, MIN, CURR, MAX) \
+ int32_t NAME; \
+ static gbe::CVarInit __CVAR##NAME##__LINE__##__(#NAME, &NAME, int32_t(MIN), int32_t(CURR), int32_t(MAX));
+
+/*! Declare a float console variable */
+#define FVAR(NAME, MIN, CURR, MAX) \
+ float NAME; \
+ static gbe::CVarInit __CVAR##NAME##__LINE__##__(#NAME, &NAME, float(MIN), float(CURR), float(MAX));
+
+/*! Declare a string console variable */
+#define SVAR(NAME, STR) \
+ std::string NAME; \
+ static gbe::CVarInit __CVAR##NAME##__LINE__##__(#NAME, &NAME, STR);
+
+/*! Declare a Boolean variable (just an integer in {0,1}) */
+#define BVAR(NAME, CURR) IVAR(NAME, 0, CURR ? 1 : 0, 1)
+
+#endif /* __GBE_CVAR_HPP__ */
+
diff --git a/backend/src/sys/exception.hpp b/backend/src/sys/exception.hpp
new file mode 100644
index 0000000..d74ca0d
--- /dev/null
+++ b/backend/src/sys/exception.hpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file exception.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_EXCEPTION_HPP__
+#define __GBE_EXCEPTION_HPP__
+
+#if GBE_COMPILE_UTESTS
+
+#include <exception>
+#include <string>
+
+namespace gbe
+{
+ /*! Exception are only used while using unit tests */
+ class Exception : public std::exception
+ {
+ public:
+ Exception(const std::string &msg) throw() : msg(msg) {}
+ Exception(const Exception &other) throw() : msg(other.msg) {}
+ ~Exception(void) throw() {}
+ Exception &operator= (const Exception &other) throw() {
+ this->msg = other.msg;
+ return *this;
+ }
+ const char *what(void) const throw() { return msg.c_str(); }
+ private:
+ std::string msg; //!< String message
+ };
+
+} /* namespace gbe */
+
+#endif /* GBE_COMPILE_UTESTS */
+#endif /* __GBE_EXCEPTION_HPP__ */
+
diff --git a/backend/src/sys/fixed_array.hpp b/backend/src/sys/fixed_array.hpp
new file mode 100644
index 0000000..d84c350
--- /dev/null
+++ b/backend/src/sys/fixed_array.hpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file fixed_array.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_FIXED_ARRAY_HPP__
+#define __GBE_FIXED_ARRAY_HPP__
+
+#include "platform.hpp"
+#include <cstring>
+
+namespace gbe
+{
+ /*! Regular C array but with bound checks */
+ template<typename T, size_t N>
+ class fixed_array
+ {
+ public:
+ /*! Do not initialize the data */
+ fixed_array(void) {}
+ /*! Copy the input array */
+ fixed_array(const T array[N]) { std::memcpy(elem, array, N * sizeof(T)); }
+ /*! First element (non const) */
+ T* begin(void) { return &elem[0]; }
+ /*! First non-valid element (non const) */
+ T* end(void) { return begin() + N; }
+ /*! First element (const) */
+ const T* begin(void) const { return &elem[0]; }
+ /*! First non-valid element (const) */
+ const T* end(void) const { return begin() + N; }
+ /*! Number of elements in the array */
+ size_t size(void) const { return N; }
+ /*! Get the pointer to the data (non-const) */
+ T* data(void) { return &elem[0]; }
+ /*! Get the pointer to the data (const) */
+ const T* data(void) const { return &elem[0]; }
+ /*! First element (const) */
+ const T& front(void) const { return *begin(); }
+ /*! Last element (const) */
+ const T& back(void) const { return *(end() - 1); }
+ /*! First element (non-const) */
+ T& front(void) { return *begin(); }
+ /*! Last element (non-const) */
+ T& back(void) { return *(end() - 1); }
+ /*! Get element at position index (with bound check) */
+ INLINE T& operator[] (size_t index) {
+ GBE_ASSERT(index < size());
+ return elem[index];
+ }
+ /*! Get element at position index (with bound check) */
+ INLINE const T& operator[] (size_t index) const {
+ GBE_ASSERT(index < size());
+ return elem[index];
+ }
+ private:
+ T elem[N]; //!< Store the elements
+ STATIC_ASSERT(N > 0); //!< zero element is not allowed
+ GBE_CLASS(fixed_array);
+ };
+
+} /* namespace gbe */
+
+#endif /* __GBE_FIXED_ARRAY_HPP__ */
+
diff --git a/backend/src/sys/hash_map.hpp b/backend/src/sys/hash_map.hpp
new file mode 100644
index 0000000..fb1d1ef
--- /dev/null
+++ b/backend/src/sys/hash_map.hpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file hash_map.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_HASH_MAP_HPP__
+#define __GBE_HASH_MAP_HPP__
+
+#include "sys/platform.hpp"
+
+#ifdef __MSVC__
+#include <unordered_map>
+#else
+#include <tr1/unordered_map>
+#endif /* __MSVC__ */
+
+namespace gbe
+{
+ /*! Add specific allocator to the hash map */
+ template <class Key,
+ class T,
+ class Hash = std::hash<Key>,
+ class Pred = std::equal_to<Key>>
+ class hash_map : public std::tr1::unordered_map<Key,T,Hash,Pred,Allocator<std::pair<const Key,T>>>,
+ public NonCopyable
+ {
+ public:
+ // Typedefs
+ typedef std::pair<const Key, T> value_type;
+ typedef Allocator<value_type> allocator_type;
+ typedef std::tr1::unordered_map<Key,T,Hash,Pred,allocator_type> parent_type;
+ typedef typename allocator_type::size_type size_type;
+ typedef Key key_type;
+ typedef T mapped_type;
+ typedef Hash hasher;
+ typedef Pred key_equal;
+
+ /*! Default constructor */
+ INLINE explicit hash_map(size_type n = 3,
+ const hasher& hf = hasher(),
+ const key_equal& eql = key_equal(),
+ const allocator_type& a = allocator_type()) :
+ parent_type(n, hf, eql, a) {}
+ /*! Iteration constructor */
+ template <class InputIterator>
+ INLINE hash_map(InputIterator first,
+ InputIterator last,
+ size_type n = 3,
+ const hasher& hf = hasher(),
+ const key_equal& eql = key_equal(),
+ const allocator_type& a = allocator_type()) :
+ parent_type(first,last,n,hf,eql,a) {}
+#if 0
+ /*! Copy constructor */
+ INLINE hash_map(const hash_map &other) : parent_type(other) {}
+#endif
+ GBE_CLASS(hash_map);
+ };
+} /* namespace gbe */
+
+#endif /* __GBE_HASH_MAP_HPP__ */
+
diff --git a/backend/src/sys/intrinsics.hpp b/backend/src/sys/intrinsics.hpp
new file mode 100644
index 0000000..2e25dc7
--- /dev/null
+++ b/backend/src/sys/intrinsics.hpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __GBE_INTRINSICS_HPP__
+#define __GBE_INTRINSICS_HPP__
+
+#include "sys/platform.hpp"
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#if defined(__MSVC__)
+
+#include <intrin.h>
+
+#define GBE_COMPILER_WRITE_BARRIER _WriteBarrier()
+#define GBE_COMPILER_READ_WRITE_BARRIER _ReadWriteBarrier()
+
+#if _MSC_VER >= 1400
+#pragma intrinsic(_ReadBarrier)
+#define GBE_COMPILER_READ_BARRIER _ReadBarrier()
+#else
+#define GBE_COMPILER_READ_BARRIER _ReadWriteBarrier()
+#endif /* _MSC_VER >= 1400 */
+
+INLINE int __bsf(int v) {
+ unsigned long r = 0; _BitScanForward(&r,v); return r;
+}
+
+INLINE int __bsr(int v) {
+ unsigned long r = 0; _BitScanReverse(&r,v); return r;
+}
+
+INLINE int __btc(int v, int i) {
+ long r = v; _bittestandcomplement(&r,i); return r;
+}
+
+INLINE int __bts(int v, int i) {
+ long r = v; _bittestandset(&r,i); return r;
+}
+
+INLINE int __btr(int v, int i) {
+ long r = v; _bittestandreset(&r,i); return r;
+}
+
+INLINE void memoryFence(void) { _mm_mfence(); }
+
+#if defined(__X86_64__) && !defined(__INTEL_COMPILER)
+
+INLINE size_t __bsf(size_t v) {
+ unsigned long r = 0; _BitScanForward64(&r,v); return r;
+}
+
+INLINE size_t __bsr(size_t v) {
+ unsigned long r = 0; _BitScanReverse64(&r,v); return r;
+}
+
+INLINE size_t __btc(size_t v, size_t i) {
+ __int64_t r = v; _bittestandcomplement64(&r,i); return r;
+}
+
+INLINE size_t __bts(size_t v, size_t i) {
+ __int64_t r = v; _bittestandset64(&r,i); return r;
+}
+
+INLINE size_t __btr(size_t v, size_t i) {
+ __int64_t r = v; _bittestandreset64(&r,i); return r;
+}
+
+#endif /* defined(__X86_64__) && !defined(__INTEL_COMPILER) */
+
+typedef int32_t atomic32_t;
+
+INLINE int32_t atomic_add(volatile int32_t* m, const int32_t v) {
+ return _InterlockedExchangeAdd((volatile long*)m,v);
+}
+
+INLINE int32_t atomic_cmpxchg(volatile int32_t* m, const int32_t v, const int32_t c) {
+ return _InterlockedCompareExchange((volatile long*)m,v,c);
+}
+
+#if defined(__X86_64__)
+
+typedef int64_t atomic_t;
+
+INLINE int64_t atomic_add(volatile int64_t* m, const int64_t v) {
+ return _InterlockedExchangeAdd64(m,v);
+}
+
+INLINE int64_t atomic_cmpxchg(volatile int64_t* m, const int64_t v, const int64_t c) {
+ return _InterlockedCompareExchange64(m,v,c);
+}
+
+#else
+
+typedef int32_t atomic_t;
+
+#endif /* defined(__X86_64__) */
+
+#else
+
+INLINE unsigned int __popcnt(unsigned int in) {
+ int r = 0; asm ("popcnt %1,%0" : "=r"(r) : "r"(in)); return r;
+}
+
+INLINE int __bsf(int v) {
+ int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+}
+
+INLINE int __bsr(int v) {
+ int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+}
+
+INLINE int __btc(int v, int i) {
+ int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+INLINE int __bts(int v, int i) {
+ int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+INLINE int __btr(int v, int i) {
+ int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+INLINE size_t __bsf(size_t v) {
+ size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+}
+
+INLINE size_t __bsr(size_t v) {
+ size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+}
+
+INLINE size_t __btc(size_t v, size_t i) {
+ size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+INLINE size_t __bts(size_t v, size_t i) {
+ size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+INLINE size_t __btr(size_t v, size_t i) {
+ size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+INLINE void memoryFence(void) { _mm_mfence(); }
+
+typedef int32_t atomic32_t;
+
+INLINE int32_t atomic_add(int32_t volatile* value, int32_t input)
+{ asm volatile("lock xadd %0,%1" : "+r" (input), "+m" (*value) : "r" (input), "m" (*value)); return input; }
+
+INLINE int32_t atomic_cmpxchg(int32_t volatile* value, const int32_t input, int32_t comparand)
+{ asm volatile("lock cmpxchg %2,%0" : "=m" (*value), "=a" (comparand) : "r" (input), "m" (*value), "a" (comparand) : "flags"); return comparand; }
+
+#if defined(__X86_64__)
+
+ typedef int64_t atomic_t;
+
+ INLINE int64_t atomic_add(int64_t volatile* value, int64_t input)
+ { asm volatile("lock xaddq %0,%1" : "+r" (input), "+m" (*value) : "r" (input), "m" (*value)); return input; }
+
+ INLINE int64_t atomic_cmpxchg(int64_t volatile* value, const int64_t input, int64_t comparand)
+ { asm volatile("lock cmpxchgq %2,%0" : "+m" (*value), "+a" (comparand) : "r" (input), "m" (*value), "r" (comparand) : "flags"); return comparand; }
+
+#else
+
+ typedef int32_t atomic_t;
+
+#endif /* defined(__X86_64__) */
+
+#define GBE_COMPILER_READ_WRITE_BARRIER asm volatile("" ::: "memory");
+#define GBE_COMPILER_WRITE_BARRIER GBE_COMPILER_READ_WRITE_BARRIER
+#define GBE_COMPILER_READ_BARRIER GBE_COMPILER_READ_WRITE_BARRIER
+
+#endif /* __MSVC__ */
+
+template <typename T>
+INLINE T __load_acquire(volatile T *ptr)
+{
+ GBE_COMPILER_READ_WRITE_BARRIER;
+ T x = *ptr; // for x86, load == load_acquire
+ GBE_COMPILER_READ_WRITE_BARRIER;
+ return x;
+}
+
+template <typename T>
+INLINE void __store_release(volatile T *ptr, T x)
+{
+ GBE_COMPILER_READ_WRITE_BARRIER;
+ *ptr = x; // for x86, store == store_release
+ GBE_COMPILER_READ_WRITE_BARRIER;
+}
+#endif /* __GBE_INTRINSICS_HPP__ */
+
diff --git a/backend/src/sys/intrusive_list.cpp b/backend/src/sys/intrusive_list.cpp
new file mode 100644
index 0000000..ed7067c
--- /dev/null
+++ b/backend/src/sys/intrusive_list.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2007 Maciej Sinilo
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "intrusive_list.hpp"
+
+namespace gbe
+{
+ intrusive_list_base::intrusive_list_base() : m_root() {}
+
+ intrusive_list_base::size_type intrusive_list_base::size() const {
+ size_type numNodes(0);
+ const intrusive_list_node* iter = &m_root;
+ do {
+ iter = iter->next;
+ ++numNodes;
+ } while (iter != &m_root);
+ return numNodes - 1;
+ }
+
+ void append(intrusive_list_node *node, intrusive_list_node *prev) {
+ GBE_ASSERT(!node->in_list());
+ node->next = prev->next;
+ node->next->prev = node;
+ prev->next = node;
+ node->prev = prev;
+ }
+
+ void prepend(intrusive_list_node *node, intrusive_list_node *next) {
+ GBE_ASSERT(!node->in_list());
+ node->prev = next->prev;
+ node->prev->next = node;
+ next->prev = node;
+ node->next = next;
+ }
+
+ void link(intrusive_list_node* node, intrusive_list_node* nextNode) {
+ prepend(node, nextNode);
+ }
+
+ void unlink(intrusive_list_node* node) {
+ GBE_ASSERT(node->in_list());
+ node->prev->next = node->next;
+ node->next->prev = node->prev;
+ node->next = node->prev = node;
+ }
+} /* namespace gbe */
+
diff --git a/backend/src/sys/intrusive_list.hpp b/backend/src/sys/intrusive_list.hpp
new file mode 100644
index 0000000..2e2f2a9
--- /dev/null
+++ b/backend/src/sys/intrusive_list.hpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2007 Maciej Sinilo
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __GBE_INTRUSIVE_LIST_HPP__
+#define __GBE_INTRUSIVE_LIST_HPP__
+
+#include "sys/platform.hpp"
+
+namespace gbe
+{
+ /*! List elements must inherit from it */
+ struct intrusive_list_node
+ {
+ INLINE intrusive_list_node(void) { next = prev = this; }
+ INLINE bool in_list(void) const { return this != next; }
+ intrusive_list_node *next;
+ intrusive_list_node *prev;
+ };
+
+ /*! Insert node such that prev -> node */
+ void append(intrusive_list_node *node, intrusive_list_node *prev);
+ /*! Insert node such that node -> next */
+ void prepend(intrusive_list_node *node, intrusive_list_node *next);
+ /*! Same as prepend */
+ void link(intrusive_list_node* node, intrusive_list_node* nextNode);
+ /*! Remove the node from its current list */
+ void unlink(intrusive_list_node* node);
+
+ template<typename Pointer, typename Reference>
+ class intrusive_list_iterator
+ {
+ public:
+ typedef Pointer pointer;
+ typedef Reference reference;
+
+ INLINE intrusive_list_iterator(void): m_node(0) {}
+ INLINE intrusive_list_iterator(Pointer iterNode) : m_node(iterNode) {}
+
+ INLINE Reference operator*(void) const {
+ GBE_ASSERT(m_node);
+ return *m_node;
+ }
+ INLINE Pointer operator->(void) const { return m_node; }
+ INLINE Pointer node(void) const { return m_node; }
+
+ INLINE intrusive_list_iterator& operator++(void) {
+ m_node = static_cast<Pointer>(m_node->next);
+ return *this;
+ }
+ INLINE intrusive_list_iterator& operator--(void) {
+ m_node = static_cast<Pointer>(m_node->prev);
+ return *this;
+ }
+ INLINE intrusive_list_iterator operator++(int) {
+ intrusive_list_iterator copy(*this);
+ ++(*this);
+ return copy;
+ }
+ INLINE intrusive_list_iterator operator--(int) {
+ intrusive_list_iterator copy(*this);
+ --(*this);
+ return copy;
+ }
+
+ INLINE bool operator== (const intrusive_list_iterator& rhs) const {
+ return rhs.m_node == m_node;
+ }
+ INLINE bool operator!= (const intrusive_list_iterator& rhs) const {
+ return !(rhs == *this);
+ }
+ private:
+ Pointer m_node;
+ };
+
+ class intrusive_list_base
+ {
+ public:
+ typedef size_t size_type;
+
+ INLINE void pop_back(void) { unlink(m_root.prev); }
+ INLINE void pop_front(void) { unlink(m_root.next); }
+ INLINE bool empty(void) const { return !m_root.in_list(); }
+ size_type size(void) const;
+
+ protected:
+ intrusive_list_base(void);
+ INLINE ~intrusive_list_base(void) {}
+
+ intrusive_list_node m_root;
+
+ private:
+ intrusive_list_base(const intrusive_list_base&);
+ intrusive_list_base& operator=(const intrusive_list_base&);
+ };
+
+ template<class T>
+ class intrusive_list : public intrusive_list_base
+ {
+ public:
+ typedef T node_type;
+ typedef T value_type;
+ typedef intrusive_list_iterator<T*, T&> iterator;
+ typedef intrusive_list_iterator<const T*, const T&> const_iterator;
+
+ intrusive_list(void) : intrusive_list_base() {
+ intrusive_list_node* testNode((T*)0);
+ static_cast<void>(sizeof(testNode));
+ }
+
+ void push_back(value_type* v) { link(v, &m_root); }
+ void push_front(value_type* v) { link(v, m_root.next); }
+
+ iterator begin(void) { return iterator(upcast(m_root.next)); }
+ iterator end(void) { return iterator(upcast(&m_root)); }
+ iterator rbegin(void) { return iterator(upcast(m_root.prev)); }
+ iterator rend(void) { return iterator(upcast(&m_root)); }
+ const_iterator begin(void) const { return const_iterator(upcast(m_root.next)); }
+ const_iterator end(void) const { return const_iterator(upcast(&m_root)); }
+ const_iterator rbegin(void) const { return const_iterator(upcast(m_root.prev)); }
+ const_iterator rend(void) const { return const_iterator(upcast(&m_root)); }
+
+ INLINE value_type* front(void) { return upcast(m_root.next); }
+ INLINE value_type* back(void) { return upcast(m_root.prev); }
+ INLINE const value_type* front(void) const { return upcast(m_root.next); }
+ INLINE const value_type* back(void) const { return upcast(m_root.prev); }
+
+ iterator insert(iterator pos, value_type* v) {
+ link(v, pos.node());
+ return iterator(v);
+ }
+ iterator erase(iterator it) {
+ iterator itErase(it);
+ ++it;
+ unlink(itErase.node());
+ return it;
+ }
+ iterator erase(iterator first, iterator last) {
+ while (first != last) first = erase(first);
+ return first;
+ }
+
+ void clear(void) { erase(begin(), end()); }
+ void fast_clear(void) { m_root.next = m_root.prev = &m_root; }
+ static void remove(value_type* v) { unlink(v); }
+
+ private:
+ static INLINE node_type* upcast(intrusive_list_node* n) {
+ return static_cast<node_type*>(n);
+ }
+ static INLINE const node_type* upcast(const intrusive_list_node* n) {
+ return static_cast<const node_type*>(n);
+ }
+ };
+} /* namespace gbe */
+
+#endif /* __GBE_INTRUSIVE_LIST_HPP__ */
+
diff --git a/backend/src/sys/list.hpp b/backend/src/sys/list.hpp
new file mode 100644
index 0000000..51b9c39
--- /dev/null
+++ b/backend/src/sys/list.hpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file list.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_LIST_HPP__
+#define __GBE_LIST_HPP__
+
+#include "sys/platform.hpp"
+#include <list>
+
+namespace gbe
+{
+ /*! Use custom allocator instead of std one */
+ template <typename T>
+ class list : public std::list<T, Allocator<T>>
+ {
+ public:
+ // Typedefs
+ typedef T value_type;
+ typedef Allocator<value_type> allocator_type;
+ typedef std::list<T, allocator_type> parent_type;
+ typedef typename allocator_type::size_type size_type;
+
+ /*! Default constructor */
+ INLINE explicit list(const allocator_type &a = allocator_type()) :
+ parent_type(a) {}
+ /*! Repetitive constructor */
+ INLINE explicit list(size_type n,
+ const T &value = T(),
+ const allocator_type &a = allocator_type()) :
+ parent_type(n, value, a) {}
+ /*! Iteration constructor */
+ template <class InputIterator>
+ INLINE list(InputIterator first,
+ InputIterator last,
+ const allocator_type &a = allocator_type()) :
+ parent_type(first, last, a) {}
+ /*! Copy constructor */
+ INLINE list(const list &x) : parent_type(x) {}
+ GBE_CLASS(list);
+ };
+} /* namespace gbe */
+
+#endif /* __GBE_LIST_HPP__ */
+
diff --git a/backend/src/sys/map.hpp b/backend/src/sys/map.hpp
new file mode 100644
index 0000000..1c72400
--- /dev/null
+++ b/backend/src/sys/map.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file map.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_MAP_HPP__
+#define __GBE_MAP_HPP__
+
+#include "sys/platform.hpp"
+#include <map>
+
+namespace gbe
+{
+ /*! Use custom allocator instead of std one */
+ template<class Key, class T, class Pred = std::less<Key>>
+ class map : public std::map<Key,T,Pred,Allocator<std::pair<const Key, T>>>,
+ public NonCopyable
+ {
+ public:
+ // Typedefs
+ typedef std::pair<const Key, T> value_type;
+ typedef Allocator<value_type> allocator_type;
+ typedef std::map<Key,T,Pred,allocator_type> parent_type;
+ typedef Key key_type;
+ typedef T mapped_type;
+ typedef Pred key_compare;
+ typedef typename allocator_type::pointer pointer;
+ typedef typename allocator_type::const_pointer const_pointer;
+ typedef typename allocator_type::reference reference;
+ typedef typename allocator_type::const_reference const_reference;
+
+ /*! Default constructor */
+ INLINE map(const key_compare &comp = key_compare(),
+ const allocator_type &a = allocator_type()) :
+ parent_type(comp, a) {}
+ /*! Iteration constructor */
+ template<class InputIterator>
+ INLINE map(InputIterator first,
+ InputIterator last,
+ const key_compare &comp = key_compare(),
+ const allocator_type& a = allocator_type()) :
+ parent_type(first, last, comp, a) {}
+#if 0
+ /*! Copy constructor */
+ INLINE map(const map& x) : parent_type(x) {}
+#endif
+ /*! Better than using find if we do not care about the iterator itself */
+ INLINE bool contains(const Key &key) const {
+ return this->find(key) != this->end();
+ }
+ GBE_CLASS(map);
+ };
+} /* namespace gbe */
+
+#endif /* __GBE_MAP_HPP__ */
+
diff --git a/backend/src/sys/mutex.cpp b/backend/src/sys/mutex.cpp
new file mode 100644
index 0000000..9640150
--- /dev/null
+++ b/backend/src/sys/mutex.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "sys/mutex.hpp"
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace gbe
+{
+ /*! system mutex using windows API */
+ MutexSys::MutexSys( void ) { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); }
+ MutexSys::~MutexSys( void ) { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete ((CRITICAL_SECTION*)mutex); }
+ void MutexSys::lock( void ) { EnterCriticalSection((CRITICAL_SECTION*)mutex); }
+ void MutexSys::unlock( void ) { LeaveCriticalSection((CRITICAL_SECTION*)mutex); }
+}
+#endif
+
+#if defined(__UNIX__)
+#include <pthread.h>
+
+namespace gbe
+{
+ /*! system mutex using pthreads */
+ MutexSys::MutexSys( void ) { mutex = new pthread_mutex_t; pthread_mutex_init((pthread_mutex_t*)mutex, NULL); }
+ MutexSys::~MutexSys( void ) { pthread_mutex_destroy((pthread_mutex_t*)mutex); delete ((pthread_mutex_t*)mutex); }
+ void MutexSys::lock( void ) { pthread_mutex_lock((pthread_mutex_t*)mutex); }
+ void MutexSys::unlock( void ) { pthread_mutex_unlock((pthread_mutex_t*)mutex); }
+}
+#endif
+
diff --git a/backend/src/sys/mutex.hpp b/backend/src/sys/mutex.hpp
new file mode 100644
index 0000000..1a462b0
--- /dev/null
+++ b/backend/src/sys/mutex.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __GBE_MUTEX_HPP__
+#define __GBE_MUTEX_HPP__
+
+#include "platform.hpp"
+#include "atomic.hpp"
+#include <xmmintrin.h>
+
+namespace gbe
+{
+ class MutexSys {
+ friend class ConditionSys;
+ public:
+ MutexSys(void);
+ ~MutexSys(void);
+ void lock(void);
+ void unlock(void);
+ protected:
+ void* mutex;
+ MutexSys(const MutexSys&); // don't implement
+ MutexSys& operator= (const MutexSys&); // don't implement
+ GBE_CLASS(MutexSys);
+ };
+
+ /*! active mutex */
+ class MutexActive {
+ public:
+ INLINE MutexActive(void) : _lock(LOCK_IS_FREE) {}
+ INLINE void lock(void) {
+ GBE_COMPILER_READ_BARRIER;
+ while (cmpxchg(_lock, LOCK_IS_TAKEN, LOCK_IS_FREE) != LOCK_IS_FREE)
+ _mm_pause();
+ GBE_COMPILER_READ_BARRIER;
+ }
+ INLINE void unlock(void) { _lock.storeRelease(LOCK_IS_FREE); }
+ protected:
+ enum { LOCK_IS_FREE = 0, LOCK_IS_TAKEN = 1 };
+ Atomic _lock;
+ MutexActive(const MutexActive&); // don't implement
+ MutexActive& operator=(const MutexActive&); // don't implement
+ GBE_CLASS(MutexActive);
+ };
+
+ /*! safe mutex lock and unlock helper */
+ template<typename Mutex> class Lock {
+ public:
+ Lock (Mutex& mutex) : mutex(mutex) { mutex.lock(); }
+ ~Lock() { mutex.unlock(); }
+ protected:
+ Mutex& mutex;
+ Lock(const Lock&); // don't implement
+ Lock& operator= (const Lock&); // don't implement
+ GBE_CLASS(Lock);
+ };
+}
+
+#endif /* __GBE_MUTEX_HPP__ */
diff --git a/backend/src/sys/platform.cpp b/backend/src/sys/platform.cpp
new file mode 100644
index 0000000..95768ee
--- /dev/null
+++ b/backend/src/sys/platform.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "sys/platform.hpp"
+#include "sys/intrinsics.hpp"
+#include <string>
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __WIN32__
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace gbe
+{
+ double getSeconds() {
+ LARGE_INTEGER freq, val;
+ QueryPerformanceFrequency(&freq);
+ QueryPerformanceCounter(&val);
+ return (double)val.QuadPart / (double)freq.QuadPart;
+ }
+
+ void FATAL(const std::string &msg) {
+ std::cerr << msg << std::endl;
+ MessageBox(NULL, msg.c_str(), "Fatal Error", MB_OK | MB_ICONEXCLAMATION);
+ GBE_ASSERT(0);
+#ifdef __GNUC__
+ exit(-1);
+#else
+ _exit(-1);
+#endif /* __GNUC__ */
+ }
+
+} /* namespace gbe */
+#endif /* __WIN32__ */
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <sys/time.h>
+#include <unistd.h>
+
+namespace gbe
+{
+ double getSeconds() {
+ struct timeval tp; gettimeofday(&tp,NULL);
+ return double(tp.tv_sec) + double(tp.tv_usec)/1E6;
+ }
+
+ void FATAL(const std::string &msg) {
+ std::cerr << msg << std::endl;
+ GBE_ASSERT(0);
+ _exit(-1);
+ }
+} /* namespace gbe */
+
+#endif /* __UNIX__ */
+
diff --git a/backend/src/sys/platform.hpp b/backend/src/sys/platform.hpp
new file mode 100644
index 0000000..b8a2841
--- /dev/null
+++ b/backend/src/sys/platform.hpp
@@ -0,0 +1,441 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_PLATFORM_HPP__
+#define __GBE_PLATFORM_HPP__
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <ostream>
+#include <istream>
+#include <string>
+#include <cassert>
+#include <new>
+
+////////////////////////////////////////////////////////////////////////////////
+/// CPU architecture
+////////////////////////////////////////////////////////////////////////////////
+
+/* detect 32 or 64 platform */
+#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#define __X86_64__
+#else
+#define __X86__
+#endif
+
+/* We require SSE ... */
+#ifndef __SSE__
+#define __SSE__
+#endif
+
+/* ... and SSE2 */
+#ifndef __SSE2__
+#define __SSE2__
+#endif
+
+#if defined(_INCLUDED_IMM)
+// #define __AVX__
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600) && !defined(__INTEL_COMPILER) || defined(_DEBUG) && defined(_WIN32)
+#define __NO_AVX__
+#endif
+
+#if defined(_MSC_VER) && !defined(__SSE4_2__)
+// #define __SSE4_2__ //! activates SSE4.2 support
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Operating system
+////////////////////////////////////////////////////////////////////////////////
+
+/* detect Linux platform */
+#if defined(linux) || defined(__linux__) || defined(__LINUX__)
+# if !defined(__LINUX__)
+# define __LINUX__
+# endif
+# if !defined(__UNIX__)
+# define __UNIX__
+# endif
+#endif
+
+/* detect FreeBSD platform */
+#if defined(__FreeBSD__) || defined(__FREEBSD__)
+# if !defined(__FREEBSD__)
+# define __FREEBSD__
+# endif
+# if !defined(__UNIX__)
+# define __UNIX__
+# endif
+#endif
+
+/* detect Windows 95/98/NT/2000/XP/Vista/7 platform */
+#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__)
+# if !defined(__WIN32__)
+# define __WIN32__
+# endif
+#endif
+
+/* detect Cygwin platform */
+#if defined(__CYGWIN__)
+# if !defined(__UNIX__)
+# define __UNIX__
+# endif
+#endif
+
+/* detect MAC OS X platform */
+#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__)
+# if !defined(__MACOSX__)
+# define __MACOSX__
+# endif
+# if !defined(__UNIX__)
+# define __UNIX__
+# endif
+#endif
+
+/* try to detect other Unix systems */
+#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix)
+# if !defined(__UNIX__)
+# define __UNIX__
+# endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Compiler
+////////////////////////////////////////////////////////////////////////////////
+
+/*! GCC compiler */
+#ifdef __GNUC__
+// #define __GNUC__
+#endif
+
+/*! Intel compiler */
+#ifdef __INTEL_COMPILER
+#define __ICC__
+#endif
+
+/*! Visual C compiler */
+#ifdef _MSC_VER
+#define __MSVC__
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Makros
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __WIN32__
+#define __dllexport extern "C" __declspec(dllexport)
+#define __dllimport extern "C" __declspec(dllimport)
+#else
+#define __dllexport extern "C"
+#define __dllimport extern "C"
+#endif
+
+#ifdef __MSVC__
+#undef NOINLINE
+#define NOINLINE __declspec(noinline)
+#define INLINE __forceinline
+#define RESTRICT __restrict
+#define THREAD __declspec(thread)
+#define ALIGNED(...) __declspec(align(__VA_ARGS__))
+//#define __FUNCTION__ __FUNCTION__
+#define DEBUGBREAK() __debugbreak()
+#else
+#undef NOINLINE
+#undef INLINE
+#define NOINLINE __attribute__((noinline))
+#define INLINE inline __attribute__((always_inline))
+#define RESTRICT __restrict
+#define THREAD __thread
+#define ALIGNED(...) __attribute__((aligned(__VA_ARGS__)))
+#define __FUNCTION__ __PRETTY_FUNCTION__
+#define DEBUGBREAK() asm ("int $3")
+#endif
+
+/*! Modern x86 processors */
+#define CACHE_LINE 64
+#define CACHE_LINE_ALIGNED ALIGNED(CACHE_LINE)
+
+#ifdef __GNUC__
+ #define MAYBE_UNUSED __attribute__((used))
+#else
+ #define MAYBE_UNUSED
+#endif
+
+#if defined(_MSC_VER)
+#define __builtin_expect(expr,b) expr
+#endif
+
+/*! Debug syntactic sugar */
+#if GBE_DEBUG
+#define IF_DEBUG(EXPR) EXPR
+#else
+#define IF_DEBUG(EXPR)
+#endif /* GBE_DEBUG */
+
+/*! Debug printing macros */
+#define STRING(x) #x
+#define PING std::cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << std::endl
+#define PRINT(x) std::cout << STRING(x) << " = " << (x) << std::endl
+
+/*! Branch hint */
+#define LIKELY(x) __builtin_expect(!!(x),1)
+#define UNLIKELY(x) __builtin_expect((x),0)
+
+/*! Stringify macros */
+#define JOIN(X, Y) _DO_JOIN(X, Y)
+#define _DO_JOIN(X, Y) _DO_JOIN2(X, Y)
+#define _DO_JOIN2(X, Y) X##Y
+
+/*! Run-time assertion */
+#if GBE_DEBUG
+#define GBE_ASSERT(EXPR) do { \
+ if (UNLIKELY(!(EXPR))) \
+ gbe::onFailedAssertion(#EXPR, __FILE__, __FUNCTION__, __LINE__); \
+} while (0)
+#define GBE_ASSERTM(EXPR, MSG) do { \
+ if (UNLIKELY(!(EXPR))) \
+ gbe::onFailedAssertion(MSG, __FILE__, __FUNCTION__, __LINE__); \
+} while (0)
+#else
+#define GBE_ASSERT(EXPR) do { } while (0)
+#define GBE_ASSERTM(EXPR, MSG) do { } while (0)
+#endif /* GBE_DEBUG */
+
+#define NOT_IMPLEMENTED GBE_ASSERTM (false, "Not implemented")
+#define NOT_SUPPORTED GBE_ASSERTM (false, "Not supported")
+
+/*! Fatal error macros */
+#define FATAL_IF(COND, MSG) \
+do { \
+ if(UNLIKELY(COND)) FATAL(MSG); \
+} while (0)
+
+/* Safe deletion macros */
+#define GBE_SAFE_DELETE_ARRAY(x) do { if (x != NULL) GBE_DELETE_ARRAY(x); } while (0)
+#define GBE_SAFE_DELETE(x) do { if (x != NULL) GBE_DELETE(x); } while (0)
+
+/* Number of elements in an array */
+#define ARRAY_ELEM_NUM(x) (sizeof(x) / sizeof(x[0]))
+
+/* Align X on A */
+#define ALIGN(X,A) (((X) % (A)) ? ((X) + (A) - ((X) % (A))) : (X))
+
+/*! Produce a string from the macro locatiom */
+#define HERE (STRING(__LINE__) "@" __FILE__)
+
+/*! Typesafe encapusalation of a type (mostly for integers) */
+#define TYPE_SAFE(SAFE, UNSAFE) \
+class SAFE \
+{ \
+public: \
+ INLINE SAFE(void) {} \
+ explicit INLINE SAFE(uint16_t unsafe) : unsafe(unsafe) {} \
+ INLINE operator UNSAFE (void) const { return unsafe; } \
+ UNSAFE value(void) const { return unsafe; } \
+private: \
+ UNSAFE unsafe; \
+};
+
+/*! Default alignment for the platform */
+#define GBE_DEFAULT_ALIGNMENT 16
+
+/*! Useful constants */
+#define KB 1024
+#define MB (KB*KB)
+
+/*! Portable AlignOf */
+template <typename T>
+struct AlignOf {
+ struct Helper { char x; T t; };
+ enum { value = offsetof(Helper, t) };
+};
+
+//gcc 4.8+ support C++11 alignof keyword
+#if (__GNUC__ >= 4 && __GNUC_MINOR__ >= 8)
+#define ALIGNOF(T) (alignof(T))
+#else
+#define ALIGNOF(T) (AlignOf<T>::value)
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Visibility parameters (DLL export and so on)
+////////////////////////////////////////////////////////////////////////////////
+#if defined __WIN32__
+ #if defined __GNUC__
+ #define GBE_EXPORT_SYMBOL __attribute__ ((dllexport))
+ #define GBE_IMPORT_SYMBOL __attribute__ ((dllimport))
+ #else
+ #define GBE_IMPORT_SYMBOL __declspec(dllimport)
+ #define GBE_EXPORT_SYMBOL __declspec(dllexport)
+ #endif /* __GNUC__ */
+#else
+ #define GBE_EXPORT_SYMBOL __attribute__ ((visibility ("default")))
+ #define GBE_IMPORT_SYMBOL
+#endif /* __WIN32__ */
+
+////////////////////////////////////////////////////////////////////////////////
+/// Basic Types
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__MSVC__)
+typedef __int64_t int64_t;
+typedef unsigned __int64_t uint64_t;
+typedef __int32_t int32_t;
+typedef unsigned __int32_t uint32_t;
+typedef __int16_t int16_t;
+typedef unsigned __int16_t uint16_t;
+typedef __int8_t int8_t;
+typedef unsigned __int8_t uint8_t;
+#else
+#include <cstdint>
+#endif
+
+#if defined(__X86_64__)
+typedef int64_t index_t;
+#else
+typedef int32_t index_t;
+#endif
+
+/*! To protect some classes from being copied */
+class NonCopyable
+{
+protected:
+ INLINE NonCopyable(void) {}
+ INLINE ~NonCopyable(void) {}
+private:
+ INLINE NonCopyable(const NonCopyable&) {}
+ INLINE NonCopyable& operator= (const NonCopyable&) {return *this;}
+};
+
+#define TO_MAGIC(A, B, C, D) (A<<24 | B<<16 | C<<8 | D)
+
+class Serializable
+{
+public:
+ INLINE Serializable(void) = default;
+ INLINE Serializable(const Serializable&) = default;
+ INLINE Serializable& operator= (const Serializable&) = default;
+
+ virtual size_t serializeToBin(std::ostream& outs) = 0;
+ virtual size_t deserializeFromBin(std::istream& ins) = 0;
+
+ /* These two will follow LLVM's ABI. */
+ virtual size_t serializeToLLVM(void) { return 0;/* not implemented now. */}
+ virtual size_t deserializeFromLLVM(void) { return 0;/* not implemented now. */}
+
+ virtual void printStatus(int indent = 0, std::ostream& outs = std::cout) { }
+
+ virtual ~Serializable(void) { }
+
+protected:
+ static std::string indent_to_str(int indent) {
+ std::string ind(indent, ' ');
+ return ind;
+ }
+};
+
+/* Help Macro for serialization. */
+#define SERIALIZE_OUT(elt, out, sz) \
+ do { \
+ auto tmp_val = elt; \
+ out.write((char *)(&tmp_val), sizeof(elt)); \
+ sz += sizeof(elt); \
+ } while(0)
+
+#define DESERIALIZE_IN(elt, in, sz) \
+ do { \
+ in.read((char *)(&(elt)), sizeof(elt)); \
+ sz += sizeof(elt); \
+ } while(0)
+
+////////////////////////////////////////////////////////////////////////////////
+/// Disable some compiler warnings
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __ICC__
+#pragma warning(disable:265) // floating-point operation result is out of range
+#pragma warning(disable:383) // value copied to temporary, reference to temporary used
+#pragma warning(disable:869) // parameter was never referenced
+#pragma warning(disable:981) // operands are evaluated in unspecified order
+#pragma warning(disable:1418) // external function definition with no prior declaration
+#pragma warning(disable:1419) // external declaration in primary source file
+#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable
+#pragma warning(disable:1125) // virtual function override intended?
+#endif /* __ICC__ */
+
+////////////////////////////////////////////////////////////////////////////////
+/// Default Includes and Functions
+////////////////////////////////////////////////////////////////////////////////
+
+#include "sys/alloc.hpp"
+
+namespace gbe
+{
+ /*! selects */
+ INLINE bool select(bool s, bool t , bool f) { return s ? t : f; }
+ INLINE int select(bool s, int t, int f) { return s ? t : f; }
+ INLINE float select(bool s, float t, float f) { return s ? t : f; }
+
+ /*! Fatal error function */
+ void FATAL(const std::string&);
+
+ /*! Return the next power of 2 */
+ INLINE uint32_t nextHighestPowerOf2(uint32_t x) {
+ x--;
+ x |= x >> 1;
+ x |= x >> 2;
+ x |= x >> 4;
+ x |= x >> 8;
+ x |= x >> 16;
+ return ++x;
+ }
+
+ INLINE uint32_t logi2(uint32_t x) {
+ uint32_t r = 0;
+ while(x >>= 1) r++;
+ return r;
+ }
+
+ template<uint32_t N>
+ INLINE uint32_t isPowerOf(uint32_t i) {
+ while (i > 1) {
+ if (i%N) return false;
+ i = i/N;
+ }
+ return true;
+ }
+ template<> INLINE uint32_t isPowerOf<2>(uint32_t i) { return ((i-1)&i) == 0; }
+
+ /*! random functions */
+ template<typename T> T random() { return T(0); }
+ template<> INLINE int32_t random() { return int(rand()); }
+ template<> INLINE uint32_t random() { return uint32_t(rand()); }
+ template<> INLINE float random() { return random<uint32_t>()/float(RAND_MAX); }
+ template<> INLINE double random() { return random<uint32_t>()/double(RAND_MAX); }
+
+ /** returns performance counter in seconds */
+ double getSeconds();
+
+} /* namespace gbe */
+
+#endif /* __GBE_PLATFORM_HPP__ */
+
diff --git a/backend/src/sys/set.hpp b/backend/src/sys/set.hpp
new file mode 100644
index 0000000..db68807
--- /dev/null
+++ b/backend/src/sys/set.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file set.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_SET_HPP__
+#define __GBE_SET_HPP__
+
+#include "sys/platform.hpp"
+#include <set>
+
+namespace gbe
+{
+ /*! Add our custom allocator to std::set */
+ template<class Key, class Pred = std::less<Key>>
+ class set : public std::set<Key,Pred,Allocator<Key>>, public NonCopyable
+ {
+ public:
+ // Typedefs
+ typedef Key value_type;
+ typedef Allocator<value_type> allocator_type;
+ typedef std::set<Key,Pred,Allocator<Key>> parent_type;
+ typedef Key key_type;
+ typedef Pred key_compare;
+
+ /*! Default constructor */
+ INLINE set(const key_compare &comp = key_compare(),
+ const allocator_type &a = allocator_type()) :
+ parent_type(comp, a) {}
+ /*! Iteration constructor */
+ template<class InputIterator>
+ INLINE set(InputIterator first,
+ InputIterator last,
+ const key_compare &comp = key_compare(),
+ const allocator_type& a = allocator_type()) :
+ parent_type(first, last, comp, a) {}
+#if 0
+ /*! Copy constructor */
+ INLINE set(const set& x) : parent_type(x) {}
+#endif
+ /*! Better than using find if we do not care about the iterator itself */
+ INLINE bool contains(const Key &key) const {
+ return this->find(key) != this->end();
+ }
+ GBE_CLASS(set);
+ };
+
+} /* namespace gbe */
+
+#endif /* __GBE_SET_HPP__ */
+
diff --git a/backend/src/sys/vector.hpp b/backend/src/sys/vector.hpp
new file mode 100644
index 0000000..dc89991
--- /dev/null
+++ b/backend/src/sys/vector.hpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file vector.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_VECTOR_HPP__
+#define __GBE_VECTOR_HPP__
+
+#include "sys/platform.hpp"
+#include <vector>
+
+namespace gbe
+{
+ /*! Add bound checks to the standard vector class and use the internal
+ * allocator
+ */
+ template<class T>
+ class vector : public std::vector<T, Allocator<T>>
+ {
+ public:
+ // Typedefs
+ typedef std::vector<T, Allocator<T>> parent_type;
+ typedef Allocator<T> allocator_type;
+ typedef typename allocator_type::size_type size_type;
+ typedef typename parent_type::iterator iterator;
+
+ /*! Default constructor */
+ INLINE explicit vector(const allocator_type &a = allocator_type()) :
+ parent_type(a) {}
+#if 0
+ /*! Copy constructor */
+ INLINE vector(const vector &x) : parent_type(x) {}
+#endif
+ /*! Repetitive sequence constructor */
+ INLINE explicit vector(size_type n,
+ const T& value= T(),
+ const allocator_type &a = allocator_type()) :
+ parent_type(n, value, a) {}
+ /*! Iteration constructor */
+ template <class InputIterator>
+ INLINE vector(InputIterator first,
+ InputIterator last,
+ const allocator_type &a = allocator_type()) :
+ parent_type(first, last, a) {}
+ /*! Get element at position index (with a bound check) */
+ T &operator[] (size_t index) {
+ GBE_ASSERT(index < this->size());
+ return parent_type::operator[] (index);
+ }
+ /*! Get element at position index (with a bound check) */
+ const T &operator[] (size_t index) const {
+ GBE_ASSERT(index < this->size());
+ return parent_type::operator[] (index);
+ }
+ GBE_CLASS(vector);
+ };
+} /* namespace gbe */
+
+#endif /* __GBE_VECTOR_HPP__ */
+
diff --git a/backend/src/update.sh b/backend/src/update.sh
new file mode 100755
index 0000000..0e5f8c0
--- /dev/null
+++ b/backend/src/update.sh
@@ -0,0 +1,3 @@
+#! /bin/sh -e
+./update_as.sh
+./update_convert.sh
diff --git a/backend/src/update_as.sh b/backend/src/update_as.sh
new file mode 100755
index 0000000..c68e789
--- /dev/null
+++ b/backend/src/update_as.sh
@@ -0,0 +1,11 @@
+#! /bin/sh -e
+
+AS_HEADER=ocl_as.h
+
+exec >$AS_HEADER.tmp
+echo "// This file is autogenerated by gen_as.sh."
+echo "// Don't modify it manually."
+./gen_as.sh
+exec >&2
+
+mv $AS_HEADER.tmp $AS_HEADER
diff --git a/backend/src/update_blob_ocl_header.py b/backend/src/update_blob_ocl_header.py
new file mode 100755
index 0000000..50f2501
--- /dev/null
+++ b/backend/src/update_blob_ocl_header.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2012 Intel Corporation
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library. If not, see <http://www.gnu.org/licenses/>.
+#
+# Author: Zhigang Gong <zhigang.gong at linux.intel.com>
+#/
+import sys
+import os
+
+if len(sys.argv) != 3:
+ print "Invalid argument {0}".format(sys.argv)
+ print "use {0} tmpl_file_name output_file_name".format(sys.argv[0])
+ raise
+
+def safeUnlink(filename):
+ try:
+ os.remove(filename)
+ except OSError:
+ pass
+
+header_segments = [ "vector", "as", "convert", "common_defines"]
+blobFileName = sys.argv[2]
+blobTempName = sys.argv[2] + '.tmp'
+safeUnlink(blobFileName)
+tmplFile = open(sys.argv[1], 'r')
+blob = open(blobTempName, 'w')
+path = os.path.dirname(sys.argv[1])
+if path == '':
+ path = '.'
+
+matched_header = ""
+for tline in tmplFile:
+ if matched_header == "":
+ blob.write(tline)
+ for header in header_segments:
+ if tline.strip() == '// ##BEGIN_{0}##'.format(header.upper()) :
+ hFile = open(path + '/ocl_' + header + '.h', 'r')
+ lineNr = 0
+ for hline in hFile:
+ if lineNr >= 2: #ignore the 2 lines of comment at the top of file.
+ blob.write(hline)
+ lineNr += 1
+ hFile.close()
+ matched_header = header
+ else:
+ if tline.strip() == '// ##END_{0}##'.format(matched_header.upper()) :
+ blob.write(tline)
+ matched_header = "";
+
+tmplFile.close()
+blob.close()
+os.rename(blobTempName, blobFileName)
diff --git a/backend/src/update_convert.sh b/backend/src/update_convert.sh
new file mode 100755
index 0000000..3c47917
--- /dev/null
+++ b/backend/src/update_convert.sh
@@ -0,0 +1,12 @@
+#! /bin/sh -e
+
+CONVERT_HEADER=ocl_convert.h
+
+
+exec >$CONVERT_HEADER.tmp
+echo "// This file is autogenerated by gen_convert.sh."
+echo "// Don't modify it manually."
+./gen_convert.sh
+exec >&2
+
+mv $CONVERT_HEADER.tmp $CONVERT_HEADER
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
new file mode 100644
index 0000000..d96a2e0
--- /dev/null
+++ b/benchmark/CMakeLists.txt
@@ -0,0 +1,21 @@
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../utests
+ ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+
+
+link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
+set (benchmark_sources
+ ../utests/utest_error.c
+ ../utests/utest_assert.cpp
+ ../utests/utest.cpp
+ ../utests/utest_file_map.cpp
+ ../utests/utest_helper.cpp
+ enqueue_copy_buf.cpp)
+
+ADD_LIBRARY(benchmarks SHARED ${ADDMATHFUNC} ${benchmark_sources})
+
+#TARGET_LINK_LIBRARIES(benchmarks cl m ${OPENGL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+TARGET_LINK_LIBRARIES(benchmarks cl m)
+
+ADD_EXECUTABLE(benchmark_run benchmark_run.cpp)
+TARGET_LINK_LIBRARIES(benchmark_run benchmarks)
diff --git a/benchmark/benchmark_run.cpp b/benchmark/benchmark_run.cpp
new file mode 100644
index 0000000..b29ccc3
--- /dev/null
+++ b/benchmark/benchmark_run.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest_run.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * Just run the unit tests. The user can possibly provides the subset of it
+ */
+#include "utest_helper.hpp"
+#include "utest_exception.hpp"
+#include <iostream>
+#include <getopt.h>
+
+static const char *shortopts = "c:lanh";
+struct option longopts[] = {
+{"casename", required_argument, NULL, 'c'},
+{"list", no_argument, NULL, 'l'},
+{"all", no_argument, NULL, 'a'},
+{"allnoissue", no_argument, NULL, 'n'},
+{"help", no_argument, NULL, 'h'},
+{0, 0, 0, 0},
+};
+
+void usage()
+{
+ std::cout << "\
+Usage:\n\
+ ./utest_run <option>\n\
+\n\
+ option:\n\
+ -c <casename>: run sub-case named 'casename'\n\
+ -l : list all the available case name\n\
+ -a : run all test cases\n\
+ -n : run all test cases without known issue (default option)\n\
+ -h : display this usage\n\
+\
+ "<< std::endl;
+}
+
+int main(int argc, char *argv[])
+{
+
+ int c = 0;
+ cl_ocl_init();
+
+ c = getopt_long (argc, argv, shortopts, longopts, NULL);
+
+ if (argc == 1)
+ c = 'n';
+ if (argc == 2 && c < 1 ){
+ c = 'c';
+ optarg = argv[1];
+ }
+
+ do {
+ switch (c)
+ {
+ case 'c':
+ try {
+ UTest::run(optarg);
+ }
+ catch (Exception e){
+ std::cout << " " << e.what() << " [SUCCESS]" << std::endl;
+ }
+
+ break;
+
+ case 'l':
+ UTest::listAllCases();
+ break;
+
+ case 'a':
+ try {
+ UTest::runAll();
+ }
+ catch (Exception e){
+ std::cout << " " << e.what() << " [SUCCESS]" << std::endl;
+ }
+
+ break;
+
+ case 'n':
+ try {
+ UTest::runAllNoIssue();
+ }
+ catch (Exception e){
+ std::cout << " " << e.what() << " [SUCCESS]" << std::endl;
+ }
+
+ break;
+
+ case 'h':
+ default:
+ usage();
+ exit(1);
+ }
+ } while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1);
+
+ cl_ocl_destroy();
+}
diff --git a/benchmark/enqueue_copy_buf.cpp b/benchmark/enqueue_copy_buf.cpp
new file mode 100644
index 0000000..0d0d4df
--- /dev/null
+++ b/benchmark/enqueue_copy_buf.cpp
@@ -0,0 +1,69 @@
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
+{
+ unsigned int i;
+ cl_char* buf0;
+
+ OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
+
+ buf0 = (cl_char *)clEnqueueMapBuffer(queue, buf[0], CL_TRUE, CL_MAP_WRITE, 0, sizeof(char), 0, NULL, NULL, NULL);
+
+ for (i=0; i < sz; i++) {
+ buf0[i]=(rand() & 0xFF);
+ }
+
+ clEnqueueUnmapMemObject(queue, buf[0], buf0, 0, NULL, NULL);
+
+ if (src_off + cb > sz || dst_off + cb > sz) {
+ /* Expect Error. */
+ OCL_ASSERT(clEnqueueCopyBuffer(queue, buf[0], buf[1],
+ src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+ return;
+ }
+
+ OCL_ASSERT(CL_SUCCESS == clEnqueueCopyBuffer(queue, buf[0], buf[1],
+ src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+}
+
+int tim_subtract(struct timeval *y, struct timeval *x, struct timeval *result){
+ if ( x->tv_sec > y->tv_sec )
+ return -1;
+
+ if ((x->tv_sec == y->tv_sec) && (x->tv_usec > y->tv_usec))
+ return -1;
+
+ if ( result != NULL){
+ result->tv_sec = ( y->tv_sec - x->tv_sec );
+ result->tv_usec = ( y->tv_usec - x->tv_usec );
+
+ if (result->tv_usec < 0){
+ result->tv_sec --;
+ result->tv_usec += 1000000;
+ }
+ }
+
+ int msec = 1000.0*(y->tv_sec - x->tv_sec) + (y->tv_usec - x->tv_usec)/1000.0;
+ return msec;
+}
+
+
+int enqueue_copy_buf(void)
+{
+ size_t i;
+ const size_t sz = 127 *1023 * 1023;
+ struct timeval start,stop;
+
+ gettimeofday(&start,0);
+
+ for (i=0; i<10; i++) {
+ test_copy_buf(sz, 0, 0, sz);
+ }
+
+ gettimeofday(&stop,0);
+ return tim_subtract(&stop, &start, 0);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(enqueue_copy_buf);
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
new file mode 100644
index 0000000..7e5b730
--- /dev/null
+++ b/docs/Beignet.mdwn
@@ -0,0 +1,230 @@
+Beignet
+=======
+
+Beignet is an open source implementation of the OpenCL specification - a generic
+compute oriented API. This code base contains the code to run OpenCL programs on
+Intel GPUs which basically defines and implements the OpenCL host functions
+required to initialize the device, create the command queues, the kernels and
+the programs and run them on the GPU. The code base also contains the compiler
+part of the stack which is included in `backend/`. For more specific information
+about the compiler, please refer to `backend/README.md`
+
+News
+----
+[[Beignet project news|Beignet/NEWS]]
+
+Prerequisite
+------------
+
+The project depends on the following external libaries:
+
+- libdrm libraries (libdrm and libdrm\_intel)
+- Various LLVM components
+- If run with X server, beignet needs XLib, Xfixes and Xext installed. Otherwise, no X11 dependency.
+
+And if you want to work with the standard ICD libOpenCL.so, then you need
+two more packages (the following package name is for Ubuntu):
+
+- ocl-icd-dev
+- ocl-icd-libopencl1
+
+If you don't want to enable ICD, or your system doesn't have ICD OpenCL support,
+you can still link to the beignet OpenCL library. You can find the beignet/libcl.so
+in your system's library installation directories.
+
+Note that the compiler depends on LLVM (Low-Level Virtual Machine project).
+Right now, the code has been compiled with LLVM 3.3/3.4. It will not compile
+with anything older.
+
+[http://llvm.org/releases/](http://llvm.org/releases/)
+
+LLVM 3.3 , 3.4 and 3.5 are supported. Till now, the recommended LLVM/CLANG version is 3.5.
+There are some severe OpenCL related regression in clang 3.4 version.
+
+**Note about LLVM 3.4**
+
+* If you want to try Clang/LLVM 3.4, you need to disable terminfo:
+--disable-terminfo. It's a llvm 3.4 bug.
+
+Please be noted that the code was compiled on GCC 4.6, GCC 4.7 and GCC 4.8. Since the code
+uses really recent C++11 features, you may expect problems with older compilers.
+
+How to build and install
+------------------------
+
+The project uses CMake with three profiles:
+
+1. Debug (-g)
+2. RelWithDebInfo (-g with optimizations)
+3. Release (only optimizations)
+
+Basically, from the root directory of the project
+
+`> mkdir build`
+
+`> cd build`
+
+`> cmake ../ # to configure`
+
+CMake will check the dependencies and will complain if it does not find them.
+
+`> make`
+
+The cmake will build the backend firstly. Please refer to:
+[[OpenCL Gen Backend|Beignet/Backend]] to get more dependencies.
+
+Once built, the run-time produces a shared object libcl.so which basically
+directly implements the OpenCL API. A set of tests are also produced. They may
+be found in `utests/`.
+
+Simply invoke:
+`> make install`
+
+It installs the following six files to the beignet/ directory relatively to
+your library installation directory.
+- libcl.so
+- libgbeinterp.so
+- libgbe.so
+- ocl\_stdlib.h, ocl\_stdlib.h.pch
+- beignet.bc
+
+It installs the OCL icd vendor files to /etc/OpenCL/vendors, if the system support ICD.
+- intel-beignet.icd
+
+How to run
+----------
+
+After build and install of beignet, you may need to check whether it works on your
+platform. Beignet also produces various tests to ensure the compiler and the run-time
+consistency. This small test framework uses a simple c++ registration system to
+register all the unit tests.
+
+You need to call setenv.sh in the utests/ directory to set some environment variables
+firstly as below:
+
+`> . setenv.sh`
+
+Then in `utests/`:
+
+`> ./utest_run`
+
+will run all the unit tests one after the others
+
+`> ./utest_run some_unit_test0 some_unit_test1`
+
+will only run `some_unit_test0` and `some_unit_test1` tests
+
+On all supported target platform, the pass rate should be 100%. If it is not, you may
+need to refer the "Known Issues" section.
+
+Supported Targets
+-----------------
+
+ * 3rd Generation Intel Core Processors
+ * Intel “Bay Trail” platforms with Intel HD Graphics
+ * 4th Generation Intel Core Processors, need kernel patch currently, see below
+ for details:
+
+Known Issues
+------------
+
+* GPU hang issues.
+ To check whether GPU hang, you could execute dmesg and check whether it has the following message:
+ `[17909.175965] [drm:i915_hangcheck_hung] *ERROR* Hangcheck timer elapsed...`
+ If it does, there was a GPU hang. Usually, this means something wrong in the kernel, as it indicates
+ the OCL kernel hasn't finished for about 6 seconds or even more. If you think the OCL kernel does need
+ to run that long and have confidence with the kernel, you could disable the linux kernel driver's
+ hang check feature to fix this hang issue. Just invoke the following command on Ubuntu system:
+
+ `# echo -n 0 > /sys/module/i915/parameters/enable_hangcheck`
+
+ But this command is a little bit dangerous, as if your kernel really hang, then the gpu will lock up
+ forever until a reboot.
+
+* Almost all unit tests fail on Linux kernel 3.15/3.16.
+ There is a known issue in some versions of linux kernel which enable register whitelist feature
+ but miss some necessary registers which are required for beignet. The problematic version are
+ around 3.15 and 3.16 which have commit f0a346b... but haven't commit c9224f... If it is the case,
+ you can apply c9224f... manually and rebuild the kernel or just disable the parse command by
+ invoke the following command (use Ubuntu as an example):
+ `# echo 0 > /sys/module/i915/parameters/enable_cmd_parser`
+
+* Some unit test cases, maybe 20 to 30, fail on 4th Generation (HSW) platform.
+ The 4th Generation Intel Core Processors's support requires some Linux kernel
+ modification. You need to apply the patch at:
+ [https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support](https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support)
+
+* Precision issue.
+ Currently Gen does not provide native support of high precision math functions
+ required by OpenCL. We provide a software version to achieve high precision,
+ which you can turn on through `export OCL_STRICT_CONFORMANCE=1`.
+ But be careful, this would make your CL kernel run a little longer.
+
+* cl\_khr\_gl\_sharing.
+ This extension highly depends on mesa support. It seems that mesa would not provide
+ such type of extensions, we may have to hack with mesa source code to support this
+ extension. This feature used to work with a previous mesa git version. But now, it's
+ simply broken.
+
+TODO
+----
+
+In terms of the OpenCL 1.2 spec, beignet is quite complete now. We can pass almost
+all the piglit OpenCL test cases now. And the pass rate for the OpenCV test suite
+is also good which is about 99%. There are still some remains work items listed as below,
+most of them are extension support and performance related.
+
+- Performance tuning. There are some major optimizations need to be done,
+ Peephole optimization, convert to structured BBs and leverage Gen's structured
+ instructions, and optimize the extreme slow software based sin/cos/... math
+ functions due to the native math instruction lack of necessary precision.
+ And all the code is inlined which will increase the icache miss rate
+ significantly. And many other things which are specified partially in
+ [[here|Beignet/Backend/TODO]].
+
+- Complete cl\_khr\_gl\_sharing support. We lack of some APIs implementation such
+ as clCreateFromGLBuffer,clCreateFromGLRenderbuffer,clGetGLObjectInfo... Currently,
+ the working APIs are clCreateFromGLTexture,clCreateFromGLTexture2D. We may need to
+ find a graceful way to co-work with mesa.
+
+- Check that NDRangeKernels can be pushed into _different_ queues from several
+ threads.
+
+- No state tracking at all. One batch buffer is created at each "draw call"
+ (i.e. for each NDRangeKernels). This is really inefficient since some
+ expensive pipe controls are issued for each batch buffer.
+
+- Valgrind reports some leaks in libdrm. It sounds like a false positive but it
+ has to be checked. Idem for LLVM. There is one leak here to check.
+
+More generally, everything in the run-time that triggers the "FATAL" macro means
+that something that must be supported is not implemented properly (either it
+does not comply with the standard or it is just missing)
+
+Project repository
+------------------
+Right now, we host our project on fdo at:
+[http://cgit.freedesktop.org/beignet/](http://cgit.freedesktop.org/beignet/).
+And the intel 01.org:
+[https://01.org/beignet](https://01.org/beignet)
+
+The team
+--------
+Beignet project was created by Ben Segovia. Since 2013, Now we have a team in
+Intel China OTC graphics team continue to work on this project.
+The official contact for this project is: Zou Nanhai (<nanhai.zou at intel.com>).
+
+How to contribute
+-----------------
+You are always welcome to contribute to this project, just need to subscribe
+to the beignet mail list and send patches to it for review.
+The official mail list is as below:
+[http://lists.freedesktop.org/mailman/listinfo/beignet](http://lists.freedesktop.org/mailman/listinfo/beignet)
+
+Documents for OpenCL application developers
+-------------------------------------------
+- [[Cross compile|Beignet/howto/cross-compiler-howto]]
+- [[Kernel Optimization Guide|Beignet/optimization-guide]]
+
+The wiki URL is as below:
+[http://www.freedesktop.org/wiki/Software/Beignet/](http://www.freedesktop.org/wiki/Software/Beignet/)
diff --git a/docs/Beignet/Backend.mdwn b/docs/Beignet/Backend.mdwn
new file mode 100644
index 0000000..319ce81
--- /dev/null
+++ b/docs/Beignet/Backend.mdwn
@@ -0,0 +1,96 @@
+Beignet Compiler
+================
+
+This code base contains the compiler part of the Beignet OpenCL stack. The
+compiler is responsible to take a OpenCL language string and to compile it into
+a binary that can be executed on Intel integrated GPUs.
+
+Limitations
+-----------
+
+Today, the compiler is far from complete. See [[here|Backend/TODO]] for a
+(incomplete) lists of things to do.
+
+Interface with the run-time
+---------------------------
+
+Even if the compiler makes a very liberal use of C++ (templates, variadic
+templates, macros), we really tried hard to make a very simple interface with
+the run-time. The interface is therefore a pure C99 interface and it is defined
+in `src/backend/program.h`.
+
+The goal is to hide the complexity of the inner data structures and to enable
+simple run-time implementation using straightforward C99.
+
+Note that the data structures are fully opaque: this allows us to use both the
+C++ simulator or the real Gen program in a relatively non-intrusive way.
+
+Various environment variables
+-----------------------------
+
+Environment variables are used all over the code. Most important ones are:
+
+- `OCL_STRICT_CONFORMANCE` `(0 or 1)`. Gen does not provide native high
+ precision math instructions compliant with OpenCL Spec. So we provide a
+ software version to meet the high precision requirement. Obviously the
+ software version's performance is not as good as native version supported by
+ GEN hardware. What's more, most graphics application don't need this high
+ precision, so we choose 0 as the default value. So OpenCL apps do not suffer
+ the performance penalty for using high precision math functions.
+
+- `OCL_SIMD_WIDTH` `(8 or 16)`. Select the number of lanes per hardware thread,
+ Normally, you don't need to set it, we will select suitable simd width for
+ a given kernel. Default value is 16.
+
+- `OCL_OUTPUT_GEN_IR` `(0 or 1)`. Output Gen IR (scalar intermediate
+ representation) code
+
+- `OCL_OUTPUT_LLVM` `(0 or 1)`. Output LLVM code after the lowering passes
+
+- `OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS` `(0 or 1)`. Output LLVM code before the
+ lowering passes
+
+- `OCL_OUTPUT_ASM` `(0 or 1)`. Output Gen ISA
+
+- `OCL_OUTPUT_REG_ALLOC` `(0 or 1)`. Output Gen register allocations, including
+ virtual register to physical register mapping, live ranges.
+
+- `OCL_OUTPUT_BUILD_LOG` `(0 or 1)`. Output error messages if there is any
+ during CL kernel compiling and linking.
+
+- `OCL_OUTPUT_CFG` `(0 or 1)`. Output control flow graph in .dot file.
+
+- `OCL_OUTPUT_CFG_ONLY` `(0 or 1)`. Output control flow graph in .dot file,
+ but without instructions in each BasicBlock.
+
+- `OCL_PRE_ALLOC_INSN_SCHEDULE` `(0 or 1)`. The instruction scheduler in
+ beignet are currently splitted into two passes: before and after register
+ allocation. The pre-alloc scheduler tend to decrease register pressure.
+ This variable is used to disable/enable pre-alloc scheduler. This pass is
+ disabled now for some bugs.
+
+- `OCL_POST_ALLOC_INSN_SCHEDULE` `(0 or 1)`. Disable/enable post-alloc
+ instruction scheduler. The post-alloc scheduler tend to reduce instruction
+ latency. By default, this is enabled now.
+
+- `OCL_SIMD16_SPILL_THRESHOLD` `(0 to 256)`. Tune how much registers can be
+ spilled under SIMD16. Default value is 16. We find spill too much register
+ under SIMD16 is not as good as fall back to SIMD8 mode. So we set the
+ variable to control spilled register number under SIMD16.
+
+- `OCL_USE_PCH` `(0 or 1)`. The default value is 1. If it is enabled, we use
+ a pre compiled header file which include all basic ocl headers. This would
+ reduce the compile time.
+
+Implementation details
+----------------------
+
+Several key decisions may use the hardware in an usual way. See the following
+documents for the technical details about the compiler implementation:
+
+- [[Mixed buffer pointer)|mixed_buffer_pointer]]
+- [[Unstructured branches|unstructured_branches]]
+- [[Scalar intermediate representation|gen_ir]]
+- [[Clean backend implementation|compiler_backend]]
+
+Ben Segovia.
diff --git a/docs/Beignet/Backend/TODO.mdwn b/docs/Beignet/Backend/TODO.mdwn
new file mode 100644
index 0000000..501c508
--- /dev/null
+++ b/docs/Beignet/Backend/TODO.mdwn
@@ -0,0 +1,110 @@
+TODO
+====
+
+The compiler is quite complete now in terms of functionality. It could pass
+almos all of the piglit OCL test cases and the pass rate for the OpenCV test
+suite is also quite good now. But there are plenty of things to do for the
+final performance tuning.
+
+OpenCL standard library
+-----------------------
+
+Today we define the OpenCL API in header file `src/ocl_stdlib.h`.
+
+By the way, one question remains: do we want to implement
+the high-precision functions as _inline_ functions or as external functions to
+call? Indeed, inlining all functions may lead to severe code bloats while
+calling functions will require to implement a proper ABI. We certainly want to
+do both actually.
+
+LLVM front-end
+--------------
+
+The code is defined in `src/llvm`. We used the SPIR and the OpenCL profile
+to compile the code. Therefore, a good part of the job is already done. However,
+many things must be implemented:
+
+- Better resolving of the PHI functions. Today, we always generate MOV
+ instructions at the end of each basic block . They can be easily optimized.
+
+- From LLVM 3.3, we use SPIR IR. We need to use the compiler defined type to
+ represent sampler\_t/image2d\_t/image1d\_t/....
+
+- Considering to use libclc in our project and avoid to use the PCH which is not
+ compatible for different clang versions. And may contribute what we have done in
+ the ocl\_stdlib.h to libclc if possible.
+
+- Optimize math functions. If the native math instructions don't compy with the
+ OCL spec, we use pure software style to implement those math instructions which
+ is extremely slow, for example. The cos and sin for HD4000 platform are very slow.
+ For some applications which may not need such a high accurate results. We may
+ provide a mechanism to use native\_xxx functions instead of the extremely slow
+ version.
+
+Gen IR
+------
+
+The code is defined in `src/ir`. Main things to do are:
+
+- Convert unstructured BBs to structured format, and leverage Gen's structured
+ instruction such as if/else/endif to encoding those BBs. Then we can save many
+ instructions which are used to maintain software pcips and predications.
+
+- Implement those llvm.memset/llvm.memcpy more efficiently. Currently, we lower
+ them as normal memcpy at llvm module level and not considering the intrinsics
+ all have a constant data length.
+
+- Finishing the handling of function arguments (see the [[IR
+ description|gen_ir]] for more details)
+
+- Merging of independent uniform loads (and samples). This is a major
+ performance improvement once the uniform analysis is done. Basically, several
+ uniform loads may be collapsed into one load if no writes happens in-between.
+ This will obviously impact both instruction selection and the register
+ allocation.
+
+- Implement fast path for small local variables. When the kernel only defines
+ a small local array/variable, there will be a good chance to allocate the local
+ array/variable in register space rather than system memory. This will reduce a
+ lot of memory load/stroe from the system memory.
+
+Backend
+-------
+
+The code is defined in `src/backend`. Main things to do are:
+
+- Optimize register spilling (see the [[compiler backend description|compiler_backend]] for more details)
+
+- Implementing proper instruction selection. A "simple" tree matching algorithm
+ should provide good results for Gen
+
+- Improving the instruction scheduling pass. Need to implement proper pre register
+ allocation scheduling to lower register pressure.
+
+- Reduce the macro instructions in gen\_context. The macro instructions added in
+ gen\_context will not get a chance to do post register allocation scheduling.
+
+- leverage the structured if/endif for branching processing.
+
+- Peephole optimization. There are many chances to do further peephole optimization.
+
+General plumbing
+----------------
+
+I tried to keep the code clean, well, as far as C++ can be really clean. There
+are some header cleaning steps required though, in particular in the backend
+code.
+
+The context used in the IR code generation (see `src/ir/context.*pp`) should be
+split up and cleaned up too.
+
+I also purely and simply copied and pasted the Gen ISA disassembler from Mesa.
+This leads to code duplication. Also some messages used by OpenCL (untyped reads
+and writes) are not properly decoded yet.
+
+All of those code should be improved and cleaned up are tracked with "XXX"
+comments in the code.
+
+Parts of the code leaks memory when exceptions are used. There are some pointers
+to track and replace with std::unique\_ptr. Note that we also add a custom memory
+debugger that nicely complements (i.e. it is fast) Valgrind.
diff --git a/docs/Beignet/Backend/compiler_backend.mdwn b/docs/Beignet/Backend/compiler_backend.mdwn
new file mode 100644
index 0000000..3c489b2
--- /dev/null
+++ b/docs/Beignet/Backend/compiler_backend.mdwn
@@ -0,0 +1,118 @@
+Compiler Back End
+=================
+
+Well, the complete code base is somehow a compiler backend for LLVM. Here, we
+really speak about the final code generation passes that you may find in
+`src/backend`.
+
+As explained in [[the scalar IR presentation|gen_ir]], we bet on a very
+simple scalar IR to make it easy to parse and modify. The idea is to fix the
+unrelated problem (very Gen specific) where we can i.e. when the code is
+generated.
+
+The code generation in the compiler backend is classically divided into four
+steps
+
+- Instruction selection (defined in `src/backend/gen_insn_selection.*pp`). We
+ expose an interface for the instruction selection engine. We implemented a
+ very simple selection (called `SimpleSelection`) that does a quick and dirty
+ one-to-many instruction generation.
+
+- Register allocation (defined in `src/backend/gen_reg_allocation.*pp`). The
+ code implements a linear scan allocator on the code selected in the previous
+ pass. See below for more details about register vector allocations.
+
+- Instruction scheduling. This one is not done yet. We just output the same
+ instruction order as the program order. Note that we plan to implement an
+ adaptive scheduling between register allocation and instruction selection (to
+ avoid spilling as much as possible)
+
+- Instruction encoding. This is the final step that encodes the program into Gen
+ ISA.
+
+Instruction selection
+---------------------
+
+Usually, the instruction selection consists in mapping `p` instructions to `q`
+ISA instructions under a cost driven model. Each basic block is therefore _tiled_
+into some numbers of groups of ISA instructions such that the final cost is
+minimized.
+
+The literature is particularly dense on the subject. Compilers usually use today
+either tree matching methods or selection DAG techniques (as LLVM backends do)
+
+The instruction selection is still a work in progress in our compiler and we
+only implement the most stupid (and inefficient) technique: we simply generate
+as many instructions as we need for each _individual_ IR instructions. Since we
+do not support immediate sources, this in particular leads to really ugly
+looking code such as `mov (16) r2:f 1.f`. It is still a work in progress.
+
+Other than that, the instruction selection is really a book keeping structure.
+We basically output `SelectionInstruction` objects which are the 1-to-1 mapping
+of Gen ISA encoding functions defined in `src/backend/gen_encoder.*pp`.
+
+However, the `SelectionInstruction` still use unallocated virtual registers and
+do *not* use vectors but simply tuples of virtual registers.
+
+Register allocation
+-------------------
+
+The register allocation actually consists in two steps:
+
+1. Handling the vector for all the instructions that require them
+
+2. Performing the register allocation itself
+
+Step 1 consists in scanning all the vectors required by sends. Obviously, the
+same register may be used in different vectors and that may lead to
+interferences. We simply sort the vectors from the largest to the smallest and
+allocate them in that order. As an optimization we also identify sub-vectors
+i.e. vectors included in larger ones and no not allocate them.
+
+The code may be largely improved in particular if we take into account liveness
+interferences as well. Basically, a register may be part of several vectors if the
+registers that are not in both vectors at the same location are not alive at the
+same time.
+
+This is still a work in progress. Code is right now handled by method
+`GenRegAllocator::allocateVector`.
+
+Step 2 performs the register allocation i.e. it associates each virtual register
+to one (or several) physical registers. The first thing is that the Gen register
+file is very flexible i.e. it can (almost) be freely partitioned. To handle this
+peculiarity, we simply implemented a free list based generic memory allocator as
+done with `RegisterFilePartitioner` in `src/backend/context.cpp`.
+
+We provide two directions of memory allocation. From tail to head direction is
+used for normal register, and from head to tail is for the curbe payload register
+allocation.
+
+We then simply implemented a linear scan allocator (see
+`gen_reg_allocation.cpp`). The spilling is implemented in the same file. The
+heuristics we used is the register's end point. It always try to spill the
+register with largest liveness end point if possible. Although Gen support to
+spill 4 SIMD8 register at once, we only support one currently. Need to optimize
+it latter, at least for the vectors' spilling. Maybe a new pass in the backend
+to find opportunity to gatter more spilled register into one contiguous area
+is also worth to do. We also can consider the spill register's interval to
+do smarter scratch memory allocation to reduce scratch memory requirement.
+
+Instruction scheduling
+----------------------
+
+Intra-basic block instruction scheduling is relatively simple. It is implemented
+but has known bug, we need further effort to fix it.
+
+Instruction encoding
+--------------------
+
+This is mostly done in `src/backend/gen_context.cpp` and
+`src/backend/gen_encoder./*pp`. This is mostly glue code and it is pretty
+straightforward. We just forward the selection code using the physically
+allocated registers. There is nothing special here. Just boilerplate.
+
+There are plenty of huge macro instructions in the `gen_context.cpp` currently.
+Most of them are for the long/double support on a Gen platform which doesn't support
+long/double in the hardware level. We may need to clean up and move those non-hardware
+related functions into upper layer. Too many huge instruction which will totally
+make the register spilling and dead code elimination harder and inefficient.
diff --git a/docs/Beignet/Backend/gen_ir.mdwn b/docs/Beignet/Backend/gen_ir.mdwn
new file mode 100644
index 0000000..635cbb4
--- /dev/null
+++ b/docs/Beignet/Backend/gen_ir.mdwn
@@ -0,0 +1,254 @@
+Scalar Intermediate Representation
+==================================
+
+The IR code is included in `src/ir/` of the compiler code base
+The IR as designed in this compiler is the fruit of a long reflection I mostly
+have with Thomas Raoux. Note I usually call it "Gen IR".
+
+Scalar vs vector IR
+-------------------
+
+This is actually the major question: do we need a vector IR or a scalar IR? On
+the LLVM side, we have both. LLVM IR can manipulate vectors and scalars (and
+even generalized values but we can ignore it for now).
+
+For that reason, the Clang front-end generates both scalar and vector code.
+Typically, a `uint4` variable will output a vector of 4 integers. Arithmetic
+computations will be directly done on vector variables.
+
+One the HW side, the situation is completely different:
+
+- We are going to use the parallel mode (align1) i.e. the struct-of-array mode
+ for the EU. This is a SIMD scalar mode.
+
+- The only source of vectors we are going to have is on the sends instructions
+ (and marginally for some other instructions like the div\_rem math instruction)
+
+One may therefore argue that we need vector instructions to handle the sends.
+Send will indeed require both vector destinations and sources. This may be a
+strong argument *for* vectors in the IR. However, the situation is not that
+good.
+
+Indeed, if we look carefully at the send instructions we see that they will
+require vectors that are *not* vectors in LLVM IR. This code for example:
+
+<code>
+\_\_global uint4 \*src;<br/>
+uint4 x = src[get\_global\_id(0)];<br/>
+</code>
+
+will be translated into an untyped write in the Gen ISA. Unfortunately, the
+address and the values to write are in the *same* vector. However, LLVM IR will
+output a store like:
+
+`store(%addr, %value)`
+
+which basically uses one scalar (the address) and one value (the vector to
+write). Therefore even if we handle vectors in the IR, that will not directly
+solve the problem we have at the end for the send instructions.
+
+We therefore decided to go the other direction:
+
+- We have a purely scalar IR
+
+- To replace vectors, we simply use multiple sources and destinations
+
+- Real vectors required by send instructions are handled at the very bottom of
+the stack in the register allocation passes.
+
+This leads to a very simple intermediate representation which is mostly a pure
+scalar RISC machine.
+
+Very limited IR
+---------------
+
+The other major question, in particular when you look similar stacks like NVidia
+SPIR, is:
+
+do we need to encode in the IR register modifiers (abs, negate...) and immediate
+registers (like in add.f x y 1.0)?
+
+Contrary to other IRs (SPIR and even LLVM that both supports immediates), we also
+chose to have a very simply IR, much simpler than the final ISA, and to merge
+back what we need at the instruction selection pass. Since we need instruction
+selection, let us keep the IR simple.
+
+Also, there are a lot of major issues that can not be covered in the IR and
+require to be specifically handled at the very end of the code:
+
+- send vectors (see previous section)
+
+- send headers (value and register allocation) which are also part of the vector
+problem
+
+- SIMD8 mode in SIMD16 code. Some send messages do not support SIMD16 encoding
+and require SIMD8. Typically examples are typed writes i.e. scatters to textures.
+Also, this cannot be encoded in some way in a regular scalar IR.
+
+For these reasons, most of the problems directly related to Gen naturally find
+their solutions in either the instruction selection or the register allocator.
+
+This leads to the following strategy:
+
+- Keep the IR very simple and limited
+
+- Use all the analysis tools you need in the IR before the final code generation
+to build any information you need. This is pure "book-keeping".
+
+- Use any previous analysis and finish the job at the very end
+
+This classical approach leads to limit the complexity in the IR while forcing us
+to write the proper tools in the final stages.
+
+Why not using LLVM IR directly?
+-------------------------------
+
+We hesitated a long time between writing a dedicated IR (as we did) and just
+using LLVM IR. Indeed, LLVM comes with a large set of tools that are parts of
+"LLVM backends". LLVM provides a lot of tools to perform the instruction
+selection (`SelectionDAG`) and the register allocation. Two things however
+prevent us from choosing this path:
+
+- We only have a limited experience with LLVM and no experience at all with the
+LLVM backends
+
+- LLVM register allocators do not handle at all the peculiarities of Gen:
+
+ * flexible register file. Gen registers are more like memory than registers
+ and can be freely allocated and aliased. LLVM register allocators only
+ support partial aliasing like x86 machines do (rax -> eax -> ax)
+
+ * no proper tools to handle vectors in the register allocator as we need for
+ sends
+
+Since we will need to do some significant work anyway, this leads us to choose a
+more hard-coded path with a in-house IR. Note that will not prevent us from
+implementing later a LLVM backend "by the book" as Nvidia does today with SPIR
+(using a LLVM backend to do the LLVM IR -> SPIR conversion)
+
+
+SSA or no SSA
+-------------
+
+Since we have a purely scalar IR, implementing a SSA transformation on the IR
+may be convenient. However, most the literature about compiler back-ends use
+non-SSA representation of the code. Since the primary goal is to write a
+compiler _back-end_ (instruction selection, register allocation and instruction
+scheduling), we keep the code in non-SSA letting the higher level optimizations
+to LLVM.
+
+Types, registers, instructions, functions and units
+---------------------------------------------------
+
+The IR is organized as follows:
+
+- Types (defined in `src/ir/type.*pp`). These are scalar types only. Since the
+ code is completely lowered down, there is no more reference to structures,
+ pointers or vectors. Everything is scalar values and when "vectors" or
+ "structures" would be needed, we use instead multiple scalar sources or
+ destinations.
+
+- Registers (defined in `src/ir/register.*pp`). They are untyped (since Gen IR
+ are untyped) and we have 65,535 of them per function
+
+- Instructions (defined in `src/ir/instruction.*pp`). They are typed (to
+ distinguish integer and FP adds for example) and possibly support multiple
+ destinations and sources. We also provide a convenient framework to introspect
+ the instruction in a simple (and memory efficient) way
+
+- Functions (defined in `src/ir/function.*pp`). They are basically the counter
+ part of LLVM functions or OpenCL kernels. Note that function arguments are a
+ problem. We actually use the SPIR ABI. Everything smaller than the machine word
+ size (i.e. 32 bits for Gen) is passed by value with a register. Everything
+ else which is bigger than is passed by pointer with a ByVal attribute.
+ Note that requires some special treatment in the IR (see below) to make the
+ code faster by replacing function argument loads by "pushed constants". We
+ also defined one "register file" per function i.e. the registers are defined
+ relatively to the function that uses them. Each function is made of basic
+ blocks i.e. sequence of instructions that are executed linearly.
+
+- Units (defined in `src/ir/unit.*pp`). Units are just a collection of
+ functions and constants (not supported yet).
+
+Function arguments and pushed constants
+---------------------------------------
+
+Gen can push values into the register file i.e. some registers are preset when
+the kernel starts to run. As detailed previously, the SPIR ABI is convenient
+since every argument is either one register or one pointer to load from or to
+store to.
+
+However, when a pointer is used for an argument, loads are issued which may be
+avoided by using constant pushes.
+
+Once again OCL makes the task a bit harder than expected. Indeed, the C
+semantic once again applies to function arguments as well.
+
+Look at these three examples:
+
+### Case 1. Direct loads -> constant push can be used
+
+<code>
+struct foo { int x; int y; }; </br>
+\_\_kernel void case1(\_\_global int \*dst, struct foo bar) </br>
+{<br/>
+ dst[get\_global\_id(0)] = bar.x + bar.y;<br/>
+}
+</code>
+
+We use a _direct_ _load_ for `bar` with `bar.x` and `bar.y`. Values can be
+pushed into registers and we can replace the loads by register reads.
+
+### Case 2. Indirect loads -> we need to load the values from memory
+
+<code>
+struct foo { int x[16]; }; </br>
+\_\_kernel void case1(\_\_global int \*dst, struct foo bar) </br>
+{<br/>
+ dst[get\_global\_id(0)] = bar.x[get\_local\_id(0)];<br/>
+}
+</code>
+
+We use an indirect load with `bar.x[get\_local\_id(0)]`. Here we need to issue a
+load from memory (well, actually, we could do a gather from registers, but it is
+not supported yet).
+
+### Case 3. Writes to arguments -> we need to spill the values to memory first
+
+<code>
+struct foo { int x[16]; }; </br>
+\_\_kernel void case1(\_\_global int \*dst, struct foo bar) </br>
+{<br/>
+bar.x[0] = get\_global\_id(1);<br/>
+ dst[get\_global\_id(0)] = bar.x[get\_local\_id(0)];<br/>
+}
+</code>
+
+Here the values are written before being read. This causes some troubles since
+we are running in SIMD mode. Indeed, we only have in memory *one* instance of
+the function arguments. Here, *many* SIMD lanes and actually *many* hardware
+threads are running at the same time. This means that we can not write the data
+to memory. We need to allocate a private area for each SIMD lane.
+
+In that case, we need to spill back the function arguments into memory. We spill
+once per SIMD lane. Then, we read from this private area rather than the
+function arguments directly.
+
+This analysis is partially done today in `src/ir/lowering.*pp`. We identify all
+the cases but only the case with constant pushing is fully implemented.
+Actually, the two last cases are easy to implement but this requires one or two
+days of work.
+
+Value and liveness analysis tools
+---------------------------------
+
+You may also notice that we provide a complete framework for value analysis
+(i.e. to figure when a value or instruction destination is used and where the
+instruction sources come from). The code is in `src/ir/value.*pp`. Well, today,
+this code will burn a crazy amount of memory (use of std::set all over the
+place) but it at least provides the analysis required by many other passes.
+Compacting the data structures and using O(n) algorithms instead of the O(ln(n))
+are in the TODO list for sure :-)
+
+Finally, we also provide a liveness analysis tool which simply figures out which
+registers are alive at the end of each block (classically "live out" sets).
diff --git a/docs/Beignet/Backend/mixed_buffer_pointer.mdwn b/docs/Beignet/Backend/mixed_buffer_pointer.mdwn
new file mode 100644
index 0000000..f43ab7e
--- /dev/null
+++ b/docs/Beignet/Backend/mixed_buffer_pointer.mdwn
@@ -0,0 +1,46 @@
+Mixed Buffer Pointer
+--------------------
+
+Segmented address space...
+--------------------------
+
+The first challenge with OpenCL is its very liberal use of pointers. The memory
+is segment into several address spaces:
+
+- private. This is the memory for each work item
+
+- global. These are buffers in memory shared by all work items and work groups
+
+- constant. These are constant buffers in memory shared by all work items and
+work groups as well
+
+- local. These is a memory shared by all work items in the *same* work group
+
+... But with no restriction inside each address space
+-----------------------------------------------------
+
+The challenge is that there is no restriction in OpenCL inside each address
+space i.e. the full C semantic applies in particular regarding pointer
+arithmetic.
+
+Therefore the following code is valid:
+
+<code>
+\_\_kernel void example(\_\_global int *dst, \_\_global int *src0, \_\_global int *src1)<br/>
+{<br/>
+ \_\_global int *from;<br/>
+ if (get\_global\_id(0) % 2)<br/>
+ from = src0;<br/>
+ else<br/>
+ from = src1;<br/>
+ dst[get\_global\_id(0)] = from[get\_global\_id(0)];<br/>
+}
+</code>
+
+As one may see, the load done in the last line actually mixes pointers from both
+source src0 and src1. This typically makes the use of binding table indices
+pretty hard. In we use binding table 0 for dst, 1 for src0 and 2 for src1 (for
+example), we are not able to express the load in the last line with one send
+only. The pointer "from" in the last line is so called a mixed buffer pointer.
+
+(To be updated)
diff --git a/docs/Beignet/Backend/unstructured_branches.mdwn b/docs/Beignet/Backend/unstructured_branches.mdwn
new file mode 100644
index 0000000..37a294c
--- /dev/null
+++ b/docs/Beignet/Backend/unstructured_branches.mdwn
@@ -0,0 +1,271 @@
+Unstructured Branches
+=====================
+
+A major challenge in making a OpenCL compiler is certainly to handle any kind of
+branches. Indeed LLVM does not make any distinction between structured branches.
+See [here](http://llvm.org/docs/LangRef.html) for a complete description of
+the LLVM assembly specification.
+
+The C branching code is simply lowered down in the following instructions:
+
+- `ret` to return from the current function
+- `br` that, if predicated, possibly jumps to two destinations (one for the
+ taken branch and one for the other).
+- `switch` that implements the C switch/case construct.
+- `indirectbr` that implements a jump table
+- `invoke` and `resume` mostly used to handle exceptions
+
+Exceptions and jump tables are not supported in OpenCL. Switch cases can be
+lowered down to a sequence of if/else statements (using a divide and conquer
+approach a switch/case can be dispatched in log(n) complexity where n is the
+number of targets).
+
+This leads us to properly implement `br` and `ret` instructions.
+
+Solution 1 - Using Gen structured branches
+------------------------------------------
+
+Gen structured branches are the following instructions:
+
+`if` `else` `endif` `break` `continue` `while` `brd` `brc`
+
+Transforming the LLVM IR code into structured code results in basically
+reverse-engineering the LLVM code into the original C code.
+Unfortunately, there are several key problems:
+
+- OpenCL supports `goto` keyword that may jump to an arbitrary location
+- LLVM can transform the control flow graph in any kind of form
+- Worse is that a reducible control flow graph can be turned into an irreducible
+one by the optimizer.
+
+This can lead to complicated code transform and basic block duplication. The
+specification allows the compiler to abort if an irreducible control flow is
+detected but as an implementor, this is quite awkward to abort the compilation
+because the optimizer turns an reducible CFG to an irreducible one. Using
+structured branches is the open door to many corner cases.
+
+Thing is it exists a pretty elegant solution that can be almost seamlessly
+supported by Gen. This is the solution we retained.
+
+Solution 2 - Linearizing the control flow graph
+-----------------------------------------------
+
+The general problem is to map a general control flow graph to a SIMD machine.
+The problem is fairly well understood today. A recent research paper actually
+dedicated to OpenCL like languages which use the "SPMD" (single program multiple
+data) programming model present interesting insights about how to map SIMD
+architectures to such languages (see [here]
+(http://www.cdl.uni-saarland.de/papers/karrenberg_opencl.pdf)).
+
+### Core idea
+
+- Linearizing the CFG initially consists in removing all forward branches and
+"replace" them by predication. Indeed, the program will be still correct if you
+predicate instructions based instead of forward jumps. This is basically the
+a control flow to data flow conversion.
+
+- Of course, removing all forward branches is inefficient. To improve that, we
+simply introduce "if conditions" in the head of basic blocks to know if we run
+the basic block. If no lanes is going to be activated in the basic block, we
+jump to another basic block where _potentially_ some lanes are going to be
+reactivated.
+
+Consider the following CFG:
+
+<pre>
+o-------o
+| |
+| 1 |---->-----o
+| | |
+o-------o |
+ | |
+ | |
+o-------o |
+| | |
+| 2 |---->-----------o
+| | | |
+o-------o | |
+ | | |
+ | | |
+ | o------o | |
+ | | | | |
+ | v | | |
+o-------o | | |
+| | | | |
+| 3 | | | |
+| | | | |
+o-------o | | |
+ | | | | |
+ | o------o | |
+ | | |
+o-------o | |
+| | | |
+| 4 |<---------o |
+| | |
+o-------o |
+ | |
+ | |
+o-------o |
+| | |
+| 5 |<----------------o
+| |
+o-------o
+</pre>
+
+Mapping it to a SIMD machine may seem challenging. Actually it is not too
+complicated. The problem is with the 2->5 jump. Indeed, we have to be sure that
+we are not missing any computation done in block 4.
+
+To do so:
+- Instead of jumping from block 2 to block 5, we jump from block 2 to block 4.
+- We implement a `JOIN` point on top of block 4. We check if any lane is going
+to be reactivated for the block 4. If not, we jump to block 5.
+
+This leads to the following linearized CFG:
+<pre>
+o-------o
+| |
+| 1 |---->-----o
+| | |
+o-------o |
+ | |
+ | |
+o-------o |
+| | |
+| 2 |---->-----------o
+| | | |
+o-------o | |
+ | | |
+ | | |
+ | o--<---o | |
+ | | | | |
+ | v | | |
+o-------o | | |
+| | | | |
+| 3 | ^ | |
+| | | | |
+o-------o | | |
+ | | | | |
+ | o-->---o | |
+ | | |
+o-------o | |
+| |==========|=====|====O
+| 4 |<---------|-----o |
+| |<---------o |
+o-------o |
+ | |
+ | |
+o-------o |
+| | |
+| 5 |<====================O
+| |
+o-------o
+</pre>
+
+There is a new jump from block 4 to block 5.
+
+### Implementation on Gen
+
+When using structured branches, Gen can supports auto-masking i.e. based on the
+branches which are taken, the control flow is properly handled and masks are
+automatically applied on all instructions.
+
+However, there is no similar support for unstructured branches. We therefore
+decided to mask instructions manually and use single program flow. This is
+actually quite easy to do since Gen is able to predicate any branches.
+
+Now, how to evaluate the if conditions in an efficient way?
+
+The choice we did is to use *per-lane block IPs*: for each SIMD lane, we store a
+short (16 bits) for each lane in a regular 256 bits GPR (general purpose
+register). This "blockIP" register is used in the following way:
+
+At the beginning of each block, we compare the blockIP register with the ID of
+the block. The lane is going to be _activated_ if its blockIP is _smaller_ than
+the ID of the block. Otherwise, the lane is deactivated.
+
+Therefore, we build a flag register at the entry of each basic block with a
+single 16-wide uint16_t compare. If no lane is activated, a jump is performed to
+the next block where some lanes is going to be activated.
+
+Since this is regular jumps, we just use `jmpi` instruction. With the help of
+predication, we can express all the different possibilities:
+
+- backward branches are always taken if _any_ of lanes in the predicate is true.
+We just use `<+f0.0.anyh>` predication.
+- forward branches is *not* taken if some of the lanes are going to activated in
+the next block. We therefore compare the blockIP with the ID of the _next_
+block. If all of them are strictly greater than the ID of the next block, we
+jump. We therefore use the `<+f0.0.allh>` predicate in that case.
+- `JOIN` points are even simpler. We simply jump if none of the lane is activated.
+We therefore use the `<-f0.0.anyh>` predicate.
+
+The complete encoding is done in `src/backend/gen_insn_selection.cpp`. Forward
+branches are handled by `SimpleSelection::emitForwardBranch`. Backward branches
+are handled by `SimpleSelection::emitBackwardBranch`. Finally, since `JOIN` points
+are at the top of each basic blocks, they are handled by
+`SimpleSelection::emitLabelInstruction`.
+
+### Computing `JOIN` points
+
+The last problem is to compute `JOIN` point i.e. we need to know if we need to
+jump at the beginning of each block and if we do, what is the target of the
+branch. The code is relatively straightforward and can be found in
+`src/backend/context.cpp`. Function is `Context::buildJIPs`.
+</br>
+Actually, the current implementation is not that elegant. A colleague, Thomas
+Raoux, has a simpler and better idea to handle it.
+
+### Advantages and drawbacks of the method
+
+- The method has one decisive advantage: it is simple and extremely robust. It can
+handle any kind of CFGs (reducible or not) and does not require any
+transformation. The use of shorts is also not random. 16-wide compares is issued
+in 2 cycles (so it is twice fast as 16-wide 32 bits compares).
+- Main drawback will be performance. Even if this is not so bad, we still need
+more instructions than if we used structured branches. Mostly
+ * one or two instructions for `JOIN` points
+ * three instructions for backward and forward jumps (two more than structured
+ branches that just require the branch instruction itself)
+
+Note that all extra instructions are 16 bits instructions (i.e. they use shorts)
+so they will only cost 2 cycles anyway.
+
+The last point is that Gen encoding restricts conditional modifiers and
+predicates to be the same in the instruction. This requires to copy or recompute
+the flag register for compares and select. So one more instruction is required
+for these two instructions. Once again, this would require only 2 cycles.
+
+Remarks on `ret` instructions
+-----------------------------
+
+Since we can handle any kind of CFG, handling the return statements are
+relatively straightforward. We first create one return block at the end of the
+program. Then we replace all other returns by a unconditional jump to this
+block. The CFG linearization will take care of the rest.
+We then simply encode the (only one) return instruction as a End-Of-Thread
+message (EOT).
+Code examples
+-------------
+
+Some tests were written to assert the correctness of the CFG linearization and the
+code generation. They can be found in the _run-time_ code base here:
+
+`utest/compiler_if_else.cpp`
+
+`utest/compiler_lower_return0.cpp`
+
+`utest/compiler_lower_return1.cpp`
+
+`utest/compiler_lower_return2.cpp`
+
+`utest/compiler_short_scatter.cpp`
+
+`utest/compiler_unstructured_branch0.cpp`
+
+`utest/compiler_unstructured_branch1.cpp`
+
+`utest/compiler_unstructured_branch2.cpp`
+
+`utest/compiler_unstructured_branch3.cpp`
+
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
new file mode 100644
index 0000000..1adb48a
--- /dev/null
+++ b/docs/NEWS.mdwn
@@ -0,0 +1,16 @@
+# News
+
+## Sep 15, 2014
+[Beignet 0.9.3](https://01.org/zh/beignet/downloads/beignet-0.9.3-2014-09-15) is released. This is a bug-fix release.
+
+## July 17, 2014
+[Beignet 0.9.2](https://01.org/zh/beignet/downloads/beignet-0.9.2-2014-07-17) is released. This is a bug-fix release.
+
+## July 4, 2014
+[Beignet 0.9.1](https://01.org/zh/beignet/downloads/beignet-0.9.1-2014-07-04) is released. This is a bug-fix release.
+
+## June 26, 2014
+[Beignet 0.9.0](https://01.org/zh/beignet/downloads/beignet-0.9-2014-06-26) is released. This is a major release. Please see the release notes for more information.
+
+## Feb 12, 2014
+[Beignet 0.8.0](https://01.org/zh/beignet/downloads/2014/beignet-0.8.0-2014-02-12) is released. This is a major release. Please see the release notes for more information.
diff --git a/docs/howto/cross-compiler-howto.mdwn b/docs/howto/cross-compiler-howto.mdwn
new file mode 100644
index 0000000..535cd9a
--- /dev/null
+++ b/docs/howto/cross-compiler-howto.mdwn
@@ -0,0 +1,60 @@
+Cross Compiler HowTo
+====================
+
+Beignet supports both PC devices with full profile and embedded/handheld
+devices with embeded profile. This document describes how to build Beignet
+and OpenCL kernels for a target machine (embedded/handheld devices) in a
+host machine with the help of cross compiler, and also the large-size-reduced
+Beignet driver package for the target machine.
+
+Build Beignet with a cross compiler
+-----------------------------------
+
+Besides the general cross compile methods, reference the following options when
+configure Beignet with cmake.
+
+- LLVM_INSTALL_DIR
+ Beignet depends on llvm+clang, this option refers to the path of llvm-config,
+ llvm-as, llvm-link and clang in the cross compiler environment.
+
+- CMAKE_SKIP_RPATH
+ Some cross compiler systems forbid the usage of rpath in binaries/libraries,
+ set this option to be TRUE.
+
+- GEN_PCI_ID
+ It is the GPU pci_id of the target machine, for example, 0x0162 is the pciid
+ of Intel Ivybridge GPU, and 0x0f31 is Intel Baytrail GPU. The information can
+ be queried with command 'lspci -n'.
+
+- CMAKE_INSTALL_PREFIX
+ This option controls the prefix of installation path.
+
+Distribution of large-size-reduced Beignet driver package
+---------------------------------------------------------
+
+On embedded/handheld devices, storage and memory are scarce, it is necessary to
+provide only the OpenCL runtime library without OpenCL compiler, and only the
+executable binary kernel is supported on such devices.
+
+It means that just distribute libcl.so and libgbeinterp.so (~320k in total after strip)
+are enough for OpenCL embeded profile in the target machine.
+
+Build OpenCL kernels with OpenCL offline compiler
+-------------------------------------------------
+
+Since the target machine does not contain the OpenCL compiler, the OpenCL source
+kernel need to be compiled with an OpenCL offline compiler (gbe_bin_generater)
+into binary kernel in the host machine, and the OpenCL application can load the
+binary kernel with function clCreateProgramWithBinary.
+
+The OpenCL offline compiler gbe_bin_generater is the result of Beignet build and
+locates at .../your_path_to_build/backend/src/gbe_bin_generater, see below for the
+command options.
+
+gbe_bin_generater INFILE [-pbuild_parameter] -oOUTFILE -tGEN_PCI_ID
+
+For example, the following command builds OpenCL source kernel from file 'mykernel.cl'
+for Ivybridge with pci_id 0x0162, and write the result (executable binary kernel)
+into file 'mykernel.bin'.
+
+gbe_bin_generater mykernel.cl -omykernel.bin -t0x0162
diff --git a/docs/optimization-guide.mdwn b/docs/optimization-guide.mdwn
new file mode 100644
index 0000000..8fb29a6
--- /dev/null
+++ b/docs/optimization-guide.mdwn
@@ -0,0 +1,28 @@
+Optimization Guide
+====================
+
+All the SIMD optimization principle also apply to Beignet optimization.
+Furthermore, there are some special tips for Beignet optimization.
+
+1. It is recommended to choose multiple of 16 work group size. Too much SLM usage may reduce parallelism at group level.
+ If kernel uses large amount SLM, it's better to choose large work group size. Please refer the following table for recommendations
+ with some SLM usage.
+| Amount of SLM | 0 | 4K | 8K | 16K | 32K |
+| WorkGroup size| 16 | 64 | 128 | 256 | 512 |
+
+2. GEN7's read/write on global memory with DWORD and DWORD4 are significantly faster than read/write on BYTE/WORD.
+ Use DWORD or DWORD4 to access data in global memory if possible. If you cannot avoid the byte/word access, try to do it on SLM.
+
+3. Use float data type as much as possible.
+
+4. Avoid using long. GEN7's performance for long integer is poor.
+
+5. If there is a small constant buffer, define it in the kernel instead of using the constant buffer argument if possible.
+ The compiler may optimize it if the buffer is defined inside kernel.
+
+6. Avoid unnecessary synchronizations, both in the runtime and in the kernel. For examples, clFinish and clWaitForEvents in runtime
+ and barrier() in the kernel.
+
+7. Consider native version of math built-ins, such as native\_sin, native\_cos, if your kernel is not precision sensitive.
+
+8. Try to eliminate branching as much as possible. For example using min, max, clamp or select built-ins instead of if/else if possible.
diff --git a/include/CL/cl.h b/include/CL/cl.h
new file mode 100644
index 0000000..316565d
--- /dev/null
+++ b/include/CL/cl.h
@@ -0,0 +1,1214 @@
+/*******************************************************************************
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_H
+#define __OPENCL_CL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl_platform.h>
+#else
+#include <CL/cl_platform.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************/
+
+typedef struct _cl_platform_id * cl_platform_id;
+typedef struct _cl_device_id * cl_device_id;
+typedef struct _cl_context * cl_context;
+typedef struct _cl_command_queue * cl_command_queue;
+typedef struct _cl_mem * cl_mem;
+typedef struct _cl_program * cl_program;
+typedef struct _cl_kernel * cl_kernel;
+typedef struct _cl_event * cl_event;
+typedef struct _cl_sampler * cl_sampler;
+
+typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
+typedef cl_ulong cl_bitfield;
+typedef cl_bitfield cl_device_type;
+typedef cl_uint cl_platform_info;
+typedef cl_uint cl_device_info;
+typedef cl_bitfield cl_device_fp_config;
+typedef cl_uint cl_device_mem_cache_type;
+typedef cl_uint cl_device_local_mem_type;
+typedef cl_bitfield cl_device_exec_capabilities;
+typedef cl_bitfield cl_command_queue_properties;
+typedef intptr_t cl_device_partition_property;
+typedef cl_bitfield cl_device_affinity_domain;
+
+typedef intptr_t cl_context_properties;
+typedef cl_uint cl_context_info;
+typedef cl_uint cl_command_queue_info;
+typedef cl_uint cl_channel_order;
+typedef cl_uint cl_channel_type;
+typedef cl_bitfield cl_mem_flags;
+typedef cl_uint cl_mem_object_type;
+typedef cl_uint cl_mem_info;
+typedef cl_bitfield cl_mem_migration_flags;
+typedef cl_uint cl_image_info;
+typedef cl_uint cl_buffer_create_type;
+typedef cl_uint cl_addressing_mode;
+typedef cl_uint cl_filter_mode;
+typedef cl_uint cl_sampler_info;
+typedef cl_bitfield cl_map_flags;
+typedef cl_uint cl_program_info;
+typedef cl_uint cl_program_build_info;
+typedef cl_uint cl_program_binary_type;
+typedef cl_int cl_build_status;
+typedef cl_uint cl_kernel_info;
+typedef cl_uint cl_kernel_arg_info;
+typedef cl_uint cl_kernel_arg_address_qualifier;
+typedef cl_uint cl_kernel_arg_access_qualifier;
+typedef cl_bitfield cl_kernel_arg_type_qualifier;
+typedef cl_uint cl_kernel_work_group_info;
+typedef cl_uint cl_event_info;
+typedef cl_uint cl_command_type;
+typedef cl_uint cl_profiling_info;
+
+
+typedef struct _cl_image_format {
+ cl_channel_order image_channel_order;
+ cl_channel_type image_channel_data_type;
+} cl_image_format;
+
+typedef struct _cl_image_desc {
+ cl_mem_object_type image_type;
+ size_t image_width;
+ size_t image_height;
+ size_t image_depth;
+ size_t image_array_size;
+ size_t image_row_pitch;
+ size_t image_slice_pitch;
+ cl_uint num_mip_levels;
+ cl_uint num_samples;
+ cl_mem buffer;
+} cl_image_desc;
+
+typedef struct _cl_buffer_region {
+ size_t origin;
+ size_t size;
+} cl_buffer_region;
+
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_SUCCESS 0
+#define CL_DEVICE_NOT_FOUND -1
+#define CL_DEVICE_NOT_AVAILABLE -2
+#define CL_COMPILER_NOT_AVAILABLE -3
+#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
+#define CL_OUT_OF_RESOURCES -5
+#define CL_OUT_OF_HOST_MEMORY -6
+#define CL_PROFILING_INFO_NOT_AVAILABLE -7
+#define CL_MEM_COPY_OVERLAP -8
+#define CL_IMAGE_FORMAT_MISMATCH -9
+#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
+#define CL_BUILD_PROGRAM_FAILURE -11
+#define CL_MAP_FAILURE -12
+#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13
+#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
+#define CL_COMPILE_PROGRAM_FAILURE -15
+#define CL_LINKER_NOT_AVAILABLE -16
+#define CL_LINK_PROGRAM_FAILURE -17
+#define CL_DEVICE_PARTITION_FAILED -18
+#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19
+
+#define CL_INVALID_VALUE -30
+#define CL_INVALID_DEVICE_TYPE -31
+#define CL_INVALID_PLATFORM -32
+#define CL_INVALID_DEVICE -33
+#define CL_INVALID_CONTEXT -34
+#define CL_INVALID_QUEUE_PROPERTIES -35
+#define CL_INVALID_COMMAND_QUEUE -36
+#define CL_INVALID_HOST_PTR -37
+#define CL_INVALID_MEM_OBJECT -38
+#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
+#define CL_INVALID_IMAGE_SIZE -40
+#define CL_INVALID_SAMPLER -41
+#define CL_INVALID_BINARY -42
+#define CL_INVALID_BUILD_OPTIONS -43
+#define CL_INVALID_PROGRAM -44
+#define CL_INVALID_PROGRAM_EXECUTABLE -45
+#define CL_INVALID_KERNEL_NAME -46
+#define CL_INVALID_KERNEL_DEFINITION -47
+#define CL_INVALID_KERNEL -48
+#define CL_INVALID_ARG_INDEX -49
+#define CL_INVALID_ARG_VALUE -50
+#define CL_INVALID_ARG_SIZE -51
+#define CL_INVALID_KERNEL_ARGS -52
+#define CL_INVALID_WORK_DIMENSION -53
+#define CL_INVALID_WORK_GROUP_SIZE -54
+#define CL_INVALID_WORK_ITEM_SIZE -55
+#define CL_INVALID_GLOBAL_OFFSET -56
+#define CL_INVALID_EVENT_WAIT_LIST -57
+#define CL_INVALID_EVENT -58
+#define CL_INVALID_OPERATION -59
+#define CL_INVALID_GL_OBJECT -60
+#define CL_INVALID_BUFFER_SIZE -61
+#define CL_INVALID_MIP_LEVEL -62
+#define CL_INVALID_GLOBAL_WORK_SIZE -63
+#define CL_INVALID_PROPERTY -64
+#define CL_INVALID_IMAGE_DESCRIPTOR -65
+#define CL_INVALID_COMPILER_OPTIONS -66
+#define CL_INVALID_LINKER_OPTIONS -67
+#define CL_INVALID_DEVICE_PARTITION_COUNT -68
+
+/* OpenCL Version */
+#define CL_VERSION_1_0 1
+#define CL_VERSION_1_1 1
+#define CL_VERSION_1_2 1
+
+/* cl_bool */
+#define CL_FALSE 0
+#define CL_TRUE 1
+#define CL_BLOCKING CL_TRUE
+#define CL_NON_BLOCKING CL_FALSE
+
+/* cl_platform_info */
+#define CL_PLATFORM_PROFILE 0x0900
+#define CL_PLATFORM_VERSION 0x0901
+#define CL_PLATFORM_NAME 0x0902
+#define CL_PLATFORM_VENDOR 0x0903
+#define CL_PLATFORM_EXTENSIONS 0x0904
+
+/* cl_device_type - bitfield */
+#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
+#define CL_DEVICE_TYPE_CPU (1 << 1)
+#define CL_DEVICE_TYPE_GPU (1 << 2)
+#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
+#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
+#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
+
+/* cl_device_info */
+#define CL_DEVICE_TYPE 0x1000
+#define CL_DEVICE_VENDOR_ID 0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
+#define CL_DEVICE_ADDRESS_BITS 0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
+#define CL_DEVICE_IMAGE_SUPPORT 0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
+#define CL_DEVICE_MAX_SAMPLERS 0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
+#define CL_DEVICE_ENDIAN_LITTLE 0x1026
+#define CL_DEVICE_AVAILABLE 0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
+#define CL_DEVICE_NAME 0x102B
+#define CL_DEVICE_VENDOR 0x102C
+#define CL_DRIVER_VERSION 0x102D
+#define CL_DEVICE_PROFILE 0x102E
+#define CL_DEVICE_VERSION 0x102F
+#define CL_DEVICE_EXTENSIONS 0x1030
+#define CL_DEVICE_PLATFORM 0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
+/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
+#define CL_DEVICE_OPENCL_C_VERSION 0x103D
+#define CL_DEVICE_LINKER_AVAILABLE 0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS 0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041
+#define CL_DEVICE_PARENT_DEVICE 0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES 0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045
+#define CL_DEVICE_PARTITION_TYPE 0x1046
+#define CL_DEVICE_REFERENCE_COUNT 0x1047
+#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048
+#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B
+
+/* cl_device_fp_config - bitfield */
+#define CL_FP_DENORM (1 << 0)
+#define CL_FP_INF_NAN (1 << 1)
+#define CL_FP_ROUND_TO_NEAREST (1 << 2)
+#define CL_FP_ROUND_TO_ZERO (1 << 3)
+#define CL_FP_ROUND_TO_INF (1 << 4)
+#define CL_FP_FMA (1 << 5)
+#define CL_FP_SOFT_FLOAT (1 << 6)
+#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7)
+
+/* cl_device_mem_cache_type */
+#define CL_NONE 0x0
+#define CL_READ_ONLY_CACHE 0x1
+#define CL_READ_WRITE_CACHE 0x2
+
+/* cl_device_local_mem_type */
+#define CL_LOCAL 0x1
+#define CL_GLOBAL 0x2
+
+/* cl_device_exec_capabilities - bitfield */
+#define CL_EXEC_KERNEL (1 << 0)
+#define CL_EXEC_NATIVE_KERNEL (1 << 1)
+
+/* cl_command_queue_properties - bitfield */
+#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
+#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
+
+/* cl_context_info */
+#define CL_CONTEXT_REFERENCE_COUNT 0x1080
+#define CL_CONTEXT_DEVICES 0x1081
+#define CL_CONTEXT_PROPERTIES 0x1082
+#define CL_CONTEXT_NUM_DEVICES 0x1083
+
+/* cl_context_properties */
+#define CL_CONTEXT_PLATFORM 0x1084
+#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085
+
+/* cl_device_partition_property */
+#define CL_DEVICE_PARTITION_EQUALLY 0x1086
+#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087
+#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0
+#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088
+
+/* cl_device_affinity_domain */
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4)
+#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
+
+/* cl_command_queue_info */
+#define CL_QUEUE_CONTEXT 0x1090
+#define CL_QUEUE_DEVICE 0x1091
+#define CL_QUEUE_REFERENCE_COUNT 0x1092
+#define CL_QUEUE_PROPERTIES 0x1093
+
+/* cl_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE (1 << 0)
+#define CL_MEM_WRITE_ONLY (1 << 1)
+#define CL_MEM_READ_ONLY (1 << 2)
+#define CL_MEM_USE_HOST_PTR (1 << 3)
+#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
+#define CL_MEM_COPY_HOST_PTR (1 << 5)
+/* reserved (1 << 6) */
+#define CL_MEM_HOST_WRITE_ONLY (1 << 7)
+#define CL_MEM_HOST_READ_ONLY (1 << 8)
+#define CL_MEM_HOST_NO_ACCESS (1 << 9)
+
+/* cl_mem_migration_flags - bitfield */
+#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0)
+#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1)
+
+/* cl_channel_order */
+#define CL_R 0x10B0
+#define CL_A 0x10B1
+#define CL_RG 0x10B2
+#define CL_RA 0x10B3
+#define CL_RGB 0x10B4
+#define CL_RGBA 0x10B5
+#define CL_BGRA 0x10B6
+#define CL_ARGB 0x10B7
+#define CL_INTENSITY 0x10B8
+#define CL_LUMINANCE 0x10B9
+#define CL_Rx 0x10BA
+#define CL_RGx 0x10BB
+#define CL_RGBx 0x10BC
+#define CL_DEPTH 0x10BD
+#define CL_DEPTH_STENCIL 0x10BE
+
+/* cl_channel_type */
+#define CL_SNORM_INT8 0x10D0
+#define CL_SNORM_INT16 0x10D1
+#define CL_UNORM_INT8 0x10D2
+#define CL_UNORM_INT16 0x10D3
+#define CL_UNORM_SHORT_565 0x10D4
+#define CL_UNORM_SHORT_555 0x10D5
+#define CL_UNORM_INT_101010 0x10D6
+#define CL_SIGNED_INT8 0x10D7
+#define CL_SIGNED_INT16 0x10D8
+#define CL_SIGNED_INT32 0x10D9
+#define CL_UNSIGNED_INT8 0x10DA
+#define CL_UNSIGNED_INT16 0x10DB
+#define CL_UNSIGNED_INT32 0x10DC
+#define CL_HALF_FLOAT 0x10DD
+#define CL_FLOAT 0x10DE
+#define CL_UNORM_INT24 0x10DF
+
+/* cl_mem_object_type */
+#define CL_MEM_OBJECT_BUFFER 0x10F0
+#define CL_MEM_OBJECT_IMAGE2D 0x10F1
+#define CL_MEM_OBJECT_IMAGE3D 0x10F2
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3
+#define CL_MEM_OBJECT_IMAGE1D 0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6
+
+/* cl_mem_info */
+#define CL_MEM_TYPE 0x1100
+#define CL_MEM_FLAGS 0x1101
+#define CL_MEM_SIZE 0x1102
+#define CL_MEM_HOST_PTR 0x1103
+#define CL_MEM_MAP_COUNT 0x1104
+#define CL_MEM_REFERENCE_COUNT 0x1105
+#define CL_MEM_CONTEXT 0x1106
+#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
+#define CL_MEM_OFFSET 0x1108
+
+/* cl_image_info */
+#define CL_IMAGE_FORMAT 0x1110
+#define CL_IMAGE_ELEMENT_SIZE 0x1111
+#define CL_IMAGE_ROW_PITCH 0x1112
+#define CL_IMAGE_SLICE_PITCH 0x1113
+#define CL_IMAGE_WIDTH 0x1114
+#define CL_IMAGE_HEIGHT 0x1115
+#define CL_IMAGE_DEPTH 0x1116
+#define CL_IMAGE_ARRAY_SIZE 0x1117
+#define CL_IMAGE_BUFFER 0x1118
+#define CL_IMAGE_NUM_MIP_LEVELS 0x1119
+#define CL_IMAGE_NUM_SAMPLES 0x111A
+
+/* cl_addressing_mode */
+#define CL_ADDRESS_NONE 0x1130
+#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
+#define CL_ADDRESS_CLAMP 0x1132
+#define CL_ADDRESS_REPEAT 0x1133
+#define CL_ADDRESS_MIRRORED_REPEAT 0x1134
+
+/* cl_filter_mode */
+#define CL_FILTER_NEAREST 0x1140
+#define CL_FILTER_LINEAR 0x1141
+
+/* cl_sampler_info */
+#define CL_SAMPLER_REFERENCE_COUNT 0x1150
+#define CL_SAMPLER_CONTEXT 0x1151
+#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
+#define CL_SAMPLER_ADDRESSING_MODE 0x1153
+#define CL_SAMPLER_FILTER_MODE 0x1154
+
+/* cl_map_flags - bitfield */
+#define CL_MAP_READ (1 << 0)
+#define CL_MAP_WRITE (1 << 1)
+#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
+
+/* cl_program_info */
+#define CL_PROGRAM_REFERENCE_COUNT 0x1160
+#define CL_PROGRAM_CONTEXT 0x1161
+#define CL_PROGRAM_NUM_DEVICES 0x1162
+#define CL_PROGRAM_DEVICES 0x1163
+#define CL_PROGRAM_SOURCE 0x1164
+#define CL_PROGRAM_BINARY_SIZES 0x1165
+#define CL_PROGRAM_BINARIES 0x1166
+#define CL_PROGRAM_NUM_KERNELS 0x1167
+#define CL_PROGRAM_KERNEL_NAMES 0x1168
+
+/* cl_program_build_info */
+#define CL_PROGRAM_BUILD_STATUS 0x1181
+#define CL_PROGRAM_BUILD_OPTIONS 0x1182
+#define CL_PROGRAM_BUILD_LOG 0x1183
+#define CL_PROGRAM_BINARY_TYPE 0x1184
+
+/* cl_program_binary_type */
+#define CL_PROGRAM_BINARY_TYPE_NONE 0x0
+#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1
+#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2
+#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4
+
+/* cl_build_status */
+#define CL_BUILD_SUCCESS 0
+#define CL_BUILD_NONE -1
+#define CL_BUILD_ERROR -2
+#define CL_BUILD_IN_PROGRESS -3
+
+/* cl_kernel_info */
+#define CL_KERNEL_FUNCTION_NAME 0x1190
+#define CL_KERNEL_NUM_ARGS 0x1191
+#define CL_KERNEL_REFERENCE_COUNT 0x1192
+#define CL_KERNEL_CONTEXT 0x1193
+#define CL_KERNEL_PROGRAM 0x1194
+#define CL_KERNEL_ATTRIBUTES 0x1195
+
+/* cl_kernel_arg_info */
+#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196
+#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197
+#define CL_KERNEL_ARG_TYPE_NAME 0x1198
+#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199
+#define CL_KERNEL_ARG_NAME 0x119A
+
+/* cl_kernel_arg_address_qualifier */
+#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B
+#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C
+#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D
+#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E
+
+/* cl_kernel_arg_access_qualifier */
+#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0
+#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1
+#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2
+#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3
+
+/* cl_kernel_arg_type_qualifer */
+#define CL_KERNEL_ARG_TYPE_NONE 0
+#define CL_KERNEL_ARG_TYPE_CONST (1 << 0)
+#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1)
+#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2)
+
+/* cl_kernel_work_group_info */
+#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
+#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
+#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
+#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
+#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
+#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5
+
+/* cl_event_info */
+#define CL_EVENT_COMMAND_QUEUE 0x11D0
+#define CL_EVENT_COMMAND_TYPE 0x11D1
+#define CL_EVENT_REFERENCE_COUNT 0x11D2
+#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
+#define CL_EVENT_CONTEXT 0x11D4
+
+/* cl_command_type */
+#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
+#define CL_COMMAND_TASK 0x11F1
+#define CL_COMMAND_NATIVE_KERNEL 0x11F2
+#define CL_COMMAND_READ_BUFFER 0x11F3
+#define CL_COMMAND_WRITE_BUFFER 0x11F4
+#define CL_COMMAND_COPY_BUFFER 0x11F5
+#define CL_COMMAND_READ_IMAGE 0x11F6
+#define CL_COMMAND_WRITE_IMAGE 0x11F7
+#define CL_COMMAND_COPY_IMAGE 0x11F8
+#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
+#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
+#define CL_COMMAND_MAP_BUFFER 0x11FB
+#define CL_COMMAND_MAP_IMAGE 0x11FC
+#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
+#define CL_COMMAND_MARKER 0x11FE
+#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
+#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
+#define CL_COMMAND_READ_BUFFER_RECT 0x1201
+#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
+#define CL_COMMAND_COPY_BUFFER_RECT 0x1203
+#define CL_COMMAND_USER 0x1204
+#define CL_COMMAND_BARRIER 0x1205
+#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206
+#define CL_COMMAND_FILL_BUFFER 0x1207
+#define CL_COMMAND_FILL_IMAGE 0x1208
+
+/* command execution status */
+#define CL_COMPLETE 0x0
+#define CL_RUNNING 0x1
+#define CL_SUBMITTED 0x2
+#define CL_QUEUED 0x3
+
+/* cl_buffer_create_type */
+#define CL_BUFFER_CREATE_TYPE_REGION 0x1220
+
+/* cl_profiling_info */
+#define CL_PROFILING_COMMAND_QUEUED 0x1280
+#define CL_PROFILING_COMMAND_SUBMIT 0x1281
+#define CL_PROFILING_COMMAND_START 0x1282
+#define CL_PROFILING_COMMAND_END 0x1283
+
+/********************************************************************************************************/
+
+/* Platform API */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformIDs(cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformInfo(cl_platform_id /* platform */,
+ cl_platform_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Device APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDs(cl_platform_id /* platform */,
+ cl_device_type /* device_type */,
+ cl_uint /* num_entries */,
+ cl_device_id * /* devices */,
+ cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceInfo(cl_device_id /* device */,
+ cl_device_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateSubDevices(cl_device_id /* in_device */,
+ const cl_device_partition_property * /* properties */,
+ cl_uint /* num_devices */,
+ cl_device_id * /* out_devices */,
+ cl_uint * /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+
+/* Context APIs */
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContext(const cl_context_properties * /* properties */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* devices */,
+ void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContextFromType(const cl_context_properties * /* properties */,
+ cl_device_type /* device_type */,
+ void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetContextInfo(cl_context /* context */,
+ cl_context_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Command Queue APIs */
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context /* context */,
+ cl_device_id /* device */,
+ cl_command_queue_properties /* properties */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetCommandQueueInfo(cl_command_queue /* command_queue */,
+ cl_command_queue_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Memory Object APIs */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ size_t /* size */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateSubBuffer(cl_mem /* buffer */,
+ cl_mem_flags /* flags */,
+ cl_buffer_create_type /* buffer_create_type */,
+ const void * /* buffer_create_info */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ const cl_image_desc * /* image_desc */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSupportedImageFormats(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_mem_object_type /* image_type */,
+ cl_uint /* num_entries */,
+ cl_image_format * /* image_formats */,
+ cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectInfo(cl_mem /* memobj */,
+ cl_mem_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetImageInfo(cl_mem /* image */,
+ cl_image_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback( cl_mem /* memobj */,
+ void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+ void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1;
+
+/* Sampler APIs */
+extern CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSampler(cl_context /* context */,
+ cl_bool /* normalized_coords */,
+ cl_addressing_mode /* addressing_mode */,
+ cl_filter_mode /* filter_mode */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSamplerInfo(cl_sampler /* sampler */,
+ cl_sampler_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Program Object APIs */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithSource(cl_context /* context */,
+ cl_uint /* count */,
+ const char ** /* strings */,
+ const size_t * /* lengths */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBinary(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const size_t * /* lengths */,
+ const unsigned char ** /* binaries */,
+ cl_int * /* binary_status */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBuiltInKernels(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* kernel_names */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clBuildProgram(cl_program /* program */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCompileProgram(cl_program /* program */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ cl_uint /* num_input_headers */,
+ const cl_program * /* input_headers */,
+ const char ** /* header_include_names */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clLinkProgram(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ cl_uint /* num_input_programs */,
+ const cl_program * /* input_programs */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2;
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramInfo(cl_program /* program */,
+ cl_program_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramBuildInfo(cl_program /* program */,
+ cl_device_id /* device */,
+ cl_program_build_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Kernel Object APIs */
+extern CL_API_ENTRY cl_kernel CL_API_CALL
+clCreateKernel(cl_program /* program */,
+ const char * /* kernel_name */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateKernelsInProgram(cl_program /* program */,
+ cl_uint /* num_kernels */,
+ cl_kernel * /* kernels */,
+ cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArg(cl_kernel /* kernel */,
+ cl_uint /* arg_index */,
+ size_t /* arg_size */,
+ const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelInfo(cl_kernel /* kernel */,
+ cl_kernel_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelArgInfo(cl_kernel /* kernel */,
+ cl_uint /* arg_indx */,
+ cl_kernel_arg_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelWorkGroupInfo(cl_kernel /* kernel */,
+ cl_device_id /* device */,
+ cl_kernel_work_group_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Event Object APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clWaitForEvents(cl_uint /* num_events */,
+ const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventInfo(cl_event /* event */,
+ cl_event_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateUserEvent(cl_context /* context */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetUserEventStatus(cl_event /* event */,
+ cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetEventCallback( cl_event /* event */,
+ cl_int /* command_exec_callback_type */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_1;
+
+/* Profiling APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventProfilingInfo(cl_event /* event */,
+ cl_profiling_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Flush and Finish APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Enqueued Commands APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_read */,
+ size_t /* offset */,
+ size_t /* size */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_read */,
+ const size_t * /* buffer_offset */,
+ const size_t * /* host_offset */,
+ const size_t * /* region */,
+ size_t /* buffer_row_pitch */,
+ size_t /* buffer_slice_pitch */,
+ size_t /* host_row_pitch */,
+ size_t /* host_slice_pitch */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_write */,
+ size_t /* offset */,
+ size_t /* size */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_write */,
+ const size_t * /* buffer_offset */,
+ const size_t * /* host_offset */,
+ const size_t * /* region */,
+ size_t /* buffer_row_pitch */,
+ size_t /* buffer_slice_pitch */,
+ size_t /* host_row_pitch */,
+ size_t /* host_slice_pitch */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ const void * /* pattern */,
+ size_t /* pattern_size */,
+ size_t /* offset */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_buffer */,
+ size_t /* src_offset */,
+ size_t /* dst_offset */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_buffer */,
+ const size_t * /* src_origin */,
+ const size_t * /* dst_origin */,
+ const size_t * /* region */,
+ size_t /* src_row_pitch */,
+ size_t /* src_slice_pitch */,
+ size_t /* dst_row_pitch */,
+ size_t /* dst_slice_pitch */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_read */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* row_pitch */,
+ size_t /* slice_pitch */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_write */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* input_row_pitch */,
+ size_t /* input_slice_pitch */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ const void * /* fill_color */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImage(cl_command_queue /* command_queue */,
+ cl_mem /* src_image */,
+ cl_mem /* dst_image */,
+ const size_t * /* src_origin[3] */,
+ const size_t * /* dst_origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* src_image */,
+ cl_mem /* dst_buffer */,
+ const size_t * /* src_origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* dst_offset */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_image */,
+ size_t /* src_offset */,
+ const size_t * /* dst_origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_map */,
+ cl_map_flags /* map_flags */,
+ size_t /* offset */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_map */,
+ cl_map_flags /* map_flags */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t * /* image_row_pitch */,
+ size_t * /* image_slice_pitch */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
+ cl_mem /* memobj */,
+ void * /* mapped_ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_mem_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_mem_migration_flags /* flags */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
+ cl_kernel /* kernel */,
+ cl_uint /* work_dim */,
+ const size_t * /* global_work_offset */,
+ const size_t * /* global_work_size */,
+ const size_t * /* local_work_size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue /* command_queue */,
+ cl_kernel /* kernel */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNativeKernel(cl_command_queue /* command_queue */,
+ void (CL_CALLBACK * /*user_func*/)(void *),
+ void * /* args */,
+ size_t /* cb_args */,
+ cl_uint /* num_mem_objects */,
+ const cl_mem * /* mem_list */,
+ const void ** /* args_mem_loc */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+
+/* Extension function access
+ *
+ * Returns the extension function address for the given function name,
+ * or NULL if a valid function can not be found. The client must
+ * check to make sure the address is not NULL, before using or
+ * calling the returned function address.
+ */
+extern CL_API_ENTRY void * CL_API_CALL
+clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */,
+ const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2;
+
+
+/* Deprecated OpenCL 1.1 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage2D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ size_t /* image_width */,
+ size_t /* image_height */,
+ size_t /* image_row_pitch */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage3D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ size_t /* image_width */,
+ size_t /* image_height */,
+ size_t /* image_depth */,
+ size_t /* image_row_pitch */,
+ size_t /* image_slice_pitch */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueMarker(cl_command_queue /* command_queue */,
+ cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
+ cl_uint /* num_events */,
+ const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
+clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_H */
+
diff --git a/include/CL/cl.hpp b/include/CL/cl.hpp
new file mode 100644
index 0000000..38fac19
--- /dev/null
+++ b/include/CL/cl.hpp
@@ -0,0 +1,12452 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/*! \file
+ *
+ * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and
+ * OpenCL 1.2 (rev 15)
+ * \author Benedict R. Gaster, Laurent Morichetti and Lee Howes
+ *
+ * Additions and fixes from:
+ * Brian Cole, March 3rd 2010 and April 2012
+ * Matt Gruenke, April 2012.
+ * Bruce Merry, February 2013.
+ * Tom Deakin and Simon McIntosh-Smith, July 2013
+ *
+ * \version 1.2.6
+ * \date August 2013
+ *
+ * Optional extension support
+ *
+ * cl
+ * cl_ext_device_fission
+ * #define USE_CL_DEVICE_FISSION
+ */
+
+/*! \mainpage
+ * \section intro Introduction
+ * For many large applications C++ is the language of choice and so it seems
+ * reasonable to define C++ bindings for OpenCL.
+ *
+ *
+ * The interface is contained with a single C++ header file \em cl.hpp and all
+ * definitions are contained within the namespace \em cl. There is no additional
+ * requirement to include \em cl.h and to use either the C++ or original C
+ * bindings it is enough to simply include \em cl.hpp.
+ *
+ * The bindings themselves are lightweight and correspond closely to the
+ * underlying C API. Using the C++ bindings introduces no additional execution
+ * overhead.
+ *
+ * For detail documentation on the bindings see:
+ *
+ * The OpenCL C++ Wrapper API 1.2 (revision 09)
+ * http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf
+ *
+ * \section example Example
+ *
+ * The following example shows a general use case for the C++
+ * bindings, including support for the optional exception feature and
+ * also the supplied vector and string classes, see following sections for
+ * decriptions of these features.
+ *
+ * \code
+ * #define __CL_ENABLE_EXCEPTIONS
+ *
+ * #if defined(__APPLE__) || defined(__MACOSX)
+ * #include <OpenCL/cl.hpp>
+ * #else
+ * #include <CL/cl.hpp>
+ * #endif
+ * #include <cstdio>
+ * #include <cstdlib>
+ * #include <iostream>
+ *
+ * const char * helloStr = "__kernel void "
+ * "hello(void) "
+ * "{ "
+ * " "
+ * "} ";
+ *
+ * int
+ * main(void)
+ * {
+ * cl_int err = CL_SUCCESS;
+ * try {
+ *
+ * std::vector<cl::Platform> platforms;
+ * cl::Platform::get(&platforms);
+ * if (platforms.size() == 0) {
+ * std::cout << "Platform size 0\n";
+ * return -1;
+ * }
+ *
+ * cl_context_properties properties[] =
+ * { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
+ * cl::Context context(CL_DEVICE_TYPE_CPU, properties);
+ *
+ * std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+ *
+ * cl::Program::Sources source(1,
+ * std::make_pair(helloStr,strlen(helloStr)));
+ * cl::Program program_ = cl::Program(context, source);
+ * program_.build(devices);
+ *
+ * cl::Kernel kernel(program_, "hello", &err);
+ *
+ * cl::Event event;
+ * cl::CommandQueue queue(context, devices[0], 0, &err);
+ * queue.enqueueNDRangeKernel(
+ * kernel,
+ * cl::NullRange,
+ * cl::NDRange(4,4),
+ * cl::NullRange,
+ * NULL,
+ * &event);
+ *
+ * event.wait();
+ * }
+ * catch (cl::Error err) {
+ * std::cerr
+ * << "ERROR: "
+ * << err.what()
+ * << "("
+ * << err.err()
+ * << ")"
+ * << std::endl;
+ * }
+ *
+ * return EXIT_SUCCESS;
+ * }
+ *
+ * \endcode
+ *
+ */
+#ifndef CL_HPP_
+#define CL_HPP_
+
+#ifdef _WIN32
+
+#include <windows.h>
+#include <malloc.h>
+#include <iterator>
+#include <intrin.h>
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+#include <exception>
+#endif // #if defined(__CL_ENABLE_EXCEPTIONS)
+
+#pragma push_macro("max")
+#undef max
+#if defined(USE_DX_INTEROP)
+#include <CL/cl_d3d10.h>
+#include <CL/cl_dx9_media_sharing.h>
+#endif
+#endif // _WIN32
+
+//
+#if defined(USE_CL_DEVICE_FISSION)
+#include <CL/cl_ext.h>
+#endif
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenGL/OpenGL.h>
+#include <OpenCL/opencl.h>
+#include <libkern/OSAtomic.h>
+#else
+#include <GL/gl.h>
+#include <CL/opencl.h>
+#endif // !__APPLE__
+
+// To avoid accidentally taking ownership of core OpenCL types
+// such as cl_kernel constructors are made explicit
+// under OpenCL 1.2
+#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS explicit
+#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+// Define deprecated prefixes and suffixes to ensure compilation
+// in case they are not pre-defined
+#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+
+#if !defined(CL_CALLBACK)
+#define CL_CALLBACK
+#endif //CL_CALLBACK
+
+#include <utility>
+#include <limits>
+
+#if !defined(__NO_STD_VECTOR)
+#include <vector>
+#endif
+
+#if !defined(__NO_STD_STRING)
+#include <string>
+#endif
+
+#if defined(linux) || defined(__APPLE__) || defined(__MACOSX)
+#include <alloca.h>
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif // linux
+
+#include <cstring>
+
+
+/*! \namespace cl
+ *
+ * \brief The OpenCL C++ bindings are defined within this namespace.
+ *
+ */
+namespace cl {
+
+class Memory;
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+#define __INIT_CL_EXT_FCN_PTR(name) \
+ if(!pfn_##name) { \
+ pfn_##name = (PFN_##name) \
+ clGetExtensionFunctionAddress(#name); \
+ if(!pfn_##name) { \
+ } \
+ }
+#endif // #if defined(CL_VERSION_1_1)
+
+#if defined(CL_VERSION_1_2)
+#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \
+ if(!pfn_##name) { \
+ pfn_##name = (PFN_##name) \
+ clGetExtensionFunctionAddressForPlatform(platform, #name); \
+ if(!pfn_##name) { \
+ } \
+ }
+#endif // #if defined(CL_VERSION_1_1)
+
+class Program;
+class Device;
+class Context;
+class CommandQueue;
+class Memory;
+class Buffer;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+/*! \brief Exception class
+ *
+ * This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined.
+ */
+class Error : public std::exception
+{
+private:
+ cl_int err_;
+ const char * errStr_;
+public:
+ /*! \brief Create a new CL error exception for a given error code
+ * and corresponding message.
+ *
+ * \param err error code value.
+ *
+ * \param errStr a descriptive string that must remain in scope until
+ * handling of the exception has concluded. If set, it
+ * will be returned by what().
+ */
+ Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
+ {}
+
+ ~Error() throw() {}
+
+ /*! \brief Get error string associated with exception
+ *
+ * \return A memory pointer to the error message string.
+ */
+ virtual const char * what() const throw ()
+ {
+ if (errStr_ == NULL) {
+ return "empty";
+ }
+ else {
+ return errStr_;
+ }
+ }
+
+ /*! \brief Get error code associated with exception
+ *
+ * \return The error code.
+ */
+ cl_int err(void) const { return err_; }
+};
+
+#define __ERR_STR(x) #x
+#else
+#define __ERR_STR(x) NULL
+#endif // __CL_ENABLE_EXCEPTIONS
+
+
+namespace detail
+{
+#if defined(__CL_ENABLE_EXCEPTIONS)
+static inline cl_int errHandler (
+ cl_int err,
+ const char * errStr = NULL)
+{
+ if (err != CL_SUCCESS) {
+ throw Error(err, errStr);
+ }
+ return err;
+}
+#else
+static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
+{
+ (void) errStr; // suppress unused variable warning
+ return err;
+}
+#endif // __CL_ENABLE_EXCEPTIONS
+}
+
+
+
+//! \cond DOXYGEN_DETAIL
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#define __GET_DEVICE_INFO_ERR __ERR_STR(clGetDeviceInfo)
+#define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo)
+#define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs)
+#define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs)
+#define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo)
+#define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo)
+#define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfileInfo)
+#define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo)
+#define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo)
+#define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo)
+#define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo)
+#if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_ARG_INFO_ERR __ERR_STR(clGetKernelArgInfo)
+#endif // #if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo)
+#define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo)
+#define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo)
+#define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo)
+
+#define __CREATE_CONTEXT_ERR __ERR_STR(clCreateContext)
+#define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType)
+#define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats)
+
+#define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer)
+#define __COPY_ERR __ERR_STR(cl::copy)
+#define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer)
+#define __CREATE_GL_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer)
+#define __CREATE_GL_RENDER_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer)
+#define __GET_GL_OBJECT_INFO_ERR __ERR_STR(clGetGLObjectInfo)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_IMAGE_ERR __ERR_STR(clCreateImage)
+#define __CREATE_GL_TEXTURE_ERR __ERR_STR(clCreateFromGLTexture)
+#define __IMAGE_DIMENSION_ERR __ERR_STR(Incorrect image dimensions)
+#endif // #if defined(CL_VERSION_1_2)
+#define __CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler)
+#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback)
+
+#define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent)
+#define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus)
+#define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback)
+#define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents)
+
+#define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel)
+#define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg)
+#define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource)
+#define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR __ERR_STR(clCreateProgramWithBuiltInKernels)
+#endif // #if defined(CL_VERSION_1_2)
+#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram)
+#if defined(CL_VERSION_1_2)
+#define __COMPILE_PROGRAM_ERR __ERR_STR(clCompileProgram)
+
+#endif // #if defined(CL_VERSION_1_2)
+#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram)
+
+#define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue)
+#define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty)
+#define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer)
+#define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect)
+#define __ENQUEUE_WRITE_BUFFER_ERR __ERR_STR(clEnqueueWriteBuffer)
+#define __ENQUEUE_WRITE_BUFFER_RECT_ERR __ERR_STR(clEnqueueWriteBufferRect)
+#define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer)
+#define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect)
+#define __ENQUEUE_FILL_BUFFER_ERR __ERR_STR(clEnqueueFillBuffer)
+#define __ENQUEUE_READ_IMAGE_ERR __ERR_STR(clEnqueueReadImage)
+#define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage)
+#define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage)
+#define __ENQUEUE_FILL_IMAGE_ERR __ERR_STR(clEnqueueFillImage)
+#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer)
+#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage)
+#define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer)
+#define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage)
+#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnMapMemObject)
+#define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel)
+#define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask)
+#define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel)
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR __ERR_STR(clEnqueueMigrateMemObjects)
+#endif // #if defined(CL_VERSION_1_2)
+
+#define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects)
+#define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects)
+
+
+#define __RETAIN_ERR __ERR_STR(Retain Object)
+#define __RELEASE_ERR __ERR_STR(Release Object)
+#define __FLUSH_ERR __ERR_STR(clFlush)
+#define __FINISH_ERR __ERR_STR(clFinish)
+#define __VECTOR_CAPACITY_ERR __ERR_STR(Vector capacity error)
+
+/**
+ * CL 1.2 version that uses device fission.
+ */
+#if defined(CL_VERSION_1_2)
+#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevices)
+#else
+#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT)
+#endif // #if defined(CL_VERSION_1_2)
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+#define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker)
+#define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents)
+#define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier)
+#define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler)
+#define __CREATE_GL_TEXTURE_2D_ERR __ERR_STR(clCreateFromGLTexture2D)
+#define __CREATE_GL_TEXTURE_3D_ERR __ERR_STR(clCreateFromGLTexture3D)
+#define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D)
+#define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D)
+#endif // #if defined(CL_VERSION_1_1)
+
+#endif // __CL_USER_OVERRIDE_ERROR_STRINGS
+//! \endcond
+
+/**
+ * CL 1.2 marker and barrier commands
+ */
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MARKER_WAIT_LIST_ERR __ERR_STR(clEnqueueMarkerWithWaitList)
+#define __ENQUEUE_BARRIER_WAIT_LIST_ERR __ERR_STR(clEnqueueBarrierWithWaitList)
+#endif // #if defined(CL_VERSION_1_2)
+
+#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
+typedef std::string STRING_CLASS;
+#elif !defined(__USE_DEV_STRING)
+
+/*! \class string
+ * \brief Simple string class, that provides a limited subset of std::string
+ * functionality but avoids many of the issues that come with that class.
+
+ * \note Deprecated. Please use std::string as default or
+ * re-define the string class to match the std::string
+ * interface by defining STRING_CLASS
+ */
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+{
+private:
+ ::size_t size_;
+ char * str_;
+public:
+ //! \brief Constructs an empty string, allocating no memory.
+ string(void) : size_(0), str_(NULL)
+ {
+ }
+
+ /*! \brief Constructs a string populated from an arbitrary value of
+ * specified size.
+ *
+ * An extra '\0' is added, in case none was contained in str.
+ *
+ * \param str the initial value of the string instance. Note that '\0'
+ * characters receive no special treatment. If NULL,
+ * the string is left empty, with a size of 0.
+ *
+ * \param size the number of characters to copy from str.
+ */
+ string(const char * str, ::size_t size) :
+ size_(size),
+ str_(NULL)
+ {
+ if( size > 0 ) {
+ str_ = new char[size_+1];
+ if (str_ != NULL) {
+ memcpy(str_, str, size_ * sizeof(char));
+ str_[size_] = '\0';
+ }
+ else {
+ size_ = 0;
+ }
+ }
+ }
+
+ /*! \brief Constructs a string populated from a null-terminated value.
+ *
+ * \param str the null-terminated initial value of the string instance.
+ * If NULL, the string is left empty, with a size of 0.
+ */
+ string(const char * str) :
+ size_(0),
+ str_(NULL)
+ {
+ if( str ) {
+ size_= ::strlen(str);
+ }
+ if( size_ > 0 ) {
+ str_ = new char[size_ + 1];
+ if (str_ != NULL) {
+ memcpy(str_, str, (size_ + 1) * sizeof(char));
+ }
+ }
+ }
+
+ void resize( ::size_t n )
+ {
+ if( size_ == n ) {
+ return;
+ }
+ if (n == 0) {
+ if( str_ ) {
+ delete [] str_;
+ }
+ str_ = NULL;
+ size_ = 0;
+ }
+ else {
+ char *newString = new char[n + 1];
+ int copySize = n;
+ if( size_ < n ) {
+ copySize = size_;
+ }
+ size_ = n;
+
+ if(str_) {
+ memcpy(newString, str_, (copySize + 1) * sizeof(char));
+ }
+ if( copySize < size_ ) {
+ memset(newString + copySize, 0, size_ - copySize);
+ }
+ newString[size_] = '\0';
+
+ delete [] str_;
+ str_ = newString;
+ }
+ }
+
+ const char& operator[] ( ::size_t pos ) const
+ {
+ return str_[pos];
+ }
+
+ char& operator[] ( ::size_t pos )
+ {
+ return str_[pos];
+ }
+
+ /*! \brief Copies the value of another string to this one.
+ *
+ * \param rhs the string to copy.
+ *
+ * \returns a reference to the modified instance.
+ */
+ string& operator=(const string& rhs)
+ {
+ if (this == &rhs) {
+ return *this;
+ }
+
+ if( str_ != NULL ) {
+ delete [] str_;
+ str_ = NULL;
+ size_ = 0;
+ }
+
+ if (rhs.size_ == 0 || rhs.str_ == NULL) {
+ str_ = NULL;
+ size_ = 0;
+ }
+ else {
+ str_ = new char[rhs.size_ + 1];
+ size_ = rhs.size_;
+
+ if (str_ != NULL) {
+ memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char));
+ }
+ else {
+ size_ = 0;
+ }
+ }
+
+ return *this;
+ }
+
+ /*! \brief Constructs a string by copying the value of another instance.
+ *
+ * \param rhs the string to copy.
+ */
+ string(const string& rhs) :
+ size_(0),
+ str_(NULL)
+ {
+ *this = rhs;
+ }
+
+ //! \brief Destructor - frees memory used to hold the current value.
+ ~string()
+ {
+ delete[] str_;
+ str_ = NULL;
+ }
+
+ //! \brief Queries the length of the string, excluding any added '\0's.
+ ::size_t size(void) const { return size_; }
+
+ //! \brief Queries the length of the string, excluding any added '\0's.
+ ::size_t length(void) const { return size(); }
+
+ /*! \brief Returns a pointer to the private copy held by this instance,
+ * or "" if empty/unset.
+ */
+ const char * c_str(void) const { return (str_) ? str_ : "";}
+};
+typedef cl::string STRING_CLASS;
+#endif // #elif !defined(__USE_DEV_STRING)
+
+#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+#define VECTOR_CLASS std::vector
+#elif !defined(__USE_DEV_VECTOR)
+#define VECTOR_CLASS cl::vector
+
+#if !defined(__MAX_DEFAULT_VECTOR_SIZE)
+#define __MAX_DEFAULT_VECTOR_SIZE 10
+#endif
+
+/*! \class vector
+ * \brief Fixed sized vector implementation that mirroring
+ *
+ * \note Deprecated. Please use std::vector as default or
+ * re-define the vector class to match the std::vector
+ * interface by defining VECTOR_CLASS
+
+ * \note Not recommended for use with custom objects as
+ * current implementation will construct N elements
+ *
+ * std::vector functionality.
+ * \brief Fixed sized vector compatible with std::vector.
+ *
+ * \note
+ * This differs from std::vector<> not just in memory allocation,
+ * but also in terms of when members are constructed, destroyed,
+ * and assigned instead of being copy constructed.
+ *
+ * \param T type of element contained in the vector.
+ *
+ * \param N maximum size of the vector.
+ */
+template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+{
+private:
+ T data_[N];
+ unsigned int size_;
+
+public:
+ //! \brief Constructs an empty vector with no memory allocated.
+ vector() :
+ size_(static_cast<unsigned int>(0))
+ {}
+
+ //! \brief Deallocates the vector's memory and destroys all of its elements.
+ ~vector()
+ {
+ clear();
+ }
+
+ //! \brief Returns the number of elements currently contained.
+ unsigned int size(void) const
+ {
+ return size_;
+ }
+
+ /*! \brief Empties the vector of all elements.
+ * \note
+ * This does not deallocate memory but will invoke destructors
+ * on contained elements.
+ */
+ void clear()
+ {
+ while(!empty()) {
+ pop_back();
+ }
+ }
+
+ /*! \brief Appends an element after the last valid element.
+ * Calling this on a vector that has reached capacity will throw an
+ * exception if exceptions are enabled.
+ */
+ void push_back (const T& x)
+ {
+ if (size() < N) {
+ new (&data_[size_]) T(x);
+ size_++;
+ } else {
+ detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+ }
+ }
+
+ /*! \brief Removes the last valid element from the vector.
+ * Calling this on an empty vector will throw an exception
+ * if exceptions are enabled.
+ */
+ void pop_back(void)
+ {
+ if (size_ != 0) {
+ --size_;
+ data_[size_].~T();
+ } else {
+ detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+ }
+ }
+
+ /*! \brief Constructs with a value copied from another.
+ *
+ * \param vec the vector to copy.
+ */
+ vector(const vector<T, N>& vec) :
+ size_(vec.size_)
+ {
+ if (size_ != 0) {
+ assign(vec.begin(), vec.end());
+ }
+ }
+
+ /*! \brief Constructs with a specified number of initial elements.
+ *
+ * \param size number of initial elements.
+ *
+ * \param val value of initial elements.
+ */
+ vector(unsigned int size, const T& val = T()) :
+ size_(0)
+ {
+ for (unsigned int i = 0; i < size; i++) {
+ push_back(val);
+ }
+ }
+
+ /*! \brief Overwrites the current content with that copied from another
+ * instance.
+ *
+ * \param rhs vector to copy.
+ *
+ * \returns a reference to this.
+ */
+ vector<T, N>& operator=(const vector<T, N>& rhs)
+ {
+ if (this == &rhs) {
+ return *this;
+ }
+
+ if (rhs.size_ != 0) {
+ assign(rhs.begin(), rhs.end());
+ } else {
+ clear();
+ }
+
+ return *this;
+ }
+
+ /*! \brief Tests equality against another instance.
+ *
+ * \param vec the vector against which to compare.
+ */
+ bool operator==(vector<T,N> &vec)
+ {
+ if (size() != vec.size()) {
+ return false;
+ }
+
+ for( unsigned int i = 0; i < size(); ++i ) {
+ if( operator[](i) != vec[i] ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ //! \brief Conversion operator to T*.
+ operator T* () { return data_; }
+
+ //! \brief Conversion operator to const T*.
+ operator const T* () const { return data_; }
+
+ //! \brief Tests whether this instance has any elements.
+ bool empty (void) const
+ {
+ return size_==0;
+ }
+
+ //! \brief Returns the maximum number of elements this instance can hold.
+ unsigned int max_size (void) const
+ {
+ return N;
+ }
+
+ //! \brief Returns the maximum number of elements this instance can hold.
+ unsigned int capacity () const
+ {
+ return N;
+ }
+
+ /*! \brief Returns a reference to a given element.
+ *
+ * \param index which element to access. *
+ * \note
+ * The caller is responsible for ensuring index is >= 0 and < size().
+ */
+ T& operator[](int index)
+ {
+ return data_[index];
+ }
+
+ /*! \brief Returns a const reference to a given element.
+ *
+ * \param index which element to access.
+ *
+ * \note
+ * The caller is responsible for ensuring index is >= 0 and < size().
+ */
+ const T& operator[](int index) const
+ {
+ return data_[index];
+ }
+
+ /*! \brief Assigns elements of the vector based on a source iterator range.
+ *
+ * \param start Beginning iterator of source range
+ * \param end Enditerator of source range
+ *
+ * \note
+ * Will throw an exception if exceptions are enabled and size exceeded.
+ */
+ template<class I>
+ void assign(I start, I end)
+ {
+ clear();
+ while(start != end) {
+ push_back(*start);
+ start++;
+ }
+ }
+
+ /*! \class iterator
+ * \brief Const iterator class for vectors
+ */
+ class iterator
+ {
+ private:
+ const vector<T,N> *vec_;
+ int index_;
+
+ /**
+ * Internal iterator constructor to capture reference
+ * to the vector it iterates over rather than taking
+ * the vector by copy.
+ */
+ iterator (const vector<T,N> &vec, int index) :
+ vec_(&vec)
+ {
+ if( !vec.empty() ) {
+ index_ = index;
+ } else {
+ index_ = -1;
+ }
+ }
+
+ public:
+ iterator(void) :
+ index_(-1),
+ vec_(NULL)
+ {
+ }
+
+ iterator(const iterator& rhs) :
+ vec_(rhs.vec_),
+ index_(rhs.index_)
+ {
+ }
+
+ ~iterator(void) {}
+
+ static iterator begin(const cl::vector<T,N> &vec)
+ {
+ iterator i(vec, 0);
+
+ return i;
+ }
+
+ static iterator end(const cl::vector<T,N> &vec)
+ {
+ iterator i(vec, vec.size());
+
+ return i;
+ }
+
+ bool operator==(iterator i)
+ {
+ return ((vec_ == i.vec_) &&
+ (index_ == i.index_));
+ }
+
+ bool operator!=(iterator i)
+ {
+ return (!(*this==i));
+ }
+
+ iterator& operator++()
+ {
+ ++index_;
+ return *this;
+ }
+
+ iterator operator++(int)
+ {
+ iterator retVal(*this);
+ ++index_;
+ return retVal;
+ }
+
+ iterator& operator--()
+ {
+ --index_;
+ return *this;
+ }
+
+ iterator operator--(int)
+ {
+ iterator retVal(*this);
+ --index_;
+ return retVal;
+ }
+
+ const T& operator *() const
+ {
+ return (*vec_)[index_];
+ }
+ };
+
+ iterator begin(void)
+ {
+ return iterator::begin(*this);
+ }
+
+ iterator begin(void) const
+ {
+ return iterator::begin(*this);
+ }
+
+ iterator end(void)
+ {
+ return iterator::end(*this);
+ }
+
+ iterator end(void) const
+ {
+ return iterator::end(*this);
+ }
+
+ T& front(void)
+ {
+ return data_[0];
+ }
+
+ T& back(void)
+ {
+ return data_[size_];
+ }
+
+ const T& front(void) const
+ {
+ return data_[0];
+ }
+
+ const T& back(void) const
+ {
+ return data_[size_-1];
+ }
+};
+#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+
+
+
+
+
+namespace detail {
+#define __DEFAULT_NOT_INITIALIZED 1
+#define __DEFAULT_BEING_INITIALIZED 2
+#define __DEFAULT_INITIALIZED 4
+
+ /*
+ * Compare and exchange primitives are needed for handling of defaults
+ */
+ inline int compare_exchange(volatile int * dest, int exchange, int comparand)
+ {
+#ifdef _WIN32
+ return (int)(InterlockedCompareExchange(
+ (volatile long*)dest,
+ (long)exchange,
+ (long)comparand));
+#elif defined(__APPLE__) || defined(__MACOSX)
+ return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest);
+#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX)
+ return (__sync_val_compare_and_swap(
+ dest,
+ comparand,
+ exchange));
+#endif // !_WIN32
+ }
+
+ inline void fence() { _mm_mfence(); }
+}; // namespace detail
+
+
+/*! \brief class used to interface between C++ and
+ * OpenCL C calls that require arrays of size_t values, whose
+ * size is known statically.
+ */
+template <int N>
+class size_t
+{
+private:
+ ::size_t data_[N];
+
+public:
+ //! \brief Initialize size_t to all 0s
+ size_t()
+ {
+ for( int i = 0; i < N; ++i ) {
+ data_[i] = 0;
+ }
+ }
+
+ ::size_t& operator[](int index)
+ {
+ return data_[index];
+ }
+
+ const ::size_t& operator[](int index) const
+ {
+ return data_[index];
+ }
+
+ //! \brief Conversion operator to T*.
+ operator ::size_t* () { return data_; }
+
+ //! \brief Conversion operator to const T*.
+ operator const ::size_t* () const { return data_; }
+};
+
+namespace detail {
+
+// Generic getInfoHelper. The final parameter is used to guide overload
+// resolution: the actual parameter passed is an int, which makes this
+// a worse conversion sequence than a specialization that declares the
+// parameter as an int.
+template<typename Functor, typename T>
+inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
+{
+ return f(name, sizeof(T), param, NULL);
+}
+
+// Specialized getInfoHelper for VECTOR_CLASS params
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
+{
+ ::size_t required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ T* value = (T*) alloca(required);
+ err = f(name, required, value, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ param->assign(&value[0], &value[required/sizeof(T)]);
+ return CL_SUCCESS;
+}
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simplify specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int, typename T::cl_type = 0)
+{
+ ::size_t required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ typename T::cl_type * value = (typename T::cl_type *) alloca(required);
+ err = f(name, required, value, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ ::size_t elements = required / sizeof(typename T::cl_type);
+ param->assign(&value[0], &value[elements]);
+ for (::size_t i = 0; i < elements; i++)
+ {
+ if (value[i] != NULL)
+ {
+ err = (*param)[i].retain();
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ }
+ }
+ return CL_SUCCESS;
+}
+
+// Specialized for getInfo<CL_PROGRAM_BINARIES>
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<char *>* param, int)
+{
+ cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
+
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for STRING_CLASS params
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long)
+{
+ ::size_t required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ char* value = (char*) alloca(required);
+ err = f(name, required, value, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ *param = value;
+ return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for cl::size_t params
+template <typename Func, ::size_t N>
+inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
+{
+ ::size_t required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ ::size_t* value = (::size_t*) alloca(required);
+ err = f(name, required, value, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ for(int i = 0; i < N; ++i) {
+ (*param)[i] = value[i];
+ }
+
+ return CL_SUCCESS;
+}
+
+template<typename T> struct ReferenceHandler;
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simplify specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template<typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
+{
+ typename T::cl_type value;
+ cl_int err = f(name, sizeof(value), &value, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ *param = value;
+ if (value != NULL)
+ {
+ err = param->retain();
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ }
+ return CL_SUCCESS;
+}
+
+#define __PARAM_NAME_INFO_1_0(F) \
+ F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
+ F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \
+ F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \
+ F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \
+ F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \
+ \
+ F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
+ F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \
+ F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
+ F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \
+ F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \
+ F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
+ F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
+ F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
+ F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
+ F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
+ F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
+ F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
+ F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \
+ F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
+ F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \
+ F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \
+ F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \
+ F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \
+ F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \
+ F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \
+ F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \
+ F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \
+ \
+ F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \
+ F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS<Device>) \
+ F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS<cl_context_properties>) \
+ \
+ F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \
+ F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \
+ F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \
+ F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_uint) \
+ \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \
+ \
+ F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \
+ F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \
+ F(cl_mem_info, CL_MEM_SIZE, ::size_t) \
+ F(cl_mem_info, CL_MEM_HOST_PTR, void*) \
+ F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \
+ F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \
+ F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \
+ \
+ F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \
+ F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \
+ F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \
+ F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \
+ F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \
+ F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \
+ F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \
+ \
+ F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \
+ F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \
+ F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_addressing_mode) \
+ F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_filter_mode) \
+ F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_bool) \
+ \
+ F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \
+ F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \
+ F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \
+ F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS<Device>) \
+ F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \
+ F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \
+ F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS<char *>) \
+ \
+ F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \
+ F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \
+ F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \
+ \
+ F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \
+ F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \
+ F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \
+ F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \
+ F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \
+ \
+ F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \
+ F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \
+ F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \
+ \
+ F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \
+ F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \
+ F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \
+ F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties)
+
+#if defined(CL_VERSION_1_1)
+#define __PARAM_NAME_INFO_1_1(F) \
+ F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \
+ F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \
+ F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \
+ \
+ F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \
+ F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \
+ \
+ F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \
+ F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \
+ \
+ F(cl_event_info, CL_EVENT_CONTEXT, cl::Context)
+#endif // CL_VERSION_1_1
+
+
+#if defined(CL_VERSION_1_2)
+#define __PARAM_NAME_INFO_1_2(F) \
+ F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \
+ \
+ F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \
+ F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \
+ \
+ F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \
+ \
+ F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \
+ \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \
+ \
+ F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \
+ F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS<cl_device_partition_property>) \
+ F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS<cl_device_partition_property>) \
+ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \
+ F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \
+ F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS)
+#endif // #if defined(CL_VERSION_1_2)
+
+#if defined(USE_CL_DEVICE_FISSION)
+#define __PARAM_NAME_DEVICE_FISSION(F) \
+ F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
+ F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+ F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
+ F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS<cl_device_partition_property_ext>)
+#endif // USE_CL_DEVICE_FISSION
+
+template <typename enum_type, cl_int Name>
+struct param_traits {};
+
+#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \
+struct token; \
+template<> \
+struct param_traits<detail:: token,param_name> \
+{ \
+ enum { value = param_name }; \
+ typedef T param_type; \
+};
+
+__PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS)
+#if defined(CL_VERSION_1_1)
+__PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS)
+#endif // CL_VERSION_1_1
+#if defined(CL_VERSION_1_2)
+__PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS)
+#endif // CL_VERSION_1_1
+
+#if defined(USE_CL_DEVICE_FISSION)
+__PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS);
+#endif // USE_CL_DEVICE_FISSION
+
+#ifdef CL_PLATFORM_ICD_SUFFIX_KHR
+__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS)
+#endif
+
+#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
+#endif
+
+#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>)
+#endif
+#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
+#endif
+
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_WARP_SIZE_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_GPU_OVERLAP_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool)
+#endif
+
+// Convenience functions
+
+template <typename Func, typename T>
+inline cl_int
+getInfo(Func f, cl_uint name, T* param)
+{
+ return getInfoHelper(f, name, param, 0);
+}
+
+template <typename Func, typename Arg0>
+struct GetInfoFunctor0
+{
+ Func f_; const Arg0& arg0_;
+ cl_int operator ()(
+ cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
+ { return f_(arg0_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename Arg1>
+struct GetInfoFunctor1
+{
+ Func f_; const Arg0& arg0_; const Arg1& arg1_;
+ cl_int operator ()(
+ cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
+ { return f_(arg0_, arg1_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, cl_uint name, T* param)
+{
+ GetInfoFunctor0<Func, Arg0> f0 = { f, arg0 };
+ return getInfoHelper(f0, name, param, 0);
+}
+
+template <typename Func, typename Arg0, typename Arg1, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)
+{
+ GetInfoFunctor1<Func, Arg0, Arg1> f0 = { f, arg0, arg1 };
+ return getInfoHelper(f0, name, param, 0);
+}
+
+template<typename T>
+struct ReferenceHandler
+{ };
+
+#if defined(CL_VERSION_1_2)
+/**
+ * OpenCL 1.2 devices do have retain/release.
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+ /**
+ * Retain the device.
+ * \param device A valid device created using createSubDevices
+ * \return
+ * CL_SUCCESS if the function executed successfully.
+ * CL_INVALID_DEVICE if device was not a valid subdevice
+ * CL_OUT_OF_RESOURCES
+ * CL_OUT_OF_HOST_MEMORY
+ */
+ static cl_int retain(cl_device_id device)
+ { return ::clRetainDevice(device); }
+ /**
+ * Retain the device.
+ * \param device A valid device created using createSubDevices
+ * \return
+ * CL_SUCCESS if the function executed successfully.
+ * CL_INVALID_DEVICE if device was not a valid subdevice
+ * CL_OUT_OF_RESOURCES
+ * CL_OUT_OF_HOST_MEMORY
+ */
+ static cl_int release(cl_device_id device)
+ { return ::clReleaseDevice(device); }
+};
+#else // #if defined(CL_VERSION_1_2)
+/**
+ * OpenCL 1.1 devices do not have retain/release.
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+ // cl_device_id does not have retain().
+ static cl_int retain(cl_device_id)
+ { return CL_SUCCESS; }
+ // cl_device_id does not have release().
+ static cl_int release(cl_device_id)
+ { return CL_SUCCESS; }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+template <>
+struct ReferenceHandler<cl_platform_id>
+{
+ // cl_platform_id does not have retain().
+ static cl_int retain(cl_platform_id)
+ { return CL_SUCCESS; }
+ // cl_platform_id does not have release().
+ static cl_int release(cl_platform_id)
+ { return CL_SUCCESS; }
+};
+
+template <>
+struct ReferenceHandler<cl_context>
+{
+ static cl_int retain(cl_context context)
+ { return ::clRetainContext(context); }
+ static cl_int release(cl_context context)
+ { return ::clReleaseContext(context); }
+};
+
+template <>
+struct ReferenceHandler<cl_command_queue>
+{
+ static cl_int retain(cl_command_queue queue)
+ { return ::clRetainCommandQueue(queue); }
+ static cl_int release(cl_command_queue queue)
+ { return ::clReleaseCommandQueue(queue); }
+};
+
+template <>
+struct ReferenceHandler<cl_mem>
+{
+ static cl_int retain(cl_mem memory)
+ { return ::clRetainMemObject(memory); }
+ static cl_int release(cl_mem memory)
+ { return ::clReleaseMemObject(memory); }
+};
+
+template <>
+struct ReferenceHandler<cl_sampler>
+{
+ static cl_int retain(cl_sampler sampler)
+ { return ::clRetainSampler(sampler); }
+ static cl_int release(cl_sampler sampler)
+ { return ::clReleaseSampler(sampler); }
+};
+
+template <>
+struct ReferenceHandler<cl_program>
+{
+ static cl_int retain(cl_program program)
+ { return ::clRetainProgram(program); }
+ static cl_int release(cl_program program)
+ { return ::clReleaseProgram(program); }
+};
+
+template <>
+struct ReferenceHandler<cl_kernel>
+{
+ static cl_int retain(cl_kernel kernel)
+ { return ::clRetainKernel(kernel); }
+ static cl_int release(cl_kernel kernel)
+ { return ::clReleaseKernel(kernel); }
+};
+
+template <>
+struct ReferenceHandler<cl_event>
+{
+ static cl_int retain(cl_event event)
+ { return ::clRetainEvent(event); }
+ static cl_int release(cl_event event)
+ { return ::clReleaseEvent(event); }
+};
+
+
+// Extracts version number with major in the upper 16 bits, minor in the lower 16
+static cl_uint getVersion(const char *versionInfo)
+{
+ int highVersion = 0;
+ int lowVersion = 0;
+ int index = 7;
+ while(versionInfo[index] != '.' ) {
+ highVersion *= 10;
+ highVersion += versionInfo[index]-'0';
+ ++index;
+ }
+ ++index;
+ while(versionInfo[index] != ' ' ) {
+ lowVersion *= 10;
+ lowVersion += versionInfo[index]-'0';
+ ++index;
+ }
+ return (highVersion << 16) | lowVersion;
+}
+
+static cl_uint getPlatformVersion(cl_platform_id platform)
+{
+ ::size_t size = 0;
+ clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size);
+ char *versionInfo = (char *) alloca(size);
+ clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size);
+ return getVersion(versionInfo);
+}
+
+static cl_uint getDevicePlatformVersion(cl_device_id device)
+{
+ cl_platform_id platform;
+ clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
+ return getPlatformVersion(platform);
+}
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+static cl_uint getContextPlatformVersion(cl_context context)
+{
+ // The platform cannot be queried directly, so we first have to grab a
+ // device and obtain its context
+ ::size_t size = 0;
+ clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
+ if (size == 0)
+ return 0;
+ cl_device_id *devices = (cl_device_id *) alloca(size);
+ clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL);
+ return getDevicePlatformVersion(devices[0]);
+}
+#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+template <typename T>
+class Wrapper
+{
+public:
+ typedef T cl_type;
+
+protected:
+ cl_type object_;
+
+public:
+ Wrapper() : object_(NULL) { }
+
+ Wrapper(const cl_type &obj) : object_(obj) { }
+
+ ~Wrapper()
+ {
+ if (object_ != NULL) { release(); }
+ }
+
+ Wrapper(const Wrapper<cl_type>& rhs)
+ {
+ object_ = rhs.object_;
+ if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+ }
+
+ Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+ {
+ if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+ object_ = rhs.object_;
+ if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+ return *this;
+ }
+
+ Wrapper<cl_type>& operator = (const cl_type &rhs)
+ {
+ if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+ object_ = rhs;
+ return *this;
+ }
+
+ cl_type operator ()() const { return object_; }
+
+ cl_type& operator ()() { return object_; }
+
+protected:
+ template<typename Func, typename U>
+ friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+ cl_int retain() const
+ {
+ return ReferenceHandler<cl_type>::retain(object_);
+ }
+
+ cl_int release() const
+ {
+ return ReferenceHandler<cl_type>::release(object_);
+ }
+};
+
+template <>
+class Wrapper<cl_device_id>
+{
+public:
+ typedef cl_device_id cl_type;
+
+protected:
+ cl_type object_;
+ bool referenceCountable_;
+
+ static bool isReferenceCountable(cl_device_id device)
+ {
+ bool retVal = false;
+ if (device != NULL) {
+ int version = getDevicePlatformVersion(device);
+ if(version > ((1 << 16) + 1)) {
+ retVal = true;
+ }
+ }
+ return retVal;
+ }
+
+public:
+ Wrapper() : object_(NULL), referenceCountable_(false)
+ {
+ }
+
+ Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false)
+ {
+ referenceCountable_ = isReferenceCountable(obj);
+ }
+
+ ~Wrapper()
+ {
+ if (object_ != NULL) { release(); }
+ }
+
+ Wrapper(const Wrapper<cl_type>& rhs)
+ {
+ object_ = rhs.object_;
+ referenceCountable_ = isReferenceCountable(object_);
+ if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+ }
+
+ Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+ {
+ if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+ object_ = rhs.object_;
+ referenceCountable_ = rhs.referenceCountable_;
+ if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+ return *this;
+ }
+
+ Wrapper<cl_type>& operator = (const cl_type &rhs)
+ {
+ if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+ object_ = rhs;
+ referenceCountable_ = isReferenceCountable(object_);
+ return *this;
+ }
+
+ cl_type operator ()() const { return object_; }
+
+ cl_type& operator ()() { return object_; }
+
+protected:
+ template<typename Func, typename U>
+ friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+ template<typename Func, typename U>
+ friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS<U>*, int, typename U::cl_type);
+
+ cl_int retain() const
+ {
+ if( referenceCountable_ ) {
+ return ReferenceHandler<cl_type>::retain(object_);
+ }
+ else {
+ return CL_SUCCESS;
+ }
+ }
+
+ cl_int release() const
+ {
+ if( referenceCountable_ ) {
+ return ReferenceHandler<cl_type>::release(object_);
+ }
+ else {
+ return CL_SUCCESS;
+ }
+ }
+};
+
+} // namespace detail
+//! \endcond
+
+/*! \stuct ImageFormat
+ * \brief Adds constructors and member functions for cl_image_format.
+ *
+ * \see cl_image_format
+ */
+struct ImageFormat : public cl_image_format
+{
+ //! \brief Default constructor - performs no initialization.
+ ImageFormat(){}
+
+ //! \brief Initializing constructor.
+ ImageFormat(cl_channel_order order, cl_channel_type type)
+ {
+ image_channel_order = order;
+ image_channel_data_type = type;
+ }
+
+ //! \brief Assignment operator.
+ ImageFormat& operator = (const ImageFormat& rhs)
+ {
+ if (this != &rhs) {
+ this->image_channel_data_type = rhs.image_channel_data_type;
+ this->image_channel_order = rhs.image_channel_order;
+ }
+ return *this;
+ }
+};
+
+/*! \brief Class interface for cl_device_id.
+ *
+ * \note Copies of these objects are inexpensive, since they don't 'own'
+ * any underlying resources or data structures.
+ *
+ * \see cl_device_id
+ */
+class Device : public detail::Wrapper<cl_device_id>
+{
+public:
+ //! \brief Default constructor - initializes to NULL.
+ Device() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Copy constructor.
+ *
+ * This simply copies the device ID value, which is an inexpensive operation.
+ */
+ Device(const Device& device) : detail::Wrapper<cl_type>(device) { }
+
+ /*! \brief Constructor from cl_device_id.
+ *
+ * This simply copies the device ID value, which is an inexpensive operation.
+ */
+ Device(const cl_device_id &device) : detail::Wrapper<cl_type>(device) { }
+
+ /*! \brief Returns the first device on the default context.
+ *
+ * \see Context::getDefault()
+ */
+ static Device getDefault(cl_int * err = NULL);
+
+ /*! \brief Assignment operator from Device.
+ *
+ * This simply copies the device ID value, which is an inexpensive operation.
+ */
+ Device& operator = (const Device& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_device_id.
+ *
+ * This simply copies the device ID value, which is an inexpensive operation.
+ */
+ Device& operator = (const cl_device_id& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetDeviceInfo().
+ template <typename T>
+ cl_int getInfo(cl_device_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetDeviceInfo, object_, name, param),
+ __GET_DEVICE_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetDeviceInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_device_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_device_info, name>::param_type param;
+ cl_int result = getInfo(name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /**
+ * CL 1.2 version
+ */
+#if defined(CL_VERSION_1_2)
+ //! \brief Wrapper for clCreateSubDevicesEXT().
+ cl_int createSubDevices(
+ const cl_device_partition_property * properties,
+ VECTOR_CLASS<Device>* devices)
+ {
+ cl_uint n = 0;
+ cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_SUB_DEVICES);
+ }
+
+ cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+ err = clCreateSubDevices(object_, properties, n, ids, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_SUB_DEVICES);
+ }
+
+ devices->assign(&ids[0], &ids[n]);
+ return CL_SUCCESS;
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+/**
+ * CL 1.1 version that uses device fission.
+ */
+#if defined(CL_VERSION_1_1)
+#if defined(USE_CL_DEVICE_FISSION)
+ cl_int createSubDevices(
+ const cl_device_partition_property_ext * properties,
+ VECTOR_CLASS<Device>* devices)
+ {
+ typedef CL_API_ENTRY cl_int
+ ( CL_API_CALL * PFN_clCreateSubDevicesEXT)(
+ cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
+ __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT);
+
+ cl_uint n = 0;
+ cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_SUB_DEVICES);
+ }
+
+ cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+ err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_SUB_DEVICES);
+ }
+
+ devices->assign(&ids[0], &ids[n]);
+ return CL_SUCCESS;
+ }
+#endif // #if defined(USE_CL_DEVICE_FISSION)
+#endif // #if defined(CL_VERSION_1_1)
+};
+
+/*! \brief Class interface for cl_platform_id.
+ *
+ * \note Copies of these objects are inexpensive, since they don't 'own'
+ * any underlying resources or data structures.
+ *
+ * \see cl_platform_id
+ */
+class Platform : public detail::Wrapper<cl_platform_id>
+{
+public:
+ //! \brief Default constructor - initializes to NULL.
+ Platform() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Copy constructor.
+ *
+ * This simply copies the platform ID value, which is an inexpensive operation.
+ */
+ Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { }
+
+ /*! \brief Constructor from cl_platform_id.
+ *
+ * This simply copies the platform ID value, which is an inexpensive operation.
+ */
+ Platform(const cl_platform_id &platform) : detail::Wrapper<cl_type>(platform) { }
+
+ /*! \brief Assignment operator from Platform.
+ *
+ * This simply copies the platform ID value, which is an inexpensive operation.
+ */
+ Platform& operator = (const Platform& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_platform_id.
+ *
+ * This simply copies the platform ID value, which is an inexpensive operation.
+ */
+ Platform& operator = (const cl_platform_id& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetPlatformInfo().
+ cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetPlatformInfo, object_, name, param),
+ __GET_PLATFORM_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetPlatformInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_platform_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_platform_info, name>::param_type param;
+ cl_int result = getInfo(name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /*! \brief Gets a list of devices for this platform.
+ *
+ * Wraps clGetDeviceIDs().
+ */
+ cl_int getDevices(
+ cl_device_type type,
+ VECTOR_CLASS<Device>* devices) const
+ {
+ cl_uint n = 0;
+ if( devices == NULL ) {
+ return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
+ }
+ cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
+
+ cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+ err = ::clGetDeviceIDs(object_, type, n, ids, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
+
+ devices->assign(&ids[0], &ids[n]);
+ return CL_SUCCESS;
+ }
+
+#if defined(USE_DX_INTEROP)
+ /*! \brief Get the list of available D3D10 devices.
+ *
+ * \param d3d_device_source.
+ *
+ * \param d3d_object.
+ *
+ * \param d3d_device_set.
+ *
+ * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device
+ * values returned in devices can be used to identify a specific OpenCL
+ * device. If \a devices argument is NULL, this argument is ignored.
+ *
+ * \return One of the following values:
+ * - CL_SUCCESS if the function is executed successfully.
+ *
+ * The application can query specific capabilities of the OpenCL device(s)
+ * returned by cl::getDevices. This can be used by the application to
+ * determine which device(s) to use.
+ *
+ * \note In the case that exceptions are enabled and a return value
+ * other than CL_SUCCESS is generated, then cl::Error exception is
+ * generated.
+ */
+ cl_int getDevices(
+ cl_d3d10_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d10_device_set_khr d3d_device_set,
+ VECTOR_CLASS<Device>* devices) const
+ {
+ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)(
+ cl_platform_id platform,
+ cl_d3d10_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d10_device_set_khr d3d_device_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint* num_devices);
+
+ if( devices == NULL ) {
+ return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
+ }
+
+ static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL;
+ __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR);
+
+ cl_uint n = 0;
+ cl_int err = pfn_clGetDeviceIDsFromD3D10KHR(
+ object_,
+ d3d_device_source,
+ d3d_object,
+ d3d_device_set,
+ 0,
+ NULL,
+ &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
+
+ cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+ err = pfn_clGetDeviceIDsFromD3D10KHR(
+ object_,
+ d3d_device_source,
+ d3d_object,
+ d3d_device_set,
+ n,
+ ids,
+ NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
+
+ devices->assign(&ids[0], &ids[n]);
+ return CL_SUCCESS;
+ }
+#endif
+
+ /*! \brief Gets a list of available platforms.
+ *
+ * Wraps clGetPlatformIDs().
+ */
+ static cl_int get(
+ VECTOR_CLASS<Platform>* platforms)
+ {
+ cl_uint n = 0;
+
+ if( platforms == NULL ) {
+ return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
+ }
+
+ cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ }
+
+ cl_platform_id* ids = (cl_platform_id*) alloca(
+ n * sizeof(cl_platform_id));
+ err = ::clGetPlatformIDs(n, ids, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ }
+
+ platforms->assign(&ids[0], &ids[n]);
+ return CL_SUCCESS;
+ }
+
+ /*! \brief Gets the first available platform.
+ *
+ * Wraps clGetPlatformIDs(), returning the first result.
+ */
+ static cl_int get(
+ Platform * platform)
+ {
+ cl_uint n = 0;
+
+ if( platform == NULL ) {
+ return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
+ }
+
+ cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ }
+
+ cl_platform_id* ids = (cl_platform_id*) alloca(
+ n * sizeof(cl_platform_id));
+ err = ::clGetPlatformIDs(n, ids, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ }
+
+ *platform = ids[0];
+ return CL_SUCCESS;
+ }
+
+ /*! \brief Gets the first available platform, returning it by value.
+ *
+ * Wraps clGetPlatformIDs(), returning the first result.
+ */
+ static Platform get(
+ cl_int * errResult = NULL)
+ {
+ Platform platform;
+ cl_uint n = 0;
+ cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ if (errResult != NULL) {
+ *errResult = err;
+ }
+ }
+
+ cl_platform_id* ids = (cl_platform_id*) alloca(
+ n * sizeof(cl_platform_id));
+ err = ::clGetPlatformIDs(n, ids, NULL);
+
+ if (err != CL_SUCCESS) {
+ detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ }
+
+ if (errResult != NULL) {
+ *errResult = err;
+ }
+
+ return ids[0];
+ }
+
+ static Platform getDefault(
+ cl_int *errResult = NULL )
+ {
+ return get(errResult);
+ }
+
+
+#if defined(CL_VERSION_1_2)
+ //! \brief Wrapper for clUnloadCompiler().
+ cl_int
+ unloadCompiler()
+ {
+ return ::clUnloadPlatformCompiler(object_);
+ }
+#endif // #if defined(CL_VERSION_1_2)
+}; // class Platform
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+/**
+ * Unload the OpenCL compiler.
+ * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead.
+ */
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int
+UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline cl_int
+UnloadCompiler()
+{
+ return ::clUnloadCompiler();
+}
+#endif // #if defined(CL_VERSION_1_1)
+
+/*! \brief Class interface for cl_context.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_context as the original. For details, see
+ * clRetainContext() and clReleaseContext().
+ *
+ * \see cl_context
+ */
+class Context
+ : public detail::Wrapper<cl_context>
+{
+private:
+ static volatile int default_initialized_;
+ static Context default_;
+ static volatile cl_int default_error_;
+public:
+ /*! \brief Destructor.
+ *
+ * This calls clReleaseContext() on the value held by this instance.
+ */
+ ~Context() { }
+
+ /*! \brief Constructs a context including a list of specified devices.
+ *
+ * Wraps clCreateContext().
+ */
+ Context(
+ const VECTOR_CLASS<Device>& devices,
+ cl_context_properties* properties = NULL,
+ void (CL_CALLBACK * notifyFptr)(
+ const char *,
+ const void *,
+ ::size_t,
+ void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ ::size_t numDevices = devices.size();
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ object_ = ::clCreateContext(
+ properties, (cl_uint) numDevices,
+ deviceIDs,
+ notifyFptr, data, &error);
+
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Context(
+ const Device& device,
+ cl_context_properties* properties = NULL,
+ void (CL_CALLBACK * notifyFptr)(
+ const char *,
+ const void *,
+ ::size_t,
+ void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ cl_device_id deviceID = device();
+
+ object_ = ::clCreateContext(
+ properties, 1,
+ &deviceID,
+ notifyFptr, data, &error);
+
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Constructs a context including all or a subset of devices of a specified type.
+ *
+ * Wraps clCreateContextFromType().
+ */
+ Context(
+ cl_device_type type,
+ cl_context_properties* properties = NULL,
+ void (CL_CALLBACK * notifyFptr)(
+ const char *,
+ const void *,
+ ::size_t,
+ void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+#if !defined(__APPLE__) || !defined(__MACOS)
+ cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 };
+
+ if (properties == NULL) {
+ // Get a valid platform ID as we cannot send in a blank one
+ VECTOR_CLASS<Platform> platforms;
+ error = Platform::get(&platforms);
+ if (error != CL_SUCCESS) {
+ detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return;
+ }
+
+ // Check the platforms we found for a device of our specified type
+ cl_context_properties platform_id = 0;
+ for (unsigned int i = 0; i < platforms.size(); i++) {
+
+ VECTOR_CLASS<Device> devices;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+ try {
+#endif
+
+ error = platforms[i].getDevices(type, &devices);
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+ } catch (Error) {}
+ // Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type
+ // We do error checking next anyway, and can throw there if needed
+#endif
+
+ // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND
+ if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) {
+ detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ if (devices.size() > 0) {
+ platform_id = (cl_context_properties)platforms[i]();
+ break;
+ }
+ }
+
+ if (platform_id == 0) {
+ detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR);
+ if (err != NULL) {
+ *err = CL_DEVICE_NOT_FOUND;
+ }
+ return;
+ }
+
+ prop[1] = platform_id;
+ properties = &prop[0];
+ }
+#endif
+ object_ = ::clCreateContextFromType(
+ properties, type, notifyFptr, data, &error);
+
+ detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT.
+ *
+ * \note All calls to this function return the same cl_context as the first.
+ */
+ static Context getDefault(cl_int * err = NULL)
+ {
+ int state = detail::compare_exchange(
+ &default_initialized_,
+ __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
+
+ if (state & __DEFAULT_INITIALIZED) {
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+ }
+
+ if (state & __DEFAULT_BEING_INITIALIZED) {
+ // Assume writes will propagate eventually...
+ while(default_initialized_ != __DEFAULT_INITIALIZED) {
+ detail::fence();
+ }
+
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+ }
+
+ cl_int error;
+ default_ = Context(
+ CL_DEVICE_TYPE_DEFAULT,
+ NULL,
+ NULL,
+ NULL,
+ &error);
+
+ detail::fence();
+
+ default_error_ = error;
+ // Assume writes will propagate eventually...
+ default_initialized_ = __DEFAULT_INITIALIZED;
+
+ detail::fence();
+
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Context() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Copy constructor.
+ *
+ * This calls clRetainContext() on the parameter's cl_context.
+ */
+ Context(const Context& context) : detail::Wrapper<cl_type>(context) { }
+
+ /*! \brief Constructor from cl_context - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the cl_context
+ * into the new Context object.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper<cl_type>(context) { }
+
+ /*! \brief Assignment operator from Context.
+ *
+ * This calls clRetainContext() on the parameter and clReleaseContext() on
+ * the previous value held by this instance.
+ */
+ Context& operator = (const Context& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_context - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseContext() on the value previously held by this instance.
+ */
+ Context& operator = (const cl_context& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetContextInfo().
+ template <typename T>
+ cl_int getInfo(cl_context_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetContextInfo, object_, name, param),
+ __GET_CONTEXT_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetContextInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_context_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_context_info, name>::param_type param;
+ cl_int result = getInfo(name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /*! \brief Gets a list of supported image formats.
+ *
+ * Wraps clGetSupportedImageFormats().
+ */
+ cl_int getSupportedImageFormats(
+ cl_mem_flags flags,
+ cl_mem_object_type type,
+ VECTOR_CLASS<ImageFormat>* formats) const
+ {
+ cl_uint numEntries;
+ cl_int err = ::clGetSupportedImageFormats(
+ object_,
+ flags,
+ type,
+ 0,
+ NULL,
+ &numEntries);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+ }
+
+ ImageFormat* value = (ImageFormat*)
+ alloca(numEntries * sizeof(ImageFormat));
+ err = ::clGetSupportedImageFormats(
+ object_,
+ flags,
+ type,
+ numEntries,
+ (cl_image_format*) value,
+ NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+ }
+
+ formats->assign(&value[0], &value[numEntries]);
+ return CL_SUCCESS;
+ }
+};
+
+inline Device Device::getDefault(cl_int * err)
+{
+ cl_int error;
+ Device device;
+
+ Context context = Context::getDefault(&error);
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+ if (error != CL_SUCCESS) {
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ else {
+ device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+ if (err != NULL) {
+ *err = CL_SUCCESS;
+ }
+ }
+
+ return device;
+}
+
+
+#ifdef _WIN32
+__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__declspec(selectany) Context Context::default_;
+__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS;
+#else
+__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__attribute__((weak)) Context Context::default_;
+__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS;
+#endif
+
+/*! \brief Class interface for cl_event.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_event as the original. For details, see
+ * clRetainEvent() and clReleaseEvent().
+ *
+ * \see cl_event
+ */
+class Event : public detail::Wrapper<cl_event>
+{
+public:
+ /*! \brief Destructor.
+ *
+ * This calls clReleaseEvent() on the value held by this instance.
+ */
+ ~Event() { }
+
+ //! \brief Default constructor - initializes to NULL.
+ Event() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Copy constructor.
+ *
+ * This calls clRetainEvent() on the parameter's cl_event.
+ */
+ Event(const Event& event) : detail::Wrapper<cl_type>(event) { }
+
+ /*! \brief Constructor from cl_event - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the cl_event
+ * into the new Event object.
+ */
+ Event(const cl_event& event) : detail::Wrapper<cl_type>(event) { }
+
+ /*! \brief Assignment operator from cl_event - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseEvent() on the value previously held by this instance.
+ */
+ Event& operator = (const Event& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_event.
+ *
+ * This calls clRetainEvent() on the parameter and clReleaseEvent() on
+ * the previous value held by this instance.
+ */
+ Event& operator = (const cl_event& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetEventInfo().
+ template <typename T>
+ cl_int getInfo(cl_event_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetEventInfo, object_, name, param),
+ __GET_EVENT_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetEventInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_event_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_event_info, name>::param_type param;
+ cl_int result = getInfo(name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ //! \brief Wrapper for clGetEventProfilingInfo().
+ template <typename T>
+ cl_int getProfilingInfo(cl_profiling_info name, T* param) const
+ {
+ return detail::errHandler(detail::getInfo(
+ &::clGetEventProfilingInfo, object_, name, param),
+ __GET_EVENT_PROFILE_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetEventProfilingInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_profiling_info, name>::param_type
+ getProfilingInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_profiling_info, name>::param_type param;
+ cl_int result = getProfilingInfo(name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /*! \brief Blocks the calling thread until this event completes.
+ *
+ * Wraps clWaitForEvents().
+ */
+ cl_int wait() const
+ {
+ return detail::errHandler(
+ ::clWaitForEvents(1, &object_),
+ __WAIT_FOR_EVENTS_ERR);
+ }
+
+#if defined(CL_VERSION_1_1)
+ /*! \brief Registers a user callback function for a specific command execution status.
+ *
+ * Wraps clSetEventCallback().
+ */
+ cl_int setCallback(
+ cl_int type,
+ void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),
+ void * user_data = NULL)
+ {
+ return detail::errHandler(
+ ::clSetEventCallback(
+ object_,
+ type,
+ pfn_notify,
+ user_data),
+ __SET_EVENT_CALLBACK_ERR);
+ }
+#endif
+
+ /*! \brief Blocks the calling thread until every event specified is complete.
+ *
+ * Wraps clWaitForEvents().
+ */
+ static cl_int
+ waitForEvents(const VECTOR_CLASS<Event>& events)
+ {
+ return detail::errHandler(
+ ::clWaitForEvents(
+ (cl_uint) events.size(), (cl_event*)&events.front()),
+ __WAIT_FOR_EVENTS_ERR);
+ }
+};
+
+#if defined(CL_VERSION_1_1)
+/*! \brief Class interface for user events (a subset of cl_event's).
+ *
+ * See Event for details about copy semantics, etc.
+ */
+class UserEvent : public Event
+{
+public:
+ /*! \brief Constructs a user event on a given context.
+ *
+ * Wraps clCreateUserEvent().
+ */
+ UserEvent(
+ const Context& context,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateUserEvent(
+ context(),
+ &error);
+
+ detail::errHandler(error, __CREATE_USER_EVENT_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ UserEvent() : Event() { }
+
+ //! \brief Copy constructor - performs shallow copy.
+ UserEvent(const UserEvent& event) : Event(event) { }
+
+ //! \brief Assignment Operator - performs shallow copy.
+ UserEvent& operator = (const UserEvent& rhs)
+ {
+ if (this != &rhs) {
+ Event::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Sets the execution status of a user event object.
+ *
+ * Wraps clSetUserEventStatus().
+ */
+ cl_int setStatus(cl_int status)
+ {
+ return detail::errHandler(
+ ::clSetUserEventStatus(object_,status),
+ __SET_USER_EVENT_STATUS_ERR);
+ }
+};
+#endif
+
+/*! \brief Blocks the calling thread until every event specified is complete.
+ *
+ * Wraps clWaitForEvents().
+ */
+inline static cl_int
+WaitForEvents(const VECTOR_CLASS<Event>& events)
+{
+ return detail::errHandler(
+ ::clWaitForEvents(
+ (cl_uint) events.size(), (cl_event*)&events.front()),
+ __WAIT_FOR_EVENTS_ERR);
+}
+
+/*! \brief Class interface for cl_mem.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_mem as the original. For details, see
+ * clRetainMemObject() and clReleaseMemObject().
+ *
+ * \see cl_mem
+ */
+class Memory : public detail::Wrapper<cl_mem>
+{
+public:
+
+ /*! \brief Destructor.
+ *
+ * This calls clReleaseMemObject() on the value held by this instance.
+ */
+ ~Memory() {}
+
+ //! \brief Default constructor - initializes to NULL.
+ Memory() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * This calls clRetainMemObject() on the parameter's cl_mem.
+ */
+ Memory(const Memory& memory) : detail::Wrapper<cl_type>(memory) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the cl_mem
+ * into the new Memory object.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper<cl_type>(memory) { }
+
+ /*! \brief Assignment operator from Memory.
+ *
+ * This calls clRetainMemObject() on the parameter and clReleaseMemObject()
+ * on the previous value held by this instance.
+ */
+ Memory& operator = (const Memory& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_mem - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseMemObject() on the value previously held by this instance.
+ */
+ Memory& operator = (const cl_mem& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetMemObjectInfo().
+ template <typename T>
+ cl_int getInfo(cl_mem_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetMemObjectInfo, object_, name, param),
+ __GET_MEM_OBJECT_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetMemObjectInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_mem_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_mem_info, name>::param_type param;
+ cl_int result = getInfo(name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+#if defined(CL_VERSION_1_1)
+ /*! \brief Registers a callback function to be called when the memory object
+ * is no longer needed.
+ *
+ * Wraps clSetMemObjectDestructorCallback().
+ *
+ * Repeated calls to this function, for a given cl_mem value, will append
+ * to the list of functions called (in reverse order) when memory object's
+ * resources are freed and the memory object is deleted.
+ *
+ * \note
+ * The registered callbacks are associated with the underlying cl_mem
+ * value - not the Memory class instance.
+ */
+ cl_int setDestructorCallback(
+ void (CL_CALLBACK * pfn_notify)(cl_mem, void *),
+ void * user_data = NULL)
+ {
+ return detail::errHandler(
+ ::clSetMemObjectDestructorCallback(
+ object_,
+ pfn_notify,
+ user_data),
+ __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR);
+ }
+#endif
+
+};
+
+// Pre-declare copy functions
+class Buffer;
+template< typename IteratorType >
+cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
+template< typename IteratorType >
+cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
+template< typename IteratorType >
+cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
+template< typename IteratorType >
+cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
+
+
+/*! \brief Class interface for Buffer Memory Objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Buffer : public Memory
+{
+public:
+
+ /*! \brief Constructs a Buffer in a specified context.
+ *
+ * Wraps clCreateBuffer().
+ *
+ * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+ * specified. Note alignment & exclusivity requirements.
+ */
+ Buffer(
+ const Context& context,
+ cl_mem_flags flags,
+ ::size_t size,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Constructs a Buffer in the default context.
+ *
+ * Wraps clCreateBuffer().
+ *
+ * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+ * specified. Note alignment & exclusivity requirements.
+ *
+ * \see Context::getDefault()
+ */
+ Buffer(
+ cl_mem_flags flags,
+ ::size_t size,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ Context context = Context::getDefault(err);
+
+ object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*!
+ * \brief Construct a Buffer from a host container via iterators.
+ * IteratorType must be random access.
+ * If useHostPtr is specified iterators must represent contiguous data.
+ */
+ template< typename IteratorType >
+ Buffer(
+ IteratorType startIterator,
+ IteratorType endIterator,
+ bool readOnly,
+ bool useHostPtr = false,
+ cl_int* err = NULL)
+ {
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ cl_mem_flags flags = 0;
+ if( readOnly ) {
+ flags |= CL_MEM_READ_ONLY;
+ }
+ else {
+ flags |= CL_MEM_READ_WRITE;
+ }
+ if( useHostPtr ) {
+ flags |= CL_MEM_USE_HOST_PTR;
+ }
+
+ ::size_t size = sizeof(DataType)*(endIterator - startIterator);
+
+ Context context = Context::getDefault(err);
+
+ if( useHostPtr ) {
+ object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+ } else {
+ object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+ }
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ if( !useHostPtr ) {
+ error = cl::copy(startIterator, endIterator, *this);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ }
+
+ /*!
+ * \brief Construct a Buffer from a host container via iterators using a specified context.
+ * IteratorType must be random access.
+ * If useHostPtr is specified iterators must represent contiguous data.
+ */
+ template< typename IteratorType >
+ Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator,
+ bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
+
+ //! \brief Default constructor - initializes to NULL.
+ Buffer() : Memory() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Buffer(const Buffer& buffer) : Memory(buffer) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { }
+
+ /*! \brief Assignment from Buffer - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Buffer& operator = (const Buffer& rhs)
+ {
+ if (this != &rhs) {
+ Memory::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Buffer& operator = (const cl_mem& rhs)
+ {
+ Memory::operator=(rhs);
+ return *this;
+ }
+
+#if defined(CL_VERSION_1_1)
+ /*! \brief Creates a new buffer object from this.
+ *
+ * Wraps clCreateSubBuffer().
+ */
+ Buffer createSubBuffer(
+ cl_mem_flags flags,
+ cl_buffer_create_type buffer_create_type,
+ const void * buffer_create_info,
+ cl_int * err = NULL)
+ {
+ Buffer result;
+ cl_int error;
+ result.object_ = ::clCreateSubBuffer(
+ object_,
+ flags,
+ buffer_create_type,
+ buffer_create_info,
+ &error);
+
+ detail::errHandler(error, __CREATE_SUBBUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ return result;
+ }
+#endif
+};
+
+#if defined (USE_DX_INTEROP)
+/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's.
+ *
+ * This is provided to facilitate interoperability with Direct3D.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class BufferD3D10 : public Buffer
+{
+public:
+ typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(
+ cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer,
+ cl_int* errcode_ret);
+
+ /*! \brief Constructs a BufferD3D10, in a specified context, from a
+ * given ID3D10Buffer.
+ *
+ * Wraps clCreateFromD3D10BufferKHR().
+ */
+ BufferD3D10(
+ const Context& context,
+ cl_mem_flags flags,
+ ID3D10Buffer* bufobj,
+ cl_int * err = NULL)
+ {
+ static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL;
+
+#if defined(CL_VERSION_1_2)
+ vector<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();
+ cl_platform platform = -1;
+ for( int i = 0; i < props.size(); ++i ) {
+ if( props[i] == CL_CONTEXT_PLATFORM ) {
+ platform = props[i+1];
+ }
+ }
+ __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR);
+#endif
+#if defined(CL_VERSION_1_1)
+ __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR);
+#endif
+
+ cl_int error;
+ object_ = pfn_clCreateFromD3D10BufferKHR(
+ context(),
+ flags,
+ bufobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ BufferD3D10() : Buffer() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { }
+
+ /*! \brief Assignment from BufferD3D10 - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferD3D10& operator = (const BufferD3D10& rhs)
+ {
+ if (this != &rhs) {
+ Buffer::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferD3D10& operator = (const cl_mem& rhs)
+ {
+ Buffer::operator=(rhs);
+ return *this;
+ }
+};
+#endif
+
+/*! \brief Class interface for GL Buffer Memory Objects.
+ *
+ * This is provided to facilitate interoperability with OpenGL.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class BufferGL : public Buffer
+{
+public:
+ /*! \brief Constructs a BufferGL in a specified context, from a given
+ * GL buffer.
+ *
+ * Wraps clCreateFromGLBuffer().
+ */
+ BufferGL(
+ const Context& context,
+ cl_mem_flags flags,
+ GLuint bufobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLBuffer(
+ context(),
+ flags,
+ bufobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ BufferGL() : Buffer() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferGL(const BufferGL& buffer) : Buffer(buffer) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { }
+
+ /*! \brief Assignment from BufferGL - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferGL& operator = (const BufferGL& rhs)
+ {
+ if (this != &rhs) {
+ Buffer::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferGL& operator = (const cl_mem& rhs)
+ {
+ Buffer::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetGLObjectInfo().
+ cl_int getObjectInfo(
+ cl_gl_object_type *type,
+ GLuint * gl_object_name)
+ {
+ return detail::errHandler(
+ ::clGetGLObjectInfo(object_,type,gl_object_name),
+ __GET_GL_OBJECT_INFO_ERR);
+ }
+};
+
+/*! \brief Class interface for GL Render Buffer Memory Objects.
+ *
+ * This is provided to facilitate interoperability with OpenGL.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class BufferRenderGL : public Buffer
+{
+public:
+ /*! \brief Constructs a BufferRenderGL in a specified context, from a given
+ * GL Renderbuffer.
+ *
+ * Wraps clCreateFromGLRenderbuffer().
+ */
+ BufferRenderGL(
+ const Context& context,
+ cl_mem_flags flags,
+ GLuint bufobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLRenderbuffer(
+ context(),
+ flags,
+ bufobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ BufferRenderGL() : Buffer() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { }
+
+ /*! \brief Assignment from BufferGL - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferRenderGL& operator = (const BufferRenderGL& rhs)
+ {
+ if (this != &rhs) {
+ Buffer::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferRenderGL& operator = (const cl_mem& rhs)
+ {
+ Buffer::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetGLObjectInfo().
+ cl_int getObjectInfo(
+ cl_gl_object_type *type,
+ GLuint * gl_object_name)
+ {
+ return detail::errHandler(
+ ::clGetGLObjectInfo(object_,type,gl_object_name),
+ __GET_GL_OBJECT_INFO_ERR);
+ }
+};
+
+/*! \brief C++ base class for Image Memory objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image : public Memory
+{
+protected:
+ //! \brief Default constructor - initializes to NULL.
+ Image() : Memory() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image(const Image& image) : Memory(image) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { }
+
+ /*! \brief Assignment from Image - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image& operator = (const Image& rhs)
+ {
+ if (this != &rhs) {
+ Memory::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image& operator = (const cl_mem& rhs)
+ {
+ Memory::operator=(rhs);
+ return *this;
+ }
+
+public:
+ //! \brief Wrapper for clGetImageInfo().
+ template <typename T>
+ cl_int getImageInfo(cl_image_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetImageInfo, object_, name, param),
+ __GET_IMAGE_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetImageInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_image_info, name>::param_type
+ getImageInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_image_info, name>::param_type param;
+ cl_int result = getImageInfo(name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+};
+
+#if defined(CL_VERSION_1_2)
+/*! \brief Class interface for 1D Image Memory objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image1D : public Image
+{
+public:
+ /*! \brief Constructs a 1D Image in a specified context.
+ *
+ * Wraps clCreateImage().
+ */
+ Image1D(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ ::size_t width,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE1D,
+ width,
+ 0, 0, 0, 0, 0, 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image1D() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image1D(const Image1D& image1D) : Image(image1D) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { }
+
+ /*! \brief Assignment from Image1D - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image1D& operator = (const Image1D& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image1D& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+
+/*! \class Image1DBuffer
+ * \brief Image interface for 1D buffer images.
+ */
+class Image1DBuffer : public Image
+{
+public:
+ Image1DBuffer(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ ::size_t width,
+ const Buffer &buffer,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE1D_BUFFER,
+ width,
+ 0, 0, 0, 0, 0, 0, 0,
+ buffer()
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ NULL,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Image1DBuffer() { }
+
+ Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { }
+
+ __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { }
+
+ Image1DBuffer& operator = (const Image1DBuffer& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ Image1DBuffer& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+
+/*! \class Image1DArray
+ * \brief Image interface for arrays of 1D images.
+ */
+class Image1DArray : public Image
+{
+public:
+ Image1DArray(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ ::size_t arraySize,
+ ::size_t width,
+ ::size_t rowPitch,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE1D_ARRAY,
+ width,
+ 0, 0, // height, depth (unused)
+ arraySize,
+ rowPitch,
+ 0, 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Image1DArray() { }
+
+ Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { }
+
+ __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { }
+
+ Image1DArray& operator = (const Image1DArray& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ Image1DArray& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+
+/*! \brief Class interface for 2D Image Memory objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image2D : public Image
+{
+public:
+ /*! \brief Constructs a 1D Image in a specified context.
+ *
+ * Wraps clCreateImage().
+ */
+ Image2D(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ ::size_t width,
+ ::size_t height,
+ ::size_t row_pitch = 0,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ bool useCreateImage;
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ // Run-time decision based on the actual platform
+ {
+ cl_uint version = detail::getContextPlatformVersion(context());
+ useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
+ }
+#elif defined(CL_VERSION_1_2)
+ useCreateImage = true;
+#else
+ useCreateImage = false;
+#endif
+
+#if defined(CL_VERSION_1_2)
+ if (useCreateImage)
+ {
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE2D,
+ width,
+ height,
+ 0, 0, // depth, array size (unused)
+ row_pitch,
+ 0, 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // #if defined(CL_VERSION_1_2)
+#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ if (!useCreateImage)
+ {
+ object_ = ::clCreateImage2D(
+ context(), flags,&format, width, height, row_pitch, host_ptr, &error);
+
+ detail::errHandler(error, __CREATE_IMAGE2D_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image2D() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2D(const Image2D& image2D) : Image(image2D) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { }
+
+ /*! \brief Assignment from Image2D - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2D& operator = (const Image2D& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2D& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+
+
+#if !defined(CL_VERSION_1_2)
+/*! \brief Class interface for GL 2D Image Memory objects.
+ *
+ * This is provided to facilitate interoperability with OpenGL.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ * \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
+ */
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D
+{
+public:
+ /*! \brief Constructs an Image2DGL in a specified context, from a given
+ * GL Texture.
+ *
+ * Wraps clCreateFromGLTexture2D().
+ */
+ Image2DGL(
+ const Context& context,
+ cl_mem_flags flags,
+ GLenum target,
+ GLint miplevel,
+ GLuint texobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLTexture2D(
+ context(),
+ flags,
+ target,
+ miplevel,
+ texobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image2DGL() : Image2D() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2DGL(const Image2DGL& image) : Image2D(image) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { }
+
+ /*! \brief Assignment from Image2DGL - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2DGL& operator = (const Image2DGL& rhs)
+ {
+ if (this != &rhs) {
+ Image2D::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2DGL& operator = (const cl_mem& rhs)
+ {
+ Image2D::operator=(rhs);
+ return *this;
+ }
+};
+#endif // #if !defined(CL_VERSION_1_2)
+
+#if defined(CL_VERSION_1_2)
+/*! \class Image2DArray
+ * \brief Image interface for arrays of 2D images.
+ */
+class Image2DArray : public Image
+{
+public:
+ Image2DArray(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ ::size_t arraySize,
+ ::size_t width,
+ ::size_t height,
+ ::size_t rowPitch,
+ ::size_t slicePitch,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE2D_ARRAY,
+ width,
+ height,
+ 0, // depth (unused)
+ arraySize,
+ rowPitch,
+ slicePitch,
+ 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Image2DArray() { }
+
+ Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { }
+
+ __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { }
+
+ Image2DArray& operator = (const Image2DArray& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ Image2DArray& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+/*! \brief Class interface for 3D Image Memory objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image3D : public Image
+{
+public:
+ /*! \brief Constructs a 3D Image in a specified context.
+ *
+ * Wraps clCreateImage().
+ */
+ Image3D(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ ::size_t width,
+ ::size_t height,
+ ::size_t depth,
+ ::size_t row_pitch = 0,
+ ::size_t slice_pitch = 0,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ bool useCreateImage;
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ // Run-time decision based on the actual platform
+ {
+ cl_uint version = detail::getContextPlatformVersion(context());
+ useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
+ }
+#elif defined(CL_VERSION_1_2)
+ useCreateImage = true;
+#else
+ useCreateImage = false;
+#endif
+
+#if defined(CL_VERSION_1_2)
+ if (useCreateImage)
+ {
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE3D,
+ width,
+ height,
+ depth,
+ 0, // array size (unused)
+ row_pitch,
+ slice_pitch,
+ 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // #if defined(CL_VERSION_1_2)
+#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ if (!useCreateImage)
+ {
+ object_ = ::clCreateImage3D(
+ context(), flags, &format, width, height, depth, row_pitch,
+ slice_pitch, host_ptr, &error);
+
+ detail::errHandler(error, __CREATE_IMAGE3D_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image3D() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3D(const Image3D& image3D) : Image(image3D) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { }
+
+ /*! \brief Assignment from Image3D - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3D& operator = (const Image3D& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3D& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+
+#if !defined(CL_VERSION_1_2)
+/*! \brief Class interface for GL 3D Image Memory objects.
+ *
+ * This is provided to facilitate interoperability with OpenGL.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image3DGL : public Image3D
+{
+public:
+ /*! \brief Constructs an Image3DGL in a specified context, from a given
+ * GL Texture.
+ *
+ * Wraps clCreateFromGLTexture3D().
+ */
+ Image3DGL(
+ const Context& context,
+ cl_mem_flags flags,
+ GLenum target,
+ GLint miplevel,
+ GLuint texobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLTexture3D(
+ context(),
+ flags,
+ target,
+ miplevel,
+ texobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image3DGL() : Image3D() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3DGL(const Image3DGL& image) : Image3D(image) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { }
+
+ /*! \brief Assignment from Image3DGL - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3DGL& operator = (const Image3DGL& rhs)
+ {
+ if (this != &rhs) {
+ Image3D::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3DGL& operator = (const cl_mem& rhs)
+ {
+ Image3D::operator=(rhs);
+ return *this;
+ }
+};
+#endif // #if !defined(CL_VERSION_1_2)
+
+#if defined(CL_VERSION_1_2)
+/*! \class ImageGL
+ * \brief general image interface for GL interop.
+ * We abstract the 2D and 3D GL images into a single instance here
+ * that wraps all GL sourced images on the grounds that setup information
+ * was performed by OpenCL anyway.
+ */
+class ImageGL : public Image
+{
+public:
+ ImageGL(
+ const Context& context,
+ cl_mem_flags flags,
+ GLenum target,
+ GLint miplevel,
+ GLuint texobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLTexture(
+ context(),
+ flags,
+ target,
+ miplevel,
+ texobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_TEXTURE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ ImageGL() : Image() { }
+
+ ImageGL(const ImageGL& image) : Image(image) { }
+
+ __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { }
+
+ ImageGL& operator = (const ImageGL& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ ImageGL& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+/*! \brief Class interface for cl_sampler.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_sampler as the original. For details, see
+ * clRetainSampler() and clReleaseSampler().
+ *
+ * \see cl_sampler
+ */
+class Sampler : public detail::Wrapper<cl_sampler>
+{
+public:
+ /*! \brief Destructor.
+ *
+ * This calls clReleaseSampler() on the value held by this instance.
+ */
+ ~Sampler() { }
+
+ //! \brief Default constructor - initializes to NULL.
+ Sampler() { }
+
+ /*! \brief Constructs a Sampler in a specified context.
+ *
+ * Wraps clCreateSampler().
+ */
+ Sampler(
+ const Context& context,
+ cl_bool normalized_coords,
+ cl_addressing_mode addressing_mode,
+ cl_filter_mode filter_mode,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateSampler(
+ context(),
+ normalized_coords,
+ addressing_mode,
+ filter_mode,
+ &error);
+
+ detail::errHandler(error, __CREATE_SAMPLER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * This calls clRetainSampler() on the parameter's cl_sampler.
+ */
+ Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+ /*! \brief Constructor from cl_sampler - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the cl_sampler
+ * into the new Sampler object.
+ */
+ Sampler(const cl_sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+ /*! \brief Assignment operator from Sampler.
+ *
+ * This calls clRetainSampler() on the parameter and clReleaseSampler()
+ * on the previous value held by this instance.
+ */
+ Sampler& operator = (const Sampler& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_sampler - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseSampler() on the value previously held by this instance.
+ */
+ Sampler& operator = (const cl_sampler& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetSamplerInfo().
+ template <typename T>
+ cl_int getInfo(cl_sampler_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetSamplerInfo, object_, name, param),
+ __GET_SAMPLER_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetSamplerInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_sampler_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_sampler_info, name>::param_type param;
+ cl_int result = getInfo(name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+};
+
+class Program;
+class CommandQueue;
+class Kernel;
+
+//! \brief Class interface for specifying NDRange values.
+class NDRange
+{
+private:
+ size_t<3> sizes_;
+ cl_uint dimensions_;
+
+public:
+ //! \brief Default constructor - resulting range has zero dimensions.
+ NDRange()
+ : dimensions_(0)
+ { }
+
+ //! \brief Constructs one-dimensional range.
+ NDRange(::size_t size0)
+ : dimensions_(1)
+ {
+ sizes_[0] = size0;
+ }
+
+ //! \brief Constructs two-dimensional range.
+ NDRange(::size_t size0, ::size_t size1)
+ : dimensions_(2)
+ {
+ sizes_[0] = size0;
+ sizes_[1] = size1;
+ }
+
+ //! \brief Constructs three-dimensional range.
+ NDRange(::size_t size0, ::size_t size1, ::size_t size2)
+ : dimensions_(3)
+ {
+ sizes_[0] = size0;
+ sizes_[1] = size1;
+ sizes_[2] = size2;
+ }
+
+ /*! \brief Conversion operator to const ::size_t *.
+ *
+ * \returns a pointer to the size of the first dimension.
+ */
+ operator const ::size_t*() const {
+ return (const ::size_t*) sizes_;
+ }
+
+ //! \brief Queries the number of dimensions in the range.
+ ::size_t dimensions() const { return dimensions_; }
+};
+
+//! \brief A zero-dimensional range.
+static const NDRange NullRange;
+
+//! \brief Local address wrapper for use with Kernel::setArg
+struct LocalSpaceArg
+{
+ ::size_t size_;
+};
+
+namespace detail {
+
+template <typename T>
+struct KernelArgumentHandler
+{
+ static ::size_t size(const T&) { return sizeof(T); }
+ static T* ptr(T& value) { return &value; }
+};
+
+template <>
+struct KernelArgumentHandler<LocalSpaceArg>
+{
+ static ::size_t size(const LocalSpaceArg& value) { return value.size_; }
+ static void* ptr(LocalSpaceArg&) { return NULL; }
+};
+
+}
+//! \endcond
+
+/*! __local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ * Deprecated. Replaced with Local.
+ */
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg
+__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline LocalSpaceArg
+__local(::size_t size)
+{
+ LocalSpaceArg ret = { size };
+ return ret;
+}
+
+/*! Local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ */
+inline LocalSpaceArg
+Local(::size_t size)
+{
+ LocalSpaceArg ret = { size };
+ return ret;
+}
+
+//class KernelFunctor;
+
+/*! \brief Class interface for cl_kernel.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_kernel as the original. For details, see
+ * clRetainKernel() and clReleaseKernel().
+ *
+ * \see cl_kernel
+ */
+class Kernel : public detail::Wrapper<cl_kernel>
+{
+public:
+ inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
+
+ /*! \brief Destructor.
+ *
+ * This calls clReleaseKernel() on the value held by this instance.
+ */
+ ~Kernel() { }
+
+ //! \brief Default constructor - initializes to NULL.
+ Kernel() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * This calls clRetainKernel() on the parameter's cl_kernel.
+ */
+ Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+ /*! \brief Constructor from cl_kernel - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the cl_kernel
+ * into the new Kernel object.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+ /*! \brief Assignment operator from Kernel.
+ *
+ * This calls clRetainKernel() on the parameter and clReleaseKernel()
+ * on the previous value held by this instance.
+ */
+ Kernel& operator = (const Kernel& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_kernel - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseKernel() on the value previously held by this instance.
+ */
+ Kernel& operator = (const cl_kernel& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ template <typename T>
+ cl_int getInfo(cl_kernel_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetKernelInfo, object_, name, param),
+ __GET_KERNEL_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_kernel_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_kernel_info, name>::param_type param;
+ cl_int result = getInfo(name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+#if defined(CL_VERSION_1_2)
+ template <typename T>
+ cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param),
+ __GET_KERNEL_ARG_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
+ getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_kernel_arg_info, name>::param_type param;
+ cl_int result = getArgInfo(argIndex, name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+ template <typename T>
+ cl_int getWorkGroupInfo(
+ const Device& device, cl_kernel_work_group_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetKernelWorkGroupInfo, object_, device(), name, param),
+ __GET_KERNEL_WORK_GROUP_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
+ getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_kernel_work_group_info, name>::param_type param;
+ cl_int result = getWorkGroupInfo(device, name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ template <typename T>
+ cl_int setArg(cl_uint index, T value)
+ {
+ return detail::errHandler(
+ ::clSetKernelArg(
+ object_,
+ index,
+ detail::KernelArgumentHandler<T>::size(value),
+ detail::KernelArgumentHandler<T>::ptr(value)),
+ __SET_KERNEL_ARGS_ERR);
+ }
+
+ cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
+ {
+ return detail::errHandler(
+ ::clSetKernelArg(object_, index, size, argPtr),
+ __SET_KERNEL_ARGS_ERR);
+ }
+};
+
+/*! \class Program
+ * \brief Program interface that implements cl_program.
+ */
+class Program : public detail::Wrapper<cl_program>
+{
+public:
+ typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
+ typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
+
+ Program(
+ const STRING_CLASS& source,
+ bool build = false,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const char * strings = source.c_str();
+ const ::size_t length = source.size();
+
+ Context context = Context::getDefault(err);
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)1, &strings, &length, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+ if (error == CL_SUCCESS && build) {
+
+ error = ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+ "",
+ NULL,
+ NULL);
+
+ detail::errHandler(error, __BUILD_PROGRAM_ERR);
+ }
+
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Program(
+ const Context& context,
+ const STRING_CLASS& source,
+ bool build = false,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const char * strings = source.c_str();
+ const ::size_t length = source.size();
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)1, &strings, &length, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+ if (error == CL_SUCCESS && build) {
+
+ error = ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+ "",
+ NULL,
+ NULL);
+
+ detail::errHandler(error, __BUILD_PROGRAM_ERR);
+ }
+
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Program(
+ const Context& context,
+ const Sources& sources,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const ::size_t n = (::size_t)sources.size();
+ ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
+ const char** strings = (const char**) alloca(n * sizeof(const char*));
+
+ for (::size_t i = 0; i < n; ++i) {
+ strings[i] = sources[(int)i].first;
+ lengths[i] = sources[(int)i].second;
+ }
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)n, strings, lengths, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /**
+ * Construct a program object from a list of devices and a per-device list of binaries.
+ * \param context A valid OpenCL context in which to construct the program.
+ * \param devices A vector of OpenCL device objects for which the program will be created.
+ * \param binaries A vector of pairs of a pointer to a binary object and its length.
+ * \param binaryStatus An optional vector that on completion will be resized to
+ * match the size of binaries and filled with values to specify if each binary
+ * was successfully loaded.
+ * Set to CL_SUCCESS if the binary was successfully loaded.
+ * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
+ * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
+ * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
+ * CL_INVALID_CONTEXT if context is not a valid context.
+ * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices;
+ * or if any entry in binaries is NULL or has length 0.
+ * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
+ * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
+ * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
+ */
+ Program(
+ const Context& context,
+ const VECTOR_CLASS<Device>& devices,
+ const Binaries& binaries,
+ VECTOR_CLASS<cl_int>* binaryStatus = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const ::size_t numDevices = devices.size();
+
+ // Catch size mismatch early and return
+ if(binaries.size() != numDevices) {
+ error = CL_INVALID_VALUE;
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return;
+ }
+
+ ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t));
+ const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char**));
+
+ for (::size_t i = 0; i < numDevices; ++i) {
+ images[i] = (const unsigned char*)binaries[i].first;
+ lengths[i] = binaries[(int)i].second;
+ }
+
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ if(binaryStatus) {
+ binaryStatus->resize(numDevices);
+ }
+
+ object_ = ::clCreateProgramWithBinary(
+ context(), (cl_uint) devices.size(),
+ deviceIDs,
+ lengths, images, binaryStatus != NULL
+ ? &binaryStatus->front()
+ : NULL, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+
+#if defined(CL_VERSION_1_2)
+ /**
+ * Create program using builtin kernels.
+ * \param kernelNames Semi-colon separated list of builtin kernel names
+ */
+ Program(
+ const Context& context,
+ const VECTOR_CLASS<Device>& devices,
+ const STRING_CLASS& kernelNames,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+
+ ::size_t numDevices = devices.size();
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ object_ = ::clCreateProgramWithBuiltInKernels(
+ context(),
+ (cl_uint) devices.size(),
+ deviceIDs,
+ kernelNames.c_str(),
+ &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+ Program() { }
+
+ Program(const Program& program) : detail::Wrapper<cl_type>(program) { }
+
+ __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper<cl_type>(program) { }
+
+ Program& operator = (const Program& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ Program& operator = (const cl_program& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ cl_int build(
+ const VECTOR_CLASS<Device>& devices,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ ::size_t numDevices = devices.size();
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ return detail::errHandler(
+ ::clBuildProgram(
+ object_,
+ (cl_uint)
+ devices.size(),
+ deviceIDs,
+ options,
+ notifyFptr,
+ data),
+ __BUILD_PROGRAM_ERR);
+ }
+
+ cl_int build(
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ return detail::errHandler(
+ ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+ options,
+ notifyFptr,
+ data),
+ __BUILD_PROGRAM_ERR);
+ }
+
+#if defined(CL_VERSION_1_2)
+ cl_int compile(
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ return detail::errHandler(
+ ::clCompileProgram(
+ object_,
+ 0,
+ NULL,
+ options,
+ 0,
+ NULL,
+ NULL,
+ notifyFptr,
+ data),
+ __COMPILE_PROGRAM_ERR);
+ }
+#endif
+
+ template <typename T>
+ cl_int getInfo(cl_program_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetProgramInfo, object_, name, param),
+ __GET_PROGRAM_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_program_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_program_info, name>::param_type param;
+ cl_int result = getInfo(name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ template <typename T>
+ cl_int getBuildInfo(
+ const Device& device, cl_program_build_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetProgramBuildInfo, object_, device(), name, param),
+ __GET_PROGRAM_BUILD_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_program_build_info, name>::param_type
+ getBuildInfo(const Device& device, cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_program_build_info, name>::param_type param;
+ cl_int result = getBuildInfo(device, name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
+ {
+ cl_uint numKernels;
+ cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+ }
+
+ Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel));
+ err = ::clCreateKernelsInProgram(
+ object_, numKernels, (cl_kernel*) value, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+ }
+
+ kernels->assign(&value[0], &value[numKernels]);
+ return CL_SUCCESS;
+ }
+};
+
+#if defined(CL_VERSION_1_2)
+inline Program linkProgram(
+ Program input1,
+ Program input2,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+{
+ cl_int err_local = CL_SUCCESS;
+
+ cl_program programs[2] = { input1(), input2() };
+
+ Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>();
+
+ cl_program prog = ::clLinkProgram(
+ ctx(),
+ 0,
+ NULL,
+ options,
+ 2,
+ programs,
+ notifyFptr,
+ data,
+ &err_local);
+
+ detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);
+ if (err != NULL) {
+ *err = err_local;
+ }
+
+ return Program(prog);
+}
+
+inline Program linkProgram(
+ VECTOR_CLASS<Program> inputPrograms,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+{
+ cl_int err_local = CL_SUCCESS;
+
+ cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program));
+
+ if (programs != NULL) {
+ for (unsigned int i = 0; i < inputPrograms.size(); i++) {
+ programs[i] = inputPrograms[i]();
+ }
+ }
+
+ cl_program prog = ::clLinkProgram(
+ Context::getDefault()(),
+ 0,
+ NULL,
+ options,
+ (cl_uint)inputPrograms.size(),
+ programs,
+ notifyFptr,
+ data,
+ &err_local);
+
+ detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);
+ if (err != NULL) {
+ *err = err_local;
+ }
+
+ return Program(prog);
+}
+#endif
+
+template<>
+inline VECTOR_CLASS<char *> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const
+{
+ VECTOR_CLASS< ::size_t> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();
+ VECTOR_CLASS<char *> binaries;
+ for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s)
+ {
+ char *ptr = NULL;
+ if (*s != 0)
+ ptr = new char[*s];
+ binaries.push_back(ptr);
+ }
+
+ cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries);
+ if (err != NULL) {
+ *err = result;
+ }
+ return binaries;
+}
+
+inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
+{
+ cl_int error;
+
+ object_ = ::clCreateKernel(program(), name, &error);
+ detail::errHandler(error, __CREATE_KERNEL_ERR);
+
+ if (err != NULL) {
+ *err = error;
+ }
+
+}
+
+/*! \class CommandQueue
+ * \brief CommandQueue interface for cl_command_queue.
+ */
+class CommandQueue : public detail::Wrapper<cl_command_queue>
+{
+private:
+ static volatile int default_initialized_;
+ static CommandQueue default_;
+ static volatile cl_int default_error_;
+public:
+ CommandQueue(
+ cl_command_queue_properties properties,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ Context context = Context::getDefault(&error);
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+ if (error != CL_SUCCESS) {
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ else {
+ Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+ object_ = ::clCreateCommandQueue(
+ context(), device(), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ }
+ /*!
+ * \brief Constructs a CommandQueue for an implementation defined device in the given context
+ */
+ explicit CommandQueue(
+ const Context& context,
+ cl_command_queue_properties properties = 0,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ VECTOR_CLASS<cl::Device> devices;
+ error = context.getInfo(CL_CONTEXT_DEVICES, &devices);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+ if (error != CL_SUCCESS)
+ {
+ if (err != NULL) {
+ *err = error;
+ }
+ return;
+ }
+
+ object_ = ::clCreateCommandQueue(context(), devices[0](), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+ if (err != NULL) {
+ *err = error;
+ }
+
+ }
+
+ CommandQueue(
+ const Context& context,
+ const Device& device,
+ cl_command_queue_properties properties = 0,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateCommandQueue(
+ context(), device(), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ static CommandQueue getDefault(cl_int * err = NULL)
+ {
+ int state = detail::compare_exchange(
+ &default_initialized_,
+ __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
+
+ if (state & __DEFAULT_INITIALIZED) {
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+ }
+
+ if (state & __DEFAULT_BEING_INITIALIZED) {
+ // Assume writes will propagate eventually...
+ while(default_initialized_ != __DEFAULT_INITIALIZED) {
+ detail::fence();
+ }
+
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+ }
+
+ cl_int error;
+
+ Context context = Context::getDefault(&error);
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+ if (error != CL_SUCCESS) {
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ else {
+ Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+ default_ = CommandQueue(context, device, 0, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ detail::fence();
+
+ default_error_ = error;
+ // Assume writes will propagate eventually...
+ default_initialized_ = __DEFAULT_INITIALIZED;
+
+ detail::fence();
+
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+
+ }
+
+ CommandQueue() { }
+
+ CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+ CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+ CommandQueue& operator = (const CommandQueue& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ CommandQueue& operator = (const cl_command_queue& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ template <typename T>
+ cl_int getInfo(cl_command_queue_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetCommandQueueInfo, object_, name, param),
+ __GET_COMMAND_QUEUE_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_command_queue_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_command_queue_info, name>::param_type param;
+ cl_int result = getInfo(name, ¶m);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ cl_int enqueueReadBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ ::size_t offset,
+ ::size_t size,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReadBuffer(
+ object_, buffer(), blocking, offset, size,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_READ_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueWriteBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ ::size_t offset,
+ ::size_t size,
+ const void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueWriteBuffer(
+ object_, buffer(), blocking, offset, size,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_WRITE_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueCopyBuffer(
+ const Buffer& src,
+ const Buffer& dst,
+ ::size_t src_offset,
+ ::size_t dst_offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyBuffer(
+ object_, src(), dst(), src_offset, dst_offset, size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQEUE_COPY_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueReadBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const size_t<3>& buffer_offset,
+ const size_t<3>& host_offset,
+ const size_t<3>& region,
+ ::size_t buffer_row_pitch,
+ ::size_t buffer_slice_pitch,
+ ::size_t host_row_pitch,
+ ::size_t host_slice_pitch,
+ void *ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReadBufferRect(
+ object_,
+ buffer(),
+ blocking,
+ (const ::size_t *)buffer_offset,
+ (const ::size_t *)host_offset,
+ (const ::size_t *)region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_READ_BUFFER_RECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueWriteBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const size_t<3>& buffer_offset,
+ const size_t<3>& host_offset,
+ const size_t<3>& region,
+ ::size_t buffer_row_pitch,
+ ::size_t buffer_slice_pitch,
+ ::size_t host_row_pitch,
+ ::size_t host_slice_pitch,
+ void *ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueWriteBufferRect(
+ object_,
+ buffer(),
+ blocking,
+ (const ::size_t *)buffer_offset,
+ (const ::size_t *)host_offset,
+ (const ::size_t *)region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_WRITE_BUFFER_RECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueCopyBufferRect(
+ const Buffer& src,
+ const Buffer& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ ::size_t src_row_pitch,
+ ::size_t src_slice_pitch,
+ ::size_t dst_row_pitch,
+ ::size_t dst_slice_pitch,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyBufferRect(
+ object_,
+ src(),
+ dst(),
+ (const ::size_t *)src_origin,
+ (const ::size_t *)dst_origin,
+ (const ::size_t *)region,
+ src_row_pitch,
+ src_slice_pitch,
+ dst_row_pitch,
+ dst_slice_pitch,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQEUE_COPY_BUFFER_RECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+#if defined(CL_VERSION_1_2)
+ /**
+ * Enqueue a command to fill a buffer object with a pattern
+ * of a given size. The pattern is specified a as vector.
+ * \tparam PatternType The datatype of the pattern field.
+ * The pattern type must be an accepted OpenCL data type.
+ */
+ template<typename PatternType>
+ cl_int enqueueFillBuffer(
+ const Buffer& buffer,
+ PatternType pattern,
+ ::size_t offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueFillBuffer(
+ object_,
+ buffer(),
+ static_cast<void*>(&pattern),
+ sizeof(PatternType),
+ offset,
+ size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_FILL_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+ cl_int enqueueReadImage(
+ const Image& image,
+ cl_bool blocking,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ ::size_t row_pitch,
+ ::size_t slice_pitch,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReadImage(
+ object_, image(), blocking, (const ::size_t *) origin,
+ (const ::size_t *) region, row_pitch, slice_pitch, ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_READ_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueWriteImage(
+ const Image& image,
+ cl_bool blocking,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ ::size_t row_pitch,
+ ::size_t slice_pitch,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueWriteImage(
+ object_, image(), blocking, (const ::size_t *) origin,
+ (const ::size_t *) region, row_pitch, slice_pitch, ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_WRITE_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueCopyImage(
+ const Image& src,
+ const Image& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyImage(
+ object_, src(), dst(), (const ::size_t *) src_origin,
+ (const ::size_t *)dst_origin, (const ::size_t *) region,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_COPY_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+#if defined(CL_VERSION_1_2)
+ /**
+ * Enqueue a command to fill an image object with a specified color.
+ * \param fillColor is the color to use to fill the image.
+ * This is a four component RGBA floating-point color value if
+ * the image channel data type is not an unnormalized signed or
+ * unsigned data type.
+ */
+ cl_int enqueueFillImage(
+ const Image& image,
+ cl_float4 fillColor,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueFillImage(
+ object_,
+ image(),
+ static_cast<void*>(&fillColor),
+ (const ::size_t *) origin,
+ (const ::size_t *) region,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_FILL_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * Enqueue a command to fill an image object with a specified color.
+ * \param fillColor is the color to use to fill the image.
+ * This is a four component RGBA signed integer color value if
+ * the image channel data type is an unnormalized signed integer
+ * type.
+ */
+ cl_int enqueueFillImage(
+ const Image& image,
+ cl_int4 fillColor,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueFillImage(
+ object_,
+ image(),
+ static_cast<void*>(&fillColor),
+ (const ::size_t *) origin,
+ (const ::size_t *) region,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_FILL_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * Enqueue a command to fill an image object with a specified color.
+ * \param fillColor is the color to use to fill the image.
+ * This is a four component RGBA unsigned integer color value if
+ * the image channel data type is an unnormalized unsigned integer
+ * type.
+ */
+ cl_int enqueueFillImage(
+ const Image& image,
+ cl_uint4 fillColor,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueFillImage(
+ object_,
+ image(),
+ static_cast<void*>(&fillColor),
+ (const ::size_t *) origin,
+ (const ::size_t *) region,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_FILL_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+ cl_int enqueueCopyImageToBuffer(
+ const Image& src,
+ const Buffer& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& region,
+ ::size_t dst_offset,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyImageToBuffer(
+ object_, src(), dst(), (const ::size_t *) src_origin,
+ (const ::size_t *) region, dst_offset,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueCopyBufferToImage(
+ const Buffer& src,
+ const Image& dst,
+ ::size_t src_offset,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyBufferToImage(
+ object_, src(), dst(), src_offset,
+ (const ::size_t *) dst_origin, (const ::size_t *) region,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ void* enqueueMapBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ cl_map_flags flags,
+ ::size_t offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL,
+ cl_int* err = NULL) const
+ {
+ cl_int error;
+ void * result = ::clEnqueueMapBuffer(
+ object_, buffer(), blocking, flags, offset, size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (cl_event*) event,
+ &error);
+
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return result;
+ }
+
+ void* enqueueMapImage(
+ const Image& buffer,
+ cl_bool blocking,
+ cl_map_flags flags,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ ::size_t * row_pitch,
+ ::size_t * slice_pitch,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL,
+ cl_int* err = NULL) const
+ {
+ cl_int error;
+ void * result = ::clEnqueueMapImage(
+ object_, buffer(), blocking, flags,
+ (const ::size_t *) origin, (const ::size_t *) region,
+ row_pitch, slice_pitch,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (cl_event*) event,
+ &error);
+
+ detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return result;
+ }
+
+ cl_int enqueueUnmapMemObject(
+ const Memory& memory,
+ void* mapped_ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueUnmapMemObject(
+ object_, memory(), mapped_ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+#if defined(CL_VERSION_1_2)
+ /**
+ * Enqueues a marker command which waits for either a list of events to complete,
+ * or all previously enqueued commands to complete.
+ *
+ * Enqueues a marker command which waits for either a list of events to complete,
+ * or if the list is empty it waits for all commands previously enqueued in command_queue
+ * to complete before it completes. This command returns an event which can be waited on,
+ * i.e. this event can be waited on to insure that all events either in the event_wait_list
+ * or all previously enqueued commands, queued before this command to command_queue,
+ * have completed.
+ */
+ cl_int enqueueMarkerWithWaitList(
+ const VECTOR_CLASS<Event> *events = 0,
+ Event *event = 0)
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueMarkerWithWaitList(
+ object_,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_MARKER_WAIT_LIST_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * A synchronization point that enqueues a barrier operation.
+ *
+ * Enqueues a barrier command which waits for either a list of events to complete,
+ * or if the list is empty it waits for all commands previously enqueued in command_queue
+ * to complete before it completes. This command blocks command execution, that is, any
+ * following commands enqueued after it do not execute until it completes. This command
+ * returns an event which can be waited on, i.e. this event can be waited on to insure that
+ * all events either in the event_wait_list or all previously enqueued commands, queued
+ * before this command to command_queue, have completed.
+ */
+ cl_int enqueueBarrierWithWaitList(
+ const VECTOR_CLASS<Event> *events = 0,
+ Event *event = 0)
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueBarrierWithWaitList(
+ object_,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_BARRIER_WAIT_LIST_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * Enqueues a command to indicate with which device a set of memory objects
+ * should be associated.
+ */
+ cl_int enqueueMigrateMemObjects(
+ const VECTOR_CLASS<Memory> &memObjects,
+ cl_mem_migration_flags flags,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL
+ )
+ {
+ cl_event tmp;
+
+ cl_mem* localMemObjects = static_cast<cl_mem*>(alloca(memObjects.size() * sizeof(cl_mem)));
+ for( int i = 0; i < (int)memObjects.size(); ++i ) {
+ localMemObjects[i] = memObjects[i]();
+ }
+
+
+ cl_int err = detail::errHandler(
+ ::clEnqueueMigrateMemObjects(
+ object_,
+ (cl_uint)memObjects.size(),
+ static_cast<const cl_mem*>(localMemObjects),
+ flags,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+ cl_int enqueueNDRangeKernel(
+ const Kernel& kernel,
+ const NDRange& offset,
+ const NDRange& global,
+ const NDRange& local = NullRange,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueNDRangeKernel(
+ object_, kernel(), (cl_uint) global.dimensions(),
+ offset.dimensions() != 0 ? (const ::size_t*) offset : NULL,
+ (const ::size_t*) global,
+ local.dimensions() != 0 ? (const ::size_t*) local : NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_NDRANGE_KERNEL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueTask(
+ const Kernel& kernel,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueTask(
+ object_, kernel(),
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_TASK_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueNativeKernel(
+ void (CL_CALLBACK *userFptr)(void *),
+ std::pair<void*, ::size_t> args,
+ const VECTOR_CLASS<Memory>* mem_objects = NULL,
+ const VECTOR_CLASS<const void*>* mem_locs = NULL,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0)
+ ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem))
+ : NULL;
+
+ if (mems != NULL) {
+ for (unsigned int i = 0; i < mem_objects->size(); i++) {
+ mems[i] = ((*mem_objects)[i])();
+ }
+ }
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueNativeKernel(
+ object_, userFptr, args.first, args.second,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ mems,
+ (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_NATIVE_KERNEL);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+ CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ {
+ return detail::errHandler(
+ ::clEnqueueMarker(object_, (cl_event*) event),
+ __ENQUEUE_MARKER_ERR);
+ }
+
+ CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ {
+ return detail::errHandler(
+ ::clEnqueueWaitForEvents(
+ object_,
+ (cl_uint) events.size(),
+ (const cl_event*) &events.front()),
+ __ENQUEUE_WAIT_FOR_EVENTS_ERR);
+ }
+#endif // #if defined(CL_VERSION_1_1)
+
+ cl_int enqueueAcquireGLObjects(
+ const VECTOR_CLASS<Memory>* mem_objects = NULL,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueAcquireGLObjects(
+ object_,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_ACQUIRE_GL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueReleaseGLObjects(
+ const VECTOR_CLASS<Memory>* mem_objects = NULL,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReleaseGLObjects(
+ object_,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_RELEASE_GL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+#if defined (USE_DX_INTEROP)
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
+ cl_command_queue command_queue, cl_uint num_objects,
+ const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list, cl_event* event);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
+ cl_command_queue command_queue, cl_uint num_objects,
+ const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list, cl_event* event);
+
+ cl_int enqueueAcquireD3D10Objects(
+ const VECTOR_CLASS<Memory>* mem_objects = NULL,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+ cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+ cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+ cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+ __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+#if defined(CL_VERSION_1_1)
+ __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ pfn_clEnqueueAcquireD3D10ObjectsKHR(
+ object_,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_ACQUIRE_GL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueReleaseD3D10Objects(
+ const VECTOR_CLASS<Memory>* mem_objects = NULL,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+ cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+ cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+ cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+ __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_2)
+#if defined(CL_VERSION_1_1)
+ __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_1)
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ pfn_clEnqueueReleaseD3D10ObjectsKHR(
+ object_,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_RELEASE_GL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+ CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ {
+ return detail::errHandler(
+ ::clEnqueueBarrier(object_),
+ __ENQUEUE_BARRIER_ERR);
+ }
+#endif // #if defined(CL_VERSION_1_1)
+
+ cl_int flush() const
+ {
+ return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
+ }
+
+ cl_int finish() const
+ {
+ return detail::errHandler(::clFinish(object_), __FINISH_ERR);
+ }
+};
+
+#ifdef _WIN32
+__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__declspec(selectany) CommandQueue CommandQueue::default_;
+__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#else
+__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__attribute__((weak)) CommandQueue CommandQueue::default_;
+__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#endif
+
+template< typename IteratorType >
+Buffer::Buffer(
+ const Context &context,
+ IteratorType startIterator,
+ IteratorType endIterator,
+ bool readOnly,
+ bool useHostPtr,
+ cl_int* err)
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ cl_mem_flags flags = 0;
+ if( readOnly ) {
+ flags |= CL_MEM_READ_ONLY;
+ }
+ else {
+ flags |= CL_MEM_READ_WRITE;
+ }
+ if( useHostPtr ) {
+ flags |= CL_MEM_USE_HOST_PTR;
+ }
+
+ ::size_t size = sizeof(DataType)*(endIterator - startIterator);
+
+ if( useHostPtr ) {
+ object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+ } else {
+ object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+ }
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ if( !useHostPtr ) {
+ CommandQueue queue(context, 0, &error);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ error = cl::copy(queue, startIterator, endIterator, *this);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+}
+
+inline cl_int enqueueReadBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ ::size_t offset,
+ ::size_t size,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline cl_int enqueueWriteBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ ::size_t offset,
+ ::size_t size,
+ const void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline void* enqueueMapBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ cl_map_flags flags,
+ ::size_t offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL,
+ cl_int* err = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ void * result = ::clEnqueueMapBuffer(
+ queue(), buffer(), blocking, flags, offset, size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (cl_event*) event,
+ &error);
+
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return result;
+}
+
+inline cl_int enqueueUnmapMemObject(
+ const Memory& memory,
+ void* mapped_ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueUnmapMemObject(
+ queue(), memory(), mapped_ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+}
+
+inline cl_int enqueueCopyBuffer(
+ const Buffer& src,
+ const Buffer& dst,
+ ::size_t src_offset,
+ ::size_t dst_offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Host to Device.
+ * Uses default command queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS)
+ return error;
+
+ return cl::copy(queue, startIterator, endIterator, buffer);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Device to Host.
+ * Uses default command queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS)
+ return error;
+
+ return cl::copy(queue, buffer, startIterator, endIterator);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Host to Device.
+ * Uses specified queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ ::size_t length = endIterator-startIterator;
+ ::size_t byteLength = length*sizeof(DataType);
+
+ DataType *pointer =
+ static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));
+ // if exceptions enabled, enqueueMapBuffer will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+#if defined(_MSC_VER)
+ std::copy(
+ startIterator,
+ endIterator,
+ stdext::checked_array_iterator<DataType*>(
+ pointer, length));
+#else
+ std::copy(startIterator, endIterator, pointer);
+#endif
+ Event endEvent;
+ error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+ // if exceptions enabled, enqueueUnmapMemObject will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+ endEvent.wait();
+ return CL_SUCCESS;
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Device to Host.
+ * Uses specified queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ ::size_t length = endIterator-startIterator;
+ ::size_t byteLength = length*sizeof(DataType);
+
+ DataType *pointer =
+ static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));
+ // if exceptions enabled, enqueueMapBuffer will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+ std::copy(pointer, pointer + length, startIterator);
+ Event endEvent;
+ error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+ // if exceptions enabled, enqueueUnmapMemObject will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+ endEvent.wait();
+ return CL_SUCCESS;
+}
+
+#if defined(CL_VERSION_1_1)
+inline cl_int enqueueReadBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const size_t<3>& buffer_offset,
+ const size_t<3>& host_offset,
+ const size_t<3>& region,
+ ::size_t buffer_row_pitch,
+ ::size_t buffer_slice_pitch,
+ ::size_t host_row_pitch,
+ ::size_t host_slice_pitch,
+ void *ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueReadBufferRect(
+ buffer,
+ blocking,
+ buffer_offset,
+ host_offset,
+ region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueWriteBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const size_t<3>& buffer_offset,
+ const size_t<3>& host_offset,
+ const size_t<3>& region,
+ ::size_t buffer_row_pitch,
+ ::size_t buffer_slice_pitch,
+ ::size_t host_row_pitch,
+ ::size_t host_slice_pitch,
+ void *ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueWriteBufferRect(
+ buffer,
+ blocking,
+ buffer_offset,
+ host_offset,
+ region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyBufferRect(
+ const Buffer& src,
+ const Buffer& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ ::size_t src_row_pitch,
+ ::size_t src_slice_pitch,
+ ::size_t dst_row_pitch,
+ ::size_t dst_slice_pitch,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyBufferRect(
+ src,
+ dst,
+ src_origin,
+ dst_origin,
+ region,
+ src_row_pitch,
+ src_slice_pitch,
+ dst_row_pitch,
+ dst_slice_pitch,
+ events,
+ event);
+}
+#endif
+
+inline cl_int enqueueReadImage(
+ const Image& image,
+ cl_bool blocking,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ ::size_t row_pitch,
+ ::size_t slice_pitch,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueReadImage(
+ image,
+ blocking,
+ origin,
+ region,
+ row_pitch,
+ slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueWriteImage(
+ const Image& image,
+ cl_bool blocking,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ ::size_t row_pitch,
+ ::size_t slice_pitch,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueWriteImage(
+ image,
+ blocking,
+ origin,
+ region,
+ row_pitch,
+ slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyImage(
+ const Image& src,
+ const Image& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyImage(
+ src,
+ dst,
+ src_origin,
+ dst_origin,
+ region,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyImageToBuffer(
+ const Image& src,
+ const Buffer& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& region,
+ ::size_t dst_offset,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyImageToBuffer(
+ src,
+ dst,
+ src_origin,
+ region,
+ dst_offset,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyBufferToImage(
+ const Buffer& src,
+ const Image& dst,
+ ::size_t src_offset,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyBufferToImage(
+ src,
+ dst,
+ src_offset,
+ dst_origin,
+ region,
+ events,
+ event);
+}
+
+
+inline cl_int flush(void)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.flush();
+}
+
+inline cl_int finish(void)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+
+ return queue.finish();
+}
+
+// Kernel Functor support
+// New interface as of September 2011
+// Requires the C++11 std::tr1::function (note do not support TR1)
+// Visual Studio 2010 and GCC 4.2
+
+struct EnqueueArgs
+{
+ CommandQueue queue_;
+ const NDRange offset_;
+ const NDRange global_;
+ const NDRange local_;
+ VECTOR_CLASS<Event> events_;
+
+ EnqueueArgs(NDRange global) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange)
+ {
+
+ }
+
+ EnqueueArgs(NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(local)
+ {
+
+ }
+
+ EnqueueArgs(NDRange offset, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(offset),
+ global_(global),
+ local_(local)
+ {
+
+ }
+
+ EnqueueArgs(Event e, NDRange global) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(Event e, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(local)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(offset),
+ global_(global),
+ local_(local)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(local),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(offset),
+ global_(global),
+ local_(local),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, NDRange global) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(local)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(offset),
+ global_(global),
+ local_(local)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, Event e, NDRange global) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(local)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(offset),
+ global_(global),
+ local_(local)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(local),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(offset),
+ global_(global),
+ local_(local),
+ events_(events)
+ {
+
+ }
+};
+
+namespace detail {
+
+class NullType {};
+
+template<int index, typename T0>
+struct SetArg
+{
+ static void set (Kernel kernel, T0 arg)
+ {
+ kernel.setArg(index, arg);
+ }
+};
+
+template<int index>
+struct SetArg<index, NullType>
+{
+ static void set (Kernel, NullType)
+ {
+ }
+};
+
+template <
+ typename T0, typename T1, typename T2, typename T3,
+ typename T4, typename T5, typename T6, typename T7,
+ typename T8, typename T9, typename T10, typename T11,
+ typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19,
+ typename T20, typename T21, typename T22, typename T23,
+ typename T24, typename T25, typename T26, typename T27,
+ typename T28, typename T29, typename T30, typename T31
+>
+class KernelFunctorGlobal
+{
+private:
+ Kernel kernel_;
+
+public:
+ KernelFunctorGlobal(
+ Kernel kernel) :
+ kernel_(kernel)
+ {}
+
+ KernelFunctorGlobal(
+ const Program& program,
+ const STRING_CLASS name,
+ cl_int * err = NULL) :
+ kernel_(program, name.c_str(), err)
+ {}
+
+ Event operator() (
+ const EnqueueArgs& args,
+ T0 t0,
+ T1 t1 = NullType(),
+ T2 t2 = NullType(),
+ T3 t3 = NullType(),
+ T4 t4 = NullType(),
+ T5 t5 = NullType(),
+ T6 t6 = NullType(),
+ T7 t7 = NullType(),
+ T8 t8 = NullType(),
+ T9 t9 = NullType(),
+ T10 t10 = NullType(),
+ T11 t11 = NullType(),
+ T12 t12 = NullType(),
+ T13 t13 = NullType(),
+ T14 t14 = NullType(),
+ T15 t15 = NullType(),
+ T16 t16 = NullType(),
+ T17 t17 = NullType(),
+ T18 t18 = NullType(),
+ T19 t19 = NullType(),
+ T20 t20 = NullType(),
+ T21 t21 = NullType(),
+ T22 t22 = NullType(),
+ T23 t23 = NullType(),
+ T24 t24 = NullType(),
+ T25 t25 = NullType(),
+ T26 t26 = NullType(),
+ T27 t27 = NullType(),
+ T28 t28 = NullType(),
+ T29 t29 = NullType(),
+ T30 t30 = NullType(),
+ T31 t31 = NullType()
+ )
+ {
+ Event event;
+ SetArg<0, T0>::set(kernel_, t0);
+ SetArg<1, T1>::set(kernel_, t1);
+ SetArg<2, T2>::set(kernel_, t2);
+ SetArg<3, T3>::set(kernel_, t3);
+ SetArg<4, T4>::set(kernel_, t4);
+ SetArg<5, T5>::set(kernel_, t5);
+ SetArg<6, T6>::set(kernel_, t6);
+ SetArg<7, T7>::set(kernel_, t7);
+ SetArg<8, T8>::set(kernel_, t8);
+ SetArg<9, T9>::set(kernel_, t9);
+ SetArg<10, T10>::set(kernel_, t10);
+ SetArg<11, T11>::set(kernel_, t11);
+ SetArg<12, T12>::set(kernel_, t12);
+ SetArg<13, T13>::set(kernel_, t13);
+ SetArg<14, T14>::set(kernel_, t14);
+ SetArg<15, T15>::set(kernel_, t15);
+ SetArg<16, T16>::set(kernel_, t16);
+ SetArg<17, T17>::set(kernel_, t17);
+ SetArg<18, T18>::set(kernel_, t18);
+ SetArg<19, T19>::set(kernel_, t19);
+ SetArg<20, T20>::set(kernel_, t20);
+ SetArg<21, T21>::set(kernel_, t21);
+ SetArg<22, T22>::set(kernel_, t22);
+ SetArg<23, T23>::set(kernel_, t23);
+ SetArg<24, T24>::set(kernel_, t24);
+ SetArg<25, T25>::set(kernel_, t25);
+ SetArg<26, T26>::set(kernel_, t26);
+ SetArg<27, T27>::set(kernel_, t27);
+ SetArg<28, T28>::set(kernel_, t28);
+ SetArg<29, T29>::set(kernel_, t29);
+ SetArg<30, T30>::set(kernel_, t30);
+ SetArg<31, T31>::set(kernel_, t31);
+
+ args.queue_.enqueueNDRangeKernel(
+ kernel_,
+ args.offset_,
+ args.global_,
+ args.local_,
+ &args.events_,
+ &event);
+
+ return event;
+ }
+
+};
+
+//------------------------------------------------------------------------------------------------------
+
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25,
+ typename T26,
+ typename T27,
+ typename T28,
+ typename T29,
+ typename T30,
+ typename T31>
+struct functionImplementation_
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ T30,
+ T31> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ T30,
+ T31);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25,
+ T26 arg26,
+ T27 arg27,
+ T28 arg28,
+ T29 arg29,
+ T30 arg30,
+ T31 arg31)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25,
+ arg26,
+ arg27,
+ arg28,
+ arg29,
+ arg30,
+ arg31);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25,
+ typename T26,
+ typename T27,
+ typename T28,
+ typename T29,
+ typename T30>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ T30,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ T30,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ T30);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25,
+ T26 arg26,
+ T27 arg27,
+ T28 arg28,
+ T29 arg29,
+ T30 arg30)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25,
+ arg26,
+ arg27,
+ arg28,
+ arg29,
+ arg30);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25,
+ typename T26,
+ typename T27,
+ typename T28,
+ typename T29>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25,
+ T26 arg26,
+ T27 arg27,
+ T28 arg28,
+ T29 arg29)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25,
+ arg26,
+ arg27,
+ arg28,
+ arg29);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25,
+ typename T26,
+ typename T27,
+ typename T28>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25,
+ T26 arg26,
+ T27 arg27,
+ T28 arg28)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25,
+ arg26,
+ arg27,
+ arg28);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25,
+ typename T26,
+ typename T27>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25,
+ T26 arg26,
+ T27 arg27)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25,
+ arg26,
+ arg27);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25,
+ typename T26>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25,
+ T26 arg26)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25,
+ arg26);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1>
+struct functionImplementation_
+< T0,
+ T1,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1);
+ }
+
+
+};
+
+template<
+ typename T0>
+struct functionImplementation_
+< T0,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0);
+ }
+
+
+};
+
+
+
+
+
+} // namespace detail
+
+//----------------------------------------------------------------------------------------------
+
+template <
+ typename T0, typename T1 = detail::NullType, typename T2 = detail::NullType,
+ typename T3 = detail::NullType, typename T4 = detail::NullType,
+ typename T5 = detail::NullType, typename T6 = detail::NullType,
+ typename T7 = detail::NullType, typename T8 = detail::NullType,
+ typename T9 = detail::NullType, typename T10 = detail::NullType,
+ typename T11 = detail::NullType, typename T12 = detail::NullType,
+ typename T13 = detail::NullType, typename T14 = detail::NullType,
+ typename T15 = detail::NullType, typename T16 = detail::NullType,
+ typename T17 = detail::NullType, typename T18 = detail::NullType,
+ typename T19 = detail::NullType, typename T20 = detail::NullType,
+ typename T21 = detail::NullType, typename T22 = detail::NullType,
+ typename T23 = detail::NullType, typename T24 = detail::NullType,
+ typename T25 = detail::NullType, typename T26 = detail::NullType,
+ typename T27 = detail::NullType, typename T28 = detail::NullType,
+ typename T29 = detail::NullType, typename T30 = detail::NullType,
+ typename T31 = detail::NullType
+>
+struct make_kernel :
+ public detail::functionImplementation_<
+ T0, T1, T2, T3,
+ T4, T5, T6, T7,
+ T8, T9, T10, T11,
+ T12, T13, T14, T15,
+ T16, T17, T18, T19,
+ T20, T21, T22, T23,
+ T24, T25, T26, T27,
+ T28, T29, T30, T31
+ >
+{
+public:
+ typedef detail::KernelFunctorGlobal<
+ T0, T1, T2, T3,
+ T4, T5, T6, T7,
+ T8, T9, T10, T11,
+ T12, T13, T14, T15,
+ T16, T17, T18, T19,
+ T20, T21, T22, T23,
+ T24, T25, T26, T27,
+ T28, T29, T30, T31
+ > FunctorType;
+
+ make_kernel(
+ const Program& program,
+ const STRING_CLASS name,
+ cl_int * err = NULL) :
+ detail::functionImplementation_<
+ T0, T1, T2, T3,
+ T4, T5, T6, T7,
+ T8, T9, T10, T11,
+ T12, T13, T14, T15,
+ T16, T17, T18, T19,
+ T20, T21, T22, T23,
+ T24, T25, T26, T27,
+ T28, T29, T30, T31
+ >(
+ FunctorType(program, name, err))
+ {}
+
+ make_kernel(
+ const Kernel kernel) :
+ detail::functionImplementation_<
+ T0, T1, T2, T3,
+ T4, T5, T6, T7,
+ T8, T9, T10, T11,
+ T12, T13, T14, T15,
+ T16, T17, T18, T19,
+ T20, T21, T22, T23,
+ T24, T25, T26, T27,
+ T28, T29, T30, T31
+ >(
+ FunctorType(kernel))
+ {}
+};
+
+
+//----------------------------------------------------------------------------------------------------------------------
+
+#undef __ERR_STR
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#undef __GET_DEVICE_INFO_ERR
+#undef __GET_PLATFORM_INFO_ERR
+#undef __GET_DEVICE_IDS_ERR
+#undef __GET_CONTEXT_INFO_ERR
+#undef __GET_EVENT_INFO_ERR
+#undef __GET_EVENT_PROFILE_INFO_ERR
+#undef __GET_MEM_OBJECT_INFO_ERR
+#undef __GET_IMAGE_INFO_ERR
+#undef __GET_SAMPLER_INFO_ERR
+#undef __GET_KERNEL_INFO_ERR
+#undef __GET_KERNEL_ARG_INFO_ERR
+#undef __GET_KERNEL_WORK_GROUP_INFO_ERR
+#undef __GET_PROGRAM_INFO_ERR
+#undef __GET_PROGRAM_BUILD_INFO_ERR
+#undef __GET_COMMAND_QUEUE_INFO_ERR
+
+#undef __CREATE_CONTEXT_ERR
+#undef __CREATE_CONTEXT_FROM_TYPE_ERR
+#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
+
+#undef __CREATE_BUFFER_ERR
+#undef __CREATE_SUBBUFFER_ERR
+#undef __CREATE_IMAGE2D_ERR
+#undef __CREATE_IMAGE3D_ERR
+#undef __CREATE_SAMPLER_ERR
+#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR
+
+#undef __CREATE_USER_EVENT_ERR
+#undef __SET_USER_EVENT_STATUS_ERR
+#undef __SET_EVENT_CALLBACK_ERR
+#undef __SET_PRINTF_CALLBACK_ERR
+
+#undef __WAIT_FOR_EVENTS_ERR
+
+#undef __CREATE_KERNEL_ERR
+#undef __SET_KERNEL_ARGS_ERR
+#undef __CREATE_PROGRAM_WITH_SOURCE_ERR
+#undef __CREATE_PROGRAM_WITH_BINARY_ERR
+#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR
+#undef __BUILD_PROGRAM_ERR
+#undef __CREATE_KERNELS_IN_PROGRAM_ERR
+
+#undef __CREATE_COMMAND_QUEUE_ERR
+#undef __SET_COMMAND_QUEUE_PROPERTY_ERR
+#undef __ENQUEUE_READ_BUFFER_ERR
+#undef __ENQUEUE_WRITE_BUFFER_ERR
+#undef __ENQUEUE_READ_BUFFER_RECT_ERR
+#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR
+#undef __ENQEUE_COPY_BUFFER_ERR
+#undef __ENQEUE_COPY_BUFFER_RECT_ERR
+#undef __ENQUEUE_READ_IMAGE_ERR
+#undef __ENQUEUE_WRITE_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR
+#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR
+#undef __ENQUEUE_MAP_BUFFER_ERR
+#undef __ENQUEUE_MAP_IMAGE_ERR
+#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR
+#undef __ENQUEUE_NDRANGE_KERNEL_ERR
+#undef __ENQUEUE_TASK_ERR
+#undef __ENQUEUE_NATIVE_KERNEL
+
+#undef __CL_EXPLICIT_CONSTRUCTORS
+
+#undef __UNLOAD_COMPILER_ERR
+#endif //__CL_USER_OVERRIDE_ERROR_STRINGS
+
+#undef __CL_FUNCTION_TYPE
+
+// Extensions
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_VERSION_1_1)
+#undef __INIT_CL_EXT_FCN_PTR
+#endif // #if defined(CL_VERSION_1_1)
+#undef __CREATE_SUB_DEVICES
+
+#if defined(USE_CL_DEVICE_FISSION)
+#undef __PARAM_NAME_DEVICE_FISSION
+#endif // USE_CL_DEVICE_FISSION
+
+#undef __DEFAULT_NOT_INITIALIZED
+#undef __DEFAULT_BEING_INITIALIZED
+#undef __DEFAULT_INITIALIZED
+
+} // namespace cl
+
+#ifdef _WIN32
+#pragma pop_macro("max")
+#endif // _WIN32
+
+#endif // CL_HPP_
diff --git a/include/CL/cl_d3d10.h b/include/CL/cl_d3d10.h
new file mode 100644
index 0000000..b6c90b3
--- /dev/null
+++ b/include/CL/cl_d3d10.h
@@ -0,0 +1,126 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_D3D10_H
+#define __OPENCL_CL_D3D10_H
+
+#include <d3d10.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d10_sharing */
+#define cl_khr_d3d10_sharing 1
+
+typedef cl_uint cl_d3d10_device_source_khr;
+typedef cl_uint cl_d3d10_device_set_khr;
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_D3D10_DEVICE_KHR -1002
+#define CL_INVALID_D3D10_RESOURCE_KHR -1003
+#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
+#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
+
+/* cl_d3d10_device_source_nv */
+#define CL_D3D10_DEVICE_KHR 0x4010
+#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
+
+/* cl_d3d10_device_set_nv */
+#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
+#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
+
+/* cl_context_info */
+#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
+#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
+
+/* cl_mem_info */
+#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
+
+/* cl_image_info */
+#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
+#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
+ cl_platform_id platform,
+ cl_d3d10_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d10_device_set_khr d3d_device_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Buffer * resource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Texture2D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Texture3D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_D3D10_H */
+
diff --git a/include/CL/cl_d3d11.h b/include/CL/cl_d3d11.h
new file mode 100644
index 0000000..2e0a63f
--- /dev/null
+++ b/include/CL/cl_d3d11.h
@@ -0,0 +1,126 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_D3D11_H
+#define __OPENCL_CL_D3D11_H
+
+#include <d3d11.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d11_sharing */
+#define cl_khr_d3d11_sharing 1
+
+typedef cl_uint cl_d3d11_device_source_khr;
+typedef cl_uint cl_d3d11_device_set_khr;
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_D3D11_DEVICE_KHR -1006
+#define CL_INVALID_D3D11_RESOURCE_KHR -1007
+#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
+#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
+
+/* cl_d3d11_device_source */
+#define CL_D3D11_DEVICE_KHR 0x4019
+#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
+
+/* cl_d3d11_device_set */
+#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
+#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
+
+/* cl_context_info */
+#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
+#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
+
+/* cl_mem_info */
+#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
+
+/* cl_image_info */
+#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
+#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
+ cl_platform_id platform,
+ cl_d3d11_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d11_device_set_khr d3d_device_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D11Buffer * resource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D11Texture2D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D11Texture3D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_D3D11_H */
+
diff --git a/include/CL/cl_dx9_media_sharing.h b/include/CL/cl_dx9_media_sharing.h
new file mode 100644
index 0000000..23f1631
--- /dev/null
+++ b/include/CL/cl_dx9_media_sharing.h
@@ -0,0 +1,127 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
+#define __OPENCL_CL_DX9_MEDIA_SHARING_H
+
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+/* cl_khr_dx9_media_sharing */
+#define cl_khr_dx9_media_sharing 1
+
+typedef cl_uint cl_dx9_media_adapter_type_khr;
+typedef cl_uint cl_dx9_media_adapter_set_khr;
+
+#if defined(_WIN32)
+#include <d3d9.h>
+typedef struct _cl_dx9_surface_info_khr
+{
+ IDirect3DSurface9 *resource;
+ HANDLE shared_handle;
+} cl_dx9_surface_info_khr;
+#endif
+
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
+#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
+#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
+#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
+
+/* cl_media_adapter_type_khr */
+#define CL_ADAPTER_D3D9_KHR 0x2020
+#define CL_ADAPTER_D3D9EX_KHR 0x2021
+#define CL_ADAPTER_DXVA_KHR 0x2022
+
+/* cl_media_adapter_set_khr */
+#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
+#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
+
+/* cl_context_info */
+#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
+#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
+#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
+
+/* cl_mem_info */
+#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
+#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
+
+/* cl_image_info */
+#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
+#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
+ cl_platform_id platform,
+ cl_uint num_media_adapters,
+ cl_dx9_media_adapter_type_khr * media_adapter_type,
+ void * media_adapters,
+ cl_dx9_media_adapter_set_khr media_adapter_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ cl_dx9_media_adapter_type_khr adapter_type,
+ void * surface_info,
+ cl_uint plane,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
+
diff --git a/include/CL/cl_egl.h b/include/CL/cl_egl.h
new file mode 100644
index 0000000..93e6c9c
--- /dev/null
+++ b/include/CL/cl_egl.h
@@ -0,0 +1,133 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_EGL_H
+#define __OPENCL_CL_EGL_H
+
+#ifdef __APPLE__
+
+#else
+#include <CL/cl.h>
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
+#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
+#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
+#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
+
+/* Error type for clCreateFromEGLImageKHR */
+#define CL_INVALID_EGL_OBJECT_KHR -1093
+#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
+
+/* CLeglImageKHR is an opaque handle to an EGLImage */
+typedef void* CLeglImageKHR;
+
+/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
+typedef void* CLeglDisplayKHR;
+
+/* CLeglSyncKHR is an opaque handle to an EGLSync object */
+typedef void* CLeglSyncKHR;
+
+/* properties passed to clCreateFromEGLImageKHR */
+typedef intptr_t cl_egl_image_properties_khr;
+
+
+#define cl_khr_egl_image 1
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromEGLImageKHR(cl_context /* context */,
+ CLeglDisplayKHR /* egldisplay */,
+ CLeglImageKHR /* eglimage */,
+ cl_mem_flags /* flags */,
+ const cl_egl_image_properties_khr * /* properties */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
+ cl_context context,
+ CLeglDisplayKHR egldisplay,
+ CLeglImageKHR eglimage,
+ cl_mem_flags flags,
+ const cl_egl_image_properties_khr * properties,
+ cl_int * errcode_ret);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event);
+
+
+#define cl_khr_egl_event 1
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromEGLSyncKHR(cl_context /* context */,
+ CLeglSyncKHR /* sync */,
+ CLeglDisplayKHR /* display */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
+ cl_context context,
+ CLeglSyncKHR sync,
+ CLeglDisplayKHR display,
+ cl_int * errcode_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_EGL_H */
diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
new file mode 100644
index 0000000..710bea8
--- /dev/null
+++ b/include/CL/cl_ext.h
@@ -0,0 +1,316 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
+
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies. */
+
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+ #include <OpenCL/cl.h>
+ #include <AvailabilityMacros.h>
+#else
+ #include <CL/cl.h>
+#endif
+
+/* cl_khr_fp16 extension - no extension #define since it has no functions */
+#define CL_DEVICE_HALF_FP_CONFIG 0x1033
+
+/* Memory object destruction
+ *
+ * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources
+ * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in
+ * which they were registered. The user callback functions are called and then the memory object is deleted
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL api's with the cl_mem object passed to the pfn_notify.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
+ void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+ void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * clLogMessagesToSystemLog fowards on all log messages to the Apple System Logger
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+
+/************************
+* cl_khr_icd extension *
+************************/
+#define cl_khr_icd 1
+
+/* cl_platform_info */
+#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
+
+/* Additional Error Codes */
+#define CL_PLATFORM_NOT_FOUND_KHR -1001
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
+ cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */);
+
+
+/* Extension: cl_khr_image2D_buffer
+ *
+ * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
+ * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
+ * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
+ * and 2D images created from a buffer. Similarly, the write_image built-ins are also supported
+ * for 2D images created from a buffer.
+ *
+ * When the 2D image from buffer is created, the client must specify the width,
+ * height, image format (i.e. channel order and channel data type) and optionally the row pitch
+ *
+ * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
+ * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
+ */
+
+/*************************************
+ * cl_khr_initalize_memory extension *
+ *************************************/
+
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x200E
+
+
+/**************************************
+ * cl_khr_terminate_context extension *
+ **************************************/
+
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x200F
+#define CL_CONTEXT_TERMINATE_KHR 0x2010
+
+#define cl_khr_terminate_context 1
+extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+
+/*
+ * Extension: cl_khr_spir
+ *
+ * This extension adds support to create an OpenCL program object from a
+ * Standard Portable Intermediate Representation (SPIR) instance
+ */
+
+#define CL_DEVICE_SPIR_VERSIONS 0x40E0
+#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1
+
+
+/******************************************
+* cl_nv_device_attribute_query extension *
+******************************************/
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
+#define CL_DEVICE_WARP_SIZE_NV 0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
+
+/*********************************
+* cl_amd_device_attribute_query *
+*********************************/
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
+
+/*********************************
+* cl_arm_printf extension
+*********************************/
+#define CL_PRINTF_CALLBACK_ARM 0x40B0
+#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1
+
+#ifdef CL_VERSION_1_1
+ /***********************************
+ * cl_ext_device_fission extension *
+ ***********************************/
+ #define cl_ext_device_fission 1
+
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef cl_ulong cl_device_partition_property_ext;
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clCreateSubDevicesEXT( cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ ( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ /* cl_device_partition_property_ext */
+ #define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
+ #define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
+ #define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
+ #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
+
+ /* clDeviceGetInfo selectors */
+ #define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
+ #define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
+ #define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
+ #define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
+ #define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
+
+ /* error codes */
+ #define CL_DEVICE_PARTITION_FAILED_EXT -1057
+ #define CL_INVALID_PARTITION_COUNT_EXT -1058
+ #define CL_INVALID_PARTITION_NAME_EXT -1059
+
+ /* CL_AFFINITY_DOMAINs */
+ #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
+ #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
+ #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
+ #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
+ #define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
+ #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
+
+ /* cl_device_partition_property_ext list terminators */
+ #define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
+ #define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
+ #define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
+
+/*********************************
+* cl_qcom_ext_host_ptr extension
+*********************************/
+
+#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29)
+
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
+#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3
+#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7
+
+typedef cl_uint cl_image_pitch_info_qcom;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id device,
+ size_t image_width,
+ size_t image_height,
+ const cl_image_format *image_format,
+ cl_image_pitch_info_qcom param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret);
+
+typedef struct _cl_mem_ext_host_ptr
+{
+ /* Type of external memory allocation. */
+ /* Legal values will be defined in layered extensions. */
+ cl_uint allocation_type;
+
+ /* Host cache policy for this external memory allocation. */
+ cl_uint host_cache_policy;
+
+} cl_mem_ext_host_ptr;
+
+/*********************************
+* cl_qcom_ion_host_ptr extension
+*********************************/
+
+#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8
+
+typedef struct _cl_mem_ion_host_ptr
+{
+ /* Type of external memory allocation. */
+ /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
+ cl_mem_ext_host_ptr ext_host_ptr;
+
+ /* ION file descriptor */
+ int ion_filedesc;
+
+ /* Host pointer to the ION allocated memory */
+ void* ion_hostptr;
+
+} cl_mem_ion_host_ptr;
+
+#endif /* CL_VERSION_1_1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* __CL_EXT_H */
diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h
new file mode 100644
index 0000000..e52c1b6
--- /dev/null
+++ b/include/CL/cl_gl.h
@@ -0,0 +1,162 @@
+/**********************************************************************************
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_uint cl_gl_object_type;
+typedef cl_uint cl_gl_texture_info;
+typedef cl_uint cl_gl_platform_info;
+typedef struct __GLsync *cl_GLsync;
+
+/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
+#define CL_GL_OBJECT_BUFFER 0x2000
+#define CL_GL_OBJECT_TEXTURE2D 0x2001
+#define CL_GL_OBJECT_TEXTURE3D 0x2002
+#define CL_GL_OBJECT_RENDERBUFFER 0x2003
+#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
+#define CL_GL_OBJECT_TEXTURE1D 0x200F
+#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
+#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
+
+/* cl_gl_texture_info */
+#define CL_GL_TEXTURE_TARGET 0x2004
+#define CL_GL_MIPMAP_LEVEL 0x2005
+#define CL_GL_NUM_SAMPLES 0x2012
+
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLuint /* bufobj */,
+ int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLuint /* renderbuffer */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(cl_mem /* memobj */,
+ cl_gl_object_type * /* gl_object_type */,
+ cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo(cl_mem /* memobj */,
+ cl_gl_texture_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+
+/* Deprecated OpenCL 1.1 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture2D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture3D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+/* cl_khr_gl_sharing extension */
+
+#define cl_khr_gl_sharing 1
+
+typedef cl_uint cl_gl_context_info;
+
+/* Additional Error Codes */
+#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
+
+/* cl_gl_context_info */
+#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
+#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
+
+/* Additional cl_context_properties */
+#define CL_GL_CONTEXT_KHR 0x2008
+#define CL_EGL_DISPLAY_KHR 0x2009
+#define CL_GLX_DISPLAY_KHR 0x200A
+#define CL_WGL_HDC_KHR 0x200B
+#define CL_CGL_SHAREGROUP_KHR 0x200C
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
+ cl_gl_context_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
+ const cl_context_properties * properties,
+ cl_gl_context_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_GL_H */
diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h
new file mode 100644
index 0000000..77d5353
--- /dev/null
+++ b/include/CL/cl_gl_ext.h
@@ -0,0 +1,69 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */
+/* OpenGL dependencies. */
+
+#ifndef __OPENCL_CL_GL_EXT_H
+#define __OPENCL_CL_GL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+ #include <OpenCL/cl_gl.h>
+#else
+ #include <CL/cl_gl.h>
+#endif
+
+/*
+ * For each extension, follow this template
+ * cl_VEN_extname extension */
+/* #define cl_VEN_extname 1
+ * ... define new types, if any
+ * ... define new tokens, if any
+ * ... define new APIs, if any
+ *
+ * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
+ * This allows us to avoid having to decide whether to include GL headers or GLES here.
+ */
+
+/*
+ * cl_khr_gl_event extension
+ * See section 9.9 in the OpenCL 1.1 spec for more information
+ */
+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(cl_context /* context */,
+ cl_GLsync /* cl_GLsync */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_GL_EXT_H */
diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h
new file mode 100644
index 0000000..f2fe9d4
--- /dev/null
+++ b/include/CL/cl_intel.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __OPENCL_CL_INTEL_H
+#define __OPENCL_CL_INTEL_H
+
+#include "CL/cl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CL_MEM_PINNABLE (1 << 10)
+
+/* Track allocations and report current number of unfreed allocations */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReportUnfreedIntel(void);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clReportUnfreedIntel_fn)(void);
+
+/* 1 to 1 mapping of drm_intel_bo_map */
+extern CL_API_ENTRY void* CL_API_CALL
+clMapBufferIntel(cl_mem, cl_int*);
+
+typedef CL_API_ENTRY void* (CL_API_CALL *clMapBufferIntel_fn)(cl_mem, cl_int*);
+
+/* 1 to 1 mapping of drm_intel_bo_unmap */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnmapBufferIntel(cl_mem);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clUnmapBufferIntel_fn)(cl_mem);
+
+/* 1 to 1 mapping of drm_intel_gem_bo_map_gtt */
+extern CL_API_ENTRY void* CL_API_CALL
+clMapBufferGTTIntel(cl_mem, cl_int*);
+
+typedef CL_API_ENTRY void* (CL_API_CALL *clMapBufferGTTIntel_fn)(cl_mem, cl_int*);
+
+/* 1 to 1 mapping of drm_intel_gem_bo_unmap_gtt */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnmapBufferGTTIntel(cl_mem);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clUnmapBufferGTTIntel_fn)(cl_mem);
+
+/* Pin /Unpin the buffer in GPU memory (must be root) */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clPinBufferIntel(cl_mem);
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnpinBufferIntel(cl_mem);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clPinBufferIntel_fn)(cl_mem);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clUnpinBufferIntel_fn)(cl_mem);
+
+/* Get the generation of the Gen device (used to load the proper binary) */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGenVersionIntel(cl_device_id device, cl_int *ver);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGenVersionIntel_fn)(
+ cl_device_id device,
+ cl_int *ver);
+
+/* Create a program from a LLVM source file */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithLLVMIntel(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* file */,
+ cl_int * /* errcode_ret */);
+
+typedef CL_API_ENTRY cl_program (CL_API_CALL *clCreateProgramWithLLVMIntel_fn)(
+ cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* file */,
+ cl_int * /* errcode_ret */);
+
+/* Create buffer from libva's buffer object */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBufferFromLibvaIntel(cl_context /* context */,
+ unsigned int /* bo_name */,
+ cl_int * /* errcode_ret */);
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateBufferFromLibvaIntel_fn)(
+ cl_context /* context */,
+ unsigned int /* bo_name */,
+ cl_int * /* errcode_ret */);
+
+/* Create image from libva's buffer object */
+typedef struct _cl_libva_image {
+ unsigned int bo_name;
+ uint32_t offset;
+ uint32_t width;
+ uint32_t height;
+ cl_image_format fmt;
+ uint32_t row_pitch;
+ uint32_t reserved[8];
+} cl_libva_image;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImageFromLibvaIntel(cl_context /* context */,
+ const cl_libva_image * /* info */,
+ cl_int * /* errcode_ret */);
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateImageFromLibvaIntel_fn)(
+ cl_context /* context */,
+ const cl_libva_image * /* info */,
+ cl_int * /* errcode_ret */);
+
+/* Create buffer from libva's buffer object */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectFdIntel(cl_context /* context */,
+ cl_mem /* Memory Obejct */,
+ int* /* returned fd */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetMemObjectFdIntel_fn)(
+ cl_context /* context */,
+ cl_mem /* Memory Obejct */,
+ int* /* returned fd */);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_INTEL_H */
+
diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
new file mode 100644
index 0000000..7f6f5e8
--- /dev/null
+++ b/include/CL/cl_platform.h
@@ -0,0 +1,1278 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */
+
+#ifndef __CL_PLATFORM_H
+#define __CL_PLATFORM_H
+
+#ifdef __APPLE__
+ /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
+ #include <AvailabilityMacros.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+ #define CL_API_ENTRY
+ #define CL_API_CALL __stdcall
+ #define CL_CALLBACK __stdcall
+#else
+ #define CL_API_ENTRY
+ #define CL_API_CALL
+ #define CL_CALLBACK
+#endif
+
+#ifdef __APPLE__
+ #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import))
+ #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+ #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
+
+ #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
+ #else
+ #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #endif
+#else
+ #define CL_EXTENSION_WEAK_LINK
+ #define CL_API_SUFFIX__VERSION_1_0
+ #define CL_EXT_SUFFIX__VERSION_1_0
+ #define CL_API_SUFFIX__VERSION_1_1
+ #define CL_EXT_SUFFIX__VERSION_1_1
+ #define CL_API_SUFFIX__VERSION_1_2
+ #define CL_EXT_SUFFIX__VERSION_1_2
+
+ #ifdef __GNUC__
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated))
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #endif
+
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #endif
+ #elif _WIN32
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated)
+ #endif
+
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)
+ #endif
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #endif
+#endif
+
+#if (defined (_WIN32) && defined(_MSC_VER))
+
+/* scalar types */
+typedef signed __int8 cl_char;
+typedef unsigned __int8 cl_uchar;
+typedef signed __int16 cl_short;
+typedef unsigned __int16 cl_ushort;
+typedef signed __int32 cl_int;
+typedef unsigned __int32 cl_uint;
+typedef signed __int64 cl_long;
+typedef unsigned __int64 cl_ulong;
+
+typedef unsigned __int16 cl_half;
+typedef float cl_float;
+typedef double cl_double;
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT 8
+#define CL_SCHAR_MAX 127
+#define CL_SCHAR_MIN (-127-1)
+#define CL_CHAR_MAX CL_SCHAR_MAX
+#define CL_CHAR_MIN CL_SCHAR_MIN
+#define CL_UCHAR_MAX 255
+#define CL_SHRT_MAX 32767
+#define CL_SHRT_MIN (-32767-1)
+#define CL_USHRT_MAX 65535
+#define CL_INT_MAX 2147483647
+#define CL_INT_MIN (-2147483647-1)
+#define CL_UINT_MAX 0xffffffffU
+#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG 6
+#define CL_FLT_MANT_DIG 24
+#define CL_FLT_MAX_10_EXP +38
+#define CL_FLT_MAX_EXP +128
+#define CL_FLT_MIN_10_EXP -37
+#define CL_FLT_MIN_EXP -125
+#define CL_FLT_RADIX 2
+#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN 1.175494350822287507969e-38f
+#define CL_FLT_EPSILON 0x1.0p-23f
+
+#define CL_DBL_DIG 15
+#define CL_DBL_MANT_DIG 53
+#define CL_DBL_MAX_10_EXP +308
+#define CL_DBL_MAX_EXP +1024
+#define CL_DBL_MIN_10_EXP -307
+#define CL_DBL_MIN_EXP -1021
+#define CL_DBL_RADIX 2
+#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN 2.225073858507201383090e-308
+#define CL_DBL_EPSILON 2.220446049250313080847e-16
+
+#define CL_M_E 2.718281828459045090796
+#define CL_M_LOG2E 1.442695040888963387005
+#define CL_M_LOG10E 0.434294481903251816668
+#define CL_M_LN2 0.693147180559945286227
+#define CL_M_LN10 2.302585092994045901094
+#define CL_M_PI 3.141592653589793115998
+#define CL_M_PI_2 1.570796326794896557999
+#define CL_M_PI_4 0.785398163397448278999
+#define CL_M_1_PI 0.318309886183790691216
+#define CL_M_2_PI 0.636619772367581382433
+#define CL_M_2_SQRTPI 1.128379167095512558561
+#define CL_M_SQRT2 1.414213562373095145475
+#define CL_M_SQRT1_2 0.707106781186547572737
+
+#define CL_M_E_F 2.71828174591064f
+#define CL_M_LOG2E_F 1.44269502162933f
+#define CL_M_LOG10E_F 0.43429449200630f
+#define CL_M_LN2_F 0.69314718246460f
+#define CL_M_LN10_F 2.30258512496948f
+#define CL_M_PI_F 3.14159274101257f
+#define CL_M_PI_2_F 1.57079637050629f
+#define CL_M_PI_4_F 0.78539818525314f
+#define CL_M_1_PI_F 0.31830987334251f
+#define CL_M_2_PI_F 0.63661974668503f
+#define CL_M_2_SQRTPI_F 1.12837922573090f
+#define CL_M_SQRT2_F 1.41421353816986f
+#define CL_M_SQRT1_2_F 0.70710676908493f
+
+#define CL_NAN (CL_INFINITY - CL_INFINITY)
+#define CL_HUGE_VALF ((cl_float) 1e50)
+#define CL_HUGE_VAL ((cl_double) 1e500)
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
+
+#else
+
+#include <stdint.h>
+
+/* scalar types */
+typedef int8_t cl_char;
+typedef uint8_t cl_uchar;
+typedef int16_t cl_short __attribute__((aligned(2)));
+typedef uint16_t cl_ushort __attribute__((aligned(2)));
+typedef int32_t cl_int __attribute__((aligned(4)));
+typedef uint32_t cl_uint __attribute__((aligned(4)));
+typedef int64_t cl_long __attribute__((aligned(8)));
+typedef uint64_t cl_ulong __attribute__((aligned(8)));
+
+typedef uint16_t cl_half __attribute__((aligned(2)));
+typedef float cl_float __attribute__((aligned(4)));
+typedef double cl_double __attribute__((aligned(8)));
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT 8
+#define CL_SCHAR_MAX 127
+#define CL_SCHAR_MIN (-127-1)
+#define CL_CHAR_MAX CL_SCHAR_MAX
+#define CL_CHAR_MIN CL_SCHAR_MIN
+#define CL_UCHAR_MAX 255
+#define CL_SHRT_MAX 32767
+#define CL_SHRT_MIN (-32767-1)
+#define CL_USHRT_MAX 65535
+#define CL_INT_MAX 2147483647
+#define CL_INT_MIN (-2147483647-1)
+#define CL_UINT_MAX 0xffffffffU
+#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG 6
+#define CL_FLT_MANT_DIG 24
+#define CL_FLT_MAX_10_EXP +38
+#define CL_FLT_MAX_EXP +128
+#define CL_FLT_MIN_10_EXP -37
+#define CL_FLT_MIN_EXP -125
+#define CL_FLT_RADIX 2
+#define CL_FLT_MAX 0x1.fffffep127f
+#define CL_FLT_MIN 0x1.0p-126f
+#define CL_FLT_EPSILON 0x1.0p-23f
+
+#define CL_DBL_DIG 15
+#define CL_DBL_MANT_DIG 53
+#define CL_DBL_MAX_10_EXP +308
+#define CL_DBL_MAX_EXP +1024
+#define CL_DBL_MIN_10_EXP -307
+#define CL_DBL_MIN_EXP -1021
+#define CL_DBL_RADIX 2
+#define CL_DBL_MAX 0x1.fffffffffffffp1023
+#define CL_DBL_MIN 0x1.0p-1022
+#define CL_DBL_EPSILON 0x1.0p-52
+
+#define CL_M_E 2.718281828459045090796
+#define CL_M_LOG2E 1.442695040888963387005
+#define CL_M_LOG10E 0.434294481903251816668
+#define CL_M_LN2 0.693147180559945286227
+#define CL_M_LN10 2.302585092994045901094
+#define CL_M_PI 3.141592653589793115998
+#define CL_M_PI_2 1.570796326794896557999
+#define CL_M_PI_4 0.785398163397448278999
+#define CL_M_1_PI 0.318309886183790691216
+#define CL_M_2_PI 0.636619772367581382433
+#define CL_M_2_SQRTPI 1.128379167095512558561
+#define CL_M_SQRT2 1.414213562373095145475
+#define CL_M_SQRT1_2 0.707106781186547572737
+
+#define CL_M_E_F 2.71828174591064f
+#define CL_M_LOG2E_F 1.44269502162933f
+#define CL_M_LOG10E_F 0.43429449200630f
+#define CL_M_LN2_F 0.69314718246460f
+#define CL_M_LN10_F 2.30258512496948f
+#define CL_M_PI_F 3.14159274101257f
+#define CL_M_PI_2_F 1.57079637050629f
+#define CL_M_PI_4_F 0.78539818525314f
+#define CL_M_1_PI_F 0.31830987334251f
+#define CL_M_2_PI_F 0.63661974668503f
+#define CL_M_2_SQRTPI_F 1.12837922573090f
+#define CL_M_SQRT2_F 1.41421353816986f
+#define CL_M_SQRT1_2_F 0.70710676908493f
+
+#if defined( __GNUC__ )
+ #define CL_HUGE_VALF __builtin_huge_valf()
+ #define CL_HUGE_VAL __builtin_huge_val()
+ #define CL_NAN __builtin_nanf( "" )
+#else
+ #define CL_HUGE_VALF ((cl_float) 1e50)
+ #define CL_HUGE_VAL ((cl_double) 1e500)
+ float nanf( const char * );
+ #define CL_NAN nanf( "" )
+#endif
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
+
+#endif
+
+#include <stddef.h>
+
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */
+typedef unsigned int cl_GLuint;
+typedef int cl_GLint;
+typedef unsigned int cl_GLenum;
+
+/*
+ * Vector types
+ *
+ * Note: OpenCL requires that all types be naturally aligned.
+ * This means that vector types must be naturally aligned.
+ * For example, a vector of four floats must be aligned to
+ * a 16 byte boundary (calculated as 4 * the natural 4-byte
+ * alignment of the float). The alignment qualifiers here
+ * will only function properly if your compiler supports them
+ * and if you don't actively work to defeat them. For example,
+ * in order for a cl_float4 to be 16 byte aligned in a struct,
+ * the start of the struct must itself be 16-byte aligned.
+ *
+ * Maintaining proper alignment is the user's responsibility.
+ */
+
+/* Define basic vector types */
+#if defined( __VEC__ )
+ #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+ typedef vector unsigned char __cl_uchar16;
+ typedef vector signed char __cl_char16;
+ typedef vector unsigned short __cl_ushort8;
+ typedef vector signed short __cl_short8;
+ typedef vector unsigned int __cl_uint4;
+ typedef vector signed int __cl_int4;
+ typedef vector float __cl_float4;
+ #define __CL_UCHAR16__ 1
+ #define __CL_CHAR16__ 1
+ #define __CL_USHORT8__ 1
+ #define __CL_SHORT8__ 1
+ #define __CL_UINT4__ 1
+ #define __CL_INT4__ 1
+ #define __CL_FLOAT4__ 1
+#endif
+
+#if defined( __SSE__ )
+ #if defined( __MINGW64__ )
+ #include <intrin.h>
+ #else
+ #include <xmmintrin.h>
+ #endif
+ #if defined( __GNUC__ )
+ typedef float __cl_float4 __attribute__((vector_size(16)));
+ #else
+ typedef __m128 __cl_float4;
+ #endif
+ #define __CL_FLOAT4__ 1
+#endif
+
+#if defined( __SSE2__ )
+ #if defined( __MINGW64__ )
+ #include <intrin.h>
+ #else
+ #include <emmintrin.h>
+ #endif
+ #if defined( __GNUC__ )
+ typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16)));
+ typedef cl_char __cl_char16 __attribute__((vector_size(16)));
+ typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16)));
+ typedef cl_short __cl_short8 __attribute__((vector_size(16)));
+ typedef cl_uint __cl_uint4 __attribute__((vector_size(16)));
+ typedef cl_int __cl_int4 __attribute__((vector_size(16)));
+ typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16)));
+ typedef cl_long __cl_long2 __attribute__((vector_size(16)));
+ typedef cl_double __cl_double2 __attribute__((vector_size(16)));
+ #else
+ typedef __m128i __cl_uchar16;
+ typedef __m128i __cl_char16;
+ typedef __m128i __cl_ushort8;
+ typedef __m128i __cl_short8;
+ typedef __m128i __cl_uint4;
+ typedef __m128i __cl_int4;
+ typedef __m128i __cl_ulong2;
+ typedef __m128i __cl_long2;
+ typedef __m128d __cl_double2;
+ #endif
+ #define __CL_UCHAR16__ 1
+ #define __CL_CHAR16__ 1
+ #define __CL_USHORT8__ 1
+ #define __CL_SHORT8__ 1
+ #define __CL_INT4__ 1
+ #define __CL_UINT4__ 1
+ #define __CL_ULONG2__ 1
+ #define __CL_LONG2__ 1
+ #define __CL_DOUBLE2__ 1
+#endif
+
+#if defined( __MMX__ )
+ #include <mmintrin.h>
+ #if defined( __GNUC__ )
+ typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8)));
+ typedef cl_char __cl_char8 __attribute__((vector_size(8)));
+ typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8)));
+ typedef cl_short __cl_short4 __attribute__((vector_size(8)));
+ typedef cl_uint __cl_uint2 __attribute__((vector_size(8)));
+ typedef cl_int __cl_int2 __attribute__((vector_size(8)));
+ typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8)));
+ typedef cl_long __cl_long1 __attribute__((vector_size(8)));
+ typedef cl_float __cl_float2 __attribute__((vector_size(8)));
+ #else
+ typedef __m64 __cl_uchar8;
+ typedef __m64 __cl_char8;
+ typedef __m64 __cl_ushort4;
+ typedef __m64 __cl_short4;
+ typedef __m64 __cl_uint2;
+ typedef __m64 __cl_int2;
+ typedef __m64 __cl_ulong1;
+ typedef __m64 __cl_long1;
+ typedef __m64 __cl_float2;
+ #endif
+ #define __CL_UCHAR8__ 1
+ #define __CL_CHAR8__ 1
+ #define __CL_USHORT4__ 1
+ #define __CL_SHORT4__ 1
+ #define __CL_INT2__ 1
+ #define __CL_UINT2__ 1
+ #define __CL_ULONG1__ 1
+ #define __CL_LONG1__ 1
+ #define __CL_FLOAT2__ 1
+#endif
+
+#if defined( __AVX__ )
+ #if defined( __MINGW64__ )
+ #include <intrin.h>
+ #else
+ #include <immintrin.h>
+ #endif
+ #if defined( __GNUC__ )
+ typedef cl_float __cl_float8 __attribute__((vector_size(32)));
+ typedef cl_double __cl_double4 __attribute__((vector_size(32)));
+ #else
+ typedef __m256 __cl_float8;
+ typedef __m256d __cl_double4;
+ #endif
+ #define __CL_FLOAT8__ 1
+ #define __CL_DOUBLE4__ 1
+#endif
+
+/* Define capabilities for anonymous struct members. */
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#define __CL_HAS_ANON_STRUCT__ 1
+#define __CL_ANON_STRUCT__ __extension__
+#elif defined( _WIN32) && (_MSC_VER >= 1500)
+ /* Microsoft Developer Studio 2008 supports anonymous structs, but
+ * complains by default. */
+#define __CL_HAS_ANON_STRUCT__ 1
+#define __CL_ANON_STRUCT__
+ /* Disable warning C4201: nonstandard extension used : nameless
+ * struct/union */
+#pragma warning( push )
+#pragma warning( disable : 4201 )
+#else
+#define __CL_HAS_ANON_STRUCT__ 0
+#define __CL_ANON_STRUCT__
+#endif
+
+/* Define alignment keys */
+#if defined( __GNUC__ )
+ #define CL_ALIGNED(_x) __attribute__ ((aligned(_x)))
+#elif defined( _WIN32) && (_MSC_VER)
+ /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */
+ /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */
+ /* #include <crtdefs.h> */
+ /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */
+ #define CL_ALIGNED(_x)
+#else
+ #warning Need to implement some method to align data here
+ #define CL_ALIGNED(_x)
+#endif
+
+/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
+#if __CL_HAS_ANON_STRUCT__
+ /* .xyzw and .s0123...{f|F} are supported */
+ #define CL_HAS_NAMED_VECTOR_FIELDS 1
+ /* .hi and .lo are supported */
+ #define CL_HAS_HI_LO_VECTOR_FIELDS 1
+#endif
+
+/* Define cl_vector types */
+
+/* ---- cl_charn ---- */
+typedef union
+{
+ cl_char CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_char x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_char lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2;
+#endif
+}cl_char2;
+
+typedef union
+{
+ cl_char CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2[2];
+#endif
+#if defined( __CL_CHAR4__)
+ __cl_char4 v4;
+#endif
+}cl_char4;
+
+/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
+typedef cl_char4 cl_char3;
+
+typedef union
+{
+ cl_char CL_ALIGNED(8) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2[4];
+#endif
+#if defined( __CL_CHAR4__)
+ __cl_char4 v4[2];
+#endif
+#if defined( __CL_CHAR8__ )
+ __cl_char8 v8;
+#endif
+}cl_char8;
+
+typedef union
+{
+ cl_char CL_ALIGNED(16) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2[8];
+#endif
+#if defined( __CL_CHAR4__)
+ __cl_char4 v4[4];
+#endif
+#if defined( __CL_CHAR8__ )
+ __cl_char8 v8[2];
+#endif
+#if defined( __CL_CHAR16__ )
+ __cl_char16 v16;
+#endif
+}cl_char16;
+
+
+/* ---- cl_ucharn ---- */
+typedef union
+{
+ cl_uchar CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; };
+#endif
+#if defined( __cl_uchar2__)
+ __cl_uchar2 v2;
+#endif
+}cl_uchar2;
+
+typedef union
+{
+ cl_uchar CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2[2];
+#endif
+#if defined( __CL_UCHAR4__)
+ __cl_uchar4 v4;
+#endif
+}cl_uchar4;
+
+/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
+typedef cl_uchar4 cl_uchar3;
+
+typedef union
+{
+ cl_uchar CL_ALIGNED(8) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2[4];
+#endif
+#if defined( __CL_UCHAR4__)
+ __cl_uchar4 v4[2];
+#endif
+#if defined( __CL_UCHAR8__ )
+ __cl_uchar8 v8;
+#endif
+}cl_uchar8;
+
+typedef union
+{
+ cl_uchar CL_ALIGNED(16) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2[8];
+#endif
+#if defined( __CL_UCHAR4__)
+ __cl_uchar4 v4[4];
+#endif
+#if defined( __CL_UCHAR8__ )
+ __cl_uchar8 v8[2];
+#endif
+#if defined( __CL_UCHAR16__ )
+ __cl_uchar16 v16;
+#endif
+}cl_uchar16;
+
+
+/* ---- cl_shortn ---- */
+typedef union
+{
+ cl_short CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_short x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_short lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2;
+#endif
+}cl_short2;
+
+typedef union
+{
+ cl_short CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2[2];
+#endif
+#if defined( __CL_SHORT4__)
+ __cl_short4 v4;
+#endif
+}cl_short4;
+
+/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
+typedef cl_short4 cl_short3;
+
+typedef union
+{
+ cl_short CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2[4];
+#endif
+#if defined( __CL_SHORT4__)
+ __cl_short4 v4[2];
+#endif
+#if defined( __CL_SHORT8__ )
+ __cl_short8 v8;
+#endif
+}cl_short8;
+
+typedef union
+{
+ cl_short CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2[8];
+#endif
+#if defined( __CL_SHORT4__)
+ __cl_short4 v4[4];
+#endif
+#if defined( __CL_SHORT8__ )
+ __cl_short8 v8[2];
+#endif
+#if defined( __CL_SHORT16__ )
+ __cl_short16 v16;
+#endif
+}cl_short16;
+
+
+/* ---- cl_ushortn ---- */
+typedef union
+{
+ cl_ushort CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2;
+#endif
+}cl_ushort2;
+
+typedef union
+{
+ cl_ushort CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2[2];
+#endif
+#if defined( __CL_USHORT4__)
+ __cl_ushort4 v4;
+#endif
+}cl_ushort4;
+
+/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
+typedef cl_ushort4 cl_ushort3;
+
+typedef union
+{
+ cl_ushort CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2[4];
+#endif
+#if defined( __CL_USHORT4__)
+ __cl_ushort4 v4[2];
+#endif
+#if defined( __CL_USHORT8__ )
+ __cl_ushort8 v8;
+#endif
+}cl_ushort8;
+
+typedef union
+{
+ cl_ushort CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2[8];
+#endif
+#if defined( __CL_USHORT4__)
+ __cl_ushort4 v4[4];
+#endif
+#if defined( __CL_USHORT8__ )
+ __cl_ushort8 v8[2];
+#endif
+#if defined( __CL_USHORT16__ )
+ __cl_ushort16 v16;
+#endif
+}cl_ushort16;
+
+/* ---- cl_intn ---- */
+typedef union
+{
+ cl_int CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_int x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_int lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2;
+#endif
+}cl_int2;
+
+typedef union
+{
+ cl_int CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2[2];
+#endif
+#if defined( __CL_INT4__)
+ __cl_int4 v4;
+#endif
+}cl_int4;
+
+/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
+typedef cl_int4 cl_int3;
+
+typedef union
+{
+ cl_int CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2[4];
+#endif
+#if defined( __CL_INT4__)
+ __cl_int4 v4[2];
+#endif
+#if defined( __CL_INT8__ )
+ __cl_int8 v8;
+#endif
+}cl_int8;
+
+typedef union
+{
+ cl_int CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2[8];
+#endif
+#if defined( __CL_INT4__)
+ __cl_int4 v4[4];
+#endif
+#if defined( __CL_INT8__ )
+ __cl_int8 v8[2];
+#endif
+#if defined( __CL_INT16__ )
+ __cl_int16 v16;
+#endif
+}cl_int16;
+
+
+/* ---- cl_uintn ---- */
+typedef union
+{
+ cl_uint CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2;
+#endif
+}cl_uint2;
+
+typedef union
+{
+ cl_uint CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2[2];
+#endif
+#if defined( __CL_UINT4__)
+ __cl_uint4 v4;
+#endif
+}cl_uint4;
+
+/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
+typedef cl_uint4 cl_uint3;
+
+typedef union
+{
+ cl_uint CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2[4];
+#endif
+#if defined( __CL_UINT4__)
+ __cl_uint4 v4[2];
+#endif
+#if defined( __CL_UINT8__ )
+ __cl_uint8 v8;
+#endif
+}cl_uint8;
+
+typedef union
+{
+ cl_uint CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2[8];
+#endif
+#if defined( __CL_UINT4__)
+ __cl_uint4 v4[4];
+#endif
+#if defined( __CL_UINT8__ )
+ __cl_uint8 v8[2];
+#endif
+#if defined( __CL_UINT16__ )
+ __cl_uint16 v16;
+#endif
+}cl_uint16;
+
+/* ---- cl_longn ---- */
+typedef union
+{
+ cl_long CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_long x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_long lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2;
+#endif
+}cl_long2;
+
+typedef union
+{
+ cl_long CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2[2];
+#endif
+#if defined( __CL_LONG4__)
+ __cl_long4 v4;
+#endif
+}cl_long4;
+
+/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
+typedef cl_long4 cl_long3;
+
+typedef union
+{
+ cl_long CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2[4];
+#endif
+#if defined( __CL_LONG4__)
+ __cl_long4 v4[2];
+#endif
+#if defined( __CL_LONG8__ )
+ __cl_long8 v8;
+#endif
+}cl_long8;
+
+typedef union
+{
+ cl_long CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2[8];
+#endif
+#if defined( __CL_LONG4__)
+ __cl_long4 v4[4];
+#endif
+#if defined( __CL_LONG8__ )
+ __cl_long8 v8[2];
+#endif
+#if defined( __CL_LONG16__ )
+ __cl_long16 v16;
+#endif
+}cl_long16;
+
+
+/* ---- cl_ulongn ---- */
+typedef union
+{
+ cl_ulong CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2;
+#endif
+}cl_ulong2;
+
+typedef union
+{
+ cl_ulong CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2[2];
+#endif
+#if defined( __CL_ULONG4__)
+ __cl_ulong4 v4;
+#endif
+}cl_ulong4;
+
+/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
+typedef cl_ulong4 cl_ulong3;
+
+typedef union
+{
+ cl_ulong CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2[4];
+#endif
+#if defined( __CL_ULONG4__)
+ __cl_ulong4 v4[2];
+#endif
+#if defined( __CL_ULONG8__ )
+ __cl_ulong8 v8;
+#endif
+}cl_ulong8;
+
+typedef union
+{
+ cl_ulong CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2[8];
+#endif
+#if defined( __CL_ULONG4__)
+ __cl_ulong4 v4[4];
+#endif
+#if defined( __CL_ULONG8__ )
+ __cl_ulong8 v8[2];
+#endif
+#if defined( __CL_ULONG16__ )
+ __cl_ulong16 v16;
+#endif
+}cl_ulong16;
+
+
+/* --- cl_floatn ---- */
+
+typedef union
+{
+ cl_float CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_float x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_float lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2;
+#endif
+}cl_float2;
+
+typedef union
+{
+ cl_float CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2[2];
+#endif
+#if defined( __CL_FLOAT4__)
+ __cl_float4 v4;
+#endif
+}cl_float4;
+
+/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
+typedef cl_float4 cl_float3;
+
+typedef union
+{
+ cl_float CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2[4];
+#endif
+#if defined( __CL_FLOAT4__)
+ __cl_float4 v4[2];
+#endif
+#if defined( __CL_FLOAT8__ )
+ __cl_float8 v8;
+#endif
+}cl_float8;
+
+typedef union
+{
+ cl_float CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2[8];
+#endif
+#if defined( __CL_FLOAT4__)
+ __cl_float4 v4[4];
+#endif
+#if defined( __CL_FLOAT8__ )
+ __cl_float8 v8[2];
+#endif
+#if defined( __CL_FLOAT16__ )
+ __cl_float16 v16;
+#endif
+}cl_float16;
+
+/* --- cl_doublen ---- */
+
+typedef union
+{
+ cl_double CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_double x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2;
+#endif
+}cl_double2;
+
+typedef union
+{
+ cl_double CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2[2];
+#endif
+#if defined( __CL_DOUBLE4__)
+ __cl_double4 v4;
+#endif
+}cl_double4;
+
+/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
+typedef cl_double4 cl_double3;
+
+typedef union
+{
+ cl_double CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2[4];
+#endif
+#if defined( __CL_DOUBLE4__)
+ __cl_double4 v4[2];
+#endif
+#if defined( __CL_DOUBLE8__ )
+ __cl_double8 v8;
+#endif
+}cl_double8;
+
+typedef union
+{
+ cl_double CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2[8];
+#endif
+#if defined( __CL_DOUBLE4__)
+ __cl_double4 v4[4];
+#endif
+#if defined( __CL_DOUBLE8__ )
+ __cl_double8 v8[2];
+#endif
+#if defined( __CL_DOUBLE16__ )
+ __cl_double16 v16;
+#endif
+}cl_double16;
+
+/* Macro to facilitate debugging
+ * Usage:
+ * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
+ * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \"
+ * Each line thereafter of OpenCL C source must end with: \n\
+ * The last line ends in ";
+ *
+ * Example:
+ *
+ * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
+ * kernel void foo( int a, float * b ) \n\
+ * { \n\
+ * // my comment \n\
+ * *b[ get_global_id(0)] = a; \n\
+ * } \n\
+ * ";
+ *
+ * This should correctly set up the line, (column) and file information for your source
+ * string so you can do source level debugging.
+ */
+#define __CL_STRINGIFY( _x ) # _x
+#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x )
+#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef __CL_HAS_ANON_STRUCT__
+#undef __CL_ANON_STRUCT__
+#if defined( _WIN32) && (_MSC_VER >= 1500)
+#pragma warning( pop )
+#endif
+
+#endif /* __CL_PLATFORM_H */
diff --git a/include/CL/opencl.h b/include/CL/opencl.h
new file mode 100644
index 0000000..3f00524
--- /dev/null
+++ b/include/CL/opencl.h
@@ -0,0 +1,54 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_H
+#define __OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_gl.h>
+#include <OpenCL/cl_gl_ext.h>
+#include <OpenCL/cl_ext.h>
+
+#else
+
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+#include <CL/cl_gl_ext.h>
+#include <CL/cl_ext.h>
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_H */
+
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
new file mode 100644
index 0000000..4d0bed7
--- /dev/null
+++ b/include/CMakeLists.txt
@@ -0,0 +1,5 @@
+FILE(GLOB HEADER_FILES "CL/*.h")
+FILE(GLOB HPP_FILES "CL/*.hpp")
+
+install (FILES ${HEADER_FILES} DESTINATION include/CL)
+install (FILES ${HPP_FILES} DESTINATION include/CL)
diff --git a/intel-beignet.icd.in b/intel-beignet.icd.in
new file mode 100644
index 0000000..9b2e349
--- /dev/null
+++ b/intel-beignet.icd.in
@@ -0,0 +1 @@
+ at LIB_INSTALL_DIR@/beignet/libcl.so
diff --git a/kernels/buildin_work_dim.cl b/kernels/buildin_work_dim.cl
new file mode 100644
index 0000000..27c0e18
--- /dev/null
+++ b/kernels/buildin_work_dim.cl
@@ -0,0 +1,3 @@
+kernel void buildin_work_dim( __global int *ret ) {
+ *ret = get_work_dim();
+}
diff --git a/kernels/builtin_acos_asin.cl b/kernels/builtin_acos_asin.cl
new file mode 100644
index 0000000..bba2d21
--- /dev/null
+++ b/kernels/builtin_acos_asin.cl
@@ -0,0 +1,10 @@
+__kernel void builtin_acos_asin(__global float *dst, __global float *src, __global int *max_func) {
+ int i = get_global_id(0);
+ float x = src[i];
+
+ dst[i * (*max_func) + 0] = acos(x);
+ dst[i * (*max_func) + 1] = acosh(x);
+ dst[i * (*max_func) + 2] = asin(x);
+ dst[i * (*max_func) + 3] = asinh(x);
+ dst[i * (*max_func) + 4] = x;
+};
diff --git a/kernels/builtin_atan2.cl b/kernels/builtin_atan2.cl
new file mode 100644
index 0000000..aba73be
--- /dev/null
+++ b/kernels/builtin_atan2.cl
@@ -0,0 +1,4 @@
+kernel void builtin_atan2(global float *y, global float *x, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = atan2(y[i], x[i]);
+};
diff --git a/kernels/builtin_bitselect.cl b/kernels/builtin_bitselect.cl
new file mode 100644
index 0000000..9b60cbe
--- /dev/null
+++ b/kernels/builtin_bitselect.cl
@@ -0,0 +1,4 @@
+kernel void builtin_bitselect(global float *src1, global float *src2, global float *src3, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = bitselect(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/builtin_convert_sat.cl b/kernels/builtin_convert_sat.cl
new file mode 100644
index 0000000..1485f1d
--- /dev/null
+++ b/kernels/builtin_convert_sat.cl
@@ -0,0 +1,48 @@
+#define DEF(DSTTYPE, SRCTYPE) \
+ kernel void builtin_convert_ ## SRCTYPE ## _to_ ## DSTTYPE ## _sat(global SRCTYPE *src, global DSTTYPE *dst) { \
+ int i = get_global_id(0); \
+ dst[i] = convert_ ## DSTTYPE ## _sat(src[i]); \
+}
+
+DEF(char, uchar);
+DEF(char, short);
+DEF(char, ushort);
+DEF(char, int);
+DEF(char, uint);
+DEF(char, long);
+DEF(char, ulong);
+DEF(char, float);
+DEF(uchar, char);
+DEF(uchar, short);
+DEF(uchar, ushort);
+DEF(uchar, int);
+DEF(uchar, uint);
+DEF(uchar, long);
+DEF(uchar, ulong);
+DEF(uchar, float);
+DEF(short, ushort);
+DEF(short, int);
+DEF(short, uint);
+DEF(short, long);
+DEF(short, ulong);
+DEF(short, float);
+DEF(ushort, short);
+DEF(ushort, int);
+DEF(ushort, uint);
+DEF(ushort, long);
+DEF(ushort, ulong);
+DEF(ushort, float);
+DEF(int, uint);
+DEF(int, long);
+DEF(int, ulong);
+DEF(int, float);
+DEF(uint, int);
+DEF(uint, long);
+DEF(uint, ulong);
+DEF(uint, float);
+DEF(long, ulong);
+DEF(long, float);
+DEF(ulong, long);
+DEF(ulong, float);
+#undef DEF
+
diff --git a/kernels/builtin_exp.cl b/kernels/builtin_exp.cl
new file mode 100644
index 0000000..ecc1a3e
--- /dev/null
+++ b/kernels/builtin_exp.cl
@@ -0,0 +1,10 @@
+__kernel void builtin_exp(__global float *dst, __global float *src, __global int *max_func) {
+ int i = get_global_id(0);
+ float x = src[i];
+
+ dst[i * (*max_func) + 0] = exp(x);
+ dst[i * (*max_func) + 1] = exp2(x);
+ dst[i * (*max_func) + 2] = exp10(x);
+ dst[i * (*max_func) + 3] = expm1(x);
+ dst[i * (*max_func) + 4] = x;
+};
diff --git a/kernels/builtin_frexp.cl b/kernels/builtin_frexp.cl
new file mode 100644
index 0000000..766695a
--- /dev/null
+++ b/kernels/builtin_frexp.cl
@@ -0,0 +1,4 @@
+kernel void builtin_frexp(global float *src, global float *dst, global int *e) {
+ int i = get_global_id(0);
+ dst[i] = frexp(src[i], &e[i]);
+}
diff --git a/kernels/builtin_global_id.cl b/kernels/builtin_global_id.cl
new file mode 100644
index 0000000..5b82f9f
--- /dev/null
+++ b/kernels/builtin_global_id.cl
@@ -0,0 +1,4 @@
+kernel void builtin_global_id( __global int *ret) {
+ int id = get_global_id(0) + get_global_id(1)*3 + get_global_id(2)*3*4;
+ ret[id] = id;
+}
diff --git a/kernels/builtin_global_size.cl b/kernels/builtin_global_size.cl
new file mode 100644
index 0000000..e6ddb2f
--- /dev/null
+++ b/kernels/builtin_global_size.cl
@@ -0,0 +1,3 @@
+kernel void builtin_global_size( __global int *ret, __global int *i_dim ) {
+ *ret = get_global_size( *i_dim);
+}
diff --git a/kernels/builtin_lgamma.cl b/kernels/builtin_lgamma.cl
new file mode 100644
index 0000000..85bf859
--- /dev/null
+++ b/kernels/builtin_lgamma.cl
@@ -0,0 +1,4 @@
+kernel void builtin_lgamma(global float *src, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = lgamma(src[i]);
+};
diff --git a/kernels/builtin_lgamma_r.cl b/kernels/builtin_lgamma_r.cl
new file mode 100644
index 0000000..71fcc36
--- /dev/null
+++ b/kernels/builtin_lgamma_r.cl
@@ -0,0 +1,4 @@
+kernel void builtin_lgamma_r(global float *src, global float *dst, global int *signp) {
+ int i = get_global_id(0);
+ dst[i] = lgamma_r(src[i], signp+i);
+};
diff --git a/kernels/builtin_local_id.cl b/kernels/builtin_local_id.cl
new file mode 100644
index 0000000..489833d
--- /dev/null
+++ b/kernels/builtin_local_id.cl
@@ -0,0 +1,6 @@
+kernel void builtin_local_id( __global int *ret) {
+ int id = get_local_id(0) + get_group_id(0) * 2 + \
+ get_local_id(1) * 4 + get_group_id(1) * 12 +\
+ get_local_id(2) * 36 + get_group_id(2) * 144;
+ ret[id] = id;
+}
diff --git a/kernels/builtin_local_size.cl b/kernels/builtin_local_size.cl
new file mode 100644
index 0000000..979d907
--- /dev/null
+++ b/kernels/builtin_local_size.cl
@@ -0,0 +1,3 @@
+kernel void builtin_local_size( __global int *ret, __global int *i_dim ) {
+ *ret = get_local_size( *i_dim);
+}
diff --git a/kernels/builtin_mad_sat.cl b/kernels/builtin_mad_sat.cl
new file mode 100644
index 0000000..1739a4d
--- /dev/null
+++ b/kernels/builtin_mad_sat.cl
@@ -0,0 +1,4 @@
+kernel void builtin_mad_sat(global short *src1, global short *src2, global short *src3, global short *dst) {
+ short i = get_global_id(0);
+ dst[i] = mad_sat(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/builtin_modf.cl b/kernels/builtin_modf.cl
new file mode 100644
index 0000000..43630ed
--- /dev/null
+++ b/kernels/builtin_modf.cl
@@ -0,0 +1,6 @@
+kernel void builtin_modf(global float *src, global float *dst, global float *it) {
+ int i = get_global_id(0);
+ float x;
+ dst[i] = modf(src[i], &x);
+ it[i] = x;
+}
diff --git a/kernels/builtin_nextafter.cl b/kernels/builtin_nextafter.cl
new file mode 100644
index 0000000..3945e34
--- /dev/null
+++ b/kernels/builtin_nextafter.cl
@@ -0,0 +1,4 @@
+kernel void builtin_nextafter(global float *src1, global float *src2, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = nextafter(src1[i], src2[i]);
+}
diff --git a/kernels/builtin_num_groups.cl b/kernels/builtin_num_groups.cl
new file mode 100644
index 0000000..719d25d
--- /dev/null
+++ b/kernels/builtin_num_groups.cl
@@ -0,0 +1,3 @@
+kernel void builtin_num_groups( __global int *ret, __global int *i_dim ) {
+ *ret = get_num_groups( *i_dim);
+}
diff --git a/kernels/builtin_pow.cl b/kernels/builtin_pow.cl
new file mode 100644
index 0000000..17d753e
--- /dev/null
+++ b/kernels/builtin_pow.cl
@@ -0,0 +1,7 @@
+kernel void builtin_pow(global float *dst, global float *src1, global float *src2, global int *max_func) {
+
+ int i = get_global_id(0);
+ dst[i * (*max_func) + 0] = pow(src1[i], src2[i]);
+ dst[i * (*max_func) + 1] = src1[i];
+
+}
diff --git a/kernels/builtin_remquo.cl b/kernels/builtin_remquo.cl
new file mode 100644
index 0000000..d66c164
--- /dev/null
+++ b/kernels/builtin_remquo.cl
@@ -0,0 +1,6 @@
+kernel void builtin_remquo(global float *x, global float *y, global float *dst, global int *quo) {
+ int i = get_global_id(0);
+ int q;
+ dst[i] = remquo(x[i], y[i], & q);
+ quo[i] = q;
+}
diff --git a/kernels/builtin_shuffle.cl b/kernels/builtin_shuffle.cl
new file mode 100644
index 0000000..ad988b9
--- /dev/null
+++ b/kernels/builtin_shuffle.cl
@@ -0,0 +1,8 @@
+kernel void builtin_shuffle(global float *src1, global float *src2, global float *dst1, global float *dst2) {
+ int i = get_global_id(0);
+ float2 src = (float2)(src1[i], src2[i]);
+ uint2 mask = (uint2)(1, 0);
+ float2 dst = shuffle(src, mask);
+ dst1[i] = dst.s0;
+ dst2[i] = dst.s1;
+}
diff --git a/kernels/builtin_shuffle2.cl b/kernels/builtin_shuffle2.cl
new file mode 100644
index 0000000..1a122d4
--- /dev/null
+++ b/kernels/builtin_shuffle2.cl
@@ -0,0 +1,13 @@
+kernel void builtin_shuffle2(global float *src1, global float *src2, global float *dst1, global float *dst2) {
+ int i = get_global_id(0);
+ float2 x = (float2)(src1[i], src2[i]);
+ float2 y = (float2)(1234, 5678);
+ uint4 mask = (uint4)(1, 0, 0, 0);
+ float4 v1 = shuffle2(x, y, mask);
+ float16 x2 = 0;
+ float16 y2 = (float16)(src1[i], src2[i], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ uint16 mask2 = (uint16)(17, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ float16 v2 = shuffle2(x2, y2, mask2);
+ dst1[i] = v1.s0 + v2.s0;
+ dst2[i] = v1.s1 + v2.s1;
+}
diff --git a/kernels/builtin_sign.cl b/kernels/builtin_sign.cl
new file mode 100644
index 0000000..ff9a66b
--- /dev/null
+++ b/kernels/builtin_sign.cl
@@ -0,0 +1,4 @@
+kernel void builtin_sign(global float *src, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = sign(src[i]);
+}
diff --git a/kernels/builtin_sinpi.cl b/kernels/builtin_sinpi.cl
new file mode 100644
index 0000000..134152d
--- /dev/null
+++ b/kernels/builtin_sinpi.cl
@@ -0,0 +1,4 @@
+kernel void builtin_sinpi(global float *src, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = sinpi(src[i]);
+};
diff --git a/kernels/builtin_tgamma.cl b/kernels/builtin_tgamma.cl
new file mode 100644
index 0000000..1f7abc3
--- /dev/null
+++ b/kernels/builtin_tgamma.cl
@@ -0,0 +1,4 @@
+kernel void builtin_tgamma(global float *src, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = tgamma(src[i]);
+};
diff --git a/kernels/compare_image_2d_and_1d_array.cl b/kernels/compare_image_2d_and_1d_array.cl
new file mode 100644
index 0000000..6aabb43
--- /dev/null
+++ b/kernels/compare_image_2d_and_1d_array.cl
@@ -0,0 +1,13 @@
+__kernel void
+compare_image_2d_and_1d_array(image2d_t a1, image1d_array_t a2, sampler_t sampler)
+{
+ float2 coord;
+ int4 color1;
+ int4 color2;
+ coord.x = (float)get_global_id(0) + 0.3f;
+ coord.y = (float)get_global_id(1) + 0.3f;
+ color1 = read_imagei(a1, sampler, coord);
+ color2 = read_imagei(a2, sampler, coord);
+// printf("########## x y is (%f, %f), color1 is (%d %d %d %d), color2 is (%d %d %d %d)\n",
+// coord.x, coord.y, color1.x, color1.y, color1.z, color1.w, color2.x, color2.y, color2.z, color2.w);
+}
diff --git a/kernels/compiler_abs.cl b/kernels/compiler_abs.cl
new file mode 100644
index 0000000..549575c
--- /dev/null
+++ b/kernels/compiler_abs.cl
@@ -0,0 +1,28 @@
+#define COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
+ kernel void compiler_abs_##TYPE ( \
+ global TYPE* src, global UTYPE* dst) { \
+ int i = get_global_id(0); \
+ dst[i] = abs(src[i]); \
+ }
+
+#define COMPILER_ABS_FUNC_N(TYPE, UTYPE, N) \
+ kernel void compiler_abs_##TYPE##N ( \
+ global TYPE##N* src, global UTYPE##N* dst) { \
+ int i = get_global_id(0); \
+ dst[i] = abs(src[i]); \
+ }
+
+#define COMPILER_ABS(TYPE, UTYPE) \
+ COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
+ COMPILER_ABS_FUNC_N(TYPE, UTYPE, 2) \
+ COMPILER_ABS_FUNC_N(TYPE, UTYPE, 3) \
+ COMPILER_ABS_FUNC_N(TYPE, UTYPE, 4) \
+ COMPILER_ABS_FUNC_N(TYPE, UTYPE, 8) \
+ COMPILER_ABS_FUNC_N(TYPE, UTYPE, 16)
+
+COMPILER_ABS(int, uint)
+COMPILER_ABS(uint, uint)
+COMPILER_ABS(char, uchar)
+COMPILER_ABS(uchar, uchar)
+COMPILER_ABS(short, ushort)
+COMPILER_ABS(ushort, ushort)
diff --git a/kernels/compiler_abs_diff.cl b/kernels/compiler_abs_diff.cl
new file mode 100644
index 0000000..1f30df4
--- /dev/null
+++ b/kernels/compiler_abs_diff.cl
@@ -0,0 +1,30 @@
+#define COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
+ kernel void compiler_abs_diff_##TYPE ( \
+ global TYPE* x, global TYPE* y, global UTYPE* diff) { \
+ int i = get_global_id(0); \
+ diff[i] = abs_diff(x[i], y[i]); \
+ }
+
+#define COMPILER_ABS_FUNC_N(TYPE, UTYPE, N) \
+ kernel void compiler_abs_diff_##TYPE##N ( \
+ global TYPE##N* x, global TYPE##N* y, global UTYPE##N* diff) { \
+ int i = get_global_id(0); \
+ diff[i] = abs_diff(x[i], y[i]); \
+ }
+
+#define COMPILER_ABS(TYPE, UTYPE) \
+ COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
+ COMPILER_ABS_FUNC_N(TYPE, UTYPE, 2) \
+ COMPILER_ABS_FUNC_N(TYPE, UTYPE, 3) \
+ COMPILER_ABS_FUNC_N(TYPE, UTYPE, 4) \
+ COMPILER_ABS_FUNC_N(TYPE, UTYPE, 8) \
+ COMPILER_ABS_FUNC_N(TYPE, UTYPE, 16)
+
+COMPILER_ABS(int, uint)
+COMPILER_ABS(uint, uint)
+COMPILER_ABS(char, uchar)
+COMPILER_ABS(uchar, uchar)
+COMPILER_ABS(short, ushort)
+COMPILER_ABS(ushort, ushort)
+COMPILER_ABS(long, ulong)
+COMPILER_ABS(ulong, ulong)
diff --git a/kernels/compiler_address_space.cl b/kernels/compiler_address_space.cl
new file mode 100644
index 0000000..68b7746
--- /dev/null
+++ b/kernels/compiler_address_space.cl
@@ -0,0 +1,9 @@
+/* test OpenCL 1.1 Address Space Qualifiers (section 6.5) */
+__constant float cf1[] = {1, 2, 3};
+constant float cf2[] = {4, 5, 6};
+__kernel void compiler_address_space(__global float *gf1, global float *gf2) {
+ __local float lf1[4];
+ local float lf2[4];
+ __private float pf1[4];
+ private float pf2[4];
+}
diff --git a/kernels/compiler_argument_structure.cl b/kernels/compiler_argument_structure.cl
new file mode 100644
index 0000000..ab7896e
--- /dev/null
+++ b/kernels/compiler_argument_structure.cl
@@ -0,0 +1,9 @@
+struct hop { int x, y; };
+
+__kernel void
+compiler_argument_structure(__global int *dst, struct hop h)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = h.x + h.y;
+}
+
diff --git a/kernels/compiler_argument_structure_indirect.cl b/kernels/compiler_argument_structure_indirect.cl
new file mode 100644
index 0000000..c4b062f
--- /dev/null
+++ b/kernels/compiler_argument_structure_indirect.cl
@@ -0,0 +1,9 @@
+struct hop { int x[16]; };
+
+__kernel void
+compiler_argument_structure(__global int *dst, struct hop h)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = h.x[get_local_id(0)];
+}
+
diff --git a/kernels/compiler_arith_shift_right.cl b/kernels/compiler_arith_shift_right.cl
new file mode 100644
index 0000000..03a4d8d
--- /dev/null
+++ b/kernels/compiler_arith_shift_right.cl
@@ -0,0 +1,4 @@
+kernel void compiler_arith_shift_right(global int *src, global int *dst) {
+ int i = get_global_id(0);
+ dst[i] = src[i] >> 24;
+}
diff --git a/kernels/compiler_array.cl b/kernels/compiler_array.cl
new file mode 100644
index 0000000..5dce4d9
--- /dev/null
+++ b/kernels/compiler_array.cl
@@ -0,0 +1,14 @@
+__kernel void
+compiler_array(__global int *src, __global int *dst)
+{
+ int array[16];
+ int i;
+ for (i = 0; i < 16; ++i) {
+ if (src[0] > 10)
+ array[i] = get_local_id(0);
+ else
+ array[15 - i] = 3 + get_local_id(1);
+ }
+ dst[get_global_id(0)] = array[get_local_id(0)];
+}
+
diff --git a/kernels/compiler_array0.cl b/kernels/compiler_array0.cl
new file mode 100644
index 0000000..3ab0fb8
--- /dev/null
+++ b/kernels/compiler_array0.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_array0(__global int *src, __global int *dst)
+{
+ int i;
+ int final[16];
+ for (i = 0; i < 16; ++i) {
+ int array[16], j;
+ for (j = 0; j < 16; ++j)
+ array[j] = get_global_id(0);
+ for (j = 0; j < src[0]; ++j)
+ array[j] = 1+src[j];
+ final[i] = array[i];
+ }
+ dst[get_global_id(0)] = final[get_global_id(0)];
+}
+
diff --git a/kernels/compiler_array1.cl b/kernels/compiler_array1.cl
new file mode 100644
index 0000000..ad567c2
--- /dev/null
+++ b/kernels/compiler_array1.cl
@@ -0,0 +1,15 @@
+__kernel void
+compiler_array1(__global int *src, __global int *dst)
+{
+ int final[16];
+ for (int i = 0; i < 16; ++i) {
+ int array[16];
+ for (int j = 0; j < src[0]; ++j)
+ array[j] = 1+src[0];
+ for (int j = src[0]; j < 16; ++j)
+ array[j] = get_global_id(0);
+ final[i] = array[i];
+ }
+ dst[get_global_id(0)] = final[get_global_id(0)];
+}
+
diff --git a/kernels/compiler_array2.cl b/kernels/compiler_array2.cl
new file mode 100644
index 0000000..ae73932
--- /dev/null
+++ b/kernels/compiler_array2.cl
@@ -0,0 +1,13 @@
+__kernel void
+compiler_array2(__global int *src, __global int *dst)
+{
+ int final[16];
+ int array[16];
+ for (int j = 0; j < 16; ++j) array[j] = j;
+ for (int j = 0; j < 16; ++j) final[j] = j+1;
+ if (get_global_id(0) == 15)
+ dst[get_global_id(0)] = final[get_global_id(0)];
+ else
+ dst[get_global_id(0)] = array[15 - get_global_id(0)];
+}
+
diff --git a/kernels/compiler_array3.cl b/kernels/compiler_array3.cl
new file mode 100644
index 0000000..152c22a
--- /dev/null
+++ b/kernels/compiler_array3.cl
@@ -0,0 +1,14 @@
+__kernel void
+compiler_array3(__global int *src, __global int *dst)
+{
+ int tmp[32];
+ for (int i = 0; i < 16; ++i) {
+ for (int j = 0; j < 16; ++j)
+ tmp[j] = get_global_id(0);
+ for (int j = 0; j < src[0]; ++j)
+ tmp[j] = 1+src[j];
+ tmp[16+i] = tmp[i];
+ }
+ dst[get_global_id(0)] = tmp[16+get_global_id(0)];
+}
+
diff --git a/kernels/compiler_async_copy.cl b/kernels/compiler_async_copy.cl
new file mode 100644
index 0000000..dddde44
--- /dev/null
+++ b/kernels/compiler_async_copy.cl
@@ -0,0 +1,24 @@
+#define DEF(TYPE) \
+kernel void \
+compiler_async_copy_##TYPE(__global TYPE *dst, __global TYPE *src, __local TYPE *localBuffer, int copiesPerWorkItem) \
+{ \
+ event_t event; \
+ int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0); \
+ int i; \
+ event = async_work_group_copy((__local TYPE*)localBuffer, (__global const TYPE*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, (event_t)0 ); \
+ wait_group_events( 1, &event ); \
+\
+ event = async_work_group_copy((__global TYPE*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const TYPE*)localBuffer, (size_t)copiesPerWorkgroup, (event_t)0 ); \
+ wait_group_events( 1, &event ); \
+}
+
+DEF(char2);
+DEF(uchar2);
+DEF(short2);
+DEF(ushort2);
+DEF(int2);
+DEF(uint2);
+DEF(long2);
+DEF(ulong2);
+DEF(float2);
+//DEF(double2);
diff --git a/kernels/compiler_async_copy_and_prefetch.cl b/kernels/compiler_async_copy_and_prefetch.cl
new file mode 100644
index 0000000..7489bb0
--- /dev/null
+++ b/kernels/compiler_async_copy_and_prefetch.cl
@@ -0,0 +1,9 @@
+/* test OpenCL 1.1 Async Copies and Prefetch Functions (section 6.11.10) */
+kernel void compiler_async_copy_and_prefetch(__global float *p) {
+ prefetch(p, 10);
+ local float l[10];
+ event_t e[2];
+ async_work_group_copy(l, p, 10, 0);
+ async_work_group_copy(p, l, 10, 0);
+ wait_group_events(2, e);
+}
diff --git a/kernels/compiler_async_stride_copy.cl b/kernels/compiler_async_stride_copy.cl
new file mode 100644
index 0000000..a926588
--- /dev/null
+++ b/kernels/compiler_async_stride_copy.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_async_stride_copy(__global char4 *dst, __global char4 *src, __local char4 *localBuffer, int copiesPerWorkItem, int stride)
+{
+ event_t event;
+ int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0);
+ int i;
+ event = async_work_group_strided_copy( (__local char4*)localBuffer, (__global const char4*)(src+copiesPerWorkgroup*stride*get_group_id(0)), (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 );
+ wait_group_events( 1, &event );
+
+ for(i=0; i<copiesPerWorkItem; i++)
+ localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] + (char4)(3);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ event = async_work_group_strided_copy((__global char4*)(dst+copiesPerWorkgroup*stride*get_group_id(0)), (__local const char4*)localBuffer, (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 );
+ wait_group_events( 1, &event );
+}
diff --git a/kernels/compiler_atomic_functions.cl b/kernels/compiler_atomic_functions.cl
new file mode 100644
index 0000000..fbc16fb
--- /dev/null
+++ b/kernels/compiler_atomic_functions.cl
@@ -0,0 +1,50 @@
+__kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __global int *src) {
+ int lid = get_local_id(0);
+ int i = lid % 12;
+ if(lid == 0) {
+ for(int j=0; j<12; j=j+1) {
+ atomic_xchg(&tmp[j], 0);
+ }
+ atomic_xchg(&tmp[4], -1);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ switch(i) {
+ case 0: atomic_inc(&tmp[i]); break;
+ case 1: atomic_dec(&tmp[i]); break;
+ case 2: atomic_add(&tmp[i], src[lid]); break;
+ case 3: atomic_sub(&tmp[i], src[lid]); break;
+ case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 16))); break;
+ case 5: atomic_or (&tmp[i], src[lid]<<(lid / 16)); break;
+ case 6: atomic_xor(&tmp[i], src[lid]); break;
+ case 7: atomic_min(&tmp[i], -src[lid]); break;
+ case 8: atomic_max(&tmp[i], src[lid]); break;
+ case 9: atomic_min((__local unsigned int *)&tmp[i], -src[lid]); break;
+ case 10: atomic_max((__local unsigned int *)&tmp[i], src[lid]); break;
+ case 11: atomic_cmpxchg(&(tmp[i]), 0, src[10]); break;
+ default: break;
+ }
+
+ switch(i) {
+ case 0: atomic_inc(&dst[i]); break;
+ case 1: atomic_dec(&dst[i]); break;
+ case 2: atomic_add(&dst[i], src[lid]); break;
+ case 3: atomic_sub(&dst[i], src[lid]); break;
+ case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 16))); break;
+ case 5: atomic_or (&dst[i], src[lid]<<(lid / 16)); break;
+ case 6: atomic_xor(&dst[i], src[lid]); break;
+ case 7: atomic_min(&dst[i], -src[lid]); break;
+ case 8: atomic_max(&dst[i], src[lid]); break;
+ case 9: atomic_min((__global unsigned int *)&dst[i], -src[lid]); break;
+ case 10: atomic_max((__global unsigned int *)&dst[i], src[lid]); break;
+ case 11: atomic_cmpxchg(&dst[i], 0, src[10]); break;
+ default: break;
+ }
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+ if(get_global_id(0) == 0) {
+ for(i=0; i<12; i=i+1)
+ atomic_xchg(&dst[i+12], tmp[i]);
+ }
+}
diff --git a/kernels/compiler_basic_arithmetic.cl b/kernels/compiler_basic_arithmetic.cl
new file mode 100644
index 0000000..3e145d8
--- /dev/null
+++ b/kernels/compiler_basic_arithmetic.cl
@@ -0,0 +1,53 @@
+#define DECL_KERNEL_SUB(type)\
+__kernel void \
+compiler_sub_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+ int id = (int)get_global_id(0); \
+ dst[id] = src0[id] - src1[id]; \
+}
+
+#define DECL_KERNEL_ADD(type)\
+__kernel void \
+compiler_add_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+ int id = (int)get_global_id(0); \
+ dst[id] = src0[id] + src1[id]; \
+}
+
+#define DECL_KERNEL_MUL(type)\
+__kernel void \
+compiler_mul_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+ int id = (int)get_global_id(0); \
+ dst[id] = src0[id] * src1[id]; \
+}
+
+#define DECL_KERNEL_DIV(type)\
+__kernel void \
+compiler_div_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+ int id = (int)get_global_id(0); \
+ dst[id] = src0[id] / src1[id]; \
+}
+
+#define DECL_KERNEL_REM(type)\
+__kernel void \
+compiler_rem_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+ int id = (int)get_global_id(0); \
+ dst[id] = src0[id] % src1[id]; \
+}
+
+#define DECL_KERNEL_FOR_ALL_TYPE(op) \
+DECL_KERNEL_##op(char) \
+DECL_KERNEL_##op(uchar) \
+DECL_KERNEL_##op(short) \
+DECL_KERNEL_##op(ushort) \
+DECL_KERNEL_##op(int) \
+DECL_KERNEL_##op(uint)
+
+DECL_KERNEL_FOR_ALL_TYPE(SUB)
+DECL_KERNEL_FOR_ALL_TYPE(ADD)
+DECL_KERNEL_FOR_ALL_TYPE(MUL)
+DECL_KERNEL_FOR_ALL_TYPE(DIV)
+DECL_KERNEL_FOR_ALL_TYPE(REM)
diff --git a/kernels/compiler_bool_cross_basic_block.cl b/kernels/compiler_bool_cross_basic_block.cl
new file mode 100644
index 0000000..9aeb16d
--- /dev/null
+++ b/kernels/compiler_bool_cross_basic_block.cl
@@ -0,0 +1,21 @@
+__kernel
+void compiler_bool_cross_basic_block(__global int *src,
+ __global int *dst,
+ int scale){
+ int id = (int)get_global_id(0);
+
+ bool isRedRow = false;
+ bool isRed;
+ int val = src[id];
+ for (unsigned int i=0; i<scale; i++, isRedRow = !isRedRow) {
+ if (isRedRow) {
+ isRed= false;
+ for (unsigned int j=0; j < scale; j++, isRed=!isRed) {
+ if (isRed) {
+ val++;
+ }
+ }
+ }
+ }
+ dst[id] = val;
+}
diff --git a/kernels/compiler_box_blur.cl b/kernels/compiler_box_blur.cl
new file mode 100644
index 0000000..26936e0
--- /dev/null
+++ b/kernels/compiler_box_blur.cl
@@ -0,0 +1,80 @@
+inline float3 unpack_fp3(uint u) {
+ float3 u3;
+ u3.x = (float) (u & 0xff); u >>= 8;
+ u3.y = (float) (u & 0xff); u >>= 8;
+ u3.z = (float) (u & 0xff);
+ return u3;
+}
+
+inline uint pack_fp3(float3 u3) {
+ uint u;
+ u = (((uint) u3.x)) | (((uint) u3.y) << 8) | (((uint) u3.z) << 16);
+ return u;
+}
+
+#define HFILTER3(C0, C1, C2, C3, CURR, LEFT, RIGHT)\
+ float3 C0, C1, C2, C3;\
+ do {\
+ const uint4 from = vload4(CURR, src);\
+ const float3 from0 = unpack_fp3(from.x);\
+ const float3 from1 = unpack_fp3(from.y);\
+ const float3 from2 = unpack_fp3(from.z);\
+ const float3 from3 = unpack_fp3(from.w);\
+ const float3 l = unpack_fp3(src[LEFT]);\
+ const float3 r = unpack_fp3(src[RIGHT]);\
+ C0 = (l+from0+from1);\
+ C1 = (from0+from1+from2);\
+ C2 = (from1+from2+from3);\
+ C3 = (from2+from3+r);\
+ } while(0)
+
+__kernel void compiler_box_blur(__global const uint *src,
+ __global uint *dst,
+ int w,
+ int h,
+ int chunk)
+{
+ const int x = get_global_id(0);
+ int y = get_global_id(1)*chunk;
+ const int yend = min(y + chunk, h); /* we process a tile in the image */
+
+ /* Current line (left (1 pixel), center (4 pixels), right (1 pixel)) */
+ const int left = max(4*x-1, 0) + y*w;
+ const int right = min(4*x+4, w-1) + y*w;
+ int curr = x + y*(w>>2);
+ HFILTER3(curr0, curr1, curr2, curr3, curr, left, right);
+
+ /* Top line (left (1 pixel), center (4 pixels), right (1 pixel)) */
+ const int ytop = max(y-1,0);
+ const int topLeft = max(4*x-1, 0) + ytop*w;
+ const int topRight = min(4*x+4, w-1) + ytop*w;
+ const int top = x + ytop*(w>>2);
+ HFILTER3(top0, top1, top2, top3, top, topLeft, topRight);
+
+ /* To guard bottom line */
+ const int maxBottom = x + (h-1)*(w>>2);
+ const int maxBottomLeft = max(4*x-1,0) + (h-1)*w;
+ const int maxBottomRight = min(4*x+4,w-1) + (h-1)*w;
+
+ /* We use a short 3 pixel sliding window */
+ const int ybottom = min(y+1,h-1);
+ int bottomLeft = max(4*x-1, 0) + ybottom*w;
+ int bottomRight = min(4*x+4, w-1) + ybottom*w;
+ int bottom = x + ybottom*(w>>2);
+
+ /* Top down sliding window */
+ for (; y < yend; ++y, curr += (w>>2), bottom += (w>>2), bottomLeft += w, bottomRight += w) {
+ const int center = min(bottom, maxBottom);
+ const int left = min(bottomLeft, maxBottomLeft);
+ const int right = min(bottomRight, maxBottomRight);
+ HFILTER3(bottom0, bottom1, bottom2, bottom3, center, left, right);
+ const float3 to0 = (top0+curr0+bottom0)*(1.f/9.f);
+ const float3 to1 = (top1+curr1+bottom1)*(1.f/9.f);
+ const float3 to2 = (top2+curr2+bottom2)*(1.f/9.f);
+ const float3 to3 = (top3+curr3+bottom3)*(1.f/9.f);
+ const uint4 to = (uint4)(pack_fp3(to0),pack_fp3(to1),pack_fp3(to2),pack_fp3(to3));
+ vstore4(to, curr, dst);
+ top0 = curr0; top1 = curr1; top2 = curr2; top3 = curr3;
+ curr0 = bottom0; curr1 = bottom1; curr2 = bottom2; curr3 = bottom3;
+ }
+}
diff --git a/kernels/compiler_box_blur_float.cl b/kernels/compiler_box_blur_float.cl
new file mode 100644
index 0000000..6f4e1b9
--- /dev/null
+++ b/kernels/compiler_box_blur_float.cl
@@ -0,0 +1,48 @@
+__kernel void compiler_box_blur_float(__global const float4 *src,
+ __global float4 *dst,
+ int w,
+ int h,
+ int chunk)
+{
+ const int x = get_global_id(0);
+ int y = get_global_id(1)*chunk;
+ const int yend = min(y+chunk, h); /* we process a tile in the image */
+
+ /* Current line (left (1 pixel), center (4 pixels), right (1 pixel)) */
+ const int left = max(x-1,0) + y*w;
+ const int right = min(x+1,w-1) + y*w;
+ int curr = x + y*w;
+ float4 currPixel = src[left] + src[curr] + src[right];
+
+ /* Top line (left (1 pixel), center (4 pixels), right (1 pixel)) */
+ const int ytop = max(y-1,0);
+ const int topLeft = max(x-1,0) + ytop*w;
+ const int topRight = min(x+1,w-1) + ytop*w;
+ const int top = x + ytop*w;
+ float4 topPixel = src[topLeft] + src[top] + src[topRight];
+
+ /* To guard bottom line */
+ const int maxBottom = x + (h-1)*w;
+ const int maxBottomLeft = max(x-1,0) + (h-1)*w;
+ const int maxBottomRight = min(x+1,w-1) + (h-1)*w;
+
+ /* We use a short 4 pixel sliding window */
+ const int ybottom = min(y+1,h-1);
+ int bottomLeft = max(x-1 + ybottom*w, ybottom*w);
+ int bottomRight = min(x+1 + ybottom*w, ybottom*w+w-1);
+ int bottom = x + ybottom*w;
+
+
+ /* Top down sliding window */
+ for (; y < yend; ++y, curr += w, bottom += w, bottomLeft += w, bottomRight += w) {
+ const int center = min(bottom, maxBottom);
+ const int left = min(bottomLeft, maxBottomLeft);
+ const int right = min(bottomRight, maxBottomRight);
+ const float4 bottomPixel = src[left] + src[center] + src[right];
+ const float4 to = (bottomPixel + currPixel + topPixel) * (1.f/9.f);
+ dst[curr] = to;
+ topPixel = currPixel;
+ currPixel = bottomPixel;
+ }
+}
+
diff --git a/kernels/compiler_box_blur_image.cl b/kernels/compiler_box_blur_image.cl
new file mode 100644
index 0000000..42f463b
--- /dev/null
+++ b/kernels/compiler_box_blur_image.cl
@@ -0,0 +1,18 @@
+__kernel void compiler_box_blur_image(__read_only image2d_t src,
+ __write_only image2d_t dst)
+{
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+ CLK_ADDRESS_CLAMP_TO_EDGE |
+ CLK_FILTER_NEAREST;
+ const int2 coord = (int2)(get_global_id(0), get_global_id(1));
+ int2 offset;
+ float4 sum = 0;
+
+ for (offset.y = -1; offset.y <= 1; offset.y++) {
+ for (offset.x = -1; offset.x <= 1; offset.x++) {
+ sum += read_imagef(src, sampler, coord + offset);
+ }
+ }
+
+ write_imagef(dst, coord, (1.0f/9.0f)*sum);
+}
diff --git a/kernels/compiler_byte_scatter.cl b/kernels/compiler_byte_scatter.cl
new file mode 100644
index 0000000..ab56ba8
--- /dev/null
+++ b/kernels/compiler_byte_scatter.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_byte_scatter(__global char *dst)
+{
+ int id = (int) get_global_id(0);
+ dst[id] = (char) id;
+}
+
diff --git a/kernels/compiler_ceil.cl b/kernels/compiler_ceil.cl
new file mode 100644
index 0000000..cf27483
--- /dev/null
+++ b/kernels/compiler_ceil.cl
@@ -0,0 +1,4 @@
+kernel void compiler_ceil(global float *src, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = ceil(src[i]);
+}
diff --git a/kernels/compiler_clz_int.cl b/kernels/compiler_clz_int.cl
new file mode 100644
index 0000000..0f17f86
--- /dev/null
+++ b/kernels/compiler_clz_int.cl
@@ -0,0 +1,5 @@
+kernel void compiler_clz_int(global int *src, global int *dst) {
+ int i = get_global_id(0);
+ dst[i] = clz(src[i]);
+}
+
diff --git a/kernels/compiler_clz_short.cl b/kernels/compiler_clz_short.cl
new file mode 100644
index 0000000..1ecf7a9
--- /dev/null
+++ b/kernels/compiler_clz_short.cl
@@ -0,0 +1,5 @@
+kernel void compiler_clz_short(global short *src, global short *dst) {
+ int i = get_global_id(0);
+ dst[i] = clz(src[i]);
+}
+
diff --git a/kernels/compiler_constant_expr.cl b/kernels/compiler_constant_expr.cl
new file mode 100644
index 0000000..d40cead
--- /dev/null
+++ b/kernels/compiler_constant_expr.cl
@@ -0,0 +1,23 @@
+float3 foo_pow3(float3 src0, float3 src1)
+{
+ union {
+ float3 f3;
+ float farray[4];
+ } s0, s1, dst;
+ s0.f3 = src0;
+ s1.f3 = src1;
+ int i;
+ for(i = 0; i < 3; i++)
+ dst.farray[i] = pow(s0.farray[i], s1.farray[i]);
+ return dst.f3;
+}
+
+__kernel void
+compiler_constant_expr(__global float* src, __global float *dst)
+{
+ int gid = get_global_id(0);
+ float3 f3 = vload3(gid, src);
+ float3 cf3 = (float3)(1.f, 2.f, 3.f);
+ float3 result = foo_pow3(f3, cf3);
+ vstore3(result, gid, dst);
+}
diff --git a/kernels/compiler_convert_uchar_sat.cl b/kernels/compiler_convert_uchar_sat.cl
new file mode 100644
index 0000000..0c81ecc
--- /dev/null
+++ b/kernels/compiler_convert_uchar_sat.cl
@@ -0,0 +1,4 @@
+kernel void compiler_convert_uchar_sat(global float *src, global uint *dst) {
+ int i = get_global_id(0);
+ dst[i] = convert_uchar_sat(src[i]);
+}
diff --git a/kernels/compiler_data_types.cl b/kernels/compiler_data_types.cl
new file mode 100644
index 0000000..79b06f3
--- /dev/null
+++ b/kernels/compiler_data_types.cl
@@ -0,0 +1,80 @@
+/* OpenCL 1.1 Supported Data Types */
+__kernel void compiler_data_types()
+{
+ // built-in scalar data types (section 6.1.1)
+ bool b;
+ b = true;
+ b = false;
+ char c;
+ unsigned char uc;
+ uchar uc_2;
+ short s;
+ unsigned short us;
+ ushort us_2;
+ int i;
+ unsigned int ui;
+ uint ui_2;
+ long l;
+ unsigned long ul;
+ ulong ul_2;
+ float f;
+ half h;
+ size_t sz;
+ ptrdiff_t pt;
+ intptr_t it;
+ uintptr_t uit;
+
+ // built-in vector data types (section 6.1.2)
+ // supported values of $n$ are 2, 3, 4, 8, 16 for all vector data types
+#define VEC(sz) char##sz c##sz; \
+ uchar##sz uc##sz; \
+ short##sz s##sz; \
+ ushort##sz us##sz;\
+ int##sz i##sz; \
+ uint##sz ui##sz; \
+ long##sz l##sz; \
+ ulong##sz ul##sz; \
+ float##sz f##sz;
+#if 1
+ VEC(2);
+ VEC(3);
+ VEC(4);
+ VEC(8);
+ VEC(16);
+#endif
+ float16 f_16 = (float16)(1.0f);
+ f_16.s0 += 1;
+ f_16.s1 += 1;
+ f_16.s2 += 1;
+ f_16.s3 += 1;
+ f_16.s4 += 1;
+ f_16.s5 += 1;
+ f_16.s6 += 1;
+ f_16.s7 += 1;
+ f_16.s8 += 1;
+ f_16.s9 += 1;
+ f_16.sa += 1;
+ f_16.sb += 1;
+ f_16.sc += 1;
+ f_16.sd += 1;
+ f_16.se += 1;
+ f_16.sf += 1;
+ f_16.sA += 1;
+ f_16.sB += 1;
+ f_16.sC += 1;
+ f_16.sD += 1;
+ f_16.sE += 1;
+ f_16.sF += 1;
+ float8 f_8;
+ f_8 = f_16.lo;
+ f_8 = f_16.hi;
+ f_8 = f_16.odd;
+ f_8 = f_16.even;
+ uint4 u_4 = (uint4)(1);
+
+ // Other built-in data types (section 6.1.3)
+ image2d_t i2dt;
+ image3d_t i3dt;
+ sampler_t st;
+ event_t et;
+}
diff --git a/kernels/compiler_degrees.cl b/kernels/compiler_degrees.cl
new file mode 100644
index 0000000..5fad995
--- /dev/null
+++ b/kernels/compiler_degrees.cl
@@ -0,0 +1,4 @@
+kernel void compiler_degrees(global float *src, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = degrees(src[i]);
+}
diff --git a/kernels/compiler_displacement_map_element.cl b/kernels/compiler_displacement_map_element.cl
new file mode 100644
index 0000000..ee40ad5
--- /dev/null
+++ b/kernels/compiler_displacement_map_element.cl
@@ -0,0 +1,11 @@
+kernel void compiler_displacement_map_element(const global uint *in, const global uint *offset, int w, int h, global uint *out) {
+ const int cx = get_global_id(0);
+ const int cy = get_global_id(1);
+ uint c = offset[cy * w + cx];
+ int x_pos = cx + c;
+ int y_pos = cy + c;
+ if(0 <= x_pos && x_pos < w && 0 <= y_pos && y_pos < h)
+ out[cy * w + cx] = in[y_pos * w + x_pos];
+ else
+ out[cy * w + cx] = 0;
+}
diff --git a/kernels/compiler_double.cl b/kernels/compiler_double.cl
new file mode 100644
index 0000000..a84f142
--- /dev/null
+++ b/kernels/compiler_double.cl
@@ -0,0 +1,9 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double(global double *src, global double *dst) {
+ int i = get_global_id(0);
+ double d = 1.234567890123456789;
+ if (i < 14)
+ dst[i] = d * (src[i] + d);
+ else
+ dst[i] = 14;
+}
diff --git a/kernels/compiler_double_2.cl b/kernels/compiler_double_2.cl
new file mode 100644
index 0000000..20ee614
--- /dev/null
+++ b/kernels/compiler_double_2.cl
@@ -0,0 +1,9 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double_2(global float *src, global double *dst) {
+ int i = get_global_id(0);
+ float d = 1.234567890123456789f;
+ if (i < 14)
+ dst[i] = d * (d + src[i]);
+ else
+ dst[i] = 14;
+}
diff --git a/kernels/compiler_double_3.cl b/kernels/compiler_double_3.cl
new file mode 100644
index 0000000..8b32404
--- /dev/null
+++ b/kernels/compiler_double_3.cl
@@ -0,0 +1,6 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double_3(global float *src, global double *dst) {
+ int i = get_global_id(0);
+ float d = 1.234567890123456789f;
+ dst[i] = i < 14 ? d : 14;
+}
diff --git a/kernels/compiler_double_4.cl b/kernels/compiler_double_4.cl
new file mode 100644
index 0000000..e5e46f9
--- /dev/null
+++ b/kernels/compiler_double_4.cl
@@ -0,0 +1,5 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double_4(global double *src1, global double *src2, global double *dst) {
+ int i = get_global_id(0);
+ dst[i] = src1[i] + src2[i];
+}
diff --git a/kernels/compiler_event.cl b/kernels/compiler_event.cl
new file mode 100644
index 0000000..a901b05
--- /dev/null
+++ b/kernels/compiler_event.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_event(__global int *dst, int value)
+{
+ int id = (int)get_global_id(0);
+ dst[id] += value;
+}
diff --git a/kernels/compiler_fabs.cl b/kernels/compiler_fabs.cl
new file mode 100644
index 0000000..016deb8
--- /dev/null
+++ b/kernels/compiler_fabs.cl
@@ -0,0 +1,5 @@
+kernel void compiler_fabs(global float *src, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = fabs(src[i]);
+}
+
diff --git a/kernels/compiler_function_argument.cl b/kernels/compiler_function_argument.cl
new file mode 100644
index 0000000..fe6de28
--- /dev/null
+++ b/kernels/compiler_function_argument.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_function_argument(__global int *dst, int value)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = value;
+}
+
diff --git a/kernels/compiler_function_argument0.cl b/kernels/compiler_function_argument0.cl
new file mode 100644
index 0000000..6bc2e92
--- /dev/null
+++ b/kernels/compiler_function_argument0.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_function_argument0(__global int *dst, short value)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = value;
+}
+
diff --git a/kernels/compiler_function_argument1.cl b/kernels/compiler_function_argument1.cl
new file mode 100644
index 0000000..8842b0b
--- /dev/null
+++ b/kernels/compiler_function_argument1.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_function_argument1(__global int *dst, char value, short value0, int value1)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = value + value0 + value1;
+}
+
diff --git a/kernels/compiler_function_argument2.cl b/kernels/compiler_function_argument2.cl
new file mode 100644
index 0000000..24e5795
--- /dev/null
+++ b/kernels/compiler_function_argument2.cl
@@ -0,0 +1,12 @@
+__kernel void compiler_function_argument2(
+char8 c, uchar8 uc, short8 s, ushort8 us, int8 i, uint8 ui, float8 f,
+__global float8 *result)
+{
+ result[0] = convert_float8(c);
+ result[1] = convert_float8(uc);
+ result[2] = convert_float8(s);
+ result[3] = convert_float8(us);
+ result[4] = convert_float8(i);
+ result[5] = convert_float8(ui);
+ result[6] = f;
+}
diff --git a/kernels/compiler_function_argument3.cl b/kernels/compiler_function_argument3.cl
new file mode 100644
index 0000000..9395cd7
--- /dev/null
+++ b/kernels/compiler_function_argument3.cl
@@ -0,0 +1,71 @@
+struct sfloat8 {
+ float a;
+ float b;
+ float c;
+ float d;
+ float e;
+ float f;
+ float g;
+ float h;
+};
+
+
+__kernel void compiler_function_argument3(
+struct sfloat8 f, __global struct sfloat8 *result)
+{
+ result[0].a = f.a;
+ result[0].b = 12.0f;
+ result[0].c = 12.0f;
+ result[0].d = 12.0f;
+ result[0].e = 12.0f;
+ result[0].f = 12.0f;
+ result[0].g = 12.0f;
+ result[0].h = f.a + f.h;
+
+ result[1].a = f.a;
+ result[1].b = 12.0f;
+ result[1].c = 12.0f;
+ result[1].d = 12.0f;
+ result[1].e = 12.0f;
+ result[1].f = 12.0f;
+ result[1].g = 12.0f;
+ result[1].h = f.a + f.h;
+
+ result[2].a = f.a;
+ result[2].b = 12.0f;
+ result[2].c = 12.0f;
+ result[2].d = 12.0f;
+ result[2].e = 12.0f;
+ result[2].f = 12.0f;
+ result[2].g = 12.0f;
+ result[2].h = f.a + f.h;
+
+ result[3].a = f.a;
+ result[3].b = 12.0f;
+ result[3].c = 12.0f;
+ result[3].d = 12.0f;
+ result[3].e = 12.0f;
+ result[3].f = 12.0f;
+ result[3].g = 12.0f;
+ result[3].h = f.a + f.h;
+
+ result[4].a = f.a;
+ result[4].b = 12.0f;
+ result[4].c = 12.0f;
+ result[4].d = 12.0f;
+ result[4].e = 12.0f;
+ result[4].f = 12.0f;
+ result[4].g = 12.0f;
+ result[4].h = f.a + f.h;
+
+ result[5].a = f.a;
+ result[5].b = 12.0f;
+ result[5].c = 12.0f;
+ result[5].d = 12.0f;
+ result[5].e = 12.0f;
+ result[5].f = 12.0f;
+ result[5].g = 12.0f;
+ result[5].h = f.a + f.h;
+
+ result[6] = result[0];
+}
diff --git a/kernels/compiler_function_constant.cl b/kernels/compiler_function_constant.cl
new file mode 100644
index 0000000..ca7e874
--- /dev/null
+++ b/kernels/compiler_function_constant.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_function_constant(__constant short *c, __global int *dst, int value)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = value + c[id%69];
+}
diff --git a/kernels/compiler_function_constant0.cl b/kernels/compiler_function_constant0.cl
new file mode 100644
index 0000000..5340352
--- /dev/null
+++ b/kernels/compiler_function_constant0.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_function_constant0(__constant int *c0, __constant char *c1, __global int *dst, int value)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = value + c0[id%69] + c1[0];
+}
diff --git a/kernels/compiler_function_qualifiers.cl b/kernels/compiler_function_qualifiers.cl
new file mode 100644
index 0000000..c904c84
--- /dev/null
+++ b/kernels/compiler_function_qualifiers.cl
@@ -0,0 +1,9 @@
+/* test OpenCL 1.1 Function Qualifiers (section 6.7) */
+kernel void compiler_function_qualifiers()
+__attribute__((vec_type_hint(float)))
+__attribute__((work_group_size_hint(4,1,1)))
+__attribute__((reqd_work_group_size(4,1,1)));
+
+kernel void compiler_function_qualifiers()
+{
+}
diff --git a/kernels/compiler_gather_register_file.cl b/kernels/compiler_gather_register_file.cl
new file mode 100644
index 0000000..773797d
--- /dev/null
+++ b/kernels/compiler_gather_register_file.cl
@@ -0,0 +1,10 @@
+__kernel void
+compiler_gather_register_file(__global uint *src, __global uint *dst)
+{
+ __gen_ocl_force_simd16();
+ int id = (int)get_global_id(0);
+ const int x0 = src[id];
+ const unsigned short index = get_global_id(0);
+ dst[id] = __gen_ocl_rgather(index, x0);
+}
+
diff --git a/kernels/compiler_gather_register_file0.cl b/kernels/compiler_gather_register_file0.cl
new file mode 100644
index 0000000..0e6d487
--- /dev/null
+++ b/kernels/compiler_gather_register_file0.cl
@@ -0,0 +1,10 @@
+__kernel void
+compiler_gather_register_file0(__global uint *src, __global uint *dst)
+{
+ __gen_ocl_force_simd16();
+ int id = (int)get_global_id(0);
+ const int x0 = src[id];
+ const unsigned short index = 15 - get_global_id(0);
+ dst[id] = __gen_ocl_rgather(index, x0);
+}
+
diff --git a/kernels/compiler_gather_register_file1.cl b/kernels/compiler_gather_register_file1.cl
new file mode 100644
index 0000000..184202c
--- /dev/null
+++ b/kernels/compiler_gather_register_file1.cl
@@ -0,0 +1,11 @@
+__kernel void
+compiler_gather_register_file1(__global uint *src, __global uint *dst)
+{
+ __gen_ocl_force_simd16();
+ int id = (int)get_global_id(0);
+ const int x0 = src[id];
+ const int x1 = src[id+16];
+ const unsigned short index = 2*get_global_id(0);
+ dst[id] = __gen_ocl_rgather(index, x0, x1);
+}
+
diff --git a/kernels/compiler_geometric_builtin.cl b/kernels/compiler_geometric_builtin.cl
new file mode 100644
index 0000000..34ff761
--- /dev/null
+++ b/kernels/compiler_geometric_builtin.cl
@@ -0,0 +1,11 @@
+kernel void compiler_geometric_builtin() {
+ float x = 1, y = 2, z = 3;
+ z = dot(x, y);
+ z = cross(x, y);
+ z = distance(x, y);
+ z = length(x);
+ z = normalize(x);
+ z = fast_distance(x, y);
+ z = fast_length(x, y);
+ z = fast_normalize(x);
+}
diff --git a/kernels/compiler_getelementptr_bitcast.cl b/kernels/compiler_getelementptr_bitcast.cl
new file mode 100644
index 0000000..0320abf
--- /dev/null
+++ b/kernels/compiler_getelementptr_bitcast.cl
@@ -0,0 +1,18 @@
+__kernel void compiler_getelementptr_bitcast(global float *src, global float *dst)
+{
+ int i = get_global_id(0);
+
+ __local float ldata[256];
+ ldata[get_local_id(0)] = src[i];
+
+ //if use get_local_id(0) to index ldata, the issue is not reproduced
+ //so, just set the work group as 1 in the application
+ __local uchar * pldata = (__local uchar *)&ldata[0];
+ uchar data;
+ for(int k = 0; k < 3; k++){
+ data = *pldata;
+ pldata++;
+ }
+
+ dst[i] = data;
+}
diff --git a/kernels/compiler_global_constant.cl b/kernels/compiler_global_constant.cl
new file mode 100644
index 0000000..c0e23d1
--- /dev/null
+++ b/kernels/compiler_global_constant.cl
@@ -0,0 +1,76 @@
+constant int m[3] = {71,72,73};
+const constant int n = 1;
+constant int o[3] = {3, 2, 1};
+
+constant int4 a= {1, 2, 3, 4};
+constant int4 b = {0, -1, -2, -3};
+
+struct Person {
+ char name[7];
+ int3 idNumber;
+};
+
+struct Test1 {
+ int a0;
+ char a1;
+};
+
+struct Test2 {
+ char a0;
+ int a1;
+};
+struct Test3 {
+ int a0;
+ int a1;
+};
+struct Test4 {
+ float a0;
+ float a1;
+};
+
+constant struct Person james= {{"james"}, (int3)(1, 2, 3)};
+constant struct Test1 t0 = {1, 2};
+constant struct Test2 t1 = {1, 2};
+
+constant int3 c[3] = {(int3)(0, 1, 2), (int3)(3, 4, 5), (int3)(6,7,8) };
+constant char4 d[3] = {(char4)(0, 1, 2, 3), (char4)(4, 5, 6, 7), (char4)(8, 9, 10, 11)};
+
+constant struct Person members[3] = {{{"abc"}, (int3)(1, 2, 3)}, { {"defg"}, (int3)(4,5,6)}, { {"hijk"}, (int3)(7,8,9)} };
+constant struct Test3 zero_struct = {0, 0};
+constant int3 zero_vec = {0,0,0};
+constant int zero_arr[3] = {0,0,0};
+constant float zero_flt[3] = {0.0f, 0.0f, 0.0f};
+
+__kernel void
+compiler_global_constant(__global int *dst, int e, int r)
+{
+ int id = (int)get_global_id(0);
+
+ int4 x = a + b;
+ dst[id] = m[id%3] * n * o[2] + e + r *x.y * a.x + zero_struct.a0 + zero_vec.x + zero_arr[1] + (int)zero_flt[2];
+}
+// array of vectors
+__kernel void
+compiler_global_constant1(__global int *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = c[id%3].y + d[id%3].w;
+}
+
+// structure
+__kernel void
+compiler_global_constant2(__global int *dst)
+{
+ int id = (int)get_global_id(0);
+
+ dst[id] = james.idNumber.y + t0.a1 + t1.a1;
+}
+
+//array of structure
+__kernel void
+compiler_global_constant3(__global int *dst)
+{
+ int id = (int)get_global_id(0);
+
+ dst[id] = members[id%3].idNumber.z + members[id%3].name[2];
+}
diff --git a/kernels/compiler_global_constant_2.cl b/kernels/compiler_global_constant_2.cl
new file mode 100644
index 0000000..04536c7
--- /dev/null
+++ b/kernels/compiler_global_constant_2.cl
@@ -0,0 +1,20 @@
+constant int m[3] = {0x15b,0x25b,0x35b};
+constant short t[5] = {0x45b,0x55b,0x65b,0x75b,0x85b};
+constant long n[3] = {0x15b,0x25b,0xFFFFFFFFF};
+constant long p[3] = {1,1,1};
+constant long s = 1;
+
+
+__kernel void
+compiler_global_constant_2(__global int *dst, int e, int r)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = m[id%3] + t[id%5] + e + r;
+}
+
+__kernel void
+compiler_global_constant_2_long(__global long *dst, int e, int r)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = n[id%3]*p[1] + e*s + r;
+}
diff --git a/kernels/compiler_global_memory_barrier.cl b/kernels/compiler_global_memory_barrier.cl
new file mode 100644
index 0000000..99bb940
--- /dev/null
+++ b/kernels/compiler_global_memory_barrier.cl
@@ -0,0 +1,7 @@
+__kernel void compiler_global_memory_barrier(__global int *dst, __global int *src) {
+ src[get_local_size(0) * (2 * get_group_id(0)) + get_local_id(0)] = get_local_id(0);
+ src[get_local_size(0) * (2 * get_group_id(0) + 1) + get_local_id(0)] = get_local_id(0);
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ dst[get_local_size(0) * (2 * get_group_id(0)) + get_local_id(0)] = src[get_local_size(0) * 2 * get_group_id(0) + get_local_size(0) - (get_local_id(0) + 1)];
+ dst[get_local_size(0) * (2 * get_group_id(0) + 1) + get_local_id(0)] = src[get_local_size(0) * (2 * get_group_id(0) + 1) + get_local_size(0) - (get_local_id(0) + 1)];
+}
diff --git a/kernels/compiler_group_size.cl b/kernels/compiler_group_size.cl
new file mode 100644
index 0000000..4e2c333
--- /dev/null
+++ b/kernels/compiler_group_size.cl
@@ -0,0 +1,29 @@
+__kernel void
+compiler_group_size(__global unsigned int *dst)
+{
+ uint idx = (uint)get_global_id(0);
+ uint idy = (uint)get_global_id(1);
+ uint idz = (uint)get_global_id(2);
+ uint size_x = (uint)get_global_size(0);
+ uint size_y = (uint)get_global_size(1);
+
+ dst[idz*size_x*size_y + idy*size_x + idx] = idz*size_x*size_y + idy*size_x +idx;
+}
+
+struct xyz{
+ unsigned short b;
+ unsigned short e;
+ unsigned int o;
+};
+
+__kernel void
+compiler_group_size4(__global struct xyz *src, __global unsigned int *dst, unsigned int num, unsigned int c)
+{
+ uint idx = (uint)get_global_id(0);
+ if(idx>=num)
+ return;
+ struct xyz td = src[idx];
+ for(unsigned x = td.b;x<=td.e;x++)
+ dst[td.o+x] = c;
+}
+
diff --git a/kernels/compiler_hadd.cl b/kernels/compiler_hadd.cl
new file mode 100644
index 0000000..fe50195
--- /dev/null
+++ b/kernels/compiler_hadd.cl
@@ -0,0 +1,4 @@
+kernel void compiler_hadd(global int *src1, global int *src2, global int *dst) {
+ int i = get_global_id(0);
+ dst[i] = hadd(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_if_else.cl b/kernels/compiler_if_else.cl
new file mode 100644
index 0000000..7ae8f99
--- /dev/null
+++ b/kernels/compiler_if_else.cl
@@ -0,0 +1,14 @@
+__kernel void
+compiler_if_else(__global int *src, __global int *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = src[id];
+ if (dst[id] >= 0) {
+ dst[id] = src[id+1];
+ src[id] = 1;
+ } else {
+ dst[id]--;
+ src[id] = 2;
+ }
+}
+
diff --git a/kernels/compiler_insert_to_constant.cl b/kernels/compiler_insert_to_constant.cl
new file mode 100644
index 0000000..f94c5c3
--- /dev/null
+++ b/kernels/compiler_insert_to_constant.cl
@@ -0,0 +1,6 @@
+__kernel void compiler_insert_to_constant(__global int4 *dst) {
+ int4 value = (int4)(0,1,2,3);
+ value.z = get_global_id(0);
+ dst[get_global_id(0)] = value;
+}
+
diff --git a/kernels/compiler_insert_vector.cl b/kernels/compiler_insert_vector.cl
new file mode 100644
index 0000000..0f0e20f
--- /dev/null
+++ b/kernels/compiler_insert_vector.cl
@@ -0,0 +1,11 @@
+__kernel void
+compiler_insert_vector(__global int4 *out )
+{
+ int tid = get_global_id(0);
+ int4 output = (int4)(0, 0, 0, 1); //black
+ if (tid > 16)
+ {
+ output = (int4)(tid, tid, 1, 1);
+ }
+ out[tid] = output;
+}
diff --git a/kernels/compiler_insn_selection_masked_min_max.cl b/kernels/compiler_insn_selection_masked_min_max.cl
new file mode 100644
index 0000000..5b4be57
--- /dev/null
+++ b/kernels/compiler_insn_selection_masked_min_max.cl
@@ -0,0 +1,11 @@
+__kernel void
+compiler_insn_selection_masked_min_max(__global float* src, __global float* dst)
+{
+ int id = (int)get_global_id(0);
+ if (get_local_id(0) > 5)
+ dst[id] = max(src[id], src[7]);
+ else
+ dst[id] = min(src[id], src[10]);
+}
+
+
diff --git a/kernels/compiler_insn_selection_max.cl b/kernels/compiler_insn_selection_max.cl
new file mode 100644
index 0000000..762de2b
--- /dev/null
+++ b/kernels/compiler_insn_selection_max.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_insn_selection_max(__global float* src, __global float* dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = max(src[id], src[0]);
+}
+
diff --git a/kernels/compiler_insn_selection_min.cl b/kernels/compiler_insn_selection_min.cl
new file mode 100644
index 0000000..6800eaf
--- /dev/null
+++ b/kernels/compiler_insn_selection_min.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_insn_selection_min(__global float* src, __global float* dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = min(src[id], src[0]);
+}
+
diff --git a/kernels/compiler_integer_builtin.cl b/kernels/compiler_integer_builtin.cl
new file mode 100644
index 0000000..4faacd6
--- /dev/null
+++ b/kernels/compiler_integer_builtin.cl
@@ -0,0 +1,23 @@
+/* test OpenCL 1.1 Integet Built-in Functions (section 6.11.3) */
+__kernel void compiler_integer_builtin() {
+ int i = 0, i1 = -1, i2 = -2;
+ unsigned u = 1, u1 = 2, u2 = 3;
+ i = CHAR_MAX;
+ i = abs(u);
+ i = abs_diff(u1, u2);
+ i = add_sat(i1, i2);
+ i = hadd(i1, i2);
+ i = rhadd(i1, i2);
+ i = clz(i);
+ i = clamp(i, i1, i2);
+ i = mad_hi(i, i1, i2);
+ i = mad_sat(i, i1, i2);
+ i = max(i1, i2);
+ i = min(i1, i2);
+ i = mul_hi(i1, i2);
+ i = rotate(i1, i2);
+ i = sub_sat(i1, i2);
+ long l = upsample(i, u);
+ i = mad24(i, i1, i2);
+ i = mul24(i1, i2);
+}
diff --git a/kernels/compiler_integer_division.cl b/kernels/compiler_integer_division.cl
new file mode 100644
index 0000000..146daa0
--- /dev/null
+++ b/kernels/compiler_integer_division.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_integer_division(__global int *src, __global int *dst, int x)
+{
+ dst[get_global_id(0)] = src[get_global_id(0)] / x;
+}
+
diff --git a/kernels/compiler_integer_remainder.cl b/kernels/compiler_integer_remainder.cl
new file mode 100644
index 0000000..73558cb
--- /dev/null
+++ b/kernels/compiler_integer_remainder.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_integer_remainder(__global int *src, __global int *dst, int x)
+{
+ dst[get_global_id(0)] = src[get_global_id(0)] % x;
+}
+
diff --git a/kernels/compiler_load_bool_imm.cl b/kernels/compiler_load_bool_imm.cl
new file mode 100644
index 0000000..fda49b9
--- /dev/null
+++ b/kernels/compiler_load_bool_imm.cl
@@ -0,0 +1,12 @@
+__kernel void
+compiler_load_bool_imm(__global int *dst, __local int *localBuffer, int copiesPerWorkItem )
+{
+ int i;
+ for(i=0; i<copiesPerWorkItem; i++)
+ localBuffer[get_local_id(0)*copiesPerWorkItem+i] = copiesPerWorkItem;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for(i=0; i<copiesPerWorkItem; i++)
+ dst[get_global_id(0)*copiesPerWorkItem + i] = localBuffer[get_local_id(0)*copiesPerWorkItem+i];
+ barrier(CLK_LOCAL_MEM_FENCE);
+}
diff --git a/kernels/compiler_local_memory_barrier.cl b/kernels/compiler_local_memory_barrier.cl
new file mode 100644
index 0000000..39a94b8
--- /dev/null
+++ b/kernels/compiler_local_memory_barrier.cl
@@ -0,0 +1,6 @@
+__kernel void compiler_local_memory_barrier(__global int *dst, __local int *src) {
+ src[get_local_id(0)] = get_local_id(0);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ dst[get_global_id(0)] = src[15 - get_local_id(0)];
+}
+
diff --git a/kernels/compiler_local_memory_barrier_2.cl b/kernels/compiler_local_memory_barrier_2.cl
new file mode 100644
index 0000000..dca4a9c
--- /dev/null
+++ b/kernels/compiler_local_memory_barrier_2.cl
@@ -0,0 +1,7 @@
+__kernel void compiler_local_memory_barrier_2(__global int *dst, __local int *src) {
+ src[get_local_id(0)] = get_local_id(0);
+ src[get_local_size(0) + get_local_id(0)] = get_local_id(0);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ dst[get_local_size(0) * (2 * get_group_id(0)) + get_local_id(0)] = src[get_local_size(0) - (get_local_id(0) + 1)];
+ dst[get_local_size(0) * (2 * get_group_id(0) + 1) + get_local_id(0)] = src[get_local_size(0) + get_local_size(0) - (get_local_id(0) + 1)];
+}
diff --git a/kernels/compiler_local_memory_barrier_wg64.cl b/kernels/compiler_local_memory_barrier_wg64.cl
new file mode 100644
index 0000000..b2ea906
--- /dev/null
+++ b/kernels/compiler_local_memory_barrier_wg64.cl
@@ -0,0 +1,6 @@
+__kernel void compiler_local_memory_barrier_wg64(__global int *dst, __local int *src) {
+ src[get_local_id(0)] = get_local_id(0);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ dst[get_global_id(0)] = src[63 - get_local_id(0)];
+}
+
diff --git a/kernels/compiler_local_memory_two_ptr.cl b/kernels/compiler_local_memory_two_ptr.cl
new file mode 100644
index 0000000..46589ba
--- /dev/null
+++ b/kernels/compiler_local_memory_two_ptr.cl
@@ -0,0 +1,10 @@
+__kernel void compiler_local_memory_two_ptr(__global int *dst,
+ __local int *src0,
+ __local int *src1)
+{
+ src0[get_local_id(0)] = get_local_id(0);
+ src1[get_local_id(0)] = get_global_id(0);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ dst[get_global_id(0)] = src0[15 - get_local_id(0)] + src1[15 - get_local_id(0)];
+}
+
diff --git a/kernels/compiler_local_slm.cl b/kernels/compiler_local_slm.cl
new file mode 100644
index 0000000..52c078c
--- /dev/null
+++ b/kernels/compiler_local_slm.cl
@@ -0,0 +1,24 @@
+struct Test{
+ char t0;
+ int t1;
+};
+
+constant int two= 2;
+
+__kernel void compiler_local_slm(__global int *dst) {
+ __local int hop[16];
+ __local char a;
+ __local struct Test c;
+
+ c.t1 = get_group_id(0);
+ a = two;// seems clang currently has a bug if I write 'a=2;' so currently workaroud it.
+ hop[get_local_id(0)] = get_local_id(0);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ dst[get_global_id(0)] = hop[get_local_id(0)] + (int)a + hop[1] + c.t1;
+}
+
+__kernel void compiler_local_slm1(__global ulong *dst) {
+ __local int hop[16];
+ dst[1] = (ulong)&hop[1];
+ dst[0] = (ulong)&hop[0];
+}
diff --git a/kernels/compiler_long.cl b/kernels/compiler_long.cl
new file mode 100644
index 0000000..e69c5bf
--- /dev/null
+++ b/kernels/compiler_long.cl
@@ -0,0 +1,8 @@
+kernel void compiler_long(global long *src1, global long *src2, global long *dst, long zero) {
+ int i = get_global_id(0);
+
+ if(i < 5)
+ dst[i] = src1[i] + src2[i] + src2[i]*zero;
+ if(i > 5)
+ dst[i] = src1[i] - src2[i] - zero;
+}
diff --git a/kernels/compiler_long_2.cl b/kernels/compiler_long_2.cl
new file mode 100644
index 0000000..92be93a
--- /dev/null
+++ b/kernels/compiler_long_2.cl
@@ -0,0 +1,20 @@
+kernel void compiler_long_2(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ switch(i) {
+ case 0:
+ dst[i] = 0xFEDCBA9876543210UL;
+ break;
+ case 1:
+ dst[i] = src1[i] & src2[i];
+ break;
+ case 2:
+ dst[i] = src1[i] | src2[i];
+ break;
+ case 3:
+ dst[i] = src1[i] ^ src2[i];
+ break;
+ case 4:
+ dst[i] = src1[i] ? 0x1122334455667788L : 0x8877665544332211UL;
+ break;
+ }
+}
diff --git a/kernels/compiler_long_asr.cl b/kernels/compiler_long_asr.cl
new file mode 100644
index 0000000..901630b
--- /dev/null
+++ b/kernels/compiler_long_asr.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_asr(global long *src, global long *dst) {
+ int i = get_global_id(0);
+ if(i > 7)
+ dst[i] = src[i] >> i;
+ else
+ dst[i] = src[i] + 1;
+}
diff --git a/kernels/compiler_long_cmp.cl b/kernels/compiler_long_cmp.cl
new file mode 100644
index 0000000..90dfb60
--- /dev/null
+++ b/kernels/compiler_long_cmp.cl
@@ -0,0 +1,29 @@
+kernel void compiler_long_cmp_l(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = (src1[i] < src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_le(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = (src1[i] <= src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_g(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = (src1[i] > src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_ge(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = (src1[i] >= src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_eq(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = (src1[i] == src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_neq(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = (src1[i] != src2[i]) ? 3 : 4;
+}
diff --git a/kernels/compiler_long_convert.cl b/kernels/compiler_long_convert.cl
new file mode 100644
index 0000000..e5f7939
--- /dev/null
+++ b/kernels/compiler_long_convert.cl
@@ -0,0 +1,19 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_long_convert(global char *src1, global short *src2, global int *src3, global long *dst1, global long *dst2, global long *dst3) {
+ int i = get_global_id(0);
+ dst1[i] = src1[i];
+ dst2[i] = src2[i];
+ dst3[i] = src3[i];
+}
+
+kernel void compiler_long_convert_2(global char *dst1, global short *dst2, global int *dst3, global long *src) {
+ int i = get_global_id(0);
+ dst1[i] = src[i];
+ dst2[i] = src[i];
+ dst3[i] = src[i];
+}
+
+kernel void compiler_long_convert_to_float(global float *dst, global long *src) {
+ int i = get_global_id(0);
+ dst[i] = src[i];
+}
diff --git a/kernels/compiler_long_mult.cl b/kernels/compiler_long_mult.cl
new file mode 100644
index 0000000..5b96d74
--- /dev/null
+++ b/kernels/compiler_long_mult.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_mult(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ if(i < 3)
+ dst[i] = src1[i] + src2[i];
+ else
+ dst[i] = src1[i] * src2[i];
+}
diff --git a/kernels/compiler_long_shl.cl b/kernels/compiler_long_shl.cl
new file mode 100644
index 0000000..3786b77
--- /dev/null
+++ b/kernels/compiler_long_shl.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_shl(global long *src, global long *dst) {
+ int i = get_global_id(0);
+ if(i > 7)
+ dst[i] = src[i] << i;
+ else
+ dst[i] = src[i] + 1;
+}
diff --git a/kernels/compiler_long_shr.cl b/kernels/compiler_long_shr.cl
new file mode 100644
index 0000000..d4e859c
--- /dev/null
+++ b/kernels/compiler_long_shr.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_shr(global ulong *src, global ulong *dst) {
+ int i = get_global_id(0);
+ if(i > 7)
+ dst[i] = src[i] >> i;
+ else
+ dst[i] = src[i] + 1;
+}
diff --git a/kernels/compiler_lower_return0.cl b/kernels/compiler_lower_return0.cl
new file mode 100644
index 0000000..fd9846e
--- /dev/null
+++ b/kernels/compiler_lower_return0.cl
@@ -0,0 +1,8 @@
+__kernel void
+compiler_lower_return0(__global int *src, __global int *dst) {
+ const int id = get_global_id(0);
+ dst[id] = id;
+ if (src[id] > 0) return;
+ dst[id] = src[id];
+}
+
diff --git a/kernels/compiler_lower_return1.cl b/kernels/compiler_lower_return1.cl
new file mode 100644
index 0000000..bcb6b7f
--- /dev/null
+++ b/kernels/compiler_lower_return1.cl
@@ -0,0 +1,8 @@
+__kernel void
+compiler_lower_return1(__global int *src, __global int *dst) {
+ const int id = get_global_id(0);
+ dst[id] = id;
+ if (id < 11 && (src[id] > 0 || src[id+16] < 2)) return;
+ dst[id] = src[id];
+}
+
diff --git a/kernels/compiler_lower_return2.cl b/kernels/compiler_lower_return2.cl
new file mode 100644
index 0000000..9fa8ad6
--- /dev/null
+++ b/kernels/compiler_lower_return2.cl
@@ -0,0 +1,11 @@
+__kernel void
+compiler_lower_return2(__global int *src, __global int *dst) {
+ const int id = get_global_id(0);
+ dst[id] = id;
+ while (dst[id] > src[id]) {
+ if (dst[id] > 10) return;
+ dst[id]--;
+ }
+ dst[id] += 2;
+}
+
diff --git a/kernels/compiler_mad24.cl b/kernels/compiler_mad24.cl
new file mode 100644
index 0000000..04bb2c5
--- /dev/null
+++ b/kernels/compiler_mad24.cl
@@ -0,0 +1,4 @@
+kernel void compiler_mad24(global int *src1, global int *src2, global int *src3, global int *dst) {
+ int i = get_global_id(0);
+ dst[i] = mad24(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/compiler_mad_hi.cl b/kernels/compiler_mad_hi.cl
new file mode 100644
index 0000000..82b09c7
--- /dev/null
+++ b/kernels/compiler_mad_hi.cl
@@ -0,0 +1,4 @@
+kernel void compiler_mad_hi(global int *src1, global int *src2, global int *src3, global int *dst) {
+ int i = get_global_id(0);
+ dst[i] = mad_hi(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/compiler_mandelbrot.cl b/kernels/compiler_mandelbrot.cl
new file mode 100644
index 0000000..d15ccd0
--- /dev/null
+++ b/kernels/compiler_mandelbrot.cl
@@ -0,0 +1,47 @@
+// Used to ID into the 1D array, so that we can use
+// it effectively as a 2D array
+inline int ID(int x, int y, int width) { return 4*width*y + x*4; }
+inline float mapX(float x) { return x*3.25f - 2.f; }
+inline float mapY(float y) { return y*2.5f - 1.25f; }
+
+__kernel void compiler_mandelbrot(__global char *out) {
+ int x_dim = get_global_id(0);
+ int y_dim = get_global_id(1);
+ int width = get_global_size(0);
+ int height = get_global_size(1);
+ int idx = ID(x_dim, y_dim, width);
+
+ float x_origin = mapX((float) x_dim / (float) width);
+ float y_origin = mapY((float) y_dim / (float) height);
+
+ // The Escape time algorithm, it follows the pseduocode from Wikipedia
+ // _very_ closely
+ float x = 0.0f;
+ float y = 0.0f;
+
+ int iteration = 0;
+
+ // This can be changed, to be more or less precise
+ int max_iteration = 256;
+ while(x*x + y*y <= 4 && iteration < max_iteration) {
+ float xtemp = x*x - y*y + x_origin;
+ y = 2*x*y + y_origin;
+ x = xtemp;
+ iteration++;
+ }
+
+ if(iteration == max_iteration) {
+ // This coordinate did not escape, so it is in the Mandelbrot set
+ out[idx] = 0;
+ out[idx + 1] = 0;
+ out[idx + 2] = 0;
+ out[idx + 3] = 255;
+ } else {
+ // This coordinate did escape, so color based on quickly it escaped
+ out[idx] = iteration;
+ out[idx + 1] = iteration;
+ out[idx + 2] = iteration;
+ out[idx + 3] = 255;
+ }
+
+}
diff --git a/kernels/compiler_mandelbrot_alternate.cl b/kernels/compiler_mandelbrot_alternate.cl
new file mode 100644
index 0000000..ab6fb07
--- /dev/null
+++ b/kernels/compiler_mandelbrot_alternate.cl
@@ -0,0 +1,38 @@
+inline int offset(int x, int y, int width) { return width*y + x; }
+inline float mapX(float x) {return x*3.25f - 2.f;}
+inline float mapY(float y) {return y*2.5f - 1.25f;}
+
+__kernel void compiler_mandelbrot_alternate(__global uint *out,
+ float rcpWidth,
+ float rcpHeight,
+ float criterium)
+{
+ int xDim = get_global_id(0);
+ int yDim = get_global_id(1);
+ int width = get_global_size(0);
+ int height = get_global_size(1);
+ int idx = offset(xDim, yDim, width);
+
+ float xOrigin = mapX((float) xDim * rcpWidth);
+ float yOrigin = mapY((float) yDim * rcpHeight);
+ float x = 0.0f;
+ float y = 0.0f;
+
+ float iteration = 256.f;
+
+ bool breakCond = true;
+ while (breakCond) {
+ const float xtemp = mad(-y,y,mad(x,x,xOrigin));
+ y = mad(2.f*x, y, yOrigin);
+ x = xtemp;
+ iteration -= 1.f;
+ breakCond = -mad(y,y,mad(x,x, -criterium)) * iteration > 0.f;
+ }
+
+ const uint iIteration = 256 - (uint) iteration;
+ const uint isBlack = (iIteration == 256);
+ const uint black = 255 << 24;
+ const uint nonBlack = iIteration | (iIteration << 8) | (iIteration << 16) | (255 << 24);
+ out[idx] = select(nonBlack, black, isBlack);
+}
+
diff --git a/kernels/compiler_mandelbrot_alternate_ref.bmp b/kernels/compiler_mandelbrot_alternate_ref.bmp
new file mode 100644
index 0000000..011d583
Binary files /dev/null and b/kernels/compiler_mandelbrot_alternate_ref.bmp differ
diff --git a/kernels/compiler_mandelbrot_ref.bmp b/kernels/compiler_mandelbrot_ref.bmp
new file mode 100644
index 0000000..494bf8b
Binary files /dev/null and b/kernels/compiler_mandelbrot_ref.bmp differ
diff --git a/kernels/compiler_math.cl b/kernels/compiler_math.cl
new file mode 100644
index 0000000..695fc2c
--- /dev/null
+++ b/kernels/compiler_math.cl
@@ -0,0 +1,40 @@
+__kernel void compiler_math(__global float *dst, __global float *src) {
+ int i = get_global_id(0);
+ const float x = src[i];
+ switch (i) {
+ case 0: dst[i] = cos(x); break;
+ case 1: dst[i] = sin(x); break;
+ case 2: dst[i] = log2(x); break;
+ case 3: dst[i] = sqrt(x); break;
+ case 4: dst[i] = rsqrt(x); break;
+ case 5: dst[i] = native_recip(x); break;
+ case 6: dst[i] = tan(x); break;
+ case 7: dst[i] = cbrt(x); break;
+ case 8: dst[i] = ceil(x); break;
+ case 9: dst[i] = cospi(x); break;
+ case 10: dst[i] = exp2(x); break;
+ case 11: dst[i] = exp10(x); break;
+ case 12: dst[i] = expm1(x); break;
+ case 13: dst[i] = log1p(x); break;
+ case 14: dst[i] = logb(x); break;
+ case 15: dst[i] = sinpi(x); break;
+ case 16: dst[i] = tanpi(x); break;
+ case 17: dst[i] = rint(x); break;
+ case 18: dst[i] = sinh(x); break;
+ case 19: dst[i] = cosh(x); break;
+ case 20: dst[i] = tanh(x); break;
+ case 21: dst[i] = asinh(x); break;
+ case 22: dst[i] = acosh(x); break;
+ case 23: dst[i] = atanh(x); break;
+ case 24: dst[i] = asin(x); break;
+ case 25: dst[i] = acos(x); break;
+ case 26: dst[i] = atan(x); break;
+ case 27: dst[i] = asinpi(x); break;
+ case 28: dst[i] = acospi(x); break;
+ case 29: dst[i] = atanpi(x); break;
+ case 30: dst[i] = erf(x); break;
+ case 31: dst[i] = nan((uint)x); break;
+ default: dst[i] = 1.f; break;
+ };
+}
+
diff --git a/kernels/compiler_math_2op.cl b/kernels/compiler_math_2op.cl
new file mode 100644
index 0000000..6e970b8
--- /dev/null
+++ b/kernels/compiler_math_2op.cl
@@ -0,0 +1,19 @@
+kernel void compiler_math_2op(global float *dst, global float *src1, global float *src2) {
+ int i = get_global_id(0);
+ const float x = src1[i], y = src2[i];
+ float z;
+ switch (i) {
+ case 0: dst[i] = native_divide(x, y); break;
+ case 1: dst[i] = fdim(x, y); break;
+ case 2: dst[i] = fract(x, &z); break;
+ case 3: dst[i] = hypot(x, y); break;
+ case 4: dst[i] = ldexp(x, y); break;
+ case 5: dst[i] = pown(x, (int)y); break;
+ case 6: dst[i] = remainder(x, y); break;
+ case 7: dst[i] = rootn(x, (int)(y+1)); break;
+ case 8: dst[i] = copysign(x, y); break;
+ case 9: dst[i] = maxmag(x, y); break;
+ case 10: dst[i] = minmag(x, y); break;
+ default: dst[i] = 1.f; break;
+ };
+}
diff --git a/kernels/compiler_math_3op.cl b/kernels/compiler_math_3op.cl
new file mode 100644
index 0000000..95b0398
--- /dev/null
+++ b/kernels/compiler_math_3op.cl
@@ -0,0 +1,9 @@
+kernel void compiler_math_3op(global float *dst, global float *src1, global float *src2, global float *src3) {
+ int i = get_global_id(0);
+ const float x = src1[i], y = src2[i], z = src3[i];
+ switch (i) {
+ case 0: dst[i] = mad(x, y, z); break;
+ case 1: dst[i] = fma(x, y, z); break;
+ default: dst[i] = 1.f; break;
+ };
+}
diff --git a/kernels/compiler_math_builtin.cl b/kernels/compiler_math_builtin.cl
new file mode 100644
index 0000000..d5c8392
--- /dev/null
+++ b/kernels/compiler_math_builtin.cl
@@ -0,0 +1,82 @@
+/* OpenCL 1.1 Math Built-in Functions (section 6.11.2) */
+__kernel void
+compiler_array0(__global float *src, __global float *dst)
+{
+ int p = get_global_id(0);
+ dst[p] = acos(src[p]);
+ dst[p+1] = acosh(src[p]);
+ dst[p+2] = acospi(src[p]);
+ dst[p+3] = asin(src[p]);
+ dst[p+4] = asinh(src[p]);
+ dst[p+5] = asinpi(src[p]);
+ dst[p+6] = atan(src[p]);
+ dst[p+7] = atan2(src[p], src[p+1]);
+ dst[p+8] = atanh(src[p]);
+ dst[p+9] = atanpi(src[p]);
+ dst[p+10] = atan2pi(src[p], src[p+1]);
+ dst[p+11] = cbrt(src[p]);
+ dst[p+12] = ceil(src[p]);
+ dst[p+13] = copysign(src[p], src[p+1]);
+ dst[p+14] = cos(src[p]);
+ dst[p+15] = cosh(src[p]);
+ dst[p+16] = cospi(src[p]);
+ dst[p+17] = half_divide(src[p], src[p+1]);
+ dst[p+18] = native_divide(src[p], src[p+1]);
+ dst[p+19] = erfc(src[p]);
+ dst[p+20] = erf(src[p]);
+ dst[p+21] = exp(src[p]);
+ dst[p+22] = exp2(src[p]);
+ dst[p+23] = exp10(src[p]);
+ dst[p+24] = expm1(src[p]);
+ dst[p+25] = fabs(src[p]);
+ dst[p+26] = fdim(src[p], src[p+1]);
+ dst[p+27] = floor(src[p]);
+ dst[p+28] = fma(src[p], src[p+1], src[p+2]);
+ dst[p+29] = fmax(src[p], src[p+1]);
+ dst[p+30] = fmin(src[p]);
+ dst[p+31] = fmod(src[p], src[p+1]);
+ __local float iptr[4];
+ dst[p+32] = fract(src[p], iptr);
+ __private int exps[4];
+ dst[p+33] = frexp(src[p], exps);
+ dst[p+34] = hypot(src[p], src[p+1]);
+ dst[p+35] = (float)ilogb(src[p]);
+ dst[p+36] = ldexp(src[p], 10);
+ dst[p+37] = lgamma(src[p]);
+ __local int signp[4];
+ dst[p+38] = lgamma_r(src[p], signp);
+ dst[p+39] = log(src[p]);
+ dst[p+40] = log2(src[p]);
+ dst[p+41] = log10(src[p]);
+ dst[p+42] = log1p(src[p]);
+ dst[p+43] = logb(src[p]);
+ dst[p+44] = mad(src[p], src[p+1], src[p+2]);
+ dst[p+45] = maxmag(src[p], src[p+1]);
+ dst[p+46] = minmag(src[p], src[p+1]);
+ dst[p+47] = modf(src[p], iptr);
+ dst[p+48] = nan((ulong)src[p]);
+ dst[p+49] = nextafter(src[p], src[p+1]);
+ dst[p+50] = pow(src[p], src[p+1]);
+ dst[p+51] = pown(src[p], (int)src[p+1]);
+ dst[p+52] = powr(src[p], src[p+1]);
+ dst[p+53] = half_recip((half)src[p]) + native_recip(src[p]);
+ dst[p+54] = remainder(src[p], src[p+1]);
+ __private int quo[4];
+ dst[p+55] = remquo(src[p], quo);
+ dst[p+56] = rint(src[p]);
+ dst[p+57] = rootn(src[p], 10);
+ dst[p+58] = round(src[p]);
+ dst[p+59] = rsqrt(src[p]);
+ dst[p+60] = sin(src[p]);
+ __local float cosval;
+ dst[p+61] = sincos(src[p], &cosval);
+ dst[p+62] = sinh(src[p]);
+ dst[p+63] = sinpi(src[p]);
+ dst[p+64] = sqrt(src[p]);
+ dst[p+65] = tan(src[p]);
+ dst[p+66] = tanh(src[p]);
+ dst[p+67] = tanpi(src[p]);
+ dst[p+68] = tgamma(src[p]);
+ dst[p+69] = trunc(src[p]);
+}
+
diff --git a/kernels/compiler_math_constants.cl b/kernels/compiler_math_constants.cl
new file mode 100644
index 0000000..4979cf2
--- /dev/null
+++ b/kernels/compiler_math_constants.cl
@@ -0,0 +1,23 @@
+/* test case for OpenCL 1.1 Math Constants (section 6.11.2) */
+__kernel void compiler_math_constants()
+{
+ float f;
+ f = MAXFLOAT;
+ f = HUGE_VALF;
+ f = HUGE_VAL;
+ f = INFINITY;
+ f = NAN;
+ f = M_E_F;
+ f = M_LOG2E_F;
+ f = M_LOG10E_F;
+ f = M_LN2_F;
+ f = M_LN10_F;
+ f = M_PI_F;
+ f = M_PI_2_F;
+ f = M_PI_4_F;
+ f = M_1_PI_F;
+ f = M_2_PI_F;
+ f = M_2_SQRTPI_F;
+ f = M_SQRT2_F;
+ f = M_SQRT1_2_F;
+}
diff --git a/kernels/compiler_mem_fence.cl b/kernels/compiler_mem_fence.cl
new file mode 100644
index 0000000..c17985e
--- /dev/null
+++ b/kernels/compiler_mem_fence.cl
@@ -0,0 +1,10 @@
+kernel void compiler_mem_fence() {
+ barrier(CLK_LOCAL_MEM_FENCE);
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ mem_fence(CLK_LOCAL_MEM_FENCE);
+ mem_fence(CLK_GLOBAL_MEM_FENCE);
+ read_mem_fence(CLK_LOCAL_MEM_FENCE);
+ read_mem_fence(CLK_GLOBAL_MEM_FENCE);
+ write_mem_fence(CLK_LOCAL_MEM_FENCE);
+ write_mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
diff --git a/kernels/compiler_mixed_pointer.cl b/kernels/compiler_mixed_pointer.cl
new file mode 100644
index 0000000..78c5783
--- /dev/null
+++ b/kernels/compiler_mixed_pointer.cl
@@ -0,0 +1,23 @@
+
+kernel void compiler_mixed_pointer(__global uint* src1, __global uint *src2, __global uint *dst) {
+ int x = get_global_id(0);
+ global uint * tmp = NULL;
+
+ switch(x) {
+ case 0:
+ case 1:
+ case 4:
+ tmp = src1;
+ break;
+ default:
+ tmp = src2;
+ break;
+ }
+ dst[x] = tmp[x];
+}
+
+kernel void compiler_mixed_pointer1(__global uint* src, __global uint *dst1, __global uint *dst2) {
+ int x = get_global_id(0);
+ global uint * tmp = x < 5 ? dst1 : dst2;
+ tmp[x] = src[x];
+}
diff --git a/kernels/compiler_mul24.cl b/kernels/compiler_mul24.cl
new file mode 100644
index 0000000..b69dda0
--- /dev/null
+++ b/kernels/compiler_mul24.cl
@@ -0,0 +1,4 @@
+kernel void compiler_mul24(global int *src1, global int *src2, global int *dst) {
+ int i = get_global_id(0);
+ dst[i] = mul24(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_mul_hi.cl b/kernels/compiler_mul_hi.cl
new file mode 100644
index 0000000..28ce0a5
--- /dev/null
+++ b/kernels/compiler_mul_hi.cl
@@ -0,0 +1,4 @@
+kernel void compiler_mul_hi(global int *src1, global int *src2, global int *dst) {
+ int i = get_global_id(0);
+ dst[i] = mul_hi(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_multiple_kernels.cl b/kernels/compiler_multiple_kernels.cl
new file mode 100644
index 0000000..d5cea68
--- /dev/null
+++ b/kernels/compiler_multiple_kernels.cl
@@ -0,0 +1,7 @@
+__kernel void first_kernel(void)
+{
+}
+
+__kernel void second_kernel(void)
+{
+}
\ No newline at end of file
diff --git a/kernels/compiler_obread.cl b/kernels/compiler_obread.cl
new file mode 100644
index 0000000..14658d9
--- /dev/null
+++ b/kernels/compiler_obread.cl
@@ -0,0 +1,8 @@
+__kernel void
+compiler_obread(__global uint *src, __global uint *dst)
+{
+ int id = (int)get_global_id(0);
+ const int to = __gen_ocl_obread(src+id);
+ dst[id] = to;
+}
+
diff --git a/kernels/compiler_obwrite.cl b/kernels/compiler_obwrite.cl
new file mode 100644
index 0000000..50e55a1
--- /dev/null
+++ b/kernels/compiler_obwrite.cl
@@ -0,0 +1,8 @@
+__kernel void
+compiler_obwrite(__global uint *src, __global uint *dst)
+{
+ int id = (int)get_global_id(0);
+ const int to = src[id];
+ __gen_ocl_obwrite(dst+id,to);
+}
+
diff --git a/kernels/compiler_preprocessor_macros.cl b/kernels/compiler_preprocessor_macros.cl
new file mode 100644
index 0000000..0f23b3f
--- /dev/null
+++ b/kernels/compiler_preprocessor_macros.cl
@@ -0,0 +1,13 @@
+/* test case for OpenCL 1.1 Preprocessor Directives & Macros (section 6.9) */
+__kernel_exec(1, float4) void compiler_preprocessor_macros()
+{
+#pragma OPENCL FP_CONTRACT ON
+#pragma OPENCL FP_CONTRACT OFF
+#pragma OPENCL FP_CONTRACT DEFAULT
+ int i = __OPENCL_VERSION__;
+ i = __CL_VERSION_1_0__;
+ i = __CL_VERSION_1_1__;
+ i = __ENDIAN_LITTLE__;
+ i = __IMAGE_SUPPORT__;
+ i = __FAST_RELAXED_MATH__;
+}
diff --git a/kernels/compiler_private_data_overflow.cl b/kernels/compiler_private_data_overflow.cl
new file mode 100644
index 0000000..d0f557d
--- /dev/null
+++ b/kernels/compiler_private_data_overflow.cl
@@ -0,0 +1,10 @@
+kernel void compiler_private_data_overflow( __global int4 *output )
+{
+ int4 data[65];
+ for( int i=0; i<65; ++i )
+ {
+ data[i] = (int4)i;
+ }
+ if( get_global_id(0) == 1 )
+ *output = data[0];
+}
diff --git a/kernels/compiler_radians.cl b/kernels/compiler_radians.cl
new file mode 100644
index 0000000..1f79481
--- /dev/null
+++ b/kernels/compiler_radians.cl
@@ -0,0 +1,4 @@
+kernel void compiler_radians(global float *src, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = radians(src[i]);
+}
diff --git a/kernels/compiler_region.cl b/kernels/compiler_region.cl
new file mode 100644
index 0000000..d74ac7d
--- /dev/null
+++ b/kernels/compiler_region.cl
@@ -0,0 +1,10 @@
+__kernel void
+compiler_region(__global uint *src, __global uint *dst)
+{
+ __gen_ocl_force_simd16();
+ int id = (int)get_global_id(0);
+ const int x0 = src[id];
+ const int x1 = src[id+16];
+ dst[id] = __gen_ocl_region(0, 16, 8, 2, x0, x1);
+}
+
diff --git a/kernels/compiler_region0.cl b/kernels/compiler_region0.cl
new file mode 100644
index 0000000..5bd57c0
--- /dev/null
+++ b/kernels/compiler_region0.cl
@@ -0,0 +1,11 @@
+__kernel void
+compiler_region0(__global uint *src, __global uint *dst)
+{
+ __gen_ocl_force_simd16();
+ int id = (int)get_global_id(0);
+ const int x0 = src[id];
+ const int x1 = src[id+16];
+ const int x2 = src[id+32];
+ dst[id] = __gen_ocl_region(1, 16, 8, 2, x0, x1, x2);
+}
+
diff --git a/kernels/compiler_region1.cl b/kernels/compiler_region1.cl
new file mode 100644
index 0000000..9deb63c
--- /dev/null
+++ b/kernels/compiler_region1.cl
@@ -0,0 +1,9 @@
+__kernel void
+compiler_region1(__global uint *src, __global uint *dst)
+{
+ __gen_ocl_force_simd16();
+ int id = (int)get_global_id(0);
+ const int x0 = src[id];
+ dst[id] = __gen_ocl_region(0, 16, 8, 2, x0);
+}
+
diff --git a/kernels/compiler_relational_builtin.cl b/kernels/compiler_relational_builtin.cl
new file mode 100644
index 0000000..8b195ca
--- /dev/null
+++ b/kernels/compiler_relational_builtin.cl
@@ -0,0 +1,24 @@
+/* test OpenCL 1.1 Relational Built-in Functions (section 6.11.6) */
+kernel void compiler_relational_builtin() {
+ float x = 1, y = 2, z = 3;
+ int i;
+ i = isequal(x, y);
+ i = isnotequal(x, y);
+ i = isgreater(x, y);
+ i = isgreaterequal(x, y);
+ i = isless(x, y);
+ i = islessequal(x, y);
+ i = islessgreater(x, y);
+ i = isfinite(x);
+ i = isinf(x);
+ i = isnan(x);
+ i = isnormal(x);
+ i = isordered(x, y);
+ i = isunordered(x, y);
+ i = signbit(x);
+ long l = 12;
+ i = any(l);
+ i = all(l);
+ bitselect(x, y, z);
+ select(x, y, z);
+}
diff --git a/kernels/compiler_rhadd.cl b/kernels/compiler_rhadd.cl
new file mode 100644
index 0000000..4024ace
--- /dev/null
+++ b/kernels/compiler_rhadd.cl
@@ -0,0 +1,4 @@
+kernel void compiler_rhadd(global int *src1, global int *src2, global int *dst) {
+ int i = get_global_id(0);
+ dst[i] = rhadd(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_rotate.cl b/kernels/compiler_rotate.cl
new file mode 100644
index 0000000..8d0dd0f
--- /dev/null
+++ b/kernels/compiler_rotate.cl
@@ -0,0 +1,5 @@
+kernel void compiler_rotate(global int *src, global int *dst, global int *y) {
+ int i = get_global_id(0);
+ dst[i] = rotate(src[i], y[i]);
+}
+
diff --git a/kernels/compiler_sampler.cl b/kernels/compiler_sampler.cl
new file mode 100644
index 0000000..149bbf1
--- /dev/null
+++ b/kernels/compiler_sampler.cl
@@ -0,0 +1,25 @@
+/* test OpenCL 1.1 sampler declaration */
+__kernel void compiler_sampler () {
+#define S(A,B,C) CLK_NORMALIZED_COORDS_##A | CLK_ADDRESS_##B | CLK_FILTER_##C
+ const sampler_t \
+ s0 = S(TRUE,REPEAT,NEAREST),
+ s1 = S(TRUE,REPEAT,LINEAR),
+ s2 = S(TRUE,CLAMP,NEAREST),
+ s3 = S(TRUE,CLAMP,LINEAR),
+ s4 = S(TRUE,NONE,NEAREST),
+ s5 = S(TRUE,NONE,LINEAR),
+ s6 = S(TRUE,CLAMP_TO_EDGE,NEAREST),
+ s7 = S(TRUE,CLAMP_TO_EDGE,LINEAR),
+ s8 = S(TRUE,MIRRORED_REPEAT,NEAREST),
+ s9 = S(TRUE,MIRRORED_REPEAT,LINEAR),
+ s10 = S(FALSE,REPEAT,NEAREST),
+ s11 = S(FALSE,REPEAT,LINEAR),
+ s12 = S(FALSE,CLAMP,NEAREST),
+ s13 = S(FALSE,CLAMP,LINEAR),
+ s14 = S(FALSE,NONE,NEAREST),
+ s15 = S(FALSE,NONE,LINEAR),
+ s16 = S(FALSE,CLAMP_TO_EDGE,NEAREST),
+ s17 = S(FALSE,CLAMP_TO_EDGE,LINEAR),
+ s18 = S(FALSE,MIRRORED_REPEAT,NEAREST),
+ s19 = S(FALSE,MIRRORED_REPEAT,LINEAR);
+}
diff --git a/kernels/compiler_saturate.cl b/kernels/compiler_saturate.cl
new file mode 100644
index 0000000..e9ffc4b
--- /dev/null
+++ b/kernels/compiler_saturate.cl
@@ -0,0 +1,16 @@
+#define TEST_TYPE(TYPE) \
+__kernel void test_##TYPE(__global TYPE *C, __global TYPE *A, __global TYPE *B) { \
+ int id = get_global_id(0); \
+ C[id] = add_sat(A[id], B[id]); \
+}
+
+TEST_TYPE(char)
+TEST_TYPE(uchar)
+TEST_TYPE(short)
+TEST_TYPE(ushort)
+TEST_TYPE(int)
+TEST_TYPE(uint)
+//TEST_TYPE(long)
+//TEST_TYPE(ulong)
+
+#undef TEST_TYPE
diff --git a/kernels/compiler_saturate_sub.cl b/kernels/compiler_saturate_sub.cl
new file mode 100644
index 0000000..e20a76f
--- /dev/null
+++ b/kernels/compiler_saturate_sub.cl
@@ -0,0 +1,16 @@
+#define TEST_TYPE(TYPE) \
+__kernel void test_##TYPE(__global TYPE *C, __global TYPE *A, __global TYPE *B) { \
+ int id = get_global_id(0); \
+ C[id] = sub_sat(A[id], B[id]); \
+}
+
+TEST_TYPE(char)
+TEST_TYPE(uchar)
+TEST_TYPE(short)
+TEST_TYPE(ushort)
+TEST_TYPE(int)
+TEST_TYPE(uint)
+//TEST_TYPE(long)
+//TEST_TYPE(ulong)
+
+#undef TEST_TYPE
diff --git a/kernels/compiler_shift_right.cl b/kernels/compiler_shift_right.cl
new file mode 100644
index 0000000..c109170
--- /dev/null
+++ b/kernels/compiler_shift_right.cl
@@ -0,0 +1,4 @@
+kernel void compiler_shift_right(global uint *src, global int *dst) {
+ int i = get_global_id(0);
+ dst[i] = src[i] >> 24;
+}
diff --git a/kernels/compiler_short_scatter.cl b/kernels/compiler_short_scatter.cl
new file mode 100644
index 0000000..7dad029
--- /dev/null
+++ b/kernels/compiler_short_scatter.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_short_scatter(__global short *dst)
+{
+ int id = (int) get_global_id(0);
+ dst[id] = (short) id;
+}
+
diff --git a/kernels/compiler_simd_all.cl b/kernels/compiler_simd_all.cl
new file mode 100644
index 0000000..504710b
--- /dev/null
+++ b/kernels/compiler_simd_all.cl
@@ -0,0 +1,12 @@
+__kernel void compiler_simd_all(global int *src, global int *dst)
+{
+ int i = get_global_id(0);
+ if (i % 2 == 1) {
+ if (__gen_ocl_simd_all((src[i] < 12) && (src[i] > 0)))
+ dst[i] = 1;
+ else
+ dst[i] = 2;
+ }
+ else
+ dst[i] = 3;
+}
diff --git a/kernels/compiler_simd_any.cl b/kernels/compiler_simd_any.cl
new file mode 100644
index 0000000..3b04f82
--- /dev/null
+++ b/kernels/compiler_simd_any.cl
@@ -0,0 +1,15 @@
+__kernel void compiler_simd_any(global int *src, global int *dst)
+{
+ int i = get_global_id(0);
+
+ if (i % 2 == 1) {
+ if (__gen_ocl_simd_any(src[i] == 5) || __gen_ocl_simd_any(src[i] == 9))
+ dst[i] = 1;
+ else if (__gen_ocl_simd_any(src[i] == 6))
+ dst[i] = 0;
+ else
+ dst[i] = 2;
+ }
+ else
+ dst[i] = 3;
+}
diff --git a/kernels/compiler_smoothstep.cl b/kernels/compiler_smoothstep.cl
new file mode 100644
index 0000000..d3b7da4
--- /dev/null
+++ b/kernels/compiler_smoothstep.cl
@@ -0,0 +1,4 @@
+kernel void compiler_smoothstep(global float *src1, global float *src2, global float *src3, global float *dst) {
+ int i = get_global_id(0);
+ dst[i] = smoothstep(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/compiler_step.cl b/kernels/compiler_step.cl
new file mode 100644
index 0000000..ef77f05
--- /dev/null
+++ b/kernels/compiler_step.cl
@@ -0,0 +1,38 @@
+#define COMPILER_STEP_FUNC_N(TYPE, N) \
+ kernel void compiler_step_##TYPE##N ( \
+ global TYPE##N* edge, global TYPE##N* x, global TYPE##N* dst) { \
+ int i = get_global_id(0); \
+ dst[i] = step(edge[i], x[i]); \
+ }
+
+kernel void compiler_step_float (global float* edge,
+ global float* x, global float* dst)
+{
+ int i = get_global_id(0);
+ dst[i] = step(edge[i], x[i]);
+}
+
+COMPILER_STEP_FUNC_N(float, 2)
+COMPILER_STEP_FUNC_N(float, 3)
+COMPILER_STEP_FUNC_N(float, 4)
+COMPILER_STEP_FUNC_N(float, 8)
+COMPILER_STEP_FUNC_N(float, 16)
+
+#define COMPILER_STEPF_FUNC_N(TYPE, N) \
+ kernel void compiler_stepf_##TYPE##N ( \
+ float edge, global TYPE##N* x, global TYPE##N* dst) { \
+ int i = get_global_id(0); \
+ dst[i] = step(edge, x[i]); \
+ }
+
+kernel void compiler_stepf_float (float edge, global float* x, global float* dst)
+{
+ int i = get_global_id(0);
+ dst[i] = step(edge, x[i]);
+}
+
+COMPILER_STEPF_FUNC_N(float, 2)
+COMPILER_STEPF_FUNC_N(float, 3)
+COMPILER_STEPF_FUNC_N(float, 4)
+COMPILER_STEPF_FUNC_N(float, 8)
+COMPILER_STEPF_FUNC_N(float, 16)
diff --git a/kernels/compiler_structure_attributes.cl b/kernels/compiler_structure_attributes.cl
new file mode 100644
index 0000000..a07dd88
--- /dev/null
+++ b/kernels/compiler_structure_attributes.cl
@@ -0,0 +1,17 @@
+#define X(x, y) x ## y
+#define NAME(x, y) X(x, y)
+#define S struct NAME(s, __LINE__) { \
+ char c; \
+ int i; \
+ float f; \
+}
+
+S __attribute__((aligned(16)));
+S __attribute__((aligned));
+S __attribute__((packed));
+S __attribute__((endian(host)));
+S __attribute__((endian(device)));
+S __attribute__((endian));
+
+__kernel void compiler_structure_attributes() {
+}
diff --git a/kernels/compiler_switch.cl b/kernels/compiler_switch.cl
new file mode 100644
index 0000000..c28b431
--- /dev/null
+++ b/kernels/compiler_switch.cl
@@ -0,0 +1,14 @@
+__kernel void compiler_switch(__global int *dst, __global int *src)
+{
+ switch (get_global_id(0)) {
+ case 0: dst[get_global_id(0)] = src[get_global_id(0) + 4]; break;
+ case 1: dst[get_global_id(0)] = src[get_global_id(0) + 14]; break;
+ case 2: dst[get_global_id(0)] = src[get_global_id(0) + 13]; break;
+ case 6: dst[get_global_id(0)] = src[get_global_id(0) + 11]; break;
+ case 7: dst[get_global_id(0)] = src[get_global_id(0) + 10]; break;
+ case 10: dst[get_global_id(0)] = src[get_global_id(0) + 9]; break;
+ case 12: dst[get_global_id(0)] = src[get_global_id(0) + 6]; break;
+ default: dst[get_global_id(0)] = src[get_global_id(0) + 8]; break;
+ }
+}
+
diff --git a/kernels/compiler_type_casting.cl b/kernels/compiler_type_casting.cl
new file mode 100644
index 0000000..3cdb925
--- /dev/null
+++ b/kernels/compiler_type_casting.cl
@@ -0,0 +1,19 @@
+/* test OpenCL 1.1 Conversions & Type Casting Examples (section 6.2) */
+__kernel void compiler_type_casting() {
+ float f = 1.23456789f;
+ float g;
+
+ g = (float)f;
+ g = convert_float(f);
+ g = as_float(f);
+
+ g = convert_float_rte(f);
+ g = convert_float_rtz(f);
+ g = convert_float_rtp(f);
+ g = convert_float_rtn(f);
+
+ g = convert_float_sat_rte(f);
+ g = convert_float_sat_rtz(f);
+ g = convert_float_sat_rtp(f);
+ g = convert_float_sat_rtn(f);
+}
diff --git a/kernels/compiler_uint16_copy.cl b/kernels/compiler_uint16_copy.cl
new file mode 100644
index 0000000..1072234
--- /dev/null
+++ b/kernels/compiler_uint16_copy.cl
@@ -0,0 +1,8 @@
+__kernel void
+compiler_uint16_copy(__global uint16 *src, __global uint16 *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = src[id];
+}
+
+
diff --git a/kernels/compiler_uint2_copy.cl b/kernels/compiler_uint2_copy.cl
new file mode 100644
index 0000000..7c5c5e3
--- /dev/null
+++ b/kernels/compiler_uint2_copy.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_uint2_copy(__global uint2 *src, __global uint2 *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = src[id];
+}
+
diff --git a/kernels/compiler_uint3_copy.cl b/kernels/compiler_uint3_copy.cl
new file mode 100644
index 0000000..7dc71b2
--- /dev/null
+++ b/kernels/compiler_uint3_copy.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_uint3_copy(__global uint3 *src, __global uint3 *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = src[id];
+}
+
diff --git a/kernels/compiler_uint3_unaligned_copy.cl b/kernels/compiler_uint3_unaligned_copy.cl
new file mode 100644
index 0000000..a50f0ab
--- /dev/null
+++ b/kernels/compiler_uint3_unaligned_copy.cl
@@ -0,0 +1,8 @@
+__kernel void
+compiler_uint3_unaligned_copy(__global uint *src, __global uint *dst)
+{
+ const int id = (int)get_global_id(0);
+ const uint3 from = vload3(id, src);
+ vstore3(from, id, dst);
+}
+
diff --git a/kernels/compiler_uint8_copy.cl b/kernels/compiler_uint8_copy.cl
new file mode 100644
index 0000000..9eee538
--- /dev/null
+++ b/kernels/compiler_uint8_copy.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_uint8_copy(__global uint8 *src, __global uint8 *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = src[id];
+}
+
diff --git a/kernels/compiler_unstructured_branch0.cl b/kernels/compiler_unstructured_branch0.cl
new file mode 100644
index 0000000..66da6e0
--- /dev/null
+++ b/kernels/compiler_unstructured_branch0.cl
@@ -0,0 +1,14 @@
+__kernel void
+compiler_unstructured_branch0(__global int *src, __global int *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = src[id];
+ if (dst[id] >= 0) goto label;
+
+ do {
+ dst[id] = 1;
+ label:
+ id += get_local_size(0);
+ } while (id < 32);
+}
+
diff --git a/kernels/compiler_unstructured_branch1.cl b/kernels/compiler_unstructured_branch1.cl
new file mode 100644
index 0000000..fb937e0
--- /dev/null
+++ b/kernels/compiler_unstructured_branch1.cl
@@ -0,0 +1,14 @@
+__kernel void
+compiler_unstructured_branch1(__global int *src, __global int *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = src[id];
+ if (dst[id] >= 0) goto label1;
+ dst[id] = 1;
+ if (src[id] <= 2) goto label2;
+ label1:
+ dst[id] -= 2;
+ label2:
+ dst[id] += 2;
+}
+
diff --git a/kernels/compiler_unstructured_branch2.cl b/kernels/compiler_unstructured_branch2.cl
new file mode 100644
index 0000000..546f253
--- /dev/null
+++ b/kernels/compiler_unstructured_branch2.cl
@@ -0,0 +1,18 @@
+__kernel void
+compiler_unstructured_branch2(__global int *src, __global int *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = src[id];
+ if (dst[id] < 0) goto label1;
+ dst[id] = 1;
+ if (dst[id] > src[id]) goto label3;
+ dst[id]++;
+ if (src[id] <= 2) goto label2;
+ label1:
+ dst[id] -= 2;
+ label2:
+ dst[id] += 2;
+ label3:
+ dst[id] *= 3;
+}
+
diff --git a/kernels/compiler_unstructured_branch3.cl b/kernels/compiler_unstructured_branch3.cl
new file mode 100644
index 0000000..67b4761
--- /dev/null
+++ b/kernels/compiler_unstructured_branch3.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_unstructured_branch3(__global int *src, __global int *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = src[id];
+ if (dst[id] >= 2) goto label1;
+ dst[id] = 1;
+ if (src[id] < 2) goto label2;
+ dst[id]--;
+ label1:
+ dst[id] -= 2;
+ label2:
+ dst[id] += 2;
+}
+
+
diff --git a/kernels/compiler_upsample_int.cl b/kernels/compiler_upsample_int.cl
new file mode 100644
index 0000000..d7945b5
--- /dev/null
+++ b/kernels/compiler_upsample_int.cl
@@ -0,0 +1,4 @@
+kernel void compiler_upsample_int(global short *src1, global ushort *src2, global int *dst) {
+ int i = get_global_id(0);
+ dst[i] = upsample(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_upsample_long.cl b/kernels/compiler_upsample_long.cl
new file mode 100644
index 0000000..8f914e4
--- /dev/null
+++ b/kernels/compiler_upsample_long.cl
@@ -0,0 +1,4 @@
+kernel void compiler_upsample_long(global int *src1, global uint *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = upsample(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_vect_compare.cl b/kernels/compiler_vect_compare.cl
new file mode 100644
index 0000000..ae43ec6
--- /dev/null
+++ b/kernels/compiler_vect_compare.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_vect_compare(__global int4 *src, __global int4 *dst)
+{
+ int4 test = (int4)(0,0,0,0);
+
+ dst[get_global_id(0)] = test < src[get_global_id(0)];
+}
diff --git a/kernels/compiler_vector_inc.cl b/kernels/compiler_vector_inc.cl
new file mode 100644
index 0000000..548dcb4
--- /dev/null
+++ b/kernels/compiler_vector_inc.cl
@@ -0,0 +1,13 @@
+kernel void compiler_vector_inc(global char *dst, global char *src) {
+ size_t i = get_global_id(0);
+ char2 dst2 = vload2(i, dst);
+ if (src[i] == 0)
+ dst2++;
+ else if(src[i] == 1)
+ ++dst2;
+ else if(src[i] == 2)
+ dst2--;
+ else
+ --dst2;
+ vstore2(dst2, i, dst);
+}
diff --git a/kernels/compiler_vector_load_store.cl b/kernels/compiler_vector_load_store.cl
new file mode 100644
index 0000000..aec38b1
--- /dev/null
+++ b/kernels/compiler_vector_load_store.cl
@@ -0,0 +1,40 @@
+/* test OpenCL 1.1 Vector Data Load/Store Functions (section 6.11.7) */
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#define OFFSET2(type) (type ##2) {(type)1, (type)2}
+#define OFFSET3(type) (type ##3) {(type)1, (type)2, (type)3}
+#define OFFSET4(type) (type ##4) {(type)1, (type)2, (type)3, (type)4}
+#define OFFSET8(type) (type ##8) {(type)1, (type)2, (type)3, (type)4, (type)5, (type)6, (type)7, (type)8}
+#define OFFSET16(type) (type ##16) {(type)1, (type)2, (type)3, (type)4, (type)5, (type)6, (type)7, (type)8, (type)9, (type)10, (type)11, (type)12, (type)13, (type)14, (type)15, (type)16}
+
+#define TEST_TYPE(type, n) \
+__kernel void test_##type ##n(__global type *pin, \
+ __global type *pout) \
+{\
+ int x = get_global_id(0); \
+ type ##n value; \
+ value = vload ##n(x, pin); \
+ value += OFFSET ##n(type); \
+ vstore ##n(value, x, pout); \
+}
+
+#define TEST_ALL_TYPE(n) \
+ TEST_TYPE(char,n) \
+ TEST_TYPE(uchar,n) \
+ TEST_TYPE(short,n) \
+ TEST_TYPE(ushort,n)\
+ TEST_TYPE(int,n) \
+ TEST_TYPE(uint,n) \
+ TEST_TYPE(float,n) \
+ TEST_TYPE(long,n) \
+ TEST_TYPE(ulong,n)
+// TEST_TYPE(double,n)
+
+#if 0
+ TEST_TYPE(half,n)
+#endif
+
+TEST_ALL_TYPE(2)
+TEST_ALL_TYPE(3)
+TEST_ALL_TYPE(4)
+TEST_ALL_TYPE(8)
+TEST_ALL_TYPE(16)
diff --git a/kernels/compiler_volatile.cl b/kernels/compiler_volatile.cl
new file mode 100644
index 0000000..84f7228
--- /dev/null
+++ b/kernels/compiler_volatile.cl
@@ -0,0 +1,4 @@
+__kernel void compiler_volatile(__global int *dst, __local volatile int *hop) {
+ hop[get_global_id(0)] = get_local_id(1);
+ dst[get_global_id(0)] = hop[get_local_id(0)];
+}
diff --git a/kernels/compiler_vote_all.cl b/kernels/compiler_vote_all.cl
new file mode 100644
index 0000000..1918c1c
--- /dev/null
+++ b/kernels/compiler_vote_all.cl
@@ -0,0 +1,10 @@
+__kernel void
+compiler_vote_all(__global uint *src, __global uint *dst)
+{
+ int id = (int)get_global_id(0);
+ if (__gen_ocl_all(id > 8))
+ dst[id] = src[id];
+ else
+ dst[id] = 0;
+}
+
diff --git a/kernels/compiler_vote_any.cl b/kernels/compiler_vote_any.cl
new file mode 100644
index 0000000..0a81e89
--- /dev/null
+++ b/kernels/compiler_vote_any.cl
@@ -0,0 +1,10 @@
+__kernel void
+compiler_vote_any(__global uint *src, __global uint *dst)
+{
+ int id = (int)get_global_id(0);
+ if (__gen_ocl_any(id > 6))
+ dst[id] = src[id];
+ else
+ dst[id] = 0;
+}
+
diff --git a/kernels/compiler_workitem_builtin.cl b/kernels/compiler_workitem_builtin.cl
new file mode 100644
index 0000000..b01dd7d
--- /dev/null
+++ b/kernels/compiler_workitem_builtin.cl
@@ -0,0 +1,12 @@
+/* test case for OpenCL 1.1 work-item built-in functions */
+__kernel void compiler_workitem_builtin()
+{
+ uint x = get_work_dim();
+ size_t y = get_global_size(0);
+ y = get_global_id(0);
+ y = get_local_size(0);
+ y = get_local_id(0);
+ y = get_num_groups(0);
+ y = get_group_id(0);
+ y = get_global_offset(0);
+}
diff --git a/kernels/compiler_write_only_bytes.cl b/kernels/compiler_write_only_bytes.cl
new file mode 100644
index 0000000..555a9dc
--- /dev/null
+++ b/kernels/compiler_write_only_bytes.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_write_only_bytes(__global char *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = 2;
+}
diff --git a/kernels/compiler_write_only_shorts.cl b/kernels/compiler_write_only_shorts.cl
new file mode 100644
index 0000000..205634d
--- /dev/null
+++ b/kernels/compiler_write_only_shorts.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_write_only_shorts(__global short *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = 2;
+}
diff --git a/kernels/double_precision_check.cl b/kernels/double_precision_check.cl
new file mode 100644
index 0000000..e55cafa
--- /dev/null
+++ b/kernels/double_precision_check.cl
@@ -0,0 +1,11 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+__kernel void
+double_precision_check(__global float* src, __global float* dst)
+{
+ int id = (int)get_global_id(0);
+ double d0 = 0.12345678912345678 + src[1];
+ double d1 = 0.12355678922345678 + src[0];
+ float rem = d1 - d0;
+ dst[id] = rem;
+}
diff --git a/kernels/empty.cl b/kernels/empty.cl
new file mode 100644
index 0000000..fd6f298
--- /dev/null
+++ b/kernels/empty.cl
@@ -0,0 +1 @@
+__kernel void empty() {}
diff --git a/kernels/image_1D_buffer.cl b/kernels/image_1D_buffer.cl
new file mode 100644
index 0000000..e8e0a86
--- /dev/null
+++ b/kernels/image_1D_buffer.cl
@@ -0,0 +1,13 @@
+__kernel void image_1D_buffer(image1d_buffer_t image1, image1d_t image2, sampler_t sampler, __global int *results)
+{
+ int x = get_global_id(0);
+ int offset = x;
+
+ int4 col = read_imagei(image1, x);
+ int4 test = (col != read_imagei(image2, sampler, x));
+
+ if (test.x || test.y || test.z || test.w)
+ results[offset] = 0;
+ else
+ results[offset] = 1;
+}
diff --git a/kernels/include/runtime_compile_link_inc.h b/kernels/include/runtime_compile_link_inc.h
new file mode 100644
index 0000000..9b66850
--- /dev/null
+++ b/kernels/include/runtime_compile_link_inc.h
@@ -0,0 +1,4 @@
+int greater(long x, long y)
+{
+ return x > y ;
+}
diff --git a/kernels/my_test.cl b/kernels/my_test.cl
new file mode 100644
index 0000000..91f1821
--- /dev/null
+++ b/kernels/my_test.cl
@@ -0,0 +1,26 @@
+__kernel void
+my_test(__global int2 *src, __global int *offsets, __global uint2 *dst, int w)
+{
+ int i, index, j;
+ uint2 out;
+ unsigned int a, b, c, d;
+ int2 rle;
+ int gid = get_global_id(0);
+ index = offsets[gid];
+ int i0 = 0;
+ rle = src[index];
+ for (i = 0; i < w; i++, i0 += 8) {
+ if (i0+0 >= rle.x) { index++; rle = src[index]; } a = rle.y;
+ if (i0+1 >= rle.x) { index++; rle = src[index]; } b = rle.y;
+ if (i0+2 >= rle.x) { index++; rle = src[index]; } c = rle.y;
+ if (i0+3 >= rle.x) { index++; rle = src[index]; } d = rle.y;
+ out.x = (d<<24)|(c<<16)|(b<<8)|(a);
+ if (i0+4 >= rle.x) { index++; rle = src[index]; } a = rle.y;
+ if (i0+5 >= rle.x) { index++; rle = src[index]; } b = rle.y;
+ if (i0+6 >= rle.x) { index++; rle = src[index]; } c = rle.y;
+ if (i0+7 >= rle.x) { index++; rle = src[index]; } d = rle.y;
+ out.y = (d<<24)|(c<<16)|(b<<8)|(a);
+
+ dst[gid*w + i] = out;
+ }
+}
diff --git a/kernels/null_kernel_arg.cl b/kernels/null_kernel_arg.cl
new file mode 100644
index 0000000..68a4280
--- /dev/null
+++ b/kernels/null_kernel_arg.cl
@@ -0,0 +1,9 @@
+__kernel void
+null_kernel_arg(__global unsigned int *dst, __global unsigned int * mask_global, __constant unsigned int* mask_const)
+{
+ if(dst && mask_global==0 && mask_const == NULL)
+ {
+ uint idx = (uint)get_global_id(0);
+ dst[idx] = idx;
+ }
+}
diff --git a/kernels/runtime_compile_link.h b/kernels/runtime_compile_link.h
new file mode 100644
index 0000000..ae2c56e
--- /dev/null
+++ b/kernels/runtime_compile_link.h
@@ -0,0 +1 @@
+int comp_long(long x, long y);
diff --git a/kernels/runtime_compile_link_a.cl b/kernels/runtime_compile_link_a.cl
new file mode 100644
index 0000000..b17861f
--- /dev/null
+++ b/kernels/runtime_compile_link_a.cl
@@ -0,0 +1,13 @@
+#include "runtime_compile_link.h"
+#include "include/runtime_compile_link_inc.h"
+
+int comp_long(long x, long y)
+{
+ return x < y ;
+}
+
+kernel void runtime_compile_link_a(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ int j = comp_long(src1[i], src2[i]);
+ dst[i] = j ? 3 : 4;
+}
diff --git a/kernels/runtime_compile_link_b.cl b/kernels/runtime_compile_link_b.cl
new file mode 100644
index 0000000..89b5a2d
--- /dev/null
+++ b/kernels/runtime_compile_link_b.cl
@@ -0,0 +1,9 @@
+#include "runtime_compile_link.h"
+#include "include/runtime_compile_link_inc.h"
+
+kernel void runtime_compile_link_b(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ int j = comp_long(src1[i], src2[i]);
+ dst[i] = j ? 3 : 4;
+ int k = greater(src1[i], src2[i]);
+}
diff --git a/kernels/test_cl_finish.cl b/kernels/test_cl_finish.cl
new file mode 100644
index 0000000..723949c
--- /dev/null
+++ b/kernels/test_cl_finish.cl
@@ -0,0 +1,12 @@
+
+
+__kernel void
+test_cl_finish(__global int *src, __global int *dst, int n, int num_threads)
+{
+ int tid, pos;
+
+ tid = get_global_id(0);
+ for (pos=tid; pos < n; pos+=num_threads) {
+ dst[pos] = src[pos];
+ }
+}
diff --git a/kernels/test_copy_buffer.cl b/kernels/test_copy_buffer.cl
new file mode 100644
index 0000000..6f2fd22
--- /dev/null
+++ b/kernels/test_copy_buffer.cl
@@ -0,0 +1,6 @@
+__kernel void
+test_copy_buffer(__global float* src, __global float* dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = src[id];
+}
diff --git a/kernels/test_copy_buffer_row.cl b/kernels/test_copy_buffer_row.cl
new file mode 100644
index 0000000..e33380f
--- /dev/null
+++ b/kernels/test_copy_buffer_row.cl
@@ -0,0 +1,8 @@
+__kernel void
+test_copy_buffer_row(__global int *src, __global int *dst, __global int *data)
+{
+ int row = data[0];
+ int size = data[1];
+ int id = (int) get_global_id(0);
+ for (; id < size; id += row) dst[id] = src[id];
+}
diff --git a/kernels/test_copy_image.cl b/kernels/test_copy_image.cl
new file mode 100644
index 0000000..a5ee5e8
--- /dev/null
+++ b/kernels/test_copy_image.cl
@@ -0,0 +1,10 @@
+__kernel void
+test_copy_image(__read_only image2d_t src, __write_only image2d_t dst, sampler_t sampler)
+{
+ int2 coord;
+ int4 color;
+ coord.x = (int)get_global_id(0);
+ coord.y = (int)get_global_id(1);
+ color = read_imagei(src, sampler, coord);
+ write_imagei(dst, coord, color);
+}
diff --git a/kernels/test_copy_image1.cl b/kernels/test_copy_image1.cl
new file mode 100644
index 0000000..28e7a7d
--- /dev/null
+++ b/kernels/test_copy_image1.cl
@@ -0,0 +1,33 @@
+#define S(A,B,C) CLK_NORMALIZED_COORDS_##A | CLK_ADDRESS_##B | CLK_FILTER_##C
+
+#define COPY_IMAGE(_dst, _sampler, scoord, dcoord) \
+ color = read_imagei(src, _sampler, scoord);\
+ write_imagei(_dst, dcoord, color)
+
+__kernel void
+test_copy_image1(__read_only image2d_t src,
+ __write_only image2d_t dst0,
+ sampler_t sampler0,
+ __write_only image2d_t dst1,
+ __write_only image2d_t dst2,
+ __write_only image2d_t dst3,
+ __write_only image2d_t dst4,
+ float w_inv, float h_inv)
+{
+ const sampler_t sampler1 = S(FALSE, REPEAT, NEAREST);
+ const sampler_t sampler2 = S(FALSE, CLAMP, NEAREST);
+ const sampler_t sampler3 = S(FALSE, MIRRORED_REPEAT, NEAREST);
+ const sampler_t sampler4 = S(TRUE, REPEAT, NEAREST);
+ int2 coord;
+ float2 fcoord;
+ int4 color;
+ coord.x = (int)get_global_id(0);
+ coord.y = (int)get_global_id(1);
+ fcoord.x = coord.x * w_inv;
+ fcoord.y = coord.y * h_inv;
+ COPY_IMAGE(dst0, sampler0, coord, coord);
+ COPY_IMAGE(dst1, sampler1, coord, coord);
+ COPY_IMAGE(dst2, sampler2, coord, coord);
+ COPY_IMAGE(dst3, sampler3, coord, coord);
+ COPY_IMAGE(dst4, sampler4, fcoord, coord);
+}
diff --git a/kernels/test_copy_image_1d.cl b/kernels/test_copy_image_1d.cl
new file mode 100644
index 0000000..88428bb
--- /dev/null
+++ b/kernels/test_copy_image_1d.cl
@@ -0,0 +1,9 @@
+__kernel void
+test_copy_image_1d(__read_only image1d_t src, __write_only image1d_t dst, sampler_t sampler)
+{
+ int coord;
+ int4 color;
+ coord = (int)get_global_id(0);
+ color = read_imagei(src, sampler, coord);
+ write_imagei(dst, coord, color);
+}
diff --git a/kernels/test_copy_image_3d.cl b/kernels/test_copy_image_3d.cl
new file mode 100644
index 0000000..103fb69
--- /dev/null
+++ b/kernels/test_copy_image_3d.cl
@@ -0,0 +1,28 @@
+__kernel void
+test_copy_image_3d(__read_only image3d_t src,
+ __write_only image3d_t dst,
+ sampler_t sampler,
+ __write_only image2d_t buf0,
+ __write_only image2d_t buf1,
+ __write_only image2d_t buf2,
+ __write_only image2d_t buf3)
+{
+ int4 coord;
+ int2 coord2;
+ float4 color;
+ coord.x = (int)get_global_id(0);
+ coord.y = (int)get_global_id(1);
+ coord.z = (int)get_global_id(2);
+ coord2.x = coord.x;
+ coord2.y = coord.y;
+ color = read_imagef(src, sampler, coord);
+ write_imagef(dst, coord, color);
+ if (coord.z == 0)
+ write_imagef(buf0, coord2, color);
+ else if (coord.z == 1)
+ write_imagef(buf1, coord2, color);
+ else if (coord.z == 2)
+ write_imagef(buf2, coord2, color);
+ else if (coord.z == 3)
+ write_imagef(buf3, coord2, color);
+}
diff --git a/kernels/test_fill_gl_image.cl b/kernels/test_fill_gl_image.cl
new file mode 100644
index 0000000..4250a57
--- /dev/null
+++ b/kernels/test_fill_gl_image.cl
@@ -0,0 +1,11 @@
+__kernel void
+test_fill_gl_image(image2d_t img, int color)
+{
+ int2 coord;
+ float4 color_v4;
+ coord.x = get_global_id(0);
+ coord.y = get_global_id(1);
+ color_v4 = (float4){((color >> 24) & 0xFF), (color >> 16) & 0xFF, (color >> 8) & 0xFF, color & 0xFF};
+ color_v4 = color_v4 / 255.0f;
+ write_imagef(img, coord, color_v4);
+}
diff --git a/kernels/test_fill_image.cl b/kernels/test_fill_image.cl
new file mode 100644
index 0000000..3760568
--- /dev/null
+++ b/kernels/test_fill_image.cl
@@ -0,0 +1,13 @@
+__kernel void
+test_fill_image(__write_only image2d_t dst, uint color)
+{
+ int2 coord;
+ int4 color4;
+ color4.s0 = (color >> 24) & 0xFF;
+ color4.s1 = (color >> 16) & 0xFF;
+ color4.s2 = (color >> 8) & 0xFF;
+ color4.s3 = color & 0xFF;
+ coord.x = (int)get_global_id(0);
+ coord.y = (int)get_global_id(1);
+ write_imagei(dst, coord, color4);
+}
diff --git a/kernels/test_fill_image0.cl b/kernels/test_fill_image0.cl
new file mode 100644
index 0000000..9428092
--- /dev/null
+++ b/kernels/test_fill_image0.cl
@@ -0,0 +1,9 @@
+__kernel void
+test_fill_image0(__write_only image2d_t dst)
+{
+ int2 coord;
+ coord.x = (int)get_global_id(0);
+ coord.y = (int)get_global_id(1);
+ int4 color4 = {coord.y & 0xFF, (coord.y & 0xFF00) >> 8, coord.x & 0xFF, (coord.x & 0xFF00) >> 8};
+ write_imagei(dst, coord, color4);
+}
diff --git a/kernels/test_fill_image_1d.cl b/kernels/test_fill_image_1d.cl
new file mode 100644
index 0000000..db922af
--- /dev/null
+++ b/kernels/test_fill_image_1d.cl
@@ -0,0 +1,8 @@
+__kernel void
+test_fill_image_1d(__write_only image1d_t dst)
+{
+ int coord;
+ coord = (int)get_global_id(0);
+ uint4 color4 = {0, 1, 2 ,3};
+ write_imageui(dst, coord, color4);
+}
diff --git a/kernels/test_fill_image_3d.cl b/kernels/test_fill_image_3d.cl
new file mode 100644
index 0000000..4988f69
--- /dev/null
+++ b/kernels/test_fill_image_3d.cl
@@ -0,0 +1,14 @@
+__kernel void
+test_fill_image_3d(__write_only image3d_t dst, uint color)
+{
+ int4 coord;
+ int4 color4;
+ color4.s0 = (color >> 24) & 0xFF;
+ color4.s1 = (color >> 16) & 0xFF;
+ color4.s2 = (color >> 8) & 0xFF;
+ color4.s3 = color & 0xFF;
+ coord.x = (int)get_global_id(0);
+ coord.y = (int)get_global_id(1);
+ coord.z = (int)get_global_id(2);
+ write_imagei(dst, coord, color4);
+}
diff --git a/kernels/test_fill_image_3d_2.cl b/kernels/test_fill_image_3d_2.cl
new file mode 100644
index 0000000..1f9eaa1
--- /dev/null
+++ b/kernels/test_fill_image_3d_2.cl
@@ -0,0 +1,10 @@
+__kernel void
+test_fill_image_3d_2(__write_only image3d_t dst)
+{
+ int4 coord;
+ int4 color4 = {0x12, 0x34, 0x56, 0x78};
+ coord.x = (int)get_global_id(0);
+ coord.y = (int)get_global_id(1);
+ coord.z = (int)get_global_id(2);
+ write_imagei(dst, coord, color4);
+}
diff --git a/kernels/test_get_arg_info.cl b/kernels/test_get_arg_info.cl
new file mode 100644
index 0000000..43a804b
--- /dev/null
+++ b/kernels/test_get_arg_info.cl
@@ -0,0 +1,8 @@
+typedef struct _test_arg_struct {
+ int a;
+ int b;
+}test_arg_struct;
+
+kernel void test_get_arg_info(read_only global float const volatile *src, read_write local int read_only *dst, test_arg_struct extra) {
+
+}
diff --git a/kernels/test_get_image_info.cl b/kernels/test_get_image_info.cl
new file mode 100644
index 0000000..8f69b75
--- /dev/null
+++ b/kernels/test_get_image_info.cl
@@ -0,0 +1,13 @@
+__kernel void
+test_get_image_info(__write_only image3d_t src, __global int *size, __global int *fmt)
+{
+ int id = (int)get_global_id(0);
+ int w, h, depth;
+ w = get_image_width(src);
+ h = get_image_height(src);
+ depth = get_image_depth(src);
+ int channel_data_type = get_image_channel_data_type(src);
+ int channel_order = get_image_channel_order(src);
+ size[id] = (w << 20 | h << 8 | depth);
+ fmt[id] = (channel_data_type << 16 | channel_order);
+}
diff --git a/kernels/test_get_image_info_array.cl b/kernels/test_get_image_info_array.cl
new file mode 100644
index 0000000..333da77
--- /dev/null
+++ b/kernels/test_get_image_info_array.cl
@@ -0,0 +1,25 @@
+__kernel void
+test_get_image_info_array(__write_only image1d_array_t a1, __write_only image2d_array_t a2, __global int *result)
+{
+ int w, h, array_sz;
+
+ w = get_image_width(a1);
+ array_sz = (int)get_image_array_size(a1);
+ int channel_data_type = get_image_channel_data_type(a1);
+ int channel_order = get_image_channel_order(a1);
+ result[0] = w;
+ result[1] = array_sz;
+ result[2] = channel_data_type;
+ result[3] = channel_order;
+
+ w = get_image_width(a2);
+ h = get_image_height(a2);
+ array_sz = (int)get_image_array_size(a2);
+ channel_data_type = get_image_channel_data_type(a2);
+ channel_order = get_image_channel_order(a2);
+ result[4] = w;
+ result[5] = h;
+ result[6] = array_sz;
+ result[7] = channel_data_type;
+ result[8] = channel_order;
+}
diff --git a/kernels/test_movforphi_undef.cl b/kernels/test_movforphi_undef.cl
new file mode 100644
index 0000000..035c02a
--- /dev/null
+++ b/kernels/test_movforphi_undef.cl
@@ -0,0 +1,18 @@
+__kernel void
+test_movforphi_undef(__read_only image2d_t src, __write_only image2d_t dst, sampler_t sampler)
+{
+ int2 coord, dstCoord;
+ int4 color;
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ dstCoord.x = x;
+ dstCoord.y = y;
+ coord.y = y;
+ for(int j = -8; j < 2; j++)
+ {
+ coord.x = j + x;
+ color = read_imagei(src, sampler, coord);
+ if (j == 1 + x)
+ write_imagei(dst, dstCoord, color);
+ }
+}
diff --git a/kernels/test_printf.cl b/kernels/test_printf.cl
new file mode 100644
index 0000000..84bb478
--- /dev/null
+++ b/kernels/test_printf.cl
@@ -0,0 +1,38 @@
+__kernel void
+test_printf(void)
+{
+ int x = (int)get_global_id(0);
+ int y = (int)get_global_id(1);
+ int z = (int)get_global_id(2);
+ uint a = 'x';
+ float f = 5.0f;
+ int3 vec;
+ vec.x = x;
+ vec.y = y;
+ vec.z = z;
+
+ if (x == 0 && y == 0 && z == 0) {
+ printf("--- Welcome to the printf test of %s ---\n", "Intel Beignet");
+
+ printf("### output a char is %c\n", a);
+ }
+
+ if (x % 15 == 0)
+ if (y % 3 == 0)
+ if (z % 7 == 0)
+ printf("######## global_id(x, y, z) = %v3d, global_size(d0, d1, d3) = (%d, %d, %d)\n",
+ vec, get_global_size(0), get_global_size(1), get_global_size(2));
+
+ if (x == 1)
+ if (y == 0) {
+ if (z % 2 == 0)
+ printf("#### output a float is %f\n", f);
+ else
+ printf("#### output a float to int is %d\n", f);
+ }
+
+ if (x == 0 && y == 0 && z == 0) {
+ printf("--- End to the printf test ---\n");
+ }
+
+}
diff --git a/kernels/test_write_only.cl b/kernels/test_write_only.cl
new file mode 100644
index 0000000..27c7acb
--- /dev/null
+++ b/kernels/test_write_only.cl
@@ -0,0 +1,6 @@
+__kernel void
+test_write_only(__global int *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = id;
+}
diff --git a/setup_fulsim_hsw.sh b/setup_fulsim_hsw.sh
new file mode 100644
index 0000000..140be66
--- /dev/null
+++ b/setup_fulsim_hsw.sh
@@ -0,0 +1,5 @@
+export INTEL_DEVID_OVERRIDE=0x0094
+export DEVICE=hsw_m0
+export OCL_FULSIM_RUN=1
+export OCL_FULSIM_DEBUG_MODE=$1
+
diff --git a/setup_fulsim_ivb.sh b/setup_fulsim_ivb.sh
new file mode 100644
index 0000000..9df9082
--- /dev/null
+++ b/setup_fulsim_ivb.sh
@@ -0,0 +1,5 @@
+export INTEL_DEVID_OVERRIDE=0x0166 # or, 0x0112
+export DEVICE=ivb_m_gt2 # snb_gt2 for SNB GT2 desktop
+export OCL_SIMULATOR=1 # 0 -> HW, 1 -> fulsim, 2 -> perfsim
+export OCL_FULSIM_DEBUG_MODE=$1
+
diff --git a/setup_perfsim_ivb.sh b/setup_perfsim_ivb.sh
new file mode 100644
index 0000000..4cfdd1a
--- /dev/null
+++ b/setup_perfsim_ivb.sh
@@ -0,0 +1,4 @@
+export INTEL_DEVID_OVERRIDE=0x0166 # or, 0x0112
+export DEVICE=ivb_m_gt2 # snb_gt2 for SNB GT2 desktop
+export OCL_SIMULATOR=2 # 0 -> HW, 1 -> fulsim, 2 -> perfsim
+
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..fc1479e
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,2 @@
+OCLConfig.h
+libcl.so
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..ce16a8c
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,126 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}
+ ${DRM_INCLUDE_DIRS}
+ ${DRM_INCLUDE_DIRS}/../
+ ${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/
+ ${CMAKE_CURRENT_SOURCE_DIR}/../include
+ ${MESA_SOURCE_INCLUDES})
+
+macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES)
+foreach (KF ${KERNEL_FILES})
+ set (input_file ${KERNEL_PATH}/${KF}.cl)
+ set (output_file ${KERNEL_PATH}/${KF}_str.c)
+ list (APPEND KERNEL_STR_FILES ${output_file})
+ if(GEN_PCI_ID)
+ add_custom_command(
+ OUTPUT ${output_file}
+ COMMAND rm -rf ${output_file}
+ COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file} -t${GEN_PCI_ID}
+ DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater)
+ else(GEN_PCI_ID)
+ add_custom_command(
+ OUTPUT ${output_file}
+ COMMAND rm -rf ${output_file}
+ COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file}
+ DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater)
+ endif(GEN_PCI_ID)
+endforeach (KF)
+endmacro (MakeKernelBinStr)
+
+macro (MakeBuiltInKernelStr KERNEL_PATH KERNEL_FILES)
+ set (output_file ${KERNEL_PATH}/${BUILT_IN_NAME}.cl)
+ set (file_content)
+ file (REMOVE ${output_file})
+ foreach (KF ${KERNEL_NAMES})
+ set (input_file ${KERNEL_PATH}/${KF}.cl)
+ file(READ ${input_file} file_content )
+ STRING(REGEX REPLACE ";" "\\\\;" file_content "${file_content}")
+ file(APPEND ${output_file} ${file_content})
+ endforeach (KF)
+endmacro (MakeBuiltInKernelStr)
+
+set (KERNEL_STR_FILES)
+set (KERNEL_NAMES cl_internal_copy_buf_align4
+cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset
+cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset
+cl_internal_copy_buf_rect cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d
+cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d cl_internal_copy_image_3d_to_3d
+cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer
+cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d
+cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
+cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
+cl_internal_fill_buf_align128 cl_internal_fill_image_1d
+cl_internal_fill_image_1d_array cl_internal_fill_image_2d
+cl_internal_fill_image_2d_array cl_internal_fill_image_3d)
+set (BUILT_IN_NAME cl_internal_built_in_kernel)
+MakeBuiltInKernelStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
+MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
+MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${BUILT_IN_NAME}")
+
+set(OPENCL_SRC
+ ${KERNEL_STR_FILES}
+ cl_api.c
+ cl_alloc.c
+ cl_kernel.c
+ cl_program.c
+ cl_gbe_loader.cpp
+ cl_sampler.c
+ cl_event.c
+ cl_enqueue.c
+ cl_image.c
+ cl_mem.c
+ cl_platform_id.c
+ cl_extensions.c
+ cl_device_id.c
+ cl_context.c
+ cl_command_queue.c
+ cl_command_queue.h
+ cl_command_queue_gen7.c
+ cl_thread.c
+ cl_driver.h
+ cl_driver.cpp
+ cl_driver_defs.c
+ intel/intel_gpgpu.c
+ intel/intel_batchbuffer.c
+ intel/intel_driver.c
+ performance.c)
+
+if (X11_FOUND)
+ set(CMAKE_CXX_FLAGS "-DHAS_X11 ${CMAKE_CXX_FLAGS}")
+ set(CMAKE_C_FLAGS "-DHAS_X11 ${CMAKE_C_FLAGS}")
+ set(OPENCL_SRC
+ ${OPENCL_SRC}
+ x11/dricommon.c
+ x11/va_dri2.c)
+endif (X11_FOUND)
+
+if (EGL_FOUND AND MESA_SOURCE_FOUND)
+set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
+SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS}")
+SET(OPTIONAL_EGL_LIBRARY "${EGL_LIBRARY}")
+else(EGL_FOUND AND MESA_SOURCE_FOUND)
+SET(OPTIONAL_EGL_LIBRARY "")
+endif (EGL_FOUND AND MESA_SOURCE_FOUND)
+
+if (OCLIcd_FOUND)
+set (OPENCL_SRC ${OPENCL_SRC} cl_khr_icd.c)
+SET(CMAKE_CXX_FLAGS "-DHAS_OCLIcd ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-DHAS_OCLIcd ${CMAKE_C_FLAGS}")
+endif (OCLIcd_FOUND)
+
+SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic,--allow-shlib-undefined")
+
+link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
+add_library(cl SHARED ${OPENCL_SRC})
+target_link_libraries(
+ cl
+ ${X11_LIBRARIES}
+ ${XEXT_LIBRARIES}
+ ${XFIXES_LIBRARIES}
+ ${DRM_INTEL_LIBRARIES}
+ ${DRM_LIBRARIES}
+ ${CMAKE_THREAD_LIBS_INIT}
+ ${CMAKE_DL_LIBS}
+ ${OPENGL_LIBRARIES}
+ ${OPTIONAL_EGL_LIBRARY})
+install (TARGETS cl LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
diff --git a/src/OCLConfig.h.in b/src/OCLConfig.h.in
new file mode 100644
index 0000000..71de4b3
--- /dev/null
+++ b/src/OCLConfig.h.in
@@ -0,0 +1,6 @@
+// the configured options and settings for LIBCL
+#define LIBCL_DRIVER_VERSION_MAJOR @LIBCL_DRIVER_VERSION_MAJOR@
+#define LIBCL_DRIVER_VERSION_MINOR @LIBCL_DRIVER_VERSION_MINOR@
+#define LIBCL_DRIVER_VERSION_PATCH @LIBCL_DRIVER_VERSION_PATCH@
+#define LIBCL_C_VERSION_MAJOR @LIBCL_C_VERSION_MAJOR@
+#define LIBCL_C_VERSION_MINOR @LIBCL_C_VERSION_MINOR@
diff --git a/src/cl_alloc.c b/src/cl_alloc.c
new file mode 100644
index 0000000..93d2e6a
--- /dev/null
+++ b/src/cl_alloc.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_alloc.h"
+#include "cl_utils.h"
+
+#include <stdlib.h>
+#include <assert.h>
+#include <malloc.h>
+
+static volatile int32_t cl_alloc_n = 0;
+
+LOCAL void*
+cl_malloc(size_t sz)
+{
+ void * p = NULL;
+ atomic_inc(&cl_alloc_n);
+ p = malloc(sz);
+ assert(p);
+ return p;
+}
+
+LOCAL void*
+cl_aligned_malloc(size_t sz, size_t align)
+{
+ void * p = NULL;
+ atomic_inc(&cl_alloc_n);
+ p = memalign(align, sz);
+ assert(p);
+ return p;
+}
+
+LOCAL void*
+cl_calloc(size_t n, size_t elem_size)
+{
+ void *p = NULL;
+ atomic_inc(&cl_alloc_n);
+ p = calloc(n, elem_size);
+ assert(p);
+ return p;
+}
+
+LOCAL void*
+cl_realloc(void *ptr, size_t sz)
+{
+ if (ptr == NULL)
+ atomic_inc(&cl_alloc_n);
+ return realloc(ptr, sz);
+}
+
+LOCAL void
+cl_free(void *ptr)
+{
+ if (ptr == NULL)
+ return;
+ atomic_dec(&cl_alloc_n);
+ free(ptr);
+ ptr = NULL;
+}
+
+LOCAL size_t
+cl_report_unfreed(void)
+{
+ return cl_alloc_n;
+}
+
+LOCAL void
+cl_report_set_all_freed(void)
+{
+ cl_alloc_n = 0;
+}
+
diff --git a/src/cl_alloc.h b/src/cl_alloc.h
new file mode 100644
index 0000000..9b463ed
--- /dev/null
+++ b/src/cl_alloc.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_ALLOC_H__
+#define __CL_ALLOC_H__
+
+#include "cl_internals.h"
+#include <stdlib.h>
+
+/* Return a valid pointer for the requested memory block size */
+extern void *cl_malloc(size_t sz);
+
+/* Aligned malloc */
+extern void* cl_aligned_malloc(size_t sz, size_t align);
+
+/* malloc + memzero */
+extern void *cl_calloc(size_t n, size_t elem_size);
+
+/* Regular realloc */
+extern void *cl_realloc(void *ptr, size_t sz);
+
+/* Free a pointer allocated with cl_*alloc */
+extern void cl_free(void *ptr);
+
+/* We count the number of allocation. This function report the number of
+ * allocation still unfreed
+ */
+extern size_t cl_report_unfreed(void);
+
+#endif /* __CL_ALLOC_H__ */
+
diff --git a/src/cl_api.c b/src/cl_api.c
new file mode 100644
index 0000000..630511f
--- /dev/null
+++ b/src/cl_api.c
@@ -0,0 +1,3341 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_platform_id.h"
+#include "cl_device_id.h"
+#include "cl_context.h"
+#include "cl_command_queue.h"
+#include "cl_enqueue.h"
+#include "cl_event.h"
+#include "cl_program.h"
+#include "cl_kernel.h"
+#include "cl_mem.h"
+#include "cl_image.h"
+#include "cl_sampler.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include "CL/cl_intel.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#include "performance.h"
+
+#ifndef CL_VERSION_1_2
+#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
+#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
+#define CL_MEM_HOST_WRITE_ONLY (1 << 7)
+#define CL_MEM_HOST_READ_ONLY (1 << 8)
+#define CL_MEM_HOST_NO_ACCESS (1 << 9)
+typedef intptr_t cl_device_partition_property;
+#endif
+
+#define FILL_GETINFO_RET(TYPE, ELT, VAL, RET) \
+ do { \
+ if (param_value && param_value_size < sizeof(TYPE)*ELT) \
+ return CL_INVALID_VALUE; \
+ if (param_value) { \
+ memcpy(param_value, (VAL), sizeof(TYPE)*ELT); \
+ } \
+ \
+ if (param_value_size_ret) \
+ *param_value_size_ret = sizeof(TYPE)*ELT; \
+ return RET; \
+ } while(0)
+
+inline cl_int
+handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
+ cl_event* event, enqueue_data* data, cl_command_type type)
+{
+ cl_int status = cl_event_wait_events(num, wait_list, queue);
+ cl_event e = NULL;
+ if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
+ e = cl_event_new(queue->ctx, queue, type, event!=NULL);
+
+ /* if need profiling, add the submit timestamp here. */
+ if (e->type != CL_COMMAND_USER &&
+ e->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(e, CL_PROFILING_COMMAND_QUEUED);
+ }
+
+ if(event != NULL)
+ *event = e;
+ if(status == CL_ENQUEUE_EXECUTE_DEFER) {
+ cl_event_new_enqueue_callback(e, data, num, wait_list);
+ }
+ }
+ queue->current_event = e;
+ return status;
+}
+
+/* The following code checking overlap is from Appendix of openCL spec 1.1 */
+inline cl_bool check_copy_overlap(const size_t src_offset[3],
+ const size_t dst_offset[3],
+ const size_t region[3],
+ size_t row_pitch, size_t slice_pitch)
+{
+ const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]};
+ const size_t src_max[] = {src_offset[0] + region[0],
+ src_offset[1] + region[1],
+ src_offset[2] + region[2]};
+ const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]};
+ const size_t dst_max[] = {dst_offset[0] + region[0],
+ dst_offset[1] + region[1],
+ dst_offset[2] + region[2]};
+ // Check for overlap
+ cl_bool overlap = CL_TRUE;
+ unsigned i;
+ size_t dst_start = dst_offset[2] * slice_pitch +
+ dst_offset[1] * row_pitch + dst_offset[0];
+ size_t dst_end = dst_start + (region[2] * slice_pitch +
+ region[1] * row_pitch + region[0]);
+ size_t src_start = src_offset[2] * slice_pitch +
+ src_offset[1] * row_pitch + src_offset[0];
+ size_t src_end = src_start + (region[2] * slice_pitch +
+ region[1] * row_pitch + region[0]);
+
+ for (i=0; i != 3; ++i) {
+ overlap = overlap && (src_min[i] < dst_max[i])
+ && (src_max[i] > dst_min[i]);
+ }
+
+ if (!overlap) {
+ size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ?
+ src_offset[0] + region[0] - row_pitch : 0;
+ size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ?
+ dst_offset[0] + region[0] - row_pitch : 0;
+ if ( (delta_src_x > 0 && delta_src_x > dst_offset[0]) ||
+ (delta_dst_x > 0 && delta_dst_x > src_offset[0]) ) {
+ if ( (src_start <= dst_start && dst_start < src_end) ||
+ (dst_start <= src_start && src_start < dst_end) )
+ overlap = CL_TRUE;
+ }
+ if (region[2] > 1) {
+ size_t src_height = slice_pitch / row_pitch;
+ size_t dst_height = slice_pitch / row_pitch;
+ size_t delta_src_y = (src_offset[1] + region[1] > src_height) ?
+ src_offset[1] + region[1] - src_height : 0;
+ size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ?
+ dst_offset[1] + region[1] - dst_height : 0;
+ if ( (delta_src_y > 0 && delta_src_y > dst_offset[1]) ||
+ (delta_dst_y > 0 && delta_dst_y > src_offset[1]) ) {
+ if ( (src_start <= dst_start && dst_start < src_end) ||
+ (dst_start <= src_start && src_start < dst_end) )
+ overlap = CL_TRUE;
+ }
+ }
+ }
+ return overlap;
+}
+
+static cl_int
+cl_check_device_type(cl_device_type device_type)
+{
+ const cl_device_type valid = CL_DEVICE_TYPE_GPU
+ | CL_DEVICE_TYPE_CPU
+ | CL_DEVICE_TYPE_ACCELERATOR
+ | CL_DEVICE_TYPE_DEFAULT
+ | CL_DEVICE_TYPE_CUSTOM;
+
+ if( (device_type & valid) == 0) {
+ return CL_INVALID_DEVICE_TYPE;
+ }
+ if(UNLIKELY(!(device_type & CL_DEVICE_TYPE_DEFAULT) && !(device_type & CL_DEVICE_TYPE_GPU)))
+ return CL_DEVICE_NOT_FOUND;
+
+ return CL_SUCCESS;
+}
+
+static cl_int
+cl_device_id_is_ok(const cl_device_id device)
+{
+ if(UNLIKELY(device == NULL)) return CL_FALSE;
+ return device != cl_get_gt_device() ? CL_FALSE : CL_TRUE;
+}
+
+cl_int
+clGetPlatformIDs(cl_uint num_entries,
+ cl_platform_id * platforms,
+ cl_uint * num_platforms)
+{
+ if(UNLIKELY(platforms == NULL && num_platforms == NULL))
+ return CL_INVALID_VALUE;
+ if(UNLIKELY(num_entries == 0 && platforms != NULL))
+ return CL_INVALID_VALUE;
+
+ return cl_get_platform_ids(num_entries, platforms, num_platforms);
+}
+
+cl_int
+clGetPlatformInfo(cl_platform_id platform,
+ cl_platform_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ /* Only one platform. This is easy */
+ if (UNLIKELY(platform != NULL && platform != intel_platform))
+ return CL_INVALID_PLATFORM;
+
+ return cl_get_platform_info(platform,
+ param_name,
+ param_value_size,
+ param_value,
+ param_value_size_ret);
+}
+
+cl_int
+clGetDeviceIDs(cl_platform_id platform,
+ cl_device_type device_type,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices)
+{
+ cl_int err = CL_SUCCESS;
+
+ /* Check parameter consistency */
+ if (UNLIKELY(devices == NULL && num_devices == NULL))
+ return CL_INVALID_VALUE;
+ if (UNLIKELY(platform && platform != intel_platform))
+ return CL_INVALID_PLATFORM;
+ if (UNLIKELY(devices && num_entries == 0))
+ return CL_INVALID_VALUE;
+
+ err = cl_check_device_type(device_type);
+ if(err != CL_SUCCESS)
+ return err;
+
+ return cl_get_device_ids(platform,
+ device_type,
+ num_entries,
+ devices,
+ num_devices);
+}
+
+cl_int
+clGetDeviceInfo(cl_device_id device,
+ cl_device_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ return cl_get_device_info(device,
+ param_name,
+ param_value_size,
+ param_value,
+ param_value_size_ret);
+}
+
+cl_int
+clCreateSubDevices(cl_device_id in_device,
+ const cl_device_partition_property * properties,
+ cl_uint num_devices,
+ cl_device_id * out_devices,
+ cl_uint * num_devices_ret)
+{
+ /* Check parameter consistency */
+ if (UNLIKELY(out_devices == NULL && num_devices_ret == NULL))
+ return CL_INVALID_VALUE;
+ if (UNLIKELY(in_device == NULL && properties == NULL))
+ return CL_INVALID_VALUE;
+
+ *num_devices_ret = 0;
+ return CL_INVALID_DEVICE_PARTITION_COUNT;
+}
+
+cl_int
+clRetainDevice(cl_device_id device)
+{
+ // XXX stub for C++ Bindings
+ return CL_SUCCESS;
+}
+
+cl_int
+clReleaseDevice(cl_device_id device)
+{
+ // XXX stub for C++ Bindings
+ return CL_SUCCESS;
+}
+
+cl_context
+clCreateContext(const cl_context_properties * properties,
+ cl_uint num_devices,
+ const cl_device_id * devices,
+ void (* pfn_notify) (const char*, const void*, size_t, void*),
+ void * user_data,
+ cl_int * errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ cl_context context = NULL;
+
+ /* Assert parameters correctness */
+ INVALID_VALUE_IF (devices == NULL);
+ INVALID_VALUE_IF (num_devices == 0);
+ INVALID_VALUE_IF (pfn_notify == NULL && user_data != NULL);
+
+ /* Now check if the user is asking for the right device */
+ INVALID_DEVICE_IF (cl_device_id_is_ok(*devices) == CL_FALSE);
+
+ context = cl_create_context(properties,
+ num_devices,
+ devices,
+ pfn_notify,
+ user_data,
+ &err);
+ initialize_env_var();
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return context;
+}
+
+cl_context
+clCreateContextFromType(const cl_context_properties * properties,
+ cl_device_type device_type,
+ void (CL_CALLBACK *pfn_notify) (const char *, const void *, size_t, void *),
+ void * user_data,
+ cl_int * errcode_ret)
+{
+ cl_context context = NULL;
+ cl_int err = CL_SUCCESS;
+ cl_device_id devices[1];
+ cl_uint num_devices = 1;
+
+ INVALID_VALUE_IF (pfn_notify == NULL && user_data != NULL);
+
+ err = cl_check_device_type(device_type);
+ if(err != CL_SUCCESS) {
+ goto error;
+ }
+
+ err = cl_get_device_ids(NULL,
+ device_type,
+ 1,
+ &devices[0],
+ &num_devices);
+ if (err != CL_SUCCESS) {
+ goto error;
+ }
+
+ context = cl_create_context(properties,
+ num_devices,
+ devices,
+ pfn_notify,
+ user_data,
+ &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return context;
+}
+
+cl_int
+clRetainContext(cl_context context)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+ cl_context_add_ref(context);
+error:
+ return err;
+}
+
+cl_int
+clReleaseContext(cl_context context)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+ cl_context_delete(context);
+error:
+ return err;
+}
+
+cl_int
+clGetContextInfo(cl_context context,
+ cl_context_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+
+ if (param_name == CL_CONTEXT_DEVICES) {
+ FILL_GETINFO_RET (cl_device_id, 1, &context->device, CL_SUCCESS);
+ } else if (param_name == CL_CONTEXT_NUM_DEVICES) {
+ cl_uint n = 1;
+ FILL_GETINFO_RET (cl_uint, 1, &n, CL_SUCCESS);
+ } else if (param_name == CL_CONTEXT_REFERENCE_COUNT) {
+ cl_uint ref = context->ref_n;
+ FILL_GETINFO_RET (cl_uint, 1, &ref, CL_SUCCESS);
+ } else if (param_name == CL_CONTEXT_PROPERTIES) {
+ if(context->prop_len > 0) {
+ FILL_GETINFO_RET (cl_context_properties, context->prop_len, context->prop_user, CL_SUCCESS);
+ } else {
+ cl_context_properties n = 0;
+ FILL_GETINFO_RET (cl_context_properties, 1, &n, CL_SUCCESS);
+ }
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+error:
+ return err;
+}
+
+cl_command_queue
+clCreateCommandQueue(cl_context context,
+ cl_device_id device,
+ cl_command_queue_properties properties,
+ cl_int * errcode_ret)
+{
+ cl_command_queue queue = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+
+ INVALID_DEVICE_IF (device != context->device);
+ INVALID_VALUE_IF (properties & ~(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE));
+
+ if(properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {/*not supported now.*/
+ err = CL_INVALID_QUEUE_PROPERTIES;
+ goto error;
+ }
+
+ queue = cl_context_create_queue(context, device, properties, &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return queue;
+}
+
+cl_int
+clRetainCommandQueue(cl_command_queue command_queue)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_QUEUE (command_queue);
+ cl_command_queue_add_ref(command_queue);
+error:
+ return err;
+}
+
+cl_int
+clReleaseCommandQueue(cl_command_queue command_queue)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_QUEUE (command_queue);
+ cl_command_queue_delete(command_queue);
+error:
+ return err;
+}
+
+cl_int
+clGetCommandQueueInfo(cl_command_queue command_queue,
+ cl_command_queue_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_QUEUE (command_queue);
+
+ if (param_name == CL_QUEUE_CONTEXT) {
+ FILL_GETINFO_RET (cl_context, 1, &command_queue->ctx, CL_SUCCESS);
+ } else if (param_name == CL_QUEUE_DEVICE) {
+ FILL_GETINFO_RET (cl_device_id, 1, &command_queue->ctx->device, CL_SUCCESS);
+ } else if (param_name == CL_QUEUE_REFERENCE_COUNT) {
+ cl_uint ref = command_queue->ref_n;
+ FILL_GETINFO_RET (cl_uint, 1, &ref, CL_SUCCESS);
+ } else if (param_name == CL_QUEUE_PROPERTIES) {
+ FILL_GETINFO_RET (cl_command_queue_properties, 1, &command_queue->props, CL_SUCCESS);
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+error:
+ return err;
+}
+
+cl_mem
+clCreateBuffer(cl_context context,
+ cl_mem_flags flags,
+ size_t size,
+ void * host_ptr,
+ cl_int * errcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+
+ mem = cl_mem_new_buffer(context, flags, size, host_ptr, &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+}
+
+cl_mem
+clCreateSubBuffer(cl_mem buffer,
+ cl_mem_flags flags,
+ cl_buffer_create_type buffer_create_type,
+ const void * buffer_create_info,
+ cl_int * errcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+
+ CHECK_MEM(buffer);
+
+ mem = cl_mem_new_sub_buffer(buffer, flags, buffer_create_type,
+ buffer_create_info, &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+}
+
+cl_mem
+clCreateImage(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format *image_format,
+ const cl_image_desc *image_desc,
+ void *host_ptr,
+ cl_int * errcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+ if (image_format == NULL) {
+ err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ goto error;
+ }
+ if (image_format->image_channel_order < CL_R ||
+ image_format->image_channel_order > CL_RGBx) {
+ err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ goto error;
+ }
+ if (image_format->image_channel_data_type < CL_SNORM_INT8 ||
+ image_format->image_channel_data_type > CL_FLOAT) {
+ err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ goto error;
+ }
+
+ if (image_desc == NULL) {
+ err = CL_INVALID_IMAGE_DESCRIPTOR;
+ goto error;
+ }
+ if (image_desc->image_type <= CL_MEM_OBJECT_BUFFER ||
+ image_desc->image_type > CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+ err = CL_INVALID_IMAGE_DESCRIPTOR;
+ goto error;
+ }
+ /* buffer refers to a valid buffer memory object if image_type is
+ CL_MEM_OBJECT_IMAGE1D_BUFFER. Otherwise it must be NULL. */
+ if (image_desc->image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER &&
+ image_desc->buffer) {
+ err = CL_INVALID_IMAGE_DESCRIPTOR;
+ goto error;
+ }
+ if (image_desc->num_mip_levels || image_desc->num_samples) {
+ err = CL_INVALID_IMAGE_DESCRIPTOR;
+ goto error;
+ }
+
+ /* Other details check for image_desc will leave to image create. */
+ mem = cl_mem_new_image(context,
+ flags,
+ image_format,
+ image_desc,
+ host_ptr,
+ &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+}
+
+cl_mem
+clCreateImage2D(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format * image_format,
+ size_t image_width,
+ size_t image_height,
+ size_t image_row_pitch,
+ void * host_ptr,
+ cl_int * errcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+ cl_image_desc image_desc;
+ memset(&image_desc, 0, sizeof(image_desc));
+
+ image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ image_desc.image_width = image_width;
+ image_desc.image_height = image_height;
+ image_desc.image_row_pitch = image_row_pitch;
+
+ mem = cl_mem_new_image(context,
+ flags,
+ image_format,
+ &image_desc,
+ host_ptr,
+ &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+}
+
+cl_mem
+clCreateImage3D(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format * image_format,
+ size_t image_width,
+ size_t image_height,
+ size_t image_depth,
+ size_t image_row_pitch,
+ size_t image_slice_pitch,
+ void * host_ptr,
+ cl_int * errcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+ cl_image_desc image_desc;
+
+ image_desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+ image_desc.image_width = image_width;
+ image_desc.image_height = image_height;
+ image_desc.image_depth = image_depth;
+ image_desc.image_row_pitch = image_row_pitch;
+ image_desc.image_slice_pitch = image_slice_pitch;
+
+ mem = cl_mem_new_image(context,
+ flags,
+ image_format,
+ &image_desc,
+ host_ptr,
+ &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+}
+
+cl_int
+clRetainMemObject(cl_mem memobj)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_MEM (memobj);
+ cl_mem_add_ref(memobj);
+error:
+ return err;
+}
+
+cl_int
+clReleaseMemObject(cl_mem memobj)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_MEM (memobj);
+ cl_mem_delete(memobj);
+error:
+ return err;
+}
+
+cl_int
+clGetSupportedImageFormats(cl_context ctx,
+ cl_mem_flags flags,
+ cl_mem_object_type image_type,
+ cl_uint num_entries,
+ cl_image_format * image_formats,
+ cl_uint * num_image_formats)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (ctx);
+ if (UNLIKELY(num_entries == 0 && image_formats != NULL)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE1D &&
+ image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY &&
+ image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY &&
+ image_type != CL_MEM_OBJECT_IMAGE2D &&
+ image_type != CL_MEM_OBJECT_IMAGE3D)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ err = cl_image_get_supported_fmt(ctx,
+ image_type,
+ num_entries,
+ image_formats,
+ num_image_formats);
+
+error:
+ return err;
+}
+
+cl_int
+clGetMemObjectInfo(cl_mem memobj,
+ cl_mem_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_MEM(memobj);
+
+ err = cl_get_mem_object_info(memobj,
+ param_name,
+ param_value_size,
+ param_value,
+ param_value_size_ret);
+error:
+ return err;
+}
+
+cl_int
+clGetImageInfo(cl_mem mem,
+ cl_image_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ return cl_get_image_info(mem,
+ param_name,
+ param_value_size,
+ param_value,
+ param_value_size_ret);
+}
+
+cl_int
+clSetMemObjectDestructorCallback(cl_mem memobj,
+ void (CL_CALLBACK *pfn_notify) (cl_mem, void*),
+ void * user_data)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_MEM(memobj);
+ INVALID_VALUE_IF (pfn_notify == 0);
+
+ cl_mem_dstr_cb *cb = (cl_mem_dstr_cb*)malloc(sizeof(cl_mem_dstr_cb));
+ if (!cb) {
+ err = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+
+ memset(cb, 0, sizeof(cl_mem_dstr_cb));
+ cb->pfn_notify = pfn_notify;
+ cb->user_data = user_data;
+ cb->next = memobj->dstr_cb;
+ memobj->dstr_cb = cb;
+
+error:
+ return err;
+}
+
+cl_sampler
+clCreateSampler(cl_context context,
+ cl_bool normalized,
+ cl_addressing_mode addressing,
+ cl_filter_mode filter,
+ cl_int * errcode_ret)
+{
+ cl_sampler sampler = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+ sampler = cl_sampler_new(context, normalized, addressing, filter, &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return sampler;
+}
+
+cl_int
+clRetainSampler(cl_sampler sampler)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_SAMPLER (sampler);
+ cl_sampler_add_ref(sampler);
+error:
+ return err;
+}
+
+cl_int
+clReleaseSampler(cl_sampler sampler)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_SAMPLER (sampler);
+ cl_sampler_delete(sampler);
+error:
+ return err;
+}
+
+cl_int
+clGetSamplerInfo(cl_sampler sampler,
+ cl_sampler_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_SAMPLER (sampler);
+
+ if (param_name == CL_SAMPLER_REFERENCE_COUNT) {
+ FILL_GETINFO_RET (cl_uint, 1, (cl_uint*)&sampler->ref_n, CL_SUCCESS);
+ } else if (param_name == CL_SAMPLER_CONTEXT) {
+ FILL_GETINFO_RET (cl_context, 1, &sampler->ctx, CL_SUCCESS);
+ } else if (param_name == CL_SAMPLER_NORMALIZED_COORDS) {
+ FILL_GETINFO_RET (cl_bool, 1, &sampler->normalized_coords, CL_SUCCESS);
+ } else if (param_name == CL_SAMPLER_ADDRESSING_MODE) {
+ FILL_GETINFO_RET (cl_addressing_mode, 1, &sampler->address, CL_SUCCESS);
+ } else if (param_name == CL_SAMPLER_FILTER_MODE ) {
+ FILL_GETINFO_RET (cl_filter_mode, 1, &sampler->filter, CL_SUCCESS);
+ } else{
+ return CL_INVALID_VALUE;
+ }
+
+error:
+ return err;
+}
+
+cl_program
+clCreateProgramWithSource(cl_context context,
+ cl_uint count,
+ const char ** strings,
+ const size_t * lengths,
+ cl_int * errcode_ret)
+{
+ cl_program program = NULL;
+ cl_int err = CL_SUCCESS;
+ cl_uint i;
+
+ CHECK_CONTEXT (context);
+ INVALID_VALUE_IF (count == 0);
+ INVALID_VALUE_IF (strings == NULL);
+ for(i = 0; i < count; i++) {
+ if(UNLIKELY(strings[i] == NULL)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ }
+ program = cl_program_create_from_source(context,
+ count,
+ strings,
+ lengths,
+ &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return program;
+}
+
+cl_program
+clCreateProgramWithBinary(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * devices,
+ const size_t * lengths,
+ const unsigned char ** binaries,
+ cl_int * binary_status,
+ cl_int * errcode_ret)
+{
+ cl_program program = NULL;
+ cl_int err = CL_SUCCESS;
+
+ CHECK_CONTEXT (context);
+ program = cl_program_create_from_binary(context,
+ num_devices,
+ devices,
+ lengths,
+ binaries,
+ binary_status,
+ &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return program;
+}
+
+cl_program
+clCreateProgramWithBuiltInKernels(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const char * kernel_names,
+ cl_int * errcode_ret)
+{
+ cl_program program = NULL;
+ cl_int err = CL_SUCCESS;
+
+ CHECK_CONTEXT (context);
+ INVALID_VALUE_IF (kernel_names == NULL);
+ program = cl_program_create_with_built_in_kernles(context,
+ num_devices,
+ device_list,
+ kernel_names,
+ &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return program;
+}
+
+cl_int
+clRetainProgram(cl_program program)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_PROGRAM (program);
+ cl_program_add_ref(program);
+error:
+ return err;
+}
+
+cl_int
+clReleaseProgram(cl_program program)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_PROGRAM (program);
+ cl_program_delete(program);
+error:
+ return err;
+}
+
+cl_int
+clBuildProgram(cl_program program,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const char * options,
+ void (CL_CALLBACK *pfn_notify) (cl_program, void*),
+ void * user_data)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_PROGRAM(program);
+ INVALID_VALUE_IF (num_devices > 1);
+ INVALID_VALUE_IF (num_devices == 0 && device_list != NULL);
+ INVALID_VALUE_IF (num_devices != 0 && device_list == NULL);
+ INVALID_VALUE_IF (pfn_notify == 0 && user_data != NULL);
+
+ /* Everything is easy. We only support one device anyway */
+ if (num_devices != 0) {
+ assert(program->ctx);
+ INVALID_DEVICE_IF (device_list[0] != program->ctx->device);
+ }
+
+ /* TODO support create program from binary */
+ assert(program->source_type == FROM_LLVM ||
+ program->source_type == FROM_SOURCE ||
+ program->source_type == FROM_BINARY);
+ if((err = cl_program_build(program, options)) != CL_SUCCESS) {
+ goto error;
+ }
+ program->is_built = CL_TRUE;
+
+ if (pfn_notify) pfn_notify(program, user_data);
+
+error:
+ return err;
+}
+
+cl_int
+clCompileProgram(cl_program program ,
+ cl_uint num_devices ,
+ const cl_device_id * device_list ,
+ const char * options ,
+ cl_uint num_input_headers ,
+ const cl_program * input_headers ,
+ const char ** header_include_names ,
+ void (CL_CALLBACK * pfn_notify )(cl_program, void *),
+ void * user_data )
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_PROGRAM(program);
+ INVALID_VALUE_IF (num_devices > 1);
+ INVALID_VALUE_IF (num_devices == 0 && device_list != NULL);
+ INVALID_VALUE_IF (num_devices != 0 && device_list == NULL);
+ INVALID_VALUE_IF (pfn_notify == 0 && user_data != NULL);
+ INVALID_VALUE_IF (num_input_headers == 0 && input_headers != NULL);
+ INVALID_VALUE_IF (num_input_headers != 0 && input_headers == NULL);
+
+ /* Everything is easy. We only support one device anyway */
+ if (num_devices != 0) {
+ assert(program->ctx);
+ INVALID_DEVICE_IF (device_list[0] != program->ctx->device);
+ }
+
+ /* TODO support create program from binary */
+ assert(program->source_type == FROM_LLVM ||
+ program->source_type == FROM_SOURCE ||
+ program->source_type == FROM_BINARY);
+ if((err = cl_program_compile(program, num_input_headers, input_headers, header_include_names, options)) != CL_SUCCESS) {
+ goto error;
+ }
+ program->is_built = CL_TRUE;
+
+ if (pfn_notify) pfn_notify(program, user_data);
+
+error:
+ return err;
+}
+
+cl_program
+clLinkProgram(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const char * options,
+ cl_uint num_input_programs,
+ const cl_program * input_programs,
+ void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data),
+ void * user_data,
+ cl_int * errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ cl_program program = NULL;
+ CHECK_CONTEXT (context);
+ INVALID_VALUE_IF (num_devices > 1);
+ INVALID_VALUE_IF (num_devices == 0 && device_list != NULL);
+ INVALID_VALUE_IF (num_devices != 0 && device_list == NULL);
+ INVALID_VALUE_IF (pfn_notify == 0 && user_data != NULL);
+ INVALID_VALUE_IF (num_input_programs == 0 && input_programs != NULL);
+ INVALID_VALUE_IF (num_input_programs != 0 && input_programs == NULL);
+
+ program = cl_program_link(context, num_input_programs, input_programs, options, &err);
+
+ program->is_built = CL_TRUE;
+
+ if (pfn_notify) pfn_notify(program, user_data);
+
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return program;
+}
+
+cl_int
+clUnloadCompiler(void)
+{
+ return CL_SUCCESS;
+}
+
+cl_int
+clUnloadPlatformCompiler(cl_platform_id platform)
+{
+ return CL_SUCCESS;
+}
+
+cl_int
+clGetProgramInfo(cl_program program,
+ cl_program_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ cl_int err = CL_SUCCESS;
+ char * ret_str = "";
+
+ CHECK_PROGRAM (program);
+
+ if (param_name == CL_PROGRAM_REFERENCE_COUNT) {
+ cl_uint ref = program->ref_n;
+ FILL_GETINFO_RET (cl_uint, 1, (&ref), CL_SUCCESS);
+ } else if (param_name == CL_PROGRAM_CONTEXT) {
+ cl_context context = program->ctx;
+ FILL_GETINFO_RET (cl_context, 1, &context, CL_SUCCESS);
+ } else if (param_name == CL_PROGRAM_NUM_DEVICES) {
+ cl_uint num_dev = 1; // Just 1 dev now.
+ FILL_GETINFO_RET (cl_uint, 1, &num_dev, CL_SUCCESS);
+ } else if (param_name == CL_PROGRAM_DEVICES) {
+ cl_device_id dev_id = program->ctx->device;
+ FILL_GETINFO_RET (cl_device_id, 1, &dev_id, CL_SUCCESS);
+ } else if (param_name == CL_PROGRAM_NUM_KERNELS) {
+ cl_uint kernels_num = program->ker_n;
+ FILL_GETINFO_RET (cl_uint, 1, &kernels_num, CL_SUCCESS);
+ } else if (param_name == CL_PROGRAM_SOURCE) {
+
+ if (!program->source)
+ FILL_GETINFO_RET (char, 1, &ret_str, CL_SUCCESS);
+ FILL_GETINFO_RET (char, (strlen(program->source) + 1),
+ program->source, CL_SUCCESS);
+ } else if(param_name == CL_PROGRAM_KERNEL_NAMES) {
+ cl_program_get_kernel_names(program, param_value_size, (char *)param_value, param_value_size_ret);
+ } else if (param_name == CL_PROGRAM_BINARY_SIZES) {
+ if (program->binary == NULL){
+ if( program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
+ program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
+ }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
+ program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
+ }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
+ program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
+ }else{
+ return CL_INVALID_BINARY;
+ }
+ }
+
+ if (program->binary == NULL || program->binary_sz == 0) {
+ return CL_OUT_OF_RESOURCES;
+ }
+ FILL_GETINFO_RET (size_t, 1, (&program->binary_sz), CL_SUCCESS);
+ } else if (param_name == CL_PROGRAM_BINARIES) {
+ if (param_value_size_ret)
+ *param_value_size_ret = sizeof(void*);
+ if (!param_value)
+ return CL_SUCCESS;
+
+ /* param_value points to an array of n
+ pointers allocated by the caller */
+ if (program->binary == NULL) {
+ if( program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
+ program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
+ }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
+ program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
+ }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
+ program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
+ }else{
+ return CL_INVALID_BINARY;
+ }
+ }
+
+ if (program->binary == NULL || program->binary_sz == 0) {
+ return CL_OUT_OF_RESOURCES;
+ }
+
+ memcpy(*((void **)param_value), program->binary, program->binary_sz);
+ return CL_SUCCESS;
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+error:
+ return err;
+}
+
+cl_int
+clGetProgramBuildInfo(cl_program program,
+ cl_device_id device,
+ cl_program_build_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ cl_int err = CL_SUCCESS;
+ char * ret_str = "";
+
+ CHECK_PROGRAM (program);
+ INVALID_DEVICE_IF (device != program->ctx->device);
+
+ if (param_name == CL_PROGRAM_BUILD_STATUS) {
+ FILL_GETINFO_RET (cl_build_status, 1, &program->build_status, CL_SUCCESS);
+ } else if (param_name == CL_PROGRAM_BUILD_OPTIONS) {
+ if (program->is_built && program->build_opts)
+ ret_str = program->build_opts;
+
+ FILL_GETINFO_RET (char, (strlen(ret_str)+1), ret_str, CL_SUCCESS);
+ } else if (param_name == CL_PROGRAM_BUILD_LOG) {
+ FILL_GETINFO_RET (char, program->build_log_sz + 1, program->build_log, CL_SUCCESS);
+ if (param_value_size_ret)
+ *param_value_size_ret = program->build_log_sz + 1;
+ }else if (param_name == CL_PROGRAM_BINARY_TYPE){
+
+ FILL_GETINFO_RET (cl_uint, 1, &program->binary_type, CL_SUCCESS);
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+error:
+ return err;
+}
+
+cl_kernel
+clCreateKernel(cl_program program,
+ const char * kernel_name,
+ cl_int * errcode_ret)
+{
+ cl_kernel kernel = NULL;
+ cl_int err = CL_SUCCESS;
+
+ CHECK_PROGRAM (program);
+ if (program->ker_n <= 0) {
+ err = CL_INVALID_PROGRAM_EXECUTABLE;
+ goto error;
+ }
+ INVALID_VALUE_IF (kernel_name == NULL);
+ kernel = cl_program_create_kernel(program, kernel_name, &err);
+
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return kernel;
+}
+
+cl_int
+clCreateKernelsInProgram(cl_program program,
+ cl_uint num_kernels,
+ cl_kernel * kernels,
+ cl_uint * num_kernels_ret)
+{
+ cl_int err = CL_SUCCESS;
+
+ CHECK_PROGRAM (program);
+ if (program->ker_n <= 0) {
+ err = CL_INVALID_PROGRAM_EXECUTABLE;
+ goto error;
+ }
+ if (kernels && num_kernels < program->ker_n) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if(num_kernels_ret)
+ *num_kernels_ret = program->ker_n;
+
+ if(kernels)
+ err = cl_program_create_kernels_in_program(program, kernels);
+
+error:
+ return err;
+}
+
+cl_int
+clRetainKernel(cl_kernel kernel)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_KERNEL(kernel);
+ cl_kernel_add_ref(kernel);
+error:
+ return err;
+}
+
+cl_int
+clReleaseKernel(cl_kernel kernel)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_KERNEL(kernel);
+ cl_kernel_delete(kernel);
+error:
+ return err;
+}
+
+cl_int
+clSetKernelArg(cl_kernel kernel,
+ cl_uint arg_index,
+ size_t arg_size,
+ const void * arg_value)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_KERNEL(kernel);
+ err = cl_kernel_set_arg(kernel, arg_index, arg_size, arg_value);
+error:
+ return err;
+}
+
+cl_int clGetKernelArgInfo(cl_kernel kernel, cl_uint arg_index, cl_kernel_arg_info param_name,
+ size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_KERNEL(kernel);
+
+ if (param_name != CL_KERNEL_ARG_ADDRESS_QUALIFIER
+ && param_name != CL_KERNEL_ARG_ACCESS_QUALIFIER
+ && param_name != CL_KERNEL_ARG_TYPE_NAME
+ && param_name != CL_KERNEL_ARG_TYPE_QUALIFIER
+ && param_name != CL_KERNEL_ARG_NAME) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (arg_index >= kernel->arg_n) {
+ err = CL_INVALID_ARG_INDEX;
+ goto error;
+ }
+
+ err = cl_get_kernel_arg_info(kernel, arg_index, param_name, param_value_size,
+ param_value, param_value_size_ret);
+
+error:
+ return err;
+}
+
+cl_int
+clGetKernelInfo(cl_kernel kernel,
+ cl_kernel_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ cl_int err;
+
+ CHECK_KERNEL(kernel);
+
+ if (param_name == CL_KERNEL_CONTEXT) {
+ FILL_GETINFO_RET (cl_context, 1, &kernel->program->ctx, CL_SUCCESS);
+ } else if (param_name == CL_KERNEL_PROGRAM) {
+ FILL_GETINFO_RET (cl_program, 1, &kernel->program, CL_SUCCESS);
+ } else if (param_name == CL_KERNEL_NUM_ARGS) {
+ cl_uint n = kernel->arg_n;
+ FILL_GETINFO_RET (cl_uint, 1, &n, CL_SUCCESS);
+ } else if (param_name == CL_KERNEL_REFERENCE_COUNT) {
+ cl_int ref = kernel->ref_n;
+ FILL_GETINFO_RET (cl_int, 1, &ref, CL_SUCCESS);
+ } else if (param_name == CL_KERNEL_FUNCTION_NAME) {
+ const char * n = cl_kernel_get_name(kernel);
+ FILL_GETINFO_RET (cl_char, strlen(n)+1, n, CL_SUCCESS);
+ } else if (param_name == CL_KERNEL_ATTRIBUTES) {
+ const char * n = cl_kernel_get_attributes(kernel);
+ FILL_GETINFO_RET (cl_char, strlen(n)+1, n, CL_SUCCESS);
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+error:
+ return err;
+}
+
+cl_int
+clGetKernelWorkGroupInfo(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ return cl_get_kernel_workgroup_info(kernel,
+ device,
+ param_name,
+ param_value_size,
+ param_value,
+ param_value_size_ret);
+}
+
+cl_int
+clWaitForEvents(cl_uint num_events,
+ const cl_event * event_list)
+{
+ cl_int err = CL_SUCCESS;
+ cl_context ctx = NULL;
+
+ if(num_events > 0 && event_list)
+ ctx = event_list[0]->ctx;
+
+ TRY(cl_event_check_waitlist, num_events, event_list, NULL, ctx);
+
+ while(cl_event_wait_events(num_events, event_list, NULL) == CL_ENQUEUE_EXECUTE_DEFER) {
+ usleep(8000); //sleep 8ms to wait other thread
+ }
+
+error:
+ return err;
+}
+
+cl_int
+clGetEventInfo(cl_event event,
+ cl_event_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_EVENT(event);
+
+ if (param_name == CL_EVENT_COMMAND_QUEUE) {
+ FILL_GETINFO_RET (cl_command_queue, 1, &event->queue, CL_SUCCESS);
+ } else if (param_name == CL_EVENT_CONTEXT) {
+ FILL_GETINFO_RET (cl_context, 1, &event->ctx, CL_SUCCESS);
+ } else if (param_name == CL_EVENT_COMMAND_TYPE) {
+ FILL_GETINFO_RET (cl_command_type, 1, &event->type, CL_SUCCESS);
+ } else if (param_name == CL_EVENT_COMMAND_EXECUTION_STATUS) {
+ cl_event_update_status(event, 0);
+ FILL_GETINFO_RET (cl_int, 1, &event->status, CL_SUCCESS);
+ } else if (param_name == CL_EVENT_REFERENCE_COUNT) {
+ cl_uint ref = event->ref_n;
+ FILL_GETINFO_RET (cl_int, 1, &ref, CL_SUCCESS);
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+error:
+ return err;
+
+}
+
+cl_event
+clCreateUserEvent(cl_context context,
+ cl_int * errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ cl_event event = NULL;
+ CHECK_CONTEXT(context);
+
+ TRY_ALLOC(event, cl_event_new(context, NULL, CL_COMMAND_USER, CL_TRUE));
+
+error:
+ if(errcode_ret)
+ *errcode_ret = err;
+ return event;
+}
+
+cl_int
+clRetainEvent(cl_event event)
+{
+ cl_int err = CL_SUCCESS;
+
+ CHECK_EVENT(event);
+ cl_event_add_ref(event);
+
+error:
+ return err;
+}
+
+cl_int
+clReleaseEvent(cl_event event)
+{
+ cl_int err = CL_SUCCESS;
+
+ CHECK_EVENT(event);
+ cl_event_delete(event);
+
+error:
+ return err;
+}
+
+cl_int
+clSetUserEventStatus(cl_event event,
+ cl_int execution_status)
+{
+ cl_int err = CL_SUCCESS;
+
+ CHECK_EVENT(event);
+ if(execution_status > CL_COMPLETE) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ if(event->status != CL_SUBMITTED) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ cl_event_set_status(event, execution_status);
+error:
+ return err;
+}
+
+cl_int
+clSetEventCallback(cl_event event,
+ cl_int command_exec_callback_type,
+ void (CL_CALLBACK * pfn_notify) (cl_event, cl_int, void *),
+ void * user_data)
+{
+ cl_int err = CL_SUCCESS;
+
+ CHECK_EVENT(event);
+ if((pfn_notify == NULL) ||
+ (command_exec_callback_type > CL_SUBMITTED) ||
+ (command_exec_callback_type < CL_COMPLETE)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ err = cl_event_set_callback(event, command_exec_callback_type, pfn_notify, user_data);
+
+error:
+ return err;
+
+}
+
+cl_int
+clGetEventProfilingInfo(cl_event event,
+ cl_profiling_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ cl_int err = CL_SUCCESS;
+ cl_ulong ret_val;
+
+ CHECK_EVENT(event);
+
+ if (event->type == CL_COMMAND_USER ||
+ !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
+ event->status != CL_COMPLETE) {
+ err = CL_PROFILING_INFO_NOT_AVAILABLE;
+ goto error;
+ }
+
+ if (param_value && param_value_size < sizeof(cl_ulong)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (param_name == CL_PROFILING_COMMAND_QUEUED) {
+ ret_val = event->timestamp[0];
+ } else if (param_name == CL_PROFILING_COMMAND_SUBMIT) {
+ ret_val = event->timestamp[1];
+ } else if (param_name == CL_PROFILING_COMMAND_START) {
+ err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_START);
+ ret_val = event->timestamp[2];
+ } else if (param_name == CL_PROFILING_COMMAND_END) {
+ err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_END);
+ ret_val = event->timestamp[3];
+ } else {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (err == CL_SUCCESS) {
+ if (param_value)
+ *(cl_ulong*)param_value = ret_val;
+ if (param_value_size_ret)
+ *param_value_size_ret = sizeof(cl_ulong);
+ }
+error:
+ return err;
+}
+
+cl_int
+clFlush(cl_command_queue command_queue)
+{
+ /* have nothing to do now, as currently
+ * clEnqueueNDRangeKernel will flush at
+ * the end of each calling. we may need
+ * to optimize it latter.*/
+ return 0;
+}
+
+cl_int
+clFinish(cl_command_queue command_queue)
+{
+ cl_int err = CL_SUCCESS;
+
+ CHECK_QUEUE (command_queue);
+ err = cl_command_queue_finish(command_queue);
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueReadBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t size,
+ void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, defer_enqueue_data = { 0 };
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(buffer);
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (!ptr || !size || offset + size > buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+ data = &defer_enqueue_data;
+ data->type = EnqueueReadBuffer;
+ data->mem_obj = buffer;
+ data->ptr = ptr;
+ data->offset = offset;
+ data->size = size;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(event ? *event : NULL, data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueReadBufferRect(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ const size_t * buffer_origin,
+ const size_t * host_origin,
+ const size_t * region,
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(buffer);
+
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if(buffer_row_pitch == 0)
+ buffer_row_pitch = region[0];
+ if(buffer_slice_pitch == 0)
+ buffer_slice_pitch = region[1] * buffer_row_pitch;
+
+ if(host_row_pitch == 0)
+ host_row_pitch = region[0];
+ if(host_slice_pitch == 0)
+ host_slice_pitch = region[1] * host_row_pitch;
+
+ if (buffer_row_pitch < region[0] ||
+ host_row_pitch < region[0]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
+ (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch
+ + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch
+ + buffer_origin[0] + region[0] > buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueReadBufferRect;
+ data->mem_obj = buffer;
+ data->ptr = ptr;
+ data->origin[0] = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
+ data->host_origin[0] = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
+ data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
+ data->row_pitch = buffer_row_pitch;
+ data->slice_pitch = buffer_slice_pitch;
+ data->host_row_pitch = host_row_pitch;
+ data->host_slice_pitch = host_slice_pitch;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_READ_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(event ? *event : NULL, data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
+
+ error:
+ return err;
+}
+
+cl_int
+clEnqueueWriteBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t size,
+ const void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(buffer);
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (!ptr || !size || offset + size > buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueWriteBuffer;
+ data->mem_obj = buffer;
+ data->const_ptr = ptr;
+ data->offset = offset;
+ data->size = size;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(event ? *event : NULL, data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
+
+ error:
+ return err;
+}
+
+cl_int
+clEnqueueWriteBufferRect(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ const size_t * buffer_origin,
+ const size_t * host_origin,
+ const size_t * region,
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ const void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(buffer);
+
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if(buffer_row_pitch == 0)
+ buffer_row_pitch = region[0];
+ if(buffer_slice_pitch == 0)
+ buffer_slice_pitch = region[1] * buffer_row_pitch;
+
+ if(host_row_pitch == 0)
+ host_row_pitch = region[0];
+ if(host_slice_pitch == 0)
+ host_slice_pitch = region[1] * host_row_pitch;
+
+ if (buffer_row_pitch < region[0] ||
+ host_row_pitch < region[0]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
+ (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch
+ + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch
+ + buffer_origin[0] + region[0] > buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueWriteBufferRect;
+ data->mem_obj = buffer;
+ data->const_ptr = ptr;
+ data->origin[0] = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
+ data->host_origin[0] = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
+ data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
+ data->row_pitch = buffer_row_pitch;
+ data->slice_pitch = buffer_slice_pitch;
+ data->host_row_pitch = host_row_pitch;
+ data->host_slice_pitch = host_slice_pitch;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_WRITE_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(event ? *event : NULL, data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueFillImage(cl_command_queue command_queue,
+ cl_mem image,
+ const void * fill_color,
+ const size_t * porigin,
+ const size_t * pregion,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_IMAGE(image, src_image);
+ FIXUP_IMAGE_REGION(src_image, pregion, region);
+ FIXUP_IMAGE_ORIGIN(src_image, porigin, origin);
+
+ if (command_queue->ctx != image->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (fill_color == NULL) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!origin || !region || origin[0] + region[0] > src_image->w || origin[1] + region[1] > src_image->h || origin[2] + region[2] > src_image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (origin[2] != 0 || region[2] != 1)){
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D && (origin[2] != 0 ||origin[1] != 0 || region[2] != 1 || region[1] != 1)){
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ err = cl_image_fill(command_queue, fill_color, src_image, origin, region);
+ if (err) {
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueFillImage;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
+ err = cl_command_queue_flush(command_queue);
+ }
+
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, "beignet internal kernel : cl_fill_image", "", command_queue);
+
+ return 0;
+
+ error:
+ return err;
+}
+
+cl_int
+clEnqueueFillBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ const void * pattern,
+ size_t pattern_size,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+ static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
+ int i = 0;
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(buffer);
+
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (offset < 0 || offset + size > buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (pattern == NULL) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ for (i = 0; i < sizeof(valid_sz) / sizeof(size_t); i++) {
+ if (valid_sz[i] == pattern_size)
+ break;
+ }
+ if (i == sizeof(valid_sz) / sizeof(size_t)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (offset % pattern_size || size % pattern_size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, size);
+ if (err) {
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueFillBuffer;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
+ err = cl_command_queue_flush(command_queue);
+ }
+
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, "beignet internal kernel : cl_fill_buffer", "", command_queue);
+
+ return 0;
+
+ error:
+ return err;
+}
+
+cl_int
+clEnqueueCopyBuffer(cl_command_queue command_queue,
+ cl_mem src_buffer,
+ cl_mem dst_buffer,
+ size_t src_offset,
+ size_t dst_offset,
+ size_t cb,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(src_buffer);
+ CHECK_MEM(dst_buffer);
+
+ if (command_queue->ctx != src_buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (command_queue->ctx != dst_buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (src_offset < 0 || src_offset + cb > src_buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ if (dst_offset < 0 || dst_offset + cb > dst_buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ /* Check overlap */
+ if (src_buffer == dst_buffer
+ && (src_offset <= dst_offset && dst_offset <= src_offset + cb - 1)
+ && (dst_offset <= src_offset && src_offset <= dst_offset + cb - 1)) {
+ err = CL_MEM_COPY_OVERLAP;
+ goto error;
+ }
+
+ /* Check sub overlap */
+ if (src_buffer->type == CL_MEM_SUBBUFFER_TYPE && dst_buffer->type == CL_MEM_SUBBUFFER_TYPE ) {
+ struct _cl_mem_buffer* src_b = (struct _cl_mem_buffer*)src_buffer;
+ struct _cl_mem_buffer* dst_b = (struct _cl_mem_buffer*)dst_buffer;
+ size_t src_sub_offset = src_b->sub_offset;
+ size_t dst_sub_offset = dst_b->sub_offset;
+
+ if ((src_offset + src_sub_offset <= dst_offset + dst_sub_offset
+ && dst_offset + dst_sub_offset <= src_offset + src_sub_offset + cb - 1)
+ && (dst_offset + dst_sub_offset <= src_offset + src_sub_offset
+ && src_offset + src_sub_offset <= dst_offset + dst_sub_offset + cb - 1)) {
+ err = CL_MEM_COPY_OVERLAP;
+ goto error;
+ }
+ }
+
+ err = cl_mem_copy(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb);
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueCopyBuffer;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
+ err = cl_command_queue_flush(command_queue);
+ }
+
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", "", command_queue);
+
+ return 0;
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueCopyBufferRect(cl_command_queue command_queue,
+ cl_mem src_buffer,
+ cl_mem dst_buffer,
+ const size_t * src_origin,
+ const size_t * dst_origin,
+ const size_t * region,
+ size_t src_row_pitch,
+ size_t src_slice_pitch,
+ size_t dst_row_pitch,
+ size_t dst_slice_pitch,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(src_buffer);
+ CHECK_MEM(dst_buffer);
+
+ if ((command_queue->ctx != src_buffer->ctx) ||
+ (command_queue->ctx != dst_buffer->ctx)) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (!region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if(src_row_pitch == 0)
+ src_row_pitch = region[0];
+ if(src_slice_pitch == 0)
+ src_slice_pitch = region[1] * src_row_pitch;
+
+ if(dst_row_pitch == 0)
+ dst_row_pitch = region[0];
+ if(dst_slice_pitch == 0)
+ dst_slice_pitch = region[1] * dst_row_pitch;
+
+ if (src_row_pitch < region[0] ||
+ dst_row_pitch < region[0]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0 ) ||
+ (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0 )) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((src_origin[2] + region[2] - 1) * src_slice_pitch
+ + (src_origin[1] + region[1] - 1) * src_row_pitch
+ + src_origin[0] + region[0] > src_buffer->size
+ ||(dst_origin[2] + region[2] - 1) * dst_slice_pitch
+ + (dst_origin[1] + region[1] - 1) * dst_row_pitch
+ + dst_origin[0] + region[0] > dst_buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (src_buffer == dst_buffer && (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (src_buffer == dst_buffer &&
+ check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) {
+ err = CL_MEM_COPY_OVERLAP;
+ goto error;
+ }
+
+ cl_mem_copy_buffer_rect(command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
+ src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueCopyBufferRect;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
+ err = cl_command_queue_flush(command_queue);
+ }
+
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", "", command_queue);
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueReadImage(cl_command_queue command_queue,
+ cl_mem mem,
+ cl_bool blocking_read,
+ const size_t * porigin,
+ const size_t * pregion,
+ size_t row_pitch,
+ size_t slice_pitch,
+ void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_IMAGE(mem, image);
+ FIXUP_IMAGE_REGION(image, pregion, region);
+ FIXUP_IMAGE_ORIGIN(image, porigin, origin);
+ if (command_queue->ctx != mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!row_pitch)
+ row_pitch = image->bpp*region[0];
+ else if (row_pitch < image->bpp*region[0]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (image->slice_pitch) {
+ if (!slice_pitch)
+ slice_pitch = row_pitch*region[1];
+ else if (slice_pitch < row_pitch*region[1]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ }
+ else if (slice_pitch) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!ptr) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueReadImage;
+ data->mem_obj = mem;
+ data->ptr = ptr;
+ data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2];
+ data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
+ data->row_pitch = row_pitch;
+ data->slice_pitch = slice_pitch;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(event ? *event : NULL, data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueWriteImage(cl_command_queue command_queue,
+ cl_mem mem,
+ cl_bool blocking_write,
+ const size_t * porigin,
+ const size_t * pregion,
+ size_t row_pitch,
+ size_t slice_pitch,
+ const void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_IMAGE(mem, image);
+ FIXUP_IMAGE_REGION(image, pregion, region);
+ FIXUP_IMAGE_ORIGIN(image, porigin, origin);
+ if (command_queue->ctx != mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!row_pitch)
+ row_pitch = image->bpp*region[0];
+ else if (row_pitch < image->bpp*region[0]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (image->slice_pitch) {
+ if (!slice_pitch)
+ slice_pitch = row_pitch*region[1];
+ else if (slice_pitch < row_pitch*region[1]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ }
+ else if (slice_pitch) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!ptr) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueWriteImage;
+ data->mem_obj = mem;
+ data->const_ptr = ptr;
+ data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2];
+ data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
+ data->row_pitch = row_pitch;
+ data->slice_pitch = slice_pitch;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(event ? *event : NULL, data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueCopyImage(cl_command_queue command_queue,
+ cl_mem src_mem,
+ cl_mem dst_mem,
+ const size_t * psrc_origin,
+ const size_t * pdst_origin,
+ const size_t * pregion,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+ cl_bool overlap = CL_TRUE;
+ cl_int i = 0;
+
+ CHECK_QUEUE(command_queue);
+ CHECK_IMAGE(src_mem, src_image);
+ CHECK_IMAGE(dst_mem, dst_image);
+ FIXUP_IMAGE_REGION(src_image, pregion, region);
+ FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin);
+ FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin);
+ if (command_queue->ctx != src_mem->ctx ||
+ command_queue->ctx != dst_mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (src_image->fmt.image_channel_order != dst_image->fmt.image_channel_order ||
+ src_image->fmt.image_channel_data_type != dst_image->fmt.image_channel_data_type) {
+ err = CL_IMAGE_FORMAT_MISMATCH;
+ goto error;
+ }
+
+ if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
+ src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
+ dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) ||
+ (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1))) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (src_image == dst_image) {
+ for(i = 0; i < 3; i++)
+ overlap = overlap && (src_origin[i] < dst_origin[i] + region[i])
+ && (dst_origin[i] < src_origin[i] + region[i]);
+ if(overlap == CL_TRUE) {
+ err = CL_MEM_COPY_OVERLAP;
+ goto error;
+ }
+ }
+
+ cl_mem_kernel_copy_image(command_queue, src_image, dst_image, src_origin, dst_origin, region);
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueCopyImage;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_COPY_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
+ err = cl_command_queue_flush(command_queue);
+ }
+
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", "", command_queue);
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
+ cl_mem src_mem,
+ cl_mem dst_buffer,
+ const size_t * psrc_origin,
+ const size_t * pregion,
+ size_t dst_offset,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_IMAGE(src_mem, src_image);
+ CHECK_MEM(dst_buffer);
+ FIXUP_IMAGE_REGION(src_image, pregion, region);
+ FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin);
+ if (command_queue->ctx != src_mem->ctx ||
+ command_queue->ctx != dst_buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (dst_offset + region[0]*region[1]*region[2]*src_image->bpp > dst_buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
+ src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ cl_mem_copy_image_to_buffer(command_queue, src_image, dst_buffer, src_origin, dst_offset, region);
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueCopyImageToBuffer;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_COPY_IMAGE_TO_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
+ err = cl_command_queue_flush(command_queue);
+ }
+
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", "", command_queue);
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueCopyBufferToImage(cl_command_queue command_queue,
+ cl_mem src_buffer,
+ cl_mem dst_mem,
+ size_t src_offset,
+ const size_t * pdst_origin,
+ const size_t * pregion,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(src_buffer);
+ CHECK_IMAGE(dst_mem, dst_image);
+ FIXUP_IMAGE_REGION(dst_image, pregion, region);
+ FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin);
+ if (command_queue->ctx != src_buffer->ctx ||
+ command_queue->ctx != dst_mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (src_offset + region[0]*region[1]*region[2]*dst_image->bpp > src_buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
+ dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ cl_mem_copy_buffer_to_image(command_queue, src_buffer, dst_image, src_offset, dst_origin, region);
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, dst_mem->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueCopyBufferToImage;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_COPY_BUFFER_TO_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
+ err = cl_command_queue_flush(command_queue);
+ }
+
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", "", command_queue);
+
+error:
+ return err;
+}
+
+static cl_int _cl_map_mem(cl_mem mem, void *ptr, void **mem_ptr,
+ size_t offset, size_t size,
+ const size_t *origin, const size_t *region)
+{
+ cl_int slot = -1;
+ int err = CL_SUCCESS;
+ size_t sub_offset = 0;
+
+ if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
+ struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+ sub_offset = buffer->sub_offset;
+ }
+
+ ptr = (char*)ptr + offset + sub_offset;
+ if(mem->flags & CL_MEM_USE_HOST_PTR) {
+ assert(mem->host_ptr);
+ //only calc ptr here, will do memcpy in enqueue
+ *mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
+ } else {
+ *mem_ptr = ptr;
+ }
+ /* Record the mapped address. */
+ if (!mem->mapped_ptr_sz) {
+ mem->mapped_ptr_sz = 16;
+ mem->mapped_ptr = (cl_mapped_ptr *)malloc(
+ sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
+ if (!mem->mapped_ptr) {
+ cl_mem_unmap_auto(mem);
+ err = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+ slot = 0;
+ } else {
+ int i = 0;
+ for (; i < mem->mapped_ptr_sz; i++) {
+ if (mem->mapped_ptr[i].ptr == NULL) {
+ slot = i;
+ break;
+ }
+ }
+ if (i == mem->mapped_ptr_sz) {
+ cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+ sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
+ if (!new_ptr) {
+ cl_mem_unmap_auto(mem);
+ err = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+ memcpy(new_ptr, mem->mapped_ptr,
+ mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+ slot = mem->mapped_ptr_sz;
+ mem->mapped_ptr_sz *= 2;
+ free(mem->mapped_ptr);
+ mem->mapped_ptr = new_ptr;
+ }
+ }
+ assert(slot != -1);
+ mem->mapped_ptr[slot].ptr = *mem_ptr;
+ mem->mapped_ptr[slot].v_ptr = ptr;
+ mem->mapped_ptr[slot].size = size;
+ if(origin) {
+ assert(region);
+ mem->mapped_ptr[slot].origin[0] = origin[0];
+ mem->mapped_ptr[slot].origin[1] = origin[1];
+ mem->mapped_ptr[slot].origin[2] = origin[2];
+ mem->mapped_ptr[slot].region[0] = region[0];
+ mem->mapped_ptr[slot].region[1] = region[1];
+ mem->mapped_ptr[slot].region[2] = region[2];
+ }
+ mem->map_ref++;
+error:
+ if (err != CL_SUCCESS)
+ *mem_ptr = NULL;
+ return err;
+}
+
+void *
+clEnqueueMapBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event,
+ cl_int * errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ void *ptr = NULL;
+ void *mem_ptr = NULL;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(buffer);
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (!size || offset + size > buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((map_flags & CL_MAP_READ &&
+ buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+ (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
+ buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
+ {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueMapBuffer;
+ data->mem_obj = buffer;
+ data->offset = offset;
+ data->size = size;
+ data->ptr = ptr;
+ data->unsync_map = 1;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ data->unsync_map = 0;
+ err = cl_enqueue_handle(event ? *event : NULL, data);
+ if (err != CL_SUCCESS)
+ goto error;
+ ptr = data->ptr;
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ } else {
+ if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+ }
+ err = _cl_map_mem(buffer, ptr, &mem_ptr, offset, size, NULL, NULL);
+ if (err != CL_SUCCESS)
+ goto error;
+
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem_ptr;
+}
+
+void *
+clEnqueueMapImage(cl_command_queue command_queue,
+ cl_mem mem,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ const size_t * porigin,
+ const size_t * pregion,
+ size_t * image_row_pitch,
+ size_t * image_slice_pitch,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event,
+ cl_int * errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ void *ptr = NULL;
+ void *mem_ptr = NULL;
+ size_t offset = 0;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_IMAGE(mem, image);
+ FIXUP_IMAGE_REGION(image, pregion, region);
+ FIXUP_IMAGE_ORIGIN(image, porigin, origin);
+ if (command_queue->ctx != mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!image_row_pitch || (image->slice_pitch && !image_slice_pitch)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((map_flags & CL_MAP_READ &&
+ mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+ (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
+ mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
+ {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueMapImage;
+ data->mem_obj = mem;
+ data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2];
+ data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
+ data->ptr = ptr;
+ data->unsync_map = 1;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+ data->unsync_map = 0;
+ err = cl_enqueue_handle(event ? *event : NULL, data);
+ if (err != CL_SUCCESS)
+ goto error;
+ ptr = data->ptr;
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ } else {
+ if ((ptr = cl_mem_map_gtt_unsync(mem)) == NULL) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+ }
+
+ if(mem->flags & CL_MEM_USE_HOST_PTR) {
+ if (image_slice_pitch)
+ *image_slice_pitch = image->host_slice_pitch;
+ *image_row_pitch = image->host_row_pitch;
+
+ offset = image->bpp*origin[0] + image->host_row_pitch*origin[1] + image->host_slice_pitch*origin[2];
+ } else {
+ if (image_slice_pitch)
+ *image_slice_pitch = image->slice_pitch;
+ if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ *image_row_pitch = image->slice_pitch;
+ else
+ *image_row_pitch = image->row_pitch;
+
+ offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+ }
+ err = _cl_map_mem(mem, ptr, &mem_ptr, offset, 0, origin, region);
+
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem_ptr; //TODO: map and unmap first
+}
+
+cl_int
+clEnqueueUnmapMemObject(cl_command_queue command_queue,
+ cl_mem memobj,
+ void * mapped_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(memobj);
+ if (command_queue->ctx != memobj->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, memobj->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueUnmapMemObject;
+ data->mem_obj = memobj;
+ data->ptr = mapped_ptr;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(event ? *event : NULL, data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueMigrateMemObjects(cl_command_queue command_queue,
+ cl_uint num_mem_objects,
+ const cl_mem * mem_objects,
+ cl_mem_migration_flags flags,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ /* So far, we just support 1 device and no subdevice. So all the command queues
+ belong to the small context. There is no need to migrate the mem objects by now. */
+ cl_int err = CL_SUCCESS;
+ cl_uint i = 0;
+ enqueue_data *data, defer_enqueue_data = { 0 };
+
+ if (!flags & CL_MIGRATE_MEM_OBJECT_HOST)
+ CHECK_QUEUE(command_queue);
+
+ if (num_mem_objects == 0 || mem_objects == NULL) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (flags && flags & ~(CL_MIGRATE_MEM_OBJECT_HOST |
+ CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ for (i = 0; i < num_mem_objects; i++) {
+ CHECK_MEM(mem_objects[i]);
+ if (mem_objects[i]->ctx != command_queue->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+ }
+
+ /* really nothing to do, fill the event. */
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+ data = &defer_enqueue_data;
+ data->type = EnqueueMigrateMemObj;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(event ? *event : NULL, data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueNDRangeKernel(cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t * global_work_offset,
+ const size_t * global_work_size,
+ const size_t * local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ size_t fixed_global_off[] = {0,0,0};
+ size_t fixed_global_sz[] = {1,1,1};
+ size_t fixed_local_sz[] = {1,1,1};
+ cl_int err = CL_SUCCESS;
+ cl_uint i;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_KERNEL(kernel);
+
+ /* Check number of dimensions we have */
+ if (UNLIKELY(work_dim == 0 || work_dim > 3)) {
+ err = CL_INVALID_WORK_DIMENSION;
+ goto error;
+ }
+
+ /* We need a work size per dimension */
+ if (UNLIKELY(global_work_size == NULL)) {
+ err = CL_INVALID_GLOBAL_WORK_SIZE;
+ goto error;
+ }
+
+ if (global_work_offset != NULL)
+ for (i = 0; i < work_dim; ++i) {
+ if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) {
+ err = CL_INVALID_GLOBAL_OFFSET;
+ goto error;
+ }
+ }
+
+ /* Local sizes must be non-null and divide global sizes */
+ if (local_work_size != NULL)
+ for (i = 0; i < work_dim; ++i)
+ if (UNLIKELY(local_work_size[i] == 0 || global_work_size[i] % local_work_size[i])) {
+ err = CL_INVALID_WORK_GROUP_SIZE;
+ goto error;
+ }
+
+ /* Queue and kernel must share the same context */
+ assert(kernel->program);
+ if (command_queue->ctx != kernel->program->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+
+ /* XXX No event right now */
+ //FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
+ //FATAL_IF(event_wait_list != NULL, "Events are not supported");
+ //FATAL_IF(event != NULL, "Events are not supported");
+
+ if (local_work_size != NULL) {
+ for (i = 0; i < work_dim; ++i)
+ fixed_local_sz[i] = local_work_size[i];
+ } else {
+ uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
+ for (i = 0; i< work_dim; i++) {
+ for (j = maxDimSize; j > 1; j--) {
+ if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
+ fixed_local_sz[i] = j;
+ maxGroupSize = maxGroupSize /j;
+ maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
+ break; //choose next work_dim
+ }
+ }
+ }
+ }
+
+ if (global_work_size != NULL)
+ for (i = 0; i < work_dim; ++i)
+ fixed_global_sz[i] = global_work_size[i];
+ if (global_work_offset != NULL)
+ for (i = 0; i < work_dim; ++i)
+ fixed_global_off[i] = global_work_offset[i];
+
+ if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) {
+ if (fixed_local_sz[0] != kernel->compile_wg_sz[0]
+ || fixed_local_sz[1] != kernel->compile_wg_sz[1]
+ || fixed_local_sz[2] != kernel->compile_wg_sz[2])
+ {
+ err = CL_INVALID_WORK_GROUP_SIZE;
+ goto error;
+ }
+ }
+
+ /* Do device specific checks are enqueue the kernel */
+ err = cl_command_queue_ND_range(command_queue,
+ kernel,
+ work_dim,
+ fixed_global_off,
+ fixed_global_sz,
+ fixed_local_sz);
+ if(err != CL_SUCCESS)
+ goto error;
+
+ data = &no_wait_data;
+ data->type = EnqueueNDRangeKernel;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
+ err = cl_command_queue_flush(command_queue);
+ }
+
+ if(b_output_kernel_perf)
+ {
+ if(kernel->program->build_opts != NULL)
+ time_end(command_queue->ctx, cl_kernel_get_name(kernel), kernel->program->build_opts, command_queue);
+ else
+ time_end(command_queue->ctx, cl_kernel_get_name(kernel), "", command_queue);
+ }
+error:
+ return err;
+}
+
+cl_int
+clEnqueueTask(cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ const size_t global_size[3] = {1, 0, 0};
+ const size_t local_size[3] = {1, 0, 0};
+
+ return clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_size, local_size,
+ num_events_in_wait_list, event_wait_list, event);
+}
+
+cl_int
+clEnqueueNativeKernel(cl_command_queue command_queue,
+ void (*user_func)(void *),
+ void * args,
+ size_t cb_args,
+ cl_uint num_mem_objects,
+ const cl_mem * mem_list,
+ const void ** args_mem_loc,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ cl_int err = CL_SUCCESS;
+ void *new_args = NULL;
+ enqueue_data *data, no_wait_data = { 0 };
+ cl_int i;
+
+ if(user_func == NULL ||
+ (args == NULL && cb_args > 0) ||
+ (args == NULL && num_mem_objects ==0) ||
+ (args != NULL && cb_args == 0) ||
+ (num_mem_objects > 0 && (mem_list == NULL || args_mem_loc == NULL)) ||
+ (num_mem_objects == 0 && (mem_list != NULL || args_mem_loc != NULL))) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ //Per spec, need copy args
+ if (cb_args)
+ {
+ new_args = malloc(cb_args);
+ if (!new_args)
+ {
+ err = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ memcpy(new_args, args, cb_args);
+
+ for (i=0; i<num_mem_objects; ++i)
+ {
+ CHECK_MEM(mem_list[i]);
+ args_mem_loc[i] = new_args + (args_mem_loc[i] - args); //change to new args
+ }
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueNativeKernel;
+ data->mem_list = mem_list;
+ data->ptr = new_args;
+ data->size = cb_args;
+ data->offset = (size_t)num_mem_objects;
+ data->const_ptr = args_mem_loc;
+ data->user_func = user_func;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_NATIVE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(event ? *event : NULL, data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueMarker(cl_command_queue command_queue,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_QUEUE(command_queue);
+ if(event == NULL) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ cl_event_marker_with_wait_list(command_queue, 0, NULL, event);
+error:
+ return err;
+}
+
+cl_int
+clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_QUEUE(command_queue);
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+
+ cl_event_marker_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event);
+error:
+ return err;
+}
+
+cl_int
+clEnqueueWaitForEvents(cl_command_queue command_queue,
+ cl_uint num_events,
+ const cl_event * event_list)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_QUEUE(command_queue);
+ err = clWaitForEvents(num_events, event_list);
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueBarrier(cl_command_queue command_queue)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_QUEUE(command_queue);
+
+ cl_event_barrier_with_wait_list(command_queue, 0, NULL, NULL);
+
+error:
+ return err;
+}
+
+cl_int
+clEnqueueBarrierWithWaitList(cl_command_queue command_queue,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_QUEUE(command_queue);
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+
+ cl_event_barrier_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event);
+error:
+ return err;
+}
+
+#define EXTFUNC(x) \
+ if (strcmp(#x, func_name) == 0) \
+ return (void *)x;
+
+static void*
+internal_clGetExtensionFunctionAddress(const char *func_name)
+{
+ if (func_name == NULL)
+ return NULL;
+#ifdef HAS_OCLIcd
+ /* cl_khr_icd */
+ EXTFUNC(clIcdGetPlatformIDsKHR)
+#endif
+ EXTFUNC(clCreateProgramWithLLVMIntel)
+ EXTFUNC(clGetGenVersionIntel)
+ EXTFUNC(clMapBufferIntel)
+ EXTFUNC(clUnmapBufferIntel)
+ EXTFUNC(clMapBufferGTTIntel)
+ EXTFUNC(clUnmapBufferGTTIntel)
+ EXTFUNC(clPinBufferIntel)
+ EXTFUNC(clUnpinBufferIntel)
+ EXTFUNC(clReportUnfreedIntel)
+ EXTFUNC(clCreateBufferFromLibvaIntel)
+ EXTFUNC(clCreateImageFromLibvaIntel)
+ EXTFUNC(clGetMemObjectFdIntel)
+ return NULL;
+}
+
+void*
+clGetExtensionFunctionAddress(const char *func_name)
+{
+ return internal_clGetExtensionFunctionAddress(func_name);
+}
+
+void*
+clGetExtensionFunctionAddressForPlatform(cl_platform_id platform,
+ const char *func_name)
+{
+ if (UNLIKELY(platform != NULL && platform != intel_platform))
+ return NULL;
+ return internal_clGetExtensionFunctionAddress(func_name);
+}
+
+#undef EXTFUNC
+
+cl_int
+clReportUnfreedIntel(void)
+{
+ return cl_report_unfreed();
+}
+
+void*
+clMapBufferIntel(cl_mem mem, cl_int *errcode_ret)
+{
+ void *ptr = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_MEM (mem);
+ ptr = cl_mem_map(mem);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return ptr;
+}
+
+cl_int
+clUnmapBufferIntel(cl_mem mem)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_MEM (mem);
+ err = cl_mem_unmap(mem);
+error:
+ return err;
+}
+
+void*
+clMapBufferGTTIntel(cl_mem mem, cl_int *errcode_ret)
+{
+ void *ptr = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_MEM (mem);
+ ptr = cl_mem_map_gtt(mem);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return ptr;
+}
+
+cl_int
+clUnmapBufferGTTIntel(cl_mem mem)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_MEM (mem);
+ err = cl_mem_unmap_gtt(mem);
+error:
+ return err;
+}
+
+cl_int
+clPinBufferIntel(cl_mem mem)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_MEM (mem);
+ cl_mem_pin(mem);
+error:
+ return err;
+}
+
+cl_int
+clUnpinBufferIntel(cl_mem mem)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_MEM (mem);
+ cl_mem_unpin(mem);
+error:
+ return err;
+}
+
+cl_int
+clGetGenVersionIntel(cl_device_id device, cl_int *ver)
+{
+ return cl_device_get_version(device, ver);
+}
+
+cl_program
+clCreateProgramWithLLVMIntel(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * devices,
+ const char * filename,
+ cl_int * errcode_ret)
+{
+ return cl_program_create_from_llvm(context,
+ num_devices,
+ devices,
+ filename,
+ errcode_ret);
+}
+
+cl_mem
+clCreateBufferFromLibvaIntel(cl_context context,
+ unsigned int bo_name,
+ cl_int *errorcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+
+ mem = cl_mem_new_libva_buffer(context, bo_name, &err);
+
+error:
+ if (errorcode_ret)
+ *errorcode_ret = err;
+ return mem;
+}
+
+cl_mem
+clCreateImageFromLibvaIntel(cl_context context,
+ const cl_libva_image *info,
+ cl_int *errorcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+
+ if (!info) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ mem = cl_mem_new_libva_image(context,
+ info->bo_name, info->offset, info->width, info->height,
+ info->fmt, info->row_pitch,
+ &err);
+
+error:
+ if (errorcode_ret)
+ *errorcode_ret = err;
+ return mem;
+}
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectFdIntel(cl_context context,
+ cl_mem memobj,
+ int* fd)
+{
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+ CHECK_MEM (memobj);
+
+ err = cl_mem_get_fd(memobj, fd);
+
+error:
+ return err;
+}
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
new file mode 100644
index 0000000..0be37a7
--- /dev/null
+++ b/src/cl_command_queue.c
@@ -0,0 +1,622 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "program.h" // for BTI_MAX_IMAGE_NUM
+#include "cl_command_queue.h"
+#include "cl_context.h"
+#include "cl_program.h"
+#include "cl_kernel.h"
+#include "cl_device_id.h"
+#include "cl_mem.h"
+#include "cl_utils.h"
+#include "cl_thread.h"
+#include "cl_alloc.h"
+#include "cl_driver.h"
+#include "cl_khr_icd.h"
+#include "cl_event.h"
+#include "performance.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+LOCAL cl_command_queue
+cl_command_queue_new(cl_context ctx)
+{
+ cl_command_queue queue = NULL;
+
+ assert(ctx);
+ TRY_ALLOC_NO_ERR (queue, CALLOC(struct _cl_command_queue));
+ SET_ICD(queue->dispatch)
+ queue->magic = CL_MAGIC_QUEUE_HEADER;
+ queue->ref_n = 1;
+ queue->ctx = ctx;
+ if ((queue->thread_data = cl_thread_data_create()) == NULL) {
+ goto error;
+ }
+
+ /* Append the command queue in the list */
+ pthread_mutex_lock(&ctx->queue_lock);
+ queue->next = ctx->queues;
+ if (ctx->queues != NULL)
+ ctx->queues->prev = queue;
+ ctx->queues = queue;
+ pthread_mutex_unlock(&ctx->queue_lock);
+
+ /* The queue also belongs to its context */
+ cl_context_add_ref(ctx);
+
+exit:
+ return queue;
+error:
+ cl_command_queue_delete(queue);
+ queue = NULL;
+ goto exit;
+}
+
+LOCAL void
+cl_command_queue_delete(cl_command_queue queue)
+{
+ assert(queue);
+ if (atomic_dec(&queue->ref_n) != 1) return;
+
+ // If there is a valid last event, we need to give it a chance to
+ // call the call-back function.
+ if (queue->last_event && queue->last_event->user_cb)
+ cl_event_update_status(queue->last_event, 1);
+ /* Remove it from the list */
+ assert(queue->ctx);
+ pthread_mutex_lock(&queue->ctx->queue_lock);
+ if (queue->prev)
+ queue->prev->next = queue->next;
+ if (queue->next)
+ queue->next->prev = queue->prev;
+ if (queue->ctx->queues == queue)
+ queue->ctx->queues = queue->next;
+ pthread_mutex_unlock(&queue->ctx->queue_lock);
+ if (queue->fulsim_out != NULL) {
+ cl_mem_delete(queue->fulsim_out);
+ queue->fulsim_out = NULL;
+ }
+
+ cl_thread_data_destroy(queue);
+ queue->thread_data = NULL;
+ cl_mem_delete(queue->perf);
+ cl_context_delete(queue->ctx);
+ cl_free(queue->wait_events);
+ queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+ cl_free(queue);
+}
+
+LOCAL void
+cl_command_queue_add_ref(cl_command_queue queue)
+{
+ atomic_inc(&queue->ref_n);
+}
+
+static void
+set_image_info(char *curbe,
+ struct ImageInfo * image_info,
+ struct _cl_mem_image *image)
+{
+ if (image_info->wSlot >= 0)
+ *(uint32_t*)(curbe + image_info->wSlot) = image->w;
+ if (image_info->hSlot >= 0)
+ *(uint32_t*)(curbe + image_info->hSlot) = image->h;
+ if (image_info->depthSlot >= 0)
+ *(uint32_t*)(curbe + image_info->depthSlot) = image->depth;
+ if (image_info->channelOrderSlot >= 0)
+ *(uint32_t*)(curbe + image_info->channelOrderSlot) = image->fmt.image_channel_order;
+ if (image_info->dataTypeSlot >= 0)
+ *(uint32_t*)(curbe + image_info->dataTypeSlot) = image->fmt.image_channel_data_type;
+}
+
+LOCAL cl_int
+cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
+{
+ uint32_t i;
+ GET_QUEUE_THREAD_GPGPU(queue);
+
+ for (i = 0; i < k->image_sz; i++) {
+ int id = k->images[i].arg_idx;
+ struct _cl_mem_image *image;
+ assert(interp_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
+ image = cl_mem_image(k->args[id].mem);
+ set_image_info(k->curbe, &k->images[i], image);
+ cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
+ image->intel_fmt, image->image_type,
+ image->w, image->h, image->depth,
+ image->row_pitch, (cl_gpgpu_tiling)image->tiling);
+ // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
+ // on demand.
+ if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_MAX_IMAGE_NUM, image->base.bo, image->offset,
+ image->intel_fmt, image->image_type,
+ image->w, image->h, image->depth,
+ image->row_pitch, image->tiling);
+ }
+ return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
+{
+ GET_QUEUE_THREAD_GPGPU(queue);
+
+ /* Bind all user buffers (given by clSetKernelArg) */
+ uint32_t i;
+ enum gbe_arg_type arg_type; /* kind of argument */
+ for (i = 0; i < k->arg_n; ++i) {
+ uint32_t offset; // location of the address in the curbe
+ arg_type = interp_kernel_get_arg_type(k->opaque, i);
+ if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
+ continue;
+ offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
+ if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
+ struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
+ cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
+ } else {
+ cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+
+#if USE_FULSIM
+extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
+extern void drm_intel_bufmgr_gem_set_aubfile(cl_buffer_mgr, FILE*);
+extern void aub_exec_dump_raw_file(cl_buffer, size_t offset, size_t sz);
+
+static void
+cl_run_fulsim(void)
+{
+ const char *run_it = getenv("OCL_SIMULATOR");
+ const char *debug_mode = getenv("OCL_FULSIM_DEBUG_MODE");
+ if (run_it == NULL || strcmp(run_it, "1")) return;
+
+#if EMULATE_GEN == 7 /* IVB */
+ if (debug_mode == NULL || strcmp(debug_mode, "1"))
+ system("wine AubLoad.exe dump.aub -device ivbB0");
+ else
+ system("wine AubLoad.exe dump.aub -device ivbB0 -debug");
+#elif EMULATE_GEN == 75 /* HSW */
+ if (debug_mode == NULL || strcmp(debug_mode, "1"))
+ system("wine AubLoad.exe dump.aub -device hsw.h.a0");
+ else
+ system("wine AubLoad.exe dump.aub -device hsw.h.a0 -debug");
+#else
+#error "Unknown device"
+#endif
+}
+
+/* Each buffer is dump using several chunks of this size */
+static const size_t chunk_sz = 8192u;
+
+static cl_int
+cl_fulsim_dump_all_surfaces(cl_command_queue queue, cl_kernel k)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = NULL;
+ int i;
+ size_t j;
+
+ /* Bind user defined surface */
+ for (i = 0; i < k->arg_n; ++i) {
+ size_t chunk_n, chunk_remainder;
+ if (interp_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR)
+ continue;
+ mem = (cl_mem) k->args[i].mem;
+ CHECK_MEM(mem);
+ chunk_n = cl_buffer_get_size(mem->bo) / chunk_sz;
+ chunk_remainder = cl_buffer_get_size(mem->bo) % chunk_sz;
+ for (j = 0; j < chunk_n; ++j)
+ aub_exec_dump_raw_file(mem->bo, j * chunk_sz, chunk_sz);
+ if (chunk_remainder)
+ aub_exec_dump_raw_file(mem->bo, chunk_n * chunk_sz, chunk_remainder);
+ }
+error:
+ return err;
+}
+
+struct bmphdr {
+ /* 2 bytes of magic here, "BM", total header size is 54 bytes! */
+ int filesize; /* 4 total file size incl header */
+ short as0, as1; /* 8 app specific */
+ int bmpoffset; /* 12 ofset of bmp data */
+ int headerbytes; /* 16 bytes in header from this point (40 actually) */
+ int width; /* 20 */
+ int height; /* 24 */
+ short nplanes; /* 26 no of color planes */
+ short bpp; /* 28 bits/pixel */
+ int compression; /* 32 BI_RGB = 0 = no compression */
+ int sizeraw; /* 36 size of raw bmp file, excluding header, incl padding */
+ int hres; /* 40 horz resolutions pixels/meter */
+ int vres; /* 44 */
+ int npalcolors; /* 48 No of colors in palette */
+ int nimportant; /* 52 No of important colors */
+ /* raw b, g, r data here, dword aligned per scan line */
+};
+
+static int*
+cl_read_bmp(const char *filename, int *width, int *height)
+{
+ int n;
+ struct bmphdr hdr;
+
+ FILE *fp = fopen(filename, "rb");
+ assert(fp);
+
+ char magic[2];
+ n = fread(&magic[0], 1, 2, fp);
+ assert(n == 2 && magic[0] == 'B' && magic[1] == 'M');
+
+ n = fread(&hdr, 1, sizeof(hdr), fp);
+ assert(n == sizeof(hdr));
+
+ assert(hdr.width > 0 &&
+ hdr.height > 0 &&
+ hdr.nplanes == 1
+ && hdr.compression == 0);
+
+ int *rgb32 = (int *) cl_malloc(hdr.width * hdr.height * sizeof(int));
+ assert(rgb32);
+ int x, y;
+
+ int *dst = rgb32;
+ for (y = 0; y < hdr.height; y++) {
+ for (x = 0; x < hdr.width; x++) {
+ assert(!feof(fp));
+ int b = (getc(fp) & 0x0ff);
+ int g = (getc(fp) & 0x0ff);
+ int r = (getc(fp) & 0x0ff);
+ *dst++ = (r | (g << 8) | (b << 16) | 0xff000000); /* abgr */
+ }
+ while (x & 3) {
+ getc(fp);
+ x++;
+ }
+ }
+ fclose(fp);
+ *width = hdr.width;
+ *height = hdr.height;
+ return rgb32;
+}
+
+static char*
+cl_read_dump(const char *name, size_t *size)
+{
+ char *raw = NULL, *dump = NULL;
+ size_t i, sz;
+ int w, h;
+ if ((raw = (char*) cl_read_bmp(name, &w, &h)) == NULL)
+ return NULL;
+ sz = w * h;
+ dump = (char*) cl_malloc(sz);
+ assert(dump);
+ for (i = 0; i < sz; ++i)
+ dump[i] = raw[4*i];
+ cl_free(raw);
+ if (size)
+ *size = sz;
+ return dump;
+}
+
+static cl_int
+cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = NULL;
+ char *from = NULL, *to = NULL;
+ size_t size, j, chunk_n, chunk_remainder;
+ int i, curr = 0;
+ /* Bind user defined surface */
+ for (i = 0; i < k->arg_n; ++i) {
+ if (interp_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR)
+ continue;
+ mem = (cl_mem) k->args[i].mem;
+ CHECK_MEM(mem);
+ assert(mem->bo);
+ chunk_n = cl_buffer_get_size(mem->bo) / chunk_sz;
+ chunk_remainder = cl_buffer_get_size(mem->bo) % chunk_sz;
+ to = cl_mem_map(mem);
+ for (j = 0; j < chunk_n; ++j) {
+ char name[256];
+ sprintf(name, "dump%03i.bmp", curr);
+#ifdef NDEBUG
+ from = cl_read_dump(name, NULL);
+#else
+ from = cl_read_dump(name, &size);
+ assert(size == chunk_sz);
+#endif /* NDEBUG */
+ memcpy(to + j*chunk_sz, from, chunk_sz);
+ cl_free(from);
+ curr++;
+ }
+ if (chunk_remainder) {
+ char name[256];
+ sprintf(name, "dump%03i.bmp", curr);
+#ifdef NDEBUG
+ from = cl_read_dump(name, NULL);
+#else
+ from = cl_read_dump(name, &size);
+ assert(size == chunk_remainder);
+#endif /* NDEBUG */
+ memcpy(to + chunk_n*chunk_sz, from, chunk_remainder);
+ cl_free(from);
+ curr++;
+ }
+ cl_mem_unmap(mem);
+ }
+error:
+ return err;
+}
+#endif
+
+extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
+
+static cl_int
+cl_kernel_check_args(cl_kernel k)
+{
+ uint32_t i;
+ for (i = 0; i < k->arg_n; ++i)
+ if (k->args[i].is_set == CL_FALSE)
+ return CL_INVALID_KERNEL_ARGS;
+ return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_command_queue_ND_range(cl_command_queue queue,
+ cl_kernel k,
+ const uint32_t work_dim,
+ const size_t *global_wk_off,
+ const size_t *global_wk_sz,
+ const size_t *local_wk_sz)
+{
+ if(b_output_kernel_perf)
+ time_start(queue->ctx, cl_kernel_get_name(k), queue);
+ const int32_t ver = cl_driver_get_ver(queue->ctx->drv);
+ cl_int err = CL_SUCCESS;
+
+ /* Check that the user did not forget any argument */
+ TRY (cl_kernel_check_args, k);
+
+#if USE_FULSIM
+ cl_buffer_mgr bufmgr = NULL;
+ FILE *file = NULL;
+ const char *run_it = getenv("OCL_SIMULATOR");
+ if (run_it != NULL && strcmp(run_it, "1") == 0) {
+ file = fopen("dump.aub", "wb");
+ FATAL_IF (file == NULL, "Unable to open file dump.aub");
+ bufmgr = cl_context_get_bufmgr(queue->ctx);
+ drm_intel_bufmgr_gem_set_aubfile(bufmgr, file);
+ }
+#endif /* USE_FULSIM */
+
+ if (ver == 7 || ver == 75)
+ TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
+ else
+ FATAL ("Unknown Gen Device");
+
+#if USE_FULSIM
+ if (run_it != NULL && strcmp(run_it, "1") == 0) {
+ TRY (cl_fulsim_dump_all_surfaces, queue, k);
+ drm_intel_bufmgr_gem_stop_aubfile(bufmgr);
+ fclose(file);
+ cl_run_fulsim();
+ TRY (cl_fulsim_read_all_surfaces, queue, k);
+ }
+#endif /* USE_FULSIM */
+
+error:
+ return err;
+}
+
+LOCAL void
+cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
+{
+ size_t global_wk_sz[3];
+ void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz);
+
+ cl_gpgpu_flush(gpgpu);
+
+ if (printf_info && interp_get_printf_num(printf_info)) {
+ void *index_addr = cl_gpgpu_map_printf_buffer(gpgpu, 0);
+ void *buf_addr = NULL;
+ if (interp_get_printf_sizeof_size(printf_info))
+ buf_addr = cl_gpgpu_map_printf_buffer(gpgpu, 1);
+
+ interp_output_printf(printf_info, index_addr, buf_addr, global_wk_sz[0],
+ global_wk_sz[1], global_wk_sz[2]);
+
+ cl_gpgpu_unmap_printf_buffer(gpgpu, 0);
+ if (interp_get_printf_sizeof_size(printf_info))
+ cl_gpgpu_unmap_printf_buffer(gpgpu, 1);
+ }
+
+ if (printf_info) {
+ interp_release_printf_info(printf_info);
+ global_wk_sz[0] = global_wk_sz[1] = global_wk_sz[2] = 0;
+ cl_gpgpu_set_printf_info(gpgpu, NULL, global_wk_sz);
+ }
+}
+
+LOCAL cl_int
+cl_command_queue_flush(cl_command_queue queue)
+{
+ GET_QUEUE_THREAD_GPGPU(queue);
+ cl_command_queue_flush_gpgpu(queue, gpgpu);
+ // As we don't have a deadicate timer thread to take care the possible
+ // event which has a call back function registerred and the event will
+ // be released at the call back function, no other function will access
+ // the event any more. If we don't do this here, we will leak that event
+ // and all the corresponding buffers which is really bad.
+ if (queue->last_event && queue->last_event->user_cb)
+ cl_event_update_status(queue->last_event, 1);
+ if (queue->current_event)
+ cl_event_flush(queue->current_event);
+ cl_invalid_thread_gpgpu(queue);
+ return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_command_queue_finish(cl_command_queue queue)
+{
+ cl_gpgpu_sync(cl_get_thread_batch_buf(queue));
+ return CL_SUCCESS;
+}
+
+#define DEFAULT_WAIT_EVENTS_SIZE 16
+LOCAL void
+cl_command_queue_insert_event(cl_command_queue queue, cl_event event)
+{
+ cl_int i=0;
+ cl_event *new_list;
+
+ assert(queue != NULL);
+ if(queue->wait_events == NULL) {
+ queue->wait_events_size = DEFAULT_WAIT_EVENTS_SIZE;
+ TRY_ALLOC_NO_ERR (queue->wait_events, CALLOC_ARRAY(cl_event, queue->wait_events_size));
+ }
+
+ for(i=0; i<queue->wait_events_num; i++) {
+ if(queue->wait_events[i] == event)
+ return; //is in the wait_events, need to insert
+ }
+
+ if(queue->wait_events_num < queue->wait_events_size) {
+ queue->wait_events[queue->wait_events_num++] = event;
+ return;
+ }
+
+ //wait_events_num == wait_events_size, array is full
+ queue->wait_events_size *= 2;
+ TRY_ALLOC_NO_ERR (new_list, CALLOC_ARRAY(cl_event, queue->wait_events_size));
+ memcpy(new_list, queue->wait_events, sizeof(cl_event)*queue->wait_events_num);
+ cl_free(queue->wait_events);
+ queue->wait_events = new_list;
+ queue->wait_events[queue->wait_events_num++] = event;
+ return;
+
+exit:
+ return;
+error:
+ if(queue->wait_events)
+ cl_free(queue->wait_events);
+ queue->wait_events = NULL;
+ queue->wait_events_size = 0;
+ queue->wait_events_num = 0;
+ goto exit;
+
+}
+
+LOCAL void
+cl_command_queue_remove_event(cl_command_queue queue, cl_event event)
+{
+ cl_int i=0;
+
+ assert(queue->wait_events);
+ for(i=0; i<queue->wait_events_num; i++) {
+ if(queue->wait_events[i] == event)
+ break;
+ }
+
+ if(i == queue->wait_events_num)
+ return;
+
+ if(i == queue->wait_events_num - 1) {
+ queue->wait_events[i] = NULL;
+ } else {
+ for(; i<queue->wait_events_num-1; i++) {
+ queue->wait_events[i] = queue->wait_events[i+1];
+ }
+ }
+ queue->wait_events_num -= 1;
+}
+
+#define DEFAULT_WAIT_EVENTS_SIZE 16
+LOCAL void
+cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event)
+{
+ cl_int i=0;
+ cl_event *new_list;
+
+ assert(queue != NULL);
+ if(queue->barrier_events == NULL) {
+ queue->barrier_events_size = DEFAULT_WAIT_EVENTS_SIZE;
+ TRY_ALLOC_NO_ERR (queue->barrier_events, CALLOC_ARRAY(cl_event, queue->barrier_events_size));
+ }
+
+ for(i=0; i<queue->barrier_events_num; i++) {
+ if(queue->barrier_events[i] == event)
+ return; //is in the barrier_events, need to insert
+ }
+
+ if(queue->barrier_events_num < queue->barrier_events_size) {
+ queue->barrier_events[queue->barrier_events_num++] = event;
+ return;
+ }
+
+ //barrier_events_num == barrier_events_size, array is full
+ queue->barrier_events_size *= 2;
+ TRY_ALLOC_NO_ERR (new_list, CALLOC_ARRAY(cl_event, queue->barrier_events_size));
+ memcpy(new_list, queue->barrier_events, sizeof(cl_event)*queue->barrier_events_num);
+ cl_free(queue->barrier_events);
+ queue->barrier_events = new_list;
+ queue->barrier_events[queue->barrier_events_num++] = event;
+ return;
+
+exit:
+ return;
+error:
+ if(queue->barrier_events)
+ cl_free(queue->barrier_events);
+ queue->barrier_events = NULL;
+ queue->barrier_events_size = 0;
+ queue->barrier_events_num = 0;
+ goto exit;
+
+}
+
+LOCAL void
+cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event)
+{
+ cl_int i=0;
+
+ if(queue->barrier_events_num == 0)
+ return;
+
+ for(i=0; i<queue->barrier_events_num; i++) {
+ if(queue->barrier_events[i] == event)
+ break;
+ }
+
+ if(i == queue->barrier_events_num)
+ return;
+
+ if(i == queue->barrier_events_num - 1) {
+ queue->barrier_events[i] = NULL;
+ } else {
+ for(; i<queue->barrier_events_num-1; i++) {
+ queue->barrier_events[i] = queue->barrier_events[i+1];
+ }
+ }
+ queue->barrier_events_num -= 1;
+}
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
new file mode 100644
index 0000000..bd70f25
--- /dev/null
+++ b/src/cl_command_queue.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_COMMAND_QUEUE_H__
+#define __CL_COMMAND_QUEUE_H__
+
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "cl_thread.h"
+#include "CL/cl.h"
+#include <stdint.h>
+
+struct intel_gpgpu;
+
+/* Basically, this is a (kind-of) batch buffer */
+struct _cl_command_queue {
+ DEFINE_ICD(dispatch)
+ uint64_t magic; /* To identify it as a command queue */
+ volatile int ref_n; /* We reference count this object */
+ cl_context ctx; /* Its parent context */
+ cl_event* barrier_events; /* Point to array of non-complete user events that block this command queue */
+ cl_int barrier_events_num; /* Number of Non-complete user events */
+ cl_int barrier_events_size; /* The size of array that wait_events point to */
+ cl_event* wait_events; /* Point to array of non-complete user events that block this command queue */
+ cl_int wait_events_num; /* Number of Non-complete user events */
+ cl_int wait_events_size; /* The size of array that wait_events point to */
+ cl_event last_event; /* The last event in the queue, for enqueue mark used */
+ cl_event current_event; /* Current event. */
+ cl_command_queue_properties props; /* Queue properties */
+ cl_command_queue prev, next; /* We chain the command queues together */
+ void *thread_data; /* Used to store thread context data */
+ cl_mem perf; /* Where to put the perf counters */
+ cl_mem fulsim_out; /* Fulsim will output this buffer */
+};
+
+/* The macro to get the thread specified gpgpu struct. */
+#define GET_QUEUE_THREAD_GPGPU(queue) \
+ cl_gpgpu gpgpu = queue ? cl_get_thread_gpgpu(queue) : NULL; \
+ if (queue) \
+ assert(gpgpu);
+
+/* Allocate and initialize a new command queue. Also insert it in the list of
+ * command queue in the associated context
+ */
+extern cl_command_queue cl_command_queue_new(cl_context);
+
+/* Destroy and deallocate the command queue */
+extern void cl_command_queue_delete(cl_command_queue);
+
+/* Keep one more reference on the queue */
+extern void cl_command_queue_add_ref(cl_command_queue);
+
+/* Map ND range kernel from OCL API */
+extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
+ cl_kernel ker,
+ const uint32_t work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size);
+
+/* The memory object where to report the performance */
+extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem);
+
+/* Fulsim will dump this buffer (mostly to check its consistency */
+cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue, cl_mem);
+
+/* Flush for the command queue */
+extern cl_int cl_command_queue_flush(cl_command_queue);
+
+/* Flush for the specified gpgpu */
+extern void cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu);
+
+/* Wait for the completion of the command queue */
+extern cl_int cl_command_queue_finish(cl_command_queue);
+
+/* Bind all the surfaces in the GPGPU state */
+extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
+
+/* Bind all the image surfaces in the GPGPU state */
+extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
+
+/* Insert a user event to command's wait_events */
+extern void cl_command_queue_insert_event(cl_command_queue, cl_event);
+
+/* Remove a user event from command's wait_events */
+extern void cl_command_queue_remove_event(cl_command_queue, cl_event);
+
+extern void cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event);
+
+extern void cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event);
+
+#endif /* __CL_COMMAND_QUEUE_H__ */
+
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
new file mode 100644
index 0000000..330f0f9
--- /dev/null
+++ b/src/cl_command_queue_gen7.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_command_queue.h"
+#include "cl_context.h"
+#include "cl_program.h"
+#include "cl_kernel.h"
+#include "cl_device_id.h"
+#include "cl_mem.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#define MAX_GROUP_SIZE_IN_HALFSLICE 512
+static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+128; }
+
+/* "Varing" payload is the part of the curbe that changes accross threads in the
+ * same work group. Right now, it consists in local IDs and block IPs
+ */
+static cl_int
+cl_set_varying_payload(const cl_kernel ker,
+ char *data,
+ const size_t *local_wk_sz,
+ size_t simd_sz,
+ size_t cst_sz,
+ size_t thread_n)
+{
+ uint32_t *ids[3] = {NULL,NULL,NULL};
+ uint16_t *block_ips = NULL;
+ size_t i, j, k, curr = 0;
+ int32_t id_offset[3], ip_offset;
+ cl_int err = CL_SUCCESS;
+
+ id_offset[0] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 0);
+ id_offset[1] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Y, 0);
+ id_offset[2] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Z, 0);
+ ip_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_BLOCK_IP, 0);
+ assert(id_offset[0] >= 0 &&
+ id_offset[1] >= 0 &&
+ id_offset[2] >= 0 &&
+ ip_offset >= 0);
+
+ TRY_ALLOC(ids[0], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
+ TRY_ALLOC(ids[1], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
+ TRY_ALLOC(ids[2], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
+ TRY_ALLOC(block_ips, (uint16_t*) alloca(sizeof(uint16_t)*thread_n*simd_sz));
+
+ /* 0xffff means that the lane is inactivated */
+ memset(block_ips, 0xff, sizeof(uint16_t)*thread_n*simd_sz);
+
+ /* Compute the IDs and the block IPs */
+ for (k = 0; k < local_wk_sz[2]; ++k)
+ for (j = 0; j < local_wk_sz[1]; ++j)
+ for (i = 0; i < local_wk_sz[0]; ++i, ++curr) {
+ ids[0][curr] = i;
+ ids[1][curr] = j;
+ ids[2][curr] = k;
+ block_ips[curr] = 0;
+ }
+
+ /* Copy them to the curbe buffer */
+ curr = 0;
+ for (i = 0; i < thread_n; ++i, data += cst_sz) {
+ uint32_t *ids0 = (uint32_t *) (data + id_offset[0]);
+ uint32_t *ids1 = (uint32_t *) (data + id_offset[1]);
+ uint32_t *ids2 = (uint32_t *) (data + id_offset[2]);
+ uint16_t *ips = (uint16_t *) (data + ip_offset);
+ for (j = 0; j < simd_sz; ++j, ++curr) {
+ ids0[j] = ids[0][curr];
+ ids1[j] = ids[1][curr];
+ ids2[j] = ids[2][curr];
+ ips[j] = block_ips[curr];
+ }
+ }
+
+error:
+ return err;
+}
+
+static int
+cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
+{
+ /* calculate constant buffer size
+ * we need raw_size & aligned_size
+ */
+ GET_QUEUE_THREAD_GPGPU(queue);
+ int32_t arg;
+ size_t offset = 0;
+ uint32_t raw_size = 0, aligned_size =0;
+ gbe_program prog = ker->program->opaque;
+ const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
+ size_t global_const_size = interp_program_get_global_constant_size(prog);
+ aligned_size = raw_size = global_const_size;
+ /* Reserve 8 bytes to get rid of 0 address */
+ if(global_const_size == 0) aligned_size = 8;
+
+ for (arg = 0; arg < arg_n; ++arg) {
+ const enum gbe_arg_type type = interp_kernel_get_arg_type(ker->opaque, arg);
+ if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+ uint32_t alignment = interp_kernel_get_arg_align(ker->opaque, arg);
+ assert(alignment != 0);
+ cl_mem mem = ker->args[arg].mem;
+ raw_size += mem->size;
+ aligned_size = ALIGN(aligned_size, alignment);
+ aligned_size += mem->size;
+ }
+ }
+ if(raw_size == 0)
+ return 0;
+
+ cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, aligned_size, BTI_CONSTANT);
+ if (bo == NULL)
+ return -1;
+ cl_buffer_map(bo, 1);
+ char * cst_addr = cl_buffer_get_virtual(bo);
+ if (cst_addr == NULL)
+ return -1;
+
+ /* upload the global constant data */
+ if (global_const_size > 0) {
+ interp_program_get_global_constant_data(prog, (char*)(cst_addr+offset));
+ offset += global_const_size;
+ }
+
+ /* reserve 8 bytes to get rid of 0 address */
+ if(global_const_size == 0) {
+ offset = 8;
+ }
+
+ /* upload constant buffer argument */
+ int32_t curbe_offset = 0;
+ for (arg = 0; arg < arg_n; ++arg) {
+ const enum gbe_arg_type type = interp_kernel_get_arg_type(ker->opaque, arg);
+ if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+ cl_mem mem = ker->args[arg].mem;
+ uint32_t alignment = interp_kernel_get_arg_align(ker->opaque, arg);
+ offset = ALIGN(offset, alignment);
+ curbe_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
+ assert(curbe_offset >= 0);
+ *(uint32_t *) (ker->curbe + curbe_offset) = offset;
+
+ cl_buffer_map(mem->bo, 1);
+ void * addr = cl_buffer_get_virtual(mem->bo);
+ memcpy(cst_addr + offset, addr, mem->size);
+ cl_buffer_unmap(mem->bo);
+ offset += mem->size;
+ }
+ }
+ cl_buffer_unmap(bo);
+ return 0;
+}
+
+/* Will return the total amount of slm used */
+static int32_t
+cl_curbe_fill(cl_kernel ker,
+ const uint32_t work_dim,
+ const size_t *global_wk_off,
+ const size_t *global_wk_sz,
+ const size_t *local_wk_sz,
+ size_t thread_n)
+{
+ int32_t offset;
+#define UPLOAD(ENUM, VALUE) \
+ if ((offset = interp_kernel_get_curbe_offset(ker->opaque, ENUM, 0)) >= 0) \
+ *((uint32_t *) (ker->curbe + offset)) = VALUE;
+ UPLOAD(GBE_CURBE_LOCAL_SIZE_X, local_wk_sz[0]);
+ UPLOAD(GBE_CURBE_LOCAL_SIZE_Y, local_wk_sz[1]);
+ UPLOAD(GBE_CURBE_LOCAL_SIZE_Z, local_wk_sz[2]);
+ UPLOAD(GBE_CURBE_GLOBAL_SIZE_X, global_wk_sz[0]);
+ UPLOAD(GBE_CURBE_GLOBAL_SIZE_Y, global_wk_sz[1]);
+ UPLOAD(GBE_CURBE_GLOBAL_SIZE_Z, global_wk_sz[2]);
+ UPLOAD(GBE_CURBE_GLOBAL_OFFSET_X, global_wk_off[0]);
+ UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Y, global_wk_off[1]);
+ UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Z, global_wk_off[2]);
+ UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0]/local_wk_sz[0]);
+ UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
+ UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
+ UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
+ UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
+#undef UPLOAD
+
+ /* Write identity for the stack pointer. This is required by the stack pointer
+ * computation in the kernel
+ */
+ if ((offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_POINTER, 0)) >= 0) {
+ const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
+ uint32_t *stackptr = (uint32_t *) (ker->curbe + offset);
+ int32_t i;
+ for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
+ }
+ /* Handle the various offsets to SLM */
+ const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
+ int32_t arg, slm_offset = interp_kernel_get_slm_size(ker->opaque);
+ ker->local_mem_sz = 0;
+ for (arg = 0; arg < arg_n; ++arg) {
+ const enum gbe_arg_type type = interp_kernel_get_arg_type(ker->opaque, arg);
+ if (type != GBE_ARG_LOCAL_PTR)
+ continue;
+ uint32_t align = interp_kernel_get_arg_align(ker->opaque, arg);
+ assert(align != 0);
+ slm_offset = ALIGN(slm_offset, align);
+ offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
+ assert(offset >= 0);
+ uint32_t *slmptr = (uint32_t *) (ker->curbe + offset);
+ *slmptr = slm_offset;
+ slm_offset += ker->args[arg].local_sz;
+ ker->local_mem_sz += ker->args[arg].local_sz;
+ }
+ return slm_offset;
+}
+
+static void
+cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
+{
+ cl_context ctx = ker->program->ctx;
+ cl_device_id device = ctx->device;
+ const int32_t per_lane_stack_sz = ker->stack_size;
+ const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
+ const int32_t sub_value = GBE_STACK_BUFFER;
+ const int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
+ int32_t stack_sz = per_lane_stack_sz;
+
+ /* No stack required for this kernel */
+ if (per_lane_stack_sz == 0)
+ return;
+
+ /* The stack size is given for *each* SIMD lane. So, we accordingly compute
+ * the size we need for the complete machine
+ */
+ assert(offset >= 0);
+ stack_sz *= interp_kernel_get_simd_width(ker->opaque);
+ stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
+ /* Because HSW calc stack offset per thread is relative with half slice, when
+ thread schedule in half slice is not balance, would out of bound. Because
+ the max half slice is 4 in GT4, multiply stack size with 4 for safe.
+ */
+ if(cl_driver_get_ver(ctx->drv) == 75)
+ stack_sz *= 4;
+ cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);
+}
+
+static int
+cl_bind_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num, size_t global_sz) {
+ int32_t value = GBE_CURBE_PRINTF_INDEX_POINTER;
+ int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
+ size_t buf_size = global_sz * sizeof(int) * printf_num;
+ if (offset > 0) {
+ if (cl_gpgpu_set_printf_buffer(gpgpu, 0, buf_size, offset, interp_get_printf_indexbuf_bti(printf_info)) != 0)
+ return -1;
+ }
+
+ value = GBE_CURBE_PRINTF_BUF_POINTER;
+ offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
+ buf_size = interp_get_printf_sizeof_size(printf_info) * global_sz;
+ if (offset > 0) {
+ if (cl_gpgpu_set_printf_buffer(gpgpu, 1, buf_size, offset, interp_get_printf_buf_bti(printf_info)) != 0)
+ return -1;
+ }
+ return 0;
+}
+
+LOCAL cl_int
+cl_command_queue_ND_range_gen7(cl_command_queue queue,
+ cl_kernel ker,
+ const uint32_t work_dim,
+ const size_t *global_wk_off,
+ const size_t *global_wk_sz,
+ const size_t *local_wk_sz)
+{
+ GET_QUEUE_THREAD_GPGPU(queue);
+ cl_context ctx = queue->ctx;
+ char *final_curbe = NULL; /* Includes them and one sub-buffer per group */
+ cl_gpgpu_kernel kernel;
+ const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
+ size_t i, batch_sz = 0u, local_sz = 0u;
+ size_t cst_sz = ker->curbe_sz= interp_kernel_get_curbe_size(ker->opaque);
+ int32_t scratch_sz = interp_kernel_get_scratch_size(ker->opaque);
+ size_t thread_n = 0u;
+ int printf_num = 0;
+ cl_int err = CL_SUCCESS;
+ size_t global_size = global_wk_sz[0] * global_wk_sz[1] * global_wk_sz[2];
+ void* printf_info = NULL;
+
+ /* Setup kernel */
+ kernel.name = "KERNEL";
+ kernel.grf_blocks = 128;
+ kernel.bo = ker->bo;
+ kernel.barrierID = 0;
+ kernel.slm_sz = 0;
+ kernel.use_slm = interp_kernel_use_slm(ker->opaque);
+
+ /* Compute the number of HW threads we need */
+ TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
+ kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
+ kernel.curbe_sz = cst_sz;
+
+ if (scratch_sz > ker->program->ctx->device->scratch_mem_size) {
+ fprintf(stderr, "Beignet: Out of scratch memory %d.\n", scratch_sz);
+ return CL_OUT_OF_RESOURCES;
+ }
+ /* Curbe step 1: fill the constant urb buffer data shared by all threads */
+ if (ker->curbe) {
+ kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
+ if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) {
+ fprintf(stderr, "Beignet: Out of shared local memory %d.\n", kernel.slm_sz);
+ return CL_OUT_OF_RESOURCES;
+ }
+ }
+
+ printf_info = interp_dup_printfset(ker->opaque);
+ cl_gpgpu_set_printf_info(gpgpu, printf_info, (size_t *)global_wk_sz);
+
+ /* Setup the kernel */
+ if (queue->props & CL_QUEUE_PROFILING_ENABLE)
+ err = cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 1);
+ else
+ err = cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 0);
+ if (err != 0)
+ goto error;
+ printf_num = interp_get_printf_num(printf_info);
+ if (printf_num) {
+ if (cl_bind_printf(gpgpu, ker, printf_info, printf_num, global_size) != 0)
+ goto error;
+ }
+
+ /* Bind user buffers */
+ cl_command_queue_bind_surface(queue, ker);
+ /* Bind user images */
+ cl_command_queue_bind_image(queue, ker);
+ /* Bind all samplers */
+ cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
+
+ if (cl_gpgpu_set_scratch(gpgpu, scratch_sz) != 0)
+ goto error;
+
+ /* Bind a stack if needed */
+ cl_bind_stack(gpgpu, ker);
+
+ if (cl_upload_constant_buffer(queue, ker) != 0)
+ goto error;
+
+ cl_gpgpu_states_setup(gpgpu, &kernel);
+
+ /* Curbe step 2. Give the localID and upload it to video memory */
+ if (ker->curbe) {
+ assert(cst_sz > 0);
+ TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
+ for (i = 0; i < thread_n; ++i) {
+ memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
+ }
+ TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
+ if (cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz) != 0)
+ goto error;
+ }
+
+ /* Start a new batch buffer */
+ batch_sz = cl_kernel_compute_batch_sz(ker);
+ if (cl_gpgpu_batch_reset(gpgpu, batch_sz) != 0)
+ goto error;
+ cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu));
+ cl_gpgpu_batch_start(gpgpu);
+
+ /* Issue the GPGPU_WALKER command */
+ cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+
+ /* Close the batch buffer and submit it */
+ cl_gpgpu_batch_end(gpgpu, 0);
+ return CL_SUCCESS;
+
+error:
+ fprintf(stderr, "error occured. \n");
+ exit(-1);
+ return CL_OUT_OF_RESOURCES;
+}
+
diff --git a/src/cl_context.c b/src/cl_context.c
new file mode 100644
index 0000000..152faf3
--- /dev/null
+++ b/src/cl_context.c
@@ -0,0 +1,372 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_platform_id.h"
+#include "cl_device_id.h"
+#include "cl_context.h"
+#include "cl_command_queue.h"
+#include "cl_mem.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+#include "cl_driver.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+#include "cl_program.h"
+
+#include "CL/cl.h"
+#include "CL/cl_gl.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>
+
+#define CHECK(var) \
+ if (var) \
+ return CL_INVALID_PROPERTY; \
+ else \
+ var = 1;
+
+static cl_int
+cl_context_properties_process(const cl_context_properties *prop,
+ struct _cl_context_prop *cl_props, cl_uint * prop_len)
+{
+ int set_cl_context_platform = 0,
+ set_cl_gl_context_khr = 0,
+ set_cl_egl_display_khr = 0,
+ set_cl_glx_display_khr = 0,
+ set_cl_wgl_hdc_khr = 0,
+ set_cl_cgl_sharegroup_khr = 0;
+ cl_int err = CL_SUCCESS;
+
+ cl_props->gl_type = CL_GL_NOSHARE;
+ cl_props->platform_id = 0;
+
+ if (prop == NULL)
+ goto exit;
+
+
+ while(*prop) {
+ switch (*prop) {
+ case CL_CONTEXT_PLATFORM:
+ CHECK (set_cl_context_platform);
+ cl_props->platform_id = *(prop + 1);
+ if (UNLIKELY((cl_platform_id) cl_props->platform_id != intel_platform)) {
+ err = CL_INVALID_PLATFORM;
+ goto error;
+ }
+ break;
+ case CL_GL_CONTEXT_KHR:
+ CHECK (set_cl_gl_context_khr);
+ cl_props->gl_context = *(prop + 1);
+ break;
+ case CL_EGL_DISPLAY_KHR:
+ CHECK (set_cl_egl_display_khr);
+ cl_props->gl_type = CL_GL_EGL_DISPLAY;
+ cl_props->egl_display = *(prop + 1);
+ break;
+ case CL_GLX_DISPLAY_KHR:
+ CHECK (set_cl_glx_display_khr);
+ cl_props->gl_type = CL_GL_GLX_DISPLAY;
+ cl_props->glx_display = *(prop + 1);
+ break;
+ case CL_WGL_HDC_KHR:
+ CHECK (set_cl_wgl_hdc_khr);
+ cl_props->gl_type = CL_GL_WGL_HDC;
+ cl_props->wgl_hdc = *(prop + 1);
+ break;
+ case CL_CGL_SHAREGROUP_KHR:
+ CHECK (set_cl_cgl_sharegroup_khr);
+ cl_props->gl_type = CL_GL_CGL_SHAREGROUP;
+ cl_props->cgl_sharegroup = *(prop + 1);
+ break;
+ default:
+ err = CL_INVALID_PROPERTY;
+ goto error;
+ }
+ prop += 2;
+ *prop_len += 2;
+ }
+ (*prop_len)++;
+exit:
+error:
+ return err;
+}
+
+
+
+LOCAL cl_context
+cl_create_context(const cl_context_properties * properties,
+ cl_uint num_devices,
+ const cl_device_id * devices,
+ void (CL_CALLBACK * pfn_notify) (const char*, const void*, size_t, void*),
+ void * user_data,
+ cl_int * errcode_ret)
+{
+ /* cl_platform_id platform = NULL; */
+ struct _cl_context_prop props;
+ cl_context ctx = NULL;
+ cl_int err = CL_SUCCESS;
+ cl_uint prop_len = 0;
+ /* XXX */
+ FATAL_IF (num_devices != 1, "Only one device is supported");
+
+ /* Check that we are getting the right platform */
+ if (UNLIKELY(((err = cl_context_properties_process(properties, &props, &prop_len)) != CL_SUCCESS)))
+ goto error;
+
+ /* We are good */
+ if (UNLIKELY((ctx = cl_context_new(&props)) == NULL)) {
+ err = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+
+ if(properties != NULL && prop_len > 0) {
+ TRY_ALLOC (ctx->prop_user, CALLOC_ARRAY(cl_context_properties, prop_len));
+ memcpy(ctx->prop_user, properties, sizeof(cl_context_properties)*prop_len);
+ }
+ ctx->prop_len = prop_len;
+ /* Attach the device to the context */
+ ctx->device = *devices;
+
+ /* Save the user callback and user data*/
+ ctx->pfn_notify = pfn_notify;
+ ctx->user_data = user_data;
+
+exit:
+ if (errcode_ret != NULL)
+ *errcode_ret = err;
+ return ctx;
+error:
+ cl_context_delete(ctx);
+ ctx = NULL;
+ goto exit;
+}
+
+LOCAL cl_context
+cl_context_new(struct _cl_context_prop *props)
+{
+ cl_context ctx = NULL;
+
+ TRY_ALLOC_NO_ERR (ctx, CALLOC(struct _cl_context));
+ TRY_ALLOC_NO_ERR (ctx->drv, cl_driver_new(props));
+ SET_ICD(ctx->dispatch)
+ ctx->props = *props;
+ ctx->magic = CL_MAGIC_CONTEXT_HEADER;
+ ctx->ref_n = 1;
+ ctx->ver = cl_driver_get_ver(ctx->drv);
+ pthread_mutex_init(&ctx->program_lock, NULL);
+ pthread_mutex_init(&ctx->queue_lock, NULL);
+ pthread_mutex_init(&ctx->buffer_lock, NULL);
+ pthread_mutex_init(&ctx->sampler_lock, NULL);
+
+exit:
+ return ctx;
+error:
+ cl_context_delete(ctx);
+ ctx = NULL;
+ goto exit;
+}
+
+LOCAL void
+cl_context_delete(cl_context ctx)
+{
+ int i = 0;
+ if (UNLIKELY(ctx == NULL))
+ return;
+
+ /* We are not done yet */
+ if (atomic_dec(&ctx->ref_n) > 1)
+ return;
+
+ /* delete the internal programs. */
+ for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
+ if (ctx->internel_kernels[i]) {
+ cl_kernel_delete(ctx->internel_kernels[i]);
+ ctx->internel_kernels[i] = NULL;
+
+ assert(ctx->internal_prgs[i]);
+ cl_program_delete(ctx->internal_prgs[i]);
+ ctx->internal_prgs[i] = NULL;
+ }
+
+ if (ctx->internel_kernels[i]) {
+ cl_kernel_delete(ctx->built_in_kernels[i]);
+ ctx->built_in_kernels[i] = NULL;
+ }
+ }
+
+ cl_program_delete(ctx->built_in_prgs);
+ ctx->built_in_prgs = NULL;
+
+ /* All object lists should have been freed. Otherwise, the reference counter
+ * of the context cannot be 0
+ */
+ assert(ctx->queues == NULL);
+ assert(ctx->programs == NULL);
+ assert(ctx->buffers == NULL);
+ assert(ctx->drv);
+ cl_free(ctx->prop_user);
+ cl_driver_delete(ctx->drv);
+ ctx->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+ cl_free(ctx);
+}
+
+LOCAL void
+cl_context_add_ref(cl_context ctx)
+{
+ assert(ctx);
+ atomic_inc(&ctx->ref_n);
+}
+
+LOCAL cl_command_queue
+cl_context_create_queue(cl_context ctx,
+ cl_device_id device,
+ cl_command_queue_properties properties, /* XXX */
+ cl_int *errcode_ret)
+{
+ cl_command_queue queue = NULL;
+ cl_int err = CL_SUCCESS;
+
+
+
+ /* We create the command queue and store it in the context list of queues */
+ TRY_ALLOC (queue, cl_command_queue_new(ctx));
+ queue->props = properties;
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return queue;
+error:
+ cl_command_queue_delete(queue);
+ queue = NULL;
+ goto exit;
+}
+
+cl_buffer_mgr
+cl_context_get_bufmgr(cl_context ctx)
+{
+ return cl_driver_get_bufmgr(ctx->drv);
+}
+
+cl_kernel
+cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kernel, const char * str_option)
+{
+ cl_int ret;
+ if (!ctx->internal_prgs[index]) {
+ size_t length = strlen(str_kernel) + 1;
+ ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, &str_kernel, &length, NULL);
+
+ if (!ctx->internal_prgs[index])
+ return NULL;
+
+ ret = cl_program_build(ctx->internal_prgs[index], str_option);
+ if (ret != CL_SUCCESS)
+ return NULL;
+
+ ctx->internal_prgs[index]->is_built = 1;
+
+ /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, different kernel. */
+ if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+ int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
+ for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
+ if (index != i) {
+ assert(ctx->internal_prgs[i] == NULL);
+ assert(ctx->internel_kernels[i] == NULL);
+ cl_program_add_ref(ctx->internal_prgs[index]);
+ ctx->internal_prgs[i] = ctx->internal_prgs[index];
+ }
+
+ if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_2", NULL);
+ } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_4", NULL);
+ } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_8", NULL);
+ } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_16", NULL);
+ } else
+ assert(0);
+ }
+ } else {
+ ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+ }
+ }
+
+ return ctx->internel_kernels[index];
+}
+
+cl_kernel
+cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
+ const char * str_kernel, size_t size, const char * str_option)
+{
+ cl_int ret;
+ cl_int binary_status = CL_SUCCESS;
+ if (!ctx->internal_prgs[index]) {
+ ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
+ &size, (const unsigned char **)&str_kernel, &binary_status, &ret);
+
+ if (!ctx->internal_prgs[index])
+ return NULL;
+
+ ret = cl_program_build(ctx->internal_prgs[index], str_option);
+ if (ret != CL_SUCCESS)
+ return NULL;
+
+ ctx->internal_prgs[index]->is_built = 1;
+
+ /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, different kernel. */
+ if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+ int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
+ for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
+ if (index != i) {
+ assert(ctx->internal_prgs[i] == NULL);
+ assert(ctx->internel_kernels[i] == NULL);
+ cl_program_add_ref(ctx->internal_prgs[index]);
+ ctx->internal_prgs[i] = ctx->internal_prgs[index];
+ }
+
+ if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_2", NULL);
+ } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_4", NULL);
+ } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_8", NULL);
+ } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+ ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+ "__cl_fill_region_align8_16", NULL);
+ } else
+ assert(0);
+ }
+ } else {
+ ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+ }
+ }
+
+ return ctx->internel_kernels[index];
+}
diff --git a/src/cl_context.h b/src/cl_context.h
new file mode 100644
index 0000000..75afbf6
--- /dev/null
+++ b/src/cl_context.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_CONTEXT_H__
+#define __CL_CONTEXT_H__
+
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "CL/cl.h"
+#include "cl_khr_icd.h"
+
+#include <stdint.h>
+#include <pthread.h>
+
+/* DRI device created at create context */
+struct intel_driver;
+
+enum _cl_gl_context_type {
+ CL_GL_NOSHARE,
+ CL_GL_EGL_DISPLAY,
+ CL_GL_GLX_DISPLAY,
+ CL_GL_WGL_HDC,
+ CL_GL_CGL_SHAREGROUP
+};
+
+enum _cl_internal_ker_type {
+ CL_INTERNAL_KERNEL_MIN = 0,
+ CL_ENQUEUE_COPY_BUFFER_ALIGN4 = 0,
+ CL_ENQUEUE_COPY_BUFFER_ALIGN16,
+ CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET,
+ CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
+ CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
+ CL_ENQUEUE_COPY_BUFFER_RECT,
+ CL_ENQUEUE_COPY_IMAGE_1D_TO_1D, //copy image 1d to image 1d
+ CL_ENQUEUE_COPY_IMAGE_2D_TO_2D, //copy image 2d to image 2d
+ CL_ENQUEUE_COPY_IMAGE_3D_TO_2D, //copy image 3d to image 2d
+ CL_ENQUEUE_COPY_IMAGE_2D_TO_3D, //copy image 2d to image 3d
+ CL_ENQUEUE_COPY_IMAGE_3D_TO_3D, //copy image 3d to image 3d
+ CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER, //copy image 2d to buffer
+ CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, //copy image 3d tobuffer
+ CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, //copy buffer to image 2d
+ CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, //copy buffer to image 3d
+ CL_ENQUEUE_FILL_BUFFER_UNALIGN, //fill buffer with 1 aligne pattern, pattern size=1
+ CL_ENQUEUE_FILL_BUFFER_ALIGN2, //fill buffer with 2 aligne pattern, pattern size=2
+ CL_ENQUEUE_FILL_BUFFER_ALIGN4, //fill buffer with 4 aligne pattern, pattern size=4
+ CL_ENQUEUE_FILL_BUFFER_ALIGN8_8, //fill buffer with 8 aligne pattern, pattern size=8
+ CL_ENQUEUE_FILL_BUFFER_ALIGN8_16, //fill buffer with 16 aligne pattern, pattern size=16
+ CL_ENQUEUE_FILL_BUFFER_ALIGN8_32, //fill buffer with 16 aligne pattern, pattern size=32
+ CL_ENQUEUE_FILL_BUFFER_ALIGN8_64, //fill buffer with 16 aligne pattern, pattern size=64
+ CL_ENQUEUE_FILL_BUFFER_ALIGN128, //fill buffer with 128 aligne pattern, pattern size=128
+ CL_ENQUEUE_FILL_IMAGE_1D, //fill image 1d
+ CL_ENQUEUE_FILL_IMAGE_1D_ARRAY, //fill image 1d array
+ CL_ENQUEUE_FILL_IMAGE_2D, //fill image 2d
+ CL_ENQUEUE_FILL_IMAGE_2D_ARRAY, //fill image 2d array
+ CL_ENQUEUE_FILL_IMAGE_3D, //fill image 3d
+ CL_INTERNAL_KERNEL_MAX
+};
+
+struct _cl_context_prop {
+ cl_context_properties platform_id;
+ enum _cl_gl_context_type gl_type;
+ cl_context_properties gl_context;
+ union {
+ cl_context_properties egl_display;
+ cl_context_properties glx_display;
+ cl_context_properties wgl_hdc;
+ cl_context_properties cgl_sharegroup;
+ };
+};
+
+#define IS_EGL_CONTEXT(ctx) (ctx->props.gl_type == CL_GL_EGL_DISPLAY)
+#define EGL_DISP(ctx) (EGLDisplay)(ctx->props.egl_display)
+#define EGL_CTX(ctx) (EGLContext)(ctx->props.gl_context)
+/* Encapsulate the whole device */
+struct _cl_context {
+ DEFINE_ICD(dispatch)
+ uint64_t magic; /* To identify it as a context */
+ volatile int ref_n; /* We reference count this object */
+ cl_driver drv; /* Handles HW or simulator */
+ cl_device_id device; /* All information about the GPU device */
+ cl_command_queue queues; /* All command queues currently allocated */
+ cl_program programs; /* All programs currently allocated */
+ cl_mem buffers; /* All memory object currently allocated */
+ cl_sampler samplers; /* All sampler object currently allocated */
+ cl_event events; /* All event object currently allocated */
+ pthread_mutex_t queue_lock; /* To allocate and deallocate queues */
+ pthread_mutex_t program_lock; /* To allocate and deallocate programs */
+ pthread_mutex_t buffer_lock; /* To allocate and deallocate buffers */
+ pthread_mutex_t sampler_lock; /* To allocate and deallocate samplers */
+ pthread_mutex_t event_lock; /* To allocate and deallocate events */
+ cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
+ /* All programs internal used, for example clEnqueuexxx api use */
+ cl_kernel internel_kernels[CL_INTERNAL_KERNEL_MAX];
+ /* All kernels for clenqueuexxx api, for example clEnqueuexxx api use */
+ cl_program built_in_prgs; /*all built-in kernels belongs to this program only*/
+ cl_kernel built_in_kernels[CL_INTERNAL_KERNEL_MAX];
+ uint32_t ver; /* Gen version */
+ struct _cl_context_prop props;
+ cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
+ cl_uint prop_len; /* count of the properties */
+ void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *);
+ /* User's callback when error occur in context */
+ void *user_data; /* A pointer to user supplied data */
+
+};
+
+/* Implement OpenCL function */
+extern cl_context cl_create_context(const cl_context_properties*,
+ cl_uint,
+ const cl_device_id*,
+ void (CL_CALLBACK * pfn_notify) (const char*, const void*, size_t, void*),
+ void *,
+ cl_int*);
+
+/* Allocate and initialize a context */
+extern cl_context cl_context_new(struct _cl_context_prop *);
+
+/* Destroy and deallocate a context */
+extern void cl_context_delete(cl_context);
+
+/* Increment the context reference counter */
+extern void cl_context_add_ref(cl_context);
+
+/* Create the command queue from the given context and device */
+extern cl_command_queue cl_context_create_queue(cl_context,
+ cl_device_id,
+ cl_command_queue_properties,
+ cl_int*);
+
+/* Enqueue a ND Range kernel */
+extern cl_int cl_context_ND_kernel(cl_context,
+ cl_command_queue,
+ cl_kernel,
+ cl_uint,
+ const size_t*,
+ const size_t*,
+ const size_t*);
+
+/* Used for allocation */
+extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
+
+/* Get the internal used kernel */
+extern cl_kernel cl_context_get_static_kernel(cl_context ctx, cl_int index, const char *str_kernel, const char * str_option);
+
+/* Get the internal used kernel from binary*/
+extern cl_kernel cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
+ const char * str_kernel, size_t size, const char * str_option);
+
+#endif /* __CL_CONTEXT_H__ */
+
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
new file mode 100644
index 0000000..28bd5f0
--- /dev/null
+++ b/src/cl_device_data.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_DEVICE_DATA_H__
+#define __CL_DEVICE_DATA_H__
+
+#define INVALID_CHIP_ID -1 //returned by intel_get_device_id if no device found
+
+#define PCI_CHIP_GM45_GM 0x2A42
+#define PCI_CHIP_IGD_E_G 0x2E02
+#define PCI_CHIP_Q45_G 0x2E12
+#define PCI_CHIP_G45_G 0x2E22
+#define PCI_CHIP_G41_G 0x2E32
+
+#define PCI_CHIP_IGDNG_D_G 0x0042
+#define PCI_CHIP_IGDNG_M_G 0x0046
+
+#define IS_G45(devid) (devid == PCI_CHIP_IGD_E_G || \
+ devid == PCI_CHIP_Q45_G || \
+ devid == PCI_CHIP_G45_G || \
+ devid == PCI_CHIP_G41_G)
+#define IS_GM45(devid) (devid == PCI_CHIP_GM45_GM)
+#define IS_G4X(devid) (IS_G45(devid) || IS_GM45(devid))
+
+#define IS_IGDNG_D(devid) (devid == PCI_CHIP_IGDNG_D_G)
+#define IS_IGDNG_M(devid) (devid == PCI_CHIP_IGDNG_M_G)
+#define IS_IGDNG(devid) (IS_IGDNG_D(devid) || IS_IGDNG_M(devid))
+
+#ifndef PCI_CHIP_SANDYBRIDGE_BRIDGE
+#define PCI_CHIP_SANDYBRIDGE_BRIDGE 0x0100 /* Desktop */
+#define PCI_CHIP_SANDYBRIDGE_GT1 0x0102
+#define PCI_CHIP_SANDYBRIDGE_GT2 0x0112
+#define PCI_CHIP_SANDYBRIDGE_GT2_PLUS 0x0122
+#define PCI_CHIP_SANDYBRIDGE_BRIDGE_M 0x0104 /* Mobile */
+#define PCI_CHIP_SANDYBRIDGE_M_GT1 0x0106
+#define PCI_CHIP_SANDYBRIDGE_M_GT2 0x0116
+#define PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS 0x0126
+#define PCI_CHIP_SANDYBRIDGE_BRIDGE_S 0x0108 /* Server */
+#define PCI_CHIP_SANDYBRIDGE_S_GT 0x010A
+#endif
+
+#define IS_GEN6(devid) \
+ (devid == PCI_CHIP_SANDYBRIDGE_GT1 || \
+ devid == PCI_CHIP_SANDYBRIDGE_GT2 || \
+ devid == PCI_CHIP_SANDYBRIDGE_GT2_PLUS || \
+ devid == PCI_CHIP_SANDYBRIDGE_M_GT1 || \
+ devid == PCI_CHIP_SANDYBRIDGE_M_GT2 || \
+ devid == PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS || \
+ devid == PCI_CHIP_SANDYBRIDGE_S_GT)
+
+#define PCI_CHIP_IVYBRIDGE_GT1 0x0152 /* Desktop */
+#define PCI_CHIP_IVYBRIDGE_GT2 0x0162
+#define PCI_CHIP_IVYBRIDGE_M_GT1 0x0156 /* Mobile */
+#define PCI_CHIP_IVYBRIDGE_M_GT2 0x0166
+#define PCI_CHIP_IVYBRIDGE_S_GT1 0x015a /* Server */
+#define PCI_CHIP_IVYBRIDGE_S_GT2 0x016a
+
+#define PCI_CHIP_BAYTRAIL_T 0x0F31
+
+#define IS_IVB_GT1(devid) \
+ (devid == PCI_CHIP_IVYBRIDGE_GT1 || \
+ devid == PCI_CHIP_IVYBRIDGE_M_GT1 || \
+ devid == PCI_CHIP_IVYBRIDGE_S_GT1)
+
+#define IS_IVB_GT2(devid) \
+ (devid == PCI_CHIP_IVYBRIDGE_GT2 || \
+ devid == PCI_CHIP_IVYBRIDGE_M_GT2 || \
+ devid == PCI_CHIP_IVYBRIDGE_S_GT2)
+
+#define IS_BAYTRAIL_T(devid) \
+ (devid == PCI_CHIP_BAYTRAIL_T)
+
+#define IS_IVYBRIDGE(devid) (IS_IVB_GT1(devid) || IS_IVB_GT2(devid) || IS_BAYTRAIL_T(devid))
+#define IS_GEN7(devid) IS_IVYBRIDGE(devid)
+
+
+#define PCI_CHIP_HASWELL_D1 0x0402 /* GT1 desktop */
+#define PCI_CHIP_HASWELL_D2 0x0412 /* GT2 desktop */
+#define PCI_CHIP_HASWELL_D3 0x0422 /* GT3 desktop */
+#define PCI_CHIP_HASWELL_S1 0x040a /* GT1 server */
+#define PCI_CHIP_HASWELL_S2 0x041a /* GT2 server */
+#define PCI_CHIP_HASWELL_S3 0x042a /* GT3 server */
+#define PCI_CHIP_HASWELL_M1 0x0406 /* GT1 mobile */
+#define PCI_CHIP_HASWELL_M2 0x0416 /* GT2 mobile */
+#define PCI_CHIP_HASWELL_M3 0x0426 /* GT3 mobile */
+#define PCI_CHIP_HASWELL_B1 0x040B /* Haswell GT1 */
+#define PCI_CHIP_HASWELL_B2 0x041B /* Haswell GT2 */
+#define PCI_CHIP_HASWELL_B3 0x042B /* Haswell GT3 */
+#define PCI_CHIP_HASWELL_E1 0x040E /* Haswell GT1 */
+#define PCI_CHIP_HASWELL_E2 0x041E /* Haswell GT2 */
+#define PCI_CHIP_HASWELL_E3 0x042E /* Haswell GT3 */
+
+/* Software Development Vehicle devices. */
+#define PCI_CHIP_HASWELL_SDV_D1 0x0C02 /* SDV GT1 desktop */
+#define PCI_CHIP_HASWELL_SDV_D2 0x0C12 /* SDV GT2 desktop */
+#define PCI_CHIP_HASWELL_SDV_D3 0x0C22 /* SDV GT3 desktop */
+#define PCI_CHIP_HASWELL_SDV_S1 0x0C0A /* SDV GT1 server */
+#define PCI_CHIP_HASWELL_SDV_S2 0x0C1A /* SDV GT2 server */
+#define PCI_CHIP_HASWELL_SDV_S3 0x0C2A /* SDV GT3 server */
+#define PCI_CHIP_HASWELL_SDV_M1 0x0C06 /* SDV GT1 mobile */
+#define PCI_CHIP_HASWELL_SDV_M2 0x0C16 /* SDV GT2 mobile */
+#define PCI_CHIP_HASWELL_SDV_M3 0x0C26 /* SDV GT3 mobile */
+#define PCI_CHIP_HASWELL_SDV_B1 0x0C0B /* SDV GT1 */
+#define PCI_CHIP_HASWELL_SDV_B2 0x0C1B /* SDV GT2 */
+#define PCI_CHIP_HASWELL_SDV_B3 0x0C2B /* SDV GT3 */
+#define PCI_CHIP_HASWELL_SDV_E1 0x0C0E /* SDV GT1 */
+#define PCI_CHIP_HASWELL_SDV_E2 0x0C1E /* SDV GT2 */
+#define PCI_CHIP_HASWELL_SDV_E3 0x0C2E /* SDV GT3 */
+/* Ultrabooks */
+#define PCI_CHIP_HASWELL_ULT_D1 0x0A02 /* ULT GT1 desktop */
+#define PCI_CHIP_HASWELL_ULT_D2 0x0A12 /* ULT GT2 desktop */
+#define PCI_CHIP_HASWELL_ULT_D3 0x0A22 /* ULT GT3 desktop */
+#define PCI_CHIP_HASWELL_ULT_S1 0x0A0A /* ULT GT1 server */
+#define PCI_CHIP_HASWELL_ULT_S2 0x0A1A /* ULT GT2 server */
+#define PCI_CHIP_HASWELL_ULT_S3 0x0A2A /* ULT GT3 server */
+#define PCI_CHIP_HASWELL_ULT_M1 0x0A06 /* ULT GT1 mobile */
+#define PCI_CHIP_HASWELL_ULT_M2 0x0A16 /* ULT GT2 mobile */
+#define PCI_CHIP_HASWELL_ULT_M3 0x0A26 /* ULT GT3 mobile */
+#define PCI_CHIP_HASWELL_ULT_B1 0x0A0B /* ULT GT1 */
+#define PCI_CHIP_HASWELL_ULT_B2 0x0A1B /* ULT GT2 */
+#define PCI_CHIP_HASWELL_ULT_B3 0x0A2B /* ULT GT3 */
+#define PCI_CHIP_HASWELL_ULT_E1 0x0A0E /* ULT GT1 */
+#define PCI_CHIP_HASWELL_ULT_E2 0x0A1E /* ULT GT2 */
+#define PCI_CHIP_HASWELL_ULT_E3 0x0A2E /* ULT GT3 */
+/* CRW */
+#define PCI_CHIP_HASWELL_CRW_D1 0x0D02 /* CRW GT1 desktop */
+#define PCI_CHIP_HASWELL_CRW_D2 0x0D12 /* CRW GT2 desktop */
+#define PCI_CHIP_HASWELL_CRW_D3 0x0D22 /* CRW GT3 desktop */
+#define PCI_CHIP_HASWELL_CRW_S1 0x0D0A /* CRW GT1 server */
+#define PCI_CHIP_HASWELL_CRW_S2 0x0D1A /* CRW GT2 server */
+#define PCI_CHIP_HASWELL_CRW_S3 0x0D2A /* CRW GT3 server */
+#define PCI_CHIP_HASWELL_CRW_M1 0x0D06 /* CRW GT1 mobile */
+#define PCI_CHIP_HASWELL_CRW_M2 0x0D16 /* CRW GT2 mobile */
+#define PCI_CHIP_HASWELL_CRW_M3 0x0D26 /* CRW GT3 mobile */
+#define PCI_CHIP_HASWELL_CRW_B1 0x0D0B /* CRW GT1 */
+#define PCI_CHIP_HASWELL_CRW_B2 0x0D1B /* CRW GT2 */
+#define PCI_CHIP_HASWELL_CRW_B3 0x0D2B /* CRW GT3 */
+#define PCI_CHIP_HASWELL_CRW_E1 0x0D0E /* CRW GT1 */
+#define PCI_CHIP_HASWELL_CRW_E2 0x0D1E /* CRW GT2 */
+#define PCI_CHIP_HASWELL_CRW_E3 0x0D2E /* CRW GT3 */
+
+
+#define IS_HASWELL(devid) ( \
+ (devid) == PCI_CHIP_HASWELL_D1 || (devid) == PCI_CHIP_HASWELL_D2 || \
+ (devid) == PCI_CHIP_HASWELL_D3 || (devid) == PCI_CHIP_HASWELL_S1 || \
+ (devid) == PCI_CHIP_HASWELL_S2 || (devid) == PCI_CHIP_HASWELL_S3 || \
+ (devid) == PCI_CHIP_HASWELL_M1 || (devid) == PCI_CHIP_HASWELL_M2 || \
+ (devid) == PCI_CHIP_HASWELL_M3 || (devid) == PCI_CHIP_HASWELL_B1 || \
+ (devid) == PCI_CHIP_HASWELL_B2 || (devid) == PCI_CHIP_HASWELL_B3 || \
+ (devid) == PCI_CHIP_HASWELL_E1 || (devid) == PCI_CHIP_HASWELL_E2 || \
+ (devid) == PCI_CHIP_HASWELL_E3 || (devid) == PCI_CHIP_HASWELL_SDV_D1 || \
+ (devid) == PCI_CHIP_HASWELL_SDV_D2 || (devid) == PCI_CHIP_HASWELL_SDV_D3 || \
+ (devid) == PCI_CHIP_HASWELL_SDV_S1 || (devid) == PCI_CHIP_HASWELL_SDV_S2 || \
+ (devid) == PCI_CHIP_HASWELL_SDV_S3 || (devid) == PCI_CHIP_HASWELL_SDV_M1 || \
+ (devid) == PCI_CHIP_HASWELL_SDV_M2 || (devid) == PCI_CHIP_HASWELL_SDV_M3 || \
+ (devid) == PCI_CHIP_HASWELL_SDV_B1 || (devid) == PCI_CHIP_HASWELL_SDV_B2 || \
+ (devid) == PCI_CHIP_HASWELL_SDV_B3 || (devid) == PCI_CHIP_HASWELL_SDV_E1 || \
+ (devid) == PCI_CHIP_HASWELL_SDV_E2 || (devid) == PCI_CHIP_HASWELL_SDV_E3 || \
+ (devid) == PCI_CHIP_HASWELL_ULT_D1 || (devid) == PCI_CHIP_HASWELL_ULT_D2 || \
+ (devid) == PCI_CHIP_HASWELL_ULT_D3 || (devid) == PCI_CHIP_HASWELL_ULT_S1 || \
+ (devid) == PCI_CHIP_HASWELL_ULT_S2 || (devid) == PCI_CHIP_HASWELL_ULT_S3 || \
+ (devid) == PCI_CHIP_HASWELL_ULT_M1 || (devid) == PCI_CHIP_HASWELL_ULT_M2 || \
+ (devid) == PCI_CHIP_HASWELL_ULT_M3 || (devid) == PCI_CHIP_HASWELL_ULT_B1 || \
+ (devid) == PCI_CHIP_HASWELL_ULT_B2 || (devid) == PCI_CHIP_HASWELL_ULT_B3 || \
+ (devid) == PCI_CHIP_HASWELL_ULT_E1 || (devid) == PCI_CHIP_HASWELL_ULT_E2 || \
+ (devid) == PCI_CHIP_HASWELL_ULT_E3 || (devid) == PCI_CHIP_HASWELL_CRW_D1 || \
+ (devid) == PCI_CHIP_HASWELL_CRW_D2 || (devid) == PCI_CHIP_HASWELL_CRW_D3 || \
+ (devid) == PCI_CHIP_HASWELL_CRW_S1 || (devid) == PCI_CHIP_HASWELL_CRW_S2 || \
+ (devid) == PCI_CHIP_HASWELL_CRW_S3 || (devid) == PCI_CHIP_HASWELL_CRW_M1 || \
+ (devid) == PCI_CHIP_HASWELL_CRW_M2 || (devid) == PCI_CHIP_HASWELL_CRW_M3 || \
+ (devid) == PCI_CHIP_HASWELL_CRW_B1 || (devid) == PCI_CHIP_HASWELL_CRW_B2 || \
+ (devid) == PCI_CHIP_HASWELL_CRW_B3 || (devid) == PCI_CHIP_HASWELL_CRW_E1 || \
+ (devid) == PCI_CHIP_HASWELL_CRW_E2 || (devid) == PCI_CHIP_HASWELL_CRW_E3)
+
+#define IS_GEN75(devid) IS_HASWELL(devid)
+
+#endif /* __CL_DEVICE_DATA_H__ */
+
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
new file mode 100644
index 0000000..ee3f2b7
--- /dev/null
+++ b/src/cl_device_id.c
@@ -0,0 +1,617 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_platform_id.h"
+#include "cl_device_id.h"
+#include "cl_internals.h"
+#include "cl_utils.h"
+#include "cl_driver.h"
+#include "cl_device_data.h"
+#include "cl_khr_icd.h"
+#include "cl_thread.h"
+#include "CL/cl.h"
+#include "cl_gbe_loader.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifndef CL_VERSION_1_2
+#define CL_DEVICE_BUILT_IN_KERNELS 0x103F
+#endif
+
+static struct _cl_device_id intel_ivb_gt2_device = {
+ INIT_ICD(dispatch)
+ .max_compute_unit = 16,
+ .max_thread_per_unit = 8,
+ .max_work_item_sizes = {1024, 1024, 1024},
+ .max_work_group_size = 1024,
+ .max_clock_frequency = 1000,
+#include "cl_gen7_device.h"
+};
+
+static struct _cl_device_id intel_ivb_gt1_device = {
+ INIT_ICD(dispatch)
+ .max_compute_unit = 6,
+ .max_thread_per_unit = 6,
+ .max_work_item_sizes = {512, 512, 512},
+ .max_work_group_size = 512,
+ .max_clock_frequency = 1000,
+#include "cl_gen7_device.h"
+};
+
+static struct _cl_device_id intel_baytrail_t_device = {
+ INIT_ICD(dispatch)
+ .max_compute_unit = 4,
+ .max_thread_per_unit = 8,
+ .max_work_item_sizes = {512, 512, 512},
+ .max_work_group_size = 512,
+ .max_clock_frequency = 1000,
+#include "cl_gen7_device.h"
+};
+
+/* XXX we clone IVB for HSW now */
+static struct _cl_device_id intel_hsw_gt1_device = {
+ INIT_ICD(dispatch)
+ .max_compute_unit = 10,
+ .max_thread_per_unit = 7,
+ .max_work_item_sizes = {1024, 1024, 1024},
+ .max_work_group_size = 1024,
+ .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+static struct _cl_device_id intel_hsw_gt2_device = {
+ INIT_ICD(dispatch)
+ .max_compute_unit = 20,
+ .max_thread_per_unit = 7,
+ .max_work_item_sizes = {1024, 1024, 1024},
+ .max_work_group_size = 1024,
+ .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+static struct _cl_device_id intel_hsw_gt3_device = {
+ INIT_ICD(dispatch)
+ .max_compute_unit = 40,
+ .max_thread_per_unit = 7,
+ .max_work_item_sizes = {1024, 1024, 1024},
+ .max_work_group_size = 1024,
+ .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+LOCAL cl_device_id
+cl_get_gt_device(void)
+{
+ cl_device_id ret = NULL;
+ const int device_id = cl_driver_get_device_id();
+ cl_device_id device = NULL;
+
+#define DECL_INFO_STRING(BREAK, STRUCT, FIELD, STRING) \
+ STRUCT.FIELD = STRING; \
+ STRUCT.JOIN(FIELD,_sz) = sizeof(STRING); \
+ device = &STRUCT; \
+ goto BREAK;
+
+ switch (device_id) {
+ case PCI_CHIP_HASWELL_D1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 Desktop");
+ case PCI_CHIP_HASWELL_D2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 Desktop");
+ case PCI_CHIP_HASWELL_D3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 Desktop");
+ case PCI_CHIP_HASWELL_S1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 Server");
+ case PCI_CHIP_HASWELL_S2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 Server");
+ case PCI_CHIP_HASWELL_S3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 Server");
+ case PCI_CHIP_HASWELL_M1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 Mobile");
+ case PCI_CHIP_HASWELL_M2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 Mobile");
+ case PCI_CHIP_HASWELL_M3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 Mobile");
+ case PCI_CHIP_HASWELL_B1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 reserved");
+ case PCI_CHIP_HASWELL_B2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 reserved");
+ case PCI_CHIP_HASWELL_B3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 reserved");
+ case PCI_CHIP_HASWELL_E1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 reserved");
+ case PCI_CHIP_HASWELL_E2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 reserved");
+ case PCI_CHIP_HASWELL_E3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 reserved");
+ case PCI_CHIP_HASWELL_SDV_D1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT1 Desktop");
+ case PCI_CHIP_HASWELL_SDV_D2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT2 Desktop");
+ case PCI_CHIP_HASWELL_SDV_D3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT3 Desktop");
+ case PCI_CHIP_HASWELL_SDV_S1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT1 Server");
+ case PCI_CHIP_HASWELL_SDV_S2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT2 Server");
+ case PCI_CHIP_HASWELL_SDV_S3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT3 Server");
+ case PCI_CHIP_HASWELL_SDV_M1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT1 Mobile");
+ case PCI_CHIP_HASWELL_SDV_M2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT2 Mobile");
+ case PCI_CHIP_HASWELL_SDV_M3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT3 Mobile");
+ case PCI_CHIP_HASWELL_SDV_B1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT1 reserved");
+ case PCI_CHIP_HASWELL_SDV_B2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT2 reserved");
+ case PCI_CHIP_HASWELL_SDV_B3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT3 reserved");
+ case PCI_CHIP_HASWELL_SDV_E1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT1 reserved");
+ case PCI_CHIP_HASWELL_SDV_E2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT2 reserved");
+ case PCI_CHIP_HASWELL_SDV_E3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+ " Software Development Vehicle device GT3 reserved");
+ case PCI_CHIP_HASWELL_ULT_D1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 Desktop");
+ case PCI_CHIP_HASWELL_ULT_D2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 Desktop");
+ case PCI_CHIP_HASWELL_ULT_D3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 Desktop");
+ case PCI_CHIP_HASWELL_ULT_S1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 Server");
+ case PCI_CHIP_HASWELL_ULT_S2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 Server");
+ case PCI_CHIP_HASWELL_ULT_S3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 Server");
+ case PCI_CHIP_HASWELL_ULT_M1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 Mobile");
+ case PCI_CHIP_HASWELL_ULT_M2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile");
+ case PCI_CHIP_HASWELL_ULT_M3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 Mobile");
+ case PCI_CHIP_HASWELL_ULT_B1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 reserved");
+ case PCI_CHIP_HASWELL_ULT_B2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 reserved");
+ case PCI_CHIP_HASWELL_ULT_B3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 reserved");
+ case PCI_CHIP_HASWELL_ULT_E1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 reserved");
+ case PCI_CHIP_HASWELL_ULT_E2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 reserved");
+ case PCI_CHIP_HASWELL_ULT_E3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 reserved");
+
+ /* CRW */
+ case PCI_CHIP_HASWELL_CRW_D1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 Desktop");
+ case PCI_CHIP_HASWELL_CRW_D2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 Desktop");
+ case PCI_CHIP_HASWELL_CRW_D3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 Desktop");
+ case PCI_CHIP_HASWELL_CRW_S1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 Server");
+ case PCI_CHIP_HASWELL_CRW_S2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 Server");
+ case PCI_CHIP_HASWELL_CRW_S3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 Server");
+ case PCI_CHIP_HASWELL_CRW_M1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 Mobile");
+ case PCI_CHIP_HASWELL_CRW_M2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 Mobile");
+ case PCI_CHIP_HASWELL_CRW_M3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 Mobile");
+ case PCI_CHIP_HASWELL_CRW_B1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 reserved");
+ case PCI_CHIP_HASWELL_CRW_B2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 reserved");
+ case PCI_CHIP_HASWELL_CRW_B3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 reserved");
+ case PCI_CHIP_HASWELL_CRW_E1:
+ DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 reserved");
+ case PCI_CHIP_HASWELL_CRW_E2:
+ DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 reserved");
+ case PCI_CHIP_HASWELL_CRW_E3:
+ DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 reserved");
+has_break:
+ device->vendor_id = device_id;
+ device->platform = intel_platform;
+ ret = device;
+ break;
+
+ case PCI_CHIP_IVYBRIDGE_GT1:
+ DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge GT1");
+ case PCI_CHIP_IVYBRIDGE_M_GT1:
+ DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge M GT1");
+ case PCI_CHIP_IVYBRIDGE_S_GT1:
+ DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge S GT1");
+ivb_gt1_break:
+ intel_ivb_gt1_device.vendor_id = device_id;
+ intel_ivb_gt1_device.platform = intel_platform;
+ ret = &intel_ivb_gt1_device;
+ break;
+
+ case PCI_CHIP_IVYBRIDGE_GT2:
+ DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge GT2");
+ case PCI_CHIP_IVYBRIDGE_M_GT2:
+ DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge M GT2");
+ case PCI_CHIP_IVYBRIDGE_S_GT2:
+ DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge S GT2");
+ivb_gt2_break:
+ intel_ivb_gt2_device.vendor_id = device_id;
+ intel_ivb_gt2_device.platform = intel_platform;
+ ret = &intel_ivb_gt2_device;
+ break;
+
+ case PCI_CHIP_BAYTRAIL_T:
+ DECL_INFO_STRING(baytrail_t_device_break, intel_baytrail_t_device, name, "Intel(R) HD Graphics Bay Trail-T");
+baytrail_t_device_break:
+ intel_baytrail_t_device.vendor_id = device_id;
+ intel_baytrail_t_device.platform = intel_platform;
+ ret = &intel_baytrail_t_device;
+ break;
+
+ case PCI_CHIP_SANDYBRIDGE_BRIDGE:
+ case PCI_CHIP_SANDYBRIDGE_GT1:
+ case PCI_CHIP_SANDYBRIDGE_GT2:
+ case PCI_CHIP_SANDYBRIDGE_GT2_PLUS:
+ case PCI_CHIP_SANDYBRIDGE_BRIDGE_M:
+ case PCI_CHIP_SANDYBRIDGE_M_GT1:
+ case PCI_CHIP_SANDYBRIDGE_M_GT2:
+ case PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS:
+ case PCI_CHIP_SANDYBRIDGE_BRIDGE_S:
+ case PCI_CHIP_SANDYBRIDGE_S_GT:
+ // Intel(R) HD Graphics SandyBridge not supported yet
+ ret = NULL;
+ break;
+ default:
+ printf("cl_get_gt_device(): error, unknown device: %x\n", device_id);
+ }
+
+ if (!CompilerSupported()) {
+ if (ret != NULL) {
+ ret->compiler_available = CL_FALSE;
+ //ret->linker_available = CL_FALSE;
+ ret->profile = "EMBEDDED_PROFILE";
+ ret->profile_sz = strlen(ret->profile) + 1;
+ }
+ }
+
+ return ret;
+}
+
+LOCAL cl_int
+cl_get_device_ids(cl_platform_id platform,
+ cl_device_type device_type,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices)
+{
+ cl_device_id device;
+
+ /* Do we have a usable device? */
+ device = cl_get_gt_device();
+ if (!device) {
+ if (num_devices)
+ *num_devices = 0;
+ if (devices)
+ *devices = 0;
+ return CL_DEVICE_NOT_FOUND;
+ } else {
+ if (num_devices)
+ *num_devices = 1;
+ if (devices) {
+ *devices = device;
+ (*devices)->extensions = intel_platform->extensions;
+ (*devices)->extensions_sz = intel_platform->extensions_sz;
+ }
+ return CL_SUCCESS;
+ }
+}
+
+#define DECL_FIELD(CASE,FIELD) \
+ case JOIN(CL_DEVICE_,CASE): \
+ if (param_value_size_ret) { \
+ *param_value_size_ret = sizeof device->FIELD; \
+ if (!param_value) \
+ return CL_SUCCESS; \
+ } \
+ if (param_value_size < sizeof device->FIELD) \
+ return CL_INVALID_VALUE; \
+ memcpy(param_value, &device->FIELD, sizeof device->FIELD); \
+ return CL_SUCCESS;
+
+#define DECL_STRING_FIELD(CASE,FIELD) \
+ case JOIN(CL_DEVICE_,CASE): \
+ if (param_value_size_ret) { \
+ *param_value_size_ret = device->JOIN(FIELD,_sz); \
+ if (!param_value) \
+ return CL_SUCCESS; \
+ } \
+ if (param_value_size < device->JOIN(FIELD,_sz)) \
+ return CL_INVALID_VALUE; \
+ memcpy(param_value, device->FIELD, device->JOIN(FIELD,_sz)); \
+ return CL_SUCCESS;
+
+LOCAL cl_int
+cl_get_device_info(cl_device_id device,
+ cl_device_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ if (UNLIKELY(device != &intel_ivb_gt1_device &&
+ device != &intel_ivb_gt2_device &&
+ device != &intel_baytrail_t_device &&
+ device != &intel_hsw_gt1_device &&
+ device != &intel_hsw_gt2_device &&
+ device != &intel_hsw_gt3_device
+ ))
+ return CL_INVALID_DEVICE;
+
+ /* Find the correct parameter */
+ switch (param_name) {
+ DECL_FIELD(TYPE, device_type)
+ DECL_FIELD(VENDOR_ID, vendor_id)
+ DECL_FIELD(MAX_COMPUTE_UNITS, max_compute_unit)
+ DECL_FIELD(MAX_WORK_ITEM_DIMENSIONS, max_work_item_dimensions)
+ DECL_FIELD(MAX_WORK_ITEM_SIZES, max_work_item_sizes)
+ DECL_FIELD(MAX_WORK_GROUP_SIZE, max_work_group_size)
+ DECL_FIELD(PREFERRED_VECTOR_WIDTH_CHAR, preferred_vector_width_char)
+ DECL_FIELD(PREFERRED_VECTOR_WIDTH_SHORT, preferred_vector_width_short)
+ DECL_FIELD(PREFERRED_VECTOR_WIDTH_INT, preferred_vector_width_int)
+ DECL_FIELD(PREFERRED_VECTOR_WIDTH_LONG, preferred_vector_width_long)
+ DECL_FIELD(PREFERRED_VECTOR_WIDTH_FLOAT, preferred_vector_width_float)
+ DECL_FIELD(PREFERRED_VECTOR_WIDTH_DOUBLE, preferred_vector_width_double)
+ DECL_FIELD(PREFERRED_VECTOR_WIDTH_HALF, preferred_vector_width_half)
+ DECL_FIELD(NATIVE_VECTOR_WIDTH_CHAR, native_vector_width_char)
+ DECL_FIELD(NATIVE_VECTOR_WIDTH_SHORT, native_vector_width_short)
+ DECL_FIELD(NATIVE_VECTOR_WIDTH_INT, native_vector_width_int)
+ DECL_FIELD(NATIVE_VECTOR_WIDTH_LONG, native_vector_width_long)
+ DECL_FIELD(NATIVE_VECTOR_WIDTH_FLOAT, native_vector_width_float)
+ DECL_FIELD(NATIVE_VECTOR_WIDTH_DOUBLE, native_vector_width_double)
+ DECL_FIELD(NATIVE_VECTOR_WIDTH_HALF, native_vector_width_half)
+ DECL_FIELD(MAX_CLOCK_FREQUENCY, max_clock_frequency)
+ DECL_FIELD(ADDRESS_BITS, address_bits)
+ DECL_FIELD(MAX_MEM_ALLOC_SIZE, max_mem_alloc_size)
+ DECL_FIELD(IMAGE_SUPPORT, image_support)
+ DECL_FIELD(MAX_READ_IMAGE_ARGS, max_read_image_args)
+ DECL_FIELD(MAX_WRITE_IMAGE_ARGS, max_write_image_args)
+ DECL_FIELD(IMAGE_MAX_ARRAY_SIZE, image_max_array_size)
+ DECL_FIELD(IMAGE2D_MAX_WIDTH, image2d_max_width)
+ DECL_FIELD(IMAGE2D_MAX_HEIGHT, image2d_max_height)
+ DECL_FIELD(IMAGE3D_MAX_WIDTH, image3d_max_width)
+ DECL_FIELD(IMAGE3D_MAX_HEIGHT, image3d_max_height)
+ DECL_FIELD(IMAGE3D_MAX_DEPTH, image3d_max_depth)
+ DECL_FIELD(MAX_SAMPLERS, max_samplers)
+ DECL_FIELD(MAX_PARAMETER_SIZE, max_parameter_size)
+ DECL_FIELD(MEM_BASE_ADDR_ALIGN, mem_base_addr_align)
+ DECL_FIELD(MIN_DATA_TYPE_ALIGN_SIZE, min_data_type_align_size)
+ DECL_FIELD(SINGLE_FP_CONFIG, single_fp_config)
+ DECL_FIELD(DOUBLE_FP_CONFIG, double_fp_config)
+ DECL_FIELD(GLOBAL_MEM_CACHE_TYPE, global_mem_cache_type)
+ DECL_FIELD(GLOBAL_MEM_CACHELINE_SIZE, global_mem_cache_line_size)
+ DECL_FIELD(GLOBAL_MEM_CACHE_SIZE, global_mem_cache_size)
+ DECL_FIELD(GLOBAL_MEM_SIZE, global_mem_size)
+ DECL_FIELD(MAX_CONSTANT_BUFFER_SIZE, max_constant_buffer_size)
+ DECL_FIELD(IMAGE_MAX_BUFFER_SIZE, image_mem_size)
+ DECL_FIELD(MAX_CONSTANT_ARGS, max_constant_args)
+ DECL_FIELD(LOCAL_MEM_TYPE, local_mem_type)
+ DECL_FIELD(LOCAL_MEM_SIZE, local_mem_size)
+ DECL_FIELD(ERROR_CORRECTION_SUPPORT, error_correction_support)
+ DECL_FIELD(HOST_UNIFIED_MEMORY, host_unified_memory)
+ DECL_FIELD(PROFILING_TIMER_RESOLUTION, profiling_timer_resolution)
+ DECL_FIELD(ENDIAN_LITTLE, endian_little)
+ DECL_FIELD(AVAILABLE, available)
+ DECL_FIELD(COMPILER_AVAILABLE, compiler_available)
+ DECL_FIELD(LINKER_AVAILABLE, linker_available)
+ DECL_FIELD(EXECUTION_CAPABILITIES, execution_capabilities)
+ DECL_FIELD(QUEUE_PROPERTIES, queue_properties)
+ DECL_FIELD(PLATFORM, platform)
+ DECL_FIELD(PRINTF_BUFFER_SIZE, printf_buffer_size)
+ DECL_FIELD(PREFERRED_INTEROP_USER_SYNC, interop_user_sync)
+ DECL_STRING_FIELD(NAME, name)
+ DECL_STRING_FIELD(VENDOR, vendor)
+ DECL_STRING_FIELD(VERSION, version)
+ DECL_STRING_FIELD(PROFILE, profile)
+ DECL_STRING_FIELD(OPENCL_C_VERSION, opencl_c_version)
+ DECL_STRING_FIELD(EXTENSIONS, extensions);
+ DECL_STRING_FIELD(BUILT_IN_KERNELS, built_in_kernels)
+ DECL_FIELD(PARENT_DEVICE, parent_device)
+ DECL_FIELD(PARTITION_MAX_SUB_DEVICES, partition_max_sub_device)
+ DECL_FIELD(PARTITION_PROPERTIES, partition_property)
+ DECL_FIELD(PARTITION_AFFINITY_DOMAIN, affinity_domain)
+ DECL_FIELD(PARTITION_TYPE, partition_type)
+ DECL_FIELD(REFERENCE_COUNT, device_reference_count)
+
+ case CL_DRIVER_VERSION:
+ if (param_value_size_ret) {
+ *param_value_size_ret = device->driver_version_sz;
+ if (!param_value)
+ return CL_SUCCESS;
+ }
+ if (param_value_size < device->driver_version_sz)
+ return CL_INVALID_VALUE;
+ memcpy(param_value, device->driver_version, device->driver_version_sz);
+ return CL_SUCCESS;
+
+ default: return CL_INVALID_VALUE;
+ };
+}
+
+LOCAL cl_int
+cl_device_get_version(cl_device_id device, cl_int *ver)
+{
+ if (UNLIKELY(device != &intel_ivb_gt1_device &&
+ device != &intel_ivb_gt2_device &&
+ device != &intel_baytrail_t_device &&
+ device != &intel_hsw_gt1_device &&
+ device != &intel_hsw_gt2_device &&
+ device != &intel_hsw_gt3_device))
+ return CL_INVALID_DEVICE;
+ if (ver == NULL)
+ return CL_SUCCESS;
+ if (device == &intel_ivb_gt1_device ||
+ device == &intel_ivb_gt2_device ||
+ device == &intel_baytrail_t_device) {
+ *ver = 7;
+ } else if (device == &intel_hsw_gt1_device || device == &intel_hsw_gt2_device
+ || device == &intel_hsw_gt3_device) {
+ *ver = 75;
+ } else
+ return CL_INVALID_VALUE;
+
+ return CL_SUCCESS;
+}
+#undef DECL_FIELD
+
+#define _DECL_FIELD(FIELD) \
+ if (param_value && param_value_size < sizeof(FIELD)) \
+ return CL_INVALID_VALUE; \
+ if (param_value_size_ret != NULL) \
+ *param_value_size_ret = sizeof(FIELD); \
+ if (param_value) \
+ memcpy(param_value, &FIELD, sizeof(FIELD)); \
+ return CL_SUCCESS;
+
+#define DECL_FIELD(CASE,FIELD) \
+ case JOIN(CL_KERNEL_,CASE): \
+ _DECL_FIELD(FIELD)
+
+#include "cl_kernel.h"
+#include "cl_program.h"
+static int
+cl_check_builtin_kernel_dimension(cl_kernel kernel, cl_device_id device)
+{
+ const char * n = cl_kernel_get_name(kernel);
+ const char * builtin_kernels_2d = "__cl_copy_image_2d_to_2d;__cl_copy_image_2d_to_buffer;__cl_copy_buffer_to_image_2d;__cl_fill_image_2d;__cl_fill_image_2d_array;";
+ const char * builtin_kernels_3d = "__cl_copy_image_3d_to_2d;__cl_copy_image_2d_to_3d;__cl_copy_image_3d_to_3d;__cl_copy_image_3d_to_buffer;__cl_copy_buffer_to_image_3d;__cl_fill_image_3d";
+ if (!strstr(device->built_in_kernels, n)){
+ return 0;
+ }else if(strstr(builtin_kernels_2d, n)){
+ return 2;
+ }else if(strstr(builtin_kernels_3d, n)){
+ return 3;
+ }else
+ return 1;
+
+}
+
+LOCAL size_t
+cl_get_kernel_max_wg_sz(cl_kernel kernel)
+{
+ size_t work_group_size;
+ int simd_width = interp_kernel_get_simd_width(kernel->opaque);
+ int vendor_id = kernel->program->ctx->device->vendor_id;
+ if (!interp_kernel_use_slm(kernel->opaque)) {
+ if (!IS_BAYTRAIL_T(vendor_id) || simd_width == 16)
+ work_group_size = simd_width * 64;
+ else
+ work_group_size = kernel->program->ctx->device->max_compute_unit *
+ kernel->program->ctx->device->max_thread_per_unit * simd_width;
+ } else
+ work_group_size = kernel->program->ctx->device->max_work_group_size /
+ (16 / simd_width);
+ return work_group_size;
+}
+
+LOCAL cl_int
+cl_get_kernel_workgroup_info(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_value_size_ret)
+{
+ int err = CL_SUCCESS;
+ int dimension = 0;
+ if (UNLIKELY(device != &intel_ivb_gt1_device &&
+ device != &intel_ivb_gt2_device &&
+ device != &intel_baytrail_t_device &&
+ device != &intel_hsw_gt1_device &&
+ device != &intel_hsw_gt2_device &&
+ device != &intel_hsw_gt3_device))
+ return CL_INVALID_DEVICE;
+
+ CHECK_KERNEL(kernel);
+ switch (param_name) {
+ case CL_KERNEL_WORK_GROUP_SIZE:
+ {
+ if (param_value && param_value_size < sizeof(size_t))
+ return CL_INVALID_VALUE;
+ if (param_value_size_ret != NULL)
+ *param_value_size_ret = sizeof(size_t);
+ if (param_value) {
+ size_t work_group_size = cl_get_kernel_max_wg_sz(kernel);
+ *(size_t*)param_value = work_group_size;
+ return CL_SUCCESS;
+ }
+ }
+ DECL_FIELD(PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device->preferred_wg_sz_mul)
+ case CL_KERNEL_LOCAL_MEM_SIZE:
+ {
+ size_t local_mem_sz = interp_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz;
+ _DECL_FIELD(local_mem_sz)
+ }
+ DECL_FIELD(COMPILE_WORK_GROUP_SIZE, kernel->compile_wg_sz)
+ DECL_FIELD(PRIVATE_MEM_SIZE, kernel->stack_size)
+ case CL_KERNEL_GLOBAL_WORK_SIZE:
+ dimension = cl_check_builtin_kernel_dimension(kernel, device);
+ if ( !dimension ) return CL_INVALID_VALUE;
+ if (param_value_size_ret != NULL)
+ *param_value_size_ret = sizeof(device->max_1d_global_work_sizes);
+ if (param_value) {
+ if (dimension == 1) {
+ memcpy(param_value, device->max_1d_global_work_sizes, sizeof(device->max_1d_global_work_sizes));
+ }else if(dimension == 2){
+ memcpy(param_value, device->max_2d_global_work_sizes, sizeof(device->max_2d_global_work_sizes));
+ }else if(dimension == 3){
+ memcpy(param_value, device->max_3d_global_work_sizes, sizeof(device->max_3d_global_work_sizes));
+ }else
+ return CL_INVALID_VALUE;
+
+ return CL_SUCCESS;
+ }
+ default:
+ return CL_INVALID_VALUE;
+ };
+
+error:
+ return err;
+}
+
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
new file mode 100644
index 0000000..31bce47
--- /dev/null
+++ b/src/cl_device_id.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_DEVICE_ID_H__
+#define __CL_DEVICE_ID_H__
+
+/* Store complete information about the device */
+struct _cl_device_id {
+ DEFINE_ICD(dispatch)
+ cl_device_type device_type;
+ cl_uint vendor_id;
+ cl_uint max_compute_unit; // maximum EU number
+ cl_uint max_thread_per_unit; // maximum EU threads per EU.
+ cl_uint max_work_item_dimensions; // should be 3.
+ size_t max_work_item_sizes[3]; // equal to maximum work group size.
+ size_t max_work_group_size; // maximum work group size under simd16 mode.
+ size_t max_1d_global_work_sizes[3]; // maximum 1d global work size for builtin kernels.
+ size_t max_2d_global_work_sizes[3]; // maximum 2d global work size for builtin kernels.
+ size_t max_3d_global_work_sizes[3]; // maximum 3d global work size for builtin kernels.
+ cl_uint preferred_vector_width_char;
+ cl_uint preferred_vector_width_short;
+ cl_uint preferred_vector_width_int;
+ cl_uint preferred_vector_width_long;
+ cl_uint preferred_vector_width_float;
+ cl_uint preferred_vector_width_double;
+ cl_uint preferred_vector_width_half;
+ cl_uint native_vector_width_char;
+ cl_uint native_vector_width_short;
+ cl_uint native_vector_width_int;
+ cl_uint native_vector_width_long;
+ cl_uint native_vector_width_float;
+ cl_uint native_vector_width_double;
+ cl_uint native_vector_width_half;
+ cl_uint max_clock_frequency;
+ cl_uint address_bits;
+ cl_ulong max_mem_alloc_size;
+ cl_bool image_support;
+ cl_uint max_read_image_args;
+ cl_uint max_write_image_args;
+ size_t image2d_max_width;
+ size_t image_max_array_size;
+ size_t image2d_max_height;
+ size_t image3d_max_width;
+ size_t image3d_max_height;
+ size_t image3d_max_depth;
+ cl_ulong image_mem_size;
+ cl_uint max_samplers;
+ size_t max_parameter_size;
+ cl_uint mem_base_addr_align;
+ cl_uint min_data_type_align_size;
+ cl_device_fp_config single_fp_config;
+ cl_device_fp_config double_fp_config;
+ cl_device_mem_cache_type global_mem_cache_type;
+ cl_uint global_mem_cache_line_size;
+ cl_ulong global_mem_cache_size;
+ cl_ulong global_mem_size;
+ cl_ulong max_constant_buffer_size;
+ cl_uint max_constant_args;
+ cl_device_local_mem_type local_mem_type;
+ cl_ulong local_mem_size;
+ cl_ulong scratch_mem_size;
+ cl_bool error_correction_support;
+ cl_bool host_unified_memory;
+ size_t profiling_timer_resolution;
+ cl_bool endian_little;
+ cl_bool available;
+ cl_bool compiler_available;
+ cl_bool linker_available;
+ cl_device_exec_capabilities execution_capabilities;
+ cl_command_queue_properties queue_properties;
+ cl_platform_id platform;
+ size_t printf_buffer_size;
+ cl_bool interop_user_sync;
+ const char *name;
+ const char *vendor;
+ const char *version;
+ const char *profile;
+ const char *opencl_c_version;
+ const char *extensions;
+ const char *driver_version;
+ const char *built_in_kernels;
+ size_t name_sz;
+ size_t vendor_sz;
+ size_t version_sz;
+ size_t profile_sz;
+ size_t opencl_c_version_sz;
+ size_t extensions_sz;
+ size_t driver_version_sz;
+ size_t built_in_kernels_sz;
+ /* Kernel specific info that we're assigning statically */
+ size_t preferred_wg_sz_mul;
+ /* SubDevice specific info */
+ cl_device_id parent_device;
+ cl_uint partition_max_sub_device;
+ cl_device_partition_property partition_property[3];
+ cl_device_affinity_domain affinity_domain;
+ cl_device_partition_property partition_type[3];
+ cl_uint device_reference_count;
+};
+
+/* Get a device from the given platform */
+extern cl_int cl_get_device_ids(cl_platform_id platform,
+ cl_device_type device_type,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices);
+
+/* Get the intel GPU device we currently have in this machine (if any) */
+extern cl_device_id cl_get_gt_device(void);
+
+/* Provide info about the device */
+extern cl_int cl_get_device_info(cl_device_id device,
+ cl_device_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret);
+
+extern cl_int cl_get_kernel_workgroup_info(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret);
+/* Returns the Gen device ID */
+extern cl_int cl_device_get_version(cl_device_id device, cl_int *ver);
+extern size_t cl_get_kernel_max_wg_sz(cl_kernel);
+
+#endif /* __CL_DEVICE_ID_H__ */
+
diff --git a/src/cl_driver.cpp b/src/cl_driver.cpp
new file mode 100644
index 0000000..19ac4ae
--- /dev/null
+++ b/src/cl_driver.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+extern "C" {
+#include "intel/intel_driver.h"
+#include "cl_utils.h"
+#include <stdlib.h>
+#include <string.h>
+}
+
+namespace
+{
+ /*! Just use c++ pre-main to initialize the call-backs */
+ struct OCLDriverCallBackInitializer
+ {
+ OCLDriverCallBackInitializer(void) {
+ intel_setup_callbacks();
+ }
+ };
+
+ /*! Set the call backs at pre-main time */
+ static OCLDriverCallBackInitializer cbInitializer;
+} /* namespace */
+
diff --git a/src/cl_driver.h b/src/cl_driver.h
new file mode 100644
index 0000000..9cdba98
--- /dev/null
+++ b/src/cl_driver.h
@@ -0,0 +1,383 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_DRIVER_H__
+#define __CL_DRIVER_H__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "cl_driver_type.h"
+/* Various limitations we should remove actually */
+#define GEN_MAX_SURFACES 256
+#define GEN_MAX_SAMPLERS 16
+
+/**************************************************************************
+ * cl_driver:
+ * Hide behind some call backs the buffer allocation / deallocation ... This
+ * will allow us to make the use of a software performance simulator easier and
+ * to minimize the code specific for the HW and for the simulator
+ **************************************************************************/
+/* Create a new driver */
+typedef cl_driver (cl_driver_new_cb)(cl_context_prop);
+extern cl_driver_new_cb *cl_driver_new;
+
+/* Delete the driver */
+typedef void (cl_driver_delete_cb)(cl_driver);
+extern cl_driver_delete_cb *cl_driver_delete;
+
+/* Get the buffer manager from the driver */
+typedef cl_buffer_mgr (cl_driver_get_bufmgr_cb)(cl_driver);
+extern cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr;
+
+/* Get the Gen version from the driver */
+typedef uint32_t (cl_driver_get_ver_cb)(cl_driver);
+extern cl_driver_get_ver_cb *cl_driver_get_ver;
+
+/**************************************************************************
+ * GPGPU command streamer
+ **************************************************************************/
+/* Describe texture tiling */
+typedef enum cl_gpgpu_tiling {
+ GPGPU_NO_TILE = 0,
+ GPGPU_TILE_X = 1,
+ GPGPU_TILE_Y = 2,
+} cl_gpgpu_tiling;
+
+/* Cache control options for gen7 */
+typedef enum cl_cache_control {
+ cc_gtt = 0x0,
+ cc_l3 = 0x1,
+ cc_llc = 0x2,
+ cc_llc_l3 = 0x3
+} cl_cache_control;
+
+/* L3 Cache control options for gen75 */
+typedef enum cl_l3_cache_control {
+ l3cc_uc = 0x0,
+ l3cc_ec = 0x1
+} cl_l3_cache_control;
+
+/* LLCCC Cache control options for gen75 */
+typedef enum cl_llccc_cache_control {
+ llccc_pte = 0x0<<1,
+ llccc_uc = 0x1<<1,
+ llccc_ec = 0x2<<1,
+ llccc_ucllc = 0x3<<1
+} cl_llccc_cache_control;
+
+typedef enum gpu_command_status {
+ command_queued = 3,
+ command_submitted = 2,
+ command_running = 1,
+ command_complete = 0
+} gpu_command_status;
+
+/* Use this structure to bind kernels in the gpgpu state */
+typedef struct cl_gpgpu_kernel {
+ const char *name; /* kernel name and bo name */
+ uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */
+ uint32_t curbe_sz; /* total size of all curbes */
+ cl_buffer bo; /* kernel code in the proper addr space */
+ int32_t barrierID; /* barrierID for _this_ kernel */
+ uint32_t use_slm:1; /* For gen7 (automatic barrier management) */
+ uint32_t thread_n:15; /* For gen7 (automatic barrier management) */
+ uint32_t slm_sz; /* For gen7 (automatic SLM allocation) */
+} cl_gpgpu_kernel;
+
+/* Create a new gpgpu state */
+typedef cl_gpgpu (cl_gpgpu_new_cb)(cl_driver);
+extern cl_gpgpu_new_cb *cl_gpgpu_new;
+
+/* Delete the gpgpu state */
+typedef void (cl_gpgpu_delete_cb)(cl_gpgpu);
+extern cl_gpgpu_delete_cb *cl_gpgpu_delete;
+
+/* Synchonize GPU with CPU */
+typedef void (cl_gpgpu_sync_cb)(void*);
+extern cl_gpgpu_sync_cb *cl_gpgpu_sync;
+
+/* Bind a regular unformatted buffer */
+typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, uint32_t size, uint8_t bti);
+extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
+
+/* bind samplers defined in both kernel and kernel args. */
+typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
+extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
+
+/* get the default cache control value. */
+typedef uint32_t (cl_gpgpu_get_cache_ctrl_cb)();
+extern cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl;
+/* Set a 2d texture */
+typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
+ uint32_t id,
+ cl_buffer obj_bo,
+ uint32_t obj_bo_offset,
+ uint32_t format,
+ uint32_t type,
+ int32_t w,
+ int32_t h,
+ int32_t depth,
+ int pitch,
+ cl_gpgpu_tiling tiling);
+
+extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
+
+/* Setup a stack */
+typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint);
+extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
+
+/* Setup scratch */
+typedef int (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t per_thread_size);
+extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch;
+
+/* Configure internal state */
+typedef int (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry, int profiling);
+extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
+
+/* Set the buffer object where to report performance counters */
+typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu, cl_buffer perf);
+extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters;
+
+/* Fills current curbe buffer with data */
+typedef int (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, uint32_t size);
+extern cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes;
+
+typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size, uint8_t bti);
+extern cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer;
+
+/* Setup all indirect states */
+typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu, cl_gpgpu_kernel *kernel);
+extern cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup;
+
+/* Upload the constant samplers as specified inside the OCL kernel */
+typedef void (cl_gpgpu_upload_samplers_cb)(cl_gpgpu *state, const void *data, uint32_t n);
+extern cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers;
+
+/* Set a sampler */
+typedef void (cl_gpgpu_set_sampler_cb)(cl_gpgpu, uint32_t index, uint32_t non_normalized);
+extern cl_gpgpu_set_sampler_cb *cl_gpgpu_set_sampler;
+
+/* Allocate the batch buffer and return the BO used for the batch buffer */
+typedef int (cl_gpgpu_batch_reset_cb)(cl_gpgpu, size_t sz);
+extern cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset;
+
+/* Atomic begin, pipeline select, urb, pipeline state and constant buffer */
+typedef void (cl_gpgpu_batch_start_cb)(cl_gpgpu);
+extern cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start;
+
+/* atomic end with possibly inserted flush */
+typedef void (cl_gpgpu_batch_end_cb)(cl_gpgpu, int32_t flush_mode);
+extern cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end;
+
+/* Flush the command buffer */
+typedef void (cl_gpgpu_flush_cb)(cl_gpgpu);
+extern cl_gpgpu_flush_cb *cl_gpgpu_flush;
+
+/* new a event for a batch buffer */
+typedef cl_gpgpu_event (cl_gpgpu_event_new_cb)(cl_gpgpu);
+extern cl_gpgpu_event_new_cb *cl_gpgpu_event_new;
+
+/* update the batch buffer of this event */
+typedef int (cl_gpgpu_event_update_status_cb)(cl_gpgpu_event, int);
+extern cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status;
+
+/* flush the batch buffer of this event */
+typedef void (cl_gpgpu_event_flush_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush;
+
+/* cancel exec batch buffer of this event */
+typedef void (cl_gpgpu_event_cancel_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_cancel_cb *cl_gpgpu_event_cancel;
+
+/* delete a gpgpu event */
+typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
+
+/* Get a event time stamp */
+typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu, cl_gpgpu_event, int, uint64_t*);
+extern cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp;
+
+/* Get current GPU time stamp */
+typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_gpgpu, uint64_t*);
+extern cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp;
+
+/* Get current batch buffer handle */
+typedef void* (cl_gpgpu_ref_batch_buf_cb)(cl_gpgpu);
+extern cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf;
+
+/* Get release batch buffer handle */
+typedef void (cl_gpgpu_unref_batch_buf_cb)(void*);
+extern cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf;
+
+/* Set the printf buffer */
+typedef int (cl_gpgpu_set_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t, uint32_t, uint8_t);
+extern cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer;
+
+/* get the printf buffer offset in the apeture*/
+typedef unsigned long (cl_gpgpu_reloc_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t);
+extern cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer;
+
+/* map the printf buffer */
+typedef void* (cl_gpgpu_map_printf_buffer_cb)(cl_gpgpu, uint32_t);
+extern cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer;
+
+/* unmap the printf buffer */
+typedef void (cl_gpgpu_unmap_printf_buffer_cb)(cl_gpgpu, uint32_t);
+extern cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer;
+
+/* release the printf buffer */
+typedef unsigned long (cl_gpgpu_release_printf_buffer_cb)(cl_gpgpu, uint32_t);
+extern cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer;
+
+/* Set the last printfset pointer */
+typedef int (cl_gpgpu_set_printf_info_cb)(cl_gpgpu, void *, size_t*);
+extern cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info;
+
+/* Get the last printfset pointer */
+typedef void* (cl_gpgpu_get_printf_info_cb)(cl_gpgpu, size_t*);
+extern cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info;
+
+/* Will spawn all threads */
+typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
+ uint32_t simd_sz,
+ uint32_t thread_n,
+ const size_t global_wk_off[3],
+ const size_t global_wk_sz[3],
+ const size_t local_wk_sz[3]);
+extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
+
+/**************************************************************************
+ * Buffer
+ **************************************************************************/
+/* Allocate a buffer */
+typedef cl_buffer (cl_buffer_alloc_cb)(cl_buffer_mgr, const char*, size_t, size_t);
+extern cl_buffer_alloc_cb *cl_buffer_alloc;
+
+/* Set a buffer's tiling mode */
+typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
+extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling;
+
+#include "cl_context.h"
+#include "cl_mem.h"
+typedef struct _cl_context *cl_context;
+
+typedef cl_buffer (cl_buffer_alloc_from_texture_cb)(cl_context, unsigned int, int, unsigned int,
+ struct _cl_mem_image *gl_image);
+extern cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture;
+
+typedef void (cl_buffer_release_from_texture_cb)(cl_context, unsigned int, int, unsigned int);
+extern cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture;
+
+typedef cl_buffer (cl_buffer_get_buffer_from_libva_cb)(cl_context ctx, unsigned int bo_name, size_t *sz);
+extern cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva;
+
+typedef cl_buffer (cl_buffer_get_image_from_libva_cb)(cl_context ctx, unsigned int bo_name, struct _cl_mem_image *image, unsigned int offset);
+extern cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva;
+
+/* Unref a buffer and destroy it if no more ref */
+typedef int (cl_buffer_unreference_cb)(cl_buffer);
+extern cl_buffer_unreference_cb *cl_buffer_unreference;
+
+/* Add one more ref on a buffer */
+typedef void (cl_buffer_reference_cb)(cl_buffer);
+extern cl_buffer_reference_cb *cl_buffer_reference;
+
+/* Map a buffer */
+typedef int (cl_buffer_map_cb)(cl_buffer, uint32_t write_enable);
+extern cl_buffer_map_cb *cl_buffer_map;
+
+/* Unmap a buffer */
+typedef int (cl_buffer_unmap_cb)(cl_buffer);
+extern cl_buffer_unmap_cb *cl_buffer_unmap;
+
+/* Map a buffer in the GTT domain */
+typedef int (cl_buffer_map_gtt_cb)(cl_buffer);
+extern cl_buffer_map_gtt_cb *cl_buffer_map_gtt;
+
+/* Map a buffer in the GTT domain, non waiting the GPU read or write*/
+typedef int (cl_buffer_map_gtt_unsync_cb)(cl_buffer);
+extern cl_buffer_map_gtt_unsync_cb *cl_buffer_map_gtt_unsync;
+
+/* Unmap a buffer in the GTT domain */
+typedef int (cl_buffer_unmap_gtt_cb)(cl_buffer);
+extern cl_buffer_unmap_gtt_cb *cl_buffer_unmap_gtt;
+
+/* Get the virtual address (when mapped) */
+typedef void* (cl_buffer_get_virtual_cb)(cl_buffer);
+extern cl_buffer_get_virtual_cb *cl_buffer_get_virtual;
+
+/* Get the size of the buffer */
+typedef size_t (cl_buffer_get_size_cb)(cl_buffer);
+extern cl_buffer_get_size_cb *cl_buffer_get_size;
+
+/* Pin a buffer */
+typedef int (cl_buffer_pin_cb)(cl_buffer, uint32_t alignment);
+extern cl_buffer_pin_cb *cl_buffer_pin;
+
+/* Unpin a buffer */
+typedef int (cl_buffer_unpin_cb)(cl_buffer);
+extern cl_buffer_unpin_cb *cl_buffer_unpin;
+
+/* Fill data in the buffer */
+typedef int (cl_buffer_subdata_cb)(cl_buffer, unsigned long, unsigned long, const void*);
+extern cl_buffer_subdata_cb *cl_buffer_subdata;
+
+/* Wait for all pending rendering for this buffer to complete */
+typedef int (cl_buffer_wait_rendering_cb) (cl_buffer);
+extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering;
+
+typedef int (cl_buffer_get_fd_cb)(cl_buffer, int *fd);
+extern cl_buffer_get_fd_cb *cl_buffer_get_fd;
+
+/* Get the device id */
+typedef int (cl_driver_get_device_id_cb)(void);
+extern cl_driver_get_device_id_cb *cl_driver_get_device_id;
+
+/**************************************************************************
+ * cl_khr_gl_sharing.
+ **************************************************************************/
+typedef int (cl_gl_acquire_texture_cb)(void *driver, void *ctx, int target,
+ int level, int texture, void*user_data);
+extern cl_gl_acquire_texture_cb *cl_gl_acquire_texture;
+
+typedef int (cl_gl_release_texture_cb)(void *driver, void *ctx, int target,
+ int level, int texture);
+extern cl_gl_release_texture_cb *cl_gl_release_texture;
+
+typedef int (cl_gl_acquire_buffer_object_cb)(void *driver, void *ctx,
+ int bufobj, void* user_data);
+extern cl_gl_acquire_buffer_object_cb *cl_gl_acquire_buffer_object;
+
+typedef int (cl_gl_release_buffer_object_cb)(void *driver, void *ctx, int bufobj);
+extern cl_gl_release_buffer_object_cb *cl_gl_release_buffer_object;
+
+typedef int (cl_gl_acquire_render_buffer_cb)(void *driver, void *ctx,
+ int rb, void* user_data);
+extern cl_gl_acquire_render_buffer_cb *cl_gl_acquire_render_buffer;
+
+typedef int (cl_gl_release_render_buffer_cb)(void *driver, void *ctx, int rb);
+extern cl_gl_release_render_buffer_cb *cl_gl_release_render_buffer;
+
+#ifndef DEFAULT_DRIVER_DIR
+/* this is normally defined in Mesa/configs/default with DRI_DRIVER_SEARCH_PATH */
+#define DEFAULT_DRIVER_DIR "/usr/local/lib/dri"
+#endif
+
+#endif /* __CL_DRIVER_H__ */
+
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
new file mode 100644
index 0000000..72f25d9
--- /dev/null
+++ b/src/cl_driver_defs.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "cl_driver.h"
+#include "cl_utils.h"
+#include <stdlib.h>
+
+/* Driver */
+LOCAL cl_driver_new_cb *cl_driver_new = NULL;
+LOCAL cl_driver_delete_cb *cl_driver_delete = NULL;
+LOCAL cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr = NULL;
+LOCAL cl_driver_get_ver_cb *cl_driver_get_ver = NULL;
+LOCAL cl_driver_get_device_id_cb *cl_driver_get_device_id = NULL;
+
+/* Buffer */
+LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
+LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL;
+LOCAL cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture = NULL;
+LOCAL cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture = NULL;
+LOCAL cl_buffer_reference_cb *cl_buffer_reference = NULL;
+LOCAL cl_buffer_unreference_cb *cl_buffer_unreference = NULL;
+LOCAL cl_buffer_map_cb *cl_buffer_map = NULL;
+LOCAL cl_buffer_unmap_cb *cl_buffer_unmap = NULL;
+LOCAL cl_buffer_map_gtt_cb *cl_buffer_map_gtt = NULL;
+LOCAL cl_buffer_map_gtt_unsync_cb *cl_buffer_map_gtt_unsync = NULL;
+LOCAL cl_buffer_unmap_gtt_cb *cl_buffer_unmap_gtt = NULL;
+LOCAL cl_buffer_get_virtual_cb *cl_buffer_get_virtual = NULL;
+LOCAL cl_buffer_get_size_cb *cl_buffer_get_size = NULL;
+LOCAL cl_buffer_pin_cb *cl_buffer_pin = NULL;
+LOCAL cl_buffer_unpin_cb *cl_buffer_unpin = NULL;
+LOCAL cl_buffer_subdata_cb *cl_buffer_subdata = NULL;
+LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
+LOCAL cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva = NULL;
+LOCAL cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva = NULL;
+LOCAL cl_buffer_get_fd_cb *cl_buffer_get_fd = NULL;
+
+/* cl_khr_gl_sharing */
+LOCAL cl_gl_acquire_texture_cb *cl_gl_acquire_texture = NULL;
+LOCAL cl_gl_release_texture_cb *cl_gl_release_texture = NULL;
+LOCAL cl_gl_acquire_buffer_object_cb *cl_gl_acquire_buffer_object = NULL;
+LOCAL cl_gl_release_buffer_object_cb *cl_gl_release_buffer_object = NULL;
+LOCAL cl_gl_acquire_render_buffer_cb *cl_gl_acquire_render_buffer = NULL;
+LOCAL cl_gl_release_render_buffer_cb *cl_gl_release_render_buffer = NULL;
+/* GPGPU */
+LOCAL cl_gpgpu_new_cb *cl_gpgpu_new = NULL;
+LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
+LOCAL cl_gpgpu_sync_cb *cl_gpgpu_sync = NULL;
+LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
+LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
+LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
+LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
+LOCAL cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl = NULL;
+LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
+LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
+LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
+LOCAL cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes = NULL;
+LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL;
+LOCAL cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers = NULL;
+LOCAL cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset = NULL;
+LOCAL cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start = NULL;
+LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
+LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
+LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
+LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
+LOCAL cl_gpgpu_event_new_cb *cl_gpgpu_event_new = NULL;
+LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
+LOCAL cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush = NULL;
+LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
+LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp = NULL;
+LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL;
+LOCAL cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf = NULL;
+LOCAL cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf = NULL;
+LOCAL cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer = NULL;
+LOCAL cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer = NULL;
+LOCAL cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer = NULL;
+LOCAL cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer = NULL;
+LOCAL cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info = NULL;
+LOCAL cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info = NULL;
+LOCAL cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer = NULL;
+
diff --git a/src/cl_driver_type.h b/src/cl_driver_type.h
new file mode 100644
index 0000000..891a33c
--- /dev/null
+++ b/src/cl_driver_type.h
@@ -0,0 +1,24 @@
+/**************************************************************************
+ * cl_driver:
+ * Hide behind some call backs the buffer allocation / deallocation ... This
+ * will allow us to make the use of a software performance simulator easier and
+ * to minimize the code specific for the HW and for the simulator
+ **************************************************************************/
+
+/* Encapsulates command buffer / data buffer / kernels */
+typedef struct _cl_buffer *cl_buffer;
+
+/* Encapsulates buffer manager */
+typedef struct _cl_buffer_mgr *cl_buffer_mgr;
+
+/* Encapsulates the driver backend functionalities */
+typedef struct _cl_driver *cl_driver;
+
+/* Encapsulates the gpgpu stream of commands */
+typedef struct _cl_gpgpu *cl_gpgpu;
+
+/* Encapsulates the event of a command stream */
+typedef struct _cl_gpgpu_event *cl_gpgpu_event;
+
+typedef struct _cl_context_prop *cl_context_prop;
+typedef struct _cl_sampler *cl_sampler;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
new file mode 100644
index 0000000..af118ad
--- /dev/null
+++ b/src/cl_enqueue.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang <rong.r.yang at intel.com>
+ */
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "cl_enqueue.h"
+#include "cl_image.h"
+#include "cl_driver.h"
+#include "cl_event.h"
+#include "cl_command_queue.h"
+#include "cl_utils.h"
+
+
+cl_int cl_enqueue_read_buffer(enqueue_data* data)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = data->mem_obj;
+ assert(mem->type == CL_MEM_BUFFER_TYPE ||
+ mem->type == CL_MEM_SUBBUFFER_TYPE);
+ void* src_ptr;
+ struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+
+ if (!(src_ptr = cl_mem_map_auto(data->mem_obj))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size);
+
+ err = cl_mem_unmap_auto(data->mem_obj);
+
+error:
+ return err;
+}
+
+cl_int cl_enqueue_read_buffer_rect(enqueue_data* data)
+{
+ cl_int err = CL_SUCCESS;
+ void* src_ptr;
+ void* dst_ptr;
+
+ const size_t* origin = data->origin;
+ const size_t* host_origin = data->host_origin;
+ const size_t* region = data->region;
+
+ cl_mem mem = data->mem_obj;
+ assert(mem->type == CL_MEM_BUFFER_TYPE ||
+ mem->type == CL_MEM_SUBBUFFER_TYPE);
+ struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+
+ if (!(src_ptr = cl_mem_map_auto(mem))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
+ src_ptr = (char*)src_ptr + offset + buffer->sub_offset;
+
+ offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
+ dst_ptr = (char *)data->ptr + offset;
+
+ if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
+ (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
+ {
+ memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+ }
+ else {
+ cl_uint y, z;
+ for (z = 0; z < region[2]; z++) {
+ const char* src = src_ptr;
+ char* dst = dst_ptr;
+ for (y = 0; y < region[1]; y++) {
+ memcpy(dst, src, region[0]);
+ src += data->row_pitch;
+ dst += data->host_row_pitch;
+ }
+ src_ptr = (char*)src_ptr + data->slice_pitch;
+ dst_ptr = (char*)dst_ptr + data->host_slice_pitch;
+ }
+ }
+
+ err = cl_mem_unmap_auto(mem);
+
+error:
+ return err;
+}
+
+cl_int cl_enqueue_write_buffer(enqueue_data *data)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = data->mem_obj;
+ assert(mem->type == CL_MEM_BUFFER_TYPE ||
+ mem->type == CL_MEM_SUBBUFFER_TYPE);
+ struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+ void* dst_ptr;
+
+ if (!(dst_ptr = cl_mem_map_auto(data->mem_obj))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ memcpy((char*)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
+
+ err = cl_mem_unmap_auto(data->mem_obj);
+
+error:
+ return err;
+}
+
+cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
+{
+ cl_int err = CL_SUCCESS;
+ void* src_ptr;
+ void* dst_ptr;
+
+ const size_t* origin = data->origin;
+ const size_t* host_origin = data->host_origin;
+ const size_t* region = data->region;
+
+ cl_mem mem = data->mem_obj;
+ assert(mem->type == CL_MEM_BUFFER_TYPE ||
+ mem->type == CL_MEM_SUBBUFFER_TYPE);
+ struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+
+ if (!(dst_ptr = cl_mem_map_auto(mem))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
+ dst_ptr = (char *)dst_ptr + offset + buffer->sub_offset;
+
+ offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
+ src_ptr = (char*)data->const_ptr + offset;
+
+ if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
+ (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
+ {
+ memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+ }
+ else {
+ cl_uint y, z;
+ for (z = 0; z < region[2]; z++) {
+ const char* src = src_ptr;
+ char* dst = dst_ptr;
+ for (y = 0; y < region[1]; y++) {
+ memcpy(dst, src, region[0]);
+ src += data->host_row_pitch;
+ dst += data->row_pitch;
+ }
+ src_ptr = (char*)src_ptr + data->host_slice_pitch;
+ dst_ptr = (char*)dst_ptr + data->slice_pitch;
+ }
+ }
+
+ err = cl_mem_unmap_auto(mem);
+
+error:
+ return err;
+}
+
+
+cl_int cl_enqueue_read_image(enqueue_data *data)
+{
+ cl_int err = CL_SUCCESS;
+ void* src_ptr;
+
+ cl_mem mem = data->mem_obj;
+ CHECK_IMAGE(mem, image);
+ const size_t* origin = data->origin;
+ const size_t* region = data->region;
+
+ if (!(src_ptr = cl_mem_map_auto(mem))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+ src_ptr = (char*)src_ptr + offset;
+
+ if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
+ (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
+ {
+ memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+ }
+ else {
+ cl_uint y, z;
+ for (z = 0; z < region[2]; z++) {
+ const char* src = src_ptr;
+ char* dst = data->ptr;
+ for (y = 0; y < region[1]; y++) {
+ memcpy(dst, src, image->bpp*region[0]);
+ src += image->row_pitch;
+ dst += data->row_pitch;
+ }
+ src_ptr = (char*)src_ptr + image->slice_pitch;
+ data->ptr = (char*)data->ptr + data->slice_pitch;
+ }
+ }
+
+ err = cl_mem_unmap_auto(mem);
+
+error:
+ return err;
+
+}
+
+cl_int cl_enqueue_write_image(enqueue_data *data)
+{
+ cl_int err = CL_SUCCESS;
+ void* dst_ptr;
+
+ cl_mem mem = data->mem_obj;
+ CHECK_IMAGE(mem, image);
+
+ if (!(dst_ptr = cl_mem_map_auto(mem))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+ //dst need to add offset
+ cl_mem_copy_image_region(data->origin, data->region, dst_ptr,
+ image->row_pitch, image->slice_pitch,
+ data->const_ptr, data->row_pitch,
+ data->slice_pitch, image, CL_TRUE, CL_FALSE);
+ err = cl_mem_unmap_auto(mem);
+
+error:
+ return err;
+
+}
+
+cl_int cl_enqueue_map_buffer(enqueue_data *data)
+{
+ void *ptr = NULL;
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = data->mem_obj;
+ assert(mem->type == CL_MEM_BUFFER_TYPE ||
+ mem->type == CL_MEM_SUBBUFFER_TYPE);
+ struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+
+ if(data->unsync_map == 1)
+ //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
+ ptr = cl_mem_map_gtt(mem);
+ else
+ ptr = cl_mem_map_auto(mem);
+
+ if (ptr == NULL) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+ data->ptr = ptr;
+
+ if(mem->flags & CL_MEM_USE_HOST_PTR) {
+ assert(mem->host_ptr);
+ ptr = (char*)ptr + data->offset + buffer->sub_offset;
+ memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size);
+ }
+
+error:
+ return err;
+}
+
+cl_int cl_enqueue_map_image(enqueue_data *data)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = data->mem_obj;
+ void *ptr = NULL;
+ size_t row_pitch = 0;
+ CHECK_IMAGE(mem, image);
+
+ if(data->unsync_map == 1)
+ //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
+ ptr = cl_mem_map_gtt(mem);
+ else
+ ptr = cl_mem_map_auto(mem);
+
+ if (ptr == NULL) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+ data->ptr = ptr;
+ if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ row_pitch = image->slice_pitch;
+ else
+ row_pitch = image->row_pitch;
+
+ if(mem->flags & CL_MEM_USE_HOST_PTR) {
+ assert(mem->host_ptr);
+ //src and dst need add offset in function cl_mem_copy_image_region
+ cl_mem_copy_image_region(data->origin, data->region,
+ mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
+ data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
+ }
+
+error:
+ return err;
+}
+
+cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
+{
+ cl_int err = CL_SUCCESS;
+ int i, j;
+ size_t mapped_size = 0;
+ size_t origin[3], region[3];
+ void * v_ptr = NULL;
+ void * mapped_ptr = data->ptr;
+ cl_mem memobj = data->mem_obj;
+ size_t row_pitch = 0;
+
+ assert(memobj->mapped_ptr_sz >= memobj->map_ref);
+ INVALID_VALUE_IF(!mapped_ptr);
+ for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+ if (memobj->mapped_ptr[i].ptr == mapped_ptr) {
+ memobj->mapped_ptr[i].ptr = NULL;
+ mapped_size = memobj->mapped_ptr[i].size;
+ v_ptr = memobj->mapped_ptr[i].v_ptr;
+ for(j=0; j<3; j++) {
+ region[j] = memobj->mapped_ptr[i].region[j];
+ origin[j] = memobj->mapped_ptr[i].origin[j];
+ memobj->mapped_ptr[i].region[j] = 0;
+ memobj->mapped_ptr[i].origin[j] = 0;
+ }
+ memobj->mapped_ptr[i].size = 0;
+ memobj->mapped_ptr[i].v_ptr = NULL;
+ memobj->map_ref--;
+ break;
+ }
+ }
+ /* can not find a mapped address? */
+ INVALID_VALUE_IF(i == memobj->mapped_ptr_sz);
+
+ if (memobj->flags & CL_MEM_USE_HOST_PTR) {
+ if(memobj->type == CL_MEM_BUFFER_TYPE ||
+ memobj->type == CL_MEM_SUBBUFFER_TYPE) {
+ assert(mapped_ptr >= memobj->host_ptr &&
+ mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
+ /* Sync the data. */
+ memcpy(v_ptr, mapped_ptr, mapped_size);
+ } else {
+ CHECK_IMAGE(memobj, image);
+
+ if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ row_pitch = image->slice_pitch;
+ else
+ row_pitch = image->row_pitch;
+ //v_ptr have added offset, host_ptr have not added offset.
+ cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
+ memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
+ image, CL_FALSE, CL_TRUE);
+ }
+ } else {
+ assert(v_ptr == mapped_ptr);
+ }
+
+ cl_mem_unmap_auto(memobj);
+
+ /* shrink the mapped slot. */
+ if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
+ int j = 0;
+ cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+ sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz/2));
+ if (!new_ptr) {
+ /* Just do nothing. */
+ goto error;
+ }
+ memset(new_ptr, 0, (memobj->mapped_ptr_sz/2) * sizeof(cl_mapped_ptr));
+
+ for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+ if (memobj->mapped_ptr[i].ptr) {
+ new_ptr[j] = memobj->mapped_ptr[i];
+ j++;
+ assert(j < memobj->mapped_ptr_sz/2);
+ }
+ }
+ memobj->mapped_ptr_sz = memobj->mapped_ptr_sz/2;
+ free(memobj->mapped_ptr);
+ memobj->mapped_ptr = new_ptr;
+ }
+
+error:
+ return err;
+}
+
+cl_int cl_enqueue_native_kernel(enqueue_data *data)
+{
+ cl_int err = CL_SUCCESS;
+ cl_uint num_mem_objects = (cl_uint)data->offset;
+ const cl_mem *mem_list = data->mem_list;
+ const void **args_mem_loc = (const void **)data->const_ptr;
+ cl_uint i;
+
+ for (i=0; i<num_mem_objects; ++i)
+ {
+ const cl_mem buffer = mem_list[i];
+ CHECK_MEM(buffer);
+
+ *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer);
+ }
+ data->user_func(data->ptr);
+
+ for (i=0; i<num_mem_objects; ++i)
+ {
+ cl_mem_unmap_auto(mem_list[i]);
+ }
+
+ free(data->ptr);
+error:
+ return err;
+}
+
+cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
+{
+ /* if need profiling, add the submit timestamp here. */
+ if (event && event->type != CL_COMMAND_USER
+ && event->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
+ switch(data->type) {
+ case EnqueueReadBuffer:
+ return cl_enqueue_read_buffer(data);
+ case EnqueueReadBufferRect:
+ return cl_enqueue_read_buffer_rect(data);
+ case EnqueueWriteBuffer:
+ return cl_enqueue_write_buffer(data);
+ case EnqueueWriteBufferRect:
+ return cl_enqueue_write_buffer_rect(data);
+ case EnqueueReadImage:
+ return cl_enqueue_read_image(data);
+ case EnqueueWriteImage:
+ return cl_enqueue_write_image(data);
+ case EnqueueMapBuffer:
+ return cl_enqueue_map_buffer(data);
+ case EnqueueMapImage:
+ return cl_enqueue_map_image(data);
+ case EnqueueUnmapMemObject:
+ return cl_enqueue_unmap_mem_object(data);
+ case EnqueueCopyBufferRect:
+ case EnqueueCopyBuffer:
+ case EnqueueCopyImage:
+ case EnqueueCopyBufferToImage:
+ case EnqueueCopyImageToBuffer:
+ case EnqueueNDRangeKernel:
+ case EnqueueFillBuffer:
+ case EnqueueFillImage:
+ cl_event_flush(event);
+ return CL_SUCCESS;
+ case EnqueueNativeKernel:
+ return cl_enqueue_native_kernel(data);
+ case EnqueueMigrateMemObj:
+ default:
+ return CL_SUCCESS;
+ }
+}
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
new file mode 100644
index 0000000..a9b3601
--- /dev/null
+++ b/src/cl_enqueue.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang <rong.r.yang at intel.com>
+ */
+#ifndef __CL_ENQUEUE_H__
+#define __CL_ENQUEUE_H__
+
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "CL/cl.h"
+
+typedef enum {
+ EnqueueReadBuffer = 0,
+ EnqueueReadBufferRect,
+ EnqueueWriteBuffer,
+ EnqueueWriteBufferRect,
+ EnqueueCopyBuffer,
+ EnqueueCopyBufferRect,
+ EnqueueReadImage,
+ EnqueueWriteImage,
+ EnqueueCopyImage,
+ EnqueueCopyImageToBuffer,
+ EnqueueCopyBufferToImage,
+ EnqueueMapBuffer,
+ EnqueueMapImage,
+ EnqueueUnmapMemObject,
+ EnqueueNDRangeKernel,
+ EnqueueNativeKernel,
+ EnqueueMarker,
+ EnqueueBarrier,
+ EnqueueFillBuffer,
+ EnqueueFillImage,
+ EnqueueMigrateMemObj,
+ EnqueueInvalid
+} enqueue_type;
+
+typedef struct _enqueue_data {
+ enqueue_type type; /* Command type */
+ cl_mem mem_obj; /* Enqueue's cl_mem */
+ cl_command_queue queue; /* Command queue */
+ size_t offset; /* Mem object's offset */
+ size_t size; /* Size */
+ size_t origin[3]; /* Origin */
+ size_t host_origin[3]; /* Origin */
+ size_t region[3]; /* Region */
+ size_t row_pitch; /* Row pitch */
+ size_t slice_pitch; /* Slice pitch */
+ size_t host_row_pitch; /* Host row pitch, used in read/write buffer rect */
+ size_t host_slice_pitch; /* Host slice pitch, used in read/write buffer rect */
+ const void * const_ptr; /* Const ptr for memory read */
+ void * ptr; /* Ptr for write and return value */
+ const cl_mem* mem_list; /* mem_list of clEnqueueNativeKernel */
+ uint8_t unsync_map; /* Indicate the clEnqueueMapBuffer/Image is unsync map */
+ void (*user_func)(void *); /* pointer to a host-callable user function */
+} enqueue_data;
+
+/* Do real enqueue commands */
+cl_int cl_enqueue_handle(cl_event event, enqueue_data* data);
+#endif /* __CL_ENQUEUE_H__ */
diff --git a/src/cl_event.c b/src/cl_event.c
new file mode 100644
index 0000000..99e60eb
--- /dev/null
+++ b/src/cl_event.c
@@ -0,0 +1,650 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang <rong.r.yang at intel.com>
+ */
+
+#include "cl_event.h"
+#include "cl_context.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+#include "cl_command_queue.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+inline cl_bool
+cl_event_is_gpu_command_type(cl_command_type type)
+{
+ switch(type) {
+ case CL_COMMAND_COPY_BUFFER:
+ case CL_COMMAND_FILL_BUFFER:
+ case CL_COMMAND_COPY_IMAGE:
+ case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
+ case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
+ case CL_COMMAND_COPY_BUFFER_RECT:
+ case CL_COMMAND_TASK:
+ case CL_COMMAND_NDRANGE_KERNEL:
+ return CL_TRUE;
+ default:
+ return CL_FALSE;
+ }
+}
+
+void cl_event_flush(cl_event event)
+{
+ assert(event->gpgpu_event != NULL);
+ if (event->gpgpu) {
+ cl_command_queue_flush_gpgpu(event->queue, event->gpgpu);
+ cl_gpgpu_delete(event->gpgpu);
+ event->gpgpu = NULL;
+ }
+ cl_gpgpu_event_flush(event->gpgpu_event);
+ event->queue->last_event = event;
+}
+
+cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
+{
+ cl_event event = NULL;
+ GET_QUEUE_THREAD_GPGPU(queue);
+
+ /* Allocate and inialize the structure itself */
+ TRY_ALLOC_NO_ERR (event, CALLOC(struct _cl_event));
+ SET_ICD(event->dispatch)
+ event->magic = CL_MAGIC_EVENT_HEADER;
+ event->ref_n = 1;
+
+ /* Append the event in the context event list */
+ pthread_mutex_lock(&ctx->event_lock);
+ event->next = ctx->events;
+ if (ctx->events != NULL)
+ ctx->events->prev = event;
+ ctx->events = event;
+ pthread_mutex_unlock(&ctx->event_lock);
+ event->ctx = ctx;
+ cl_context_add_ref(ctx);
+
+ /* Initialize all members and create GPGPU event object */
+ event->queue = queue;
+ event->type = type;
+ event->gpgpu_event = NULL;
+ if(type == CL_COMMAND_USER) {
+ event->status = CL_SUBMITTED;
+ }
+ else {
+ event->status = CL_QUEUED;
+ if(cl_event_is_gpu_command_type(event->type))
+ event->gpgpu_event = cl_gpgpu_event_new(gpgpu);
+ }
+ cl_event_add_ref(event); //dec when complete
+ event->user_cb = NULL;
+ event->enqueue_cb = NULL;
+ event->waits_head = NULL;
+ event->emplict = emplict;
+
+exit:
+ return event;
+error:
+ cl_event_delete(event);
+ event = NULL;
+ goto exit;
+}
+
+void cl_event_delete(cl_event event)
+{
+ if (UNLIKELY(event == NULL))
+ return;
+
+ cl_event_update_status(event, 0);
+
+ if (atomic_dec(&event->ref_n) > 1)
+ return;
+
+ if(event->queue && event->queue->last_event == event)
+ event->queue->last_event = NULL;
+
+ /* Call all user's callback if haven't execute */
+ user_callback *cb = event->user_cb;
+ while(event->user_cb) {
+ cb = event->user_cb;
+ if(cb->executed == CL_FALSE) {
+ cb->executed = CL_TRUE;
+ cb->pfn_notify(event, event->status, cb->user_data);
+ }
+ event->user_cb = cb->next;
+ cl_free(cb);
+ }
+
+ /* delete gpgpu event object */
+ if(event->gpgpu_event)
+ cl_gpgpu_event_delete(event->gpgpu_event);
+
+ /* Remove it from the list */
+ assert(event->ctx);
+ pthread_mutex_lock(&event->ctx->event_lock);
+
+ if (event->prev)
+ event->prev->next = event->next;
+ if (event->next)
+ event->next->prev = event->prev;
+ /* if this is the head, update head pointer ctx->events */
+ if (event->ctx->events == event)
+ event->ctx->events = event->next;
+
+ pthread_mutex_unlock(&event->ctx->event_lock);
+ cl_context_delete(event->ctx);
+
+ if (event->gpgpu) {
+ fprintf(stderr, "Warning: a event is deleted with a pending enqueued task.\n");
+ cl_gpgpu_delete(event->gpgpu);
+ event->gpgpu = NULL;
+ }
+ cl_free(event);
+}
+
+void cl_event_add_ref(cl_event event)
+{
+ assert(event);
+ atomic_inc(&event->ref_n);
+}
+
+cl_int cl_event_set_callback(cl_event event ,
+ cl_int command_exec_callback_type,
+ EVENT_NOTIFY pfn_notify,
+ void* user_data)
+{
+ assert(event);
+ assert(pfn_notify);
+
+ cl_int err = CL_SUCCESS;
+ user_callback *cb;
+ TRY_ALLOC(cb, CALLOC(user_callback));
+
+ cb->pfn_notify = pfn_notify;
+ cb->user_data = user_data;
+ cb->status = command_exec_callback_type;
+ cb->executed = CL_FALSE;
+
+ cb->next = event->user_cb;
+ event->user_cb = cb;
+
+exit:
+ return err;
+error:
+ err = CL_OUT_OF_HOST_MEMORY;
+ cl_free(cb);
+ goto exit;
+};
+
+cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event,cl_context ctx)
+{
+ cl_int err = CL_SUCCESS;
+ cl_int i;
+ /* check the event_wait_list and num_events_in_wait_list */
+ if((event_wait_list == NULL) &&
+ (num_events_in_wait_list > 0))
+ goto error;
+
+ if ((event_wait_list != NULL) &&
+ (num_events_in_wait_list == 0)){
+ goto error;
+ }
+
+ /* check the event and context */
+ for(i=0; i<num_events_in_wait_list; i++) {
+ CHECK_EVENT(event_wait_list[i]);
+ if(event_wait_list[i]->status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ goto exit;
+ }
+ if(event && *event == event_wait_list[i])
+ goto error;
+ if(event_wait_list[i]->ctx != ctx)
+ goto error;
+ }
+
+exit:
+ return err;
+error:
+ err = CL_INVALID_EVENT_WAIT_LIST; //reset error
+ goto exit;
+}
+
+cl_int cl_event_wait_events(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+ cl_command_queue queue)
+{
+ cl_int i;
+
+ /* Check whether wait user events */
+ for(i=0; i<num_events_in_wait_list; i++) {
+ if(event_wait_list[i]->status <= CL_COMPLETE)
+ continue;
+
+ /* Need wait on user event, return and do enqueue defer */
+ if((event_wait_list[i]->type == CL_COMMAND_USER) ||
+ (event_wait_list[i]->enqueue_cb &&
+ (event_wait_list[i]->enqueue_cb->wait_user_events != NULL))){
+ return CL_ENQUEUE_EXECUTE_DEFER;
+ }
+ }
+
+ if(queue && queue->barrier_events_num )
+ return CL_ENQUEUE_EXECUTE_DEFER;
+
+ /* Non user events or all user event finished, wait all enqueue events finish */
+ for(i=0; i<num_events_in_wait_list; i++) {
+ if(event_wait_list[i]->status <= CL_COMPLETE)
+ continue;
+
+ //enqueue callback haven't finish, in another thread, wait
+ if(event_wait_list[i]->enqueue_cb != NULL)
+ return CL_ENQUEUE_EXECUTE_DEFER;
+ if(event_wait_list[i]->gpgpu_event)
+ cl_gpgpu_event_update_status(event_wait_list[i]->gpgpu_event, 1);
+ cl_event_set_status(event_wait_list[i], CL_COMPLETE); //Execute user's callback
+ }
+ return CL_ENQUEUE_EXECUTE_IMM;
+}
+
+void cl_event_new_enqueue_callback(cl_event event,
+ enqueue_data *data,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list)
+{
+ enqueue_callback *cb, *node;
+ user_event *user_events, *u_ev;
+ cl_command_queue queue = event->queue;
+ cl_int i;
+ cl_int err = CL_SUCCESS;
+
+ /* Allocate and initialize the structure itself */
+ TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
+ cb->num_events = 0;
+ TRY_ALLOC_NO_ERR (cb->wait_list, CALLOC_ARRAY(cl_event, num_events_in_wait_list));
+ for(i=0; i<num_events_in_wait_list; i++) {
+ //user event will insert to cb->wait_user_events, need not in wait list, avoid ref twice
+ if(event_wait_list[i]->type != CL_COMMAND_USER) {
+ cb->wait_list[cb->num_events++] = event_wait_list[i];
+ cl_event_add_ref(event_wait_list[i]); //add defer enqueue's wait event reference
+ }
+ }
+ cb->event = event;
+ cb->next = NULL;
+ cb->wait_user_events = NULL;
+
+ if(queue && queue->barrier_events_num > 0) {
+ for(i=0; i<queue->barrier_events_num; i++) {
+ /* Insert the enqueue_callback to user event list */
+ node = queue->wait_events[i]->waits_head;
+ if(node == NULL)
+ queue->wait_events[i]->waits_head = cb;
+ else{
+ while((node != cb) && node->next)
+ node = node->next;
+ if(node == cb) //wait on dup user event
+ continue;
+ node->next = cb;
+ }
+
+ /* Insert the user event to enqueue_callback's wait_user_events */
+ TRY(cl_event_insert_user_event, &cb->wait_user_events, queue->wait_events[i]);
+ cl_event_add_ref(queue->wait_events[i]);
+ }
+ }
+
+ /* Find out all user events that in event_wait_list wait */
+ for(i=0; i<num_events_in_wait_list; i++) {
+ if(event_wait_list[i]->status <= CL_COMPLETE)
+ continue;
+
+ if(event_wait_list[i]->type == CL_COMMAND_USER) {
+ /* Insert the enqueue_callback to user event list */
+ node = event_wait_list[i]->waits_head;
+ if(node == NULL)
+ event_wait_list[i]->waits_head = cb;
+ else {
+ while((node != cb) && node->next)
+ node = node->next;
+ if(node == cb) //wait on dup user event
+ continue;
+ node->next = cb;
+ }
+ /* Insert the user event to enqueue_callback's wait_user_events */
+ TRY(cl_event_insert_user_event, &cb->wait_user_events, event_wait_list[i]);
+ cl_event_add_ref(event_wait_list[i]);
+ cl_command_queue_insert_event(event->queue, event_wait_list[i]);
+ if(data->type == EnqueueBarrier){
+ cl_command_queue_insert_barrier_event(event->queue, event_wait_list[i]);
+ }
+ } else if(event_wait_list[i]->enqueue_cb != NULL) {
+ user_events = event_wait_list[i]->enqueue_cb->wait_user_events;
+ while(user_events != NULL) {
+ /* Insert the enqueue_callback to user event's waits_tail */
+ node = user_events->event->waits_head;
+ if(node == NULL)
+ event_wait_list[i]->waits_head = cb;
+ else{
+ while((node != cb) && node->next)
+ node = node->next;
+ if(node == cb) { //wait on dup user event
+ user_events = user_events->next;
+ continue;
+ }
+ node->next = cb;
+ }
+
+ /* Insert the user event to enqueue_callback's wait_user_events */
+ TRY(cl_event_insert_user_event, &cb->wait_user_events, user_events->event);
+ cl_event_add_ref(user_events->event);
+ cl_command_queue_insert_event(event->queue, user_events->event);
+ if(data->type == EnqueueBarrier){
+ cl_command_queue_insert_barrier_event(event->queue, user_events->event);
+ }
+ user_events = user_events->next;
+ }
+ }
+ }
+ if(data->queue != NULL && event->gpgpu_event != NULL) {
+ event->gpgpu = cl_thread_gpgpu_take(event->queue);
+ data->ptr = (void *)event->gpgpu_event;
+ }
+ cb->data = *data;
+ event->enqueue_cb = cb;
+
+exit:
+ return;
+error:
+ if(cb) {
+ while(cb->wait_user_events) {
+ u_ev = cb->wait_user_events;
+ cb->wait_user_events = cb->wait_user_events->next;
+ cl_event_delete(u_ev->event);
+ cl_free(u_ev);
+ }
+ for(i=0; i<cb->num_events; i++) {
+ if(cb->wait_list[i]) {
+ cl_event_delete(cb->wait_list[i]);
+ }
+ }
+ cl_free(cb);
+ }
+ goto exit;
+}
+
+void cl_event_set_status(cl_event event, cl_int status)
+{
+ user_callback *user_cb;
+ cl_int ret, i;
+ cl_event evt;
+
+ pthread_mutex_lock(&event->ctx->event_lock);
+ if(status >= event->status) {
+ pthread_mutex_unlock(&event->ctx->event_lock);
+ return;
+ }
+ if(event->status <= CL_COMPLETE) {
+ event->status = status; //have done enqueue before or doing in another thread
+ pthread_mutex_unlock(&event->ctx->event_lock);
+ return;
+ }
+
+ if(status <= CL_COMPLETE) {
+ if(event->enqueue_cb) {
+ if(status == CL_COMPLETE) {
+ cl_enqueue_handle(event, &event->enqueue_cb->data);
+ if(event->gpgpu_event)
+ cl_gpgpu_event_update_status(event->gpgpu_event, 1); //now set complet, need refine
+ } else {
+ if(event->gpgpu_event) {
+ // Error then cancel the enqueued event.
+ cl_gpgpu_delete(event->gpgpu);
+ event->gpgpu = NULL;
+ }
+ }
+
+ event->status = status; //Change the event status after enqueue and befor unlock
+
+ pthread_mutex_unlock(&event->ctx->event_lock);
+ for(i=0; i<event->enqueue_cb->num_events; i++)
+ cl_event_delete(event->enqueue_cb->wait_list[i]);
+ pthread_mutex_lock(&event->ctx->event_lock);
+
+ if(event->enqueue_cb->wait_list)
+ cl_free(event->enqueue_cb->wait_list);
+ cl_free(event->enqueue_cb);
+ event->enqueue_cb = NULL;
+ }
+ }
+ if(event->status >= status) //maybe changed in other threads
+ event->status = status;
+ pthread_mutex_unlock(&event->ctx->event_lock);
+
+ if(event->status <= CL_COMPLETE)
+ cl_event_delete(event);
+
+ /* Call user callback */
+ user_cb = event->user_cb;
+ while(user_cb) {
+ if(user_cb->status >= status) {
+ user_cb->executed = CL_TRUE;
+ user_cb->pfn_notify(event, event->status, user_cb->user_data);
+ }
+ user_cb = user_cb->next;
+ }
+
+ if(event->type != CL_COMMAND_USER)
+ return;
+
+ /* Check all defer enqueue */
+ enqueue_callback *cb, *enqueue_cb = event->waits_head;
+ while(enqueue_cb) {
+ /* Remove this user event in enqueue_cb, update the header if needed. */
+ cl_event_remove_user_event(&enqueue_cb->wait_user_events, event);
+ cl_event_delete(event);
+
+ /* Still wait on other user events */
+ if(enqueue_cb->wait_user_events != NULL) {
+ enqueue_cb = enqueue_cb->next;
+ continue;
+ }
+
+ //remove user event frome enqueue_cb's ctx
+ cl_command_queue_remove_event(enqueue_cb->event->queue, event);
+ cl_command_queue_remove_barrier_event(enqueue_cb->event->queue, event);
+
+ /* All user events complete, now wait enqueue events */
+ ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list,
+ enqueue_cb->event->queue);
+ ret = ret;
+ assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
+
+ cb = enqueue_cb;
+ enqueue_cb = enqueue_cb->next;
+
+ /* Call the pending operation */
+ evt = cb->event;
+ /* TODO: if this event wait on several events, one event's
+ status is error, the others is complete, what's the status
+ of this event? Can't find the description in OpenCL spec.
+ Simply update to latest finish wait event.*/
+ cl_event_set_status(cb->event, status);
+ if(evt->emplict == CL_FALSE) {
+ cl_event_delete(evt);
+ }
+ }
+ event->waits_head = NULL;
+}
+
+void cl_event_update_status(cl_event event, int wait)
+{
+ if(event->status <= CL_COMPLETE)
+ return;
+ if((event->gpgpu_event) &&
+ (cl_gpgpu_event_update_status(event->gpgpu_event, wait) == command_complete))
+ cl_event_set_status(event, CL_COMPLETE);
+}
+
+cl_int cl_event_marker_with_wait_list(cl_command_queue queue,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event* event)
+{
+ enqueue_data data = { 0 };
+ cl_event e;
+
+ e = cl_event_new(queue->ctx, queue, CL_COMMAND_MARKER, CL_TRUE);
+ if(e == NULL)
+ return CL_OUT_OF_HOST_MEMORY;
+
+ if(event != NULL ){
+ *event = e;
+ }
+
+//enqueues a marker command which waits for either a list of events to complete, or if the list is
+//empty it waits for all commands previously enqueued in command_queue to complete before it completes.
+ if(num_events_in_wait_list > 0){
+ if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
+ data.type = EnqueueMarker;
+ cl_event_new_enqueue_callback(*event, &data, num_events_in_wait_list, event_wait_list);
+ return CL_SUCCESS;
+ }
+ } else if(queue->wait_events_num > 0) {
+ data.type = EnqueueMarker;
+ cl_event_new_enqueue_callback(*event, &data, queue->wait_events_num, queue->wait_events);
+ return CL_SUCCESS;
+ }
+
+ if(queue->last_event && queue->last_event->gpgpu_event) {
+ cl_gpgpu_event_update_status(queue->last_event->gpgpu_event, 1);
+ }
+
+ cl_event_set_status(e, CL_COMPLETE);
+ return CL_SUCCESS;
+}
+
+cl_int cl_event_barrier_with_wait_list(cl_command_queue queue,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event* event)
+{
+ enqueue_data data = { 0 };
+ cl_event e;
+
+ e = cl_event_new(queue->ctx, queue, CL_COMMAND_BARRIER, CL_TRUE);
+ if(e == NULL)
+ return CL_OUT_OF_HOST_MEMORY;
+
+ if(event != NULL ){
+ *event = e;
+ }
+//enqueues a barrier command which waits for either a list of events to complete, or if the list is
+//empty it waits for all commands previously enqueued in command_queue to complete before it completes.
+ if(num_events_in_wait_list > 0){
+ if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
+ data.type = EnqueueBarrier;
+ cl_event_new_enqueue_callback(e, &data, num_events_in_wait_list, event_wait_list);
+ return CL_SUCCESS;
+ }
+ } else if(queue->wait_events_num > 0) {
+ data.type = EnqueueBarrier;
+ cl_event_new_enqueue_callback(e, &data, queue->wait_events_num, queue->wait_events);
+ return CL_SUCCESS;
+ }
+
+ if(queue->last_event && queue->last_event->gpgpu_event) {
+ cl_gpgpu_event_update_status(queue->last_event->gpgpu_event, 1);
+ }
+
+ cl_event_set_status(e, CL_COMPLETE);
+ return CL_SUCCESS;
+}
+
+cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name)
+{
+ cl_ulong ret_val = 0;
+ GET_QUEUE_THREAD_GPGPU(event->queue);
+
+ if (!event->gpgpu_event) {
+ cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
+ event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
+ return CL_SUCCESS;
+ }
+
+ if(param_name == CL_PROFILING_COMMAND_SUBMIT ||
+ param_name == CL_PROFILING_COMMAND_QUEUED) {
+ cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
+ event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
+ return CL_SUCCESS;
+ } else if(param_name == CL_PROFILING_COMMAND_START) {
+ cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 0, &ret_val);
+ event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
+ return CL_SUCCESS;
+ } else if (param_name == CL_PROFILING_COMMAND_END) {
+ cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 1, &ret_val);
+ event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
+ return CL_SUCCESS;
+ }
+ return CL_INVALID_VALUE;
+}
+
+cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event)
+{
+ user_event * u_iter = *p_u_ev;
+ user_event * u_ev;
+
+ while(u_iter)
+ {
+ if(u_iter->event == event)
+ return CL_SUCCESS;
+ u_iter = u_iter->next;
+ }
+
+ TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
+ u_ev->event = event;
+ u_ev->next = *p_u_ev;
+ *p_u_ev = u_ev;
+
+
+ return CL_SUCCESS;
+error:
+ return CL_FALSE;
+}
+
+cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event)
+{
+ user_event * u_iter = *p_u_ev;
+ user_event * u_prev = *p_u_ev;
+
+ while(u_iter){
+ if(u_iter->event == event ){
+ if(u_iter == *p_u_ev){
+ *p_u_ev = u_iter->next;
+ }else{
+ u_prev->next = u_iter->next;
+ }
+ cl_free(u_iter);
+ break;
+ }
+ u_prev = u_iter;
+ u_iter = u_iter->next;
+ }
+
+ return CL_SUCCESS;
+}
diff --git a/src/cl_event.h b/src/cl_event.h
new file mode 100644
index 0000000..cfe5ddd
--- /dev/null
+++ b/src/cl_event.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_EVENT_H__
+#define __CL_EVENT_H__
+
+#include <semaphore.h>
+
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "cl_enqueue.h"
+#include "CL/cl.h"
+
+#define CL_ENQUEUE_EXECUTE_IMM 0
+#define CL_ENQUEUE_EXECUTE_DEFER 1
+
+typedef struct _user_event {
+ cl_event event; /* The user event */
+ struct _user_event* next; /* Next user event in list */
+} user_event;
+
+typedef struct _enqueue_callback {
+ cl_event event; /* The event relative this enqueue callback */
+ enqueue_data data; /* Hold all enqueue callback's infomation */
+ cl_uint num_events; /* num events in wait list */
+ cl_event* wait_list; /* All event wait list this callback wait on */
+ user_event* wait_user_events; /* The head of user event list the callback wait on */
+ struct _enqueue_callback* next; /* The next enqueue callback in wait list */
+} enqueue_callback;
+
+typedef void (CL_CALLBACK *EVENT_NOTIFY)(cl_event event, cl_int event_command_exec_status, void *user_data);
+
+typedef struct _user_callback {
+ cl_int status; /* The execution status */
+ cl_bool executed; /* Indicat the callback function been called or not */
+ EVENT_NOTIFY pfn_notify; /* Callback function */
+ void* user_data; /* Callback user data */
+ struct _user_callback* next; /* Next event callback in list */
+} user_callback;
+
+struct _cl_event {
+ DEFINE_ICD(dispatch)
+ uint64_t magic; /* To identify it as a sampler object */
+ volatile int ref_n; /* We reference count this object */
+ cl_context ctx; /* The context associated with event */
+ cl_event prev, next; /* We chain the memory buffers together */
+ cl_command_queue queue; /* The command queue associated with event */
+ cl_command_type type; /* The command type associated with event */
+ cl_int status; /* The execution status */
+ cl_gpgpu gpgpu; /* Current gpgpu, owned by this structure. */
+ cl_gpgpu_event gpgpu_event; /* The event object communicate with hardware */
+ user_callback* user_cb; /* The event callback functions */
+ enqueue_callback* enqueue_cb; /* This event's enqueue */
+ enqueue_callback* waits_head; /* The head of enqueues list wait on this event */
+ cl_bool emplict; /* Identify this event whether created by api emplict*/
+ cl_ulong timestamp[4];/* The time stamps for profiling. */
+};
+
+/* Create a new event object */
+cl_event cl_event_new(cl_context, cl_command_queue, cl_command_type, cl_bool);
+/* Unref the object and delete it if no more reference on it */
+void cl_event_delete(cl_event);
+/* Add one more reference to this object */
+void cl_event_add_ref(cl_event);
+/* Rigister a user callback function for specific commond execution status */
+cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
+/* Check events wait list for enqueue commonds */
+cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
+/* Wait the all events in wait list complete */
+cl_int cl_event_wait_events(cl_uint, const cl_event *, cl_command_queue);
+/* New a enqueue suspend task */
+void cl_event_new_enqueue_callback(cl_event, enqueue_data *, cl_uint, const cl_event *);
+/* Set the event status and call all callbacks */
+void cl_event_set_status(cl_event, cl_int);
+/* Check and update event status */
+void cl_event_update_status(cl_event, cl_int);
+/* Create the marker event */
+cl_int cl_event_marker_with_wait_list(cl_command_queue, cl_uint, const cl_event *, cl_event*);
+/* Create the barrier event */
+cl_int cl_event_barrier_with_wait_list(cl_command_queue, cl_uint, const cl_event *, cl_event*);
+/* Do the event profiling */
+cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name);
+/* insert the user event */
+cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event);
+/* remove the user event */
+cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event);
+/* flush the event's pending gpgpu batch buffer and notify driver this gpgpu event has been flushed. */
+void cl_event_flush(cl_event event);
+#endif /* __CL_EVENT_H__ */
+
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
new file mode 100644
index 0000000..d07a525
--- /dev/null
+++ b/src/cl_extensions.c
@@ -0,0 +1,107 @@
+#ifdef HAS_EGL
+#include "EGL/egl.h"
+#include "EGL/eglext.h"
+#endif
+
+#include "cl_platform_id.h"
+#include "cl_internals.h"
+#include "CL/cl.h"
+#include "cl_utils.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+static struct cl_extensions intel_extensions =
+{
+ {
+#define DECL_EXT(name) \
+ {(struct cl_extension_base){.ext_id = cl_##name##_ext_id, .ext_name = "cl_" #name, .ext_enabled = 0}},
+ DECL_ALL_EXTENSIONS
+ },
+#undef DECL_EXT
+ {""}
+};
+
+void check_basic_extension(cl_extensions_t *extensions)
+{
+ int id;
+ for(id = BASE_EXT_START_ID; id <= BASE_EXT_END_ID; id++)
+ if (id != EXT_ID(khr_fp64))
+ extensions->extensions[id].base.ext_enabled = 1;
+}
+
+void check_opt1_extension(cl_extensions_t *extensions)
+{
+ int id;
+ for(id = OPT1_EXT_START_ID; id <= OPT1_EXT_END_ID; id++)
+ if (id == EXT_ID(khr_icd))
+ extensions->extensions[id].base.ext_enabled = 1;
+}
+
+void
+check_gl_extension(cl_extensions_t *extensions) {
+#if defined(HAS_EGL)
+ int id;
+ /* For now, we only support cl_khr_gl_sharing. */
+ for(id = GL_EXT_START_ID; id <= GL_EXT_END_ID; id++)
+ if (id == EXT_ID(khr_gl_sharing))
+ extensions->extensions[id].base.ext_enabled = 1;
+#endif
+}
+
+void
+check_intel_extension(cl_extensions_t *extensions)
+{
+ /* Should put those map/unmap extensions here. */
+}
+
+void
+process_extension_str(cl_extensions_t *extensions)
+{
+ int str_max = sizeof(extensions->ext_str);
+ int str_offset = 0;
+ int id;
+
+ extensions->ext_str[str_max] = '\0';
+
+ for(id = 0; id < cl_khr_extension_id_max; id++)
+ {
+ if (extensions->extensions[id].base.ext_enabled) {
+ int copy_len;
+ char *ext_name = extensions->extensions[id].base.ext_name;
+ if (str_offset + 1 >= str_max)
+ return;
+
+ if (str_offset != 0)
+ extensions->ext_str[str_offset - 1] = ' ';
+ copy_len = (strlen(ext_name) + 1 + str_offset) < str_max
+ ? (strlen(ext_name) + 1) : (str_max - str_offset - 1);
+ strncpy(&extensions->ext_str[str_offset],
+ extensions->extensions[id].base.ext_name, copy_len);
+ str_offset += copy_len;
+ }
+ }
+}
+
+LOCAL void
+cl_intel_platform_extension_init(cl_platform_id intel_platform)
+{
+ static int initialized = 0;
+
+ if (initialized) {
+ intel_platform->internal_extensions = &intel_extensions;
+ intel_platform->extensions = intel_extensions.ext_str;
+ return;
+ }
+ check_basic_extension(&intel_extensions);
+ check_opt1_extension(&intel_extensions);
+ check_gl_extension(&intel_extensions);
+ check_intel_extension(&intel_extensions);
+ process_extension_str(&intel_extensions);
+
+ intel_platform->internal_extensions = &intel_extensions;
+ intel_platform->extensions = intel_extensions.ext_str;
+
+ initialized = 1;
+ return;
+}
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
new file mode 100644
index 0000000..52ee0a4
--- /dev/null
+++ b/src/cl_extensions.h
@@ -0,0 +1,99 @@
+/* The following approved Khronos extension
+ * names must be returned by all device that
+ * support OpenCL C 1.2. */
+#define DECL_BASE_EXTENSIONS \
+ DECL_EXT(khr_global_int32_base_atomics) \
+ DECL_EXT(khr_global_int32_extended_atomics) \
+ DECL_EXT(khr_local_int32_base_atomics) \
+ DECL_EXT(khr_local_int32_extended_atomics) \
+ DECL_EXT(khr_byte_addressable_store) \
+ DECL_EXT(khr_fp64)
+
+/* The OPT1 extensions are those optional extensions
+ * which don't have external dependecies*/
+#define DECL_OPT1_EXTENSIONS \
+ DECL_EXT(khr_int64_base_atomics)\
+ DECL_EXT(khr_int64_extended_atomics)\
+ DECL_EXT(khr_3d_image_writes)\
+ DECL_EXT(khr_fp16)\
+ DECL_EXT(khr_image2d_from_buffer)\
+ DECL_EXT(khr_initialize_memory)\
+ DECL_EXT(khr_context_abort)\
+ DECL_EXT(khr_depth_images)\
+ DECL_EXT(khr_spir) \
+ DECL_EXT(khr_icd)
+
+#define DECL_GL_EXTENSIONS \
+ DECL_EXT(khr_gl_sharing)\
+ DECL_EXT(khr_gl_event)\
+ DECL_EXT(khr_gl_depth_images)\
+ DECL_EXT(khr_gl_msaa_sharing)
+
+#define DECL_D3D_EXTENSIONS \
+ DECL_EXT(khr_d3d10_sharing)\
+ DECL_EXT(khr_dx9_media_sharing)\
+ DECL_EXT(khr_d3d11_sharing)\
+
+#define DECL_ALL_EXTENSIONS \
+ DECL_BASE_EXTENSIONS \
+ DECL_OPT1_EXTENSIONS \
+ DECL_GL_EXTENSIONS \
+ DECL_D3D_EXTENSIONS
+
+#define EXT_ID(name) cl_ ## name ## _ext_id
+#define EXT_STRUCT_NAME(name) cl_ ## name ## ext
+/*Declare enum ids */
+typedef enum {
+#define DECL_EXT(name) EXT_ID(name),
+DECL_ALL_EXTENSIONS
+#undef DECL_EXT
+cl_khr_extension_id_max
+}cl_extension_enum;
+
+#define BASE_EXT_START_ID EXT_ID(khr_global_int32_base_atomics)
+#define BASE_EXT_END_ID EXT_ID(khr_fp64)
+#define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics)
+#define OPT1_EXT_END_ID EXT_ID(khr_icd)
+#define GL_EXT_START_ID EXT_ID(khr_gl_sharing)
+#define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing)
+
+#define IS_BASE_EXTENSION(id) (id >= BASE_EXT_START_ID && id <= BASE_EXT_END_ID)
+#define IS_OPT1_EXTENSION(id) (id >= OPT1_EXT_START_ID && id <= OPT1_EXT_END_ID)
+#define IS_GL_EXTENSION(id) (id >= GL_EXT_START_ID && id <= GL_EXT_END_ID)
+
+struct cl_extension_base {
+ cl_extension_enum ext_id;
+ int ext_enabled;
+ char *ext_name;
+};
+
+/* Declare each extension structure. */
+#define DECL_EXT(name) \
+struct EXT_STRUCT_NAME(name) { \
+ struct cl_extension_base base;\
+};
+
+DECL_BASE_EXTENSIONS
+DECL_OPT1_EXTENSIONS
+DECL_D3D_EXTENSIONS
+DECL_GL_EXTENSIONS
+#undef DECL_EXT
+
+/* Union all extensions together. */
+typedef union {
+ struct cl_extension_base base;
+ #define DECL_EXT(name) struct EXT_STRUCT_NAME(name) EXT_STRUCT_NAME(name);
+ DECL_ALL_EXTENSIONS
+ #undef DECL_EXT
+} extension_union;
+
+typedef struct cl_extensions {
+ extension_union extensions[cl_khr_extension_id_max];
+ char ext_str[256];
+} cl_extensions_t;
+
+struct _cl_platform_id;
+typedef struct _cl_platform_id * cl_platform_id;
+
+extern void
+cl_intel_platform_extension_init(cl_platform_id intel_platform);
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
new file mode 100644
index 0000000..7da0475
--- /dev/null
+++ b/src/cl_gbe_loader.cpp
@@ -0,0 +1,328 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include <iostream>
+#include <dlfcn.h>
+#include <string.h>
+#include <stdio.h>
+#include "cl_gbe_loader.h"
+#include "backend/src/GBEConfig.h"
+
+//function pointer from libgbe.so
+gbe_program_new_from_source_cb *compiler_program_new_from_source = NULL;
+gbe_program_compile_from_source_cb *compiler_program_compile_from_source = NULL;
+gbe_program_new_gen_program_cb *compiler_program_new_gen_program = NULL;
+gbe_program_link_program_cb *compiler_program_link_program = NULL;
+gbe_program_build_from_llvm_cb *compiler_program_build_from_llvm = NULL;
+gbe_program_new_from_llvm_binary_cb *compiler_program_new_from_llvm_binary = NULL;
+gbe_program_serialize_to_binary_cb *compiler_program_serialize_to_binary = NULL;
+gbe_program_new_from_llvm_cb *compiler_program_new_from_llvm = NULL;
+gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource = NULL;
+
+//function pointer from libgbeinterp.so
+gbe_program_new_from_binary_cb *interp_program_new_from_binary = NULL;
+gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size = NULL;
+gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data = NULL;
+gbe_program_delete_cb *interp_program_delete = NULL;
+gbe_program_get_kernel_num_cb *interp_program_get_kernel_num = NULL;
+gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name = NULL;
+gbe_program_get_kernel_cb *interp_program_get_kernel = NULL;
+gbe_kernel_get_name_cb *interp_kernel_get_name = NULL;
+gbe_kernel_get_attributes_cb *interp_kernel_get_attributes = NULL;
+gbe_kernel_get_code_cb *interp_kernel_get_code = NULL;
+gbe_kernel_get_code_size_cb *interp_kernel_get_code_size = NULL;
+gbe_kernel_get_arg_num_cb *interp_kernel_get_arg_num = NULL;
+gbe_kernel_get_arg_size_cb *interp_kernel_get_arg_size = NULL;
+gbe_kernel_get_arg_bti_cb *interp_kernel_get_arg_bti = NULL;
+gbe_kernel_get_arg_type_cb *interp_kernel_get_arg_type = NULL;
+gbe_kernel_get_arg_align_cb *interp_kernel_get_arg_align = NULL;
+gbe_kernel_get_simd_width_cb *interp_kernel_get_simd_width = NULL;
+gbe_kernel_get_curbe_offset_cb *interp_kernel_get_curbe_offset = NULL;
+gbe_kernel_get_curbe_size_cb *interp_kernel_get_curbe_size = NULL;
+gbe_kernel_get_stack_size_cb *interp_kernel_get_stack_size = NULL;
+gbe_kernel_get_scratch_size_cb *interp_kernel_get_scratch_size = NULL;
+gbe_kernel_get_required_work_group_size_cb *interp_kernel_get_required_work_group_size = NULL;
+gbe_kernel_use_slm_cb *interp_kernel_use_slm = NULL;
+gbe_kernel_get_slm_size_cb *interp_kernel_get_slm_size = NULL;
+gbe_kernel_get_sampler_size_cb *interp_kernel_get_sampler_size = NULL;
+gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data = NULL;
+gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size = NULL;
+gbe_kernel_get_image_size_cb *interp_kernel_get_image_size = NULL;
+gbe_kernel_get_image_data_cb *interp_kernel_get_image_data = NULL;
+gbe_get_printf_num_cb* interp_get_printf_num = NULL;
+gbe_get_printf_buf_bti_cb* interp_get_printf_buf_bti = NULL;
+gbe_get_printf_indexbuf_bti_cb* interp_get_printf_indexbuf_bti = NULL;
+gbe_dup_printfset_cb* interp_dup_printfset = NULL;
+gbe_get_printf_sizeof_size_cb* interp_get_printf_sizeof_size = NULL;
+gbe_release_printf_info_cb* interp_release_printf_info = NULL;
+gbe_output_printf_cb* interp_output_printf = NULL;
+gbe_kernel_get_arg_info_cb *interp_kernel_get_arg_info = NULL;
+
+struct GbeLoaderInitializer
+{
+ GbeLoaderInitializer()
+ {
+ LoadCompiler();
+
+ const char* path;
+ if (!LoadInterp(path))
+ std::cerr << "unable to load " << path << " which is part of the driver, please check!" << std::endl;
+ }
+
+ bool LoadInterp(const char*& path)
+ {
+ const char* interpPath = getenv("OCL_INTERP_PATH");
+ if (interpPath == NULL)
+ interpPath = INTERP_OBJECT_DIR;
+
+ path = interpPath;
+
+ dlhInterp = dlopen(interpPath, RTLD_LAZY | RTLD_LOCAL);
+ if (dlhInterp == NULL) {
+ return false;
+ }
+
+ interp_program_new_from_binary = *(gbe_program_new_from_binary_cb**)dlsym(dlhInterp, "gbe_program_new_from_binary");
+ if (interp_program_new_from_binary == NULL)
+ return false;
+
+ interp_program_get_global_constant_size = *(gbe_program_get_global_constant_size_cb**)dlsym(dlhInterp, "gbe_program_get_global_constant_size");
+ if (interp_program_get_global_constant_size == NULL)
+ return false;
+
+ interp_program_get_global_constant_data = *(gbe_program_get_global_constant_data_cb**)dlsym(dlhInterp, "gbe_program_get_global_constant_data");
+ if (interp_program_get_global_constant_data == NULL)
+ return false;
+
+ interp_program_delete = *(gbe_program_delete_cb**)dlsym(dlhInterp, "gbe_program_delete");
+ if (interp_program_delete == NULL)
+ return false;
+
+ interp_program_get_kernel_num = *(gbe_program_get_kernel_num_cb**)dlsym(dlhInterp, "gbe_program_get_kernel_num");
+ if (interp_program_get_kernel_num == NULL)
+ return false;
+
+ interp_program_get_kernel_by_name = *(gbe_program_get_kernel_by_name_cb**)dlsym(dlhInterp, "gbe_program_get_kernel_by_name");
+ if (interp_program_get_kernel_by_name == NULL)
+ return false;
+
+ interp_program_get_kernel = *(gbe_program_get_kernel_cb**)dlsym(dlhInterp, "gbe_program_get_kernel");
+ if (interp_program_get_kernel == NULL)
+ return false;
+
+ interp_kernel_get_name = *(gbe_kernel_get_name_cb**)dlsym(dlhInterp, "gbe_kernel_get_name");
+ if (interp_kernel_get_name == NULL)
+ return false;
+
+ interp_kernel_get_attributes = *(gbe_kernel_get_attributes_cb**)dlsym(dlhInterp, "gbe_kernel_get_attributes");
+ if (interp_kernel_get_attributes == NULL)
+ return false;
+
+ interp_kernel_get_code = *(gbe_kernel_get_code_cb**)dlsym(dlhInterp, "gbe_kernel_get_code");
+ if (interp_kernel_get_code == NULL)
+ return false;
+
+ interp_kernel_get_code_size = *(gbe_kernel_get_code_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_code_size");
+ if (interp_kernel_get_code_size == NULL)
+ return false;
+
+ interp_kernel_get_arg_num = *(gbe_kernel_get_arg_num_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_num");
+ if (interp_kernel_get_arg_num == NULL)
+ return false;
+
+ interp_kernel_get_arg_size = *(gbe_kernel_get_arg_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_size");
+ if (interp_kernel_get_arg_size == NULL)
+ return false;
+
+ interp_kernel_get_arg_bti = *(gbe_kernel_get_arg_bti_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_bti");
+ if (interp_kernel_get_arg_bti == NULL)
+ return false;
+
+ interp_kernel_get_arg_type = *(gbe_kernel_get_arg_type_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_type");
+ if (interp_kernel_get_arg_type == NULL)
+ return false;
+
+ interp_kernel_get_arg_align = *(gbe_kernel_get_arg_align_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_align");
+ if (interp_kernel_get_arg_align == NULL)
+ return false;
+
+ interp_kernel_get_simd_width = *(gbe_kernel_get_simd_width_cb**)dlsym(dlhInterp, "gbe_kernel_get_simd_width");
+ if (interp_kernel_get_simd_width == NULL)
+ return false;
+
+ interp_kernel_get_curbe_offset = *(gbe_kernel_get_curbe_offset_cb**)dlsym(dlhInterp, "gbe_kernel_get_curbe_offset");
+ if (interp_kernel_get_curbe_offset == NULL)
+ return false;
+
+ interp_kernel_get_curbe_size = *(gbe_kernel_get_curbe_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_curbe_size");
+ if (interp_kernel_get_curbe_size == NULL)
+ return false;
+
+ interp_kernel_get_stack_size = *(gbe_kernel_get_stack_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_stack_size");
+ if (interp_kernel_get_stack_size == NULL)
+ return false;
+
+ interp_kernel_get_scratch_size = *(gbe_kernel_get_scratch_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_scratch_size");
+ if (interp_kernel_get_scratch_size == NULL)
+ return false;
+
+ interp_kernel_get_required_work_group_size = *(gbe_kernel_get_required_work_group_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_required_work_group_size");
+ if (interp_kernel_get_required_work_group_size == NULL)
+ return false;
+
+ interp_kernel_use_slm = *(gbe_kernel_use_slm_cb**)dlsym(dlhInterp, "gbe_kernel_use_slm");
+ if (interp_kernel_use_slm == NULL)
+ return false;
+
+ interp_kernel_get_slm_size = *(gbe_kernel_get_slm_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_slm_size");
+ if (interp_kernel_get_slm_size == NULL)
+ return false;
+
+ interp_kernel_get_sampler_size = *(gbe_kernel_get_sampler_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_sampler_size");
+ if (interp_kernel_get_sampler_size == NULL)
+ return false;
+
+ interp_kernel_get_sampler_data = *(gbe_kernel_get_sampler_data_cb**)dlsym(dlhInterp, "gbe_kernel_get_sampler_data");
+ if (interp_kernel_get_sampler_data == NULL)
+ return false;
+
+ interp_kernel_get_compile_wg_size = *(gbe_kernel_get_compile_wg_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_compile_wg_size");
+ if (interp_kernel_get_compile_wg_size == NULL)
+ return false;
+
+ interp_kernel_get_image_size = *(gbe_kernel_get_image_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_image_size");
+ if (interp_kernel_get_image_size == NULL)
+ return false;
+
+ interp_kernel_get_image_data = *(gbe_kernel_get_image_data_cb**)dlsym(dlhInterp, "gbe_kernel_get_image_data");
+ if (interp_kernel_get_image_data == NULL)
+ return false;
+
+ interp_get_printf_num = *(gbe_get_printf_num_cb**)dlsym(dlhInterp, "gbe_get_printf_num");
+ if (interp_get_printf_num == NULL)
+ return false;
+
+ interp_get_printf_buf_bti = *(gbe_get_printf_buf_bti_cb**)dlsym(dlhInterp, "gbe_get_printf_buf_bti");
+ if (interp_get_printf_buf_bti == NULL)
+ return false;
+
+ interp_get_printf_indexbuf_bti = *(gbe_get_printf_indexbuf_bti_cb**)dlsym(dlhInterp, "gbe_get_printf_indexbuf_bti");
+ if (interp_get_printf_indexbuf_bti == NULL)
+ return false;
+
+ interp_dup_printfset = *(gbe_dup_printfset_cb**)dlsym(dlhInterp, "gbe_dup_printfset");
+ if (interp_dup_printfset == NULL)
+ return false;
+
+ interp_get_printf_sizeof_size = *(gbe_get_printf_sizeof_size_cb**)dlsym(dlhInterp, "gbe_get_printf_sizeof_size");
+ if (interp_get_printf_sizeof_size == NULL)
+ return false;
+
+ interp_release_printf_info = *(gbe_release_printf_info_cb**)dlsym(dlhInterp, "gbe_release_printf_info");
+ if (interp_release_printf_info == NULL)
+ return false;
+
+ interp_output_printf = *(gbe_output_printf_cb**)dlsym(dlhInterp, "gbe_output_printf");
+ if (interp_output_printf == NULL)
+ return false;
+
+ interp_kernel_get_arg_info = *(gbe_kernel_get_arg_info_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_info");
+ if (interp_kernel_get_arg_info == NULL)
+ return false;
+
+ return true;
+ }
+
+ void LoadCompiler()
+ {
+ compilerLoaded = false;
+
+ const char* nonCompiler = getenv("OCL_NON_COMPILER");
+ if (nonCompiler != NULL) {
+ if (strcmp(nonCompiler, "1") == 0)
+ return;
+ }
+
+ const char* gbePath = getenv("OCL_GBE_PATH");
+ if (gbePath == NULL)
+ gbePath = GBE_OBJECT_DIR;
+
+ dlhCompiler = dlopen(gbePath, RTLD_LAZY | RTLD_LOCAL);
+ if (dlhCompiler != NULL) {
+ compiler_program_new_from_source = *(gbe_program_new_from_source_cb **)dlsym(dlhCompiler, "gbe_program_new_from_source");
+ if (compiler_program_new_from_source == NULL)
+ return;
+
+ compiler_program_compile_from_source = *(gbe_program_compile_from_source_cb **)dlsym(dlhCompiler, "gbe_program_compile_from_source");
+ if (compiler_program_compile_from_source == NULL)
+ return;
+
+ compiler_program_new_gen_program = *(gbe_program_new_gen_program_cb **)dlsym(dlhCompiler, "gbe_program_new_gen_program");
+ if (compiler_program_new_gen_program == NULL)
+ return;
+
+ compiler_program_link_program = *(gbe_program_link_program_cb **)dlsym(dlhCompiler, "gbe_program_link_program");
+ if (compiler_program_link_program == NULL)
+ return;
+
+ compiler_program_build_from_llvm = *(gbe_program_build_from_llvm_cb **)dlsym(dlhCompiler, "gbe_program_build_from_llvm");
+ if (compiler_program_build_from_llvm == NULL)
+ return;
+
+ compiler_program_new_from_llvm_binary = *(gbe_program_new_from_llvm_binary_cb **)dlsym(dlhCompiler, "gbe_program_new_from_llvm_binary");
+ if (compiler_program_new_from_llvm_binary == NULL)
+ return;
+
+ compiler_program_serialize_to_binary = *(gbe_program_serialize_to_binary_cb **)dlsym(dlhCompiler, "gbe_program_serialize_to_binary");
+ if (compiler_program_serialize_to_binary == NULL)
+ return;
+
+ compiler_program_new_from_llvm = *(gbe_program_new_from_llvm_cb **)dlsym(dlhCompiler, "gbe_program_new_from_llvm");
+ if (compiler_program_new_from_llvm == NULL)
+ return;
+
+ compiler_program_clean_llvm_resource = *(gbe_program_clean_llvm_resource_cb **)dlsym(dlhCompiler, "gbe_program_clean_llvm_resource");
+ if (compiler_program_clean_llvm_resource == NULL)
+ return;
+
+ compilerLoaded = true;
+ }
+ }
+
+ ~GbeLoaderInitializer()
+ {
+ if (dlhCompiler != NULL)
+ dlclose(dlhCompiler);
+
+ if (dlhInterp != NULL)
+ dlclose(dlhInterp);
+ }
+
+ bool compilerLoaded;
+ void *dlhCompiler;
+ void *dlhInterp;
+};
+
+static struct GbeLoaderInitializer gbeLoader;
+
+int CompilerSupported()
+{
+ if (gbeLoader.compilerLoaded)
+ return 1;
+ else
+ return 0;
+}
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
new file mode 100644
index 0000000..da9d034
--- /dev/null
+++ b/src/cl_gbe_loader.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __CL_GBE_LOADER_H__
+#define __CL_GBE_LOADER_H__
+
+#include "program.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern gbe_program_new_from_source_cb *compiler_program_new_from_source;
+extern gbe_program_compile_from_source_cb *compiler_program_compile_from_source;
+extern gbe_program_new_gen_program_cb *compiler_program_new_gen_program;
+extern gbe_program_link_program_cb *compiler_program_link_program;
+extern gbe_program_build_from_llvm_cb *compiler_program_build_from_llvm;
+extern gbe_program_new_from_llvm_binary_cb *compiler_program_new_from_llvm_binary;
+extern gbe_program_serialize_to_binary_cb *compiler_program_serialize_to_binary;
+extern gbe_program_new_from_llvm_cb *compiler_program_new_from_llvm;
+extern gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource;
+
+extern gbe_program_new_from_binary_cb *interp_program_new_from_binary;
+extern gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size;
+extern gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data;
+extern gbe_program_delete_cb *interp_program_delete;
+extern gbe_program_get_kernel_num_cb *interp_program_get_kernel_num;
+extern gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name;
+extern gbe_program_get_kernel_cb *interp_program_get_kernel;
+extern gbe_kernel_get_name_cb *interp_kernel_get_name;
+extern gbe_kernel_get_attributes_cb *interp_kernel_get_attributes;
+extern gbe_kernel_get_code_cb *interp_kernel_get_code;
+extern gbe_kernel_get_code_size_cb *interp_kernel_get_code_size;
+extern gbe_kernel_get_arg_num_cb *interp_kernel_get_arg_num;
+extern gbe_kernel_get_arg_size_cb *interp_kernel_get_arg_size;
+extern gbe_kernel_get_arg_bti_cb *interp_kernel_get_arg_bti;
+extern gbe_kernel_get_arg_type_cb *interp_kernel_get_arg_type;
+extern gbe_kernel_get_arg_align_cb *interp_kernel_get_arg_align;
+extern gbe_kernel_get_simd_width_cb *interp_kernel_get_simd_width;
+extern gbe_kernel_get_curbe_offset_cb *interp_kernel_get_curbe_offset;
+extern gbe_kernel_get_curbe_size_cb *interp_kernel_get_curbe_size;
+extern gbe_kernel_get_stack_size_cb *interp_kernel_get_stack_size;
+extern gbe_kernel_get_scratch_size_cb *interp_kernel_get_scratch_size;
+extern gbe_kernel_get_required_work_group_size_cb *interp_kernel_get_required_work_group_size;
+extern gbe_kernel_use_slm_cb *interp_kernel_use_slm;
+extern gbe_kernel_get_slm_size_cb *interp_kernel_get_slm_size;
+extern gbe_kernel_get_sampler_size_cb *interp_kernel_get_sampler_size;
+extern gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data;
+extern gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size;
+extern gbe_kernel_get_image_size_cb *interp_kernel_get_image_size;
+extern gbe_kernel_get_image_data_cb *interp_kernel_get_image_data;
+extern gbe_get_printf_num_cb* interp_get_printf_num;
+extern gbe_get_printf_buf_bti_cb* interp_get_printf_buf_bti;
+extern gbe_get_printf_indexbuf_bti_cb* interp_get_printf_indexbuf_bti;
+extern gbe_dup_printfset_cb* interp_dup_printfset;
+extern gbe_get_printf_sizeof_size_cb* interp_get_printf_sizeof_size;
+extern gbe_release_printf_info_cb* interp_release_printf_info;
+extern gbe_output_printf_cb* interp_output_printf;
+extern gbe_kernel_get_arg_info_cb *interp_kernel_get_arg_info;
+
+int CompilerSupported();
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CL_GBE_LOADER_H__ */
diff --git a/src/cl_gen75_device.h b/src/cl_gen75_device.h
new file mode 100644
index 0000000..682ee06
--- /dev/null
+++ b/src/cl_gen75_device.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/* Common fields for both SNB devices (either GT1 or GT2)
+ */
+.max_parameter_size = 1024,
+.global_mem_cache_line_size = 128, /* XXX */
+.global_mem_cache_size = 8 << 10, /* XXX */
+.local_mem_type = CL_GLOBAL,
+.local_mem_size = 64 << 10,
+.scratch_mem_size = 2 << 20,
+
+#include "cl_gt_device.h"
+
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
new file mode 100644
index 0000000..69cc0b9
--- /dev/null
+++ b/src/cl_gen7_device.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/* Common fields for both IVB devices (either GT1 or GT2) */
+.max_parameter_size = 1024,
+.global_mem_cache_line_size = 128, /* XXX */
+.global_mem_cache_size = 8 << 10, /* XXX */
+.local_mem_type = CL_GLOBAL,
+.local_mem_size = 64 << 10,
+.scratch_mem_size = 12 << 10,
+
+#include "cl_gt_device.h"
+
diff --git a/src/cl_gl_api.c b/src/cl_gl_api.c
new file mode 100644
index 0000000..04dde5a
--- /dev/null
+++ b/src/cl_gl_api.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Zhigang Gong <zhigang.gong at intel.com>
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#ifdef HAS_EGL
+#include <GL/gl.h>
+#endif
+
+#include "cl_platform_id.h"
+#include "cl_device_id.h"
+#include "cl_context.h"
+#include "cl_command_queue.h"
+#include "cl_program.h"
+#include "cl_kernel.h"
+#include "cl_mem.h"
+#include "cl_image.h"
+#include "cl_sampler.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+
+#include "CL/cl.h"
+#include "CL/cl_gl.h"
+#include "CL/cl_intel.h"
+#include "cl_mem_gl.h"
+
+#define CHECK_GL_CONTEXT(CTX) \
+do { \
+ if (UNLIKELY(CTX->props.gl_type == CL_GL_NOSHARE)) { \
+ err = CL_INVALID_CONTEXT; \
+ goto error; \
+ } \
+} while (0)
+
+cl_mem
+clCreateFromGLBuffer(cl_context context,
+ cl_mem_flags flags,
+ GLuint bufobj,
+ cl_int * errcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+ CHECK_GL_CONTEXT (context);
+
+ mem = cl_mem_new_gl_buffer(context, flags, bufobj, &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+}
+
+cl_mem
+clCreateFromGLTexture2D(cl_context context,
+ cl_mem_flags flags,
+ GLenum texture_target,
+ GLint miplevel,
+ GLuint texture,
+ cl_int * errcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+ CHECK_GL_CONTEXT (context);
+
+ mem = cl_mem_new_gl_texture(context, flags, texture_target, miplevel, texture, &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+}
+
+cl_mem
+clCreateFromGLTexture3D(cl_context context,
+ cl_mem_flags flags,
+ GLenum texture_target,
+ GLint miplevel,
+ GLuint texture,
+ cl_int * errcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+ CHECK_GL_CONTEXT (context);
+
+ mem = cl_mem_new_gl_texture(context, flags, texture_target, miplevel, texture, &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+}
+
+cl_mem
+clCreateFromGLTexture(cl_context context,
+ cl_mem_flags flags,
+ cl_GLenum target,
+ cl_GLint miplevel,
+ cl_GLuint texture,
+ cl_int * errcode_ret)
+{
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ CHECK_CONTEXT (context);
+ CHECK_GL_CONTEXT (context);
+
+ mem = cl_mem_new_gl_texture(context, flags, target, miplevel, texture, &err);
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+
+}
+
+/* XXX NULL function currently. */
+cl_int clEnqueueAcquireGLObjects (cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem *mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ return err;
+}
+
+/* XXX NULL function currently. */
+cl_int clEnqueueReleaseGLObjects (cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem *mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ cl_int err = CL_SUCCESS;
+ return err;
+}
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
new file mode 100644
index 0000000..e2fcee3
--- /dev/null
+++ b/src/cl_gt_device.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/* Common fields for both all GT devices (IVB / SNB) */
+.device_type = CL_DEVICE_TYPE_GPU,
+.vendor_id = 0, /* == device_id (set when requested) */
+.max_work_item_dimensions = 3,
+.max_1d_global_work_sizes = {1024 * 1024 * 256, 1, 1},
+.max_2d_global_work_sizes = {8192, 8192, 1},
+.max_3d_global_work_sizes = {8192, 8192, 2048},
+.preferred_vector_width_char = 8,
+.preferred_vector_width_short = 8,
+.preferred_vector_width_int = 4,
+.preferred_vector_width_long = 2,
+.preferred_vector_width_float = 4,
+.preferred_vector_width_double = 0,
+.preferred_vector_width_half = 0,
+.native_vector_width_char = 8,
+.native_vector_width_short = 8,
+.native_vector_width_int = 4,
+.native_vector_width_long = 2,
+.native_vector_width_float = 4,
+.native_vector_width_double = 2,
+.native_vector_width_half = 8,
+.preferred_wg_sz_mul = 16,
+.address_bits = 32,
+.max_mem_alloc_size = 256 * 1024 * 1024,
+.image_support = CL_TRUE,
+.max_read_image_args = 128,
+.max_write_image_args = 8,
+.image_max_array_size = 2048,
+.image2d_max_width = 8192,
+.image2d_max_height = 8192,
+.image3d_max_width = 8192,
+.image3d_max_height = 8192,
+.image3d_max_depth = 2048,
+.image_mem_size = 8192,
+.max_samplers = 16,
+.mem_base_addr_align = sizeof(cl_long) * 16 * 8,
+.min_data_type_align_size = sizeof(cl_long) * 16,
+.single_fp_config = 0, /* XXX */
+.double_fp_config = 0,
+.global_mem_cache_type = CL_READ_WRITE_CACHE,
+.global_mem_size = 1024 * 1024 * 1024,
+.max_constant_buffer_size = 512 << 10,
+.max_constant_args = 8,
+.error_correction_support = CL_FALSE,
+.host_unified_memory = CL_FALSE,
+.profiling_timer_resolution = 80, /* ns */
+.endian_little = CL_TRUE,
+.available = CL_TRUE,
+.compiler_available = CL_TRUE,
+.linker_available = CL_TRUE,
+.execution_capabilities = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL,
+.queue_properties = CL_QUEUE_PROFILING_ENABLE,
+.platform = NULL, /* == intel_platform (set when requested) */
+/* IEEE 754, XXX does IVB support CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT? */
+.single_fp_config = CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST , /* IEEE 754. */
+.printf_buffer_size = 1 * 1024 * 1024,
+.interop_user_sync = CL_TRUE,
+
+#define DECL_INFO_STRING(FIELD, STRING) \
+ .FIELD = STRING, \
+ .JOIN(FIELD,_sz) = sizeof(STRING),
+DECL_INFO_STRING(name, "Intel HD Graphics Family")
+DECL_INFO_STRING(vendor, "Intel")
+DECL_INFO_STRING(version, LIBCL_VERSION_STRING)
+DECL_INFO_STRING(profile, "FULL_PROFILE")
+DECL_INFO_STRING(opencl_c_version, LIBCL_C_VERSION_STRING)
+DECL_INFO_STRING(extensions, "")
+DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;"
+ "__cl_copy_region_align16;"
+ "__cl_cpy_region_unalign_same_offset;"
+ "__cl_copy_region_unalign_dst_offset;"
+ "__cl_copy_region_unalign_src_offset;"
+ "__cl_copy_buffer_rect;"
+ "__cl_copy_image_1d_to_1d;"
+ "__cl_copy_image_2d_to_2d;"
+ "__cl_copy_image_3d_to_2d;"
+ "__cl_copy_image_2d_to_3d;"
+ "__cl_copy_image_3d_to_3d;"
+ "__cl_copy_image_2d_to_buffer;"
+ "__cl_copy_image_3d_to_buffer;"
+ "__cl_copy_buffer_to_image_2d;"
+ "__cl_copy_buffer_to_image_3d;"
+ "__cl_fill_region_unalign;"
+ "__cl_fill_region_align2;"
+ "__cl_fill_region_align4;"
+ "__cl_fill_region_align8_2;"
+ "__cl_fill_region_align8_4;"
+ "__cl_fill_region_align8_8;"
+ "__cl_fill_region_align8_16;"
+ "__cl_fill_region_align128;"
+ "__cl_fill_image_1d;"
+ "__cl_fill_image_1d_array;"
+ "__cl_fill_image_2d;"
+ "__cl_fill_image_2d_array;"
+ "__cl_fill_image_3d;")
+
+DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)
+#undef DECL_INFO_STRING
+.parent_device = NULL,
+.partition_max_sub_device = 1,
+.partition_property = {0},
+.affinity_domain = 0,
+.partition_type = {0},
+.device_reference_count = 1,
+
diff --git a/src/cl_image.c b/src/cl_image.c
new file mode 100644
index 0000000..ced9789
--- /dev/null
+++ b/src/cl_image.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_image.h"
+#include "cl_utils.h"
+#include "intel/intel_defines.h"
+
+#include <assert.h>
+
+LOCAL cl_int
+cl_image_byte_per_pixel(const cl_image_format *fmt, uint32_t *bpp)
+{
+ assert(bpp);
+
+ if(fmt == NULL)
+ return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+
+ const uint32_t type = fmt->image_channel_data_type;
+ const uint32_t order = fmt->image_channel_order;
+ switch (type) {
+#define DECL_BPP(DATA_TYPE, VALUE) case DATA_TYPE: *bpp = VALUE;
+ DECL_BPP(CL_SNORM_INT8, 1); break;
+ DECL_BPP(CL_SNORM_INT16, 2); break;
+ DECL_BPP(CL_UNORM_INT8, 1); break;
+ DECL_BPP(CL_UNORM_INT16, 2); break;
+ DECL_BPP(CL_UNORM_SHORT_565, 2);
+ if (order != CL_RGBx && order != CL_RGB)
+ return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ break;
+ DECL_BPP(CL_UNORM_SHORT_555, 2);
+ if (order != CL_RGBx && order != CL_RGB)
+ return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ break;
+ DECL_BPP(CL_UNORM_INT_101010, 4);
+ if (order != CL_RGBx && order != CL_RGB)
+ return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ break;
+ DECL_BPP(CL_SIGNED_INT8, 1); break;
+ DECL_BPP(CL_SIGNED_INT16, 2); break;
+ DECL_BPP(CL_SIGNED_INT32, 4); break;
+ DECL_BPP(CL_UNSIGNED_INT8, 1); break;
+ DECL_BPP(CL_UNSIGNED_INT16, 2); break;
+ DECL_BPP(CL_UNSIGNED_INT32, 4); break;
+ DECL_BPP(CL_HALF_FLOAT, 2); break;
+ DECL_BPP(CL_FLOAT, 4); break;
+#undef DECL_BPP
+ default: return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ };
+
+ switch (order) {
+ case CL_Rx: break;
+ case CL_R: break;
+ case CL_A: break;
+ case CL_RA: *bpp *= 2; break;
+ case CL_RG: *bpp *= 2; break;
+ case CL_INTENSITY:
+ case CL_LUMINANCE:
+ if (type != CL_UNORM_INT8 && type != CL_UNORM_INT16 &&
+ type != CL_SNORM_INT8 && type != CL_SNORM_INT16 &&
+ type != CL_HALF_FLOAT && type != CL_FLOAT)
+ return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ break;
+ case CL_RGB:
+ case CL_RGBx:
+ if (type != CL_UNORM_SHORT_555 &&
+ type != CL_UNORM_SHORT_565 &&
+ type != CL_UNORM_INT_101010)
+ return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ break;
+ case CL_RGBA: *bpp *= 4; break;
+ case CL_ARGB:
+ case CL_BGRA:
+ if (type != CL_UNORM_INT8 && type != CL_SIGNED_INT8 &&
+ type != CL_SNORM_INT8 && type != CL_UNSIGNED_INT8)
+ return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ *bpp *= 4;
+ break;
+ default: return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ };
+
+ return CL_SUCCESS;
+}
+
+LOCAL uint32_t
+cl_image_get_intel_format(const cl_image_format *fmt)
+{
+ const uint32_t type = fmt->image_channel_data_type;
+ const uint32_t order = fmt->image_channel_order;
+ switch (order) {
+ case CL_R:
+#if 0
+ case CL_Rx:
+ case CL_A:
+ case CL_INTENSITY:
+ case CL_LUMINANCE:
+ if ((order == CL_INTENSITY || order == CL_LUMINANCE)
+ && (type != CL_UNORM_INT8 && type != CL_UNORM_INT16
+ && type != CL_SNORM_INT8 && type != CL_SNORM_INT16
+ && type != CL_HALF_FLOAT && type != CL_FLOAT))
+ return INTEL_UNSUPPORTED_FORMAT;
+#endif
+
+/* XXX it seems we have some acuracy compatible issue with snomr_int8/16,
+ * have to disable those formats currently. */
+
+ switch (type) {
+ case CL_HALF_FLOAT: return I965_SURFACEFORMAT_R16_FLOAT;
+ case CL_FLOAT: return I965_SURFACEFORMAT_R32_FLOAT;
+// case CL_SNORM_INT16: return I965_SURFACEFORMAT_R16_SNORM;
+// case CL_SNORM_INT8: return I965_SURFACEFORMAT_R8_SNORM;
+ case CL_UNORM_INT8: return I965_SURFACEFORMAT_R8_UNORM;
+ case CL_UNORM_INT16: return I965_SURFACEFORMAT_R16_UNORM;
+ case CL_SIGNED_INT8: return I965_SURFACEFORMAT_R8_SINT;
+ case CL_SIGNED_INT16: return I965_SURFACEFORMAT_R16_SINT;
+ case CL_SIGNED_INT32: return I965_SURFACEFORMAT_R32_SINT;
+ case CL_UNSIGNED_INT8: return I965_SURFACEFORMAT_R8_UINT;
+ case CL_UNSIGNED_INT16: return I965_SURFACEFORMAT_R16_UINT;
+ case CL_UNSIGNED_INT32: return I965_SURFACEFORMAT_R32_UINT;
+ default: return INTEL_UNSUPPORTED_FORMAT;
+ };
+#if 0
+ case CL_RG:
+ case CL_RA:
+ switch (type) {
+ case CL_HALF_FLOAT: return I965_SURFACEFORMAT_R16G16_FLOAT;
+ case CL_FLOAT: return I965_SURFACEFORMAT_R32G32_FLOAT;
+ case CL_SNORM_INT16: return I965_SURFACEFORMAT_R16G16_SNORM;
+ case CL_SNORM_INT8: return I965_SURFACEFORMAT_R8G8_SNORM;
+ case CL_UNORM_INT8: return I965_SURFACEFORMAT_R8G8_UNORM;
+ case CL_UNORM_INT16: return I965_SURFACEFORMAT_R16G16_UNORM;
+ case CL_SIGNED_INT8: return I965_SURFACEFORMAT_R8G8_SINT;
+ case CL_SIGNED_INT16: return I965_SURFACEFORMAT_R16G16_SINT;
+ case CL_SIGNED_INT32: return I965_SURFACEFORMAT_R32G32_SINT;
+ case CL_UNSIGNED_INT8: return I965_SURFACEFORMAT_R8G8_UINT;
+ case CL_UNSIGNED_INT16: return I965_SURFACEFORMAT_R16G16_UINT;
+ case CL_UNSIGNED_INT32: return I965_SURFACEFORMAT_R32G32_UINT;
+ default: return INTEL_UNSUPPORTED_FORMAT;
+ };
+ case CL_RGB:
+ case CL_RGBx:
+ switch (type) {
+ case CL_UNORM_INT_101010: return I965_SURFACEFORMAT_R10G10B10A2_UNORM;
+ case CL_UNORM_SHORT_565:
+ case CL_UNORM_SHORT_555:
+ default: return INTEL_UNSUPPORTED_FORMAT;
+ };
+#endif
+ case CL_RGBA:
+ switch (type) {
+ case CL_HALF_FLOAT: return I965_SURFACEFORMAT_R16G16B16A16_FLOAT;
+ case CL_FLOAT: return I965_SURFACEFORMAT_R32G32B32A32_FLOAT;
+// case CL_SNORM_INT16: return I965_SURFACEFORMAT_R16G16B16A16_SNORM;
+// case CL_SNORM_INT8: return I965_SURFACEFORMAT_R8G8B8A8_SNORM;
+ case CL_UNORM_INT8: return I965_SURFACEFORMAT_R8G8B8A8_UNORM;
+ case CL_UNORM_INT16: return I965_SURFACEFORMAT_R16G16B16A16_UNORM;
+ case CL_SIGNED_INT8: return I965_SURFACEFORMAT_R8G8B8A8_SINT;
+ case CL_SIGNED_INT16: return I965_SURFACEFORMAT_R16G16B16A16_SINT;
+ case CL_SIGNED_INT32: return I965_SURFACEFORMAT_R32G32B32A32_SINT;
+ case CL_UNSIGNED_INT8: return I965_SURFACEFORMAT_R8G8B8A8_UINT;
+ case CL_UNSIGNED_INT16: return I965_SURFACEFORMAT_R16G16B16A16_UINT;
+ case CL_UNSIGNED_INT32: return I965_SURFACEFORMAT_R32G32B32A32_UINT;
+ default: return INTEL_UNSUPPORTED_FORMAT;
+ };
+ case CL_ARGB: return INTEL_UNSUPPORTED_FORMAT;
+ case CL_BGRA:
+ switch (type) {
+ case CL_UNORM_INT8: return I965_SURFACEFORMAT_B8G8R8A8_UNORM;
+ default: return INTEL_UNSUPPORTED_FORMAT;
+ };
+ default: return INTEL_UNSUPPORTED_FORMAT;
+ };
+}
+
+static const uint32_t cl_image_order[] = {
+ CL_R, CL_A, CL_RG, CL_RA, CL_RGB, CL_RGBA, CL_BGRA, CL_ARGB,
+ CL_INTENSITY, CL_LUMINANCE, CL_Rx, CL_RGx, CL_RGBx
+};
+
+static const uint32_t cl_image_type[] = {
+ CL_SNORM_INT8, CL_SNORM_INT16, CL_UNORM_INT8, CL_UNORM_INT16,
+ CL_UNORM_SHORT_565, CL_UNORM_SHORT_555, CL_UNORM_INT_101010,
+ CL_SIGNED_INT8, CL_SIGNED_INT16, CL_SIGNED_INT32,
+ CL_UNSIGNED_INT8, CL_UNSIGNED_INT16, CL_UNSIGNED_INT32,
+ CL_HALF_FLOAT, CL_FLOAT
+};
+
+static const size_t cl_image_order_n = SIZEOF32(cl_image_order);
+static const size_t cl_image_type_n = SIZEOF32(cl_image_type);
+
+cl_int
+cl_image_get_supported_fmt(cl_context ctx,
+ cl_mem_object_type image_type,
+ cl_uint num_entries,
+ cl_image_format *image_formats,
+ cl_uint *num_image_formats)
+{
+ size_t i, j, n = 0;
+ for (i = 0; i < cl_image_order_n; ++i)
+ for (j = 0; j < cl_image_type_n; ++j) {
+ const cl_image_format fmt = {
+ .image_channel_order = cl_image_order[i],
+ .image_channel_data_type = cl_image_type[j]
+ };
+ const uint32_t intel_fmt = cl_image_get_intel_format(&fmt);
+ if (intel_fmt == INTEL_UNSUPPORTED_FORMAT)
+ continue;
+ if (n < num_entries && image_formats) image_formats[n] = fmt;
+ n++;
+ }
+ if (num_image_formats) *num_image_formats = n;
+ return CL_SUCCESS;
+}
+
diff --git a/src/cl_image.h b/src/cl_image.h
new file mode 100644
index 0000000..86cc76a
--- /dev/null
+++ b/src/cl_image.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_IMAGE_H__
+#define __CL_IMAGE_H__
+
+#include "cl_internals.h"
+#include "CL/cl.h"
+#include <stdint.h>
+
+/* Returned when the OCL format is not supported */
+#define INTEL_UNSUPPORTED_FORMAT ((uint32_t) ~0x0u)
+
+/* Compute the number of bytes per pixel if the format is supported */
+extern cl_int cl_image_byte_per_pixel(const cl_image_format *fmt, uint32_t *bpp);
+
+/* Return the intel format for the given OCL format */
+extern uint32_t cl_image_get_intel_format(const cl_image_format *fmt);
+
+/* Return the list of formats supported by the API */
+extern cl_int cl_image_get_supported_fmt(cl_context context,
+ cl_mem_object_type image_type,
+ cl_uint num_entries,
+ cl_image_format *image_formats,
+ cl_uint *num_image_formats);
+
+#endif /* __CL_IMAGE_H__ */
+
diff --git a/src/cl_internals.h b/src/cl_internals.h
new file mode 100644
index 0000000..693de1d
--- /dev/null
+++ b/src/cl_internals.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_INTERNALS_H__
+#define __CL_INTERNALS_H__
+
+/* We put a header to identify each object. This will make the programmer life
+ * easy if objects are wrongly used in the API
+ */
+#define CL_MAGIC_KERNEL_HEADER 0x1234567890abcdefLL
+#define CL_MAGIC_CONTEXT_HEADER 0x0ab123456789cdefLL
+#define CL_MAGIC_PROGRAM_HEADER 0x34560ab12789cdefLL
+#define CL_MAGIC_QUEUE_HEADER 0x83650a12b79ce4dfLL
+#define CL_MAGIC_SAMPLER_HEADER 0x686a0ecba79ce33fLL
+#define CL_MAGIC_EVENT_HEADER 0x8324a9c810ebf90fLL
+#define CL_MAGIC_MEM_HEADER 0x381a27b9ce6504dfLL
+#define CL_MAGIC_DEAD_HEADER 0xdeaddeaddeaddeadLL
+
+#endif /* __CL_INTERNALS_H__ */
+
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
new file mode 100644
index 0000000..55b707a
--- /dev/null
+++ b/src/cl_kernel.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_kernel.h"
+#include "cl_program.h"
+#include "cl_device_id.h"
+#include "cl_context.h"
+#include "cl_mem.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+#include "cl_khr_icd.h"
+#include "CL/cl.h"
+#include "cl_sampler.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+
+LOCAL void
+cl_kernel_delete(cl_kernel k)
+{
+ uint32_t i;
+ if (k == NULL) return;
+
+ /* We are not done with the kernel */
+ if (atomic_dec(&k->ref_n) > 1) return;
+ /* Release one reference on all bos we own */
+ if (k->bo) cl_buffer_unreference(k->bo);
+ /* This will be true for kernels created by clCreateKernel */
+ if (k->ref_its_program) cl_program_delete(k->program);
+ /* Release the curbe if allocated */
+ if (k->curbe) cl_free(k->curbe);
+ /* Release the argument array if required */
+ if (k->args) {
+ for (i = 0; i < k->arg_n; ++i)
+ if (k->args[i].mem != NULL)
+ cl_mem_delete(k->args[i].mem);
+ cl_free(k->args);
+ }
+ if (k->image_sz)
+ cl_free(k->images);
+ k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+ cl_free(k);
+}
+
+LOCAL cl_kernel
+cl_kernel_new(cl_program p)
+{
+ cl_kernel k = NULL;
+ TRY_ALLOC_NO_ERR (k, CALLOC(struct _cl_kernel));
+ SET_ICD(k->dispatch)
+ k->ref_n = 1;
+ k->magic = CL_MAGIC_KERNEL_HEADER;
+ k->program = p;
+
+exit:
+ return k;
+error:
+ cl_kernel_delete(k);
+ k = NULL;
+ goto exit;
+}
+
+LOCAL const char*
+cl_kernel_get_name(cl_kernel k)
+{
+ if (UNLIKELY(k == NULL)) return NULL;
+ return interp_kernel_get_name(k->opaque);
+}
+
+LOCAL const char*
+cl_kernel_get_attributes(cl_kernel k)
+{
+ if (UNLIKELY(k == NULL)) return NULL;
+ return interp_kernel_get_attributes(k->opaque);
+}
+
+LOCAL void
+cl_kernel_add_ref(cl_kernel k)
+{
+ atomic_inc(&k->ref_n);
+}
+
+LOCAL cl_int
+cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
+{
+ uint32_t offset; /* where to patch */
+ enum gbe_arg_type arg_type; /* kind of argument */
+ size_t arg_sz; /* size of the argument */
+ cl_mem mem = NULL; /* for __global, __constant and image arguments */
+ cl_context ctx = k->program->ctx;
+
+ if (UNLIKELY(index >= k->arg_n))
+ return CL_INVALID_ARG_INDEX;
+ arg_type = interp_kernel_get_arg_type(k->opaque, index);
+ arg_sz = interp_kernel_get_arg_size(k->opaque, index);
+
+ if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
+ if (arg_sz == 2 && arg_type == GBE_ARG_VALUE && sz == sizeof(cl_sampler)) {
+ /* FIXME, this is a workaround for the case when a kernel arg
+ defined a sampler_t but doesn't use it.*/
+ arg_type = GBE_ARG_SAMPLER;
+ } else
+ return CL_INVALID_ARG_SIZE;
+ }
+
+ if(UNLIKELY(arg_type == GBE_ARG_LOCAL_PTR && sz == 0))
+ return CL_INVALID_ARG_SIZE;
+ if(arg_type == GBE_ARG_VALUE) {
+ if(UNLIKELY(value == NULL))
+ return CL_INVALID_ARG_VALUE;
+ } else if(arg_type == GBE_ARG_LOCAL_PTR) {
+ if(UNLIKELY(value != NULL))
+ return CL_INVALID_ARG_VALUE;
+ } else if(arg_type == GBE_ARG_SAMPLER) {
+ if (UNLIKELY(value == NULL))
+ return CL_INVALID_ARG_VALUE;
+
+ cl_sampler s = *(cl_sampler*)value;
+ if(s->magic != CL_MAGIC_SAMPLER_HEADER)
+ return CL_INVALID_SAMPLER;
+ } else {
+ // should be image, GLOBAL_PTR, CONSTANT_PTR
+ if (UNLIKELY(value == NULL && arg_type == GBE_ARG_IMAGE))
+ return CL_INVALID_ARG_VALUE;
+ if(value != NULL)
+ mem = *(cl_mem*)value;
+ if(value != NULL && mem) {
+ if( CL_SUCCESS != is_valid_mem(mem, ctx->buffers))
+ return CL_INVALID_MEM_OBJECT;
+
+ if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !IS_IMAGE(mem))
+ || (arg_type != GBE_ARG_IMAGE && IS_IMAGE(mem))))
+ return CL_INVALID_ARG_VALUE;
+ }
+ }
+
+ /* Copy the structure or the value directly into the curbe */
+ if (arg_type == GBE_ARG_VALUE) {
+ offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+ assert(offset + sz <= k->curbe_sz);
+ memcpy(k->curbe + offset, value, sz);
+ k->args[index].local_sz = 0;
+ k->args[index].is_set = 1;
+ k->args[index].mem = NULL;
+ return CL_SUCCESS;
+ }
+
+ /* For a local pointer just save the size */
+ if (arg_type == GBE_ARG_LOCAL_PTR) {
+ k->args[index].local_sz = sz;
+ k->args[index].is_set = 1;
+ k->args[index].mem = NULL;
+ return CL_SUCCESS;
+ }
+
+ /* Is it a sampler*/
+ if (arg_type == GBE_ARG_SAMPLER) {
+ cl_sampler sampler;
+ memcpy(&sampler, value, sz);
+ k->args[index].local_sz = 0;
+ k->args[index].is_set = 1;
+ k->args[index].mem = NULL;
+ k->args[index].sampler = sampler;
+ cl_set_sampler_arg_slot(k, index, sampler);
+ offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+ assert(offset + 2 <= k->curbe_sz);
+ memcpy(k->curbe + offset, &sampler->clkSamplerValue, 2);
+ return CL_SUCCESS;
+ }
+
+ if(value != NULL)
+ mem = *(cl_mem*) value;
+
+ if(value == NULL || mem == NULL) {
+ /* for buffer object GLOBAL_PTR CONSTANT_PTR, it maybe NULL */
+ int32_t offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+ *((uint32_t *)(k->curbe + offset)) = 0;
+ assert(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR);
+
+ if (k->args[index].mem)
+ cl_mem_delete(k->args[index].mem);
+ k->args[index].mem = NULL;
+ k->args[index].is_set = 1;
+ k->args[index].local_sz = 0;
+ return CL_SUCCESS;
+ }
+
+ mem = *(cl_mem*) value;
+
+ cl_mem_add_ref(mem);
+ if (k->args[index].mem)
+ cl_mem_delete(k->args[index].mem);
+ k->args[index].mem = mem;
+ k->args[index].is_set = 1;
+ k->args[index].local_sz = 0;
+ k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
+ return CL_SUCCESS;
+}
+
+LOCAL int
+cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_name,
+ size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+ assert(k != NULL);
+ void *ret_info = interp_kernel_get_arg_info(k->opaque, arg_index,
+ param_name - CL_KERNEL_ARG_ADDRESS_QUALIFIER);
+ int str_len = 0;
+ cl_kernel_arg_type_qualifier type_qual = CL_KERNEL_ARG_TYPE_NONE;
+
+ switch (param_name) {
+ case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
+ if (param_value_size < sizeof(cl_kernel_arg_address_qualifier))
+ return CL_INVALID_VALUE;
+ if (param_value_size_ret)
+ *param_value_size_ret = sizeof(cl_kernel_arg_address_qualifier);
+ if (!param_value) return CL_SUCCESS;
+ if ((cl_ulong)ret_info == 0) {
+ *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_PRIVATE;
+ } else if ((cl_ulong)ret_info == 1 || (cl_ulong)ret_info == 4) {
+ *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_GLOBAL;
+ } else if ((cl_ulong)ret_info == 2) {
+ *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_CONSTANT;
+ } else if ((cl_ulong)ret_info == 3) {
+ *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_LOCAL;
+ } else {
+ /* If no address qualifier is specified, the default address qualifier
+ which is CL_KERNEL_ARG_ADDRESS_PRIVATE is returned. */
+ *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_LOCAL;
+ }
+ return CL_SUCCESS;
+
+ case CL_KERNEL_ARG_ACCESS_QUALIFIER:
+ if (param_value_size < sizeof(cl_kernel_arg_access_qualifier))
+ return CL_INVALID_VALUE;
+ if (param_value_size_ret)
+ *param_value_size_ret = sizeof(cl_kernel_arg_access_qualifier);
+ if (!param_value) return CL_SUCCESS;
+ if (!strcmp((char*)ret_info, "write_only")) {
+ *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
+ } else if (!strcmp((char*)ret_info, "read_only")) {
+ *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_READ_ONLY;
+ } else if (!strcmp((char*)ret_info, "read_write")) {
+ *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_READ_WRITE;
+ } else {
+ *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_NONE;
+ }
+ return CL_SUCCESS;
+
+ case CL_KERNEL_ARG_TYPE_NAME:
+ case CL_KERNEL_ARG_NAME:
+ str_len = strlen(ret_info);
+ if (param_value_size < str_len + 1)
+ return CL_INVALID_VALUE;
+ if (param_value_size_ret)
+ *param_value_size_ret = str_len + 1;
+ if (!param_value) return CL_SUCCESS;
+ memcpy(param_value, ret_info, str_len);
+ ((char *)param_value)[str_len] = 0;
+ return CL_SUCCESS;
+
+ case CL_KERNEL_ARG_TYPE_QUALIFIER:
+ if (param_value_size < sizeof(cl_kernel_arg_type_qualifier))
+ return CL_INVALID_VALUE;
+ if (param_value_size_ret)
+ *param_value_size_ret = sizeof(cl_kernel_arg_type_qualifier);
+ if (!param_value) return CL_SUCCESS;
+ if (strstr((char*)ret_info, "const"))
+ type_qual = type_qual | CL_KERNEL_ARG_TYPE_CONST;
+ if (strstr((char*)ret_info, "volatile"))
+ type_qual = type_qual | CL_KERNEL_ARG_TYPE_VOLATILE;
+ if (strstr((char*)ret_info, "restrict"))
+ type_qual = type_qual | CL_KERNEL_ARG_TYPE_RESTRICT;
+ *(cl_kernel_arg_type_qualifier *)param_value = type_qual;
+ return CL_SUCCESS;
+
+ default:
+ assert(0);
+ }
+
+ return CL_SUCCESS;
+}
+
+LOCAL uint32_t
+cl_kernel_get_simd_width(cl_kernel k)
+{
+ assert(k != NULL);
+ return interp_kernel_get_simd_width(k->opaque);
+}
+
+LOCAL void
+cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
+{
+ cl_context ctx = k->program->ctx;
+ cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx);
+
+ if(k->bo != NULL)
+ cl_buffer_unreference(k->bo);
+
+ /* Allocate the gen code here */
+ const uint32_t code_sz = interp_kernel_get_code_size(opaque);
+ const char *code = interp_kernel_get_code(opaque);
+ k->bo = cl_buffer_alloc(bufmgr, "CL kernel", code_sz, 64u);
+ k->arg_n = interp_kernel_get_arg_num(opaque);
+
+ /* Upload the code */
+ cl_buffer_subdata(k->bo, 0, code_sz, code);
+ k->opaque = opaque;
+
+ /* Create the curbe */
+ k->curbe_sz = interp_kernel_get_curbe_size(k->opaque);
+
+ /* Get sampler data & size */
+ k->sampler_sz = interp_kernel_get_sampler_size(k->opaque);
+ assert(k->sampler_sz <= GEN_MAX_SAMPLERS);
+ if (k->sampler_sz > 0)
+ interp_kernel_get_sampler_data(k->opaque, k->samplers);
+ interp_kernel_get_compile_wg_size(k->opaque, k->compile_wg_sz);
+ k->stack_size = interp_kernel_get_stack_size(k->opaque);
+ /* Get image data & size */
+ k->image_sz = interp_kernel_get_image_size(k->opaque);
+ assert(k->sampler_sz <= GEN_MAX_SURFACES);
+ if (k->image_sz > 0) {
+ TRY_ALLOC_NO_ERR(k->images, cl_calloc(k->image_sz, sizeof(k->images[0])));
+ interp_kernel_get_image_data(k->opaque, k->images);
+ } else
+ k->images = NULL;
+ return;
+error:
+ cl_buffer_unreference(k->bo);
+ k->bo = NULL;
+}
+
+LOCAL cl_kernel
+cl_kernel_dup(cl_kernel from)
+{
+ cl_kernel to = NULL;
+
+ if (UNLIKELY(from == NULL))
+ return NULL;
+ TRY_ALLOC_NO_ERR (to, CALLOC(struct _cl_kernel));
+ SET_ICD(to->dispatch)
+ to->bo = from->bo;
+ to->opaque = from->opaque;
+ to->ref_n = 1;
+ to->magic = CL_MAGIC_KERNEL_HEADER;
+ to->program = from->program;
+ to->arg_n = from->arg_n;
+ to->curbe_sz = from->curbe_sz;
+ to->sampler_sz = from->sampler_sz;
+ to->image_sz = from->image_sz;
+ memcpy(to->compile_wg_sz, from->compile_wg_sz, sizeof(from->compile_wg_sz));
+ to->stack_size = from->stack_size;
+ if (to->sampler_sz)
+ memcpy(to->samplers, from->samplers, to->sampler_sz * sizeof(uint32_t));
+ if (to->image_sz) {
+ TRY_ALLOC_NO_ERR(to->images, cl_calloc(to->image_sz, sizeof(to->images[0])));
+ memcpy(to->images, from->images, to->image_sz * sizeof(to->images[0]));
+ } else
+ to->images = NULL;
+ TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument)));
+ if (to->curbe_sz) TRY_ALLOC_NO_ERR(to->curbe, cl_calloc(1, to->curbe_sz));
+
+ /* Retain the bos */
+ if (from->bo) cl_buffer_reference(from->bo);
+
+ /* We retain the program destruction since this kernel (user allocated)
+ * depends on the program for some of its pointers
+ */
+ assert(from->program);
+ cl_program_add_ref(from->program);
+ to->ref_its_program = CL_TRUE;
+
+exit:
+ return to;
+error:
+ cl_kernel_delete(to);
+ to = NULL;
+ goto exit;
+}
+
+LOCAL cl_int
+cl_kernel_work_group_sz(cl_kernel ker,
+ const size_t *local_wk_sz,
+ uint32_t wk_dim,
+ size_t *wk_grp_sz)
+{
+ cl_int err = CL_SUCCESS;
+ size_t sz = 0;
+ cl_uint i;
+
+ for (i = 0; i < wk_dim; ++i) {
+ const uint32_t required_sz = interp_kernel_get_required_work_group_size(ker->opaque, i);
+ if (required_sz != 0 && required_sz != local_wk_sz[i]) {
+ err = CL_INVALID_WORK_ITEM_SIZE;
+ goto error;
+ }
+ }
+ sz = local_wk_sz[0];
+ for (i = 1; i < wk_dim; ++i)
+ sz *= local_wk_sz[i];
+
+ if (sz > cl_get_kernel_max_wg_sz(ker)) {
+ err = CL_INVALID_WORK_ITEM_SIZE;
+ goto error;
+ }
+
+error:
+ if (wk_grp_sz) *wk_grp_sz = sz;
+ return err;
+}
+
+
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
new file mode 100644
index 0000000..1ed90a5
--- /dev/null
+++ b/src/cl_kernel.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_KERNEL_H__
+#define __CL_KERNEL_H__
+
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "cl_gbe_loader.h"
+#include "CL/cl.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* This is the kernel as it is interfaced by the compiler */
+struct _gbe_kernel;
+
+/* We need to save buffer data for relocation and binding and we must figure out
+ * if all arguments are properly set
+ */
+typedef struct cl_argument {
+ cl_mem mem; /* For image and regular buffers */
+ cl_sampler sampler; /* For sampler. */
+ unsigned char bti;
+ uint32_t local_sz:31; /* For __local size specification */
+ uint32_t is_set:1; /* All args must be set before NDRange */
+} cl_argument;
+
+/* One OCL function */
+struct _cl_kernel {
+ DEFINE_ICD(dispatch)
+ uint64_t magic; /* To identify it as a kernel */
+ volatile int ref_n; /* We reference count this object */
+ cl_buffer bo; /* The code itself */
+ cl_program program; /* Owns this structure (and pointers) */
+ gbe_kernel opaque; /* (Opaque) compiler structure for the OCL kernel */
+ char *curbe; /* One curbe per kernel */
+ size_t curbe_sz; /* Size of it */
+ uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel & kernel args */
+ size_t sampler_sz; /* sampler size defined in kernel & kernel args. */
+ struct ImageInfo *images; /* images defined in kernel args */
+ size_t image_sz; /* image count in kernel args */
+ cl_ulong local_mem_sz; /* local memory size specified in kernel args. */
+ size_t compile_wg_sz[3]; /* Required workgroup size by __attribute__((reqd_work_gro
+ up_size(X, Y, Z))) qualifier.*/
+ size_t global_work_sz[3]; /* maximum global size that can be used to execute a kernel
+ (i.e. global_work_size argument to clEnqueueNDRangeKernel.)*/
+ size_t stack_size; /* stack size per work item. */
+ cl_argument *args; /* To track argument setting */
+ uint32_t arg_n:31; /* Number of arguments */
+ uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
+};
+
+/* Allocate an empty kernel */
+extern cl_kernel cl_kernel_new(cl_program);
+
+/* Destroy and deallocate an empty kernel */
+extern void cl_kernel_delete(cl_kernel);
+
+/* Setup the kernel with the given GBE Kernel */
+extern void cl_kernel_setup(cl_kernel k, gbe_kernel opaque);
+
+/* Get the kernel name */
+extern const char *cl_kernel_get_name(cl_kernel k);
+
+/* Get the kernel attributes*/
+extern const char *cl_kernel_get_attributes(cl_kernel k);
+
+/* Get the simd width as used in the code */
+extern uint32_t cl_kernel_get_simd_width(cl_kernel k);
+
+/* When a kernel is created from outside, we just duplicate the structure we
+ * have internally and give it back to the user
+ */
+extern cl_kernel cl_kernel_dup(cl_kernel);
+
+/* Add one more reference on the kernel object */
+extern void cl_kernel_add_ref(cl_kernel);
+
+/* Set the argument before kernel execution */
+extern int cl_kernel_set_arg(cl_kernel,
+ uint32_t arg_index,
+ size_t arg_size,
+ const void *arg_value);
+
+/* Get the argument information */
+extern int cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index,
+ cl_kernel_arg_info param_name,
+ size_t param_value_size, void *param_value,
+ size_t *param_value_size_ret);
+
+/* Compute and check the work group size from the user provided local size */
+extern cl_int
+cl_kernel_work_group_sz(cl_kernel ker,
+ const size_t *local_wk_sz,
+ cl_uint wk_dim,
+ size_t *wk_grp_sz);
+
+#endif /* __CL_KERNEL_H__ */
+
diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c
new file mode 100644
index 0000000..50a0898
--- /dev/null
+++ b/src/cl_khr_icd.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright © 2013 Simon Richter
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <ocl_icd.h>
+
+#include "cl_platform_id.h"
+
+/* The interop functions are not implemented in Beignet */
+#define CL_GL_INTEROP(x) NULL
+/* OpenCL 1.2 is not implemented in Beignet */
+#define CL_1_2_NOTYET(x) NULL
+
+/** Return platform list through ICD interface
+ * This code is used only if a client is linked directly against the library
+ * instead of using the ICD loader. In this case, no other implementations
+ * should exist in the process address space, so the call is equivalent to
+ * clGetPlatformIDs().
+ *
+ * @param[in] num_entries Number of entries allocated in return buffer
+ * @param[out] platforms Platform identifiers supported by this implementation
+ * @param[out] num_platforms Number of platform identifiers returned
+ * @return OpenCL error code
+ * @retval CL_SUCCESS Successful execution
+ * @retval CL_PLATFORM_NOT_FOUND_KHR No platforms provided
+ * @retval CL_INVALID_VALUE Invalid parameters
+ */
+cl_int
+clIcdGetPlatformIDsKHR(cl_uint num_entries,
+ cl_platform_id * platforms,
+ cl_uint * num_platforms)
+{
+ return clGetPlatformIDs(num_entries, platforms, num_platforms);
+}
+
+struct _cl_icd_dispatch const cl_khr_icd_dispatch = {
+ clGetPlatformIDs,
+ clGetPlatformInfo,
+ clGetDeviceIDs,
+ clGetDeviceInfo,
+ clCreateContext,
+ clCreateContextFromType,
+ clRetainContext,
+ clReleaseContext,
+ clGetContextInfo,
+ clCreateCommandQueue,
+ clRetainCommandQueue,
+ clReleaseCommandQueue,
+ clGetCommandQueueInfo,
+ (void *) NULL, /* clSetCommandQueueProperty */
+ clCreateBuffer,
+ clCreateImage2D,
+ clCreateImage3D,
+ clRetainMemObject,
+ clReleaseMemObject,
+ clGetSupportedImageFormats,
+ clGetMemObjectInfo,
+ clGetImageInfo,
+ clCreateSampler,
+ clRetainSampler,
+ clReleaseSampler,
+ clGetSamplerInfo,
+ clCreateProgramWithSource,
+ clCreateProgramWithBinary,
+ clRetainProgram,
+ clReleaseProgram,
+ clBuildProgram,
+ clUnloadCompiler,
+ clGetProgramInfo,
+ clGetProgramBuildInfo,
+ clCreateKernel,
+ clCreateKernelsInProgram,
+ clRetainKernel,
+ clReleaseKernel,
+ clSetKernelArg,
+ clGetKernelInfo,
+ clGetKernelWorkGroupInfo,
+ clWaitForEvents,
+ clGetEventInfo,
+ clRetainEvent,
+ clReleaseEvent,
+ clGetEventProfilingInfo,
+ clFlush,
+ clFinish,
+ clEnqueueReadBuffer,
+ clEnqueueWriteBuffer,
+ clEnqueueCopyBuffer,
+ clEnqueueReadImage,
+ clEnqueueWriteImage,
+ clEnqueueCopyImage,
+ clEnqueueCopyImageToBuffer,
+ clEnqueueCopyBufferToImage,
+ clEnqueueMapBuffer,
+ clEnqueueMapImage,
+ clEnqueueUnmapMemObject,
+ clEnqueueNDRangeKernel,
+ clEnqueueTask,
+ clEnqueueNativeKernel,
+ clEnqueueMarker,
+ clEnqueueWaitForEvents,
+ clEnqueueBarrier,
+ clGetExtensionFunctionAddress,
+ CL_GL_INTEROP(clCreateFromGLBuffer),
+ CL_GL_INTEROP(clCreateFromGLTexture2D),
+ CL_GL_INTEROP(clCreateFromGLTexture3D),
+ CL_GL_INTEROP(clCreateFromGLRenderbuffer),
+ CL_GL_INTEROP(clGetGLObjectInfo),
+ CL_GL_INTEROP(clGetGLTextureInfo),
+ CL_GL_INTEROP(clEnqueueAcquireGLObjects),
+ CL_GL_INTEROP(clEnqueueReleaseGLObjects),
+ CL_GL_INTEROP(clGetGLContextInfoKHR),
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ clSetEventCallback,
+ clCreateSubBuffer,
+ clSetMemObjectDestructorCallback,
+ clCreateUserEvent,
+ clSetUserEventStatus,
+ clEnqueueReadBufferRect,
+ clEnqueueWriteBufferRect,
+ clEnqueueCopyBufferRect,
+ CL_1_2_NOTYET(clCreateSubDevicesEXT),
+ CL_1_2_NOTYET(clRetainDeviceEXT),
+ CL_1_2_NOTYET(clReleaseDeviceEXT),
+#ifdef CL_VERSION_1_2
+ (void *) NULL,
+ clCreateSubDevices,
+ clRetainDevice,
+ clReleaseDevice,
+ clCreateImage,
+ clCreateProgramWithBuiltInKernels,
+ clCompileProgram,
+ clLinkProgram,
+ clUnloadPlatformCompiler,
+ clGetKernelArgInfo,
+ clEnqueueFillBuffer,
+ clEnqueueFillImage,
+ clEnqueueMigrateMemObjects,
+ clEnqueueMarkerWithWaitList,
+ clEnqueueBarrierWithWaitList,
+ clGetExtensionFunctionAddressForPlatform,
+ CL_GL_INTEROP(clCreateFromGLTexture),
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL,
+ (void *) NULL
+#endif
+};
+
diff --git a/src/cl_khr_icd.h b/src/cl_khr_icd.h
new file mode 100644
index 0000000..1e206b4
--- /dev/null
+++ b/src/cl_khr_icd.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright © 2013 Simon Richter
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __CL_KHR_ICD_H__
+#define __CL_KHR_ICD_H__
+
+#ifdef HAS_OCLIcd
+
+#define SET_ICD(dispatch) \
+ dispatch = &cl_khr_icd_dispatch;
+#define INIT_ICD(member) .member = &cl_khr_icd_dispatch,
+#define DEFINE_ICD(member) struct _cl_icd_dispatch const *member;
+
+extern struct _cl_icd_dispatch const cl_khr_icd_dispatch;
+#else
+#define SET_ICD(dispatch)
+#define INIT_ICD(member)
+#define DEFINE_ICD(member)
+#endif
+
+#endif
diff --git a/src/cl_mem.c b/src/cl_mem.c
new file mode 100644
index 0000000..81c4d64
--- /dev/null
+++ b/src/cl_mem.c
@@ -0,0 +1,1903 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_mem.h"
+#include "cl_image.h"
+#include "cl_context.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_device_id.h"
+#include "cl_driver.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+#include "cl_command_queue.h"
+
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#define FIELD_SIZE(CASE,TYPE) \
+ case JOIN(CL_,CASE): \
+ if(param_value_size_ret) \
+ *param_value_size_ret = sizeof(TYPE); \
+ if(!param_value) \
+ return CL_SUCCESS; \
+ if(param_value_size < sizeof(TYPE)) \
+ return CL_INVALID_VALUE; \
+ break;
+
+#define MAX_TILING_SIZE 128 * MB
+
+static cl_mem_object_type
+cl_get_mem_object_type(cl_mem mem)
+{
+ switch (mem->type) {
+ case CL_MEM_BUFFER_TYPE:
+ case CL_MEM_SUBBUFFER_TYPE:
+ return CL_MEM_OBJECT_BUFFER;
+ case CL_MEM_IMAGE_TYPE:
+ case CL_MEM_GL_IMAGE_TYPE:
+ {
+ struct _cl_mem_image *image = cl_mem_image(mem);
+ return image->image_type;
+ }
+ default:
+ return CL_MEM_OBJECT_BUFFER;
+ }
+}
+
+LOCAL cl_int
+cl_get_mem_object_info(cl_mem mem,
+ cl_mem_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ switch(param_name)
+ {
+ FIELD_SIZE(MEM_TYPE, cl_mem_object_type);
+ FIELD_SIZE(MEM_FLAGS, cl_mem_flags);
+ FIELD_SIZE(MEM_SIZE, size_t);
+ FIELD_SIZE(MEM_HOST_PTR, void *);
+ FIELD_SIZE(MEM_MAP_COUNT, cl_uint);
+ FIELD_SIZE(MEM_REFERENCE_COUNT, cl_uint);
+ FIELD_SIZE(MEM_CONTEXT, cl_context);
+ FIELD_SIZE(MEM_ASSOCIATED_MEMOBJECT, cl_mem);
+ FIELD_SIZE(MEM_OFFSET, size_t);
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ switch(param_name)
+ {
+ case CL_MEM_TYPE:
+ *((cl_mem_object_type *)param_value) = cl_get_mem_object_type(mem);
+ break;
+ case CL_MEM_FLAGS:
+ *((cl_mem_flags *)param_value) = mem->flags;
+ break;
+ case CL_MEM_SIZE:
+ *((size_t *)param_value) = mem->size;
+ break;
+ case CL_MEM_HOST_PTR:
+ if(mem->type == CL_MEM_IMAGE_TYPE) {
+ *((size_t *)param_value) = (size_t)mem->host_ptr;
+ } else {
+ struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
+ *((size_t *)param_value) = (size_t)mem->host_ptr + buf->sub_offset;
+ }
+ break;
+ case CL_MEM_MAP_COUNT:
+ *((cl_uint *)param_value) = mem->map_ref;
+ break;
+ case CL_MEM_REFERENCE_COUNT:
+ *((cl_uint *)param_value) = mem->ref_n;
+ break;
+ case CL_MEM_CONTEXT:
+ *((cl_context *)param_value) = mem->ctx;
+ break;
+ case CL_MEM_ASSOCIATED_MEMOBJECT:
+ if(mem->type != CL_MEM_SUBBUFFER_TYPE) {
+ *((cl_mem *)param_value) = NULL;
+ } else {
+ struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
+ *((cl_mem *)param_value) = (cl_mem)(buf->parent);
+ }
+ break;
+ case CL_MEM_OFFSET:
+ if(mem->type != CL_MEM_SUBBUFFER_TYPE) {
+ *((size_t *)param_value) = 0;
+ } else {
+ struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
+ *((size_t *)param_value) = buf->sub_offset;
+ }
+ break;
+ }
+
+ return CL_SUCCESS;
+}
+
+#define IS_1D(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D || \
+ image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || \
+ image->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+
+#define IS_2D(image) (image->image_type == CL_MEM_OBJECT_IMAGE2D || \
+ image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+
+#define IS_3D(image) (image->image_type == CL_MEM_OBJECT_IMAGE3D)
+
+#define IS_ARRAY(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || \
+ image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+
+LOCAL cl_int
+cl_get_image_info(cl_mem mem,
+ cl_image_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ int err;
+ CHECK_IMAGE(mem, image);
+
+ switch(param_name)
+ {
+ FIELD_SIZE(IMAGE_FORMAT, cl_image_format);
+ FIELD_SIZE(IMAGE_ELEMENT_SIZE, size_t);
+ FIELD_SIZE(IMAGE_ROW_PITCH, size_t);
+ FIELD_SIZE(IMAGE_SLICE_PITCH, size_t);
+ FIELD_SIZE(IMAGE_WIDTH, size_t);
+ FIELD_SIZE(IMAGE_HEIGHT, size_t);
+ FIELD_SIZE(IMAGE_DEPTH, size_t);
+ FIELD_SIZE(IMAGE_ARRAY_SIZE, size_t);
+ FIELD_SIZE(IMAGE_BUFFER, cl_mem);
+ FIELD_SIZE(IMAGE_NUM_MIP_LEVELS, cl_uint);
+ FIELD_SIZE(IMAGE_NUM_SAMPLES, cl_uint);
+ default:
+ return CL_INVALID_VALUE;
+ }
+
+ switch(param_name)
+ {
+ case CL_IMAGE_FORMAT:
+ *(cl_image_format *)param_value = image->fmt;
+ break;
+ case CL_IMAGE_ELEMENT_SIZE:
+ *(size_t *)param_value = image->bpp;
+ break;
+ case CL_IMAGE_ROW_PITCH:
+ *(size_t *)param_value = image->row_pitch;
+ break;
+ case CL_IMAGE_SLICE_PITCH:
+ *(size_t *)param_value = image->slice_pitch;
+ break;
+ case CL_IMAGE_WIDTH:
+ *(size_t *)param_value = image->w;
+ break;
+ case CL_IMAGE_HEIGHT:
+ *(size_t *)param_value = IS_1D(image) ? 0 : image->h;
+ break;
+ case CL_IMAGE_DEPTH:
+ *(size_t *)param_value = IS_3D(image) ? image->depth : 0;
+ break;
+ case CL_IMAGE_ARRAY_SIZE:
+ *(size_t *)param_value = IS_ARRAY(image) ? image->depth : 0;
+ break;
+ case CL_IMAGE_BUFFER:
+ *(cl_mem *)param_value = image->buffer_1d;
+ break;
+ case CL_IMAGE_NUM_MIP_LEVELS:
+ case CL_IMAGE_NUM_SAMPLES:
+ *(cl_mem *)param_value = 0;
+ break;
+ }
+
+ return CL_SUCCESS;
+
+error:
+ return err;
+}
+
+#undef FIELD_SIZE
+
+LOCAL cl_mem
+cl_mem_allocate(enum cl_mem_type type,
+ cl_context ctx,
+ cl_mem_flags flags,
+ size_t sz,
+ cl_int is_tiled,
+ cl_int *errcode)
+{
+ cl_buffer_mgr bufmgr = NULL;
+ cl_mem mem = NULL;
+ cl_int err = CL_SUCCESS;
+ size_t alignment = 64;
+
+ assert(ctx);
+
+ /* Allocate and inialize the structure itself */
+ if (type == CL_MEM_IMAGE_TYPE) {
+ struct _cl_mem_image *image = NULL;
+ TRY_ALLOC (image, CALLOC(struct _cl_mem_image));
+ mem = &image->base;
+ } else if (type == CL_MEM_GL_IMAGE_TYPE ) {
+ struct _cl_mem_gl_image *gl_image = NULL;
+ TRY_ALLOC (gl_image, CALLOC(struct _cl_mem_gl_image));
+ mem = &gl_image->base.base;
+ } else {
+ struct _cl_mem_buffer *buffer = NULL;
+ TRY_ALLOC (buffer, CALLOC(struct _cl_mem_buffer));
+ mem = &buffer->base;
+ }
+ mem->type = type;
+ SET_ICD(mem->dispatch)
+ mem->ref_n = 1;
+ mem->magic = CL_MAGIC_MEM_HEADER;
+ mem->flags = flags;
+
+ if (sz != 0) {
+ /* Pinning will require stricter alignment rules */
+ if ((flags & CL_MEM_PINNABLE) || is_tiled)
+ alignment = 4096;
+
+ /* Allocate space in memory */
+ bufmgr = cl_context_get_bufmgr(ctx);
+ assert(bufmgr);
+ mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
+ if (UNLIKELY(mem->bo == NULL)) {
+ err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+ goto error;
+ }
+ mem->size = sz;
+ }
+
+ cl_context_add_ref(ctx);
+ mem->ctx = ctx;
+ /* Append the buffer in the context buffer list */
+ pthread_mutex_lock(&ctx->buffer_lock);
+ mem->next = ctx->buffers;
+ if (ctx->buffers != NULL)
+ ctx->buffers->prev = mem;
+ ctx->buffers = mem;
+ pthread_mutex_unlock(&ctx->buffer_lock);
+
+exit:
+ if (errcode)
+ *errcode = err;
+ return mem;
+error:
+ cl_mem_delete(mem);
+ mem = NULL;
+ goto exit;
+
+}
+
+LOCAL cl_int
+is_valid_mem(cl_mem mem, cl_mem buffers)
+{
+ cl_mem tmp = buffers;
+ while(tmp){
+ if(mem == tmp){
+ if (UNLIKELY(mem->magic != CL_MAGIC_MEM_HEADER))
+ return CL_INVALID_MEM_OBJECT;
+ return CL_SUCCESS;
+ }
+ tmp = tmp->next;
+ }
+ return CL_INVALID_MEM_OBJECT;
+}
+
+LOCAL cl_mem
+cl_mem_new_buffer(cl_context ctx,
+ cl_mem_flags flags,
+ size_t sz,
+ void *data,
+ cl_int *errcode_ret)
+{
+ /* Possible mem type combination:
+ CL_MEM_ALLOC_HOST_PTR
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR
+ CL_MEM_USE_HOST_PTR
+ CL_MEM_COPY_HOST_PTR */
+
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = NULL;
+ cl_ulong max_mem_size;
+
+ if (UNLIKELY(sz == 0)) {
+ err = CL_INVALID_BUFFER_SIZE;
+ goto error;
+ }
+
+ if (UNLIKELY(((flags & CL_MEM_READ_WRITE)
+ && (flags & (CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY)))
+ || ((flags & CL_MEM_READ_ONLY) && (flags & (CL_MEM_WRITE_ONLY)))
+ || ((flags & CL_MEM_ALLOC_HOST_PTR) && (flags & CL_MEM_USE_HOST_PTR))
+ || ((flags & CL_MEM_COPY_HOST_PTR) && (flags & CL_MEM_USE_HOST_PTR))
+ || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS))
+ || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_WRITE_ONLY))
+ || ((flags & CL_MEM_HOST_WRITE_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS))
+ || ((flags & (~(CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY
+ | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR
+ | CL_MEM_USE_HOST_PTR | CL_MEM_HOST_WRITE_ONLY
+ | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS))) != 0))) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ /* This flag is valid only if host_ptr is not NULL */
+ if (UNLIKELY((((flags & CL_MEM_COPY_HOST_PTR) ||
+ (flags & CL_MEM_USE_HOST_PTR)) &&
+ data == NULL))
+ || (!(flags & (CL_MEM_COPY_HOST_PTR
+ |CL_MEM_USE_HOST_PTR))
+ && (data != NULL))) {
+ err = CL_INVALID_HOST_PTR;
+ goto error;
+ }
+
+ /* CL_MEM_ALLOC_HOST_PTR and CL_MEM_USE_HOST_PTR
+ are mutually exclusive. */
+ if (UNLIKELY(flags & CL_MEM_ALLOC_HOST_PTR &&
+ flags & CL_MEM_USE_HOST_PTR)) {
+ err = CL_INVALID_HOST_PTR;
+ goto error;
+ }
+
+ /* CL_MEM_COPY_HOST_PTR and CL_MEM_USE_HOST_PTR
+ are mutually exclusive. */
+ if (UNLIKELY(flags & CL_MEM_COPY_HOST_PTR &&
+ flags & CL_MEM_USE_HOST_PTR)) {
+ err = CL_INVALID_HOST_PTR;
+ goto error;
+ }
+
+ if ((err = cl_get_device_info(ctx->device,
+ CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+ sizeof(max_mem_size),
+ &max_mem_size,
+ NULL)) != CL_SUCCESS) {
+ goto error;
+ }
+
+ if (UNLIKELY(sz > max_mem_size)) {
+ err = CL_INVALID_BUFFER_SIZE;
+ goto error;
+ }
+
+ /* HSW: Byte scattered Read/Write has limitation that
+ the buffer size must be a multiple of 4 bytes. */
+ sz = ALIGN(sz, 4);
+
+ /* Create the buffer in video memory */
+ mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, &err);
+ if (mem == NULL || err != CL_SUCCESS)
+ goto error;
+
+ /* Copy the data if required */
+ if (flags & CL_MEM_COPY_HOST_PTR || flags & CL_MEM_USE_HOST_PTR)
+ cl_buffer_subdata(mem->bo, 0, sz, data);
+
+ if (flags & CL_MEM_USE_HOST_PTR || flags & CL_MEM_COPY_HOST_PTR)
+ mem->host_ptr = data;
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+error:
+ cl_mem_delete(mem);
+ mem = NULL;
+ goto exit;
+}
+
+LOCAL cl_mem
+cl_mem_new_sub_buffer(cl_mem buffer,
+ cl_mem_flags flags,
+ cl_buffer_create_type create_type,
+ const void *create_info,
+ cl_int *errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = NULL;
+ struct _cl_mem_buffer *sub_buf = NULL;
+
+ if (buffer->type != CL_MEM_BUFFER_TYPE) {
+ err = CL_INVALID_MEM_OBJECT;
+ goto error;
+ }
+
+ if (flags && (((buffer->flags & CL_MEM_WRITE_ONLY) && (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_ONLY)))
+ || ((buffer->flags & CL_MEM_READ_ONLY) && (flags & (CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY)))
+ || (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR))
+ || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS))
+ || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_WRITE_ONLY))
+ || ((flags & CL_MEM_HOST_WRITE_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS)))) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if((flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_READ_WRITE)) == 0) {
+ flags |= buffer->flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_READ_WRITE);
+ }
+ flags |= buffer->flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR);
+ if((flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) == 0) {
+ flags |= buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS);
+ }
+
+ if (create_type != CL_BUFFER_CREATE_TYPE_REGION) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!create_info) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ cl_buffer_region *info = (cl_buffer_region *)create_info;
+
+ if (!info->size) {
+ err = CL_INVALID_BUFFER_SIZE;
+ goto error;
+ }
+
+ if (info->origin > buffer->size || info->origin + info->size > buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (info->origin & (buffer->ctx->device->mem_base_addr_align / 8 - 1)) {
+ err = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+ goto error;
+ }
+
+ /* Now create the sub buffer and link it to the buffer. */
+ TRY_ALLOC (sub_buf, CALLOC(struct _cl_mem_buffer));
+ mem = &sub_buf->base;
+ mem->type = CL_MEM_SUBBUFFER_TYPE;
+ SET_ICD(mem->dispatch)
+ mem->ref_n = 1;
+ mem->magic = CL_MAGIC_MEM_HEADER;
+ mem->flags = flags;
+ sub_buf->parent = (struct _cl_mem_buffer*)buffer;
+
+ cl_mem_add_ref(buffer);
+ /* Append the buffer in the parent buffer list */
+ pthread_mutex_lock(&((struct _cl_mem_buffer*)buffer)->sub_lock);
+ sub_buf->sub_next = ((struct _cl_mem_buffer*)buffer)->subs;
+ if (((struct _cl_mem_buffer*)buffer)->subs != NULL)
+ ((struct _cl_mem_buffer*)buffer)->subs->sub_prev = sub_buf;
+ ((struct _cl_mem_buffer*)buffer)->subs = sub_buf;
+ pthread_mutex_unlock(&((struct _cl_mem_buffer*)buffer)->sub_lock);
+
+ mem->bo = buffer->bo;
+ mem->size = info->size;
+ sub_buf->sub_offset = info->origin;
+ if (buffer->flags & CL_MEM_USE_HOST_PTR || buffer->flags & CL_MEM_COPY_HOST_PTR) {
+ mem->host_ptr = buffer->host_ptr;
+ }
+
+ cl_context_add_ref(buffer->ctx);
+ mem->ctx = buffer->ctx;
+ /* Append the buffer in the context buffer list */
+ pthread_mutex_lock(&buffer->ctx->buffer_lock);
+ mem->next = buffer->ctx->buffers;
+ if (buffer->ctx->buffers != NULL)
+ buffer->ctx->buffers->prev = mem;
+ buffer->ctx->buffers = mem;
+ pthread_mutex_unlock(&buffer->ctx->buffer_lock);
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+error:
+ cl_mem_delete(mem);
+ mem = NULL;
+ goto exit;
+}
+
+void cl_mem_replace_buffer(cl_mem buffer, cl_buffer new_bo)
+{
+ cl_buffer_unreference(buffer->bo);
+ buffer->bo = new_bo;
+ cl_buffer_reference(new_bo);
+ if (buffer->type != CL_MEM_SUBBUFFER_TYPE)
+ return;
+
+ struct _cl_mem_buffer *it = ((struct _cl_mem_buffer*)buffer)->sub_next;
+ for( ; it != (struct _cl_mem_buffer*)buffer; it = it->sub_next)
+ {
+ cl_buffer_unreference(it->base.bo);
+ it->base.bo = new_bo;
+ cl_buffer_reference(new_bo);
+ }
+}
+
+void
+cl_mem_copy_image_region(const size_t *origin, const size_t *region,
+ void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
+ const void *src, size_t src_row_pitch, size_t src_slice_pitch,
+ const struct _cl_mem_image *image, cl_bool offset_dst, cl_bool offset_src)
+{
+ if(offset_dst) {
+ size_t dst_offset = image->bpp * origin[0] + dst_row_pitch * origin[1] + dst_slice_pitch * origin[2];
+ dst = (char*)dst + dst_offset;
+ }
+ if(offset_src) {
+ size_t src_offset = image->bpp * origin[0] + src_row_pitch * origin[1] + src_slice_pitch * origin[2];
+ src = (char*)src + src_offset;
+ }
+ if (!origin[0] && region[0] == image->w && dst_row_pitch == src_row_pitch &&
+ (region[2] == 1 || (!origin[1] && region[1] == image->h && dst_slice_pitch == src_slice_pitch)))
+ {
+ memcpy(dst, src, region[2] == 1 ? src_row_pitch*region[1] : src_slice_pitch*region[2]);
+ }
+ else {
+ cl_uint y, z;
+ for (z = 0; z < region[2]; z++) {
+ const char* src_ptr = src;
+ char* dst_ptr = dst;
+ for (y = 0; y < region[1]; y++) {
+ memcpy(dst_ptr, src_ptr, image->bpp*region[0]);
+ src_ptr += src_row_pitch;
+ dst_ptr += dst_row_pitch;
+ }
+ src = (char*)src + src_slice_pitch;
+ dst = (char*)dst + dst_slice_pitch;
+ }
+ }
+}
+
+void
+cl_mem_copy_image_to_image(const size_t *dst_origin,const size_t *src_origin, const size_t *region,
+ const struct _cl_mem_image *dst_image, const struct _cl_mem_image *src_image)
+{
+ char* dst= cl_mem_map_auto((cl_mem)dst_image);
+ char* src= cl_mem_map_auto((cl_mem)src_image);
+ size_t dst_offset = dst_image->bpp * dst_origin[0] + dst_image->row_pitch * dst_origin[1] + dst_image->slice_pitch * dst_origin[2];
+ size_t src_offset = src_image->bpp * src_origin[0] + src_image->row_pitch * src_origin[1] + src_image->slice_pitch * src_origin[2];
+ dst= (char*)dst+ dst_offset;
+ src= (char*)src+ src_offset;
+ cl_uint y, z;
+ for (z = 0; z < region[2]; z++) {
+ const char* src_ptr = src;
+ char* dst_ptr = dst;
+ for (y = 0; y < region[1]; y++) {
+ memcpy(dst_ptr, src_ptr, src_image->bpp*region[0]);
+ src_ptr += src_image->row_pitch;
+ dst_ptr += dst_image->row_pitch;
+ }
+ src = (char*)src + src_image->slice_pitch;
+ dst = (char*)dst + dst_image->slice_pitch;
+ }
+
+ cl_mem_unmap_auto((cl_mem)src_image);
+ cl_mem_unmap_auto((cl_mem)dst_image);
+
+}
+
+static void
+cl_mem_copy_image(struct _cl_mem_image *image,
+ size_t row_pitch,
+ size_t slice_pitch,
+ void* host_ptr)
+{
+ char* dst_ptr = cl_mem_map_auto((cl_mem)image);
+ size_t origin[3] = {0, 0, 0};
+ size_t region[3] = {image->w, image->h, image->depth};
+
+ cl_mem_copy_image_region(origin, region, dst_ptr, image->row_pitch, image->slice_pitch,
+ host_ptr, row_pitch, slice_pitch, image, CL_FALSE, CL_FALSE); //offset is 0
+ cl_mem_unmap_auto((cl_mem)image);
+}
+
+static const uint32_t tile_sz = 4096; /* 4KB per tile */
+static const uint32_t tilex_w = 512; /* tileX width in bytes */
+static const uint32_t tilex_h = 8; /* tileX height in number of rows */
+static const uint32_t tiley_w = 128; /* tileY width in bytes */
+static const uint32_t tiley_h = 32; /* tileY height in number of rows */
+static const uint32_t valign = 2; /* vertical alignment is 2. */
+
+cl_image_tiling_t cl_get_default_tiling(void)
+{
+ static int initialized = 0;
+ static cl_image_tiling_t tiling = CL_TILE_X;
+ if (!initialized) {
+ char *tilingStr = getenv("OCL_TILING");
+ if (tilingStr != NULL) {
+ switch (tilingStr[0]) {
+ case '0': tiling = CL_NO_TILE; break;
+ case '1': tiling = CL_TILE_X; break;
+ case '2': tiling = CL_TILE_Y; break;
+ default:
+ break;
+ }
+ }
+ initialized = 1;
+ }
+
+ return tiling;
+}
+
+static cl_mem
+_cl_mem_new_image(cl_context ctx,
+ cl_mem_flags flags,
+ const cl_image_format *fmt,
+ const cl_mem_object_type orig_image_type,
+ size_t w,
+ size_t h,
+ size_t depth,
+ size_t pitch,
+ size_t slice_pitch,
+ void *data,
+ cl_int *errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = NULL;
+ cl_mem_object_type image_type = orig_image_type;
+ uint32_t bpp = 0, intel_fmt = INTEL_UNSUPPORTED_FORMAT;
+ size_t sz = 0, aligned_pitch = 0, aligned_slice_pitch = 0, aligned_h = 0;
+ cl_image_tiling_t tiling = CL_NO_TILE;
+
+ /* Check flags consistency */
+ if (UNLIKELY((flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) && data == NULL)) {
+ err = CL_INVALID_HOST_PTR;
+ goto error;
+ }
+
+ /* Get the size of each pixel */
+ if (UNLIKELY((err = cl_image_byte_per_pixel(fmt, &bpp)) != CL_SUCCESS))
+ goto error;
+
+ /* Only a sub-set of the formats are supported */
+ intel_fmt = cl_image_get_intel_format(fmt);
+ if (UNLIKELY(intel_fmt == INTEL_UNSUPPORTED_FORMAT)) {
+ err = CL_IMAGE_FORMAT_NOT_SUPPORTED;
+ goto error;
+ }
+
+ /* See if the user parameters match */
+#define DO_IMAGE_ERROR \
+ do { \
+ err = CL_INVALID_IMAGE_SIZE; \
+ goto error; \
+ } while (0);
+
+ if (UNLIKELY(w == 0)) DO_IMAGE_ERROR;
+ if (UNLIKELY(h == 0 && (image_type != CL_MEM_OBJECT_IMAGE1D &&
+ image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY &&
+ image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER)))
+ DO_IMAGE_ERROR;
+
+ if (image_type == CL_MEM_OBJECT_IMAGE1D ||
+ image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+ size_t min_pitch = bpp * w;
+ if (data && pitch == 0)
+ pitch = min_pitch;
+
+ h = 1;
+ depth = 1;
+ if (UNLIKELY(w > ctx->device->image2d_max_width)) DO_IMAGE_ERROR;
+ if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
+ if (UNLIKELY(data && (slice_pitch % pitch != 0))) DO_IMAGE_ERROR;
+ if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
+ if (UNLIKELY(!data && slice_pitch != 0)) DO_IMAGE_ERROR;
+ tiling = CL_NO_TILE;
+ } else if (image_type == CL_MEM_OBJECT_IMAGE2D) {
+ size_t min_pitch = bpp * w;
+ if (data && pitch == 0)
+ pitch = min_pitch;
+ if (UNLIKELY(w > ctx->device->image2d_max_width)) DO_IMAGE_ERROR;
+ if (UNLIKELY(h > ctx->device->image2d_max_height)) DO_IMAGE_ERROR;
+ if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
+ if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
+
+ /* Pick up tiling mode (we do only linear on SNB) */
+ if (cl_driver_get_ver(ctx->drv) != 6)
+ tiling = cl_get_default_tiling();
+
+ depth = 1;
+ } else if (image_type == CL_MEM_OBJECT_IMAGE3D ||
+ image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||
+ image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+ if (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+ h = 1;
+ tiling = CL_NO_TILE;
+ } else if (cl_driver_get_ver(ctx->drv) != 6)
+ tiling = cl_get_default_tiling();
+
+ size_t min_pitch = bpp * w;
+ if (data && pitch == 0)
+ pitch = min_pitch;
+ size_t min_slice_pitch = pitch * h;
+ if (data && slice_pitch == 0)
+ slice_pitch = min_slice_pitch;
+ if (UNLIKELY(w > ctx->device->image3d_max_width)) DO_IMAGE_ERROR;
+ if (UNLIKELY(h > ctx->device->image3d_max_height)) DO_IMAGE_ERROR;
+ if (image_type == CL_MEM_OBJECT_IMAGE3D &&
+ (UNLIKELY(depth > ctx->device->image3d_max_depth))) DO_IMAGE_ERROR
+ else if (UNLIKELY(depth > ctx->device->image_max_array_size)) DO_IMAGE_ERROR;
+ if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
+ if (UNLIKELY(data && min_slice_pitch > slice_pitch)) DO_IMAGE_ERROR;
+ if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
+ if (UNLIKELY(!data && slice_pitch != 0)) DO_IMAGE_ERROR;
+
+ } else
+ assert(0);
+
+#undef DO_IMAGE_ERROR
+
+ /* Tiling requires to align both pitch and height */
+ if (tiling == CL_NO_TILE) {
+ aligned_pitch = w * bpp;
+ aligned_h = ALIGN(h, valign);
+ } else if (tiling == CL_TILE_X) {
+ aligned_pitch = ALIGN(w * bpp, tilex_w);
+ aligned_h = ALIGN(h, tilex_h);
+ } else if (tiling == CL_TILE_Y) {
+ aligned_pitch = ALIGN(w * bpp, tiley_w);
+ aligned_h = ALIGN(h, tiley_h);
+ }
+
+ sz = aligned_pitch * aligned_h * depth;
+
+ /* If sz is large than 128MB, map gtt may fail in some system.
+ Because there is no obviours performance drop, disable tiling. */
+ if(tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) {
+ tiling = CL_NO_TILE;
+ aligned_pitch = w * bpp;
+ aligned_h = h;
+ sz = aligned_pitch * aligned_h * depth;
+ }
+
+ mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, &err);
+ if (mem == NULL || err != CL_SUCCESS)
+ goto error;
+
+ cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+ if (image_type == CL_MEM_OBJECT_IMAGE1D ||
+ image_type == CL_MEM_OBJECT_IMAGE2D ||
+ image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+ aligned_slice_pitch = 0;
+ else
+ aligned_slice_pitch = aligned_pitch * ALIGN(h, 2);
+
+ cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt,
+ intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling,
+ 0, 0, 0);
+
+ /* Copy the data if required */
+ if (flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) {
+ cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
+ if (flags & CL_MEM_USE_HOST_PTR) {
+ mem->host_ptr = data;
+ cl_mem_image(mem)->host_row_pitch = pitch;
+ cl_mem_image(mem)->host_slice_pitch = slice_pitch;
+ }
+ }
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+error:
+ cl_mem_delete(mem);
+ mem = NULL;
+ goto exit;
+}
+
+static cl_mem
+_cl_mem_new_image_from_buffer(cl_context ctx,
+ cl_mem_flags flags,
+ const cl_image_format* image_format,
+ const cl_image_desc *image_desc,
+ cl_int *errcode_ret)
+{
+ cl_mem image = NULL;
+ cl_mem buffer = image_desc->buffer;
+ cl_int err = CL_SUCCESS;
+ *errcode_ret = err;
+ cl_ulong max_size;
+ cl_mem_flags merged_flags;
+ uint32_t bpp;
+ uint32_t intel_fmt = INTEL_UNSUPPORTED_FORMAT;
+ size_t offset = 0;
+
+ /* Get the size of each pixel */
+ if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, &bpp)) != CL_SUCCESS))
+ goto error;
+
+ /* Only a sub-set of the formats are supported */
+ intel_fmt = cl_image_get_intel_format(image_format);
+ if (UNLIKELY(intel_fmt == INTEL_UNSUPPORTED_FORMAT)) {
+ err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+ goto error;
+ }
+
+ if (!buffer) {
+ err = CL_INVALID_IMAGE_DESCRIPTOR;
+ goto error;
+ }
+
+ if (flags & (CL_MEM_USE_HOST_PTR|CL_MEM_ALLOC_HOST_PTR|CL_MEM_COPY_HOST_PTR)) {
+ err = CL_INVALID_IMAGE_DESCRIPTOR;
+ goto error;
+ }
+
+ /* access check. */
+ if ((buffer->flags & CL_MEM_WRITE_ONLY) &&
+ (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_ONLY))) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ if ((buffer->flags & CL_MEM_READ_ONLY) &&
+ (flags & (CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY))) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ if ((buffer->flags & CL_MEM_HOST_WRITE_ONLY) &&
+ (flags & CL_MEM_HOST_READ_ONLY)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ if ((buffer->flags & CL_MEM_HOST_READ_ONLY) &&
+ (flags & CL_MEM_HOST_WRITE_ONLY)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ if ((buffer->flags & CL_MEM_HOST_NO_ACCESS) &&
+ (flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY))) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((err = cl_get_device_info(ctx->device,
+ CL_DEVICE_IMAGE_MAX_BUFFER_SIZE,
+ sizeof(max_size),
+ &max_size,
+ NULL)) != CL_SUCCESS) {
+ goto error;
+ }
+
+ if (image_desc->image_width > max_size) {
+ err = CL_INVALID_IMAGE_DESCRIPTOR;
+ goto error;
+ }
+
+ if (image_desc->image_width*bpp > buffer->size) {
+ err = CL_INVALID_IMAGE_DESCRIPTOR;
+ goto error;
+ }
+
+ merged_flags = buffer->flags;
+ if (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY)) {
+ merged_flags &= ~(CL_MEM_READ_WRITE|CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY);
+ merged_flags |= flags & (CL_MEM_READ_WRITE|CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY);
+ }
+ if (flags & (CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS)) {
+ merged_flags &= ~(CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS);
+ merged_flags |= flags & (CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS);
+ }
+ struct _cl_mem_buffer *mem_buffer = (struct _cl_mem_buffer*)buffer;
+ if (buffer->type == CL_MEM_SUBBUFFER_TYPE) {
+ offset = ((struct _cl_mem_buffer *)buffer)->sub_offset;
+ mem_buffer = mem_buffer->parent;
+ }
+ /* Get the size of each pixel */
+ if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, &bpp)) != CL_SUCCESS))
+ goto error;
+
+ // Per bspec, a image should has a at least 2 line vertical alignment,
+ // thus we can't simply attach a buffer to a 1d image surface which has the same size.
+ // We have to create a new image, and copy the buffer data to this new image.
+ // And replace all the buffer object's reference to this image.
+ image = _cl_mem_new_image(ctx, flags, image_format, image_desc->image_type,
+ mem_buffer->base.size / bpp, 0, 0, 0, 0, NULL, errcode_ret);
+ if (image == NULL)
+ return NULL;
+ void *src = cl_mem_map(buffer);
+ void *dst = cl_mem_map(image);
+ //
+ // FIXME, we could use copy buffer to image to do this on GPU latter.
+ // currently the copy buffer to image function doesn't support 1D image.
+ //
+ // There is a potential risk that this buffer was mapped and the caller
+ // still hold the pointer and want to access it again. This scenario is
+ // not explicitly forbidden in the spec, although it should not be permitted.
+ memcpy(dst, src, mem_buffer->base.size);
+ cl_mem_unmap(buffer);
+ cl_mem_unmap(image);
+
+ if (err != 0)
+ goto error;
+
+ // Now replace buffer's bo to this new bo, need to take care of sub buffer
+ // case.
+ cl_mem_replace_buffer(buffer, image->bo);
+ /* Now point to the right offset if buffer is a SUB_BUFFER. */
+ if (buffer->flags & CL_MEM_USE_HOST_PTR)
+ image->host_ptr = buffer->host_ptr + offset;
+ cl_mem_image(image)->offset = offset;
+ cl_mem_image(image)->w = image_desc->image_width;
+ cl_mem_add_ref(buffer);
+ cl_mem_image(image)->buffer_1d = buffer;
+ return image;
+
+error:
+ if (image)
+ cl_mem_delete(image);
+ image = NULL;
+ *errcode_ret = err;
+ return image;
+}
+
+LOCAL cl_mem
+cl_mem_new_image(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format *image_format,
+ const cl_image_desc *image_desc,
+ void *host_ptr,
+ cl_int *errcode_ret)
+{
+ switch (image_desc->image_type) {
+ case CL_MEM_OBJECT_IMAGE1D:
+ case CL_MEM_OBJECT_IMAGE2D:
+ case CL_MEM_OBJECT_IMAGE3D:
+ return _cl_mem_new_image(context, flags, image_format, image_desc->image_type,
+ image_desc->image_width, image_desc->image_height, image_desc->image_depth,
+ image_desc->image_row_pitch, image_desc->image_slice_pitch,
+ host_ptr, errcode_ret);
+ case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+ case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+ return _cl_mem_new_image(context, flags, image_format, image_desc->image_type,
+ image_desc->image_width, image_desc->image_height, image_desc->image_array_size,
+ image_desc->image_row_pitch, image_desc->image_slice_pitch,
+ host_ptr, errcode_ret);
+ case CL_MEM_OBJECT_IMAGE1D_BUFFER:
+ return _cl_mem_new_image_from_buffer(context, flags, image_format,
+ image_desc, errcode_ret);
+ break;
+ case CL_MEM_OBJECT_BUFFER:
+ default:
+ assert(0);
+ }
+ return NULL;
+}
+
+LOCAL void
+cl_mem_delete(cl_mem mem)
+{
+ cl_int i;
+ if (UNLIKELY(mem == NULL))
+ return;
+ if (atomic_dec(&mem->ref_n) > 1)
+ return;
+#ifdef HAS_EGL
+ if (UNLIKELY(IS_GL_IMAGE(mem))) {
+ cl_mem_gl_delete(cl_mem_gl_image(mem));
+ }
+#endif
+
+ /* iff we are a image, delete the 1d buffer if has. */
+ if (IS_IMAGE(mem)) {
+ if (cl_mem_image(mem)->buffer_1d) {
+ assert(cl_mem_image(mem)->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER);
+ cl_mem_delete(cl_mem_image(mem)->buffer_1d);
+ cl_mem_image(mem)->buffer_1d = NULL;
+ }
+ }
+
+ /* Remove it from the list */
+ assert(mem->ctx);
+ pthread_mutex_lock(&mem->ctx->buffer_lock);
+ if (mem->prev)
+ mem->prev->next = mem->next;
+ if (mem->next)
+ mem->next->prev = mem->prev;
+ if (mem->ctx->buffers == mem)
+ mem->ctx->buffers = mem->next;
+ pthread_mutex_unlock(&mem->ctx->buffer_lock);
+ cl_context_delete(mem->ctx);
+
+ /* Someone still mapped, unmap */
+ if(mem->map_ref > 0) {
+ assert(mem->mapped_ptr);
+ for(i=0; i<mem->mapped_ptr_sz; i++) {
+ if(mem->mapped_ptr[i].ptr != NULL) {
+ mem->map_ref--;
+ cl_mem_unmap_auto(mem);
+ }
+ }
+ assert(mem->map_ref == 0);
+ }
+
+ if (mem->mapped_ptr)
+ free(mem->mapped_ptr);
+
+ if (mem->dstr_cb) {
+ cl_mem_dstr_cb *cb = mem->dstr_cb;
+ while (mem->dstr_cb) {
+ cb = mem->dstr_cb;
+ cb->pfn_notify(mem, cb->user_data);
+ mem->dstr_cb = cb->next;
+ free(cb);
+ }
+ }
+
+ /* Iff we are sub, do nothing for bo release. */
+ if (mem->type == CL_MEM_SUBBUFFER_TYPE) {
+ struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+ /* Remove it from the parent's list */
+ assert(buffer->parent);
+ pthread_mutex_lock(&buffer->parent->sub_lock);
+ if (buffer->sub_prev)
+ buffer->sub_prev->sub_next = buffer->sub_next;
+ if (buffer->sub_next)
+ buffer->sub_next->sub_prev = buffer->sub_prev;
+ if (buffer->parent->subs == buffer)
+ buffer->parent->subs = buffer->sub_next;
+ pthread_mutex_unlock(&buffer->parent->sub_lock);
+ cl_mem_delete((cl_mem )(buffer->parent));
+ } else if (LIKELY(mem->bo != NULL)) {
+ cl_buffer_unreference(mem->bo);
+ }
+
+ cl_free(mem);
+}
+
+LOCAL void
+cl_mem_add_ref(cl_mem mem)
+{
+ assert(mem);
+ atomic_inc(&mem->ref_n);
+}
+
+#define LOCAL_SZ_0 16
+#define LOCAL_SZ_1 4
+#define LOCAL_SZ_2 4
+
+LOCAL cl_int
+cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+ size_t src_offset, size_t dst_offset, size_t cb)
+{
+ cl_int ret = CL_SUCCESS;
+ cl_kernel ker = NULL;
+ size_t global_off[] = {0,0,0};
+ size_t global_sz[] = {1,1,1};
+ size_t local_sz[] = {1,1,1};
+ const unsigned int masks[4] = {0xffffffff, 0x0ff, 0x0ffff, 0x0ffffff};
+ int aligned = 0;
+ int dw_src_offset = src_offset/4;
+ int dw_dst_offset = dst_offset/4;
+
+ if (!cb)
+ return ret;
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(src_buf->ctx == dst_buf->ctx);
+
+ /* All 16 bytes aligned, fast and easy one. */
+ if((cb % 16 == 0) && (src_offset % 16 == 0) && (dst_offset % 16 == 0)) {
+ extern char cl_internal_copy_buf_align16_str[];
+ extern size_t cl_internal_copy_buf_align16_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN16,
+ cl_internal_copy_buf_align16_str, (size_t)cl_internal_copy_buf_align16_str_size, NULL);
+ cb = cb/16;
+ aligned = 1;
+ } else if ((cb % 4 == 0) && (src_offset % 4 == 0) && (dst_offset % 4 == 0)) { /* all Dword aligned.*/
+ extern char cl_internal_copy_buf_align4_str[];
+ extern size_t cl_internal_copy_buf_align4_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN4,
+ cl_internal_copy_buf_align4_str, (size_t)cl_internal_copy_buf_align4_str_size, NULL);
+ cb = cb/4;
+ aligned = 1;
+ }
+
+ if (aligned) {
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ if (cb < LOCAL_SZ_0) {
+ local_sz[0] = 1;
+ } else {
+ local_sz[0] = LOCAL_SZ_0;
+ }
+ global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+ cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ return ret;
+ }
+
+ /* Now handle the unaligned cases. */
+ int dw_num = ((dst_offset % 4 + cb) + 3) / 4;
+ unsigned int first_mask = dst_offset % 4 == 0 ? 0x0 : masks[dst_offset % 4];
+ unsigned int last_mask = masks[(dst_offset + cb) % 4];
+ /* handle the very small range copy. */
+ if (cb < 4 && dw_num == 1) {
+ first_mask = first_mask | ~last_mask;
+ }
+
+ if (cb < LOCAL_SZ_0) {
+ local_sz[0] = 1;
+ } else {
+ local_sz[0] = LOCAL_SZ_0;
+ }
+ global_sz[0] = ((dw_num + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
+
+ if (src_offset % 4 == dst_offset % 4) {
+ /* Src and dst has the same unaligned offset, just handle the
+ header and tail. */
+ extern char cl_internal_copy_buf_unalign_same_offset_str[];
+ extern size_t cl_internal_copy_buf_unalign_same_offset_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET,
+ cl_internal_copy_buf_unalign_same_offset_str,
+ (size_t)cl_internal_copy_buf_unalign_same_offset_str_size, NULL);
+
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+ cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
+ cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
+ cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ return ret;
+ }
+
+ /* Dst's offset < Src's offset, so one dst dword need two sequential src dwords to fill it. */
+ if (dst_offset % 4 < src_offset % 4) {
+ extern char cl_internal_copy_buf_unalign_dst_offset_str[];
+ extern size_t cl_internal_copy_buf_unalign_dst_offset_str_size;
+
+ int align_diff = src_offset % 4 - dst_offset % 4;
+ unsigned int dw_mask = masks[align_diff];
+ int shift = align_diff * 8;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
+ cl_internal_copy_buf_unalign_dst_offset_str,
+ (size_t)cl_internal_copy_buf_unalign_dst_offset_str_size, NULL);
+
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+ cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
+ cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
+ cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
+ cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
+ cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ return ret;
+ }
+
+ /* Dst's offset > Src's offset, so one dst dword need two sequential src - and src to fill it. */
+ if (dst_offset % 4 > src_offset % 4) {
+ extern char cl_internal_copy_buf_unalign_src_offset_str[];
+ extern size_t cl_internal_copy_buf_unalign_src_offset_str_size;
+
+ int align_diff = dst_offset % 4 - src_offset % 4;
+ unsigned int dw_mask = masks[4 - align_diff];
+ int shift = align_diff * 8;
+ int src_less = !(src_offset % 4) && !((src_offset + cb) % 4);
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
+ cl_internal_copy_buf_unalign_src_offset_str,
+ (size_t)cl_internal_copy_buf_unalign_src_offset_str_size, NULL);
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+ cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
+ cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
+ cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
+ cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
+ cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
+ cl_kernel_set_arg(ker, 9, sizeof(int), &src_less);
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ return ret;
+ }
+
+ /* no case can hanldle? */
+ assert(0);
+
+ return ret;
+}
+
+LOCAL cl_int
+cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image* src_image,
+ const size_t * origin, const size_t * region)
+{
+ cl_int ret = CL_SUCCESS;
+ cl_kernel ker = NULL;
+ size_t global_off[] = {0,0,0};
+ size_t global_sz[] = {1,1,1};
+ size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+
+ if(region[1] == 1) local_sz[1] = 1;
+ if(region[2] == 1) local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+ if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+ extern char cl_internal_fill_image_1d_str[];
+ extern size_t cl_internal_fill_image_1d_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_1D,
+ cl_internal_fill_image_1d_str, (size_t)cl_internal_fill_image_1d_str_size, NULL);
+ }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+ extern char cl_internal_fill_image_1d_array_str[];
+ extern size_t cl_internal_fill_image_1d_array_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_1D_ARRAY,
+ cl_internal_fill_image_1d_array_str, (size_t)cl_internal_fill_image_1d_array_str_size, NULL);
+ }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ extern char cl_internal_fill_image_2d_str[];
+ extern size_t cl_internal_fill_image_2d_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_2D,
+ cl_internal_fill_image_2d_str, (size_t)cl_internal_fill_image_2d_str_size, NULL);
+ }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+ extern char cl_internal_fill_image_2d_array_str[];
+ extern size_t cl_internal_fill_image_2d_array_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_2D_ARRAY,
+ cl_internal_fill_image_2d_array_str, (size_t)cl_internal_fill_image_2d_array_str_size, NULL);
+ }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ extern char cl_internal_fill_image_3d_str[];
+ extern size_t cl_internal_fill_image_3d_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_3D,
+ cl_internal_fill_image_3d_str, (size_t)cl_internal_fill_image_3d_str_size, NULL);
+ }else{
+ return CL_IMAGE_FORMAT_NOT_SUPPORTED;
+ }
+
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
+ cl_kernel_set_arg(ker, 1, sizeof(float)*4, pattern);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion[0]);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin[0]);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &origin[1]);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]);
+
+ ret = cl_command_queue_ND_range(queue, ker, 3, global_off, global_sz, local_sz);
+ return ret;
+}
+
+LOCAL cl_int
+cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+ cl_mem buffer, size_t offset, size_t size)
+{
+ cl_int ret = CL_SUCCESS;
+ cl_kernel ker = NULL;
+ size_t global_off[] = {0,0,0};
+ size_t global_sz[] = {1,1,1};
+ size_t local_sz[] = {1,1,1};
+ char pattern_comb[4];
+ int is_128 = 0;
+ const void * pattern1 = NULL;
+
+ assert(offset % pattern_size == 0);
+ assert(size % pattern_size == 0);
+
+ if (!size)
+ return ret;
+
+ if (pattern_size == 128) {
+ /* 128 is according to pattern of double16, but double works not very
+ well on some platform. We use two float16 to handle this. */
+ extern char cl_internal_fill_buf_align128_str[];
+ extern size_t cl_internal_fill_buf_align128_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN128,
+ cl_internal_fill_buf_align128_str, (size_t)cl_internal_fill_buf_align128_str_size, NULL);
+ is_128 = 1;
+ pattern_size = pattern_size / 2;
+ pattern1 = pattern + pattern_size;
+ size = size / 2;
+ } else if (pattern_size % 8 == 0) { /* Handle the 8 16 32 64 cases here. */
+ extern char cl_internal_fill_buf_align8_str[];
+ extern size_t cl_internal_fill_buf_align8_str_size;
+ int order = ffs(pattern_size / 8) - 1;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order,
+ cl_internal_fill_buf_align8_str, (size_t)cl_internal_fill_buf_align8_str_size, NULL);
+ } else if (pattern_size == 4) {
+ extern char cl_internal_fill_buf_align4_str[];
+ extern size_t cl_internal_fill_buf_align4_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
+ cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
+ } else if (size >= 4 && size % 4 == 0 && offset % 4 == 0) {
+ /* The unaligned case. But if copy size and offset are aligned to 4, we can fake
+ the pattern with the pattern duplication fill in. */
+ assert(pattern_size == 1 || pattern_size == 2);
+ extern char cl_internal_fill_buf_align4_str[];
+ extern size_t cl_internal_fill_buf_align4_str_size;
+
+ if (pattern_size == 2) {
+ memcpy(pattern_comb, pattern, sizeof(char)*2);
+ memcpy(pattern_comb + 2, pattern, sizeof(char)*2);
+ } else {
+ pattern_comb[0] = pattern_comb[1] = pattern_comb[2]
+ = pattern_comb[3] = *(char *)pattern;
+ }
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
+ cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
+ pattern_size = 4;
+ pattern = pattern_comb;
+ }
+ //TODO: Unaligned cases, we may need to optimize it as cl_mem_copy, using mask in kernel
+ //functions. This depend on the usage but now we just use aligned 1 and 2.
+ else if (pattern_size == 2) {
+ extern char cl_internal_fill_buf_align2_str[];
+ extern size_t cl_internal_fill_buf_align2_str_size;
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN2,
+ cl_internal_fill_buf_align2_str, (size_t)cl_internal_fill_buf_align2_str_size, NULL);
+ } else if (pattern_size == 1) {
+ extern char cl_internal_fill_buf_unalign_str[];
+ extern size_t cl_internal_fill_buf_unalign_str_size;
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_UNALIGN,
+ cl_internal_fill_buf_unalign_str, (size_t)cl_internal_fill_buf_unalign_str_size, NULL);
+ } else
+ assert(0);
+
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ size = size / pattern_size;
+ offset = offset / pattern_size;
+
+ if (size < LOCAL_SZ_0) {
+ local_sz[0] = 1;
+ } else {
+ local_sz[0] = LOCAL_SZ_0;
+ }
+ global_sz[0] = ((size + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0;
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &buffer);
+ cl_kernel_set_arg(ker, 1, pattern_size, pattern);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_uint), &offset);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_uint), &size);
+ if (is_128)
+ cl_kernel_set_arg(ker, 4, pattern_size, pattern1);
+
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ return ret;
+}
+
+LOCAL cl_int
+cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+ const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+ size_t src_row_pitch, size_t src_slice_pitch,
+ size_t dst_row_pitch, size_t dst_slice_pitch) {
+ cl_int ret;
+ cl_kernel ker;
+ size_t global_off[] = {0,0,0};
+ size_t global_sz[] = {1,1,1};
+ size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1};
+ if(region[1] == 1) local_sz[1] = 1;
+ if(region[2] == 1) local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+ cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
+ cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(src_buf->ctx == dst_buf->ctx);
+
+ /* setup the kernel and run. */
+ extern char cl_internal_copy_buf_rect_str[];
+ extern size_t cl_internal_copy_buf_rect_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT,
+ cl_internal_copy_buf_rect_str, (size_t)cl_internal_copy_buf_rect_str_size, NULL);
+
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion[0]);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_offset);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_row_pitch);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_slice_pitch);
+ cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
+ cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
+
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+ return ret;
+}
+
+LOCAL cl_int
+cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image, struct _cl_mem_image* dst_image,
+ const size_t *src_origin, const size_t *dst_origin, const size_t *region) {
+ cl_int ret;
+ cl_kernel ker = NULL;
+ size_t global_off[] = {0,0,0};
+ size_t global_sz[] = {1,1,1};
+ size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+ uint32_t fixupDataType;
+ uint32_t savedIntelFmt;
+
+ if(region[1] == 1) local_sz[1] = 1;
+ if(region[2] == 1) local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+ switch (src_image->fmt.image_channel_data_type) {
+ case CL_SNORM_INT8:
+ case CL_UNORM_INT8: fixupDataType = CL_UNSIGNED_INT8; break;
+ case CL_HALF_FLOAT:
+ case CL_SNORM_INT16:
+ case CL_UNORM_INT16: fixupDataType = CL_UNSIGNED_INT16; break;
+ case CL_FLOAT: fixupDataType = CL_UNSIGNED_INT32; break;
+ default:
+ fixupDataType = 0;
+ }
+
+ if (fixupDataType) {
+ cl_image_format fmt;
+ if (src_image->fmt.image_channel_order != CL_BGRA)
+ fmt.image_channel_order = src_image->fmt.image_channel_order;
+ else
+ fmt.image_channel_order = CL_RGBA;
+ fmt.image_channel_data_type = fixupDataType;
+ savedIntelFmt = src_image->intel_fmt;
+ src_image->intel_fmt = cl_image_get_intel_format(&fmt);
+ dst_image->intel_fmt = src_image->intel_fmt;
+ }
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(src_image->base.ctx == dst_image->base.ctx);
+
+ /* setup the kernel and run. */
+ if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+ if(dst_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+ extern char cl_internal_copy_image_1d_to_1d_str[];
+ extern size_t cl_internal_copy_image_1d_to_1d_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,
+ cl_internal_copy_image_1d_to_1d_str, (size_t)cl_internal_copy_image_1d_to_1d_str_size, NULL);
+ }
+ } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ extern char cl_internal_copy_image_2d_to_2d_str[];
+ extern size_t cl_internal_copy_image_2d_to_2d_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,
+ cl_internal_copy_image_2d_to_2d_str, (size_t)cl_internal_copy_image_2d_to_2d_str_size, NULL);
+ } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ extern char cl_internal_copy_image_2d_to_3d_str[];
+ extern size_t cl_internal_copy_image_2d_to_3d_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,
+ cl_internal_copy_image_2d_to_3d_str, (size_t)cl_internal_copy_image_2d_to_3d_str_size, NULL);
+ } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+
+ cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+ return CL_SUCCESS;
+ }
+ } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+ if(dst_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+
+ cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+ return CL_SUCCESS;
+ }
+ } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+ if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+
+ cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+ return CL_SUCCESS;
+ } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+ return CL_SUCCESS;
+ } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+ return CL_SUCCESS;
+ }
+ } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ extern char cl_internal_copy_image_3d_to_2d_str[];
+ extern size_t cl_internal_copy_image_3d_to_2d_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,
+ cl_internal_copy_image_3d_to_2d_str, (size_t)cl_internal_copy_image_3d_to_2d_str_size, NULL);
+ } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ extern char cl_internal_copy_image_3d_to_3d_str[];
+ extern size_t cl_internal_copy_image_3d_to_3d_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,
+ cl_internal_copy_image_3d_to_3d_str, (size_t)cl_internal_copy_image_3d_to_3d_str_size, NULL);
+ } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+ cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+ return CL_SUCCESS;
+ }
+ }
+
+ if (!ker) {
+ ret = CL_OUT_OF_RESOURCES;
+ goto fail;
+ }
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_image);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion[0]);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_origin[0]);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_origin[0]);
+ cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]);
+ cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]);
+
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+fail:
+ if (fixupDataType) {
+ src_image->intel_fmt = savedIntelFmt;
+ dst_image->intel_fmt = savedIntelFmt;
+ }
+ return ret;
+}
+
+LOCAL cl_int
+cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, cl_mem buffer,
+ const size_t *src_origin, const size_t dst_offset, const size_t *region) {
+ cl_int ret;
+ cl_kernel ker = NULL;
+ size_t global_off[] = {0,0,0};
+ size_t global_sz[] = {1,1,1};
+ size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+ uint32_t intel_fmt, bpp;
+ cl_image_format fmt;
+ size_t origin0, region0;
+
+ if(region[1] == 1) local_sz[1] = 1;
+ if(region[2] == 1) local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(image->base.ctx == buffer->ctx);
+
+ fmt.image_channel_order = CL_R;
+ fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+ intel_fmt = image->intel_fmt;
+ bpp = image->bpp;
+ image->intel_fmt = cl_image_get_intel_format(&fmt);
+ image->w = image->w * image->bpp;
+ image->bpp = 1;
+ region0 = region[0] * bpp;
+ origin0 = src_origin[0] * bpp;
+ global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+
+ /* setup the kernel and run. */
+ if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ extern char cl_internal_copy_image_2d_to_buffer_str[];
+ extern size_t cl_internal_copy_image_2d_to_buffer_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,
+ cl_internal_copy_image_2d_to_buffer_str, (size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL);
+ }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ extern char cl_internal_copy_image_3d_to_buffer_str[];
+ extern size_t cl_internal_copy_image_3d_to_buffer_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,
+ cl_internal_copy_image_3d_to_buffer_str, (size_t)cl_internal_copy_image_3d_to_buffer_str_size, NULL);
+ }
+
+ if (!ker) {
+ ret = CL_OUT_OF_RESOURCES;
+ goto fail;
+ }
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion0);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_offset);
+
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+fail:
+
+ image->intel_fmt = intel_fmt;
+ image->bpp = bpp;
+ image->w = image->w / bpp;
+
+ return ret;
+}
+
+
+LOCAL cl_int
+cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_mem_image* image,
+ const size_t src_offset, const size_t *dst_origin, const size_t *region) {
+ cl_int ret;
+ cl_kernel ker = NULL;
+ size_t global_off[] = {0,0,0};
+ size_t global_sz[] = {1,1,1};
+ size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+ uint32_t intel_fmt, bpp;
+ cl_image_format fmt;
+ size_t origin0, region0;
+
+ if(region[1] == 1) local_sz[1] = 1;
+ if(region[2] == 1) local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(image->base.ctx == buffer->ctx);
+
+ fmt.image_channel_order = CL_R;
+ fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+ intel_fmt = image->intel_fmt;
+ bpp = image->bpp;
+ image->intel_fmt = cl_image_get_intel_format(&fmt);
+ image->w = image->w * image->bpp;
+ image->bpp = 1;
+ region0 = region[0] * bpp;
+ origin0 = dst_origin[0] * bpp;
+ global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+
+ /* setup the kernel and run. */
+ if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ extern char cl_internal_copy_buffer_to_image_2d_str[];
+ extern size_t cl_internal_copy_buffer_to_image_2d_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
+ cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);
+ }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ extern char cl_internal_copy_buffer_to_image_3d_str[];
+ extern size_t cl_internal_copy_buffer_to_image_3d_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,
+ cl_internal_copy_buffer_to_image_3d_str, (size_t)cl_internal_copy_buffer_to_image_3d_str_size, NULL);
+ }
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion0);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_offset);
+
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+ image->intel_fmt = intel_fmt;
+ image->bpp = bpp;
+ image->w = image->w / bpp;
+
+ return ret;
+}
+
+
+LOCAL void*
+cl_mem_map(cl_mem mem)
+{
+ cl_buffer_map(mem->bo, 1);
+ assert(cl_buffer_get_virtual(mem->bo));
+ return cl_buffer_get_virtual(mem->bo);
+}
+
+LOCAL cl_int
+cl_mem_unmap(cl_mem mem)
+{
+ cl_buffer_unmap(mem->bo);
+ return CL_SUCCESS;
+}
+
+LOCAL void*
+cl_mem_map_gtt(cl_mem mem)
+{
+ cl_buffer_map_gtt(mem->bo);
+ assert(cl_buffer_get_virtual(mem->bo));
+ mem->mapped_gtt = 1;
+ return cl_buffer_get_virtual(mem->bo);
+}
+
+LOCAL void *
+cl_mem_map_gtt_unsync(cl_mem mem)
+{
+ cl_buffer_map_gtt_unsync(mem->bo);
+ assert(cl_buffer_get_virtual(mem->bo));
+ return cl_buffer_get_virtual(mem->bo);
+}
+
+LOCAL cl_int
+cl_mem_unmap_gtt(cl_mem mem)
+{
+ cl_buffer_unmap_gtt(mem->bo);
+ return CL_SUCCESS;
+}
+
+LOCAL void*
+cl_mem_map_auto(cl_mem mem)
+{
+ if (IS_IMAGE(mem) && cl_mem_image(mem)->tiling != CL_NO_TILE)
+ return cl_mem_map_gtt(mem);
+ else
+ return cl_mem_map(mem);
+}
+
+LOCAL cl_int
+cl_mem_unmap_auto(cl_mem mem)
+{
+ if (mem->mapped_gtt == 1) {
+ cl_buffer_unmap_gtt(mem->bo);
+ mem->mapped_gtt = 0;
+ }
+ else
+ cl_buffer_unmap(mem->bo);
+ return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_mem_pin(cl_mem mem)
+{
+ assert(mem);
+ if (UNLIKELY((mem->flags & CL_MEM_PINNABLE) == 0))
+ return CL_INVALID_MEM_OBJECT;
+ cl_buffer_pin(mem->bo, 4096);
+ return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_mem_unpin(cl_mem mem)
+{
+ assert(mem);
+ if (UNLIKELY((mem->flags & CL_MEM_PINNABLE) == 0))
+ return CL_INVALID_MEM_OBJECT;
+ cl_buffer_unpin(mem->bo);
+ return CL_SUCCESS;
+}
+
+LOCAL cl_mem cl_mem_new_libva_buffer(cl_context ctx,
+ unsigned int bo_name,
+ cl_int* errcode)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = NULL;
+
+ mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, &err);
+ if (mem == NULL || err != CL_SUCCESS)
+ goto error;
+
+ size_t sz = 0;
+ mem->bo = cl_buffer_get_buffer_from_libva(ctx, bo_name, &sz);
+ mem->size = sz;
+
+exit:
+ if (errcode)
+ *errcode = err;
+ return mem;
+
+error:
+ cl_mem_delete(mem);
+ mem = NULL;
+ goto exit;
+}
+
+LOCAL cl_mem cl_mem_new_libva_image(cl_context ctx,
+ unsigned int bo_name, size_t offset,
+ size_t width, size_t height,
+ cl_image_format fmt,
+ size_t row_pitch,
+ cl_int *errcode)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = NULL;
+ struct _cl_mem_image *image = NULL;
+ uint32_t intel_fmt, bpp;
+
+ /* Get the size of each pixel */
+ if (UNLIKELY((err = cl_image_byte_per_pixel(&fmt, &bpp)) != CL_SUCCESS))
+ goto error;
+
+ intel_fmt = cl_image_get_intel_format(&fmt);
+ if (intel_fmt == INTEL_UNSUPPORTED_FORMAT) {
+ err = CL_IMAGE_FORMAT_NOT_SUPPORTED;
+ goto error;
+ }
+
+ mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, &err);
+ if (mem == NULL || err != CL_SUCCESS) {
+ err = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+
+ image = cl_mem_image(mem);
+
+ mem->bo = cl_buffer_get_image_from_libva(ctx, bo_name, image, offset);
+
+ image->w = width;
+ image->h = height;
+ image->image_type = CL_MEM_OBJECT_IMAGE2D;
+ image->depth = 2;
+ image->fmt = fmt;
+ image->intel_fmt = intel_fmt;
+ image->bpp = bpp;
+ image->row_pitch = row_pitch;
+ image->slice_pitch = 0;
+ // NOTE: tiling of image is set in cl_buffer_get_image_from_libva().
+ image->tile_x = 0;
+ image->tile_y = 0;
+ image->offset = offset;
+
+exit:
+ if (errcode)
+ *errcode = err;
+ return mem;
+
+error:
+ cl_mem_delete(mem);
+ mem = NULL;
+ goto exit;
+}
+
+LOCAL cl_int
+cl_mem_get_fd(cl_mem mem,
+ int* fd)
+{
+ cl_int err = CL_SUCCESS;
+ if(cl_buffer_get_fd(mem->bo, fd))
+ err = CL_INVALID_OPERATION;
+ return err;
+}
diff --git a/src/cl_mem.h b/src/cl_mem.h
new file mode 100644
index 0000000..3174c5c
--- /dev/null
+++ b/src/cl_mem.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_MEM_H__
+#define __CL_MEM_H__
+
+#include "cl_internals.h"
+#include "cl_driver_type.h"
+#include "CL/cl.h"
+#include "cl_khr_icd.h"
+#include <assert.h>
+
+#ifndef CL_VERSION_1_2
+#define CL_MEM_OBJECT_IMAGE1D 0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3
+typedef struct _cl_image_desc {
+ cl_mem_object_type image_type;
+ size_t image_width;
+ size_t image_height;
+ size_t image_depth;
+ size_t image_array_size;
+ size_t image_row_pitch;
+ size_t image_slice_pitch;
+ cl_uint num_mip_levels;
+ cl_uint num_samples;
+ cl_mem buffer;
+} cl_image_desc;
+#endif
+
+typedef enum cl_image_tiling {
+ CL_NO_TILE = 0,
+ CL_TILE_X = 1,
+ CL_TILE_Y = 2
+} cl_image_tiling_t;
+
+typedef struct _cl_mapped_ptr {
+ void * ptr;
+ void * v_ptr;
+ size_t size;
+ size_t origin[3]; /* mapped origin */
+ size_t region[3]; /* mapped region */
+}cl_mapped_ptr;
+
+typedef struct _cl_mem_dstr_cb {
+ struct _cl_mem_dstr_cb * next;
+ void (CL_CALLBACK *pfn_notify)(cl_mem memobj, void *user_data);
+ void *user_data;
+}cl_mem_dstr_cb;
+
+/* Used for buffers and images */
+enum cl_mem_type {
+ CL_MEM_BUFFER_TYPE,
+ CL_MEM_SUBBUFFER_TYPE,
+ CL_MEM_IMAGE_TYPE,
+ CL_MEM_GL_IMAGE_TYPE,
+};
+#define IS_IMAGE(mem) (mem->type >= CL_MEM_IMAGE_TYPE)
+#define IS_GL_IMAGE(mem) (mem->type == CL_MEM_GL_IMAGE_TYPE)
+
+typedef struct _cl_mem {
+ DEFINE_ICD(dispatch)
+ uint64_t magic; /* To identify it as a memory object */
+ cl_mem prev, next; /* We chain the memory buffers together */
+ enum cl_mem_type type;
+ volatile int ref_n; /* This object is reference counted */
+ cl_buffer bo; /* Data in GPU memory */
+ size_t size; /* original request size, not alignment size, used in constant buffer */
+ cl_context ctx; /* Context it belongs to */
+ cl_mem_flags flags; /* Flags specified at the creation time */
+ void * host_ptr; /* Pointer of the host mem specified by CL_MEM_ALLOC_HOST_PTR */
+ cl_mapped_ptr* mapped_ptr;/* Store the mapped addresses and size by caller. */
+ int mapped_ptr_sz; /* The array size of mapped_ptr. */
+ int map_ref; /* The mapped count. */
+ uint8_t mapped_gtt; /* This object has mapped gtt, for unmap. */
+ cl_mem_dstr_cb *dstr_cb; /* The destroy callback. */
+} _cl_mem;
+
+struct _cl_mem_image {
+ _cl_mem base;
+ cl_image_format fmt; /* only for images */
+ uint32_t intel_fmt; /* format to provide in the surface state */
+ uint32_t bpp; /* number of bytes per pixel */
+ cl_mem_object_type image_type; /* only for images 1D/2D...*/
+ size_t w, h, depth; /* only for images (depth is only for 3D images) */
+ size_t row_pitch, slice_pitch;
+ size_t host_row_pitch, host_slice_pitch;
+ cl_image_tiling_t tiling; /* only IVB+ supports TILE_[X,Y] (image only) */
+ size_t tile_x, tile_y; /* tile offset, used for mipmap images. */
+ size_t offset; /* offset for dri_bo, used when it's reloc. */
+ cl_mem buffer_1d; /* if the image is created from buffer, it point to the buffer.*/
+};
+
+struct _cl_mem_gl_image {
+ struct _cl_mem_image base;
+ uint32_t target;
+ int miplevel;
+ uint32_t texture;
+};
+
+inline static void
+cl_mem_image_init(struct _cl_mem_image *image, size_t w, size_t h,
+ cl_mem_object_type image_type,
+ size_t depth, cl_image_format fmt,
+ uint32_t intel_fmt, uint32_t bpp,
+ size_t row_pitch, size_t slice_pitch,
+ cl_image_tiling_t tiling,
+ size_t tile_x, size_t tile_y,
+ size_t offset)
+{
+ image->w = w;
+ image->h = h;
+ image->image_type = image_type;
+ image->depth = depth;
+ image->fmt = fmt;
+ image->intel_fmt = intel_fmt;
+ image->bpp = bpp;
+ image->row_pitch = row_pitch;
+ image->slice_pitch = slice_pitch;
+ image->tiling = tiling;
+ image->tile_x = tile_x;
+ image->tile_y = tile_y;
+ image->offset = offset;
+}
+
+struct _cl_mem_buffer {
+ _cl_mem base;
+ struct _cl_mem_buffer* subs; /* Sub buf objects. */
+ size_t sub_offset; /* The sub start offset. */
+ struct _cl_mem_buffer* sub_prev, *sub_next;/* We chain the sub memory buffers together */
+ pthread_mutex_t sub_lock; /* Sub buffers list lock*/
+ struct _cl_mem_buffer* parent; /* Point to the parent buffer if is sub-buffer */
+};
+
+inline static struct _cl_mem_image *
+cl_mem_image(cl_mem mem)
+{
+ assert(IS_IMAGE(mem));
+ return (struct _cl_mem_image *)mem;
+}
+
+inline static struct _cl_mem_gl_image *
+cl_mem_gl_image(cl_mem mem)
+{
+ assert(IS_GL_IMAGE(mem));
+ return (struct _cl_mem_gl_image*)mem;
+}
+
+inline static struct _cl_mem_buffer *
+cl_mem_buffer(cl_mem mem)
+{
+ assert(!IS_IMAGE(mem));
+ return (struct _cl_mem_buffer *)mem;
+}
+
+/* Query information about a memory object */
+extern cl_int cl_get_mem_object_info(cl_mem, cl_mem_info, size_t, void *, size_t *);
+
+/* Query information about an image */
+extern cl_int cl_get_image_info(cl_mem, cl_image_info, size_t, void *, size_t *);
+
+/* Query whether mem is in buffers */
+extern cl_int is_valid_mem(cl_mem mem, cl_mem buffers);
+
+/* Create a new memory object and initialize it with possible user data */
+extern cl_mem cl_mem_new_buffer(cl_context, cl_mem_flags, size_t, void*, cl_int*);
+
+/* Create a new sub memory object */
+extern cl_mem cl_mem_new_sub_buffer(cl_mem, cl_mem_flags, cl_buffer_create_type, const void *, cl_int *);
+
+/* Idem but this is an image */
+extern cl_mem
+cl_mem_new_image(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format *image_format,
+ const cl_image_desc *image_desc,
+ void *host_ptr,
+ cl_int *errcode_ret);
+
+/* Unref the object and delete it if no more reference */
+extern void cl_mem_delete(cl_mem);
+
+/* Destroy egl image. */
+extern void cl_mem_gl_delete(struct _cl_mem_gl_image *);
+
+/* Add one more reference to this object */
+extern void cl_mem_add_ref(cl_mem);
+
+/* api clEnqueueCopyBuffer help function */
+extern cl_int cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+ size_t src_offset, size_t dst_offset, size_t cb);
+
+extern cl_int cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+ cl_mem buffer, size_t offset, size_t size);
+
+extern cl_int cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image*,
+ const size_t *, const size_t *);
+
+/* api clEnqueueCopyBufferRect help function */
+extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
+ const size_t *, const size_t *, const size_t *,
+ size_t, size_t, size_t, size_t);
+
+/* api clEnqueueCopyImage help function */
+extern cl_int cl_mem_kernel_copy_image(cl_command_queue, struct _cl_mem_image*, struct _cl_mem_image*,
+ const size_t *, const size_t *, const size_t *);
+
+/* api clEnqueueCopyImageToBuffer help function */
+extern cl_int cl_mem_copy_image_to_buffer(cl_command_queue, struct _cl_mem_image*, cl_mem,
+ const size_t *, const size_t, const size_t *);
+
+/* api clEnqueueCopyBufferToImage help function */
+extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_mem, struct _cl_mem_image*,
+ const size_t, const size_t *, const size_t *);
+
+/* Directly map a memory object */
+extern void *cl_mem_map(cl_mem);
+
+/* Unmap a memory object */
+extern cl_int cl_mem_unmap(cl_mem);
+
+/* Directly map a memory object in GTT mode */
+extern void *cl_mem_map_gtt(cl_mem);
+
+/* Directly map a memory object in GTT mode, with out waiting gpu idle */
+extern void *cl_mem_map_gtt_unsync(cl_mem);
+
+/* Unmap a memory object in GTT mode */
+extern cl_int cl_mem_unmap_gtt(cl_mem);
+
+/* Directly map a memory object - tiled images are mapped in GTT mode */
+extern void *cl_mem_map_auto(cl_mem);
+
+/* Unmap a memory object - tiled images are unmapped in GTT mode */
+extern cl_int cl_mem_unmap_auto(cl_mem);
+
+/* Pin/unpin the buffer in memory (you must be root) */
+extern cl_int cl_mem_pin(cl_mem);
+extern cl_int cl_mem_unpin(cl_mem);
+
+extern cl_mem
+cl_mem_allocate(enum cl_mem_type type,
+ cl_context ctx,
+ cl_mem_flags flags,
+ size_t sz,
+ cl_int is_tiled,
+ cl_int *errcode);
+
+void
+cl_mem_copy_image_region(const size_t *origin, const size_t *region,
+ void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
+ const void *src, size_t src_row_pitch, size_t src_slice_pitch,
+ const struct _cl_mem_image *image, cl_bool offset_dst, cl_bool offset_src);
+
+void
+cl_mem_copy_image_to_image(const size_t *dst_origin,const size_t *src_origin, const size_t *region,
+ const struct _cl_mem_image *dst_image, const struct _cl_mem_image *src_image);
+
+extern cl_mem cl_mem_new_libva_buffer(cl_context ctx,
+ unsigned int bo_name,
+ cl_int *errcode);
+
+extern cl_mem cl_mem_new_libva_image(cl_context ctx,
+ unsigned int bo_name, size_t offset,
+ size_t width, size_t height,
+ cl_image_format fmt,
+ size_t row_pitch,
+ cl_int *errcode);
+extern cl_int cl_mem_get_fd(cl_mem mem, int* fd);
+
+
+#endif /* __CL_MEM_H__ */
+
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
new file mode 100644
index 0000000..28d2ac6
--- /dev/null
+++ b/src/cl_mem_gl.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Zhigang Gong <zhigang.gong at intel.com>
+ */
+#include <GL/gl.h>
+#include <GL/glext.h>
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include "cl_mem.h"
+#include "cl_image.h"
+#include "cl_context.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_device_id.h"
+#include "cl_driver.h"
+#include "cl_platform_id.h"
+#include "cl_mem_gl.h"
+
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+#include "CL/cl_gl.h"
+
+
+LOCAL cl_mem
+cl_mem_new_gl_buffer(cl_context ctx,
+ cl_mem_flags flags,
+ GLuint buf_obj,
+ cl_int *errcode_ret)
+{
+ NOT_IMPLEMENTED;
+}
+
+LOCAL cl_mem
+cl_mem_new_gl_texture(cl_context ctx,
+ cl_mem_flags flags,
+ GLenum texture_target,
+ GLint miplevel,
+ GLuint texture,
+ cl_int *errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = NULL;
+ /* Check flags consistency */
+ if (UNLIKELY(flags & CL_MEM_COPY_HOST_PTR)) {
+ err = CL_INVALID_ARG_VALUE;
+ goto error;
+ }
+
+ mem = cl_mem_allocate(CL_MEM_GL_IMAGE_TYPE, ctx, flags, 0, 0, &err);
+ if (mem == NULL || err != CL_SUCCESS)
+ goto error;
+
+ mem->bo = cl_buffer_alloc_from_texture(ctx, texture_target, miplevel,
+ texture, cl_mem_image(mem));
+ if (UNLIKELY(mem->bo == NULL)) {
+ err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+ goto error;
+ }
+
+ cl_mem_gl_image(mem)->target = texture_target;
+ cl_mem_gl_image(mem)->miplevel = miplevel;
+ cl_mem_gl_image(mem)->texture = texture;
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return mem;
+error:
+ cl_mem_delete(mem);
+ mem = NULL;
+ goto exit;
+
+}
+
+LOCAL void cl_mem_gl_delete(struct _cl_mem_gl_image *gl_image)
+{
+ if (gl_image->base.base.bo != NULL)
+ cl_buffer_release_from_texture(gl_image->base.base.ctx, gl_image->target,
+ gl_image->miplevel, gl_image->texture);
+}
diff --git a/src/cl_mem_gl.h b/src/cl_mem_gl.h
new file mode 100644
index 0000000..717ccfb
--- /dev/null
+++ b/src/cl_mem_gl.h
@@ -0,0 +1,17 @@
+#ifndef __CL_MEM_GL_H__
+#define __CL_MEM_GL_H__
+#include "cl_mem.h"
+
+cl_mem cl_mem_new_gl_buffer(cl_context ctx,
+ cl_mem_flags flags,
+ GLuint buf_obj,
+ cl_int *errcode_ret);
+
+cl_mem cl_mem_new_gl_texture(cl_context ctx,
+ cl_mem_flags flags,
+ GLenum texture_target,
+ GLint miplevel,
+ GLuint texture,
+ cl_int *errcode_ret);
+
+#endif
diff --git a/src/cl_platform_id.c b/src/cl_platform_id.c
new file mode 100644
index 0000000..e7c8d6a
--- /dev/null
+++ b/src/cl_platform_id.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_platform_id.h"
+#include "cl_internals.h"
+#include "cl_utils.h"
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#define DECL_INFO_STRING(FIELD, STRING) \
+ .FIELD = STRING, \
+ .JOIN(FIELD,_sz) = sizeof(STRING),
+
+static struct _cl_platform_id intel_platform_data = {
+ INIT_ICD(dispatch)
+ DECL_INFO_STRING(profile, "FULL_PROFILE")
+ DECL_INFO_STRING(version, LIBCL_VERSION_STRING)
+ DECL_INFO_STRING(name, "Intel Gen OCL Driver")
+ DECL_INFO_STRING(vendor, "Intel")
+ DECL_INFO_STRING(icd_suffix_khr, "Intel")
+};
+
+#undef DECL_INFO_STRING
+
+/* Intel platform (only GPU now) */
+cl_platform_id const intel_platform = &intel_platform_data;
+
+LOCAL cl_int
+cl_get_platform_ids(cl_uint num_entries,
+ cl_platform_id * platforms,
+ cl_uint * num_platforms)
+{
+ if (num_platforms != NULL)
+ *num_platforms = 1;
+
+ cl_intel_platform_extension_init(intel_platform);
+ /* Easy right now, only one platform is supported */
+ if(platforms)
+ *platforms = intel_platform;
+ intel_platform->extensions_sz = strlen(intel_platform->extensions) + 1;
+ return CL_SUCCESS;
+}
+
+#define DECL_FIELD(CASE,FIELD) \
+ case JOIN(CL_,CASE): \
+ if (param_value_size < intel_platform->JOIN(FIELD,_sz)) \
+ return CL_INVALID_VALUE; \
+ if (param_value_size_ret != NULL) \
+ *param_value_size_ret = intel_platform->JOIN(FIELD,_sz); \
+ memcpy(param_value, \
+ intel_platform->FIELD, \
+ intel_platform->JOIN(FIELD,_sz)); \
+ return CL_SUCCESS;
+
+#define GET_FIELD_SZ(CASE,FIELD) \
+ case JOIN(CL_,CASE): \
+ if (param_value_size_ret != NULL) \
+ *param_value_size_ret = intel_platform->JOIN(FIELD,_sz); \
+ return CL_SUCCESS;
+
+LOCAL cl_int
+cl_get_platform_info(cl_platform_id platform,
+ cl_platform_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ if (param_value == NULL) {
+ switch (param_name) {
+ GET_FIELD_SZ (PLATFORM_PROFILE, profile);
+ GET_FIELD_SZ (PLATFORM_VERSION, version);
+ GET_FIELD_SZ (PLATFORM_NAME, name);
+ GET_FIELD_SZ (PLATFORM_VENDOR, vendor);
+ GET_FIELD_SZ (PLATFORM_EXTENSIONS, extensions);
+ GET_FIELD_SZ (PLATFORM_ICD_SUFFIX_KHR, icd_suffix_khr);
+ default: return CL_INVALID_VALUE;
+ }
+ }
+
+ /* Fetch the platform inform */
+ switch (param_name) {
+ DECL_FIELD (PLATFORM_PROFILE, profile);
+ DECL_FIELD (PLATFORM_VERSION, version);
+ DECL_FIELD (PLATFORM_NAME, name);
+ DECL_FIELD (PLATFORM_VENDOR, vendor);
+ DECL_FIELD (PLATFORM_EXTENSIONS, extensions);
+ DECL_FIELD (PLATFORM_ICD_SUFFIX_KHR, icd_suffix_khr);
+ default: return CL_INVALID_VALUE;
+ }
+}
+
+#undef DECL_FIELD
+
diff --git a/src/cl_platform_id.h b/src/cl_platform_id.h
new file mode 100644
index 0000000..c7c716e
--- /dev/null
+++ b/src/cl_platform_id.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_PLATFORM_ID_H__
+#define __CL_PLATFORM_ID_H__
+
+#include "cl_internals.h"
+#include "cl_extensions.h"
+#include "cl_khr_icd.h"
+#include "CL/cl.h"
+
+#include "src/OCLConfig.h"
+
+struct _cl_platform_id {
+ DEFINE_ICD(dispatch)
+ const char *profile;
+ const char *version;
+ const char *name;
+ const char *vendor;
+ char *extensions;
+ const char *icd_suffix_khr;
+ size_t profile_sz;
+ size_t version_sz;
+ size_t name_sz;
+ size_t vendor_sz;
+ size_t extensions_sz;
+ size_t icd_suffix_khr_sz;
+ struct cl_extensions *internal_extensions;
+};
+
+/* Platform implemented by this run-time */
+extern cl_platform_id const intel_platform;
+
+/* Return the valid platform */
+extern cl_int cl_get_platform_ids(cl_uint num_entries,
+ cl_platform_id * platforms,
+ cl_uint * num_platforms);
+
+/* Return information for the current platform */
+extern cl_int cl_get_platform_info(cl_platform_id platform,
+ cl_platform_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret);
+
+#define _STR(x) #x
+#define _JOINT(x, y) _STR(x) "." _STR(y)
+#define _JOINT3(x, y, z) _STR(x) "." _STR(y) "." _STR(z)
+
+
+#define LIBCL_DRIVER_VERSION_STRING _JOINT3(LIBCL_DRIVER_VERSION_MAJOR, LIBCL_DRIVER_VERSION_MINOR, LIBCL_DRIVER_VERSION_PATCH)
+#define LIBCL_VERSION_STRING "OpenCL " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING
+#define LIBCL_C_VERSION_STRING "OpenCL C " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING
+
+#endif /* __CL_PLATFORM_ID_H__ */
+
diff --git a/src/cl_program.c b/src/cl_program.c
new file mode 100644
index 0000000..79dff34
--- /dev/null
+++ b/src/cl_program.c
@@ -0,0 +1,851 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_kernel.h"
+#include "cl_program.h"
+#include "cl_device_id.h"
+#include "cl_context.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+#include "cl_khr_icd.h"
+#include "cl_gbe_loader.h"
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <libgen.h>
+
+static void
+cl_program_release_sources(cl_program p)
+{
+ if (p->source) {
+ cl_free(p->source);
+ p->source = NULL;
+ }
+}
+
+static void
+cl_program_release_binary(cl_program p)
+{
+ if (p->binary) {
+ cl_free(p->binary);
+ p->binary = NULL;
+ }
+}
+
+LOCAL void
+cl_program_delete(cl_program p)
+{
+ uint32_t ref, i;
+
+ if (p == NULL)
+ return;
+
+ /* We are not done with it yet */
+ if ((ref = atomic_dec(&p->ref_n)) > 1) return;
+
+ /* Destroy the sources and binary if still allocated */
+ cl_program_release_sources(p);
+ cl_program_release_binary(p);
+
+ /* Release the build options. */
+ if (p->build_opts) {
+ cl_free(p->build_opts);
+ p->build_opts = NULL;
+ }
+
+ if (p->build_log) {
+ free(p->build_log);
+ p->build_log = NULL;
+ }
+
+ /* Remove it from the list */
+ assert(p->ctx);
+ pthread_mutex_lock(&p->ctx->program_lock);
+ if (p->prev)
+ p->prev->next = p->next;
+ if (p->next)
+ p->next->prev = p->prev;
+ if (p->ctx->programs == p)
+ p->ctx->programs = p->next;
+ pthread_mutex_unlock(&p->ctx->program_lock);
+
+ cl_free(p->bin); /* Free the blob */
+ for (i = 0; i < p->ker_n; ++i) /* Free the kernels */
+ cl_kernel_delete(p->ker[i]);
+ cl_free(p->ker);
+
+ /* Program belongs to their parent context */
+ cl_context_delete(p->ctx);
+
+ /* Free the program as allocated by the compiler */
+ if (p->opaque) {
+ if (CompilerSupported())
+ compiler_program_clean_llvm_resource(p->opaque);
+ interp_program_delete(p->opaque);
+ }
+
+ p->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+ cl_free(p);
+}
+
+LOCAL cl_program
+cl_program_new(cl_context ctx)
+{
+ cl_program p = NULL;
+
+ /* Allocate the structure */
+ TRY_ALLOC_NO_ERR (p, CALLOC(struct _cl_program));
+ SET_ICD(p->dispatch)
+ p->build_status = CL_BUILD_NONE;
+ p->ref_n = 1;
+ p->magic = CL_MAGIC_PROGRAM_HEADER;
+ p->ctx = ctx;
+ p->build_log = calloc(1000, sizeof(char));
+ if (p->build_log)
+ p->build_log_max_sz = 1000;
+ /* The queue also belongs to its context */
+ cl_context_add_ref(ctx);
+
+exit:
+ return p;
+error:
+ cl_program_delete(p);
+ goto exit;
+}
+
+LOCAL void
+cl_program_add_ref(cl_program p)
+{
+ assert(p);
+ atomic_inc(&p->ref_n);
+}
+
+static cl_int
+cl_program_load_gen_program(cl_program p)
+{
+ cl_int err = CL_SUCCESS;
+ uint32_t i;
+
+ assert(p->opaque != NULL);
+ p->ker_n = interp_program_get_kernel_num(p->opaque);
+
+ /* Allocate the kernel array */
+ TRY_ALLOC (p->ker, CALLOC_ARRAY(cl_kernel, p->ker_n));
+
+ for (i = 0; i < p->ker_n; ++i) {
+ const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+ assert(opaque != NULL);
+ TRY_ALLOC (p->ker[i], cl_kernel_new(p));
+ cl_kernel_setup(p->ker[i], opaque);
+ }
+
+error:
+ return err;
+}
+
+inline cl_bool isBitcodeWrapper(const unsigned char *BufPtr, const unsigned char *BufEnd)
+{
+ // See if you can find the hidden message in the magic bytes :-).
+ // (Hint: it's a little-endian encoding.)
+ return BufPtr != BufEnd &&
+ BufPtr[0] == 0xDE &&
+ BufPtr[1] == 0xC0 &&
+ BufPtr[2] == 0x17 &&
+ BufPtr[3] == 0x0B;
+}
+
+inline cl_bool isRawBitcode(const unsigned char *BufPtr, const unsigned char *BufEnd)
+{
+ // These bytes sort of have a hidden message, but it's not in
+ // little-endian this time, and it's a little redundant.
+ return BufPtr != BufEnd &&
+ BufPtr[0] == 'B' &&
+ BufPtr[1] == 'C' &&
+ BufPtr[2] == 0xc0 &&
+ BufPtr[3] == 0xde;
+}
+
+#define isBitcode(BufPtr,BufEnd) (isBitcodeWrapper(BufPtr, BufEnd) || isRawBitcode(BufPtr, BufEnd))
+
+LOCAL cl_program
+cl_program_create_from_binary(cl_context ctx,
+ cl_uint num_devices,
+ const cl_device_id * devices,
+ const size_t * lengths,
+ const unsigned char ** binaries,
+ cl_int * binary_status,
+ cl_int * errcode_ret)
+{
+ cl_program program = NULL;
+ cl_int err = CL_SUCCESS;
+
+ assert(ctx);
+ INVALID_DEVICE_IF (num_devices != 1);
+ INVALID_DEVICE_IF (devices == NULL);
+ INVALID_DEVICE_IF (devices[0] != ctx->device);
+ INVALID_VALUE_IF (binaries == NULL);
+ INVALID_VALUE_IF (lengths == NULL);
+
+ if (binaries[0] == NULL) {
+ err = CL_INVALID_VALUE;
+ if (binary_status)
+ binary_status[0] = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (lengths[0] == 0) {
+ err = CL_INVALID_VALUE;
+ if (binary_status)
+ binary_status[0] = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ program = cl_program_new(ctx);
+
+ // TODO: Need to check the binary format here to return CL_INVALID_BINARY.
+ TRY_ALLOC(program->binary, cl_calloc(lengths[0], sizeof(char)));
+ memcpy(program->binary, binaries[0], lengths[0]);
+ program->binary_sz = lengths[0];
+ program->source_type = FROM_BINARY;
+
+ if(isBitcode((unsigned char*)program->binary+1, (unsigned char*)program->binary+program->binary_sz)) {
+ if(*program->binary == 1){
+ program->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+ }else if(*program->binary == 2){
+ program->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY;
+ }else{
+ err= CL_INVALID_BINARY;
+ goto error;
+ }
+ program->opaque = compiler_program_new_from_llvm_binary(program->ctx->device->vendor_id, program->binary, program->binary_sz);
+
+ if (UNLIKELY(program->opaque == NULL)) {
+ err = CL_INVALID_PROGRAM;
+ goto error;
+ }
+ program->source_type = FROM_LLVM;
+ }
+ else if (*program->binary == 0) {
+ program->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+ }
+
+ if (binary_status)
+ binary_status[0] = CL_SUCCESS;
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return program;
+error:
+ cl_program_delete(program);
+ program = NULL;
+ goto exit;
+
+ return CL_SUCCESS;
+}
+
+LOCAL cl_program
+cl_program_create_with_built_in_kernles(cl_context ctx,
+ cl_uint num_devices,
+ const cl_device_id * devices,
+ const char * kernel_names,
+ cl_int * errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+
+ assert(ctx);
+ INVALID_DEVICE_IF (num_devices != 1);
+ INVALID_DEVICE_IF (devices == NULL);
+ INVALID_DEVICE_IF (devices[0] != ctx->device);
+
+ cl_int binary_status = CL_SUCCESS;
+ extern char cl_internal_built_in_kernel_str[];
+ extern size_t cl_internal_built_in_kernel_str_size;
+ char* p_built_in_kernel_str =cl_internal_built_in_kernel_str;
+
+ ctx->built_in_prgs = cl_program_create_from_binary(ctx, 1,
+ &ctx->device,
+ (size_t*)&cl_internal_built_in_kernel_str_size,
+ (const unsigned char **)&p_built_in_kernel_str,
+ &binary_status, &err);
+ if (!ctx->built_in_prgs)
+ return NULL;
+
+ err = cl_program_build(ctx->built_in_prgs, NULL);
+ if (err != CL_SUCCESS)
+ return NULL;
+
+ ctx->built_in_prgs->is_built = 1;
+
+ char delims[] = ";";
+ char* saveptr = NULL;
+ char* local_kernel_names;
+ char* kernel = NULL;
+ char* matched_kernel;
+ int i = 0;
+
+ //copy the content to local_kernel_names to protect the kernel_names.
+ TRY_ALLOC(local_kernel_names, cl_calloc(strlen(kernel_names)+1, sizeof(char) ) );
+ memcpy(local_kernel_names, kernel_names, strlen(kernel_names)+1);
+
+ kernel = strtok_r( local_kernel_names, delims , &saveptr);
+ while( kernel != NULL ) {
+ matched_kernel = strstr(ctx->device->built_in_kernels, kernel);
+ if(matched_kernel){
+ for (i = 0; i < ctx->built_in_prgs->ker_n; ++i) {
+ assert(ctx->built_in_prgs->ker[i]);
+ const char *ker_name = cl_kernel_get_name(ctx->built_in_prgs->ker[i]);
+ if (strcmp(ker_name, kernel) == 0) {
+ break;
+ }
+ }
+
+ ctx->built_in_kernels[i] = cl_program_create_kernel(ctx->built_in_prgs, kernel, NULL);
+ }
+ kernel = strtok_r((char*)saveptr , delims, &saveptr );
+ }
+
+ cl_free(local_kernel_names);
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return ctx->built_in_prgs;
+error:
+ goto exit;
+
+ return CL_SUCCESS;
+}
+
+LOCAL cl_program
+cl_program_create_from_llvm(cl_context ctx,
+ cl_uint num_devices,
+ const cl_device_id *devices,
+ const char *file_name,
+ cl_int *errcode_ret)
+{
+ cl_program program = NULL;
+ cl_int err = CL_SUCCESS;
+
+ assert(ctx);
+ INVALID_DEVICE_IF (num_devices != 1);
+ INVALID_DEVICE_IF (devices == NULL);
+ INVALID_DEVICE_IF (devices[0] != ctx->device);
+ INVALID_VALUE_IF (file_name == NULL);
+
+ program = cl_program_new(ctx);
+ program->opaque = compiler_program_new_from_llvm(ctx->device->vendor_id, file_name, NULL, NULL, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1);
+ if (UNLIKELY(program->opaque == NULL)) {
+ err = CL_INVALID_PROGRAM;
+ goto error;
+ }
+
+ /* Create all the kernels */
+ TRY (cl_program_load_gen_program, program);
+ program->source_type = FROM_LLVM;
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return program;
+error:
+ cl_program_delete(program);
+ program = NULL;
+ goto exit;
+}
+
+LOCAL cl_program
+cl_program_create_from_source(cl_context ctx,
+ cl_uint count,
+ const char **strings,
+ const size_t *lengths,
+ cl_int *errcode_ret)
+
+{
+ cl_program program = NULL;
+ cl_int err = CL_SUCCESS;
+ cl_uint i;
+ int32_t * lens = NULL;
+ int32_t len_total = 0;
+ assert(ctx);
+ char * p = NULL;
+ // the real compilation step will be done at build time since we do not have
+ // yet the compilation options
+ program = cl_program_new(ctx);
+ TRY_ALLOC (lens, cl_calloc(count, sizeof(int32_t)));
+ for (i = 0; i < (int) count; ++i) {
+ size_t len;
+ if (lengths == NULL || lengths[i] == 0)
+ len = strlen(strings[i]);
+ else
+ len = lengths[i];
+ lens[i] = len;
+ len_total += len;
+ }
+ TRY_ALLOC(program->source, cl_calloc(len_total+1, sizeof(char)));
+ p = program->source;
+ for (i = 0; i < (int) count; ++i) {
+ memcpy(p, strings[i], lens[i]);
+ p += lens[i];
+ }
+ *p = '\0';
+
+ program->source_type = FROM_SOURCE;
+ program->binary_type = CL_PROGRAM_BINARY_TYPE_NONE;
+
+exit:
+ cl_free(lens);
+ lens = NULL;
+ if (errcode_ret)
+ *errcode_ret = err;
+ return program;
+error:
+ cl_program_delete(program);
+ program = NULL;
+ goto exit;
+}
+
+/* Before we do the real work, we need to check whether our platform
+ cl version can meet -cl-std= */
+static int check_cl_version_option(cl_program p, const char* options) {
+ const char* s = NULL;
+ int ver1 = 0;
+ int ver2 = 0;
+ char version_str[64];
+
+ if (options && (s = strstr(options, "-cl-std="))) {
+
+ if (s + strlen("-cl-std=CLX.X") > options + strlen(options)) {
+ return 0;
+ }
+
+ if (s[8] != 'C' || s[9] != 'L' || s[10] > '9' || s[10] < '0' || s[11] != '.'
+ || s[12] > '9' || s[12] < '0') {
+ return 0;
+ }
+
+ ver1 = (s[10] - '0') * 10 + (s[12] - '0');
+
+ if (cl_get_device_info(p->ctx->device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version_str),
+ version_str, NULL) != CL_SUCCESS)
+ return 0;
+
+ assert(strstr(version_str, "OpenCL") && version_str[0] == 'O');
+ ver2 = (version_str[9] - '0') * 10 + (version_str[11] - '0');
+
+ if (ver2 < ver1)
+ return 0;
+
+ return 1;
+ }
+
+ return 1;
+}
+
+LOCAL cl_int
+cl_program_build(cl_program p, const char *options)
+{
+ cl_int err = CL_SUCCESS;
+ int i = 0;
+ int copyed = 0;
+
+ if (p->ref_n > 1) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ if (!check_cl_version_option(p, options)) {
+ err = CL_BUILD_PROGRAM_FAILURE;
+ goto error;
+ }
+ if (options) {
+ if(p->build_opts == NULL || strcmp(options, p->build_opts) != 0) {
+ if(p->build_opts) {
+ cl_free(p->build_opts);
+ p->build_opts = NULL;
+ }
+ TRY_ALLOC (p->build_opts, cl_calloc(strlen(options) + 1, sizeof(char)));
+ memcpy(p->build_opts, options, strlen(options));
+
+ p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+ }
+ }
+
+ if (options == NULL && p->build_opts) {
+ p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+
+ cl_free(p->build_opts);
+ p->build_opts = NULL;
+ }
+
+ if (p->source_type == FROM_SOURCE) {
+ if (!CompilerSupported()) {
+ err = CL_COMPILER_NOT_AVAILABLE;
+ goto error;
+ }
+
+ p->opaque = compiler_program_new_from_source(p->ctx->device->vendor_id, p->source, p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
+ if (UNLIKELY(p->opaque == NULL)) {
+ if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
+ err = CL_INVALID_BUILD_OPTIONS;
+ else
+ err = CL_BUILD_PROGRAM_FAILURE;
+ goto error;
+ }
+
+ /* Create all the kernels */
+ TRY (cl_program_load_gen_program, p);
+ } else if (p->source_type == FROM_LLVM) {
+ if (!CompilerSupported()) {
+ err = CL_COMPILER_NOT_AVAILABLE;
+ goto error;
+ }
+
+ compiler_program_build_from_llvm(p->opaque, p->build_log_max_sz, p->build_log, &p->build_log_sz, options);
+ if (UNLIKELY(p->opaque == NULL)) {
+ if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
+ err = CL_INVALID_BUILD_OPTIONS;
+ else
+ err = CL_BUILD_PROGRAM_FAILURE;
+ goto error;
+ }
+ /* Create all the kernels */
+ TRY (cl_program_load_gen_program, p);
+ } else if (p->source_type == FROM_BINARY) {
+ p->opaque = interp_program_new_from_binary(p->ctx->device->vendor_id, p->binary, p->binary_sz);
+ if (UNLIKELY(p->opaque == NULL)) {
+ err = CL_BUILD_PROGRAM_FAILURE;
+ goto error;
+ }
+
+ /* Create all the kernels */
+ TRY (cl_program_load_gen_program, p);
+ }
+ p->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+
+ for (i = 0; i < p->ker_n; i ++) {
+ const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+ p->bin_sz += interp_kernel_get_code_size(opaque);
+ }
+
+ TRY_ALLOC (p->bin, cl_calloc(p->bin_sz, sizeof(char)));
+ for (i = 0; i < p->ker_n; i ++) {
+ const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+ size_t sz = interp_kernel_get_code_size(opaque);
+
+ memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
+ copyed += sz;
+ }
+ p->is_built = 1;
+ p->build_status = CL_BUILD_SUCCESS;
+ return CL_SUCCESS;
+
+error:
+ p->build_status = CL_BUILD_ERROR;
+ return err;
+}
+
+cl_program
+cl_program_link(cl_context context,
+ cl_uint num_input_programs,
+ const cl_program * input_programs,
+ const char * options,
+ cl_int* errcode_ret)
+{
+ cl_program p = NULL;
+ cl_int err = CL_SUCCESS;
+ cl_int i = 0;
+ int copyed = 0;
+ p = cl_program_new(context);
+
+ if (!check_cl_version_option(p, options)) {
+ err = CL_BUILD_PROGRAM_FAILURE;
+ goto error;
+ }
+
+ p->opaque = compiler_program_new_gen_program(context->device->vendor_id, NULL, NULL);
+
+ for(i = 0; i < num_input_programs; i++) {
+ // if program create with llvm binary, need deserilize first to get module.
+ if(input_programs[i])
+ compiler_program_link_program(p->opaque, input_programs[i]->opaque,
+ p->build_log_max_sz, p->build_log, &p->build_log_sz);
+ if (UNLIKELY(p->opaque == NULL)) {
+ err = CL_LINK_PROGRAM_FAILURE;
+ goto error;
+ }
+ }
+
+ if(options && strstr(options, "-create-library")){
+ p->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY;
+ goto done;
+ }else{
+ p->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+ }
+
+ compiler_program_build_from_llvm(p->opaque, p->build_log_max_sz, p->build_log, &p->build_log_sz, options);
+
+ /* Create all the kernels */
+ TRY (cl_program_load_gen_program, p);
+
+ for (i = 0; i < p->ker_n; i ++) {
+ const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+ p->bin_sz += interp_kernel_get_code_size(opaque);
+ }
+
+ TRY_ALLOC (p->bin, cl_calloc(p->bin_sz, sizeof(char)));
+ for (i = 0; i < p->ker_n; i ++) {
+ const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+ size_t sz = interp_kernel_get_code_size(opaque);
+
+ memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
+ copyed += sz;
+ }
+done:
+ p->is_built = 1;
+ p->build_status = CL_BUILD_SUCCESS;
+ if (errcode_ret)
+ *errcode_ret = err;
+ return p;
+
+error:
+ p->build_status = CL_BUILD_ERROR;
+ if (errcode_ret)
+ *errcode_ret = err;
+ return p;
+}
+
+LOCAL cl_int
+cl_program_compile(cl_program p,
+ cl_uint num_input_headers,
+ const cl_program * input_headers,
+ const char ** header_include_names,
+ const char* options)
+{
+ cl_int err = CL_SUCCESS;
+ int i = 0;
+
+ if (p->ref_n > 1) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ if (!check_cl_version_option(p, options)) {
+ err = CL_BUILD_PROGRAM_FAILURE;
+ goto error;
+ }
+
+ if (options) {
+ if(p->build_opts == NULL || strcmp(options, p->build_opts) != 0) {
+ if(p->build_opts) {
+ cl_free(p->build_opts);
+ p->build_opts = NULL;
+ }
+ TRY_ALLOC (p->build_opts, cl_calloc(strlen(options) + 1, sizeof(char)));
+ memcpy(p->build_opts, options, strlen(options));
+
+ p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+ }
+ }
+
+ if (options == NULL && p->build_opts) {
+ p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+
+ cl_free(p->build_opts);
+ p->build_opts = NULL;
+ }
+
+ char temp_header_template[]= "/tmp/beignet.XXXXXX";
+ char* temp_header_path = mkdtemp(temp_header_template);
+
+ if (p->source_type == FROM_SOURCE) {
+
+ if (!CompilerSupported()) {
+ err = CL_COMPILER_NOT_AVAILABLE;
+ goto error;
+ }
+
+ //write the headers to /tmp/beignet.XXXXXX for include.
+ for (i = 0; i < num_input_headers; i++) {
+ if(header_include_names[i] == NULL || input_headers[i] == NULL)
+ continue;
+
+ char temp_path[255]="";
+ strncpy(temp_path, temp_header_path, strlen(temp_header_path));
+ strncat(temp_path, "/", 1);
+ strncat(temp_path, header_include_names[i], strlen(header_include_names[i]));
+ char* dirc = strdup(temp_path);
+ char* dir = dirname(dirc);
+ mkdir(dir, 0755);
+ if(access(dir, R_OK|W_OK) != 0){
+ err = CL_COMPILE_PROGRAM_FAILURE;
+ goto error;
+ }
+ free(dirc);
+
+ FILE* pfile = fopen(temp_path, "wb");
+ if(pfile){
+ fwrite(input_headers[i]->source, strlen(input_headers[i]->source), 1, pfile);
+ fclose(pfile);
+ }else{
+ err = CL_COMPILE_PROGRAM_FAILURE;
+ goto error;
+ }
+ }
+
+ p->opaque = compiler_program_compile_from_source(p->ctx->device->vendor_id, p->source, temp_header_path,
+ p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
+
+ char rm_path[255]="rm ";
+ strncat(rm_path, temp_header_path, strlen(temp_header_path));
+ strncat(rm_path, " -rf", 4);
+ int temp = system(rm_path);
+
+ if(temp){
+ assert(0);
+ }
+
+ if (UNLIKELY(p->opaque == NULL)) {
+ if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
+ err = CL_INVALID_BUILD_OPTIONS;
+ else
+ err = CL_BUILD_PROGRAM_FAILURE;
+ goto error;
+ }
+
+ /* Create all the kernels */
+ p->source_type = FROM_LLVM;
+ p->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+ }
+ p->is_built = 1;
+ p->build_status = CL_BUILD_SUCCESS;
+ return CL_SUCCESS;
+
+error:
+ p->build_status = CL_BUILD_ERROR;
+ cl_program_delete(p);
+ p = NULL;
+ return err;
+}
+
+LOCAL cl_kernel
+cl_program_create_kernel(cl_program p, const char *name, cl_int *errcode_ret)
+{
+ cl_kernel from = NULL, to = NULL;
+ cl_int err = CL_SUCCESS;
+ uint32_t i = 0;
+
+ /* Find the program first */
+ for (i = 0; i < p->ker_n; ++i) {
+ assert(p->ker[i]);
+ const char *ker_name = cl_kernel_get_name(p->ker[i]);
+ if (strcmp(ker_name, name) == 0) {
+ from = p->ker[i];
+ break;
+ }
+ }
+
+ /* We were not able to find this named kernel */
+ if (UNLIKELY(from == NULL)) {
+ err = CL_INVALID_KERNEL_NAME;
+ goto error;
+ }
+
+ TRY_ALLOC(to, cl_kernel_dup(from));
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return to;
+error:
+ cl_kernel_delete(to);
+ to = NULL;
+ goto exit;
+}
+
+LOCAL cl_int
+cl_program_create_kernels_in_program(cl_program p, cl_kernel* ker)
+{
+ int i = 0;
+
+ if(ker == NULL)
+ return CL_SUCCESS;
+
+ for (i = 0; i < p->ker_n; ++i) {
+ TRY_ALLOC_NO_ERR(ker[i], cl_kernel_dup(p->ker[i]));
+ }
+
+ return CL_SUCCESS;
+
+error:
+ do {
+ cl_kernel_delete(ker[i]);
+ ker[i--] = NULL;
+ } while(i > 0);
+
+ return CL_OUT_OF_HOST_MEMORY;
+}
+
+LOCAL void
+cl_program_get_kernel_names(cl_program p, size_t size, char *names, size_t *size_ret)
+{
+ int i = 0;
+ const char *ker_name = NULL;
+ size_t len = 0;
+ if(size_ret) *size_ret = 0;
+
+ if(p->ker == NULL) {
+ return;
+ }
+
+ ker_name = cl_kernel_get_name(p->ker[i]);
+ len = strlen(ker_name);
+ if(names) {
+ strncpy(names, cl_kernel_get_name(p->ker[0]), size - 1);
+ if(size < len - 1) {
+ if(size_ret) *size_ret = size;
+ return;
+ }
+ size = size - len - 1; //sub \0
+ }
+ if(size_ret) *size_ret = strlen(ker_name) + 1; //add NULL
+
+ for (i = 1; i < p->ker_n; ++i) {
+ ker_name = cl_kernel_get_name(p->ker[i]);
+ len = strlen(ker_name);
+ if(names) {
+ strncat(names, ";", size);
+ if(size >= 1)
+ strncat(names, ker_name, size - 1);
+ if(size < len + 1) {
+ if(size_ret) *size_ret = size;
+ break;
+ }
+ size = size - len - 1;
+ }
+ if(size_ret) *size_ret += len + 1; //add ';'
+ }
+}
diff --git a/src/cl_program.h b/src/cl_program.h
new file mode 100644
index 0000000..6dea29a
--- /dev/null
+++ b/src/cl_program.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_PROGRAM_H__
+#define __CL_PROGRAM_H__
+
+#include "cl_internals.h"
+#include "cl_gbe_loader.h"
+#include "CL/cl.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+// This is the structure ouput by the compiler
+struct _gbe_program;
+
+enum {
+ FROM_SOURCE = 0,
+ FROM_LLVM = 1,
+ FROM_BINARY = 2
+};
+
+/* This maps an OCL file containing some kernels */
+struct _cl_program {
+ DEFINE_ICD(dispatch)
+ uint64_t magic; /* To identify it as a program */
+ volatile int ref_n; /* We reference count this object */
+ gbe_program opaque; /* (Opaque) program as ouput by the compiler */
+ cl_kernel *ker; /* All kernels included by the OCL file */
+ cl_program prev, next; /* We chain the programs together */
+ cl_context ctx; /* Its parent context */
+ char *bin; /* The program copied verbatim */
+ size_t bin_sz; /* Its size in memory */
+ char *source; /* Program sources */
+ char *binary; /* Program binary. */
+ size_t binary_sz; /* The binary size. */
+ uint32_t binary_type; /* binary type: COMPILED_OBJECT(LLVM IR), LIBRARY(LLVM IR with option "-create-library"), or EXECUTABLE(GEN binary). */
+ uint32_t ker_n; /* Number of declared kernels */
+ uint32_t source_type:2; /* Built from binary, source or LLVM */
+ uint32_t is_built:1; /* Did we call clBuildProgram on it? */
+ int32_t build_status; /* build status. */
+ char *build_opts; /* The build options for this program */
+ size_t build_log_max_sz; /*build log maximum size in byte.*/
+ char *build_log; /* The build log for this program. */
+ size_t build_log_sz; /* The actual build log size.*/
+};
+
+/* Create a empty program */
+extern cl_program cl_program_new(cl_context);
+
+/* Destroy and deallocate an empty kernel */
+extern void cl_program_delete(cl_program);
+
+/* Add one more reference to the object (to defer its deletion) */
+extern void cl_program_add_ref(cl_program);
+
+/* Create a kernel for the OCL user */
+extern cl_kernel cl_program_create_kernel(cl_program, const char*, cl_int*);
+
+/* creates kernel objects for all kernel functions in program. */
+extern cl_int cl_program_create_kernels_in_program(cl_program, cl_kernel*);
+
+/* Create a program from OCL source */
+extern cl_program
+cl_program_create_from_source(cl_context ctx,
+ cl_uint count,
+ const char **strings,
+ const size_t *lengths,
+ cl_int *errcode_ret);
+
+/* Directly create a program from a blob */
+extern cl_program
+cl_program_create_from_binary(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * devices,
+ const size_t * lengths,
+ const unsigned char ** binaries,
+ cl_int * binary_status,
+ cl_int * errcode_ret);
+
+/* Create a program with built-in kernels*/
+extern cl_program
+cl_program_create_with_built_in_kernles(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const char * kernel_names,
+ cl_int * errcode_ret);
+/* Directly create a program from a LLVM source file */
+extern cl_program
+cl_program_create_from_llvm(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * devices,
+ const char * fileName,
+ cl_int * errcode_ret);
+
+/* Build the program as specified by OCL */
+extern cl_int
+cl_program_build(cl_program p, const char* options);
+/* Compile the program as specified by OCL */
+extern cl_int
+cl_program_compile(cl_program p,
+ cl_uint num_input_headers,
+ const cl_program * input_headers,
+ const char ** header_include_names,
+ const char* options);
+/* link the program as specified by OCL */
+extern cl_program
+cl_program_link(cl_context context,
+ cl_uint num_input_programs,
+ const cl_program * input_programs,
+ const char * options,
+ cl_int* errcode_ret);
+/* Get the kernel names in program */
+extern void
+cl_program_get_kernel_names(cl_program p,
+ size_t size,
+ char *names,
+ size_t *size_ret);
+#endif /* __CL_PROGRAM_H__ */
+
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
new file mode 100644
index 0000000..d718256
--- /dev/null
+++ b/src/cl_sampler.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_context.h"
+#include "cl_sampler.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+
+#include <assert.h>
+
+uint32_t cl_to_clk(cl_bool normalized_coords,
+ cl_addressing_mode address,
+ cl_filter_mode filter)
+{
+ int clk_address = CLK_ADDRESS_NONE;
+ int clk_filter = CLK_FILTER_NEAREST;
+ switch (address) {
+ case CL_ADDRESS_NONE: clk_address = CLK_ADDRESS_NONE; break;
+ case CL_ADDRESS_CLAMP: clk_address = CLK_ADDRESS_CLAMP; break;
+ case CL_ADDRESS_CLAMP_TO_EDGE: clk_address = CLK_ADDRESS_CLAMP_TO_EDGE; break;
+ case CL_ADDRESS_REPEAT: clk_address = CLK_ADDRESS_REPEAT; break;
+ case CL_ADDRESS_MIRRORED_REPEAT: clk_address = CLK_ADDRESS_MIRRORED_REPEAT; break;
+ default:
+ assert(0);
+ }
+ switch(filter) {
+ case CL_FILTER_NEAREST: clk_filter = CLK_FILTER_NEAREST; break;
+ case CL_FILTER_LINEAR: clk_filter = CLK_FILTER_LINEAR; break;
+ default:
+ assert(0);
+ }
+ return (clk_address << __CLK_ADDRESS_BASE)
+ | (normalized_coords << __CLK_NORMALIZED_BASE)
+ | (clk_filter);
+}
+
+#define IS_SAMPLER_ARG(v) (v & __CLK_SAMPLER_ARG_KEY_BIT)
+#define SAMPLER_ARG_ID(v) ((v & __CLK_SAMPLER_ARG_MASK) >> __CLK_SAMPLER_ARG_BASE)
+int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler)
+{
+ int slot_id;
+ for(slot_id = 0; slot_id < k->sampler_sz; slot_id++)
+ {
+ if (IS_SAMPLER_ARG(k->samplers[slot_id])) {
+ if (SAMPLER_ARG_ID(k->samplers[slot_id]) == index) {
+ k->samplers[slot_id] = (k->samplers[slot_id] & (~__CLK_SAMPLER_MASK))
+ | sampler->clkSamplerValue;
+ return slot_id;
+ }
+ }
+ }
+ return -1;
+}
+
+LOCAL cl_sampler
+cl_sampler_new(cl_context ctx,
+ cl_bool normalized_coords,
+ cl_addressing_mode address,
+ cl_filter_mode filter,
+ cl_int *errcode_ret)
+{
+ cl_sampler sampler = NULL;
+ cl_int err = CL_SUCCESS;
+
+ /* Allocate and inialize the structure itself */
+ TRY_ALLOC (sampler, CALLOC(struct _cl_sampler));
+ SET_ICD(sampler->dispatch)
+ sampler->ref_n = 1;
+ sampler->magic = CL_MAGIC_SAMPLER_HEADER;
+ sampler->normalized_coords = normalized_coords;
+ sampler->address = address;
+ sampler->filter = filter;
+
+ /* Append the sampler in the context sampler list */
+ pthread_mutex_lock(&ctx->sampler_lock);
+ sampler->next = ctx->samplers;
+ if (ctx->samplers != NULL)
+ ctx->samplers->prev = sampler;
+ ctx->samplers = sampler;
+ pthread_mutex_unlock(&ctx->sampler_lock);
+ sampler->ctx = ctx;
+ cl_context_add_ref(ctx);
+
+ sampler->clkSamplerValue = cl_to_clk(normalized_coords, address, filter);
+
+exit:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return sampler;
+error:
+ cl_sampler_delete(sampler);
+ sampler = NULL;
+ goto exit;
+}
+
+LOCAL void
+cl_sampler_delete(cl_sampler sampler)
+{
+ if (UNLIKELY(sampler == NULL))
+ return;
+ if (atomic_dec(&sampler->ref_n) > 1)
+ return;
+
+ assert(sampler->ctx);
+ pthread_mutex_lock(&sampler->ctx->sampler_lock);
+ if (sampler->prev)
+ sampler->prev->next = sampler->next;
+ if (sampler->next)
+ sampler->next->prev = sampler->prev;
+ if (sampler->ctx->samplers == sampler)
+ sampler->ctx->samplers = sampler->next;
+ pthread_mutex_unlock(&sampler->ctx->sampler_lock);
+ cl_context_delete(sampler->ctx);
+
+ cl_free(sampler);
+}
+
+LOCAL void
+cl_sampler_add_ref(cl_sampler sampler)
+{
+ assert(sampler);
+ atomic_inc(&sampler->ref_n);
+}
+
diff --git a/src/cl_sampler.h b/src/cl_sampler.h
new file mode 100644
index 0000000..4785928
--- /dev/null
+++ b/src/cl_sampler.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_SAMPLER_H__
+#define __CL_SAMPLER_H__
+
+#include "CL/cl.h"
+#include "../backend/src/ocl_common_defines.h"
+#include <stdint.h>
+
+/* How to access images */
+struct _cl_sampler {
+ DEFINE_ICD(dispatch)
+ uint64_t magic; /* To identify it as a sampler object */
+ volatile int ref_n; /* This object is reference counted */
+ cl_sampler prev, next; /* We chain the samplers in the allocator */
+ cl_context ctx; /* Context it belongs to */
+ cl_bool normalized_coords; /* Are coordinates normalized? */
+ cl_addressing_mode address;/* CLAMP / REPEAT and so on... */
+ cl_filter_mode filter; /* LINEAR / NEAREST mostly */
+ uint32_t clkSamplerValue;
+};
+
+/* Create a new sampler object */
+extern cl_sampler cl_sampler_new(cl_context,
+ cl_bool,
+ cl_addressing_mode,
+ cl_filter_mode,
+ cl_int *err);
+
+/* Unref the object and delete it if no more reference on it */
+extern void cl_sampler_delete(cl_sampler);
+
+/* Add one more reference to this object */
+extern void cl_sampler_add_ref(cl_sampler);
+
+/* set a sampler kernel argument */
+int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler);
+
+#endif /* __CL_SAMPLER_H__ */
+
diff --git a/src/cl_thread.c b/src/cl_thread.c
new file mode 100644
index 0000000..5713d70
--- /dev/null
+++ b/src/cl_thread.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include <string.h>
+#include <stdio.h>
+
+#include "cl_thread.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+
+/* Because the cl_command_queue can be used in several threads simultaneously but
+ without add ref to it, we now handle it like this:
+ Keep one threads_slot_array, every time the thread get gpgpu or batch buffer, if it
+ does not have a slot, assign it.
+ The resources are keeped in queue private, and resize it if needed.
+ When the thread exit, the slot will be set invalid.
+ When queue released, all the resources will be released. If user still enqueue, flush
+ or finish the queue after it has been released, the behavior is undefined.
+ TODO: Need to shrink the slot map.
+ */
+
+static int thread_array_num = 1;
+static int *thread_slot_map = NULL;
+static int thread_magic_num = 1;
+static pthread_mutex_t thread_queue_map_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_key_t destroy_key;
+
+static __thread int thread_id = -1;
+static __thread int thread_magic = -1;
+
+typedef struct _thread_spec_data {
+ cl_gpgpu gpgpu ;
+ int valid;
+ void* thread_batch_buf;
+ int thread_magic;
+} thread_spec_data;
+
+typedef struct _queue_thread_private {
+ thread_spec_data** threads_data;
+ int threads_data_num;
+ pthread_mutex_t thread_data_lock;
+} queue_thread_private;
+
+static void thread_data_destructor(void *dummy) {
+ pthread_mutex_lock(&thread_queue_map_lock);
+ thread_slot_map[thread_id] = 0;
+ pthread_mutex_unlock(&thread_queue_map_lock);
+ free(dummy);
+}
+
+static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int create)
+{
+ queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+ thread_spec_data* spec = NULL;
+ int i = 0;
+
+ if (thread_id == -1) {
+ void * dummy = malloc(sizeof(int));
+
+ pthread_mutex_lock(&thread_queue_map_lock);
+ for (i = 0; i < thread_array_num; i++) {
+ if (thread_slot_map[i] == 0) {
+ thread_id = i;
+ break;
+ }
+ }
+
+ if (i == thread_array_num) {
+ thread_array_num *= 2;
+ thread_slot_map = realloc(thread_slot_map, sizeof(int) * thread_array_num);
+ memset(thread_slot_map + thread_array_num/2, 0, sizeof(int) * (thread_array_num/2));
+ thread_id = thread_array_num/2;
+ }
+
+ thread_slot_map[thread_id] = 1;
+
+ thread_magic = thread_magic_num++;
+ pthread_mutex_unlock(&thread_queue_map_lock);
+
+ pthread_setspecific(destroy_key, dummy);
+ }
+
+ pthread_mutex_lock(&thread_private->thread_data_lock);
+ if (thread_array_num > thread_private->threads_data_num) {// just enlarge
+ int old_num = thread_private->threads_data_num;
+ thread_private->threads_data_num = thread_array_num;
+ thread_private->threads_data = realloc(thread_private->threads_data,
+ thread_private->threads_data_num * sizeof(void *));
+ memset(thread_private->threads_data + old_num, 0,
+ sizeof(void*) * (thread_private->threads_data_num - old_num));
+ }
+
+ assert(thread_id != -1 && thread_id < thread_array_num);
+ spec = thread_private->threads_data[thread_id];
+ if (!spec && create) {
+ spec = CALLOC(thread_spec_data);
+ spec->thread_magic = thread_magic;
+ thread_private->threads_data[thread_id] = spec;
+ }
+
+ pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+ return spec;
+}
+
+void* cl_thread_data_create(void)
+{
+ queue_thread_private* thread_private = CALLOC(queue_thread_private);
+
+ if (thread_private == NULL)
+ return NULL;
+
+ if (thread_slot_map == NULL) {
+ pthread_mutex_lock(&thread_queue_map_lock);
+ thread_slot_map = calloc(thread_array_num, sizeof(int));
+ pthread_mutex_unlock(&thread_queue_map_lock);
+
+ pthread_key_create(&destroy_key, thread_data_destructor);
+ }
+
+ pthread_mutex_init(&thread_private->thread_data_lock, NULL);
+
+ pthread_mutex_lock(&thread_private->thread_data_lock);
+ thread_private->threads_data = malloc(thread_array_num * sizeof(void *));
+ memset(thread_private->threads_data, 0, sizeof(void*) * thread_array_num);
+ thread_private->threads_data_num = thread_array_num;
+ pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+ return thread_private;
+}
+
+cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
+{
+ thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+
+ if (!spec->thread_magic && spec->thread_magic != thread_magic) {
+ //We may get the slot from last thread. So free the resource.
+ spec->valid = 0;
+ }
+
+ if (!spec->valid) {
+ if (spec->thread_batch_buf) {
+ cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
+ spec->thread_batch_buf = NULL;
+ }
+ if (spec->gpgpu) {
+ cl_gpgpu_delete(spec->gpgpu);
+ spec->gpgpu = NULL;
+ }
+ TRY_ALLOC_NO_ERR(spec->gpgpu, cl_gpgpu_new(queue->ctx->drv));
+ spec->valid = 1;
+ }
+
+ error:
+ return spec->gpgpu;
+}
+
+void cl_set_thread_batch_buf(cl_command_queue queue, void* buf)
+{
+ thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+
+ assert(spec && spec->thread_magic == thread_magic);
+
+ if (spec->thread_batch_buf) {
+ cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
+ }
+ spec->thread_batch_buf = buf;
+}
+
+void* cl_get_thread_batch_buf(cl_command_queue queue) {
+ thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+
+ assert(spec && spec->thread_magic == thread_magic);
+
+ return spec->thread_batch_buf;
+}
+
+void cl_invalid_thread_gpgpu(cl_command_queue queue)
+{
+ queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+ thread_spec_data* spec = NULL;
+
+ pthread_mutex_lock(&thread_private->thread_data_lock);
+ spec = thread_private->threads_data[thread_id];
+ assert(spec);
+ pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+ if (!spec->valid) {
+ return;
+ }
+
+ assert(spec->gpgpu);
+ cl_gpgpu_delete(spec->gpgpu);
+ spec->gpgpu = NULL;
+ spec->valid = 0;
+}
+
+cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue)
+{
+ queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+ thread_spec_data* spec = NULL;
+
+ pthread_mutex_lock(&thread_private->thread_data_lock);
+ spec = thread_private->threads_data[thread_id];
+ assert(spec);
+ pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+ if (!spec->valid)
+ return NULL;
+
+ assert(spec->gpgpu);
+ cl_gpgpu gpgpu = spec->gpgpu;
+ spec->gpgpu = NULL;
+ spec->valid = 0;
+ return gpgpu;
+}
+
+/* The destructor for clean the thread specific data. */
+void cl_thread_data_destroy(cl_command_queue queue)
+{
+ int i = 0;
+ queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+ int threads_data_num;
+ thread_spec_data** threads_data;
+
+ pthread_mutex_lock(&thread_private->thread_data_lock);
+ assert(thread_private->threads_data_num == thread_array_num);
+ threads_data_num = thread_private->threads_data_num;
+ threads_data = thread_private->threads_data;
+ thread_private->threads_data_num = 0;
+ thread_private->threads_data = NULL;
+ pthread_mutex_unlock(&thread_private->thread_data_lock);
+ cl_free(thread_private);
+ queue->thread_data = NULL;
+
+ for (i = 0; i < threads_data_num; i++) {
+ if (threads_data[i] != NULL && threads_data[i]->thread_batch_buf) {
+ cl_gpgpu_unref_batch_buf(threads_data[i]->thread_batch_buf);
+ threads_data[i]->thread_batch_buf = NULL;
+ }
+
+ if (threads_data[i] != NULL && threads_data[i]->valid) {
+ cl_gpgpu_delete(threads_data[i]->gpgpu);
+ threads_data[i]->gpgpu = NULL;
+ threads_data[i]->valid = 0;
+ }
+ cl_free(threads_data[i]);
+ }
+
+ cl_free(threads_data);
+}
diff --git a/src/cl_thread.h b/src/cl_thread.h
new file mode 100644
index 0000000..ecc99ad
--- /dev/null
+++ b/src/cl_thread.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __CL_THREAD_H__
+#define __CL_THREAD_H__
+
+#include <pthread.h>
+#include "cl_internals.h"
+#include "cl_command_queue.h"
+
+/* Create the thread specific data. */
+void* cl_thread_data_create(void);
+
+/* The destructor for clean the thread specific data. */
+void cl_thread_data_destroy(cl_command_queue queue);
+
+/* Used to get the gpgpu struct of each thread. */
+cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
+
+/* Used to release the gpgpu struct of each thread. */
+void cl_invalid_thread_gpgpu(cl_command_queue queue);
+
+/* Used to set the batch buffer of each thread. */
+void cl_set_thread_batch_buf(cl_command_queue queue, void* buf);
+
+/* Used to get the batch buffer of each thread. */
+void* cl_get_thread_batch_buf(cl_command_queue queue);
+
+/* take current gpgpu from the thread gpgpu pool. */
+cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue);
+
+#endif /* __CL_THREAD_H__ */
diff --git a/src/cl_utils.h b/src/cl_utils.h
new file mode 100644
index 0000000..26cf329
--- /dev/null
+++ b/src/cl_utils.h
@@ -0,0 +1,316 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_UTILS_H__
+#define __CL_UTILS_H__
+
+/* INLINE is forceinline */
+#define INLINE __attribute__((always_inline)) inline
+
+/* Branch hint */
+#define LIKELY(x) __builtin_expect((x),1)
+#define UNLIKELY(x) __builtin_expect((x),0)
+
+/* Stringify macros */
+#define JOIN(X, Y) _DO_JOIN(X, Y)
+#define _DO_JOIN(X, Y) _DO_JOIN2(X, Y)
+#define _DO_JOIN2(X, Y) X##Y
+
+/* Check compile time errors */
+#define STATIC_ASSERT(value) \
+struct JOIN(__,JOIN(__,__LINE__)) { \
+ int x[(value) ? 1 : -1]; \
+}
+
+/* Throw errors */
+#ifdef NDEBUG
+ #define ERR(ERROR, ...) \
+ do { \
+ err = ERROR; \
+ goto error; \
+ } while (0)
+#else
+ #define ERR(ERROR, ...) \
+ do { \
+ fprintf(stderr, "error in %s line %i\n", __FILE__, __LINE__); \
+ fprintf(stderr, __VA_ARGS__); \
+ fprintf(stderr, "\n"); \
+ err = ERROR; \
+ goto error; \
+ } while (0)
+#endif
+
+#define DO_ALLOC_ERR \
+do { \
+ ERR(CL_OUT_OF_HOST_MEMORY, "Out of memory"); \
+} while (0)
+
+#define ERR_IF(COND, ERROR, ...) \
+do { \
+ if (UNLIKELY(COND)) ERR (ERROR, __VA_ARGS__); \
+} while (0)
+
+#define INVALID_VALUE_IF(COND) \
+do { \
+ ERR_IF(COND, CL_INVALID_VALUE, "Invalid value"); \
+} while (0)
+
+#define INVALID_DEVICE_IF(COND) \
+do { \
+ ERR_IF(COND, CL_INVALID_DEVICE, "Invalid device"); \
+} while (0)
+
+#define MAX(x0, x1) ((x0) > (x1) ? (x0) : (x1))
+#define MIN(x0, x1) ((x0) < (x1) ? (x0) : (x1))
+#define ALIGN(A, B) (((A) % (B)) ? (A) + (B) - ((A) % (B)) : (A))
+
+#define DO_ALLOC_ERROR \
+do { \
+ err = CL_OUT_OF_HOST_MEMORY; \
+ goto error; \
+} while (0)
+
+#define FATAL(...) \
+do { \
+ fprintf(stderr, "error: "); \
+ fprintf(stderr, __VA_ARGS__); \
+ fprintf(stderr, "\n"); \
+ assert(0); \
+ exit(-1); \
+} while (0)
+
+#define FATAL_IF(COND, ...) \
+do { \
+ if (UNLIKELY(COND)) FATAL(__VA_ARGS__); \
+} while (0)
+
+#define NOT_IMPLEMENTED FATAL ("Not implemented")
+
+#define CHECK_CONTEXT(CTX) \
+do { \
+ if (UNLIKELY(CTX == NULL)) { \
+ err = CL_INVALID_CONTEXT; \
+ goto error; \
+ } \
+ if (UNLIKELY(CTX->magic != CL_MAGIC_CONTEXT_HEADER)) { \
+ err = CL_INVALID_CONTEXT; \
+ goto error; \
+ } \
+} while (0)
+
+#define CHECK_QUEUE(QUEUE) \
+do { \
+ if (UNLIKELY(QUEUE == NULL)) { \
+ err = CL_INVALID_COMMAND_QUEUE; \
+ goto error; \
+ } \
+ if (UNLIKELY(QUEUE->magic != CL_MAGIC_QUEUE_HEADER)) { \
+ err = CL_INVALID_COMMAND_QUEUE; \
+ goto error; \
+ } \
+} while (0)
+
+#define CHECK_MEM(MEM) \
+do { \
+ if (UNLIKELY(MEM == NULL)) { \
+ err = CL_INVALID_MEM_OBJECT; \
+ goto error; \
+ } \
+ if (UNLIKELY(MEM->magic != CL_MAGIC_MEM_HEADER)) { \
+ err = CL_INVALID_MEM_OBJECT; \
+ goto error; \
+ } \
+} while (0)
+
+#define CHECK_IMAGE(MEM, IMAGE) \
+CHECK_MEM(MEM); \
+do { \
+ if (UNLIKELY(!IS_IMAGE(MEM))) { \
+ err = CL_INVALID_MEM_OBJECT; \
+ goto error; \
+ } \
+} while (0); \
+struct _cl_mem_image *IMAGE; \
+IMAGE = cl_mem_image(MEM); \
+
+#define FIXUP_IMAGE_REGION(IMAGE, PREGION, REGION) \
+const size_t *REGION; \
+size_t REGION ##_REC[3]; \
+do { \
+ if (IMAGE->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { \
+ REGION ##_REC[0] = PREGION[0]; \
+ REGION ##_REC[1] = 1; \
+ REGION ##_REC[2] = PREGION[1]; \
+ REGION = REGION ##_REC; \
+ } else { \
+ REGION = PREGION; \
+ } \
+} while(0)
+
+#define FIXUP_IMAGE_ORIGIN(IMAGE, PREGION, REGION) \
+const size_t *REGION; \
+size_t REGION ##_REC[3]; \
+do { \
+ if (IMAGE->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { \
+ REGION ##_REC[0] = PREGION[0]; \
+ REGION ##_REC[1] = 0; \
+ REGION ##_REC[2] = PREGION[1]; \
+ REGION = REGION ##_REC; \
+ } else { \
+ REGION = PREGION; \
+ } \
+} while(0)
+
+
+#define CHECK_EVENT(EVENT) \
+ do { \
+ if (UNLIKELY(EVENT == NULL)) { \
+ err = CL_INVALID_EVENT; \
+ goto error; \
+ } \
+ if (UNLIKELY(EVENT->magic != CL_MAGIC_EVENT_HEADER)) { \
+ err = CL_INVALID_EVENT; \
+ goto error; \
+ } \
+ } while (0)
+
+#define CHECK_SAMPLER(SAMPLER) \
+do { \
+ if (UNLIKELY(SAMPLER == NULL)) { \
+ err = CL_INVALID_SAMPLER; \
+ goto error; \
+ } \
+ if (UNLIKELY(SAMPLER->magic != CL_MAGIC_SAMPLER_HEADER)) {\
+ err = CL_INVALID_SAMPLER; \
+ goto error; \
+ } \
+} while (0)
+
+#define CHECK_KERNEL(KERNEL) \
+do { \
+ if (UNLIKELY(KERNEL == NULL)) { \
+ err = CL_INVALID_KERNEL; \
+ goto error; \
+ } \
+ if (UNLIKELY(KERNEL->magic != CL_MAGIC_KERNEL_HEADER)) { \
+ err = CL_INVALID_KERNEL; \
+ goto error; \
+ } \
+} while (0)
+
+#define CHECK_PROGRAM(PROGRAM) \
+do { \
+ if (UNLIKELY(PROGRAM == NULL)) { \
+ err = CL_INVALID_PROGRAM; \
+ goto error; \
+ } \
+ if (UNLIKELY(PROGRAM->magic != CL_MAGIC_PROGRAM_HEADER)) {\
+ err = CL_INVALID_PROGRAM; \
+ goto error; \
+ } \
+} while (0)
+
+#define ELEMENTS(x) (sizeof(x)/sizeof(*(x)))
+#define CALLOC_STRUCT(T) (struct T*) cl_calloc(1, sizeof(struct T))
+#define CALLOC(T) (T*) cl_calloc(1, sizeof(T))
+#define CALLOC_ARRAY(T, N) (T*) cl_calloc(N, sizeof(T))
+#define MEMZERO(x) do { memset((x),0,sizeof(*(x))); } while (0)
+
+/* Run some code and catch errors */
+#define TRY(fn,...) \
+do { \
+ if (UNLIKELY((err = fn(__VA_ARGS__)) != CL_SUCCESS)) \
+ goto error; \
+} while (0)
+
+#define TRY_NO_ERR(fn,...) \
+do { \
+ if (UNLIKELY(fn(__VA_ARGS__) != CL_SUCCESS)) \
+ goto error; \
+} while (0)
+
+#define TRY_ALLOC(dst, EXPR) \
+do { \
+ if (UNLIKELY((dst = EXPR) == NULL)) \
+ DO_ALLOC_ERROR; \
+} while (0)
+
+#define TRY_ALLOC_NO_ERR(dst, EXPR) \
+do { \
+ if (UNLIKELY((dst = EXPR) == NULL)) \
+ goto error; \
+} while (0)
+
+#define TRY_ALLOC_NO_RET(EXPR) \
+do { \
+ if (UNLIKELY((EXPR) == NULL)) \
+ DO_ALLOC_ERROR; \
+} while (0)
+
+/* Break Point Definitions */
+#if !defined(NDEBUG)
+
+#define BREAK \
+do { \
+ __asm__("int3"); \
+} while(0)
+
+#define BREAK_IF(value) \
+do { \
+ if (UNLIKELY(!(value))) BREAKPOINT(); \
+} while(0)
+
+#else
+#define BREAKPOINT() do { } while(0)
+#define ASSERT(value) do { } while(0)
+#endif
+
+/* For all internal functions */
+#define LOCAL __attribute__ ((visibility ("internal")))
+
+/* Align a structure or a variable */
+#define ALIGNED(X) __attribute__ ((aligned (X)))
+
+/* Number of DWORDS */
+#define SIZEOF32(X) (sizeof(X) / sizeof(uint32_t))
+
+/* Memory quantity */
+#define KB 1024
+#define MB (KB*KB)
+
+/* To help bitfield definitions */
+#define BITFIELD_BIT(X) 1
+#define BITFIELD_RANGE(X,Y) ((Y) - (X) + 1)
+
+/* 32 bits atomic variable */
+typedef volatile int atomic_t;
+
+static INLINE int atomic_add(atomic_t *v, const int c) {
+ register int i = c;
+ __asm__ __volatile__("lock ; xaddl %0, %1;"
+ : "+r"(i), "+m"(*v)
+ : "m"(*v), "r"(i));
+ return i;
+}
+
+static INLINE int atomic_inc(atomic_t *v) { return atomic_add(v, 1); }
+static INLINE int atomic_dec(atomic_t *v) { return atomic_add(v, -1); }
+
+#endif /* __CL_UTILS_H__ */
+
diff --git a/src/intel/intel_batchbuffer.c b/src/intel/intel_batchbuffer.c
new file mode 100644
index 0000000..d3da3cc
--- /dev/null
+++ b/src/intel/intel_batchbuffer.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**************************************************************************
+ *
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "intel/intel_batchbuffer.h"
+#include "intel/intel_driver.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+LOCAL int
+intel_batchbuffer_reset(intel_batchbuffer_t *batch, size_t sz)
+{
+ if (batch->buffer != NULL) {
+ dri_bo_unreference(batch->buffer);
+ batch->buffer = NULL;
+ batch->last_bo = NULL;
+ }
+
+ batch->buffer = dri_bo_alloc(batch->intel->bufmgr,
+ "batch buffer",
+ sz,
+ 64);
+ if (!batch->buffer || (dri_bo_map(batch->buffer, 1) != 0)) {
+ if (batch->buffer)
+ dri_bo_unreference(batch->buffer);
+ batch->buffer = NULL;
+ return -1;
+ }
+ batch->map = (uint8_t*) batch->buffer->virtual;
+ batch->size = sz;
+ batch->ptr = batch->map;
+ batch->atomic = 0;
+ batch->last_bo = batch->buffer;
+ batch->enable_slm = 0;
+ return 0;
+}
+
+LOCAL void
+intel_batchbuffer_init(intel_batchbuffer_t *batch, intel_driver_t *intel)
+{
+ assert(intel);
+ batch->intel = intel;
+}
+
+LOCAL void
+intel_batchbuffer_terminate(intel_batchbuffer_t *batch)
+{
+ assert(batch->buffer);
+
+ if (batch->map) {
+ dri_bo_unmap(batch->buffer);
+ batch->map = NULL;
+ }
+
+ dri_bo_unreference(batch->buffer);
+ batch->buffer = NULL;
+}
+
+LOCAL void
+intel_batchbuffer_flush(intel_batchbuffer_t *batch)
+{
+ uint32_t used = batch->ptr - batch->map;
+ int is_locked = batch->intel->locked;
+
+ if (used == 0)
+ return;
+
+ if ((used & 4) == 0) {
+ *(uint32_t*) batch->ptr = 0;
+ batch->ptr += 4;
+ }
+
+ *(uint32_t*)batch->ptr = MI_BATCH_BUFFER_END;
+ batch->ptr += 4;
+ dri_bo_unmap(batch->buffer);
+ used = batch->ptr - batch->map;
+
+ if (!is_locked)
+ intel_driver_lock_hardware(batch->intel);
+
+ int flag = I915_EXEC_RENDER;
+ if(batch->enable_slm) {
+ /* use the hard code here temp, must change to
+ * I915_EXEC_ENABLE_SLM when it drm accept the patch */
+ flag |= (1<<13);
+ }
+ drm_intel_gem_bo_context_exec(batch->buffer, batch->intel->ctx, used, flag);
+
+ if (!is_locked)
+ intel_driver_unlock_hardware(batch->intel);
+
+ // Release the buffer
+ intel_batchbuffer_terminate(batch);
+}
+
+LOCAL void
+intel_batchbuffer_emit_reloc(intel_batchbuffer_t *batch,
+ dri_bo *bo,
+ uint32_t read_domains,
+ uint32_t write_domains,
+ uint32_t delta)
+{
+ assert(batch->ptr - batch->map < batch->size);
+ dri_bo_emit_reloc(batch->buffer,
+ read_domains,
+ write_domains,
+ delta,
+ batch->ptr - batch->map,
+ bo);
+ intel_batchbuffer_emit_dword(batch, bo->offset + delta);
+}
+
+LOCAL void
+intel_batchbuffer_emit_mi_flush(intel_batchbuffer_t *batch)
+{
+ intel_batchbuffer_require_space(batch, 4);
+ intel_batchbuffer_emit_dword(batch, MI_FLUSH | STATE_INSTRUCTION_CACHE_INVALIDATE);
+}
+
+LOCAL intel_batchbuffer_t*
+intel_batchbuffer_new(intel_driver_t *intel)
+{
+ intel_batchbuffer_t *batch = NULL;
+ assert(intel);
+ TRY_ALLOC_NO_ERR (batch, CALLOC(intel_batchbuffer_t));
+ intel_batchbuffer_init(batch, intel);
+
+exit:
+ return batch;
+error:
+ intel_batchbuffer_delete(batch);
+ batch = NULL;
+ goto exit;
+}
+
+LOCAL void
+intel_batchbuffer_delete(intel_batchbuffer_t *batch)
+{
+ if (batch == NULL)
+ return;
+ if(batch->buffer)
+ intel_batchbuffer_terminate(batch);
+
+ cl_free(batch);
+}
diff --git a/src/intel/intel_batchbuffer.h b/src/intel/intel_batchbuffer.h
new file mode 100644
index 0000000..4c28a7c
--- /dev/null
+++ b/src/intel/intel_batchbuffer.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**************************************************************************
+ *
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+#ifndef _INTEL_BATCHBUFFER_H_
+#define _INTEL_BATCHBUFFER_H_
+
+#include "intel_defines.h"
+#include "cl_utils.h"
+
+#include <xf86drm.h>
+#include <drm.h>
+#include <i915_drm.h>
+#include <intel_bufmgr.h>
+#include <stdint.h>
+#include <memory.h>
+#include <assert.h>
+
+#define BEGIN_BATCH(b, n) do { \
+ intel_batchbuffer_require_space(b, (n) * 4); \
+} while (0)
+
+#define OUT_BATCH(b, d) do { \
+ intel_batchbuffer_emit_dword(b, d); \
+} while (0)
+
+#define OUT_RELOC(b, bo, read_domains, write_domain, delta) do { \
+ assert((delta) >= 0); \
+ intel_batchbuffer_emit_reloc(b, bo, read_domains, write_domain, delta); \
+} while (0)
+
+#define ADVANCE_BATCH(b) do { } while (0)
+
+struct intel_driver;
+
+typedef struct intel_batchbuffer
+{
+ struct intel_driver *intel;
+ drm_intel_bo *buffer;
+ /** Last bo submitted to the hardware. used for clFinish. */
+ drm_intel_bo *last_bo;
+ uint32_t size;
+ uint8_t *map;
+ uint8_t *ptr;
+ /** HSW: can't set LRI in batch buffer, set I915_EXEC_ENABLE_SLM
+ * flag when call exec. */
+ uint8_t enable_slm;
+ int atomic;
+} intel_batchbuffer_t;
+
+extern intel_batchbuffer_t* intel_batchbuffer_new(struct intel_driver*);
+extern void intel_batchbuffer_delete(intel_batchbuffer_t*);
+extern void intel_batchbuffer_emit_reloc(intel_batchbuffer_t*,
+ drm_intel_bo*,
+ uint32_t read_domains,
+ uint32_t write_domains,
+ uint32_t delta);
+extern void intel_batchbuffer_emit_mi_flush(intel_batchbuffer_t*);
+extern void intel_batchbuffer_init(intel_batchbuffer_t*, struct intel_driver*);
+extern void intel_batchbuffer_terminate(intel_batchbuffer_t*);
+extern void intel_batchbuffer_flush(intel_batchbuffer_t*);
+extern int intel_batchbuffer_reset(intel_batchbuffer_t*, size_t sz);
+
+static INLINE uint32_t
+intel_batchbuffer_space(const intel_batchbuffer_t *batch)
+{
+ assert(batch->ptr);
+ return batch->size - (batch->ptr - batch->map);
+}
+
+static INLINE void
+intel_batchbuffer_emit_dword(intel_batchbuffer_t *batch, uint32_t x)
+{
+ assert(intel_batchbuffer_space(batch) >= 4);
+ *(uint32_t*)batch->ptr = x;
+ batch->ptr += 4;
+}
+
+static INLINE void
+intel_batchbuffer_require_space(intel_batchbuffer_t *batch, uint32_t size) {
+ assert(size < batch->size - 8);
+ if (intel_batchbuffer_space(batch) < size)
+ intel_batchbuffer_space(batch);
+}
+
+static INLINE uint8_t*
+intel_batchbuffer_alloc_space(intel_batchbuffer_t *batch, uint32_t size)
+{
+ assert(intel_batchbuffer_space(batch) >= size);
+ uint8_t *space_ptr = batch->ptr;
+ batch->ptr += size;
+ return space_ptr;
+}
+
+static INLINE void
+intel_batchbuffer_start_atomic(intel_batchbuffer_t *batch, uint32_t size)
+{
+ assert(!batch->atomic);
+ intel_batchbuffer_require_space(batch, size);
+ batch->atomic = 1;
+}
+
+static INLINE void
+intel_batchbuffer_end_atomic(intel_batchbuffer_t *batch)
+{
+ assert(batch->atomic);
+ batch->atomic = 0;
+}
+
+#endif /* _INTEL_BATCHBUFFER_H_ */
+
diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
new file mode 100644
index 0000000..02ffde4
--- /dev/null
+++ b/src/intel/intel_defines.h
@@ -0,0 +1,339 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith at tungstengraphics.com>
+ */
+#ifndef __GENX_DEFINES_H__
+#define __GENX_DEFINES_H__
+
+#define CMD(PIPELINE,OP,SUB_OP) ((3 << 29) | \
+ ((PIPELINE) << 27) | \
+ ((OP) << 24) | \
+ ((SUB_OP) << 16))
+
+#define CMD_URB_FENCE CMD(0, 0, 0)
+#define CMD_CS_URB_STATE CMD(0, 0, 1)
+#define CMD_CONSTANT_BUFFER CMD(0, 0, 2)
+#define CMD_STATE_PREFETCH CMD(0, 0, 3)
+#define CMD_MEDIA_GATEWAY_STATE CMD(2, 0, 3)
+#define CMD_MEDIA_STATE_FLUSH CMD(2, 0, 4)
+#define CMD_GPGPU_WALKER CMD(2, 1, 5)
+#define CMD_PIPE_CONTROL CMD(3, 2, 0)
+
+#define CMD_LOAD_REGISTER_IMM (0x22 << 23)
+
+#define CMD_STATE_BASE_ADDRESS CMD(0, 1, 1)
+#define CMD_STATE_SIP CMD(0, 1, 2)
+#define CMD_PIPELINE_SELECT CMD(1, 1, 4)
+#define CMD_SAMPLER_PALETTE_LOAD CMD(3, 1, 2)
+
+#define CMD_MEDIA_STATE_POINTERS CMD(2, 0, 0)
+#define CMD_MEDIA CMD(2, 1, 0)
+#define CMD_MEDIA_EX CMD(2, 1, 1)
+
+#define CMD_PIPELINED_POINTERS CMD(3, 0, 0)
+#define CMD_BINDING_TABLE_POINTERS CMD(3, 0, 1)
+#define CMD_VERTEX_BUFFERS CMD(3, 0, 8)
+#define CMD_VERTEX_ELEMENTS CMD(3, 0, 9)
+#define CMD_DRAWING_RECTANGLE CMD(3, 1, 0)
+#define CMD_CONSTANT_COLOR CMD(3, 1, 1)
+#define CMD_3DPRIMITIVE CMD(3, 3, 0)
+
+#define BASE_ADDRESS_MODIFY (1 << 0)
+
+#define PIPELINE_SELECT_3D 0
+#define PIPELINE_SELECT_MEDIA 1
+
+#define UF0_CS_REALLOC (1 << 13)
+#define UF0_VFE_REALLOC (1 << 12)
+#define UF0_SF_REALLOC (1 << 11)
+#define UF0_CLIP_REALLOC (1 << 10)
+#define UF0_GS_REALLOC (1 << 9)
+#define UF0_VS_REALLOC (1 << 8)
+#define UF1_CLIP_FENCE_SHIFT 20
+#define UF1_GS_FENCE_SHIFT 10
+#define UF1_VS_FENCE_SHIFT 0
+#define UF2_CS_FENCE_SHIFT 20
+#define UF2_VFE_FENCE_SHIFT 10
+#define UF2_SF_FENCE_SHIFT 0
+
+#define FLOATING_POINT_IEEE_754 0
+#define FLOATING_POINT_NON_IEEE_754 1
+
+#define I965_SURFACE_1D 0
+#define I965_SURFACE_2D 1
+#define I965_SURFACE_3D 2
+#define I965_SURFACE_CUBE 3
+#define I965_SURFACE_BUFFER 4
+#define I965_SURFACE_NULL 7
+
+#define I965_SURFACEFORMAT_R32G32B32A32_FLOAT 0x000
+#define I965_SURFACEFORMAT_R32G32B32A32_SINT 0x001
+#define I965_SURFACEFORMAT_R32G32B32A32_UINT 0x002
+#define I965_SURFACEFORMAT_R32G32B32A32_UNORM 0x003
+#define I965_SURFACEFORMAT_R32G32B32A32_SNORM 0x004
+#define I965_SURFACEFORMAT_R64G64_FLOAT 0x005
+#define I965_SURFACEFORMAT_R32G32B32X32_FLOAT 0x006
+#define I965_SURFACEFORMAT_R32G32B32A32_SSCALED 0x007
+#define I965_SURFACEFORMAT_R32G32B32A32_USCALED 0x008
+#define I965_SURFACEFORMAT_R32G32B32_FLOAT 0x040
+#define I965_SURFACEFORMAT_R32G32B32_SINT 0x041
+#define I965_SURFACEFORMAT_R32G32B32_UINT 0x042
+#define I965_SURFACEFORMAT_R32G32B32_UNORM 0x043
+#define I965_SURFACEFORMAT_R32G32B32_SNORM 0x044
+#define I965_SURFACEFORMAT_R32G32B32_SSCALED 0x045
+#define I965_SURFACEFORMAT_R32G32B32_USCALED 0x046
+#define I965_SURFACEFORMAT_R16G16B16A16_UNORM 0x080
+#define I965_SURFACEFORMAT_R16G16B16A16_SNORM 0x081
+#define I965_SURFACEFORMAT_R16G16B16A16_SINT 0x082
+#define I965_SURFACEFORMAT_R16G16B16A16_UINT 0x083
+#define I965_SURFACEFORMAT_R16G16B16A16_FLOAT 0x084
+#define I965_SURFACEFORMAT_R32G32_FLOAT 0x085
+#define I965_SURFACEFORMAT_R32G32_SINT 0x086
+#define I965_SURFACEFORMAT_R32G32_UINT 0x087
+#define I965_SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS 0x088
+#define I965_SURFACEFORMAT_X32_TYPELESS_G8X24_UINT 0x089
+#define I965_SURFACEFORMAT_L32A32_FLOAT 0x08A
+#define I965_SURFACEFORMAT_R32G32_UNORM 0x08B
+#define I965_SURFACEFORMAT_R32G32_SNORM 0x08C
+#define I965_SURFACEFORMAT_R64_FLOAT 0x08D
+#define I965_SURFACEFORMAT_R16G16B16X16_UNORM 0x08E
+#define I965_SURFACEFORMAT_R16G16B16X16_FLOAT 0x08F
+#define I965_SURFACEFORMAT_A32X32_FLOAT 0x090
+#define I965_SURFACEFORMAT_L32X32_FLOAT 0x091
+#define I965_SURFACEFORMAT_I32X32_FLOAT 0x092
+#define I965_SURFACEFORMAT_R16G16B16A16_SSCALED 0x093
+#define I965_SURFACEFORMAT_R16G16B16A16_USCALED 0x094
+#define I965_SURFACEFORMAT_R32G32_SSCALED 0x095
+#define I965_SURFACEFORMAT_R32G32_USCALED 0x096
+#define I965_SURFACEFORMAT_B8G8R8A8_UNORM 0x0C0
+#define I965_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB 0x0C1
+#define I965_SURFACEFORMAT_R10G10B10A2_UNORM 0x0C2
+#define I965_SURFACEFORMAT_R10G10B10A2_UNORM_SRGB 0x0C3
+#define I965_SURFACEFORMAT_R10G10B10A2_UINT 0x0C4
+#define I965_SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM 0x0C5
+#define I965_SURFACEFORMAT_R8G8B8A8_UNORM 0x0C7
+#define I965_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB 0x0C8
+#define I965_SURFACEFORMAT_R8G8B8A8_SNORM 0x0C9
+#define I965_SURFACEFORMAT_R8G8B8A8_SINT 0x0CA
+#define I965_SURFACEFORMAT_R8G8B8A8_UINT 0x0CB
+#define I965_SURFACEFORMAT_R16G16_UNORM 0x0CC
+#define I965_SURFACEFORMAT_R16G16_SNORM 0x0CD
+#define I965_SURFACEFORMAT_R16G16_SINT 0x0CE
+#define I965_SURFACEFORMAT_R16G16_UINT 0x0CF
+#define I965_SURFACEFORMAT_R16G16_FLOAT 0x0D0
+#define I965_SURFACEFORMAT_B10G10R10A2_UNORM 0x0D1
+#define I965_SURFACEFORMAT_B10G10R10A2_UNORM_SRGB 0x0D2
+#define I965_SURFACEFORMAT_R11G11B10_FLOAT 0x0D3
+#define I965_SURFACEFORMAT_R32_SINT 0x0D6
+#define I965_SURFACEFORMAT_R32_UINT 0x0D7
+#define I965_SURFACEFORMAT_R32_FLOAT 0x0D8
+#define I965_SURFACEFORMAT_R24_UNORM_X8_TYPELESS 0x0D9
+#define I965_SURFACEFORMAT_X24_TYPELESS_G8_UINT 0x0DA
+#define I965_SURFACEFORMAT_L16A16_UNORM 0x0DF
+#define I965_SURFACEFORMAT_I24X8_UNORM 0x0E0
+#define I965_SURFACEFORMAT_L24X8_UNORM 0x0E1
+#define I965_SURFACEFORMAT_A24X8_UNORM 0x0E2
+#define I965_SURFACEFORMAT_I32_FLOAT 0x0E3
+#define I965_SURFACEFORMAT_L32_FLOAT 0x0E4
+#define I965_SURFACEFORMAT_A32_FLOAT 0x0E5
+#define I965_SURFACEFORMAT_B8G8R8X8_UNORM 0x0E9
+#define I965_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB 0x0EA
+#define I965_SURFACEFORMAT_R8G8B8X8_UNORM 0x0EB
+#define I965_SURFACEFORMAT_R8G8B8X8_UNORM_SRGB 0x0EC
+#define I965_SURFACEFORMAT_R9G9B9E5_SHAREDEXP 0x0ED
+#define I965_SURFACEFORMAT_B10G10R10X2_UNORM 0x0EE
+#define I965_SURFACEFORMAT_L16A16_FLOAT 0x0F0
+#define I965_SURFACEFORMAT_R32_UNORM 0x0F1
+#define I965_SURFACEFORMAT_R32_SNORM 0x0F2
+#define I965_SURFACEFORMAT_R10G10B10X2_USCALED 0x0F3
+#define I965_SURFACEFORMAT_R8G8B8A8_SSCALED 0x0F4
+#define I965_SURFACEFORMAT_R8G8B8A8_USCALED 0x0F5
+#define I965_SURFACEFORMAT_R16G16_SSCALED 0x0F6
+#define I965_SURFACEFORMAT_R16G16_USCALED 0x0F7
+#define I965_SURFACEFORMAT_R32_SSCALED 0x0F8
+#define I965_SURFACEFORMAT_R32_USCALED 0x0F9
+#define I965_SURFACEFORMAT_B5G6R5_UNORM 0x100
+#define I965_SURFACEFORMAT_B5G6R5_UNORM_SRGB 0x101
+#define I965_SURFACEFORMAT_B5G5R5A1_UNORM 0x102
+#define I965_SURFACEFORMAT_B5G5R5A1_UNORM_SRGB 0x103
+#define I965_SURFACEFORMAT_B4G4R4A4_UNORM 0x104
+#define I965_SURFACEFORMAT_B4G4R4A4_UNORM_SRGB 0x105
+#define I965_SURFACEFORMAT_R8G8_UNORM 0x106
+#define I965_SURFACEFORMAT_R8G8_SNORM 0x107
+#define I965_SURFACEFORMAT_R8G8_SINT 0x108
+#define I965_SURFACEFORMAT_R8G8_UINT 0x109
+#define I965_SURFACEFORMAT_R16_UNORM 0x10A
+#define I965_SURFACEFORMAT_R16_SNORM 0x10B
+#define I965_SURFACEFORMAT_R16_SINT 0x10C
+#define I965_SURFACEFORMAT_R16_UINT 0x10D
+#define I965_SURFACEFORMAT_R16_FLOAT 0x10E
+#define I965_SURFACEFORMAT_I16_UNORM 0x111
+#define I965_SURFACEFORMAT_L16_UNORM 0x112
+#define I965_SURFACEFORMAT_A16_UNORM 0x113
+#define I965_SURFACEFORMAT_L8A8_UNORM 0x114
+#define I965_SURFACEFORMAT_I16_FLOAT 0x115
+#define I965_SURFACEFORMAT_L16_FLOAT 0x116
+#define I965_SURFACEFORMAT_A16_FLOAT 0x117
+#define I965_SURFACEFORMAT_R5G5_SNORM_B6_UNORM 0x119
+#define I965_SURFACEFORMAT_B5G5R5X1_UNORM 0x11A
+#define I965_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB 0x11B
+#define I965_SURFACEFORMAT_R8G8_SSCALED 0x11C
+#define I965_SURFACEFORMAT_R8G8_USCALED 0x11D
+#define I965_SURFACEFORMAT_R16_SSCALED 0x11E
+#define I965_SURFACEFORMAT_R16_USCALED 0x11F
+#define I965_SURFACEFORMAT_R8_UNORM 0x140
+#define I965_SURFACEFORMAT_R8_SNORM 0x141
+#define I965_SURFACEFORMAT_R8_SINT 0x142
+#define I965_SURFACEFORMAT_R8_UINT 0x143
+#define I965_SURFACEFORMAT_A8_UNORM 0x144
+#define I965_SURFACEFORMAT_I8_UNORM 0x145
+#define I965_SURFACEFORMAT_L8_UNORM 0x146
+#define I965_SURFACEFORMAT_P4A4_UNORM 0x147
+#define I965_SURFACEFORMAT_A4P4_UNORM 0x148
+#define I965_SURFACEFORMAT_R8_SSCALED 0x149
+#define I965_SURFACEFORMAT_R8_USCALED 0x14A
+#define I965_SURFACEFORMAT_R1_UINT 0x181
+#define I965_SURFACEFORMAT_YCRCB_NORMAL 0x182
+#define I965_SURFACEFORMAT_YCRCB_SWAPUVY 0x183
+#define I965_SURFACEFORMAT_BC1_UNORM 0x186
+#define I965_SURFACEFORMAT_BC2_UNORM 0x187
+#define I965_SURFACEFORMAT_BC3_UNORM 0x188
+#define I965_SURFACEFORMAT_BC4_UNORM 0x189
+#define I965_SURFACEFORMAT_BC5_UNORM 0x18A
+#define I965_SURFACEFORMAT_BC1_UNORM_SRGB 0x18B
+#define I965_SURFACEFORMAT_BC2_UNORM_SRGB 0x18C
+#define I965_SURFACEFORMAT_BC3_UNORM_SRGB 0x18D
+#define I965_SURFACEFORMAT_MONO8 0x18E
+#define I965_SURFACEFORMAT_YCRCB_SWAPUV 0x18F
+#define I965_SURFACEFORMAT_YCRCB_SWAPY 0x190
+#define I965_SURFACEFORMAT_DXT1_RGB 0x191
+#define I965_SURFACEFORMAT_FXT1 0x192
+#define I965_SURFACEFORMAT_R8G8B8_UNORM 0x193
+#define I965_SURFACEFORMAT_R8G8B8_SNORM 0x194
+#define I965_SURFACEFORMAT_R8G8B8_SSCALED 0x195
+#define I965_SURFACEFORMAT_R8G8B8_USCALED 0x196
+#define I965_SURFACEFORMAT_R64G64B64A64_FLOAT 0x197
+#define I965_SURFACEFORMAT_R64G64B64_FLOAT 0x198
+#define I965_SURFACEFORMAT_BC4_SNORM 0x199
+#define I965_SURFACEFORMAT_BC5_SNORM 0x19A
+#define I965_SURFACEFORMAT_R16G16B16_UNORM 0x19C
+#define I965_SURFACEFORMAT_R16G16B16_SNORM 0x19D
+#define I965_SURFACEFORMAT_R16G16B16_SSCALED 0x19E
+#define I965_SURFACEFORMAT_R16G16B16_USCALED 0x19F
+#define I965_SURFACEFORMAT_RAW 0x1FF
+
+#define I965_MAPFILTER_NEAREST 0x0
+#define I965_MAPFILTER_LINEAR 0x1
+#define I965_MAPFILTER_ANISOTROPIC 0x2
+
+#define I965_MIPFILTER_NONE 0
+#define I965_MIPFILTER_NEAREST 1
+#define I965_MIPFILTER_LINEAR 3
+
+#define I965_TEXCOORDMODE_WRAP 0
+#define I965_TEXCOORDMODE_MIRROR 1
+#define I965_TEXCOORDMODE_CLAMP 2
+#define I965_TEXCOORDMODE_CUBE 3
+#define I965_TEXCOORDMODE_CLAMP_BORDER 4
+#define I965_TEXCOORDMODE_MIRROR_ONCE 5
+
+#define I965_SURFACERETURNFORMAT_FLOAT32 0
+#define I965_SURFACERETURNFORMAT_S1 1
+
+#define I965_TILEWALK_XMAJOR 0
+#define I965_TILEWALK_YMAJOR 1
+
+#define I965_SURCHAN_SELECT_ZERO 0
+#define I965_SURCHAN_SELECT_ONE 1
+#define I965_SURCHAN_SELECT_RED 4
+#define I965_SURCHAN_SELECT_GREEN 5
+#define I965_SURCHAN_SELECT_BLUE 6
+#define I965_SURCHAN_SELECT_ALPHA 7
+
+#define URB_SIZE(intel) (IS_IGDNG(intel->device_id) ? 1024 : \
+ IS_G4X(intel->device_id) ? 384 : 256)
+
+// L3 cache stuff
+#define GEN7_L3_SQC_REG1_ADDRESS_OFFSET (0XB010)
+#define GEN7_L3_CNTL_REG2_ADDRESS_OFFSET (0xB020)
+#define GEN7_L3_CNTL_REG3_ADDRESS_OFFSET (0xB024)
+
+// To issue pipe controls (reset L3 / SLM or stall)
+#define GEN7_PIPE_CONTROL_MEDIA 0x2
+#define GEN7_PIPE_CONTROL_3D 0x3
+#define GEN7_PIPE_CONTROL_INSTRUCTION_GFX 0x3
+#define GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL 0x2
+#define GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL 0x0
+#define GEN7_PIPE_CONTROL_WRITE_TIMESTAMP (3 << 14)
+#define GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE (1 << 2)
+
+
+#define GEN_MAPFILTER_NEAREST 0x0
+#define GEN_MAPFILTER_LINEAR 0x1
+#define GEN_MAPFILTER_ANISOTROPIC 0x2
+
+#define GEN_MIPFILTER_NONE 0
+#define GEN_MIPFILTER_NEAREST 1
+#define GEN_MIPFILTER_LINEAR 3
+
+#define GEN_ADDRESS_ROUNDING_ENABLE_U_MAG 0x20
+#define GEN_ADDRESS_ROUNDING_ENABLE_U_MIN 0x10
+#define GEN_ADDRESS_ROUNDING_ENABLE_V_MAG 0x08
+#define GEN_ADDRESS_ROUNDING_ENABLE_V_MIN 0x04
+#define GEN_ADDRESS_ROUNDING_ENABLE_R_MAG 0x02
+#define GEN_ADDRESS_ROUNDING_ENABLE_R_MIN 0x01
+
+#define GEN_TEXCOORDMODE_WRAP 0
+#define GEN_TEXCOORDMODE_MIRROR 1
+#define GEN_TEXCOORDMODE_CLAMP 2
+#define GEN_TEXCOORDMODE_CUBE 3
+#define GEN_TEXCOORDMODE_CLAMP_BORDER 4
+#define GEN_TEXCOORDMODE_MIRROR_ONCE 5
+
+#endif /* __GENX_DEFINES_H__ */
+
diff --git a/src/intel/intel_dri_resource_sharing.c b/src/intel/intel_dri_resource_sharing.c
new file mode 100644
index 0000000..188c1fa
--- /dev/null
+++ b/src/intel/intel_dri_resource_sharing.c
@@ -0,0 +1,208 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#define HAVE_PTHREAD 1
+#include <errno.h>
+#include <time.h>
+#include "main/context.h"
+#include "main/renderbuffer.h"
+#include "main/texobj.h"
+#include <stdbool.h>
+#include <string.h>
+#include <drm.h>
+#include <i915_drm.h>
+#include <intel_bufmgr.h>
+#include <GL/internal/dri_interface.h>
+#include "intel_mipmap_tree.h"
+#include "intel_regions.h"
+#include "intel_context.h"
+
+#include "intel_dri_resource_sharing.h"
+#include "intel_dri_resource_sharing_int.h"
+
+#include <dlfcn.h>
+/**
+ * Sets up a DRIImage structure to point to our shared image in a region
+ */
+static bool
+intel_setup_cl_region_from_mipmap_tree(void *driver,
+ struct intel_context *intel,
+ struct intel_mipmap_tree *mt,
+ GLuint level, GLuint zoffset,
+ struct _intel_dri_share_image_region *region)
+{
+ unsigned int draw_x, draw_y;
+ uint32_t mask_x, mask_y;
+ struct intel_region *null_region = (struct intel_region *)NULL;
+
+ intel_miptree_check_level_layer(mt, level, zoffset);
+
+ _intel_region_get_tile_masks(mt->region, &mask_x, &mask_y, false);
+ _intel_miptree_get_image_offset(mt, level, zoffset, &draw_x, &draw_y);
+
+ region->w = mt->level[level].width;
+ region->h = mt->level[level].height;
+ region->tile_x = draw_x & mask_x;
+ region->tile_y = draw_y & mask_y;
+ region->tiling = mt->region->tiling;
+ /* XXX hard code to 1 right now. */
+ region->depth = 1;
+ region->row_pitch = mt->region->pitch;
+
+ region->offset = _intel_region_get_aligned_offset(mt->region,
+ draw_x & ~mask_x,
+ draw_y & ~mask_y,
+ false);
+ if (!_intel_region_flink(mt->region, ®ion->name))
+ return false;
+ _intel_region_reference(&null_region, mt->region);
+ return true;
+}
+
+typedef void
+_mesa_test_texobj_completeness_t( const struct gl_context *ctx,
+ struct gl_texture_object *t );
+_mesa_test_texobj_completeness_t *__mesa_test_texobj_completeness;
+
+typedef struct gl_texture_object *
+_mesa_lookup_texture_t( const struct gl_context *ctx, GLuint id);
+_mesa_lookup_texture_t *__mesa_lookup_texture;
+
+static struct gl_texture_object *
+intel_get_gl_obj_from_texture(void *driver,
+ struct intel_context *intel,
+ GLenum target, GLint level,
+ GLuint texture, GLuint face)
+{
+ struct gl_texture_object *obj;
+ __mesa_lookup_texture = dlsym(driver, "_mesa_lookup_texture");
+ obj = __mesa_lookup_texture(&intel->ctx, texture);
+ if (!obj || obj->Target != target) {
+ return NULL;
+ }
+
+ __mesa_test_texobj_completeness = dlsym(driver, "_mesa_test_texobj_completeness");
+ __mesa_test_texobj_completeness(&intel->ctx, obj);
+ if (!obj->_BaseComplete || (level > 0 && !obj->_MipmapComplete)) {
+ return NULL;
+ }
+
+ if (level < obj->BaseLevel || level > obj->_MaxLevel) {
+ return NULL;
+ }
+
+ return obj;
+}
+
+static GLenum
+get_cl_gl_format(mesa_format format)
+{
+ switch (format) {
+ case MESA_FORMAT_R8G8B8A8_UNORM:
+ return GL_RGBA;
+ case MESA_FORMAT_A8R8G8B8_UNORM:
+ return GL_BGRA;
+ default:
+ return GL_BGRA;
+ }
+}
+
+static bool
+intelAcquireTexture(void *driver, __DRIcontext *context, GLenum target,
+ GLint level, GLuint texture, void *user_data)
+{
+ struct _intel_dri_share_image_region *region = intel_dri_share_image_region(user_data);
+ struct intel_context *intel = context->driverPrivate;
+ struct gl_texture_object *obj;
+ struct intel_texture_object *iobj;
+ /* XXX Always be face 0? */
+ GLuint face = 0;
+
+ obj = intel_get_gl_obj_from_texture(driver, intel, target, level, texture, face);
+ if (obj == NULL)
+ return false;
+ iobj = intel_texture_object(obj);
+ region->gl_format = get_cl_gl_format(obj->Image[face][level]->TexFormat);
+ return intel_setup_cl_region_from_mipmap_tree(driver, intel, iobj->mt, level, 0, region);
+}
+
+static bool
+intelReleaseTexture(void *driver, __DRIcontext *context, GLenum target,
+ GLint level, GLuint texture)
+{
+ struct intel_context *intel = context->driverPrivate;
+ struct gl_texture_object *obj;
+ struct intel_texture_object *iobj;
+ /* XXX Always be face 0? */
+ GLuint face = 0;
+
+ obj = intel_get_gl_obj_from_texture(driver, intel, target, level, texture, face);
+ if (obj == NULL)
+ return false;
+
+ iobj = intel_texture_object(obj);
+ _intel_region_release(&iobj->mt->region);
+ return true;
+}
+
+static bool
+intelAcquireBufferObj(void *driver, __DRIcontext *driContextPriv,
+ GLuint bufobj, void *user_data)
+{
+ return false;
+}
+
+static bool
+intelReleaseBufferObj(void *driver, __DRIcontext *driContextPriv, GLuint bufobj)
+{
+ return false;
+}
+
+static bool
+intelAcquireRenderBuffer(void *driver, __DRIcontext *driContextPriv,
+ GLuint bufobj, void *user_data)
+{
+ return false;
+}
+
+static bool
+intelReleaseRenderBuffer(void *driver, __DRIcontext *driContextPriv, GLuint bufobj)
+{
+ return false;
+}
+
+#include "cl_driver.h"
+void
+intel_set_cl_gl_callbacks(void)
+{
+ cl_gl_acquire_texture = (cl_gl_acquire_texture_cb*)intelAcquireTexture;
+ cl_gl_release_texture = (cl_gl_release_texture_cb*)intelReleaseTexture;
+ cl_gl_acquire_buffer_object = (cl_gl_acquire_buffer_object_cb*)intelAcquireBufferObj;
+ cl_gl_release_buffer_object = (cl_gl_release_buffer_object_cb*)intelReleaseBufferObj;
+ cl_gl_acquire_render_buffer = (cl_gl_acquire_render_buffer_cb*)intelAcquireRenderBuffer;
+ cl_gl_release_render_buffer = (cl_gl_release_render_buffer_cb*)intelReleaseRenderBuffer;
+}
diff --git a/src/intel/intel_dri_resource_sharing.h b/src/intel/intel_dri_resource_sharing.h
new file mode 100644
index 0000000..6d2ce4d
--- /dev/null
+++ b/src/intel/intel_dri_resource_sharing.h
@@ -0,0 +1,39 @@
+#ifndef __INTEL_DRI_RESOURCE_SHARING_H__
+#define __INTEL_DRI_RESOURCE_SHARING_H__
+
+struct _intel_dri_share_image_region {
+ unsigned int name;
+ size_t w;
+ size_t h;
+ size_t depth;
+ size_t pitch;
+ int tiling;
+ size_t offset;
+ size_t tile_x;
+ size_t tile_y;
+ unsigned int gl_format;
+ size_t row_pitch, slice_pitch;
+};
+
+struct _intel_dri_share_buffer_object {
+ unsigned int name;
+ size_t sz;
+ size_t offset;
+};
+
+inline static struct _intel_dri_share_image_region *
+intel_dri_share_image_region(void *user_data)
+{
+ return (struct _intel_dri_share_image_region *)user_data;
+}
+
+inline static struct _intel_dri_share_buffer_object *
+intel_dri_share_buffer_object(void *user_data)
+{
+ return (struct _intel_dri_share_buffer_object *)user_data;
+}
+
+extern void intel_set_cl_gl_callbacks(void);
+
+
+#endif
diff --git a/src/intel/intel_dri_resource_sharing_int.h b/src/intel/intel_dri_resource_sharing_int.h
new file mode 100644
index 0000000..c7b283a
--- /dev/null
+++ b/src/intel/intel_dri_resource_sharing_int.h
@@ -0,0 +1,143 @@
+/*****************************************************************
+ * The following functions are copied from i965 driver, commit
+ * id 292368570a13501dfa95b1b0dd70966caf6ffc6b. Need to keep consistant
+ * with the dri driver installed on current system.
+ *****************************************************************/
+static bool
+_intel_region_flink(struct intel_region *region, uint32_t *name)
+{
+ if (region->name == 0) {
+ if (drm_intel_bo_flink(region->bo, ®ion->name))
+ return false;
+ }
+
+ *name = region->name;
+
+ return true;
+}
+
+#define _DBG(...)
+static void
+_intel_region_release(struct intel_region **region_handle)
+{
+ struct intel_region *region = *region_handle;
+
+ if (region == NULL) {
+ _DBG("%s NULL\n", __FUNCTION__);
+ return;
+ }
+
+ _DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
+
+ ASSERT(region->refcount > 0);
+ region->refcount--;
+
+ if (region->refcount == 0) {
+ drm_intel_bo_unreference(region->bo);
+
+ free(region);
+ }
+ *region_handle = NULL;
+}
+
+static void
+_intel_region_reference(struct intel_region **dst, struct intel_region *src)
+{
+ _DBG("%s: %p(%d) -> %p(%d)\n", __FUNCTION__,
+ *dst, *dst ? (*dst)->refcount : 0, src, src ? src->refcount : 0);
+
+ if (src != *dst) {
+ if (*dst)
+ _intel_region_release(dst);
+
+ if (src)
+ src->refcount++;
+ *dst = src;
+ }
+}
+
+/**
+ * This function computes masks that may be used to select the bits of the X
+ * and Y coordinates that indicate the offset within a tile. If the region is
+ * untiled, the masks are set to 0.
+ */
+static void
+_intel_region_get_tile_masks(struct intel_region *region,
+ uint32_t *mask_x, uint32_t *mask_y,
+ bool map_stencil_as_y_tiled)
+{
+ int cpp = region->cpp;
+ uint32_t tiling = region->tiling;
+
+ if (map_stencil_as_y_tiled)
+ tiling = I915_TILING_Y;
+
+ switch (tiling) {
+ default:
+ assert(false);
+ case I915_TILING_NONE:
+ *mask_x = *mask_y = 0;
+ break;
+ case I915_TILING_X:
+ *mask_x = 512 / cpp - 1;
+ *mask_y = 7;
+ break;
+ case I915_TILING_Y:
+ *mask_x = 128 / cpp - 1;
+ *mask_y = 31;
+ break;
+ }
+}
+
+/**
+ * Compute the offset (in bytes) from the start of the region to the given x
+ * and y coordinate. For tiled regions, caller must ensure that x and y are
+ * multiples of the tile size.
+ */
+static uint32_t
+_intel_region_get_aligned_offset(struct intel_region *region, uint32_t x,
+ uint32_t y, bool map_stencil_as_y_tiled)
+{
+ int cpp = region->cpp;
+ uint32_t pitch = region->pitch;
+ uint32_t tiling = region->tiling;
+
+ if (map_stencil_as_y_tiled) {
+ tiling = I915_TILING_Y;
+
+ /* When mapping a W-tiled stencil buffer as Y-tiled, each 64-high W-tile
+ * gets transformed into a 32-high Y-tile. Accordingly, the pitch of
+ * the resulting region is twice the pitch of the original region, since
+ * each row in the Y-tiled view corresponds to two rows in the actual
+ * W-tiled surface. So we need to correct the pitch before computing
+ * the offsets.
+ */
+ pitch *= 2;
+ }
+
+ switch (tiling) {
+ default:
+ assert(false);
+ case I915_TILING_NONE:
+ return y * pitch + x * cpp;
+ case I915_TILING_X:
+ assert((x % (512 / cpp)) == 0);
+ assert((y % 8) == 0);
+ return y * pitch + x / (512 / cpp) * 4096;
+ case I915_TILING_Y:
+ assert((x % (128 / cpp)) == 0);
+ assert((y % 32) == 0);
+ return y * pitch + x / (128 / cpp) * 4096;
+ }
+}
+
+static void
+_intel_miptree_get_image_offset(struct intel_mipmap_tree *mt,
+ GLuint level, GLuint slice,
+ GLuint *x, GLuint *y)
+{
+ assert(slice < mt->level[level].depth);
+
+ *x = mt->level[level].slice[slice].x_offset;
+ *y = mt->level[level].slice[slice].y_offset;
+}
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
new file mode 100644
index 0000000..deb83c8
--- /dev/null
+++ b/src/intel/intel_driver.c
@@ -0,0 +1,744 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Xiang Haihao <haihao.xiang at intel.com>
+ * Zou Nan hai <nanhai.zou at intel.com>
+ *
+ */
+
+#if defined(HAS_EGL)
+#include "GL/gl.h"
+#include "EGL/egl.h"
+#include "x11/mesa_egl_extension.h"
+#endif
+
+#ifdef HAS_X11
+#include <X11/Xlibint.h>
+#include "x11/dricommon.h"
+#endif
+
+#include "intel_driver.h"
+#include "intel_gpgpu.h"
+#include "intel_batchbuffer.h"
+#include "intel_bufmgr.h"
+#include "cl_mem.h"
+
+#include <assert.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <xf86drm.h>
+#include <stdio.h>
+
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_context.h"
+#include "cl_driver.h"
+#include "cl_device_id.h"
+#include "cl_platform_id.h"
+
+#define SET_BLOCKED_SIGSET(DRIVER) do { \
+ sigset_t bl_mask; \
+ sigfillset(&bl_mask); \
+ sigdelset(&bl_mask, SIGFPE); \
+ sigdelset(&bl_mask, SIGILL); \
+ sigdelset(&bl_mask, SIGSEGV); \
+ sigdelset(&bl_mask, SIGBUS); \
+ sigdelset(&bl_mask, SIGKILL); \
+ pthread_sigmask(SIG_SETMASK, &bl_mask, &(DRIVER)->sa_mask); \
+} while (0)
+
+#define RESTORE_BLOCKED_SIGSET(DRIVER) do { \
+ pthread_sigmask(SIG_SETMASK, &(DRIVER)->sa_mask, NULL); \
+} while (0)
+
+#define PPTHREAD_MUTEX_LOCK(DRIVER) do { \
+ SET_BLOCKED_SIGSET(DRIVER); \
+ pthread_mutex_lock(&(DRIVER)->ctxmutex); \
+} while (0)
+
+#define PPTHREAD_MUTEX_UNLOCK(DRIVER) do { \
+ pthread_mutex_unlock(&(DRIVER)->ctxmutex); \
+ RESTORE_BLOCKED_SIGSET(DRIVER); \
+} while (0)
+
+static void
+intel_driver_delete(intel_driver_t *driver)
+{
+ if (driver == NULL)
+ return;
+
+ if (driver->bufmgr)
+ drm_intel_bufmgr_destroy(driver->bufmgr);
+ cl_free(driver);
+}
+
+static intel_driver_t*
+intel_driver_new(void)
+{
+ intel_driver_t *driver = NULL;
+
+ TRY_ALLOC_NO_ERR (driver, CALLOC(intel_driver_t));
+ driver->fd = -1;
+
+exit:
+ return driver;
+error:
+ intel_driver_delete(driver);
+ driver = NULL;
+ goto exit;
+}
+
+/* just used for maximum relocation number in drm_intel */
+#define BATCH_SIZE 0x4000
+
+static void
+intel_driver_memman_init(intel_driver_t *driver)
+{
+ driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
+ assert(driver->bufmgr);
+ //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
+ drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
+}
+
+static void
+intel_driver_context_init(intel_driver_t *driver)
+{
+ driver->ctx = drm_intel_gem_context_create(driver->bufmgr);
+ assert(driver->ctx);
+}
+
+static void
+intel_driver_context_destroy(intel_driver_t *driver)
+{
+ if(driver->ctx)
+ drm_intel_gem_context_destroy(driver->ctx);
+ driver->ctx = NULL;
+}
+
+static void
+intel_driver_init(intel_driver_t *driver, int dev_fd)
+{
+ driver->fd = dev_fd;
+ driver->locked = 0;
+ pthread_mutex_init(&driver->ctxmutex, NULL);
+#ifndef NDEBUG
+ int res =
+#endif /* NDEBUG */
+ intel_driver_get_param(driver, I915_PARAM_CHIPSET_ID, &driver->device_id);
+ assert(res);
+ intel_driver_memman_init(driver);
+ intel_driver_context_init(driver);
+
+#if EMULATE_GEN
+ driver->gen_ver = EMULATE_GEN;
+ if (EMULATE_GEN == 75)
+ driver->device_id = PCI_CHIP_HASWELL_L; /* we pick L for HSW */
+ else if (EMULATE_GEN == 7)
+ driver->device_id = PCI_CHIP_IVYBRIDGE_GT2; /* we pick GT2 for IVB */
+ else if (EMULATE_GEN == 6)
+ driver->device_id = PCI_CHIP_SANDYBRIDGE_GT2; /* we pick GT2 for SNB */
+ else
+ FATAL ("Unsupported Gen for emulation");
+#else
+ if (IS_GEN75(driver->device_id))
+ driver->gen_ver = 75;
+ else if (IS_GEN7(driver->device_id))
+ driver->gen_ver = 7;
+ else if (IS_GEN6(driver->device_id))
+ driver->gen_ver = 6;
+ else if(IS_IGDNG(driver->device_id))
+ driver->gen_ver = 5;
+ else
+ driver->gen_ver = 4;
+#endif /* EMULATE_GEN */
+}
+
+static cl_int
+intel_driver_open(intel_driver_t *intel, cl_context_prop props)
+{
+ int cardi;
+#ifdef HAS_X11
+ char *driver_name;
+#endif
+ if (props != NULL
+ && props->gl_type != CL_GL_NOSHARE
+ && props->gl_type != CL_GL_GLX_DISPLAY
+ && props->gl_type != CL_GL_EGL_DISPLAY) {
+ fprintf(stderr, "Unsupported gl share type %d.\n", props->gl_type);
+ return CL_INVALID_OPERATION;
+ }
+
+#ifdef HAS_X11
+ intel->x11_display = XOpenDisplay(NULL);
+
+ if(intel->x11_display) {
+ if((intel->dri_ctx = getDRI2State(intel->x11_display,
+ DefaultScreen(intel->x11_display),
+ &driver_name))) {
+ intel_driver_init_shared(intel, intel->dri_ctx);
+ Xfree(driver_name);
+ }
+ else
+ fprintf(stderr, "X server found. dri2 connection failed! \n");
+ }
+#endif
+
+ if(!intel_driver_is_active(intel)) {
+ char card_name[20];
+ for(cardi = 0; cardi < 16; cardi++) {
+ sprintf(card_name, "/dev/dri/renderD%d", 128+cardi);
+ if(intel_driver_init_render(intel, card_name))
+ break;
+ }
+ }
+
+ if(!intel_driver_is_active(intel)) {
+ char card_name[20];
+ for(cardi = 0; cardi < 16; cardi++) {
+ sprintf(card_name, "/dev/dri/card%d", cardi);
+ if(intel_driver_init_master(intel, card_name))
+ break;
+ }
+ }
+
+ if(!intel_driver_is_active(intel)) {
+ fprintf(stderr, "Device open failed, aborting...\n");
+ return CL_DEVICE_NOT_FOUND;
+ }
+
+#ifdef HAS_EGL
+ if (props && props->gl_type == CL_GL_EGL_DISPLAY) {
+ assert(props->egl_display);
+ }
+#endif
+ return CL_SUCCESS;
+}
+
+static void
+intel_driver_close(intel_driver_t *intel)
+{
+#ifdef HAS_X11
+ if(intel->dri_ctx) dri_state_release(intel->dri_ctx);
+ if(intel->x11_display) XCloseDisplay(intel->x11_display);
+#endif
+ if(intel->need_close) {
+ close(intel->fd);
+ intel->need_close = 0;
+ }
+ intel->dri_ctx = NULL;
+ intel->x11_display = NULL;
+ intel->fd = -1;
+}
+
+LOCAL int
+intel_driver_get_param(intel_driver_t *driver, int param, int *value)
+{
+ int ret;
+ struct drm_i915_getparam gp;
+
+ memset(&gp, 0, sizeof(struct drm_i915_getparam));
+ gp.param = param;
+ gp.value = value;
+
+ ret = drmCommandWriteRead(driver->fd, DRM_I915_GETPARAM, &gp, sizeof(gp));
+ return ret == 0;
+}
+
+LOCAL int
+intel_driver_is_active(intel_driver_t *driver) {
+ return driver->fd >= 0;
+}
+
+#ifdef HAS_X11
+LOCAL int
+intel_driver_init_shared(intel_driver_t *driver, dri_state_t *state)
+{
+ assert(state);
+ if(state->driConnectedFlag != DRI2)
+ return 0;
+ intel_driver_init(driver, state->fd);
+ driver->need_close = 0;
+ return 1;
+}
+#endif
+
+LOCAL int
+intel_driver_init_master(intel_driver_t *driver, const char* dev_name)
+{
+ int dev_fd;
+
+ drm_client_t client;
+
+ // usually dev_name = "/dev/dri/card%d"
+ dev_fd = open(dev_name, O_RDWR);
+ if (dev_fd == -1) {
+ fprintf(stderr, "open(\"%s\", O_RDWR) failed: %s\n", dev_name, strerror(errno));
+ return 0;
+ }
+
+ // Check that we're authenticated
+ memset(&client, 0, sizeof(drm_client_t));
+ int ret = ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client);
+ if (ret == -1) {
+ fprintf(stderr, "ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client) failed: %s\n", strerror(errno));
+ close(dev_fd);
+ return 0;
+ }
+
+ if (!client.auth) {
+ fprintf(stderr, "%s not authenticated\n", dev_name);
+ close(dev_fd);
+ return 0;
+ }
+
+ intel_driver_init(driver, dev_fd);
+ driver->need_close = 1;
+
+ return 1;
+}
+
+LOCAL int
+intel_driver_init_render(intel_driver_t *driver, const char* dev_name)
+{
+ int dev_fd;
+
+ // usually dev_name = "/dev/dri/renderD%d"
+ dev_fd = open(dev_name, O_RDWR);
+ if (dev_fd == -1)
+ return 0;
+
+ intel_driver_init(driver, dev_fd);
+ driver->need_close = 1;
+
+ return 1;
+}
+
+LOCAL int
+intel_driver_terminate(intel_driver_t *driver)
+{
+ pthread_mutex_destroy(&driver->ctxmutex);
+
+ if(driver->need_close) {
+ close(driver->fd);
+ driver->need_close = 0;
+ }
+ driver->fd = -1;
+ return 1;
+}
+
+LOCAL void
+intel_driver_lock_hardware(intel_driver_t *driver)
+{
+
+ PPTHREAD_MUTEX_LOCK(driver);
+ assert(!driver->locked);
+ driver->locked = 1;
+}
+
+LOCAL void
+intel_driver_unlock_hardware(intel_driver_t *driver)
+{
+ driver->locked = 0;
+ PPTHREAD_MUTEX_UNLOCK(driver);
+}
+
+LOCAL dri_bo*
+intel_driver_share_buffer(intel_driver_t *driver, const char *sname, uint32_t name)
+{
+ dri_bo *bo = intel_bo_gem_create_from_name(driver->bufmgr,
+ sname,
+ name);
+ return bo;
+}
+
+LOCAL uint32_t
+intel_driver_shared_name(intel_driver_t *driver, dri_bo *bo)
+{
+ uint32_t name;
+ assert(bo);
+ dri_bo_flink(bo, &name);
+ return name;
+}
+/* XXX a null props is ok? */
+static int
+intel_get_device_id(void)
+{
+ intel_driver_t *driver = NULL;
+ int intel_device_id;
+
+ driver = intel_driver_new();
+ assert(driver != NULL);
+ if(UNLIKELY(intel_driver_open(driver, NULL) != CL_SUCCESS)) return INVALID_CHIP_ID;
+ intel_device_id = driver->device_id;
+ intel_driver_context_destroy(driver);
+ intel_driver_close(driver);
+ intel_driver_terminate(driver);
+ intel_driver_delete(driver);
+
+ return intel_device_id;
+}
+
+static void
+cl_intel_driver_delete(intel_driver_t *driver)
+{
+ if (driver == NULL)
+ return;
+ intel_driver_context_destroy(driver);
+ intel_driver_close(driver);
+ intel_driver_terminate(driver);
+ intel_driver_delete(driver);
+}
+
+#include "cl_gbe_loader.h"
+static intel_driver_t*
+cl_intel_driver_new(cl_context_prop props)
+{
+ intel_driver_t *driver = NULL;
+ TRY_ALLOC_NO_ERR (driver, intel_driver_new());
+ if(UNLIKELY(intel_driver_open(driver, props) != CL_SUCCESS)) goto error;
+ intel_driver_open(driver, props);
+exit:
+ return driver;
+error:
+ cl_intel_driver_delete(driver);
+ driver = NULL;
+ goto exit;
+}
+
+static drm_intel_bufmgr*
+intel_driver_get_bufmgr(intel_driver_t *drv)
+{
+ return drv->bufmgr;
+}
+
+static uint32_t
+intel_driver_get_ver(struct intel_driver *drv)
+{
+ return drv->gen_ver;
+}
+
+static size_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; }
+static void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; }
+
+static int get_cl_tiling(uint32_t drm_tiling)
+{
+ switch(drm_tiling) {
+ case I915_TILING_X: return CL_TILE_X;
+ case I915_TILING_Y: return CL_TILE_Y;
+ case I915_TILING_NONE: return CL_NO_TILE;
+ default:
+ assert(0);
+ }
+ return CL_NO_TILE;
+}
+
+#if defined(HAS_EGL)
+#include "intel_dri_resource_sharing.h"
+#include "cl_image.h"
+static int cl_get_clformat_from_texture(GLint tex_format, cl_image_format * cl_format)
+{
+ cl_int ret = CL_SUCCESS;
+
+ switch (tex_format) {
+ case GL_RGBA8:
+ case GL_RGBA:
+ case GL_RGBA16:
+ case GL_RGBA8I:
+ case GL_RGBA16I:
+ case GL_RGBA32I:
+ case GL_RGBA8UI:
+ case GL_RGBA16UI:
+ case GL_RGBA32UI:
+ case GL_RGBA16F:
+ case GL_RGBA32F:
+ cl_format->image_channel_order = CL_RGBA;
+ break;
+ case GL_BGRA:
+ cl_format->image_channel_order = CL_BGRA;
+ break;
+ default:
+ ret = -1;
+ goto error;
+ }
+
+ switch (tex_format) {
+ case GL_RGBA8:
+ case GL_RGBA:
+ case GL_BGRA:
+ cl_format->image_channel_data_type = CL_UNORM_INT8;
+ break;
+ case GL_RGBA16:
+ cl_format->image_channel_data_type = CL_UNORM_INT16;
+ break;
+ case GL_RGBA8I:
+ cl_format->image_channel_data_type = CL_SIGNED_INT8;
+ break;
+ case GL_RGBA16I:
+ cl_format->image_channel_data_type = CL_SIGNED_INT16;
+ break;
+ case GL_RGBA32I:
+ cl_format->image_channel_data_type = CL_SIGNED_INT32;
+ break;
+ case GL_RGBA8UI:
+ cl_format->image_channel_data_type = CL_UNSIGNED_INT8;
+ break;
+ case GL_RGBA16UI:
+ cl_format->image_channel_data_type = CL_UNSIGNED_INT16;
+ break;
+ case GL_RGBA32UI:
+ cl_format->image_channel_data_type = CL_UNSIGNED_INT32;
+ break;
+ case GL_RGBA16F:
+ cl_format->image_channel_data_type = CL_HALF_FLOAT;
+ break;
+ case GL_RGBA32F:
+ cl_format->image_channel_order = CL_FLOAT;
+ break;
+ default:
+ ret = -1;
+ goto error;
+ }
+
+error:
+ return ret;
+}
+
+static int
+get_mem_type_from_target(GLenum texture_target, cl_mem_object_type *type)
+{
+ switch(texture_target) {
+ case GL_TEXTURE_1D: *type = CL_MEM_OBJECT_IMAGE1D; break;
+ case GL_TEXTURE_2D: *type = CL_MEM_OBJECT_IMAGE2D; break;
+ case GL_TEXTURE_3D: *type = CL_MEM_OBJECT_IMAGE3D; break;
+ case GL_TEXTURE_1D_ARRAY: *type = CL_MEM_OBJECT_IMAGE1D_ARRAY; break;
+ case GL_TEXTURE_2D_ARRAY: *type = CL_MEM_OBJECT_IMAGE2D_ARRAY; break;
+ default:
+ return -1;
+ }
+ return CL_SUCCESS;
+}
+
+static cl_buffer
+intel_alloc_buffer_from_texture_egl(cl_context ctx, unsigned int target,
+ int miplevel, unsigned int texture,
+ struct _cl_mem_image *image)
+{
+ cl_buffer bo = (cl_buffer) NULL;
+ struct _intel_dri_share_image_region region;
+ unsigned int bpp, intel_fmt;
+ cl_image_format cl_format;
+ EGLBoolean ret;
+ EGLint attrib_list[] = { EGL_GL_TEXTURE_ID_MESA, texture,
+ EGL_GL_TEXTURE_LEVEL_MESA, miplevel,
+ EGL_GL_TEXTURE_TARGET_MESA, target,
+ EGL_NONE};
+ ret = eglAcquireResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx),
+ EGL_GL_TEXTURE_MESA,
+ &attrib_list[0], ®ion);
+ if (!ret)
+ goto out;
+
+ bo = (cl_buffer)intel_driver_share_buffer((intel_driver_t *)ctx->drv, "rendering buffer", region.name);
+
+ if (bo == NULL) {
+ eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+ goto out;
+ }
+ region.tiling = get_cl_tiling(region.tiling);
+ if (cl_get_clformat_from_texture(region.gl_format, &cl_format) != 0)
+ goto error;
+
+ if (cl_image_byte_per_pixel(&cl_format, &bpp) != CL_SUCCESS)
+ goto error;
+ intel_fmt = cl_image_get_intel_format(&cl_format);
+ if (intel_fmt == INTEL_UNSUPPORTED_FORMAT)
+ goto error;
+ cl_mem_object_type image_type;
+ if (get_mem_type_from_target(target, &image_type) != 0)
+ goto error;
+
+ cl_mem_image_init(image, region.w, region.h,
+ image_type, region.depth, cl_format,
+ intel_fmt, bpp, region.row_pitch,
+ region.slice_pitch, region.tiling,
+ region.tile_x, region.tile_y, region.offset);
+out:
+ return bo;
+
+error:
+ cl_buffer_unreference(bo);
+ eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+ return NULL;
+}
+
+static cl_buffer
+intel_alloc_buffer_from_texture(cl_context ctx, unsigned int target,
+ int miplevel, unsigned int texture,
+ struct _cl_mem_image *image)
+{
+
+ if (IS_EGL_CONTEXT(ctx))
+ return intel_alloc_buffer_from_texture_egl(ctx, target, miplevel, texture, image);
+
+ return NULL;
+}
+
+static int
+intel_release_buffer_from_texture(cl_context ctx, unsigned int target,
+ int miplevel, unsigned int texture)
+{
+ if (IS_EGL_CONTEXT(ctx)) {
+ EGLint attrib_list[] = { EGL_GL_TEXTURE_ID_MESA, texture,
+ EGL_GL_TEXTURE_LEVEL_MESA, miplevel,
+ EGL_GL_TEXTURE_TARGET_MESA, target,
+ EGL_NONE};
+
+ eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+ return CL_SUCCESS;
+ }
+ return -1;
+}
+#endif
+
+cl_buffer intel_share_buffer_from_libva(cl_context ctx,
+ unsigned int bo_name,
+ size_t *sz)
+{
+ drm_intel_bo *intel_bo;
+
+ intel_bo = intel_driver_share_buffer((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+
+ if (sz)
+ *sz = intel_bo->size;
+
+ return (cl_buffer)intel_bo;
+}
+
+cl_buffer intel_share_image_from_libva(cl_context ctx,
+ unsigned int bo_name,
+ struct _cl_mem_image *image,
+ unsigned int offset)
+{
+ drm_intel_bo *intel_bo;
+ uint32_t intel_tiling, intel_swizzle_mode;
+
+ intel_bo = intel_driver_share_buffer((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+
+ intel_bo->offset += offset;
+ drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
+ image->tiling = get_cl_tiling(intel_tiling);
+
+ return (cl_buffer)intel_bo;
+}
+
+static int32_t get_intel_tiling(cl_int tiling, uint32_t *intel_tiling)
+{
+ switch (tiling) {
+ case CL_NO_TILE:
+ *intel_tiling = I915_TILING_NONE;
+ break;
+ case CL_TILE_X:
+ *intel_tiling = I915_TILING_X;
+ break;
+ case CL_TILE_Y:
+ *intel_tiling = I915_TILING_Y;
+ break;
+ default:
+ assert(0);
+ return -1;
+ }
+ return 0;
+}
+
+static int intel_buffer_set_tiling(cl_buffer bo,
+ cl_image_tiling_t tiling, size_t stride)
+{
+ uint32_t intel_tiling;
+ int ret;
+ if (UNLIKELY((get_intel_tiling(tiling, &intel_tiling)) < 0))
+ return -1;
+#ifndef NDEBUG
+ uint32_t required_tiling;
+ required_tiling = intel_tiling;
+#endif
+ ret = drm_intel_bo_set_tiling((drm_intel_bo*)bo, &intel_tiling, stride);
+ assert(intel_tiling == required_tiling);
+ return ret;
+}
+
+LOCAL void
+intel_setup_callbacks(void)
+{
+ cl_driver_new = (cl_driver_new_cb *) cl_intel_driver_new;
+ cl_driver_delete = (cl_driver_delete_cb *) cl_intel_driver_delete;
+ cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver;
+ cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
+ cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
+ cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
+ cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
+#if defined(HAS_EGL)
+ cl_buffer_alloc_from_texture = (cl_buffer_alloc_from_texture_cb *) intel_alloc_buffer_from_texture;
+ cl_buffer_release_from_texture = (cl_buffer_release_from_texture_cb *) intel_release_buffer_from_texture;
+ intel_set_cl_gl_callbacks();
+#endif
+ cl_buffer_get_buffer_from_libva = (cl_buffer_get_buffer_from_libva_cb *) intel_share_buffer_from_libva;
+ cl_buffer_get_image_from_libva = (cl_buffer_get_image_from_libva_cb *) intel_share_image_from_libva;
+ cl_buffer_reference = (cl_buffer_reference_cb *) drm_intel_bo_reference;
+ cl_buffer_unreference = (cl_buffer_unreference_cb *) drm_intel_bo_unreference;
+ cl_buffer_map = (cl_buffer_map_cb *) drm_intel_bo_map;
+ cl_buffer_unmap = (cl_buffer_unmap_cb *) drm_intel_bo_unmap;
+ cl_buffer_map_gtt = (cl_buffer_map_gtt_cb *) drm_intel_gem_bo_map_gtt;
+ cl_buffer_unmap_gtt = (cl_buffer_unmap_gtt_cb *) drm_intel_gem_bo_unmap_gtt;
+ cl_buffer_map_gtt_unsync = (cl_buffer_map_gtt_unsync_cb *) drm_intel_gem_bo_map_unsynchronized;
+ cl_buffer_get_virtual = (cl_buffer_get_virtual_cb *) drm_intel_bo_get_virtual;
+ cl_buffer_get_size = (cl_buffer_get_size_cb *) drm_intel_bo_get_size;
+ cl_buffer_pin = (cl_buffer_pin_cb *) drm_intel_bo_pin;
+ cl_buffer_unpin = (cl_buffer_unpin_cb *) drm_intel_bo_unpin;
+ cl_buffer_subdata = (cl_buffer_subdata_cb *) drm_intel_bo_subdata;
+ cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering;
+ cl_buffer_get_fd = (cl_buffer_get_fd_cb *) drm_intel_bo_gem_export_to_prime;
+ intel_set_gpgpu_callbacks(intel_get_device_id());
+}
diff --git a/src/intel/intel_driver.h b/src/intel/intel_driver.h
new file mode 100644
index 0000000..107fdfc
--- /dev/null
+++ b/src/intel/intel_driver.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef _INTEL_DRIVER_H_
+#define _INTEL_DRIVER_H_
+
+#include "cl_device_data.h"
+
+#include <stdint.h>
+#include <pthread.h>
+#include <signal.h>
+
+#include <xf86drm.h>
+#include <drm.h>
+#include <i915_drm.h>
+#include <intel_bufmgr.h>
+
+#define CMD_MI (0x0 << 29)
+#define CMD_2D (0x2 << 29)
+
+#define MI_NOOP (CMD_MI | 0)
+#define MI_BATCH_BUFFER_END (CMD_MI | (0xA << 23))
+#define MI_FLUSH (CMD_MI | (0x4 << 23))
+#define STATE_INSTRUCTION_CACHE_INVALIDATE (0x1 << 0)
+
+#define XY_COLOR_BLT_CMD (CMD_2D | (0x50 << 22) | 0x04)
+#define XY_COLOR_BLT_WRITE_ALPHA (1 << 21)
+#define XY_COLOR_BLT_WRITE_RGB (1 << 20)
+#define XY_COLOR_BLT_DST_TILED (1 << 11)
+
+/* BR13 */
+#define BR13_565 (0x1 << 24)
+#define BR13_8888 (0x3 << 24)
+
+struct dri_state;
+typedef struct _XDisplay Display;
+
+typedef struct intel_driver
+{
+ dri_bufmgr *bufmgr;
+ drm_intel_context *ctx;
+ int fd;
+ int device_id;
+ int gen_ver;
+ sigset_t sa_mask;
+ pthread_mutex_t ctxmutex;
+ int locked;
+ int need_close;
+ Display *x11_display;
+ struct dri_state *dri_ctx;
+} intel_driver_t;
+
+/* device control */
+extern void intel_driver_lock_hardware(intel_driver_t*);
+extern void intel_driver_unlock_hardware(intel_driver_t*);
+
+/* methods working in shared mode */
+extern dri_bo* intel_driver_share_buffer(intel_driver_t*, const char *sname, uint32_t name);
+extern uint32_t intel_driver_shared_name(intel_driver_t*, dri_bo*);
+
+/* init driver shared with X using dri state, acquired from X Display */
+extern int intel_driver_init_shared(intel_driver_t*, struct dri_state*);
+
+/* init driver in master mode (when X is not using the card)
+ * usually dev_name = "/dev/dri/card0"
+ */
+extern int intel_driver_init_master(intel_driver_t*, const char* dev_name);
+
+/* init driver for render node */
+extern int intel_driver_init_render(intel_driver_t*, const char* dev_name);
+
+/* terminate driver and all underlying structures */
+extern int intel_driver_terminate(intel_driver_t*);
+
+/* simple check if driver was initialized (checking fd should suffice) */
+extern int intel_driver_is_active(intel_driver_t*);
+
+/* query device parameters using driver ioctl */
+extern int intel_driver_get_param(intel_driver_t*, int param, int *value);
+
+/* init the call backs used by the ocl driver */
+extern void intel_setup_callbacks(void);
+
+#endif /* _INTEL_DRIVER_H_ */
+
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
new file mode 100644
index 0000000..c4b9156
--- /dev/null
+++ b/src/intel/intel_gpgpu.c
@@ -0,0 +1,1513 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ * Alexei Soupikov <alexei.soupikov at intel.com>
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "intel/intel_gpgpu.h"
+#include "intel/intel_defines.h"
+#include "intel/intel_structs.h"
+#include "intel/intel_batchbuffer.h"
+#include "intel/intel_driver.h"
+#include "program.h" // for BTI_RESERVED_NUM
+
+#include "cl_alloc.h"
+#include "cl_utils.h"
+#include "cl_sampler.h"
+
+#ifndef CL_VERSION_1_2
+#define CL_MEM_OBJECT_IMAGE1D 0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3
+#endif
+
+#define GEN_CMD_MEDIA_OBJECT (0x71000000)
+#define MO_TS_BIT (1 << 24)
+#define MO_RETAIN_BIT (1 << 28)
+#define SAMPLER_STATE_SIZE (16)
+
+#define TIMESTAMP_ADDR 0x2358
+
+/* Stores both binding tables and surface states */
+typedef struct surface_heap {
+ uint32_t binding_table[256];
+ char surface[256][sizeof(gen6_surface_state_t)];
+} surface_heap_t;
+
+typedef struct intel_event {
+ drm_intel_bo *buffer;
+ drm_intel_bo *ts_buf;
+ int status;
+} intel_event_t;
+
+#define MAX_IF_DESC 32
+
+/* We can bind only a limited number of buffers */
+enum { max_buf_n = 128 };
+
+enum { max_img_n = 128};
+
+enum {max_sampler_n = 16 };
+
+/* Handle GPGPU state */
+struct intel_gpgpu
+{
+ void* ker_opaque;
+ size_t global_wk_sz[3];
+ void* printf_info;
+ intel_driver_t *drv;
+ intel_batchbuffer_t *batch;
+ cl_gpgpu_kernel *ker;
+ drm_intel_bo *binded_buf[max_buf_n]; /* all buffers binded for the call */
+ uint32_t target_buf_offset[max_buf_n];/* internal offset for buffers binded for the call */
+ uint32_t binded_offset[max_buf_n]; /* their offsets in the curbe buffer */
+ uint32_t binded_n; /* number of buffers binded */
+
+ unsigned long img_bitmap; /* image usage bitmap. */
+ unsigned int img_index_base; /* base index for image surface.*/
+
+ unsigned long sampler_bitmap; /* sampler usage bitmap. */
+
+ struct { drm_intel_bo *bo; } stack_b;
+ struct { drm_intel_bo *bo; } perf_b;
+ struct { drm_intel_bo *bo; } scratch_b;
+ struct { drm_intel_bo *bo; } constant_b;
+ struct { drm_intel_bo *bo; } time_stamp_b; /* time stamp buffer */
+ struct { drm_intel_bo *bo;
+ drm_intel_bo *ibo;} printf_b; /* the printf buf and index buf*/
+
+ struct { drm_intel_bo *bo; } aux_buf;
+ struct {
+ uint32_t surface_heap_offset;
+ uint32_t curbe_offset;
+ uint32_t idrt_offset;
+ uint32_t sampler_state_offset;
+ uint32_t sampler_border_color_state_offset;
+ } aux_offset;
+
+ uint32_t per_thread_scratch;
+ struct {
+ uint32_t num_cs_entries;
+ uint32_t size_cs_entry; /* size of one entry in 512bit elements */
+ } curb;
+
+ uint32_t max_threads; /* max threads requested by the user */
+};
+
+typedef struct intel_gpgpu intel_gpgpu_t;
+
+typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm);
+intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL;
+
+typedef uint32_t (intel_gpgpu_get_scratch_index_t)(uint32_t size);
+intel_gpgpu_get_scratch_index_t *intel_gpgpu_get_scratch_index = NULL;
+
+typedef void (intel_gpgpu_post_action_t)(intel_gpgpu_t *gpgpu, int32_t flush_mode);
+intel_gpgpu_post_action_t *intel_gpgpu_post_action = NULL;
+
+typedef uint64_t (intel_gpgpu_read_ts_reg_t)(drm_intel_bufmgr *bufmgr);
+intel_gpgpu_read_ts_reg_t *intel_gpgpu_read_ts_reg = NULL;
+
+static void
+intel_gpgpu_sync(void *buf)
+{
+ if (buf)
+ drm_intel_bo_wait_rendering((drm_intel_bo *)buf);
+}
+
+static void *intel_gpgpu_ref_batch_buf(intel_gpgpu_t *gpgpu)
+{
+ if (gpgpu->batch->last_bo)
+ drm_intel_bo_reference(gpgpu->batch->last_bo);
+
+ return gpgpu->batch->last_bo;
+}
+
+static void intel_gpgpu_unref_batch_buf(void *buf)
+{
+ if (buf)
+ drm_intel_bo_unreference((drm_intel_bo *)buf);
+}
+
+static void
+intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
+{
+ if (gpgpu == NULL)
+ return;
+ if(gpgpu->time_stamp_b.bo)
+ drm_intel_bo_unreference(gpgpu->time_stamp_b.bo);
+ if(gpgpu->printf_b.bo)
+ drm_intel_bo_unreference(gpgpu->printf_b.bo);
+ if(gpgpu->printf_b.ibo)
+ drm_intel_bo_unreference(gpgpu->printf_b.ibo);
+ if (gpgpu->aux_buf.bo)
+ drm_intel_bo_unreference(gpgpu->aux_buf.bo);
+ if (gpgpu->perf_b.bo)
+ drm_intel_bo_unreference(gpgpu->perf_b.bo);
+ if (gpgpu->stack_b.bo)
+ drm_intel_bo_unreference(gpgpu->stack_b.bo);
+ if (gpgpu->scratch_b.bo)
+ drm_intel_bo_unreference(gpgpu->scratch_b.bo);
+
+ if(gpgpu->constant_b.bo)
+ drm_intel_bo_unreference(gpgpu->constant_b.bo);
+
+ intel_batchbuffer_delete(gpgpu->batch);
+ cl_free(gpgpu);
+}
+
+static intel_gpgpu_t*
+intel_gpgpu_new(intel_driver_t *drv)
+{
+ intel_gpgpu_t *state = NULL;
+
+ TRY_ALLOC_NO_ERR (state, CALLOC(intel_gpgpu_t));
+ state->drv = drv;
+ state->batch = intel_batchbuffer_new(state->drv);
+ assert(state->batch);
+
+exit:
+ return state;
+error:
+ intel_gpgpu_delete(state);
+ state = NULL;
+ goto exit;
+}
+
+static void
+intel_gpgpu_select_pipeline(intel_gpgpu_t *gpgpu)
+{
+ BEGIN_BATCH(gpgpu->batch, 1);
+ OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen7()
+{
+ return cc_llc_l3;
+}
+
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen75()
+{
+ return llccc_ec | l3cc_ec;
+}
+
+static void
+intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
+{
+ const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */
+ BEGIN_BATCH(gpgpu->batch, 10);
+ OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 8);
+ /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY); /* General State Base Addr */
+ /* 0, State Mem Obj CC */
+ /* We use a state base address for the surface heap since IVB clamp the
+ * binding table pointer at 11 bits. So, we cannot use pointers directly while
+ * using the surface heap
+ */
+ assert(gpgpu->aux_offset.surface_heap_offset % 4096 == 0);
+ OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_INSTRUCTION,
+ I915_GEM_DOMAIN_INSTRUCTION,
+ gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY));
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */
+ /* If we output an AUB file, we limit the total size to 64MB */
+#if USE_FULSIM
+ OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */
+ OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound */
+ OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */
+ OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
+#else
+ OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+ /* According to mesa i965 driver code, we must set the dynamic state access upper bound
+ * to a valid bound value, otherwise, the border color pointer may be rejected and you
+ * may get incorrect border color. This is a known hardware bug. */
+ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+#endif /* USE_FULSIM */
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) {
+ return size / 1024 - 1;
+}
+
+uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) {
+ size = size >> 11;
+ uint32_t index = 0;
+ while((size >>= 1) > 0)
+ index++; //get leading one
+
+ //non pow 2 size
+ if(size & (size - 1)) index++;
+ return index;
+}
+
+static cl_int
+intel_gpgpu_get_max_curbe_size(uint32_t device_id)
+{
+ if (IS_BAYTRAIL_T(device_id) ||
+ IS_IVB_GT1(device_id))
+ return 992;
+ else
+ return 2016;
+}
+
+static cl_int
+intel_gpgpu_get_curbe_size(intel_gpgpu_t *gpgpu)
+{
+ int curbe_size = gpgpu->curb.size_cs_entry * gpgpu->curb.num_cs_entries;
+ int max_curbe_size = intel_gpgpu_get_max_curbe_size(gpgpu->drv->device_id);
+
+ if (curbe_size > max_curbe_size) {
+ fprintf(stderr, "warning, curbe size exceed limitation.\n");
+ return max_curbe_size;
+ } else
+ return curbe_size;
+}
+
+static void
+intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
+{
+ int32_t scratch_index;
+ BEGIN_BATCH(gpgpu->batch, 8);
+ OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
+
+ if(gpgpu->per_thread_scratch > 0) {
+ scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch);
+ OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ scratch_index);
+ }
+ else {
+ OUT_BATCH(gpgpu->batch, 0);
+ }
+ /* max_thread | urb entries | (reset_gateway|bypass_gate_way | gpgpu_mode) */
+ OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (0 << 8) | 0xc4);
+ OUT_BATCH(gpgpu->batch, 0);
+ /* curbe_size */
+ OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu));
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu)
+{
+ BEGIN_BATCH(gpgpu->batch, 4);
+ OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */
+ OUT_BATCH(gpgpu->batch, 0); /* mbz */
+ OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu) * 32);
+ OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.curbe_offset);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu)
+{
+ BEGIN_BATCH(gpgpu->batch, 4);
+ OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
+ OUT_BATCH(gpgpu->batch, 0); /* mbz */
+ OUT_BATCH(gpgpu->batch, 1 << 5);
+ OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.idrt_offset);
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static const uint32_t gpgpu_l3_config_reg1[] = {
+ 0x00080040, 0x02040040, 0x00800040, 0x01000038,
+ 0x02000030, 0x01000038, 0x00000038, 0x00000040,
+ 0x0A140091, 0x09100091, 0x08900091, 0x08900091,
+ 0x010000a1
+};
+
+static const uint32_t gpgpu_l3_config_reg2[] = {
+ 0x00000000, 0x00000000, 0x00080410, 0x00080410,
+ 0x00040410, 0x00040420, 0x00080420, 0x00080020,
+ 0x00204080, 0x00244890, 0x00284490, 0x002444A0,
+ 0x00040810
+};
+
+/* Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. */
+static void
+intel_gpgpu_write_timestamp(intel_gpgpu_t *gpgpu, int idx)
+{
+ BEGIN_BATCH(gpgpu->batch, 5);
+ OUT_BATCH(gpgpu->batch, CMD_PIPE_CONTROL | (5-2));
+ OUT_BATCH(gpgpu->batch, GEN7_PIPE_CONTROL_WRITE_TIMESTAMP);
+ OUT_RELOC(gpgpu->batch, gpgpu->time_stamp_b.bo,
+ I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+ GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE | idx * sizeof(uint64_t));
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0);
+ ADVANCE_BATCH();
+}
+
+static void
+intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
+{
+ gen6_pipe_control_t* pc = (gen6_pipe_control_t*)
+ intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t));
+ memset(pc, 0, sizeof(*pc));
+ pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
+ pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
+ pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
+ pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
+ pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
+ pc->dw1.render_target_cache_flush_enable = 1;
+ pc->dw1.texture_cache_invalidation_enable = 1;
+ pc->dw1.cs_stall = 1;
+ pc->dw1.dc_flush_enable = 1;
+ //pc->dw1.instruction_cache_invalidate_enable = 1;
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
+{
+ BEGIN_BATCH(gpgpu->batch, 9);
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+ OUT_BATCH(gpgpu->batch, 0x00730000);
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+
+ if (use_slm)
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
+ else
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
+ if (use_slm)
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
+ else
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
+ ADVANCE_BATCH(gpgpu->batch);
+
+ intel_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+intel_gpgpu_set_L3_baytrail(intel_gpgpu_t *gpgpu, uint32_t use_slm)
+{
+ BEGIN_BATCH(gpgpu->batch, 9);
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+ OUT_BATCH(gpgpu->batch, 0x00D30000); /* General credit : High credit = 26 : 6 */
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+ if (use_slm)
+ OUT_BATCH(gpgpu->batch, 0x01020021); /* {SLM=64, URB=96, DC=16, RO=16, Sum=192} */
+ else
+ OUT_BATCH(gpgpu->batch, 0x02040040); /* {SLM=0, URB=128, DC=32, RO=32, Sum=192} */
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
+ OUT_BATCH(gpgpu->batch, 0x0); /* {I/S=0, Const=0, Tex=0} */
+
+ ADVANCE_BATCH(gpgpu->batch);
+
+ intel_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm)
+{
+ /* still set L3 in batch buffer for fulsim. */
+ BEGIN_BATCH(gpgpu->batch, 9);
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+ OUT_BATCH(gpgpu->batch, 0x00610000);
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+
+ if (use_slm)
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
+ else
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
+
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
+ if (use_slm)
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
+ else
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
+ ADVANCE_BATCH(gpgpu->batch);
+
+ //if(use_slm)
+ // gpgpu->batch->enable_slm = 1;
+ intel_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
+{
+ intel_batchbuffer_start_atomic(gpgpu->batch, 256);
+ intel_gpgpu_pipe_control(gpgpu);
+ assert(intel_gpgpu_set_L3);
+ intel_gpgpu_set_L3(gpgpu, gpgpu->ker->use_slm);
+ intel_gpgpu_select_pipeline(gpgpu);
+ intel_gpgpu_set_base_address(gpgpu);
+ intel_gpgpu_load_vfe_state(gpgpu);
+ intel_gpgpu_load_curbe_buffer(gpgpu);
+ intel_gpgpu_load_idrt(gpgpu);
+
+ if (gpgpu->perf_b.bo) {
+ BEGIN_BATCH(gpgpu->batch, 3);
+ OUT_BATCH(gpgpu->batch,
+ (0x28 << 23) | /* MI_REPORT_PERF_COUNT */
+ (3 - 2)); /* length-2 */
+ OUT_RELOC(gpgpu->batch, gpgpu->perf_b.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ 0 | /* Offset for the start "counters" */
+ 1); /* Use GTT and not PGTT */
+ OUT_BATCH(gpgpu->batch, 0);
+ ADVANCE_BATCH(gpgpu->batch);
+ }
+
+ /* Insert PIPE_CONTROL for time stamp of start*/
+ if (gpgpu->time_stamp_b.bo)
+ intel_gpgpu_write_timestamp(gpgpu, 0);
+}
+
+static void
+intel_gpgpu_post_action_gen7(intel_gpgpu_t *gpgpu, int32_t flush_mode)
+{
+ if(flush_mode)
+ intel_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+intel_gpgpu_post_action_gen75(intel_gpgpu_t *gpgpu, int32_t flush_mode)
+{
+ /* flush force for set L3 */
+ intel_gpgpu_pipe_control(gpgpu);
+
+ /* Restore L3 control to disable SLM mode,
+ otherwise, may affect 3D pipeline */
+ intel_gpgpu_set_L3(gpgpu, 0);
+}
+
+static void
+intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode)
+{
+ /* Insert PIPE_CONTROL for time stamp of end*/
+ if (gpgpu->time_stamp_b.bo)
+ intel_gpgpu_write_timestamp(gpgpu, 1);
+
+ /* Insert the performance counter command */
+ if (gpgpu->perf_b.bo) {
+ BEGIN_BATCH(gpgpu->batch, 3);
+ OUT_BATCH(gpgpu->batch,
+ (0x28 << 23) | /* MI_REPORT_PERF_COUNT */
+ (3 - 2)); /* length-2 */
+ OUT_RELOC(gpgpu->batch, gpgpu->perf_b.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ 512 | /* Offset for the end "counters" */
+ 1); /* Use GTT and not PGTT */
+ OUT_BATCH(gpgpu->batch, 0);
+ ADVANCE_BATCH(gpgpu->batch);
+ }
+
+ intel_gpgpu_post_action(gpgpu, flush_mode);
+ intel_batchbuffer_end_atomic(gpgpu->batch);
+}
+
+static int
+intel_gpgpu_batch_reset(intel_gpgpu_t *gpgpu, size_t sz)
+{
+ return intel_batchbuffer_reset(gpgpu->batch, sz);
+}
+/* check we do not get a 0 starting address for binded buf */
+static void
+intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu)
+{
+ uint32_t i;
+ for (i = 0; i < gpgpu->binded_n; ++i)
+ assert(gpgpu->binded_buf[i]->offset != 0);
+}
+
+static void
+intel_gpgpu_flush_batch_buffer(intel_batchbuffer_t *batch)
+{
+ assert(batch);
+ intel_batchbuffer_emit_mi_flush(batch);
+ intel_batchbuffer_flush(batch);
+}
+
+static void
+intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
+{
+ if (!gpgpu->batch || !gpgpu->batch->buffer)
+ return;
+ intel_gpgpu_flush_batch_buffer(gpgpu->batch);
+ intel_gpgpu_check_binded_buf_address(gpgpu);
+}
+
+static int
+intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
+ uint32_t max_threads,
+ uint32_t size_cs_entry,
+ int profiling)
+{
+ drm_intel_bo *bo;
+
+ /* Binded buffers */
+ gpgpu->binded_n = 0;
+ gpgpu->img_bitmap = 0;
+ gpgpu->img_index_base = 3;
+ gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
+
+ /* URB */
+ gpgpu->curb.num_cs_entries = 64;
+ gpgpu->curb.size_cs_entry = size_cs_entry;
+ gpgpu->max_threads = max_threads;
+
+ if (gpgpu->printf_b.ibo)
+ dri_bo_unreference(gpgpu->printf_b.ibo);
+ gpgpu->printf_b.ibo = NULL;
+ if (gpgpu->printf_b.bo)
+ dri_bo_unreference(gpgpu->printf_b.bo);
+ gpgpu->printf_b.bo = NULL;
+
+ /* Set the profile buffer*/
+ if(gpgpu->time_stamp_b.bo)
+ dri_bo_unreference(gpgpu->time_stamp_b.bo);
+ gpgpu->time_stamp_b.bo = NULL;
+ if (profiling) {
+ bo = dri_bo_alloc(gpgpu->drv->bufmgr, "timestamp query", 4096, 4096);
+ gpgpu->time_stamp_b.bo = bo;
+ if (!bo)
+ fprintf(stderr, "Could not allocate buffer for profiling.\n");
+ }
+
+ /* stack */
+ if (gpgpu->stack_b.bo)
+ dri_bo_unreference(gpgpu->stack_b.bo);
+ gpgpu->stack_b.bo = NULL;
+
+ /* Set the auxiliary buffer*/
+ uint32_t size_aux = 0;
+ if(gpgpu->aux_buf.bo)
+ dri_bo_unreference(gpgpu->aux_buf.bo);
+ gpgpu->aux_buf.bo = NULL;
+
+ //surface heap must be 4096 bytes aligned because state base address use 20bit for the address
+ size_aux = ALIGN(size_aux, 4096);
+ gpgpu->aux_offset.surface_heap_offset = size_aux;
+ size_aux += sizeof(surface_heap_t);
+
+ //curbe must be 32 bytes aligned
+ size_aux = ALIGN(size_aux, 32);
+ gpgpu->aux_offset.curbe_offset = size_aux;
+ size_aux += gpgpu->curb.num_cs_entries * gpgpu->curb.size_cs_entry * 32;
+
+ //idrt must be 32 bytes aligned
+ size_aux = ALIGN(size_aux, 32);
+ gpgpu->aux_offset.idrt_offset = size_aux;
+ size_aux += MAX_IF_DESC * sizeof(struct gen6_interface_descriptor);
+
+ //sampler state must be 32 bytes aligned
+ size_aux = ALIGN(size_aux, 32);
+ gpgpu->aux_offset.sampler_state_offset = size_aux;
+ size_aux += GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t);
+
+ //sampler border color state must be 32 bytes aligned
+ size_aux = ALIGN(size_aux, 32);
+ gpgpu->aux_offset.sampler_border_color_state_offset = size_aux;
+ size_aux += GEN_MAX_SAMPLERS * sizeof(gen7_sampler_border_color_t);
+
+ bo = dri_bo_alloc(gpgpu->drv->bufmgr, "AUX_BUFFER", size_aux, 0);
+ if (!bo || dri_bo_map(bo, 1) != 0) {
+ fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
+ if (bo)
+ dri_bo_unreference(bo);
+ if (profiling && gpgpu->time_stamp_b.bo)
+ dri_bo_unreference(gpgpu->time_stamp_b.bo);
+ gpgpu->time_stamp_b.bo = NULL;
+ return -1;
+ }
+ memset(bo->virtual, 0, size_aux);
+ gpgpu->aux_buf.bo = bo;
+ return 0;
+}
+
+static void
+intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset)
+{
+ surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+ index * sizeof(gen7_surface_state_t);
+ dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ obj_bo_offset,
+ gpgpu->aux_offset.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen7_surface_state_t, ss1),
+ obj_bo);
+}
+
+static dri_bo*
+intel_gpgpu_alloc_constant_buffer_gen7(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
+{
+ uint32_t s = size - 1;
+ assert(size != 0);
+
+ surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+ gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[bti];
+ memset(ss2, 0, sizeof(gen7_surface_state_t));
+ ss2->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss2->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_UINT;
+ ss2->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+ ss2->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+ ss2->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+ ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+ heap->binding_table[bti] = offsetof(surface_heap_t, surface) + bti* sizeof(gen7_surface_state_t);
+
+ if(gpgpu->constant_b.bo)
+ dri_bo_unreference(gpgpu->constant_b.bo);
+ gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", s, 64);
+ if (gpgpu->constant_b.bo == NULL)
+ return NULL;
+ ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
+ dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ 0,
+ gpgpu->aux_offset.surface_heap_offset +
+ heap->binding_table[bti] +
+ offsetof(gen7_surface_state_t, ss1),
+ gpgpu->constant_b.bo);
+ return gpgpu->constant_b.bo;
+}
+
+static dri_bo*
+intel_gpgpu_alloc_constant_buffer_gen75(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
+{
+ uint32_t s = size - 1;
+ assert(size != 0);
+
+ surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+ gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[bti];
+ memset(ss2, 0, sizeof(gen7_surface_state_t));
+ ss2->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss2->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_UINT;
+ ss2->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+ ss2->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+ ss2->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+ ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+ ss2->ss7.shader_r = I965_SURCHAN_SELECT_RED;
+ ss2->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
+ ss2->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
+ ss2->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
+ heap->binding_table[bti] = offsetof(surface_heap_t, surface) + bti* sizeof(gen7_surface_state_t);
+
+ if(gpgpu->constant_b.bo)
+ dri_bo_unreference(gpgpu->constant_b.bo);
+ gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", s, 64);
+ if (gpgpu->constant_b.bo == NULL)
+ return NULL;
+ ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
+ dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ 0,
+ gpgpu->aux_offset.surface_heap_offset +
+ heap->binding_table[bti] +
+ offsetof(gen7_surface_state_t, ss1),
+ gpgpu->constant_b.bo);
+ return gpgpu->constant_b.bo;
+}
+
+static void
+intel_gpgpu_setup_bti(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset, uint32_t size, unsigned char index)
+{
+ uint32_t s = size - 1;
+ surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+ gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[index];
+ memset(ss0, 0, sizeof(gen7_surface_state_t));
+ ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss0->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+ ss0->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+ ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+ ss0->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+ ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t);
+
+ ss0->ss1.base_addr = buf->offset + internal_offset;
+ dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ internal_offset,
+ gpgpu->aux_offset.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen7_surface_state_t, ss1),
+ buf);
+}
+
+
+static int
+intel_is_surface_array(cl_mem_object_type type)
+{
+ if (type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||
+ type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+ return 1;
+
+ return 0;
+}
+
+static int
+intel_get_surface_type(cl_mem_object_type type)
+{
+ switch (type) {
+ case CL_MEM_OBJECT_IMAGE1D_BUFFER:
+ case CL_MEM_OBJECT_IMAGE1D:
+ case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+ return I965_SURFACE_1D;
+
+ case CL_MEM_OBJECT_IMAGE2D:
+ case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+ return I965_SURFACE_2D;
+
+ case CL_MEM_OBJECT_IMAGE3D:
+ return I965_SURFACE_3D;
+
+ default:
+ assert(0);
+ }
+ return 0;
+}
+
+/* Get fixed surface type. If it is a 1D array image with a large index,
+ we need to fixup it to 2D type due to a Gen7/Gen75's sampler issue
+ on a integer type surface with clamp address mode and nearest filter mode.
+*/
+static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_type type)
+{
+ uint32_t surface_type;
+ if (((IS_IVYBRIDGE(gpgpu->drv->device_id) || IS_HASWELL(gpgpu->drv->device_id))) &&
+ index >= 128 + BTI_RESERVED_NUM &&
+ type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ surface_type = I965_SURFACE_2D;
+ else
+ surface_type = intel_get_surface_type(type);
+ return surface_type;
+}
+
+static void
+intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
+ uint32_t index,
+ dri_bo* obj_bo,
+ uint32_t obj_bo_offset,
+ uint32_t format,
+ cl_mem_object_type type,
+ int32_t w,
+ int32_t h,
+ int32_t depth,
+ int32_t pitch,
+ int32_t tiling)
+{
+ surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+ gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
+
+ memset(ss, 0, sizeof(*ss));
+ ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
+ ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
+ if (intel_is_surface_array(type)) {
+ ss->ss0.surface_array = 1;
+ ss->ss0.surface_array_spacing = 1;
+ }
+ ss->ss0.surface_format = format;
+ ss->ss1.base_addr = obj_bo->offset;
+ ss->ss2.width = w - 1;
+
+ ss->ss2.height = h - 1;
+ ss->ss3.depth = depth - 1;
+ ss->ss4.not_str_buf.rt_view_extent = depth - 1;
+ ss->ss4.not_str_buf.min_array_element = 0;
+ ss->ss3.pitch = pitch - 1;
+ ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+ if (tiling == GPGPU_TILE_X) {
+ ss->ss0.tiled_surface = 1;
+ ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
+ } else if (tiling == GPGPU_TILE_Y) {
+ ss->ss0.tiled_surface = 1;
+ ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
+ }
+ ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
+ intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
+
+ assert(index < GEN_MAX_SURFACES);
+}
+
+static void
+intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
+ uint32_t index,
+ dri_bo* obj_bo,
+ uint32_t obj_bo_offset,
+ uint32_t format,
+ cl_mem_object_type type,
+ int32_t w,
+ int32_t h,
+ int32_t depth,
+ int32_t pitch,
+ int32_t tiling)
+{
+ surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+ gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
+ memset(ss, 0, sizeof(*ss));
+ ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
+ ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
+ if (intel_is_surface_array(type)) {
+ ss->ss0.surface_array = 1;
+ ss->ss0.surface_array_spacing = 1;
+ }
+ ss->ss0.surface_format = format;
+ ss->ss1.base_addr = obj_bo->offset;
+ ss->ss2.width = w - 1;
+ ss->ss2.height = h - 1;
+ ss->ss3.depth = depth - 1;
+ ss->ss4.not_str_buf.rt_view_extent = depth - 1;
+ ss->ss4.not_str_buf.min_array_element = 0;
+ ss->ss3.pitch = pitch - 1;
+ ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+ ss->ss7.shader_r = I965_SURCHAN_SELECT_RED;
+ ss->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
+ ss->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
+ ss->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
+ if (tiling == GPGPU_TILE_X) {
+ ss->ss0.tiled_surface = 1;
+ ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
+ } else if (tiling == GPGPU_TILE_Y) {
+ ss->ss0.tiled_surface = 1;
+ ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
+ }
+ ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
+ intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
+
+ assert(index < GEN_MAX_SURFACES);
+}
+
+static void
+intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
+ uint32_t internal_offset, uint32_t size, uint8_t bti)
+{
+ assert(gpgpu->binded_n < max_buf_n);
+ gpgpu->binded_buf[gpgpu->binded_n] = buf;
+ gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
+ gpgpu->binded_offset[gpgpu->binded_n] = offset;
+ gpgpu->binded_n++;
+ intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti);
+}
+
+static int
+intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
+{
+ drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+ drm_intel_bo* old = gpgpu->scratch_b.bo;
+ uint32_t total = per_thread_size * gpgpu->max_threads;
+ /* Per Bspec, scratch should 2X the desired size, otherwise luxmark may hang */
+ if (IS_HASWELL(gpgpu->drv->device_id))
+ total *= 2;
+
+ gpgpu->per_thread_scratch = per_thread_size;
+
+ if(old && old->size < total) {
+ drm_intel_bo_unreference(old);
+ old = NULL;
+ }
+
+ if(!old && total) {
+ gpgpu->scratch_b.bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO", total, 4096);
+ if (gpgpu->scratch_b.bo == NULL)
+ return -1;
+ }
+ return 0;
+}
+static void
+intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint8_t bti)
+{
+ drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+ gpgpu->stack_b.bo = drm_intel_bo_alloc(bufmgr, "STACK", size, 64);
+
+ intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, 0, size, bti);
+}
+
+static void
+intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
+{
+ gen6_interface_descriptor_t *desc;
+ drm_intel_bo *ker_bo = NULL;
+
+ desc = (gen6_interface_descriptor_t*) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.idrt_offset);
+
+ memset(desc, 0, sizeof(*desc));
+ ker_bo = (drm_intel_bo *) kernel->bo;
+ desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
+ desc->desc1.single_program_flow = 0;
+ desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */
+ desc->desc5.rounding_mode = 0; /* round to nearest even */
+
+ assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0);
+ desc->desc2.sampler_state_pointer = (gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) >> 5;
+ desc->desc3.binding_table_entry_count = 0; /* no prefetch */
+ desc->desc3.binding_table_pointer = 0;
+ desc->desc4.curbe_read_len = kernel->curbe_sz / 32;
+ desc->desc4.curbe_read_offset = 0;
+
+ /* Barriers / SLM are automatically handled on Gen7+ */
+ if (gpgpu->drv->gen_ver == 7 || gpgpu->drv->gen_ver == 75) {
+ size_t slm_sz = kernel->slm_sz;
+ desc->desc5.group_threads_num = kernel->use_slm ? kernel->thread_n : 0;
+ desc->desc5.barrier_enable = kernel->use_slm;
+ if (slm_sz <= 4*KB)
+ slm_sz = 4*KB;
+ else if (slm_sz <= 8*KB)
+ slm_sz = 8*KB;
+ else if (slm_sz <= 16*KB)
+ slm_sz = 16*KB;
+ else if (slm_sz <= 32*KB)
+ slm_sz = 32*KB;
+ else
+ slm_sz = 64*KB;
+ slm_sz = slm_sz >> 12;
+ desc->desc5.slm_sz = slm_sz;
+ }
+ else
+ desc->desc5.group_threads_num = kernel->barrierID; /* BarrierID on GEN6 */
+
+ dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_INSTRUCTION, 0,
+ 0,
+ gpgpu->aux_offset.idrt_offset + offsetof(gen6_interface_descriptor_t, desc0),
+ ker_bo);
+
+ dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_SAMPLER, 0,
+ gpgpu->aux_offset.sampler_state_offset,
+ gpgpu->aux_offset.idrt_offset + offsetof(gen6_interface_descriptor_t, desc2),
+ gpgpu->aux_buf.bo);
+}
+
+static int
+intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
+{
+ unsigned char *curbe = NULL;
+ cl_gpgpu_kernel *k = gpgpu->ker;
+ uint32_t i, j;
+
+ /* Upload the data first */
+ if (dri_bo_map(gpgpu->aux_buf.bo, 1) != 0) {
+ fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
+ return -1;
+ }
+ assert(gpgpu->aux_buf.bo->virtual);
+ curbe = (unsigned char *) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.curbe_offset);
+ memcpy(curbe, data, size);
+
+ /* Now put all the relocations for our flat address space */
+ for (i = 0; i < k->thread_n; ++i)
+ for (j = 0; j < gpgpu->binded_n; ++j) {
+ *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset + gpgpu->target_buf_offset[j];
+ drm_intel_bo_emit_reloc(gpgpu->aux_buf.bo,
+ gpgpu->aux_offset.curbe_offset + gpgpu->binded_offset[j]+i*k->curbe_sz,
+ gpgpu->binded_buf[j],
+ gpgpu->target_buf_offset[j],
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER);
+ }
+ dri_bo_unmap(gpgpu->aux_buf.bo);
+ return 0;
+}
+
+static void
+intel_gpgpu_upload_samplers(intel_gpgpu_t *gpgpu, const void *data, uint32_t n)
+{
+ if (n) {
+ const size_t sz = n * sizeof(gen6_sampler_state_t);
+ memcpy(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset, data, sz);
+ }
+}
+
+int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
+{
+ switch( cl_address_mode ) {
+ case CLK_ADDRESS_NONE:
+ case CLK_ADDRESS_REPEAT:
+ return GEN_TEXCOORDMODE_WRAP;
+ case CLK_ADDRESS_CLAMP:
+ return GEN_TEXCOORDMODE_CLAMP_BORDER;
+ case CLK_ADDRESS_CLAMP_TO_EDGE:
+ return GEN_TEXCOORDMODE_CLAMP;
+ case CLK_ADDRESS_MIRRORED_REPEAT:
+ return GEN_TEXCOORDMODE_MIRROR;
+ default:
+ return GEN_TEXCOORDMODE_WRAP;
+ }
+}
+
+static void
+intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
+{
+ int using_nearest = 0;
+ uint32_t wrap_mode;
+ gen7_sampler_state_t *sampler;
+
+ sampler = (gen7_sampler_state_t *)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset) + index;
+ memset(sampler, 0, sizeof(*sampler));
+ assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) % 32 == 0);
+ sampler->ss2.default_color_pointer = (gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) >> 5;
+ if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
+ sampler->ss3.non_normalized_coord = 1;
+ else
+ sampler->ss3.non_normalized_coord = 0;
+
+ switch (clk_sampler & __CLK_FILTER_MASK) {
+ case CLK_FILTER_NEAREST:
+ sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST;
+ sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
+ sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST;
+ using_nearest = 1;
+ break;
+ case CLK_FILTER_LINEAR:
+ sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR;
+ sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
+ sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR;
+ break;
+ }
+
+ wrap_mode = translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest);
+ sampler->ss3.s_wrap_mode = wrap_mode;
+ /* XXX mesa i965 driver code point out that if the surface is a 1D surface, we may need
+ * to set t_wrap_mode to GEN_TEXCOORDMODE_WRAP. */
+ sampler->ss3.t_wrap_mode = wrap_mode;
+ sampler->ss3.r_wrap_mode = wrap_mode;
+
+ sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+ sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+ sampler->ss0.base_level = 0;
+
+ sampler->ss1.max_lod = 0;
+ sampler->ss1.min_lod = 0;
+
+ if (sampler->ss0.min_filter != GEN_MAPFILTER_NEAREST)
+ sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MIN |
+ GEN_ADDRESS_ROUNDING_ENABLE_V_MIN |
+ GEN_ADDRESS_ROUNDING_ENABLE_R_MIN;
+ if (sampler->ss0.mag_filter != GEN_MAPFILTER_NEAREST)
+ sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
+ GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
+ GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
+
+ dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_SAMPLER, 0,
+ gpgpu->aux_offset.sampler_border_color_state_offset,
+ gpgpu->aux_offset.sampler_state_offset +
+ index * sizeof(gen7_sampler_state_t) +
+ offsetof(gen7_sampler_state_t, ss2),
+ gpgpu->aux_buf.bo);
+
+}
+
+static void
+intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
+{
+ int index;
+ assert(sampler_sz <= GEN_MAX_SAMPLERS);
+ for(index = 0; index < sampler_sz; index++)
+ intel_gpgpu_insert_sampler(gpgpu, index, samplers[index]);
+}
+
+static void
+intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
+{
+ gpgpu->ker = kernel;
+ intel_gpgpu_build_idrt(gpgpu, kernel);
+ dri_bo_unmap(gpgpu->aux_buf.bo);
+}
+
+static void
+intel_gpgpu_set_perf_counters(intel_gpgpu_t *gpgpu, cl_buffer *perf)
+{
+ if (gpgpu->perf_b.bo)
+ drm_intel_bo_unreference(gpgpu->perf_b.bo);
+ drm_intel_bo_reference((drm_intel_bo*) perf);
+ gpgpu->perf_b.bo = (drm_intel_bo*) perf;
+}
+
+static void
+intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
+ uint32_t simd_sz,
+ uint32_t thread_n,
+ const size_t global_wk_off[3],
+ const size_t global_wk_sz[3],
+ const size_t local_wk_sz[3])
+{
+ const uint32_t global_wk_dim[3] = {
+ global_wk_sz[0] / local_wk_sz[0],
+ global_wk_sz[1] / local_wk_sz[1],
+ global_wk_sz[2] / local_wk_sz[2]
+ };
+ uint32_t right_mask = ~0x0;
+ size_t group_sz = local_wk_sz[0] * local_wk_sz[1] * local_wk_sz[2];
+
+ assert(simd_sz == 8 || simd_sz == 16);
+
+ uint32_t shift = (group_sz & (simd_sz - 1));
+ shift = (shift == 0) ? simd_sz : shift;
+ right_mask = (1 << shift) - 1;
+
+ BEGIN_BATCH(gpgpu->batch, 11);
+ OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 9);
+ OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */
+ assert(thread_n <= 64);
+ if (simd_sz == 16)
+ OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
+ else
+ OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
+ OUT_BATCH(gpgpu->batch, right_mask);
+ OUT_BATCH(gpgpu->batch, ~0x0); /* we always set height as 1, so set bottom mask as all 1*/
+ ADVANCE_BATCH(gpgpu->batch);
+
+ BEGIN_BATCH(gpgpu->batch, 2);
+ OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0);
+ OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */
+ ADVANCE_BATCH(gpgpu->batch);
+}
+
+static intel_event_t*
+intel_gpgpu_event_new(intel_gpgpu_t *gpgpu)
+{
+ intel_event_t *event = NULL;
+ TRY_ALLOC_NO_ERR (event, CALLOC(intel_event_t));
+
+ event->buffer = gpgpu->batch->buffer;
+ if (event->buffer)
+ drm_intel_bo_reference(event->buffer);
+ event->status = command_queued;
+
+ if(gpgpu->time_stamp_b.bo) {
+ event->ts_buf = gpgpu->time_stamp_b.bo;
+ drm_intel_bo_reference(event->ts_buf);
+ }
+
+exit:
+ return event;
+error:
+ cl_free(event);
+ event = NULL;
+ goto exit;
+}
+
+/*
+ The upper layer already flushed the batch buffer, just update
+ internal status to command_submitted.
+*/
+static void
+intel_gpgpu_event_flush(intel_event_t *event)
+{
+ assert(event->status == command_queued);
+ event->status = command_running;
+}
+
+static int
+intel_gpgpu_event_update_status(intel_event_t *event, int wait)
+{
+ if(event->status == command_complete)
+ return event->status;
+
+ if (event->buffer &&
+ event->status == command_running &&
+ !drm_intel_bo_busy(event->buffer)) {
+ event->status = command_complete;
+ drm_intel_bo_unreference(event->buffer);
+ event->buffer = NULL;
+ return event->status;
+ }
+
+ if(wait == 0)
+ return event->status;
+
+ if (event->buffer) {
+ drm_intel_bo_wait_rendering(event->buffer);
+ event->status = command_complete;
+ drm_intel_bo_unreference(event->buffer);
+ event->buffer = NULL;
+ }
+ return event->status;
+}
+
+static void
+intel_gpgpu_event_delete(intel_event_t *event)
+{
+ if(event->buffer)
+ drm_intel_bo_unreference(event->buffer);
+ if(event->ts_buf)
+ drm_intel_bo_unreference(event->ts_buf);
+ cl_free(event);
+}
+
+/* IVB and HSW's result MUST shift in x86_64 system */
+static uint64_t
+intel_gpgpu_read_ts_reg_gen7(drm_intel_bufmgr *bufmgr)
+{
+ uint64_t result = 0;
+ drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
+ /* In x86_64 system, the low 32bits of timestamp count are stored in the high 32 bits of
+ result which got from drm_intel_reg_read, and 32-35 bits are lost; but match bspec in
+ i386 system. It seems the kernel readq bug. So shift 32 bit in x86_64, and only remain
+ 32 bits data in i386.
+ */
+#ifdef __i386__
+ return result & 0x0ffffffff;
+#else
+ return result >> 32;
+#endif /* __i386__ */
+}
+
+/* baytrail's result should clear high 4 bits */
+static uint64_t
+intel_gpgpu_read_ts_reg_baytrail(drm_intel_bufmgr *bufmgr)
+{
+ uint64_t result = 0;
+ drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
+ return result & 0x0ffffffff;
+}
+
+/* We want to get the current time of GPU. */
+static void
+intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
+{
+ uint64_t result = 0;
+ drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+
+ /* Get the ts that match the bspec */
+ result = intel_gpgpu_read_ts_reg(bufmgr);
+ result *= 80;
+
+ *ret_ts = result;
+ return;
+}
+
+/* Get the GPU execute time. */
+static void
+intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
+ int index, uint64_t* ret_ts)
+{
+ uint64_t result = 0;
+
+ assert(event->ts_buf != NULL);
+ assert(index == 0 || index == 1);
+ drm_intel_gem_bo_map_gtt(event->ts_buf);
+ uint64_t* ptr = event->ts_buf->virtual;
+ result = ptr[index];
+
+ /* According to BSpec, the timestamp counter should be 36 bits,
+ but comparing to the timestamp counter from IO control reading,
+ we find the first 4 bits seems to be fake. In order to keep the
+ timestamp counter conformable, we just skip the first 4 bits.
+ */
+ result = (result & 0x0FFFFFFFF) * 80; //convert to nanoseconds
+ *ret_ts = result;
+
+ drm_intel_gem_bo_unmap_gtt(event->ts_buf);
+}
+
+static int
+intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i, uint32_t size, uint32_t offset, uint8_t bti)
+{
+ drm_intel_bo *bo = NULL;
+ if (i == 0) { // the index buffer.
+ if (gpgpu->printf_b.ibo)
+ dri_bo_unreference(gpgpu->printf_b.ibo);
+ gpgpu->printf_b.ibo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf index buffer", size, 4096);
+ bo = gpgpu->printf_b.ibo;
+ } else if (i == 1) {
+ if (gpgpu->printf_b.bo)
+ dri_bo_unreference(gpgpu->printf_b.bo);
+ gpgpu->printf_b.bo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf output buffer", size, 4096);
+ bo = gpgpu->printf_b.bo;
+ } else
+ assert(0);
+
+ if (!bo || (drm_intel_bo_map(bo, 1) != 0)) {
+ if (gpgpu->printf_b.bo)
+ drm_intel_bo_unreference(gpgpu->printf_b.bo);
+ gpgpu->printf_b.bo = NULL;
+ fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
+ return -1;
+ }
+ memset(bo->virtual, 0, size);
+ drm_intel_bo_unmap(bo);
+ intel_gpgpu_bind_buf(gpgpu, bo, offset, 0, size, bti);
+ return 0;
+}
+
+static void*
+intel_gpgpu_map_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
+{
+ drm_intel_bo *bo = NULL;
+ if (i == 0) {
+ bo = gpgpu->printf_b.ibo;
+ } else if (i == 1) {
+ bo = gpgpu->printf_b.bo;
+ } else
+ assert(0);
+
+ drm_intel_bo_map(bo, 1);
+ return bo->virtual;
+}
+
+static void
+intel_gpgpu_unmap_printf_buf_addr(intel_gpgpu_t *gpgpu, uint32_t i)
+{
+ drm_intel_bo *bo = NULL;
+ if (i == 0) {
+ bo = gpgpu->printf_b.ibo;
+ } else if (i == 1) {
+ bo = gpgpu->printf_b.bo;
+ } else
+ assert(0);
+
+ drm_intel_bo_unmap(bo);
+}
+
+static void
+intel_gpgpu_release_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
+{
+ if (i == 0) {
+ drm_intel_bo_unreference(gpgpu->printf_b.ibo);
+ gpgpu->printf_b.ibo = NULL;
+ } else if (i == 1) {
+ drm_intel_bo_unreference(gpgpu->printf_b.bo);
+ gpgpu->printf_b.bo = NULL;
+ } else
+ assert(0);
+}
+
+static void
+intel_gpgpu_set_printf_info(intel_gpgpu_t *gpgpu, void* printf_info, size_t * global_sz)
+{
+ gpgpu->printf_info = printf_info;
+ gpgpu->global_wk_sz[0] = global_sz[0];
+ gpgpu->global_wk_sz[1] = global_sz[1];
+ gpgpu->global_wk_sz[2] = global_sz[2];
+}
+
+static void*
+intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu, size_t * global_sz)
+{
+ global_sz[0] = gpgpu->global_wk_sz[0];
+ global_sz[1] = gpgpu->global_wk_sz[1];
+ global_sz[2] = gpgpu->global_wk_sz[2];
+ return gpgpu->printf_info;
+}
+
+LOCAL void
+intel_set_gpgpu_callbacks(int device_id)
+{
+ cl_gpgpu_new = (cl_gpgpu_new_cb *) intel_gpgpu_new;
+ cl_gpgpu_delete = (cl_gpgpu_delete_cb *) intel_gpgpu_delete;
+ cl_gpgpu_sync = (cl_gpgpu_sync_cb *) intel_gpgpu_sync;
+ cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) intel_gpgpu_bind_buf;
+ cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
+ cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
+ cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
+ cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes;
+ cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
+ cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
+ cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset;
+ cl_gpgpu_batch_start = (cl_gpgpu_batch_start_cb *) intel_gpgpu_batch_start;
+ cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
+ cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
+ cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
+ cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
+ cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
+ cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
+ cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush;
+ cl_gpgpu_event_update_status = (cl_gpgpu_event_update_status_cb *)intel_gpgpu_event_update_status;
+ cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
+ cl_gpgpu_event_get_exec_timestamp = (cl_gpgpu_event_get_exec_timestamp_cb *)intel_gpgpu_event_get_exec_timestamp;
+ cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
+ cl_gpgpu_ref_batch_buf = (cl_gpgpu_ref_batch_buf_cb *)intel_gpgpu_ref_batch_buf;
+ cl_gpgpu_unref_batch_buf = (cl_gpgpu_unref_batch_buf_cb *)intel_gpgpu_unref_batch_buf;
+ cl_gpgpu_set_printf_buffer = (cl_gpgpu_set_printf_buffer_cb *)intel_gpgpu_set_printf_buf;
+ cl_gpgpu_map_printf_buffer = (cl_gpgpu_map_printf_buffer_cb *)intel_gpgpu_map_printf_buf;
+ cl_gpgpu_unmap_printf_buffer = (cl_gpgpu_unmap_printf_buffer_cb *)intel_gpgpu_unmap_printf_buf_addr;
+ cl_gpgpu_release_printf_buffer = (cl_gpgpu_release_printf_buffer_cb *)intel_gpgpu_release_printf_buf;
+ cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info;
+ cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info;
+
+ if (IS_HASWELL(device_id)) {
+ cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
+ cl_gpgpu_alloc_constant_buffer = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer_gen75;
+ intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75;
+ cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen75;
+ intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75;
+ intel_gpgpu_post_action = intel_gpgpu_post_action_gen75;
+ intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; //HSW same as ivb
+ }
+ else if (IS_IVYBRIDGE(device_id)) {
+ cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
+ cl_gpgpu_alloc_constant_buffer = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer_gen7;
+ if (IS_BAYTRAIL_T(device_id)) {
+ intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
+ intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
+ } else {
+ intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen7;
+ intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
+ }
+ cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen7;
+ intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen7;
+ intel_gpgpu_post_action = intel_gpgpu_post_action_gen7;
+ }
+}
diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h
new file mode 100644
index 0000000..d593ac7
--- /dev/null
+++ b/src/intel/intel_gpgpu.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ * Alexei Soupikov <alexei.soupikov at intel.com>
+ */
+
+#ifndef __INTEL_GPGPU_H__
+#define __INTEL_GPGPU_H__
+
+#include "cl_utils.h"
+#include "cl_driver.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+
+/* Set the gpgpu related call backs */
+extern void intel_set_gpgpu_callbacks(int device_id);
+
+#endif /* __INTEL_GPGPU_H__ */
+
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
new file mode 100644
index 0000000..ef76bb4
--- /dev/null
+++ b/src/intel/intel_structs.h
@@ -0,0 +1,461 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __INTEL_STRUCTS_H__
+#define __INTEL_STRUCTS_H__
+
+#include <stdint.h>
+
+typedef struct gen6_interface_descriptor
+{
+ struct {
+ uint32_t pad6:6;
+ uint32_t kernel_start_pointer:26;
+ } desc0;
+
+ struct {
+ uint32_t pad:7;
+ uint32_t software_exception:1;
+ uint32_t pad2:3;
+ uint32_t maskstack_exception:1;
+ uint32_t pad3:1;
+ uint32_t illegal_opcode_exception:1;
+ uint32_t pad4:2;
+ uint32_t floating_point_mode:1;
+ uint32_t thread_priority:1;
+ uint32_t single_program_flow:1;
+ uint32_t pad5:1;
+ uint32_t pad6:6;
+ uint32_t pad7:6;
+ } desc1;
+
+ struct {
+ uint32_t pad:2;
+ uint32_t sampler_count:3;
+ uint32_t sampler_state_pointer:27;
+ } desc2;
+
+ struct {
+ uint32_t binding_table_entry_count:5; /* prefetch entries only */
+ uint32_t binding_table_pointer:27; /* 11 bit only on IVB+ */
+ } desc3;
+
+ struct {
+ uint32_t curbe_read_offset:16; /* in GRFs */
+ uint32_t curbe_read_len:16; /* in GRFs */
+ } desc4;
+
+ struct {
+ uint32_t group_threads_num:8; /* 0..64, 0 - no barrier use */
+ uint32_t barrier_return_byte:8;
+ uint32_t slm_sz:5; /* 0..16 - 0K..64K */
+ uint32_t barrier_enable:1;
+ uint32_t rounding_mode:2;
+ uint32_t barrier_return_grf_offset:8;
+ } desc5;
+
+ uint32_t desc6; /* unused */
+ uint32_t desc7; /* unused */
+} gen6_interface_descriptor_t;
+
+typedef struct gen6_surface_state
+{
+ struct {
+ uint32_t cube_pos_z:1;
+ uint32_t cube_neg_z:1;
+ uint32_t cube_pos_y:1;
+ uint32_t cube_neg_y:1;
+ uint32_t cube_pos_x:1;
+ uint32_t cube_neg_x:1;
+ uint32_t pad:2;
+ uint32_t render_cache_read_mode:1;
+ uint32_t cube_map_corner_mode:1;
+ uint32_t mipmap_layout_mode:1;
+ uint32_t vert_line_stride_ofs:1;
+ uint32_t vert_line_stride:1;
+ uint32_t color_blend:1;
+ uint32_t writedisable_blue:1;
+ uint32_t writedisable_green:1;
+ uint32_t writedisable_red:1;
+ uint32_t writedisable_alpha:1;
+ uint32_t surface_format:9;
+ uint32_t data_return_format:1;
+ uint32_t pad0:1;
+ uint32_t surface_type:3;
+ } ss0;
+
+ struct {
+ uint32_t base_addr;
+ } ss1;
+
+ struct {
+ uint32_t render_target_rotation:2;
+ uint32_t mip_count:4;
+ uint32_t width:13;
+ uint32_t height:13;
+ } ss2;
+
+ struct {
+ uint32_t tile_walk:1;
+ uint32_t tiled_surface:1;
+ uint32_t pad:1;
+ uint32_t pitch:18;
+ uint32_t depth:11;
+ } ss3;
+
+ struct {
+ uint32_t multisample_pos_index:3;
+ uint32_t pad:1;
+ uint32_t multisample_count:3;
+ uint32_t pad1:1;
+ uint32_t rt_view_extent:9;
+ uint32_t min_array_elt:11;
+ uint32_t min_lod:4;
+ } ss4;
+
+ struct {
+ uint32_t pad:16;
+ uint32_t cache_control:2; /* different values for GT and IVB */
+ uint32_t gfdt:1; /* allows selective flushing of LLC (e.g. for scanout) */
+ uint32_t encrypted_data:1;
+ uint32_t y_offset:4;
+ uint32_t vertical_alignment:1;
+ uint32_t x_offset:7;
+ } ss5;
+
+ uint32_t ss6; /* unused */
+ uint32_t ss7; /* unused */
+} gen6_surface_state_t;
+
+typedef struct gen7_surface_state
+{
+ struct {
+ uint32_t cube_pos_z:1;
+ uint32_t cube_neg_z:1;
+ uint32_t cube_pos_y:1;
+ uint32_t cube_neg_y:1;
+ uint32_t cube_pos_x:1;
+ uint32_t cube_neg_x:1;
+ uint32_t media_boundary_pixel_mode:2;
+ uint32_t render_cache_rw_mode:1;
+ uint32_t pad1:1;
+ uint32_t surface_array_spacing:1;
+ uint32_t vertical_line_stride_offset:1;
+ uint32_t vertical_line_stride:1;
+ uint32_t tile_walk:1;
+ uint32_t tiled_surface:1;
+ uint32_t horizontal_alignment:1;
+ uint32_t vertical_alignment:2;
+ uint32_t surface_format:9;
+ uint32_t pad0:1;
+ uint32_t surface_array:1;
+ uint32_t surface_type:3;
+ } ss0;
+
+ struct {
+ uint32_t base_addr;
+ } ss1;
+
+ struct {
+ uint32_t width:14;
+ uint32_t pad1:2;
+ uint32_t height:14;
+ uint32_t pad0:2;
+ } ss2;
+
+ struct {
+ uint32_t pitch:18;
+ uint32_t pad0:3;
+ uint32_t depth:11;
+ } ss3;
+
+ union {
+ struct {
+ uint32_t mulsample_pal_idx:3;
+ uint32_t numer_mulsample:3;
+ uint32_t mss_fmt:1;
+ uint32_t rt_view_extent:11;
+ uint32_t min_array_element:11;
+ uint32_t rt_rotate:2;
+ uint32_t pad0:1;
+ } not_str_buf;
+ } ss4;
+
+ struct {
+ uint32_t mip_count:4;
+ uint32_t surface_min_load:4;
+ uint32_t pad2:6;
+ uint32_t coherence_type:1;
+ uint32_t stateless_force_write_thru:1;
+ uint32_t cache_control:4;
+ uint32_t y_offset:4;
+ uint32_t pad0:1;
+ uint32_t x_offset:7;
+ } ss5;
+
+ uint32_t ss6; /* unused */
+
+ struct {
+ uint32_t min_lod:12;
+ uint32_t pad0:4;
+ uint32_t shader_a:3;
+ uint32_t shader_b:3;
+ uint32_t shader_g:3;
+ uint32_t shader_r:3;
+ uint32_t pad1:4;
+ } ss7;
+} gen7_surface_state_t;
+
+STATIC_ASSERT(sizeof(gen6_surface_state_t) == sizeof(gen7_surface_state_t));
+static const size_t surface_state_sz = sizeof(gen6_surface_state_t);
+
+typedef struct gen6_vfe_state_inline
+{
+ struct {
+ uint32_t per_thread_scratch_space:4;
+ uint32_t pad3:3;
+ uint32_t extend_vfe_state_present:1;
+ uint32_t pad2:2;
+ uint32_t scratch_base:22;
+ } vfe0;
+
+ struct {
+ uint32_t debug_counter_control:2;
+ uint32_t gpgpu_mode:1; /* 0 for SNB!!! */
+ uint32_t gateway_mmio_access:2;
+ uint32_t fast_preempt:1;
+ uint32_t bypass_gateway_ctl:1; /* 0 - legacy, 1 - no open/close */
+ uint32_t reset_gateway_timer:1;
+ uint32_t urb_entries:8;
+ uint32_t max_threads:16;
+ } vfe1;
+
+ struct {
+ uint32_t pad8:8;
+ uint32_t debug_object_id:24;
+ } vfe2;
+
+ struct {
+ uint32_t curbe_size:16; /* in GRFs */
+ uint32_t urb_size:16; /* in GRFs */
+ } vfe3;
+
+ struct {
+ uint32_t scoreboard_mask:32; /* 1 - enable the corresponding dependency */
+ } vfe4;
+
+ struct {
+ uint32_t scoreboard0_dx:4;
+ uint32_t scoreboard0_dy:4;
+ uint32_t scoreboard1_dx:4;
+ uint32_t scoreboard1_dy:4;
+ uint32_t scoreboard2_dx:4;
+ uint32_t scoreboard2_dy:4;
+ uint32_t scoreboard3_dx:4;
+ uint32_t scoreboard3_dy:4;
+ } vfe5;
+
+ struct {
+ uint32_t scoreboard4_dx:4;
+ uint32_t scoreboard4_dy:4;
+ uint32_t scoreboard5_dx:4;
+ uint32_t scoreboard5_dy:4;
+ uint32_t scoreboard6_dx:4;
+ uint32_t scoreboard6_dy:4;
+ uint32_t scoreboard7_dx:4;
+ uint32_t scoreboard7_dy:4;
+ } vfe6;
+} gen6_vfe_state_inline_t;
+
+typedef struct gen6_pipe_control
+{
+ struct {
+ uint32_t length : BITFIELD_RANGE(0, 7);
+ uint32_t reserved : BITFIELD_RANGE(8, 15);
+ uint32_t instruction_subopcode : BITFIELD_RANGE(16, 23);
+ uint32_t instruction_opcode : BITFIELD_RANGE(24, 26);
+ uint32_t instruction_pipeline : BITFIELD_RANGE(27, 28);
+ uint32_t instruction_type : BITFIELD_RANGE(29, 31);
+ } dw0;
+
+ struct {
+ uint32_t depth_cache_flush_enable : BITFIELD_BIT(0);
+ uint32_t stall_at_pixel_scoreboard : BITFIELD_BIT(1);
+ uint32_t state_cache_invalidation_enable : BITFIELD_BIT(2);
+ uint32_t constant_cache_invalidation_enable : BITFIELD_BIT(3);
+ uint32_t vf_cache_invalidation_enable : BITFIELD_BIT(4);
+ uint32_t dc_flush_enable : BITFIELD_BIT(5);
+ uint32_t protected_memory_app_id : BITFIELD_BIT(6);
+ uint32_t pipe_control_flush_enable : BITFIELD_BIT(7);
+ uint32_t notify_enable : BITFIELD_BIT(8);
+ uint32_t indirect_state_pointers_disable : BITFIELD_BIT(9);
+ uint32_t texture_cache_invalidation_enable : BITFIELD_BIT(10);
+ uint32_t instruction_cache_invalidate_enable : BITFIELD_BIT(11);
+ uint32_t render_target_cache_flush_enable : BITFIELD_BIT(12);
+ uint32_t depth_stall_enable : BITFIELD_BIT(13);
+ uint32_t post_sync_operation : BITFIELD_RANGE(14, 15);
+ uint32_t generic_media_state_clear : BITFIELD_BIT(16);
+ uint32_t synchronize_gfdt_surface : BITFIELD_BIT(17);
+ uint32_t tlb_invalidate : BITFIELD_BIT(18);
+ uint32_t global_snapshot_count_reset : BITFIELD_BIT(19);
+ uint32_t cs_stall : BITFIELD_BIT(20);
+ uint32_t store_data_index : BITFIELD_BIT(21);
+ uint32_t protected_memory_enable : BITFIELD_BIT(22);
+ uint32_t reserved : BITFIELD_RANGE(23, 31);
+ } dw1;
+
+ struct {
+ uint32_t reserved : BITFIELD_RANGE(0, 1);
+ uint32_t destination_address_type : BITFIELD_BIT(2);
+ uint32_t address : BITFIELD_RANGE(3, 31);
+ } dw2;
+
+ struct {
+ uint32_t data;
+ } dw3;
+
+ struct {
+ uint32_t data;
+ } dw4;
+} gen6_pipe_control_t;
+
+typedef struct gen6_sampler_state
+{
+ struct {
+ uint32_t shadow_function:3;
+ uint32_t lod_bias:11;
+ uint32_t min_filter:3;
+ uint32_t mag_filter:3;
+ uint32_t mip_filter:2;
+ uint32_t base_level:5;
+ uint32_t min_mag_neq:1;
+ uint32_t lod_preclamp:1;
+ uint32_t default_color_mode:1;
+ uint32_t pad0:1;
+ uint32_t disable:1;
+ } ss0;
+
+ struct {
+ uint32_t r_wrap_mode:3;
+ uint32_t t_wrap_mode:3;
+ uint32_t s_wrap_mode:3;
+ uint32_t cube_control_mode:1;
+ uint32_t pad:2;
+ uint32_t max_lod:10;
+ uint32_t min_lod:10;
+ } ss1;
+
+ struct {
+ uint32_t pad:5;
+ uint32_t default_color_pointer:27;
+ } ss2;
+
+ struct {
+ uint32_t non_normalized_coord:1;
+ uint32_t pad:12;
+ uint32_t address_round:6;
+ uint32_t max_aniso:3;
+ uint32_t chroma_key_mode:1;
+ uint32_t chroma_key_index:2;
+ uint32_t chroma_key_enable:1;
+ uint32_t monochrome_filter_width:3;
+ uint32_t monochrome_filter_height:3;
+ } ss3;
+} gen6_sampler_state_t;
+
+typedef struct gen7_sampler_border_color {
+ float r,g,b,a;
+} gen7_sampler_border_color_t;
+
+typedef struct gen7_sampler_state
+{
+ struct {
+ uint32_t aniso_algorithm:1;
+ uint32_t lod_bias:13;
+ uint32_t min_filter:3;
+ uint32_t mag_filter:3;
+ uint32_t mip_filter:2;
+ uint32_t base_level:5;
+ uint32_t pad1:1;
+ uint32_t lod_preclamp:1;
+ uint32_t default_color_mode:1;
+ uint32_t pad0:1;
+ uint32_t disable:1;
+ } ss0;
+
+ struct {
+ uint32_t cube_control_mode:1;
+ uint32_t shadow_function:3;
+ uint32_t pad:4;
+ uint32_t max_lod:12;
+ uint32_t min_lod:12;
+ } ss1;
+
+ struct {
+ uint32_t pad:5;
+ uint32_t default_color_pointer:27;
+ } ss2;
+
+ struct {
+ uint32_t r_wrap_mode:3;
+ uint32_t t_wrap_mode:3;
+ uint32_t s_wrap_mode:3;
+ uint32_t pad:1;
+ uint32_t non_normalized_coord:1;
+ uint32_t trilinear_quality:2;
+ uint32_t address_round:6;
+ uint32_t max_aniso:3;
+ uint32_t chroma_key_mode:1;
+ uint32_t chroma_key_index:2;
+ uint32_t chroma_key_enable:1;
+ uint32_t pad0:6;
+ } ss3;
+} gen7_sampler_state_t;
+
+STATIC_ASSERT(sizeof(gen6_sampler_state_t) == sizeof(gen7_sampler_state_t));
+
+#undef BITFIELD_BIT
+#undef BITFIELD_RANGE
+
+#endif /* __INTEL_STRUCTS_H__ */
+
diff --git a/src/kernels/cl_internal_copy_buf_align16.cl b/src/kernels/cl_internal_copy_buf_align16.cl
new file mode 100644
index 0000000..1abb4e9
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_align16.cl
@@ -0,0 +1,12 @@
+kernel void __cl_copy_region_align16 ( global float* src, unsigned int src_offset,
+ global float* dst, unsigned int dst_offset,
+ unsigned int size)
+{
+ int i = get_global_id(0) * 4;
+ if (i < size*4) {
+ dst[i+dst_offset] = src[i+src_offset];
+ dst[i+dst_offset + 1] = src[i+src_offset + 1];
+ dst[i+dst_offset + 2] = src[i+src_offset + 2];
+ dst[i+dst_offset + 3] = src[i+src_offset + 3];
+ }
+}
diff --git a/src/kernels/cl_internal_copy_buf_align4.cl b/src/kernels/cl_internal_copy_buf_align4.cl
new file mode 100644
index 0000000..27174ca
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_align4.cl
@@ -0,0 +1,8 @@
+kernel void __cl_copy_region_align4 ( global float* src, unsigned int src_offset,
+ global float* dst, unsigned int dst_offset,
+ unsigned int size)
+{
+ int i = get_global_id(0);
+ if (i < size)
+ dst[i+dst_offset] = src[i+src_offset];
+}
diff --git a/src/kernels/cl_internal_copy_buf_rect.cl b/src/kernels/cl_internal_copy_buf_rect.cl
new file mode 100644
index 0000000..71e7484
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_rect.cl
@@ -0,0 +1,15 @@
+kernel void __cl_copy_buffer_rect ( global char* src, global char* dst,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int src_offset, unsigned int dst_offset,
+ unsigned int src_row_pitch, unsigned int src_slice_pitch,
+ unsigned int dst_row_pitch, unsigned int dst_slice_pitch)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ src_offset += k * src_slice_pitch + j * src_row_pitch + i;
+ dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i;
+ dst[dst_offset] = src[src_offset];
+}
diff --git a/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
new file mode 100644
index 0000000..e02d0e5
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
@@ -0,0 +1,28 @@
+kernel void __cl_copy_region_unalign_dst_offset ( global int* src, unsigned int src_offset,
+ global int* dst, unsigned int dst_offset,
+ unsigned int size,
+ unsigned int first_mask, unsigned int last_mask,
+ unsigned int shift, unsigned int dw_mask)
+{
+ int i = get_global_id(0);
+ unsigned int tmp = 0;
+
+ if (i > size -1)
+ return;
+
+ /* last dw, need to be careful, not to overflow the source. */
+ if ((i == size - 1) && ((last_mask & (~(~dw_mask >> shift))) == 0)) {
+ tmp = ((src[src_offset + i] & ~dw_mask) >> shift);
+ } else {
+ tmp = ((src[src_offset + i] & ~dw_mask) >> shift)
+ | ((src[src_offset + i + 1] & dw_mask) << (32 - shift));
+ }
+
+ if (i == 0) {
+ dst[dst_offset] = (dst[dst_offset] & first_mask) | (tmp & (~first_mask));
+ } else if (i == size - 1) {
+ dst[i+dst_offset] = (tmp & last_mask) | (dst[i+dst_offset] & (~last_mask));
+ } else {
+ dst[i+dst_offset] = tmp;
+ }
+}
diff --git a/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
new file mode 100644
index 0000000..83b6e97
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_region_unalign_same_offset ( global int* src, unsigned int src_offset,
+ global int* dst, unsigned int dst_offset,
+ unsigned int size,
+ unsigned int first_mask, unsigned int last_mask)
+{
+ int i = get_global_id(0);
+ if (i > size -1)
+ return;
+
+ if (i == 0) {
+ dst[dst_offset] = (dst[dst_offset] & first_mask)
+ | (src[src_offset] & (~first_mask));
+ } else if (i == size - 1) {
+ dst[i+dst_offset] = (src[i+src_offset] & last_mask)
+ | (dst[i+dst_offset] & (~last_mask));
+ } else {
+ dst[i+dst_offset] = src[i+src_offset];
+ }
+}
diff --git a/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
new file mode 100644
index 0000000..ce0aa1d
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
@@ -0,0 +1,29 @@
+kernel void __cl_copy_region_unalign_src_offset ( global int* src, unsigned int src_offset,
+ global int* dst, unsigned int dst_offset,
+ unsigned int size,
+ unsigned int first_mask, unsigned int last_mask,
+ unsigned int shift, unsigned int dw_mask, int src_less)
+{
+ int i = get_global_id(0);
+ unsigned int tmp = 0;
+
+ if (i > size -1)
+ return;
+
+ if (i == 0) {
+ tmp = ((src[src_offset + i] & dw_mask) << shift);
+ } else if (src_less && i == size - 1) { // not exceed the bound of source
+ tmp = ((src[src_offset + i - 1] & ~dw_mask) >> (32 - shift));
+ } else {
+ tmp = ((src[src_offset + i - 1] & ~dw_mask) >> (32 - shift))
+ | ((src[src_offset + i] & dw_mask) << shift);
+ }
+
+ if (i == 0) {
+ dst[dst_offset] = (dst[dst_offset] & first_mask) | (tmp & (~first_mask));
+ } else if (i == size - 1) {
+ dst[i+dst_offset] = (tmp & last_mask) | (dst[i+dst_offset] & (~last_mask));
+ } else {
+ dst[i+dst_offset] = tmp;
+ }
+}
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d.cl b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
new file mode 100644
index 0000000..a218b58
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
@@ -0,0 +1,18 @@
+kernel void __cl_copy_buffer_to_image_2d(__read_only image2d_t image, global uchar* buffer,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+ unsigned int src_offset)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ uint4 color = (uint4)(0);
+ int2 dst_coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ dst_coord.x = dst_origin0 + i;
+ dst_coord.y = dst_origin1 + j;
+ src_offset += (k * region1 + j) * region0 + i;
+ color.x = buffer[src_offset];
+ write_imageui(image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_3d.cl b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
new file mode 100644
index 0000000..84d3b27
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_buffer_to_image_3d(__read_only image3d_t image, global uchar* buffer,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+ unsigned int src_offset)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ uint4 color = (uint4)(0);
+ int4 dst_coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ dst_coord.x = dst_origin0 + i;
+ dst_coord.y = dst_origin1 + j;
+ dst_coord.z = dst_origin2 + k;
+ src_offset += (k * region1 + j) * region0 + i;
+ color.x = buffer[src_offset];
+ write_imageui(image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_1d_to_1d.cl b/src/kernels/cl_internal_copy_image_1d_to_1d.cl
new file mode 100644
index 0000000..dca82b2
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_1d_to_1d.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_image_1d_to_1d(__read_only image1d_t src_image, __write_only image1d_t dst_image,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+ unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ int4 color;
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+ int src_coord;
+ int dst_coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ src_coord = src_origin0 + i;
+ dst_coord = dst_origin0 + i;
+ color = read_imagei(src_image, sampler, src_coord);
+ write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_2d_to_2d.cl b/src/kernels/cl_internal_copy_image_2d_to_2d.cl
new file mode 100644
index 0000000..c5eaab1
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_2d_to_2d.cl
@@ -0,0 +1,21 @@
+kernel void __cl_copy_image_2d_to_2d(__read_only image2d_t src_image, __write_only image2d_t dst_image,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+ unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ int4 color;
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+ int2 src_coord;
+ int2 dst_coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ src_coord.x = src_origin0 + i;
+ src_coord.y = src_origin1 + j;
+ dst_coord.x = dst_origin0 + i;
+ dst_coord.y = dst_origin1 + j;
+ color = read_imagei(src_image, sampler, src_coord);
+ write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_2d_to_3d.cl b/src/kernels/cl_internal_copy_image_2d_to_3d.cl
new file mode 100644
index 0000000..4c73a74
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_2d_to_3d.cl
@@ -0,0 +1,22 @@
+kernel void __cl_copy_image_2d_to_3d(__read_only image2d_t src_image, __write_only image3d_t dst_image,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+ unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ int4 color;
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+ int2 src_coord;
+ int4 dst_coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ src_coord.x = src_origin0 + i;
+ src_coord.y = src_origin1 + j;
+ dst_coord.x = dst_origin0 + i;
+ dst_coord.y = dst_origin1 + j;
+ dst_coord.z = dst_origin2 + k;
+ color = read_imagei(src_image, sampler, src_coord);
+ write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_2d_to_buffer.cl b/src/kernels/cl_internal_copy_image_2d_to_buffer.cl
new file mode 100644
index 0000000..b6c352e
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_2d_to_buffer.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_image_2d_to_buffer( __read_only image2d_t image, global uchar* buffer,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+ unsigned int dst_offset)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ uint4 color;
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+ int2 src_coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ src_coord.x = src_origin0 + i;
+ src_coord.y = src_origin1 + j;
+ color = read_imageui(image, sampler, src_coord);
+ dst_offset += (k * region1 + j) * region0 + i;
+ buffer[dst_offset] = color.x;
+}
diff --git a/src/kernels/cl_internal_copy_image_3d_to_2d.cl b/src/kernels/cl_internal_copy_image_3d_to_2d.cl
new file mode 100644
index 0000000..e0effa0
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_3d_to_2d.cl
@@ -0,0 +1,22 @@
+kernel void __cl_copy_image_3d_to_2d(__read_only image3d_t src_image, __write_only image2d_t dst_image,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+ unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ int4 color;
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+ int4 src_coord;
+ int2 dst_coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ src_coord.x = src_origin0 + i;
+ src_coord.y = src_origin1 + j;
+ src_coord.z = src_origin2 + k;
+ dst_coord.x = dst_origin0 + i;
+ dst_coord.y = dst_origin1 + j;
+ color = read_imagei(src_image, sampler, src_coord);
+ write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_3d_to_3d.cl b/src/kernels/cl_internal_copy_image_3d_to_3d.cl
new file mode 100644
index 0000000..de80a0a
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_3d_to_3d.cl
@@ -0,0 +1,23 @@
+kernel void __cl_copy_image_3d_to_3d(__read_only image3d_t src_image, __write_only image3d_t dst_image,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+ unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ int4 color;
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+ int4 src_coord;
+ int4 dst_coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ src_coord.x = src_origin0 + i;
+ src_coord.y = src_origin1 + j;
+ src_coord.z = src_origin2 + k;
+ dst_coord.x = dst_origin0 + i;
+ dst_coord.y = dst_origin1 + j;
+ dst_coord.z = dst_origin2 + k;
+ color = read_imagei(src_image, sampler, src_coord);
+ write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_3d_to_buffer.cl b/src/kernels/cl_internal_copy_image_3d_to_buffer.cl
new file mode 100644
index 0000000..dcfc8a2
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_3d_to_buffer.cl
@@ -0,0 +1,22 @@
+#define IMAGE_TYPE image3d_t
+#define COORD_TYPE int4
+kernel void __cl_copy_image_3d_to_buffer ( __read_only IMAGE_TYPE image, global uchar* buffer,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+ unsigned int dst_offset)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ uint4 color;
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+ COORD_TYPE src_coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ src_coord.x = src_origin0 + i;
+ src_coord.y = src_origin1 + j;
+ src_coord.z = src_origin2 + k;
+ color = read_imageui(image, sampler, src_coord);
+ dst_offset += (k * region1 + j) * region0 + i;
+ buffer[dst_offset] = color.x;
+}
diff --git a/src/kernels/cl_internal_fill_buf_align128.cl b/src/kernels/cl_internal_fill_buf_align128.cl
new file mode 100644
index 0000000..552820c
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_align128.cl
@@ -0,0 +1,9 @@
+kernel void __cl_fill_region_align128 ( global float16* dst, float16 pattern0,
+ unsigned int offset, unsigned int size, float16 pattern1)
+{
+ int i = get_global_id(0);
+ if (i < size) {
+ dst[i*2+offset] = pattern0;
+ dst[i*2+offset+1] = pattern1;
+ }
+}
diff --git a/src/kernels/cl_internal_fill_buf_align2.cl b/src/kernels/cl_internal_fill_buf_align2.cl
new file mode 100644
index 0000000..0b9a4cf
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_align2.cl
@@ -0,0 +1,8 @@
+kernel void __cl_fill_region_align2 ( global char2 * dst, char2 pattern,
+ unsigned int offset, unsigned int size)
+{
+ int i = get_global_id(0);
+ if (i < size) {
+ dst[i+offset] = pattern;
+ }
+}
diff --git a/src/kernels/cl_internal_fill_buf_align4.cl b/src/kernels/cl_internal_fill_buf_align4.cl
new file mode 100644
index 0000000..aefd92f
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_align4.cl
@@ -0,0 +1,8 @@
+kernel void __cl_fill_region_align4 ( global float* dst, float pattern,
+ unsigned int offset, unsigned int size)
+{
+ int i = get_global_id(0);
+ if (i < size) {
+ dst[i+offset] = pattern;
+ }
+}
diff --git a/src/kernels/cl_internal_fill_buf_align8.cl b/src/kernels/cl_internal_fill_buf_align8.cl
new file mode 100644
index 0000000..edaff77
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_align8.cl
@@ -0,0 +1,14 @@
+#define COMPILER_ABS_FUNC_N(N) \
+ kernel void __cl_fill_region_align8_##N ( global float##N* dst, float##N pattern, \
+ unsigned int offset, unsigned int size) { \
+ int i = get_global_id(0); \
+ if (i < size) { \
+ dst[i+offset] = pattern; \
+ } \
+ }
+
+
+COMPILER_ABS_FUNC_N(2)
+COMPILER_ABS_FUNC_N(4)
+COMPILER_ABS_FUNC_N(8)
+COMPILER_ABS_FUNC_N(16)
diff --git a/src/kernels/cl_internal_fill_buf_unalign.cl b/src/kernels/cl_internal_fill_buf_unalign.cl
new file mode 100644
index 0000000..90762b0
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_unalign.cl
@@ -0,0 +1,8 @@
+kernel void __cl_fill_region_unalign ( global char * dst, char pattern,
+ unsigned int offset, unsigned int size)
+{
+ int i = get_global_id(0);
+ if (i < size) {
+ dst[i+offset] = pattern;
+ }
+}
diff --git a/src/kernels/cl_internal_fill_image_1d.cl b/src/kernels/cl_internal_fill_image_1d.cl
new file mode 100644
index 0000000..b3b0cbf
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_1d.cl
@@ -0,0 +1,14 @@
+kernel void __cl_fill_image_1d( __write_only image1d_t image, float4 pattern,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ int coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ coord = origin0 + i;
+ write_imagef(image, coord, pattern);
+
+}
diff --git a/src/kernels/cl_internal_fill_image_1d_array.cl b/src/kernels/cl_internal_fill_image_1d_array.cl
new file mode 100644
index 0000000..f1eb241
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_1d_array.cl
@@ -0,0 +1,15 @@
+kernel void __cl_fill_image_1d_array( __write_only image1d_array_t image, float4 pattern,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ int2 coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ coord.x = origin0 + i;
+ coord.y = origin2 + k;
+ write_imagef(image, coord, pattern);
+
+}
diff --git a/src/kernels/cl_internal_fill_image_2d.cl b/src/kernels/cl_internal_fill_image_2d.cl
new file mode 100644
index 0000000..0e29f3e
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_2d.cl
@@ -0,0 +1,15 @@
+kernel void __cl_fill_image_2d( __write_only image2d_t image, float4 pattern,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ int2 coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ coord.x = origin0 + i;
+ coord.y = origin1 + j;
+ write_imagef(image, coord, pattern);
+
+}
diff --git a/src/kernels/cl_internal_fill_image_2d_array.cl b/src/kernels/cl_internal_fill_image_2d_array.cl
new file mode 100644
index 0000000..f29c9e7
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_2d_array.cl
@@ -0,0 +1,16 @@
+kernel void __cl_fill_image_2d_array( __write_only image2d_array_t image, float4 pattern,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ int4 coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ coord.x = origin0 + i;
+ coord.y = origin1 + j;
+ coord.z = origin2 + k;
+ write_imagef(image, coord, pattern);
+
+}
diff --git a/src/kernels/cl_internal_fill_image_3d.cl b/src/kernels/cl_internal_fill_image_3d.cl
new file mode 100644
index 0000000..042b8ab
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_3d.cl
@@ -0,0 +1,16 @@
+kernel void __cl_fill_image_3d( __write_only image3d_t image, float4 pattern,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ int k = get_global_id(2);
+ int4 coord;
+ if((i >= region0) || (j>= region1) || (k>=region2))
+ return;
+ coord.x = origin0 + i;
+ coord.y = origin1 + j;
+ coord.z = origin2 + k;
+ write_imagef(image, coord, pattern);
+
+}
diff --git a/src/performance.c b/src/performance.c
new file mode 100644
index 0000000..85cd481
--- /dev/null
+++ b/src/performance.c
@@ -0,0 +1,324 @@
+#include <performance.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+#define MAX_KERNEL_NAME_LENGTH 100
+#define MAX_KERNEL_EXECUTION_COUNT 100000
+#define MAX_KERNEL_BUILD_OPT 1000
+typedef struct kernel_storage_node
+{
+ char kernel_name[MAX_KERNEL_NAME_LENGTH];
+ float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+ char build_option[MAX_KERNEL_BUILD_OPT];
+ int current_count;
+ float kernel_sum_time;
+ struct kernel_storage_node *next;
+} kernel_storage_node;
+
+typedef struct context_storage_node
+{
+ uintptr_t context_id;
+ kernel_storage_node *kernels_storage;
+ char max_time_kernel_name[MAX_KERNEL_NAME_LENGTH];
+ float kernel_max_time;
+ int kernel_count;
+ struct context_storage_node *next;
+} context_storage_node;
+
+typedef struct storage
+{
+ context_storage_node * context_storage;
+} storage;
+
+
+
+static storage record;
+static int atexit_registered = 0;
+
+
+static context_storage_node * prev_context_pointer = NULL;
+static kernel_storage_node * prev_kernel_pointer = NULL;
+
+static context_storage_node * find_context(cl_context context)
+{
+ if(NULL != prev_context_pointer )
+ {
+ if(prev_context_pointer->context_id == (uintptr_t)context)
+ return prev_context_pointer;
+ }
+
+ if(NULL == record.context_storage)
+ {
+ record.context_storage = (context_storage_node *) malloc(sizeof(context_storage_node));
+ record.context_storage->context_id = (uintptr_t)context;
+ record.context_storage->kernels_storage = NULL;
+ record.context_storage->kernel_max_time = 0.0f;
+ record.context_storage->next = NULL;
+ record.context_storage->kernel_count = 0;
+ return record.context_storage;
+ }
+
+ context_storage_node *pre = record.context_storage;
+ context_storage_node *cur = record.context_storage;
+ while(NULL !=cur && (uintptr_t)context != cur->context_id )
+ {
+ pre = cur;
+ cur = cur->next;
+ }
+ if(NULL != cur)
+ return cur;
+
+ pre->next = (context_storage_node *)malloc(sizeof(context_storage_node));
+ pre = pre->next;
+ pre->context_id = (uintptr_t)context;
+ pre->kernels_storage = NULL;
+ pre->kernel_max_time = 0.0f;
+ pre->next = NULL;
+ pre->kernel_count = 0;
+ return pre;
+}
+
+static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name, const char *build_opt)
+{
+ if(NULL != prev_kernel_pointer && NULL != prev_context_pointer &&
+ p_context == prev_context_pointer &&
+ !strncmp(kernel_name, prev_kernel_pointer->kernel_name, MAX_KERNEL_NAME_LENGTH) &&
+ !strncmp(build_opt, prev_kernel_pointer->build_option, MAX_KERNEL_BUILD_OPT))
+ return prev_kernel_pointer;
+
+ if(NULL == p_context)
+ return NULL;
+
+ if(NULL == p_context->kernels_storage)
+ {
+ p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+ p_context->kernel_count++;
+ strncpy(p_context->kernels_storage->kernel_name,kernel_name, MAX_KERNEL_NAME_LENGTH);
+ p_context->kernels_storage->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+ strncpy(p_context->kernels_storage->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
+ p_context->kernels_storage->build_option[MAX_KERNEL_BUILD_OPT - 1] = '\0';
+ p_context->kernels_storage->current_count = 0;
+ p_context->kernels_storage->kernel_sum_time = 0.0f;
+ p_context->kernels_storage->next = NULL;
+ return p_context->kernels_storage;
+ }
+
+ kernel_storage_node *pre = p_context->kernels_storage;
+ kernel_storage_node *cur = p_context->kernels_storage;
+ while(NULL != cur &&
+ (strncmp(cur->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH) ||
+ strncmp(cur->build_option, build_opt, MAX_KERNEL_BUILD_OPT)))
+ {
+ pre = cur;
+ cur = cur->next;
+ }
+ if(NULL != cur)
+ return cur;
+
+ p_context->kernel_count++;
+ pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+ pre = pre->next;
+ pre->current_count = 0;
+ pre->kernel_sum_time = 0.0f;
+ pre->next = NULL;
+ strncpy(pre->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
+ pre->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+ strncpy(pre->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
+ pre->build_option[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+ return pre;
+}
+
+static void free_storage()
+{
+ context_storage_node *p_context = record.context_storage;
+ while(NULL != p_context)
+ {
+ context_storage_node *p_tmp_context = p_context->next;
+ kernel_storage_node *p_kernel = p_context->kernels_storage;
+ while(NULL != p_kernel)
+ {
+ kernel_storage_node *p_tmp_kernel = p_kernel->next;
+ free(p_kernel);
+ p_kernel = p_tmp_kernel;
+ }
+ free(p_context);
+ p_context = p_tmp_context;
+ }
+}
+
+typedef struct time_element
+{
+ char kernel_name[MAX_KERNEL_NAME_LENGTH];
+ float kernel_sum_time;
+ int kernel_execute_count;
+ double dev;
+ float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+ uint32_t time_index;
+} time_element;
+
+static int cmp(const void *a, const void *b)
+{
+ if(((time_element *)a)->kernel_sum_time < ((time_element *)b)->kernel_sum_time)
+ return 1;
+ else if(((time_element *)a)->kernel_sum_time > ((time_element *)b)->kernel_sum_time)
+ return -1;
+ else
+ return 0;
+}
+
+static void print_time_info()
+{
+ context_storage_node *p_context = record.context_storage;
+ if(NULL == p_context)
+ {
+ printf("Nothing to output !\n");
+ return;
+ }
+
+ int tmp_context_id = 0;
+ while(NULL != p_context)
+ {
+ printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++);
+ printf(" ->>>> KERNELS TIME SUMMARY <<<<-\n");
+
+ kernel_storage_node *p_kernel = p_context->kernels_storage;
+ kernel_storage_node *p_tmp_kernel = p_kernel;
+ time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count);
+ memset(te, 0, sizeof(time_element)*p_context->kernel_count);
+ int i = -1, j = 0, k = 0;
+ while(NULL != p_tmp_kernel)
+ {
+ for(k=0; k<=i; k++)
+ {
+ if(!strncmp(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH))
+ break;
+ }
+ if(k == i+1)
+ {
+ i++;
+ k = i;
+ }
+ te[k].kernel_execute_count += p_tmp_kernel->current_count;
+ strncpy(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH);
+ te[k].kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+ te[k].kernel_sum_time += p_tmp_kernel->kernel_sum_time;
+ for(j=0; j != p_tmp_kernel->current_count; ++j)
+ te[k].kernel_times[te[k].time_index++] = p_tmp_kernel->kernel_times[j];
+ p_tmp_kernel = p_tmp_kernel->next;
+ }
+
+ for(k=0; k<=i; k++)
+ {
+ float average = te[k].kernel_sum_time / te[k].kernel_execute_count;
+ double sumsquare = 0.0;
+ for(j=0; j<te[k].time_index; ++j)
+ sumsquare += pow((te[k].kernel_times[j] - average), 2.0);
+ te[k].dev = sqrt(sumsquare / te[k].kernel_execute_count);
+ }
+
+ float sum_time = 0.0f;
+ qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp);
+ for(j=0; j<=i; ++j)
+ sum_time += te[j].kernel_sum_time;
+
+ for(j=0; j<=i; ++j)
+ {
+ printf(" [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f Count: %-7d Ave(ms): %7.2f Dev: %.1lf%%]\n",
+ te[j].kernel_name,
+ te[j].kernel_sum_time / sum_time * 100,
+ te[j].kernel_sum_time,
+ te[j].kernel_execute_count,
+ te[j].kernel_sum_time / te[j].kernel_execute_count,
+ te[j].dev / te[j].kernel_sum_time * te[j].kernel_execute_count * 100);
+ }
+ free(te);
+ printf(" Total : %.2f\n", sum_time);
+ if(2 != b_output_kernel_perf)
+ {
+ printf("[------------ CONTEXT ENDS------------]\n\n");
+ p_context = p_context->next;
+ continue;
+ }
+ p_tmp_kernel = p_kernel;
+ printf("\n ->>>> KERNELS TIME DETAIL <<<<-\n");
+ while(NULL != p_kernel)
+ {
+ printf(" [Kernel Name : %30s Time(ms): %.2f]\n", p_kernel->kernel_name, p_kernel->kernel_sum_time);
+ if(*p_kernel->build_option != '\0')
+ {
+ int count = 0;
+ printf(" ->Build Options : ");
+ while(p_kernel->build_option[count] != '\0' )
+ {
+ printf("%c", p_kernel->build_option[count++]);
+ if(count % 100 == 0)
+ printf("\n ");
+ }
+ printf("\n");
+ }
+ for(i=0; i!=p_kernel->current_count; ++i)
+ printf(" Execution Round%5d : %.2f (ms)\n", i+1, p_kernel->kernel_times[i]);
+ p_kernel = p_kernel->next;
+ }
+ printf("[------------ CONTEXT ENDS------------]\n\n");
+ p_context = p_context->next;
+ }
+ free_storage();
+}
+
+
+static void insert(cl_context context, const char *kernel_name, const char *build_opt, float time)
+{
+ if(!atexit_registered)
+ {
+ atexit_registered = 1;
+ atexit(print_time_info);
+ }
+ context_storage_node *p_context = find_context(context);
+ kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name, build_opt);
+ prev_context_pointer = p_context;
+ prev_kernel_pointer = p_kernel;
+ p_kernel->kernel_times[p_kernel->current_count++] = time;
+ p_kernel->kernel_sum_time += time;
+ if(p_kernel->kernel_sum_time > p_context->kernel_max_time)
+ {
+ p_context->kernel_max_time = p_kernel->kernel_sum_time;
+ strncpy(p_context->max_time_kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
+ p_context->max_time_kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+ }
+}
+
+
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+int b_output_kernel_perf = 0;
+static struct timeval start, end;
+
+void initialize_env_var()
+{
+ char *env = getenv("OCL_OUTPUT_KERNEL_PERF");
+ if(NULL == env || !strncmp(env,"0", 1))
+ b_output_kernel_perf = 0;
+ else if(!strncmp(env,"1", 1))
+ b_output_kernel_perf = 1;
+ else
+ b_output_kernel_perf = 2;
+}
+
+void time_start(cl_context context, const char * kernel_name, cl_command_queue cq)
+{
+ pthread_mutex_lock(&mutex);
+ gettimeofday(&start, NULL);
+}
+
+void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq)
+{
+ clFinish(cq);
+ gettimeofday(&end, NULL);
+ float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f;
+ insert(context, kernel_name, build_opt, t);
+ pthread_mutex_unlock(&mutex);
+}
diff --git a/src/performance.h b/src/performance.h
new file mode 100644
index 0000000..1e75054
--- /dev/null
+++ b/src/performance.h
@@ -0,0 +1,12 @@
+#ifndef __PERFORMANCE_H__
+#define __PERFORMANCE_H__
+#include "CL/cl.h"
+
+
+extern int b_output_kernel_perf;
+void time_start(cl_context context, const char * kernel_name, cl_command_queue cq);
+void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq);
+void initialize_env_var();
+
+
+#endif
diff --git a/src/x11/dricommon.c b/src/x11/dricommon.c
new file mode 100644
index 0000000..bd4ac50
--- /dev/null
+++ b/src/x11/dricommon.c
@@ -0,0 +1,330 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ * Note: the code is taken from libva code base
+ */
+
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <X11/Xlibint.h>
+#include <X11/Xlib.h>
+#include "x11/va_dri2.h"
+#include "x11/va_dri2tokens.h"
+#include "x11/dricommon.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+
+#define LOCAL __attribute__ ((visibility ("internal")))
+
+LOCAL dri_drawable_t*
+dri_state_do_drawable_hash(dri_state_t *state, XID drawable)
+{
+ int index = drawable % DRAWABLE_HASH_SZ;
+ struct dri_drawable *dri_drawable = state->drawable_hash[index];
+
+ while (dri_drawable) {
+ if (dri_drawable->x_drawable == drawable)
+ return dri_drawable;
+ dri_drawable = dri_drawable->next;
+ }
+
+ dri_drawable = dri_state_create_drawable(state, drawable);
+ dri_drawable->x_drawable = drawable;
+ dri_drawable->next = state->drawable_hash[index];
+ state->drawable_hash[index] = dri_drawable;
+
+ return dri_drawable;
+}
+
+LOCAL void
+dri_state_free_drawable_hash(dri_state_t *state)
+{
+ int i;
+ struct dri_drawable *dri_drawable, *prev;
+
+ for (i = 0; i < DRAWABLE_HASH_SZ; i++) {
+ dri_drawable = state->drawable_hash[i];
+
+ while (dri_drawable) {
+ prev = dri_drawable;
+ dri_drawable = prev->next;
+ dri_state_destroy_drawable(state, prev);
+ }
+ }
+}
+
+LOCAL dri_drawable_t*
+dri_state_get_drawable(dri_state_t *state, XID drawable)
+{
+ return dri_state_do_drawable_hash(state, drawable);
+}
+
+LOCAL void
+dri_state_init_drawable_hash_table(dri_state_t *state)
+{
+ int i;
+ for(i=0; i < DRAWABLE_HASH_SZ; i++)
+ state->drawable_hash[i] = NULL;
+}
+
+LOCAL void
+dri_state_delete(dri_state_t *state)
+{
+ if (state == NULL)
+ return;
+ dri_state_close(state);
+ cl_free(state);
+}
+
+LOCAL dri_state_t*
+dri_state_new(void)
+{
+ dri_state_t *state = NULL;
+ TRY_ALLOC_NO_ERR (state, CALLOC(dri_state_t));
+ state->fd = -1;
+ state->driConnectedFlag = NONE;
+ dri_state_init_drawable_hash_table(state);
+
+exit:
+ return state;
+error:
+ dri_state_delete(state);
+ state = NULL;
+ goto exit;
+}
+
+#define __DRI_BUFFER_FRONT_LEFT 0
+#define __DRI_BUFFER_BACK_LEFT 1
+#define __DRI_BUFFER_FRONT_RIGHT 2
+#define __DRI_BUFFER_BACK_RIGHT 3
+#define __DRI_BUFFER_DEPTH 4
+#define __DRI_BUFFER_STENCIL 5
+#define __DRI_BUFFER_ACCUM 6
+#define __DRI_BUFFER_FAKE_FRONT_LEFT 7
+#define __DRI_BUFFER_FAKE_FRONT_RIGHT 8
+
+typedef struct dri2_drawable
+{
+ struct dri_drawable base;
+ union dri_buffer buffers[5];
+ int width;
+ int height;
+ int has_backbuffer;
+ int back_index;
+ int front_index;
+} dri2_drawable_t;
+
+LOCAL dri_drawable_t*
+dri_state_create_drawable(dri_state_t *state, XID x_drawable)
+{
+ dri2_drawable_t *dri2_drwble;
+ dri2_drwble = (dri2_drawable_t*)calloc(1, sizeof(*dri2_drwble));
+
+ if (!dri2_drwble)
+ return NULL;
+
+ dri2_drwble->base.x_drawable = x_drawable;
+ dri2_drwble->base.x = 0;
+ dri2_drwble->base.y = 0;
+ VA_DRI2CreateDrawable(state->x11_dpy, x_drawable);
+
+ return &dri2_drwble->base;
+}
+
+LOCAL void
+dri_state_destroy_drawable(dri_state_t *state, dri_drawable_t *dri_drwble)
+{
+ VA_DRI2DestroyDrawable(state->x11_dpy, dri_drwble->x_drawable);
+ free(dri_drwble);
+}
+
+LOCAL void
+dri_state_swap_buffer(dri_state_t *state, dri_drawable_t *dri_drwble)
+{
+ dri2_drawable_t *dri2_drwble = (dri2_drawable_t*)dri_drwble;
+ XRectangle xrect;
+ XserverRegion region;
+
+ if (dri2_drwble->has_backbuffer) {
+ xrect.x = 0;
+ xrect.y = 0;
+ xrect.width = dri2_drwble->width;
+ xrect.height = dri2_drwble->height;
+
+ region = XFixesCreateRegion(state->x11_dpy, &xrect, 1);
+ VA_DRI2CopyRegion(state->x11_dpy, dri_drwble->x_drawable, region,
+ DRI2BufferFrontLeft, DRI2BufferBackLeft);
+ XFixesDestroyRegion(state->x11_dpy, region);
+ }
+}
+
+LOCAL union dri_buffer*
+dri_state_get_rendering_buffer(dri_state_t *state, dri_drawable_t *dri_drwble)
+{
+ dri2_drawable_t *dri2_drwble = (dri2_drawable_t *)dri_drwble;
+ int i;
+ int count;
+ unsigned int attachments[5];
+ VA_DRI2Buffer *buffers;
+
+ i = 0;
+ attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+ attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+ buffers = VA_DRI2GetBuffers(state->x11_dpy,
+ dri_drwble->x_drawable,
+ &dri2_drwble->width,
+ &dri2_drwble->height,
+ attachments,
+ i,
+ &count);
+ assert(buffers);
+ if (buffers == NULL)
+ return NULL;
+
+ dri2_drwble->has_backbuffer = 0;
+
+ for (i = 0; i < count; i++) {
+ dri2_drwble->buffers[i].dri2.attachment = buffers[i].attachment;
+ dri2_drwble->buffers[i].dri2.name = buffers[i].name;
+ dri2_drwble->buffers[i].dri2.pitch = buffers[i].pitch;
+ dri2_drwble->buffers[i].dri2.cpp = buffers[i].cpp;
+ dri2_drwble->buffers[i].dri2.flags = buffers[i].flags;
+
+ if (buffers[i].attachment == __DRI_BUFFER_BACK_LEFT) {
+ dri2_drwble->has_backbuffer = 1;
+ dri2_drwble->back_index = i;
+ }
+
+ if (buffers[i].attachment == __DRI_BUFFER_FRONT_LEFT)
+ dri2_drwble->front_index = i;
+ }
+
+ dri_drwble->width = dri2_drwble->width;
+ dri_drwble->height = dri2_drwble->height;
+ Xfree(buffers);
+
+ if (dri2_drwble->has_backbuffer)
+ return &dri2_drwble->buffers[dri2_drwble->back_index];
+
+ return &dri2_drwble->buffers[dri2_drwble->front_index];
+}
+
+LOCAL void
+dri_state_close(dri_state_t *state) {
+ dri_state_free_drawable_hash(state);
+ assert(state->fd >= 0);
+ close(state->fd);
+}
+
+LOCAL void
+dri_state_release(dri_state_t *state) {
+ dri_state_delete(state);
+}
+
+LOCAL dri_state_t*
+getDRI2State(Display* dpy, int screen, char **driver_name)
+{
+ int major, minor;
+ int error_base;
+ int event_base;
+ char *device_name = NULL;
+ drm_magic_t magic;
+ char * internal_driver_name = NULL;
+ int fd = -1;
+ dri_state_t* state = NULL;
+
+ if (!VA_DRI2QueryExtension(dpy, &event_base, &error_base))
+ goto err_out;
+
+ if (!VA_DRI2QueryVersion(dpy, &major, &minor))
+ goto err_out;
+
+
+ if (!VA_DRI2Connect(dpy, RootWindow(dpy, screen),
+ &internal_driver_name, &device_name))
+ goto err_out;
+
+ fd = open(device_name, O_RDWR);
+ assert(fd >= 0);
+
+ if (fd < 0)
+ goto err_out;
+
+ if (drmGetMagic(fd, &magic))
+ goto err_out;
+
+ if (!VA_DRI2Authenticate(dpy, RootWindow(dpy, screen),
+ magic))
+ goto err_out;
+
+ if(driver_name)
+ *driver_name = internal_driver_name;
+ else
+ Xfree(internal_driver_name);
+
+ state = dri_state_new();
+ state->fd = fd;
+ state->x11_dpy = dpy;
+ state->x11_screen = screen;
+ state->driConnectedFlag = DRI2;
+ if (device_name)
+ Xfree(device_name);
+ return state;
+
+err_out:
+ if (device_name)
+ Xfree(device_name);
+
+ if (internal_driver_name)
+ Xfree(internal_driver_name);
+
+ if(driver_name) *driver_name = NULL;
+
+ if (fd >= 0)
+ close(fd);
+
+ if (driver_name)
+ *driver_name = NULL;
+
+ return state;
+}
+
diff --git a/src/x11/dricommon.h b/src/x11/dricommon.h
new file mode 100644
index 0000000..5a950b4
--- /dev/null
+++ b/src/x11/dricommon.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ * Note: the code is taken from libva code base
+ */
+
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _VA_DRICOMMON_H_
+#define _VA_DRICOMMON_H_
+
+#include <X11/Xlib.h>
+#include <xf86drm.h>
+#include <drm.h>
+#include <drm_sarea.h>
+
+union dri_buffer
+{
+ struct {
+ unsigned int attachment;
+ unsigned int name;
+ unsigned int pitch;
+ unsigned int cpp;
+ unsigned int flags;
+ } dri2;
+};
+
+typedef struct dri_drawable
+{
+ XID x_drawable;
+ int x;
+ int y;
+ unsigned int width;
+ unsigned int height;
+ struct dri_drawable *next;
+} dri_drawable_t;
+
+#define DRAWABLE_HASH_SZ 32
+
+enum DRI_VER
+{
+ NONE = 0,
+ // NOT supported VA_DRI1 = 1,
+ DRI2 = 2
+};
+
+typedef struct dri_state
+{
+ Display *x11_dpy;
+ int x11_screen;
+ int fd;
+ enum DRI_VER driConnectedFlag; /* 0: disconnected, 2: DRI2 */
+ dri_drawable_t *drawable_hash[DRAWABLE_HASH_SZ];
+} dri_state_t;
+
+dri_drawable_t *dri_state_create_drawable(dri_state_t*, XID x_drawable);
+void dri_state_destroy_drawable(dri_state_t*, dri_drawable_t*);
+void dri_state_close(dri_state_t*);
+void dri_state_release(dri_state_t*);
+
+// Create a dri2 state from dpy and screen
+dri_state_t *getDRI2State(Display* dpy, int screen, char **driver_name);
+
+#endif /* _VA_DRICOMMON_H_ */
+
diff --git a/src/x11/mesa_egl_extension.c b/src/x11/mesa_egl_extension.c
new file mode 100644
index 0000000..a7fc8cb
--- /dev/null
+++ b/src/x11/mesa_egl_extension.c
@@ -0,0 +1,307 @@
+#include <stdio.h>
+#include "mesa_egl_extension.h"
+#include "mesa_egl_res_share.h"
+#include "src/cl_driver.h"
+
+struct _egl_display;
+struct _egl_resource;
+struct _egl_thread_info;
+struct _egl_config;
+struct _egl_surface;
+struct _egl_driver;
+
+typedef struct _egl_display _EGLDisplay;
+typedef struct _egl_resource _EGLResource;
+typedef struct _egl_thread_info _EGLThreadInfo;
+typedef struct _egl_config _EGLConfig;
+typedef struct _egl_surface _EGLSurface;
+typedef struct _egl_driver _EGLDriver;
+
+/**
+ * A resource of a display.
+ */
+struct _egl_resource
+{
+ /* which display the resource belongs to */
+ _EGLDisplay *Display;
+ EGLBoolean IsLinked;
+ EGLint RefCount;
+
+ /* used to link resources of the same type */
+ _EGLResource *Next;
+};
+
+/**
+ * "Base" class for device driver contexts.
+ */
+struct _egl_context
+{
+ /* A context is a display resource */
+ _EGLResource Resource;
+
+ /* The bound status of the context */
+ _EGLThreadInfo *Binding;
+ _EGLSurface *DrawSurface;
+ _EGLSurface *ReadSurface;
+
+ _EGLConfig *Config;
+
+ EGLint ClientAPI; /**< EGL_OPENGL_ES_API, EGL_OPENGL_API, EGL_OPENVG_API */
+ EGLint ClientMajorVersion;
+ EGLint ClientMinorVersion;
+ EGLint Flags;
+ EGLint Profile;
+ EGLint ResetNotificationStrategy;
+
+ /* The real render buffer when a window surface is bound */
+ EGLint WindowRenderBuffer;
+};
+
+typedef struct _egl_context _EGLContext;
+
+struct dri2_egl_display
+{
+ int dri2_major;
+ int dri2_minor;
+ __DRIscreen *dri_screen;
+ int own_dri_screen;
+ const __DRIconfig **driver_configs;
+ void *driver;
+};
+
+enum _egl_platform_type {
+ _EGL_PLATFORM_WINDOWS,
+ _EGL_PLATFORM_X11,
+ _EGL_PLATFORM_WAYLAND,
+ _EGL_PLATFORM_DRM,
+ _EGL_PLATFORM_FBDEV,
+ _EGL_PLATFORM_NULL,
+ _EGL_PLATFORM_ANDROID,
+
+ _EGL_NUM_PLATFORMS,
+ _EGL_INVALID_PLATFORM = -1
+};
+typedef enum _egl_platform_type _EGLPlatformType;
+
+typedef pthread_mutex_t _EGLMutex;
+
+struct _egl_display
+{
+ /* used to link displays */
+ _EGLDisplay *Next;
+
+ _EGLMutex Mutex;
+
+ _EGLPlatformType Platform; /**< The type of the platform display */
+ void *PlatformDisplay; /**< A pointer to the platform display */
+
+ _EGLDriver *Driver; /**< Matched driver of the display */
+ EGLBoolean Initialized; /**< True if the display is initialized */
+
+ /* options that affect how the driver initializes the display */
+ struct {
+ EGLBoolean TestOnly; /**< Driver should not set fields when true */
+ EGLBoolean UseFallback; /**< Use fallback driver (sw or less features) */
+ } Options;
+
+ /* these fields are set by the driver during init */
+ void *DriverData; /**< Driver private data */
+};
+
+static struct dri2_egl_display *
+dri2_egl_display(_EGLDisplay *dpy)
+{
+ return (struct dri2_egl_display *)dpy->DriverData;
+}
+
+static _EGLDisplay *
+_eglLockDisplay(EGLDisplay dpy)
+{
+ return (_EGLDisplay *)dpy;
+}
+
+static _EGLContext *
+_eglLookupContext(EGLContext ctx, EGLDisplay disp)
+{
+ disp = disp;
+ return (_EGLContext *) ctx;
+}
+
+struct dri2_egl_context
+{
+ _EGLContext base;
+ __DRIcontext *dri_context;
+};
+
+static struct dri2_egl_context *
+dri2_egl_context(_EGLContext *ctx)
+{
+ return (struct dri2_egl_context *)ctx;
+}
+
+static EGLBoolean
+dri2_acquire_texture(_EGLDisplay *disp,
+ _EGLContext *ctx,
+ const EGLint *attr_list,
+ void *user_data)
+{
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ GLuint texture = 0;
+ GLenum gl_target = 0;
+ GLint level = 0;
+ GLboolean ret;
+
+ if (_eglParseTextureAttribList(&texture, &gl_target, &level, attr_list) != EGL_SUCCESS)
+ return EGL_FALSE;
+
+ ret = cl_gl_acquire_texture(dri2_dpy->driver,
+ dri2_ctx->dri_context,
+ gl_target, level, texture,
+ user_data);
+ return ret;
+}
+
+static EGLBoolean
+dri2_release_texture(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
+{
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ GLuint texture = 0;
+ GLenum gl_target = 0;
+ GLint level = 0;
+ GLboolean ret;
+
+ if (_eglParseTextureAttribList(&texture, &gl_target, &level, attr_list) != EGL_SUCCESS)
+ return EGL_FALSE;
+
+ ret = cl_gl_release_texture(dri2_dpy->driver, dri2_ctx->dri_context,
+ gl_target, level, texture);
+ return ret;
+}
+
+static EGLBoolean
+dri2_acquire_buffer_object(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list,
+ void *user_data)
+{
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ GLuint bufobj = 0;
+ GLboolean ret;
+
+ if (_eglParseBufferObjAttribList(&bufobj, attr_list) != EGL_SUCCESS)
+ return EGL_FALSE;
+
+ ret = cl_gl_acquire_buffer_object(dri2_dpy->driver,
+ dri2_ctx->dri_context,
+ bufobj, user_data);
+ return ret;
+}
+
+static EGLBoolean
+dri2_release_buffer_object(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
+{
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ GLuint bufobj = 0;
+ GLboolean ret;
+
+ if (_eglParseBufferObjAttribList(&bufobj, attr_list) != EGL_SUCCESS)
+ return EGL_FALSE;
+
+ ret = cl_gl_release_buffer_object(dri2_dpy->driver,
+ dri2_ctx->dri_context,
+ bufobj);
+ return ret;
+}
+
+static EGLBoolean
+dri2_acquire_render_buffer(_EGLDisplay *disp,
+ _EGLContext *ctx,
+ const EGLint *attr_list,
+ void *user_data)
+{
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ GLuint rb = 0;
+ GLboolean ret;
+
+ if (_eglParseBufferObjAttribList(&rb, attr_list) != EGL_SUCCESS)
+ return EGL_FALSE;
+
+ ret = cl_gl_acquire_render_buffer(dri2_dpy->driver,
+ dri2_ctx->dri_context,
+ rb, user_data);
+ return ret;
+}
+
+static EGLBoolean
+dri2_release_render_buffer(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
+{
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ GLuint rb = 0;
+ GLboolean ret;
+
+ if (_eglParseBufferObjAttribList(&rb, attr_list) != EGL_SUCCESS)
+ return EGL_FALSE;
+
+ ret = cl_gl_release_render_buffer(dri2_dpy->driver,
+ dri2_ctx->dri_context,
+ rb);
+ return ret;
+}
+
+static EGLBoolean
+dri2_acquire_resource_mesa(_EGLDisplay *disp, _EGLContext *ctx, const EGLenum target,
+ const EGLint *attrib_list, void *user_data)
+{
+ switch (target) {
+ case EGL_GL_TEXTURE_MESA:
+ return dri2_acquire_texture(disp, ctx, attrib_list, user_data);
+ case EGL_GL_BUFFER_OBJECT_MESA:
+ return dri2_acquire_buffer_object(disp, ctx, attrib_list, user_data);
+ case EGL_GL_RENDER_BUFFER_MESA:
+ return dri2_acquire_render_buffer(disp, ctx, attrib_list, user_data);
+ default:
+ fprintf(stderr, "bad resource target value 0x%04x",
+ target);
+ }
+ return EGL_FALSE;
+}
+
+static EGLBoolean
+dri2_release_resource_mesa(_EGLDisplay *disp, _EGLContext *ctx, const EGLenum target,
+ const EGLint *attrib_list)
+{
+ switch (target) {
+ case EGL_GL_TEXTURE_MESA:
+ return dri2_release_texture(disp, ctx, attrib_list);
+ case EGL_GL_BUFFER_OBJECT_MESA:
+ return dri2_release_buffer_object(disp, ctx, attrib_list);
+ case EGL_GL_RENDER_BUFFER_MESA:
+ return dri2_release_render_buffer(disp, ctx, attrib_list);
+ default:
+ fprintf(stderr, "bad resource target value 0x%04x",
+ target);
+ }
+ return EGL_FALSE;
+}
+
+EGLBoolean
+eglAcquireResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list, void *user)
+{
+ _EGLDisplay *disp = _eglLockDisplay(dpy);
+ _EGLContext *context = _eglLookupContext(ctx, disp);
+
+ return dri2_acquire_resource_mesa(disp, context, target, attrib_list, user);
+}
+
+EGLBoolean
+eglReleaseResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list)
+{
+ _EGLDisplay *disp = _eglLockDisplay(dpy);
+ _EGLContext *context = _eglLookupContext(ctx, disp);
+
+ return dri2_release_resource_mesa(disp, context, target, attrib_list);
+}
diff --git a/src/x11/mesa_egl_extension.h b/src/x11/mesa_egl_extension.h
new file mode 100644
index 0000000..39ea134
--- /dev/null
+++ b/src/x11/mesa_egl_extension.h
@@ -0,0 +1,20 @@
+#ifndef __MESA_EGL_EXTENSION_H__
+#define __MESA_EGL_EXTENSION_H__
+
+#include <EGL/egl.h>
+#include <GL/gl.h>
+#include <GL/internal/dri_interface.h>
+
+#define EGL_GL_TEXTURE_MESA 0x3300 /* eglAcuireResource target */
+#define EGL_GL_BUFFER_OBJECT_MESA 0x3301 /* eglAcuireResource target */
+#define EGL_GL_RENDER_BUFFER_MESA 0x3302 /* eglAcuireResource target */
+#define EGL_GL_TEXTURE_ID_MESA 0x3303 /* eglAcuireResource attribute */
+#define EGL_GL_TEXTURE_LEVEL_MESA 0x3304 /* eglAcuireResource attribute */
+#define EGL_GL_TEXTURE_TARGET_MESA 0x3305 /* eglAcuireResource attribute */
+#define EGL_GL_BUFFER_OBJECT_ID_MESA 0x3306 /* eglAcuireResource attribute */
+#define EGL_GL_RENDER_BUFFER_ID_MESA 0x3307 /* eglAcuireResource attribute */
+
+EGLBoolean eglAcquireResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list, void * user_data);
+EGLBoolean eglReleaseResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list);
+
+#endif
diff --git a/src/x11/mesa_egl_res_share.c b/src/x11/mesa_egl_res_share.c
new file mode 100644
index 0000000..93e9454
--- /dev/null
+++ b/src/x11/mesa_egl_res_share.c
@@ -0,0 +1,135 @@
+/**************************************************************************
+ *
+ * Copyright 2013-2014 Zhigang Gong <zhigang.gong at linux.intel.com>
+ * Copyright 2013-2014 Intel, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include <assert.h>
+#include <string.h>
+
+#include "mesa_egl_extension.h"
+#include "mesa_egl_res_share.h"
+
+/**
+ * Parse the list of share texture attributes and return the proper error code.
+ */
+EGLint
+_eglParseTextureAttribList(unsigned int *texture, EGLenum *gl_target, EGLint *level,
+ const EGLint *attrib_list)
+{
+ EGLint i, err = EGL_SUCCESS;
+
+ *texture = 0;
+ *gl_target = 0;
+ *level = 0;
+
+ if (!attrib_list)
+ return EGL_BAD_ATTRIBUTE;
+
+ for (i = 0; attrib_list[i] != EGL_NONE; i++) {
+ EGLint attr = attrib_list[i++];
+ EGLint val = attrib_list[i];
+
+ switch (attr) {
+ case EGL_GL_TEXTURE_LEVEL_MESA:
+ *level = val;
+ break;
+ case EGL_GL_TEXTURE_ID_MESA:
+ *texture = val;
+ break;
+ case EGL_GL_TEXTURE_TARGET_MESA:
+ *gl_target = val;
+ break;
+ default:
+ /* unknown attrs are ignored */
+ break;
+ }
+ }
+
+ return err;
+}
+
+/**
+ * Parse the list of share texture attributes and return the proper error code.
+ */
+EGLint
+_eglParseBufferObjAttribList(unsigned int *bufobj, const EGLint *attrib_list)
+{
+ EGLint i, err = EGL_SUCCESS;
+ *bufobj = 0;
+
+ if (!attrib_list)
+ return EGL_BAD_ATTRIBUTE;
+
+ for (i = 0; attrib_list[i] != EGL_NONE; i++) {
+ EGLint attr = attrib_list[i++];
+ EGLint val = attrib_list[i];
+
+ switch (attr) {
+ case EGL_GL_BUFFER_OBJECT_ID_MESA:
+ *bufobj = val;
+ break;
+ default:
+ /* unknown attrs are ignored */
+ break;
+ }
+ }
+ if (*bufobj == 0)
+ err = EGL_BAD_ATTRIBUTE;
+
+ return err;
+}
+
+/**
+ * Parse the list of share texture attributes and return the proper error code.
+ */
+EGLint
+_eglParseRenderBufferAttribList(unsigned int *rb, const EGLint *attrib_list)
+{
+ EGLint i, err = EGL_SUCCESS;
+ *rb = 0;
+
+ if (!attrib_list)
+ return EGL_BAD_ATTRIBUTE;
+
+ for (i = 0; attrib_list[i] != EGL_NONE; i++) {
+ EGLint attr = attrib_list[i++];
+ EGLint val = attrib_list[i];
+
+ switch (attr) {
+ case EGL_GL_RENDER_BUFFER_ID_MESA:
+ *rb = val;
+ break;
+ default:
+ /* unknown attrs are ignored */
+ break;
+ }
+ }
+ if (*rb == 0)
+ err = EGL_BAD_ATTRIBUTE;
+
+ return err;
+}
diff --git a/src/x11/mesa_egl_res_share.h b/src/x11/mesa_egl_res_share.h
new file mode 100644
index 0000000..43e746e
--- /dev/null
+++ b/src/x11/mesa_egl_res_share.h
@@ -0,0 +1,44 @@
+/**************************************************************************
+ *
+ * Copyright 2013-2014 Zhigang Gong <zhigang.gong at linux.intel.com>
+ * Copyright 2013-2014 Intel, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef EGLRESSHARE_INCLUDED
+#define EGLRESSHARE_INCLUDED
+
+#include <EGL/egl.h>
+
+EGLint
+_eglParseTextureAttribList(unsigned int *texture, EGLenum *gl_target,
+ EGLint *level, const EGLint *attrib_list);
+EGLint
+_eglParseBufferObjAttribList(unsigned int *bufobj,
+ const EGLint *attrib_list);
+
+EGLint
+_eglParseRenderBufferAttribList(unsigned int *rb, const EGLint *attrib_list);
+#endif
diff --git a/src/x11/va_dri2.c b/src/x11/va_dri2.c
new file mode 100644
index 0000000..5225acd
--- /dev/null
+++ b/src/x11/va_dri2.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright � 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Soft-
+ * ware"), to deal in the Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge, publish, distribute,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, provided that the above copyright
+ * notice(s) and this permission notice appear in all copies of the Soft-
+ * ware and that both the above copyright notice(s) and this permission
+ * notice appear in supporting documentation.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
+ * ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY
+ * RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
+ * THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSE-
+ * QUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFOR-
+ * MANCE OF THIS SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or
+ * other dealings in this Software without prior written authorization of
+ * the copyright holder.
+ *
+ * Authors:
+ * Kristian H�gsberg (krh at redhat.com)
+ */
+
+#define NEED_REPLIES
+#include <X11/Xlibint.h>
+#include <X11/extensions/Xext.h>
+#include <X11/extensions/extutil.h>
+#include "xf86drm.h"
+#include "x11/va_dri2.h"
+#include "x11/va_dri2str.h"
+#include "x11/va_dri2tokens.h"
+
+#ifndef DRI2DriverDRI
+#define DRI2DriverDRI 0
+#endif
+
+#define LOCAL __attribute__ ((visibility ("internal")))
+
+static char va_dri2ExtensionName[] = DRI2_NAME;
+static XExtensionInfo _va_dri2_info_data;
+static XExtensionInfo *va_dri2Info = &_va_dri2_info_data;
+static XEXT_GENERATE_CLOSE_DISPLAY (VA_DRI2CloseDisplay, va_dri2Info)
+static /* const */ XExtensionHooks va_dri2ExtensionHooks = {
+ NULL, /* create_gc */
+ NULL, /* copy_gc */
+ NULL, /* flush_gc */
+ NULL, /* free_gc */
+ NULL, /* create_font */
+ NULL, /* free_font */
+ VA_DRI2CloseDisplay, /* close_display */
+ NULL, /* wire_to_event */
+ NULL, /* event_to_wire */
+ NULL, /* error */
+ NULL, /* error_string */
+};
+
+static XEXT_GENERATE_FIND_DISPLAY (DRI2FindDisplay, va_dri2Info,
+ va_dri2ExtensionName,
+ &va_dri2ExtensionHooks,
+ 0, NULL)
+
+LOCAL Bool VA_DRI2QueryExtension(Display *dpy, int *eventBase, int *errorBase)
+{
+ XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+
+ if (XextHasExtension(info)) {
+ *eventBase = info->codes->first_event;
+ *errorBase = info->codes->first_error;
+ return True;
+ }
+
+ return False;
+}
+
+LOCAL Bool VA_DRI2QueryVersion(Display *dpy, int *major, int *minor)
+{
+ XExtDisplayInfo *info = DRI2FindDisplay (dpy);
+ xDRI2QueryVersionReply rep;
+ xDRI2QueryVersionReq *req;
+
+ XextCheckExtension (dpy, info, va_dri2ExtensionName, False);
+
+ LockDisplay(dpy);
+ GetReq(DRI2QueryVersion, req);
+ req->reqType = info->codes->major_opcode;
+ req->dri2Reqtype = X_DRI2QueryVersion;
+ req->majorVersion = DRI2_MAJOR;
+ req->minorVersion = DRI2_MINOR;
+ if (!_XReply(dpy, (xReply *)&rep, 0, xFalse)) {
+ UnlockDisplay(dpy);
+ SyncHandle();
+ return False;
+ }
+ *major = rep.majorVersion;
+ *minor = rep.minorVersion;
+ UnlockDisplay(dpy);
+ SyncHandle();
+
+ return True;
+}
+
+LOCAL Bool VA_DRI2Connect(Display *dpy, XID window,
+ char **driverName, char **deviceName)
+{
+ XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+ xDRI2ConnectReply rep;
+ xDRI2ConnectReq *req;
+
+ XextCheckExtension (dpy, info, va_dri2ExtensionName, False);
+
+ LockDisplay(dpy);
+ GetReq(DRI2Connect, req);
+ req->reqType = info->codes->major_opcode;
+ req->dri2Reqtype = X_DRI2Connect;
+ req->window = window;
+ req->drivertype = DRI2DriverDRI;
+ if (!_XReply(dpy, (xReply *)&rep, 0, xFalse)) {
+ UnlockDisplay(dpy);
+ SyncHandle();
+ return False;
+ }
+
+ if (rep.driverNameLength == 0 && rep.deviceNameLength == 0) {
+ UnlockDisplay(dpy);
+ SyncHandle();
+ return False;
+ }
+
+ *driverName = Xmalloc(rep.driverNameLength + 1);
+ if (*driverName == NULL) {
+ _XEatData(dpy,
+ ((rep.driverNameLength + 3) & ~3) +
+ ((rep.deviceNameLength + 3) & ~3));
+ UnlockDisplay(dpy);
+ SyncHandle();
+ return False;
+ }
+ _XReadPad(dpy, *driverName, rep.driverNameLength);
+ (*driverName)[rep.driverNameLength] = '\0';
+
+ *deviceName = Xmalloc(rep.deviceNameLength + 1);
+ if (*deviceName == NULL) {
+ Xfree(*driverName);
+ _XEatData(dpy, ((rep.deviceNameLength + 3) & ~3));
+ UnlockDisplay(dpy);
+ SyncHandle();
+ return False;
+ }
+ _XReadPad(dpy, *deviceName, rep.deviceNameLength);
+ (*deviceName)[rep.deviceNameLength] = '\0';
+
+ UnlockDisplay(dpy);
+ SyncHandle();
+
+ return True;
+}
+
+LOCAL Bool VA_DRI2Authenticate(Display *dpy, XID window, drm_magic_t magic)
+{
+ XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+ xDRI2AuthenticateReq *req;
+ xDRI2AuthenticateReply rep;
+
+ XextCheckExtension (dpy, info, va_dri2ExtensionName, False);
+
+ LockDisplay(dpy);
+ GetReq(DRI2Authenticate, req);
+ req->reqType = info->codes->major_opcode;
+ req->dri2Reqtype = X_DRI2Authenticate;
+ req->window = window;
+ req->magic = magic;
+
+ if (!_XReply(dpy, (xReply *)&rep, 0, xFalse)) {
+ UnlockDisplay(dpy);
+ SyncHandle();
+ return False;
+ }
+
+ UnlockDisplay(dpy);
+ SyncHandle();
+
+ return rep.authenticated;
+}
+
+LOCAL void VA_DRI2CreateDrawable(Display *dpy, XID drawable)
+{
+ XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+ xDRI2CreateDrawableReq *req;
+
+ XextSimpleCheckExtension (dpy, info, va_dri2ExtensionName);
+
+ LockDisplay(dpy);
+ GetReq(DRI2CreateDrawable, req);
+ req->reqType = info->codes->major_opcode;
+ req->dri2Reqtype = X_DRI2CreateDrawable;
+ req->drawable = drawable;
+ UnlockDisplay(dpy);
+ SyncHandle();
+}
+
+LOCAL void VA_DRI2DestroyDrawable(Display *dpy, XID drawable)
+{
+ XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+ xDRI2DestroyDrawableReq *req;
+
+ XextSimpleCheckExtension (dpy, info, va_dri2ExtensionName);
+
+ XSync(dpy, False);
+
+ LockDisplay(dpy);
+ GetReq(DRI2DestroyDrawable, req);
+ req->reqType = info->codes->major_opcode;
+ req->dri2Reqtype = X_DRI2DestroyDrawable;
+ req->drawable = drawable;
+ UnlockDisplay(dpy);
+ SyncHandle();
+}
+
+LOCAL VA_DRI2Buffer *VA_DRI2GetBuffers(Display *dpy, XID drawable,
+ int *width, int *height,
+ unsigned int *attachments, int count,
+ int *outcount)
+{
+ XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+ xDRI2GetBuffersReply rep;
+ xDRI2GetBuffersReq *req;
+ VA_DRI2Buffer *buffers;
+ xDRI2Buffer repBuffer;
+ CARD32 *p;
+ int i;
+
+ XextCheckExtension (dpy, info, va_dri2ExtensionName, False);
+
+ LockDisplay(dpy);
+ GetReqExtra(DRI2GetBuffers, count * 4, req);
+ req->reqType = info->codes->major_opcode;
+ req->dri2Reqtype = X_DRI2GetBuffers;
+ req->drawable = drawable;
+ req->count = count;
+ p = (CARD32 *) &req[1];
+ for (i = 0; i < count; i++)
+ p[i] = attachments[i];
+
+ if (!_XReply(dpy, (xReply *)&rep, 0, xFalse)) {
+ UnlockDisplay(dpy);
+ SyncHandle();
+ return NULL;
+ }
+
+ *width = rep.width;
+ *height = rep.height;
+ *outcount = rep.count;
+
+ buffers = Xmalloc(rep.count * sizeof buffers[0]);
+ if (buffers == NULL) {
+ _XEatData(dpy, rep.count * sizeof repBuffer);
+ UnlockDisplay(dpy);
+ SyncHandle();
+ return NULL;
+ }
+
+ for (i = 0; i < (int) rep.count; i++) {
+ _XReadPad(dpy, (char *) &repBuffer, sizeof repBuffer);
+ buffers[i].attachment = repBuffer.attachment;
+ buffers[i].name = repBuffer.name;
+ buffers[i].pitch = repBuffer.pitch;
+ buffers[i].cpp = repBuffer.cpp;
+ buffers[i].flags = repBuffer.flags;
+ }
+
+ UnlockDisplay(dpy);
+ SyncHandle();
+
+ return buffers;
+}
+
+LOCAL void VA_DRI2CopyRegion(Display *dpy, XID drawable, XserverRegion region,
+ CARD32 dest, CARD32 src)
+{
+ XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+ xDRI2CopyRegionReq *req;
+ xDRI2CopyRegionReply rep;
+
+ XextSimpleCheckExtension (dpy, info, va_dri2ExtensionName);
+
+ LockDisplay(dpy);
+ GetReq(DRI2CopyRegion, req);
+ req->reqType = info->codes->major_opcode;
+ req->dri2Reqtype = X_DRI2CopyRegion;
+ req->drawable = drawable;
+ req->region = region;
+ req->dest = dest;
+ req->src = src;
+
+ _XReply(dpy, (xReply *)&rep, 0, xFalse);
+
+ UnlockDisplay(dpy);
+ SyncHandle();
+}
diff --git a/src/x11/va_dri2.h b/src/x11/va_dri2.h
new file mode 100644
index 0000000..1a1f96e
--- /dev/null
+++ b/src/x11/va_dri2.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright � 2007,2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Soft-
+ * ware"), to deal in the Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge, publish, distribute,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, provided that the above copyright
+ * notice(s) and this permission notice appear in all copies of the Soft-
+ * ware and that both the above copyright notice(s) and this permission
+ * notice appear in supporting documentation.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
+ * ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY
+ * RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
+ * THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSE-
+ * QUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFOR-
+ * MANCE OF THIS SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or
+ * other dealings in this Software without prior written authorization of
+ * the copyright holder.
+ *
+ * Authors:
+ * Kristian H�gsberg (krh at redhat.com)
+ */
+#ifndef _VA_DRI2_H_
+#define _VA_DRI2_H_
+
+#include <X11/extensions/Xfixes.h>
+#include <X11/Xfuncproto.h>
+#include <xf86drm.h>
+
+typedef struct {
+ unsigned int attachment;
+ unsigned int name;
+ unsigned int pitch;
+ unsigned int cpp;
+ unsigned int flags;
+} VA_DRI2Buffer;
+
+extern Bool
+VA_DRI2QueryExtension(Display *display, int *eventBase, int *errorBase);
+extern Bool
+VA_DRI2QueryVersion(Display *display, int *major, int *minor);
+extern Bool
+VA_DRI2Connect(Display *display, XID window,
+ char **driverName, char **deviceName);
+extern Bool
+VA_DRI2Authenticate(Display *display, XID window, drm_magic_t magic);
+extern void
+VA_DRI2CreateDrawable(Display *display, XID drawable);
+extern void
+VA_DRI2DestroyDrawable(Display *display, XID handle);
+extern VA_DRI2Buffer *
+VA_DRI2GetBuffers(Display *dpy, XID drawable,
+ int *width, int *height,
+ unsigned int *attachments, int count,
+ int *outcount);
+#if 1
+extern void
+VA_DRI2CopyRegion(Display *dpy, XID drawable, XserverRegion region,
+ CARD32 dest, CARD32 src);
+#endif
+#endif
diff --git a/src/x11/va_dri2str.h b/src/x11/va_dri2str.h
new file mode 100644
index 0000000..db10e16
--- /dev/null
+++ b/src/x11/va_dri2str.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright � 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Soft-
+ * ware"), to deal in the Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge, publish, distribute,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, provided that the above copyright
+ * notice(s) and this permission notice appear in all copies of the Soft-
+ * ware and that both the above copyright notice(s) and this permission
+ * notice appear in supporting documentation.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
+ * ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY
+ * RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
+ * THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSE-
+ * QUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFOR-
+ * MANCE OF THIS SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or
+ * other dealings in this Software without prior written authorization of
+ * the copyright holder.
+ *
+ * Authors:
+ * Kristian H�gsberg (krh at redhat.com)
+ */
+#ifndef _DRI2_PROTO_H_
+#define _DRI2_PROTO_H_
+
+#define DRI2_NAME "DRI2"
+#define DRI2_MAJOR 1
+#define DRI2_MINOR 0
+
+#define DRI2NumberErrors 0
+#define DRI2NumberEvents 0
+#define DRI2NumberRequests 7
+
+#define X_DRI2QueryVersion 0
+#define X_DRI2Connect 1
+#define X_DRI2Authenticate 2
+#define X_DRI2CreateDrawable 3
+#define X_DRI2DestroyDrawable 4
+#define X_DRI2GetBuffers 5
+#define X_DRI2CopyRegion 6
+
+typedef struct {
+ CARD32 attachment B32;
+ CARD32 name B32;
+ CARD32 pitch B32;
+ CARD32 cpp B32;
+ CARD32 flags B32;
+} xDRI2Buffer;
+
+typedef struct {
+ CARD8 reqType;
+ CARD8 dri2Reqtype;
+ CARD16 length B16;
+ CARD32 majorVersion B32;
+ CARD32 minorVersion B32;
+} xDRI2QueryVersionReq;
+#define sz_xDRI2QueryVersionReq 12
+
+typedef struct {
+ BYTE type; /* X_Reply */
+ BYTE pad1;
+ CARD16 sequenceNumber B16;
+ CARD32 length B32;
+ CARD32 majorVersion B32;
+ CARD32 minorVersion B32;
+ CARD32 pad2 B32;
+ CARD32 pad3 B32;
+ CARD32 pad4 B32;
+ CARD32 pad5 B32;
+} xDRI2QueryVersionReply;
+#define sz_xDRI2QueryVersionReply 32
+
+typedef struct {
+ CARD8 reqType;
+ CARD8 dri2Reqtype;
+ CARD16 length B16;
+ CARD32 window B32;
+ CARD32 drivertype B32;
+} xDRI2ConnectReq;
+#define sz_xDRI2ConnectReq 12
+
+typedef struct {
+ BYTE type; /* X_Reply */
+ BYTE pad1;
+ CARD16 sequenceNumber B16;
+ CARD32 length B32;
+ CARD32 driverNameLength B32;
+ CARD32 deviceNameLength B32;
+ CARD32 pad2 B32;
+ CARD32 pad3 B32;
+ CARD32 pad4 B32;
+ CARD32 pad5 B32;
+} xDRI2ConnectReply;
+#define sz_xDRI2ConnectReply 32
+
+typedef struct {
+ CARD8 reqType;
+ CARD8 dri2Reqtype;
+ CARD16 length B16;
+ CARD32 window B32;
+ CARD32 magic B32;
+} xDRI2AuthenticateReq;
+#define sz_xDRI2AuthenticateReq 12
+
+typedef struct {
+ BYTE type; /* X_Reply */
+ BYTE pad1;
+ CARD16 sequenceNumber B16;
+ CARD32 length B32;
+ CARD32 authenticated B32;
+ CARD32 pad2 B32;
+ CARD32 pad3 B32;
+ CARD32 pad4 B32;
+ CARD32 pad5 B32;
+ CARD32 pad6 B32;
+} xDRI2AuthenticateReply;
+#define sz_xDRI2AuthenticateReply 32
+
+typedef struct {
+ CARD8 reqType;
+ CARD8 dri2Reqtype;
+ CARD16 length B16;
+ CARD32 drawable B32;
+} xDRI2CreateDrawableReq;
+#define sz_xDRI2CreateDrawableReq 8
+
+typedef struct {
+ CARD8 reqType;
+ CARD8 dri2Reqtype;
+ CARD16 length B16;
+ CARD32 drawable B32;
+} xDRI2DestroyDrawableReq;
+#define sz_xDRI2DestroyDrawableReq 8
+
+typedef struct {
+ CARD8 reqType;
+ CARD8 dri2Reqtype;
+ CARD16 length B16;
+ CARD32 drawable B32;
+ CARD32 count B32;
+} xDRI2GetBuffersReq;
+#define sz_xDRI2GetBuffersReq 12
+
+typedef struct {
+ BYTE type; /* X_Reply */
+ BYTE pad1;
+ CARD16 sequenceNumber B16;
+ CARD32 length B32;
+ CARD32 width B32;
+ CARD32 height B32;
+ CARD32 count B32;
+ CARD32 pad2 B32;
+ CARD32 pad3 B32;
+ CARD32 pad4 B32;
+} xDRI2GetBuffersReply;
+#define sz_xDRI2GetBuffersReply 32
+
+typedef struct {
+ CARD8 reqType;
+ CARD8 dri2Reqtype;
+ CARD16 length B16;
+ CARD32 drawable B32;
+ CARD32 region B32;
+ CARD32 dest B32;
+ CARD32 src B32;
+} xDRI2CopyRegionReq;
+#define sz_xDRI2CopyRegionReq 20
+
+typedef struct {
+ BYTE type; /* X_Reply */
+ BYTE pad1;
+ CARD16 sequenceNumber B16;
+ CARD32 length B32;
+ CARD32 pad2 B32;
+ CARD32 pad3 B32;
+ CARD32 pad4 B32;
+ CARD32 pad5 B32;
+ CARD32 pad6 B32;
+ CARD32 pad7 B32;
+} xDRI2CopyRegionReply;
+#define sz_xDRI2CopyRegionReply 32
+
+#endif
diff --git a/src/x11/va_dri2tokens.h b/src/x11/va_dri2tokens.h
new file mode 100644
index 0000000..d3c31f3
--- /dev/null
+++ b/src/x11/va_dri2tokens.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright � 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Soft-
+ * ware"), to deal in the Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge, publish, distribute,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, provided that the above copyright
+ * notice(s) and this permission notice appear in all copies of the Soft-
+ * ware and that both the above copyright notice(s) and this permission
+ * notice appear in supporting documentation.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
+ * ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY
+ * RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
+ * THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSE-
+ * QUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFOR-
+ * MANCE OF THIS SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or
+ * other dealings in this Software without prior written authorization of
+ * the copyright holder.
+ *
+ * Authors:
+ * Kristian H�gsberg (krh at redhat.com)
+ */
+#ifndef _DRI2_TOKENS_H_
+#define _DRI2_TOKENS_H_
+
+#define DRI2BufferFrontLeft 0
+#define DRI2BufferBackLeft 1
+#define DRI2BufferFrontRight 2
+#define DRI2BufferBackRight 3
+#define DRI2BufferDepth 4
+#define DRI2BufferStencil 5
+#define DRI2BufferAccum 6
+#define DRI2BufferFakeFrontLeft 7
+#define DRI2BufferFakeFrontRight 8
+
+#define DRI2DriverDRI 0
+
+#endif
diff --git a/utests/.gitignore b/utests/.gitignore
new file mode 100644
index 0000000..90f80fc
--- /dev/null
+++ b/utests/.gitignore
@@ -0,0 +1,15 @@
+compiler_box_blur.bmp
+compiler_box_blur_float.bmp
+compiler_clod.bmp
+compiler_julia.bmp
+compiler_julia_no_break.bmp
+compiler_mandelbrot.bmp
+compiler_mandelbrot_alternate.bmp
+compiler_menger_sponge_no_shadow.bmp
+compiler_nautilus.bmp
+compiler_ribbon.bmp
+flat_address_space
+libutests.so
+utest_run
+generated
+utest_generator.pyc
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
new file mode 100644
index 0000000..9c531de
--- /dev/null
+++ b/utests/CMakeLists.txt
@@ -0,0 +1,241 @@
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+
+##### Math Function Part:
+EXEC_PROGRAM(mkdir ${CMAKE_CURRENT_SOURCE_DIR} ARGS generated -p)
+EXEC_PROGRAM(${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR} ARGS utest_math_gen.py OUTPUT_VARIABLE GEN_MATH_STRING)
+string(REGEX REPLACE " " ";" ADDMATHFUNC ${GEN_MATH_STRING})
+
+string(REGEX REPLACE "generated/([^\ ]*)\\.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../kernels/\\1.cl" KERNEL_MATH_LIST ${GEN_MATH_STRING})
+string(REGEX REPLACE " " ";" KERNEL_MATH_LIST ${KERNEL_MATH_LIST})
+string(REGEX REPLACE "generated/([^\ ]*)\\.cpp" "\\1.cl" KERNEL_GITIGNORE_LIST ${GEN_MATH_STRING})
+set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "generated;${KERNEL_MATH_LIST}")
+
+configure_file (
+ "setenv.sh.in"
+ "setenv.sh"
+ )
+
+#XXX only need GL if required
+link_directories (${LLVM_LIBRARY_DIR} ${OPENGL_LIBDIR} ${DRM_LIBDIR})
+set (utests_sources
+ utest_error.c
+ compiler_basic_arithmetic.cpp
+ compiler_displacement_map_element.cpp
+ compiler_shader_toy.cpp
+ compiler_mandelbrot.cpp
+ compiler_mandelbrot_alternate.cpp
+ compiler_box_blur_float.cpp
+ compiler_box_blur_image.cpp
+ compiler_box_blur.cpp
+ compiler_insert_to_constant.cpp
+ compiler_argument_structure.cpp
+ compiler_arith_shift_right.cpp
+ compiler_mixed_pointer.cpp
+ compiler_array0.cpp
+ compiler_array.cpp
+ compiler_array1.cpp
+ compiler_array2.cpp
+ compiler_array3.cpp
+ compiler_byte_scatter.cpp
+ compiler_ceil.cpp
+ compiler_clz_short.cpp
+ compiler_clz_int.cpp
+ compiler_convert_uchar_sat.cpp
+ compiler_copy_buffer.cpp
+ compiler_copy_image.cpp
+ compiler_copy_image_1d.cpp
+ compiler_copy_image_3d.cpp
+ compiler_copy_buffer_row.cpp
+ compiler_degrees.cpp
+ compiler_step.cpp
+ compiler_fabs.cpp
+ compiler_abs.cpp
+ compiler_abs_diff.cpp
+ compiler_fill_image.cpp
+ compiler_fill_image0.cpp
+ compiler_fill_image_1d.cpp
+ compiler_fill_image_3d.cpp
+ compiler_fill_image_3d_2.cpp
+ compiler_function_argument0.cpp
+ compiler_function_argument1.cpp
+ compiler_function_argument2.cpp
+ compiler_function_argument.cpp
+ compiler_function_constant0.cpp
+ compiler_function_constant1.cpp
+ compiler_function_constant.cpp
+ compiler_global_constant.cpp
+ compiler_global_constant_2.cpp
+ compiler_group_size.cpp
+ compiler_hadd.cpp
+ compiler_if_else.cpp
+ compiler_integer_division.cpp
+ compiler_integer_remainder.cpp
+ compiler_insert_vector.cpp
+ compiler_lower_return0.cpp
+ compiler_lower_return1.cpp
+ compiler_lower_return2.cpp
+ compiler_mad_hi.cpp
+ compiler_mul_hi.cpp
+ compiler_mad24.cpp
+ compiler_mul24.cpp
+ compiler_multiple_kernels.cpp
+ compiler_radians.cpp
+ compiler_rhadd.cpp
+ compiler_rotate.cpp
+ compiler_saturate.cpp
+ compiler_saturate_sub.cpp
+ compiler_shift_right.cpp
+ compiler_short_scatter.cpp
+ compiler_smoothstep.cpp
+ compiler_uint2_copy.cpp
+ compiler_uint3_copy.cpp
+ compiler_uint8_copy.cpp
+ compiler_uint16_copy.cpp
+ compiler_uint3_unaligned_copy.cpp
+ compiler_upsample_int.cpp
+ compiler_upsample_long.cpp
+ compiler_unstructured_branch0.cpp
+ compiler_unstructured_branch1.cpp
+ compiler_unstructured_branch2.cpp
+ compiler_unstructured_branch3.cpp
+ compiler_write_only_bytes.cpp
+ compiler_write_only.cpp
+ compiler_write_only_shorts.cpp
+ compiler_switch.cpp
+ compiler_math.cpp
+ compiler_atomic_functions.cpp
+ compiler_async_copy.cpp
+ compiler_async_stride_copy.cpp
+ compiler_insn_selection_min.cpp
+ compiler_insn_selection_max.cpp
+ compiler_insn_selection_masked_min_max.cpp
+ compiler_load_bool_imm.cpp
+ compiler_global_memory_barrier.cpp
+ compiler_local_memory_two_ptr.cpp
+ compiler_local_memory_barrier.cpp
+ compiler_local_memory_barrier_wg64.cpp
+ compiler_local_memory_barrier_2.cpp
+ compiler_local_slm.cpp
+ compiler_movforphi_undef.cpp
+ compiler_volatile.cpp
+ compiler_copy_image1.cpp
+ compiler_get_image_info.cpp
+ compiler_get_image_info_array.cpp
+ compiler_vect_compare.cpp
+ compiler_vector_load_store.cpp
+ compiler_vector_inc.cpp
+ compiler_cl_finish.cpp
+ get_cl_info.cpp
+ builtin_atan2.cpp
+ builtin_bitselect.cpp
+ builtin_frexp.cpp
+ builtin_mad_sat.cpp
+ builtin_modf.cpp
+ builtin_nextafter.cpp
+ builtin_remquo.cpp
+ builtin_shuffle.cpp
+ builtin_shuffle2.cpp
+ builtin_sign.cpp
+ builtin_lgamma.cpp
+ builtin_lgamma_r.cpp
+ builtin_tgamma.cpp
+ buildin_work_dim.cpp
+ builtin_global_size.cpp
+ builtin_local_size.cpp
+ builtin_global_id.cpp
+ builtin_num_groups.cpp
+ builtin_local_id.cpp
+ builtin_acos_asin.cpp
+ builtin_pow.cpp
+ builtin_exp.cpp
+ builtin_convert_sat.cpp
+ sub_buffer.cpp
+ runtime_createcontext.cpp
+ runtime_null_kernel_arg.cpp
+ runtime_event.cpp
+ runtime_barrier_list.cpp
+ runtime_marker_list.cpp
+ runtime_compile_link.cpp
+ compiler_long.cpp
+ compiler_long_2.cpp
+ compiler_long_convert.cpp
+ compiler_long_shl.cpp
+ compiler_long_shr.cpp
+ compiler_long_asr.cpp
+ compiler_long_mult.cpp
+ compiler_long_cmp.cpp
+ compiler_function_argument3.cpp
+ compiler_function_qualifiers.cpp
+ compiler_bool_cross_basic_block.cpp
+ compiler_private_data_overflow.cpp
+ compiler_getelementptr_bitcast.cpp
+ compiler_simd_any.cpp
+ compiler_simd_all.cpp
+ compiler_double_precision.cpp
+ load_program_from_bin_file.cpp
+ load_program_from_gen_bin.cpp
+ get_arg_info.cpp
+ profiling_exec.cpp
+ enqueue_copy_buf.cpp
+ enqueue_copy_buf_unaligned.cpp
+ test_printf.cpp
+ enqueue_fill_buf.cpp
+ enqueue_built_in_kernels.cpp
+ builtin_kernel_max_global_size.cpp
+ image_1D_buffer.cpp
+ compare_image_2d_and_1d_array.cpp
+ compiler_constant_expr.cpp
+ utest_assert.cpp
+ utest.cpp
+ utest_file_map.cpp
+ utest_helper.cpp)
+
+SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil)
+
+if(GEN_PCI_ID)
+ ADD_CUSTOM_COMMAND(
+ OUTPUT ${kernel_bin}.bin
+ COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin -t${GEN_PCI_ID}
+ DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl)
+else(GEN_PCI_ID)
+ ADD_CUSTOM_COMMAND(
+ OUTPUT ${kernel_bin}.bin
+ COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin
+ DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl)
+endif(GEN_PCI_ID)
+
+ADD_CUSTOM_TARGET(kernel_bin.bin
+ DEPENDS ${kernel_bin}.bin)
+
+add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/generated
+ COMMAND mkdir ${CMAKE_CURRENT_SOURCE_DIR}/generated -p
+ COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/utest_math_gen.py > /dev/null 2>&1
+ COMMAND echo ${KERNEL_GITIGNORE_LIST} |sed 's/ /\\n/g' > ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/.gitignore
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ )
+add_custom_target(utest_generator
+ DEPENDS generated
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ )
+
+if (EGL_FOUND AND MESA_SOURCE_FOUND)
+SET(utests_sources ${utests_sources} compiler_fill_gl_image.cpp)
+SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
+SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
+SET(UTESTS_REQUIRED_EGL_LIB ${EGL_LIBRARIES})
+else()
+SET(UTESTS_REQUIRED_EGL_LIB "")
+endif()
+
+ADD_LIBRARY(utests SHARED ${ADDMATHFUNC} ${utests_sources})
+
+TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${UTESTS_REQUIRED_EGL_LIB} ${CMAKE_THREAD_LIBS_INIT})
+
+ADD_EXECUTABLE(utest_run utest_run.cpp)
+TARGET_LINK_LIBRARIES(utest_run utests)
+ADD_DEPENDENCIES (utest_run kernel_bin.bin)
+ADD_DEPENDENCIES (utests utest_generator)
+
+ADD_EXECUTABLE(flat_address_space runtime_flat_address_space.cpp)
+TARGET_LINK_LIBRARIES(flat_address_space utests)
diff --git a/utests/buildin_work_dim.cpp b/utests/buildin_work_dim.cpp
new file mode 100644
index 0000000..d678c0f
--- /dev/null
+++ b/utests/buildin_work_dim.cpp
@@ -0,0 +1,37 @@
+#include "utest_helper.hpp"
+
+static void buildin_work_dim(void)
+{
+ // Setup kernel and buffers
+
+ int result, err;
+ OCL_CREATE_KERNEL("buildin_work_dim");
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ globals[0] = 1;
+ globals[1] = 1;
+ globals[2] = 1;
+ locals[0] = 1;
+ locals[1] = 1;
+ locals[2] = 1;
+
+ for( int i=1; i <= 3; i++ )
+ {
+
+ // Run the kernel
+ OCL_NDRANGE(i);
+
+ err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &result, 0, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to read output array! %d\n", err);
+ exit(1);
+ }
+
+ OCL_ASSERT( result == i);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(buildin_work_dim);
diff --git a/utests/builtin_acos_asin.cpp b/utests/builtin_acos_asin.cpp
new file mode 100644
index 0000000..0187226
--- /dev/null
+++ b/utests/builtin_acos_asin.cpp
@@ -0,0 +1,87 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+#define udebug 0
+#define printf_c(...) \
+{\
+ printf("\033[1m\033[40;31m");\
+ printf( __VA_ARGS__ );\
+ printf("\033[0m");\
+}
+
+const float input_data[] = {-30, -1, -0.92, -0.5, -0.09, 0, 0.09, 0.5, 0.92, 1, 30};
+const int count_input = sizeof(input_data) / sizeof(input_data[0]);
+const int max_function = 5;
+
+static void cpu_compiler_math(float *dst, const float *src)
+{
+ const float x = *src;
+
+ dst[0] = acos(x);
+ dst[1] = acosh(x);
+ dst[2] = asin(x);
+ dst[3] = asinh(x);
+ dst[4] = x;
+}
+
+static void builtin_acos_asin(void)
+{
+ // Setup kernel and buffers
+ int k, i, index_cur;
+ float gpu_data[max_function * count_input] = {0}, cpu_data[max_function * count_input] = {0};
+
+ OCL_CREATE_KERNEL("builtin_acos_asin");
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, count_input * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], CL_MEM_READ_WRITE, sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+ globals[0] = count_input;
+ locals[0] = 1;
+
+ clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data, 0, NULL, NULL);
+ clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), &max_function , 0, NULL, NULL);
+
+ // Run the kernel
+ OCL_NDRANGE( 1 );
+
+ clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(float) * max_function * count_input, gpu_data, 0, NULL, NULL);
+
+ for (k = 0; (uint)k < count_input; k++)
+ {
+ cpu_compiler_math( cpu_data + k * max_function, input_data + k);
+
+ for (i = 0; i < max_function; i++)
+ {
+ index_cur = k * max_function + i;
+#if udebug
+ if (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])){
+ printf_c("%d/%d: %f -> gpu:%f cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+ }
+ else if (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])){
+ printf_c("%d/%d: %f -> gpu:%f cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+ }
+ else if(fabs(gpu_data[index_cur] - cpu_data[index_cur]) > 1e-3f){
+ printf_c("%d/%d: %f -> gpu:%f cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+ }
+ else
+ printf("%d/%d: %f -> gpu:%f cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+#else
+ if (isinf(cpu_data[index_cur]))
+ OCL_ASSERT(isinf(gpu_data[index_cur]));
+ else if (isnan(cpu_data[index_cur]))
+ OCL_ASSERT(isnan(gpu_data[index_cur]));
+ else
+ {
+ OCL_ASSERT(fabs(gpu_data[index_cur] - cpu_data[index_cur]) < 1e-3f);
+ }
+#endif
+ }
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_acos_asin)
diff --git a/utests/builtin_atan2.cpp b/utests/builtin_atan2.cpp
new file mode 100644
index 0000000..29dd7b4
--- /dev/null
+++ b/utests/builtin_atan2.cpp
@@ -0,0 +1,43 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_atan2(void) {
+ const int n = 1024;
+ float y[n], x[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_atan2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; ++i) {
+ y[i] = ((float*) buf_data[0])[i] = (rand()&255) * 0.01f;
+ x[i] = ((float*) buf_data[1])[i] = (rand()&255) * 0.01f;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(2);
+ float *dst = (float*) buf_data[2];
+ for (int i = 0; i < n; ++i) {
+ float cpu = atan2f(y[i], x[i]);
+ float gpu = dst[i];
+ if (fabsf(cpu - gpu) >= 1e-2) {
+ printf("%f %f %f %f\n", y[i], x[i], cpu, gpu);
+ OCL_ASSERT(0);
+ }
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION (builtin_atan2);
diff --git a/utests/builtin_bitselect.cpp b/utests/builtin_bitselect.cpp
new file mode 100644
index 0000000..37fb8df
--- /dev/null
+++ b/utests/builtin_bitselect.cpp
@@ -0,0 +1,50 @@
+#include "utest_helper.hpp"
+
+int as_int(float f) {
+ void *p = &f;
+ return *(int *)p;
+}
+
+int cpu(int a, int b, int c) {
+ return (a & ~c) | (b & c);
+}
+
+void builtin_bitselect(void)
+{
+ const int n = 32;
+ float src1[n], src2[n], src3[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_bitselect");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i) {
+ src1[i] = ((float*)buf_data[0])[i] = rand() * 0.1f;
+ src2[i] = ((float*)buf_data[1])[i] = rand() * 0.1f;
+ src3[i] = ((float*)buf_data[2])[i] = rand() * 0.1f;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(3);
+ for (int i = 0; i < n; ++i)
+ OCL_ASSERT(((int*)buf_data[3])[i] == cpu(as_int(src1[i]), as_int(src2[i]), as_int(src3[i])));
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_bitselect);
diff --git a/utests/builtin_convert_sat.cpp b/utests/builtin_convert_sat.cpp
new file mode 100644
index 0000000..7272057
--- /dev/null
+++ b/utests/builtin_convert_sat.cpp
@@ -0,0 +1,80 @@
+#include <cstdint>
+#include "utest_helper.hpp"
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+
+int64_t my_rand(void) {
+ int64_t x = rand() - RAND_MAX/2;
+ int64_t y = rand() - RAND_MAX/2;
+ return x * y;
+}
+
+#define DEF2(DST_TYPE, SRC_TYPE, DST_MIN, DST_MAX, REAL_SRC_TYPE) \
+void builtin_convert_ ## SRC_TYPE ## _to_ ## DST_TYPE ## _sat(void) \
+{ \
+ const int n = 128; \
+ OCL_CREATE_KERNEL_FROM_FILE("builtin_convert_sat", "builtin_convert_" # SRC_TYPE "_to_" # DST_TYPE "_sat"); \
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(REAL_SRC_TYPE), NULL); \
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(DST_TYPE), NULL); \
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); \
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); \
+ globals[0] = n; \
+ locals[0] = 16; \
+ OCL_MAP_BUFFER(0); \
+ for (int i = 0; i < n; i++) \
+ ((REAL_SRC_TYPE *)buf_data[0])[i] = my_rand(); \
+ OCL_UNMAP_BUFFER(0); \
+ OCL_NDRANGE(1); \
+ OCL_MAP_BUFFER(0); \
+ OCL_MAP_BUFFER(1); \
+ for (int i = 0; i < n; i++) { \
+ REAL_SRC_TYPE src = ((REAL_SRC_TYPE *)buf_data[0])[i]; \
+ DST_TYPE dst; \
+ if ((double)src > (double)DST_MAX) \
+ dst = DST_MAX; \
+ else if ((double)src < (double)DST_MIN) \
+ dst = DST_MIN; \
+ else \
+ dst = src; \
+ OCL_ASSERT(((DST_TYPE *)buf_data[1])[i] == dst); \
+ } \
+ OCL_UNMAP_BUFFER(0); \
+ OCL_UNMAP_BUFFER(1); \
+} \
+MAKE_UTEST_FROM_FUNCTION(builtin_convert_ ## SRC_TYPE ## _to_ ## DST_TYPE ## _sat);
+
+#define DEF(DST_TYPE, SRC_TYPE, DST_MIN, DST_MAX) \
+ DEF2(DST_TYPE, SRC_TYPE, DST_MIN, DST_MAX, SRC_TYPE)
+
+DEF(char, uchar, -128, 127);
+DEF(char, short, -128, 127);
+DEF(char, ushort, -128, 127);
+DEF(char, int, -128, 127);
+DEF(char, uint, -128, 127);
+DEF2(char, long, -128, 127, int64_t);
+DEF(char, float, -128, 127);
+DEF(uchar, char, 0, 255);
+DEF(uchar, short, 0, 255);
+DEF(uchar, ushort, 0, 255);
+DEF(uchar, int, 0, 255);
+DEF(uchar, uint, 0, 255);
+DEF2(uchar, long, 0, 255, int64_t);
+DEF(uchar, float, 0, 255);
+DEF(short, ushort, -32768, 32767);
+DEF(short, int, -32768, 32767);
+DEF(short, uint, -32768, 32767);
+DEF2(short, long, -32768, 32767, int64_t);
+DEF(short, float, -32768, 32767);
+DEF(ushort, short, 0, 65535);
+DEF(ushort, int, 0, 65535);
+DEF(ushort, uint, 0, 65535);
+DEF2(ushort, long, 0, 65535, int64_t);
+DEF(ushort, float, 0, 65535);
+DEF(int, uint, -0x7FFFFFFF-1, 0x7FFFFFFF);
+DEF2(int, long, -0x7FFFFFFF-1, 0x7FFFFFFF, int64_t);
+DEF(int, float, -0x7FFFFFFF-1, 0x7FFFFFFF);
+DEF(uint, int, 0, 0xffffffffu);
+DEF2(uint, long, 0, 0xffffffffu, int64_t);
+DEF(uint, float, 0, 0xffffffffu);
+#undef DEF
diff --git a/utests/builtin_exp.cpp b/utests/builtin_exp.cpp
new file mode 100644
index 0000000..d5288c8
--- /dev/null
+++ b/utests/builtin_exp.cpp
@@ -0,0 +1,102 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+#define udebug 0
+
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_ULP (1.0e-6f)
+
+#define printf_c(...) \
+{\
+ printf("\033[1m\033[40;31m");\
+ printf( __VA_ARGS__ );\
+ printf("\033[0m");\
+}
+
+const float input_data[] = {FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, 80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0 };
+const int count_input = sizeof(input_data) / sizeof(input_data[0]);
+const int max_function = 5;
+
+static void cpu_compiler_math(float *dst, const float *src)
+{
+ const float x = *src;
+
+ dst[0] = exp(x);
+ dst[1] = exp2(x);
+ dst[2] = exp10(x);
+ dst[3] = expm1(x);
+ dst[4] = x;
+}
+
+static void builtin_exp(void)
+{
+ // Setup kernel and buffers
+ int k, i, index_cur;
+ float gpu_data[max_function * count_input] = {0}, cpu_data[max_function * count_input] = {0};
+ float diff;
+ char log[256] = {0};
+
+ OCL_CREATE_KERNEL("builtin_exp");
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, count_input * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], CL_MEM_READ_WRITE, sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+ globals[0] = count_input;
+ locals[0] = 1;
+
+ clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data, 0, NULL, NULL);
+ clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), &max_function , 0, NULL, NULL);
+
+ // Run the kernel
+ OCL_NDRANGE( 1 );
+
+ clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(float) * max_function * count_input, gpu_data, 0, NULL, NULL);
+
+ for (k = 0; (uint)k < count_input; k++)
+ {
+ cpu_compiler_math( cpu_data + k * max_function, input_data + k);
+
+ for (i = 0; i < max_function; i++)
+ {
+ index_cur = k * max_function + i;
+ diff = fabs(gpu_data[index_cur]-cpu_data[index_cur]);
+ sprintf(log, "%d/%d: %f -> gpu:%f cpu:%f diff:%f expect:%f\n", \
+ k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur], \
+ diff/gpu_data[index_cur], 3 * FLT_ULP);
+
+#if udebug
+ if (isinf(cpu_data[index_cur]) && isinf(gpu_data[index_cur])){
+ printf(log);
+ }
+ else if (isnan(cpu_data[index_cur]) && isnan(gpu_data[index_cur])){
+ printf(log);
+ }
+ else if( diff / cpu_data[index_cur] < 3 * FLT_ULP \
+ && ( gpu_data[index_cur] > FLT_ULP || cpu_data[index_cur] > FLT_ULP )){
+ printf(log);
+ }
+ else if ( gpu_data[index_cur] < FLT_ULP && gpu_data[index_cur] < FLT_ULP)
+ printf(log);
+ else
+ printf_c(log);
+#else
+ if (isinf(cpu_data[index_cur]))
+ OCL_ASSERTM(isinf(gpu_data[index_cur]), log);
+ else if (isnan(cpu_data[index_cur]))
+ OCL_ASSERTM(isnan(gpu_data[index_cur]), log);
+ else if ( gpu_data[index_cur] > FLT_ULP || cpu_data[index_cur] > FLT_ULP)
+ OCL_ASSERTM(fabs( diff / cpu_data[index_cur]) < 3 * FLT_ULP, log);
+ else
+ OCL_ASSERTM(fabs(diff) < 3 * FLT_ULP, log);
+#endif
+ }
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_exp)
diff --git a/utests/builtin_frexp.cpp b/utests/builtin_frexp.cpp
new file mode 100644
index 0000000..75dac3b
--- /dev/null
+++ b/utests/builtin_frexp.cpp
@@ -0,0 +1,50 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_frexp(void)
+{
+ const int n = 32;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_frexp");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ src[0] = ((float*)buf_data[0])[0] = 0.f;
+ src[1] = ((float*)buf_data[0])[1] = -0.f;
+ src[2] = ((float*)buf_data[0])[2] = nanf("");
+ src[3] = ((float*)buf_data[0])[3] = INFINITY;
+ src[4] = ((float*)buf_data[0])[4] = -INFINITY;
+ for (int i = 5; i < n; ++i)
+ src[i] = ((float*)buf_data[0])[i] = (rand() & 255) * 0.1f - 12.8f;
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ float *dst = (float*)buf_data[1];
+ int *exp = (int*)buf_data[2];
+ int w;
+ OCL_ASSERT(dst[0] == 0.f && exp[0] == 0);
+ OCL_ASSERT(dst[1] == -0.f && exp[1] == 0);
+ OCL_ASSERT(isnanf(dst[2]));
+ OCL_ASSERT(dst[3] == INFINITY);
+ OCL_ASSERT(dst[4] == -INFINITY);
+ for (int i = 5; i < n; ++i) {
+ OCL_ASSERT(fabsf(dst[i] - frexpf(src[i], &w)) < 1e-5);
+ OCL_ASSERT(exp[i] == w);
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_frexp);
diff --git a/utests/builtin_global_id.cpp b/utests/builtin_global_id.cpp
new file mode 100644
index 0000000..9601cab
--- /dev/null
+++ b/utests/builtin_global_id.cpp
@@ -0,0 +1,77 @@
+/*
+According to the OpenCL v1.1 & v1.2 chapter 6.11.
+Now define global size as following:
+ globals[0] = 3;
+ globals[1] = 4;
+ globals[2] = 5;
+
+Kernel:
+id = get_global_id(0) + get_global_id(1)*3 + get_global_id(2)*3*4
+
+dimension:1
+ 0 1 2
+dimension:2
+ 0 1 2
+ 3 4 5
+ 6 7 8
+ 9 10 11
+dimension:3
+ 0 1 2 12 13 14 24 25 26 36 37 38 48 49 50
+ 3 4 5 15 16 17 27 28 29 39 40 41 51 52 53
+ 6 7 8 18 19 20 30 31 32 42 43 44 54 55 56
+ 9 10 11 21 22 23 33 34 35 45 46 47 57 58 59
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_global_id(void)
+{
+
+ // Setup kernel and buffers
+ int dim, global_id[80], err, i, buf_len=1;
+ OCL_CREATE_KERNEL("builtin_global_id");
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*80, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ for( dim=1; dim <= 3; dim++ )
+ {
+ buf_len = 1;
+ for(i=1; i <= dim; i++)
+ {
+ globals[i - 1] = 2 + i;
+ locals[i - 1] = 2 + i;
+ buf_len *= 2 + i;
+ }
+ for(i=dim+1; i <= 3; i++)
+ {
+ globals[i - 1] = 0;
+ locals[i - 1] = 0;
+ }
+
+ // Run the kernel
+ OCL_NDRANGE( dim );
+ clFinish(queue);
+
+ err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int) * buf_len, &global_id, 0, NULL, NULL);
+
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to read output array! %d\n", err);
+ exit(1);
+ }
+
+#if udebug
+ for(i = 0; i < buf_len; i++)
+ {
+ printf("%2d ", global_id[i]);
+ if ((i + 1) % 3 == 0) printf("\n");
+ }
+#endif
+
+ for( i = 0; i < buf_len; i++)
+ OCL_ASSERT( global_id[i] == i);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_global_id);
diff --git a/utests/builtin_global_size.cpp b/utests/builtin_global_size.cpp
new file mode 100644
index 0000000..094e019
--- /dev/null
+++ b/utests/builtin_global_size.cpp
@@ -0,0 +1,108 @@
+/*
+According to the OpenCL v1.1 & v1.2 chapter 6.11, the behavior of function get_global_size should be as following:
+
+ globals[0] = 3;
+ globals[1] = 4;
+ globals[2] = 5;
+
+#ifdef CL_VERSION_1_2 | CL_VERSION_1_1:
+get_global_size(-1) = 1 (dimension:1)
+get_global_size(0) = 3 (dimension:1)
+get_global_size(1) = 1 (dimension:1)
+get_global_size(2) = 1 (dimension:1)
+
+get_global_size(-1) = 1 (dimension:2)
+get_global_size(0) = 3 (dimension:2)
+get_global_size(1) = 4 (dimension:2)
+get_global_size(2) = 1 (dimension:2)
+get_global_size(3) = 1 (dimension:2)
+
+get_global_size(-1) = 1 (dimension:3)
+get_global_size(0) = 3 (dimension:3)
+get_global_size(1) = 4 (dimension:3)
+get_global_size(2) = 5 (dimension:3)
+get_global_size(3) = 1 (dimension:3)
+get_global_size(4) = 1 (dimension:3)
+
+#ifdef CL_VERSION_1_0:
+get_global_size(-1) = 0 (dimension:1)
+get_global_size(0) = 3 (dimension:1)
+get_global_size(1) = 0 (dimension:1)
+get_global_size(2) = 0 (dimension:1)
+
+get_global_size(-1) = 0 (dimension:2)
+get_global_size(0) = 3 (dimension:2)
+get_global_size(1) = 4 (dimension:2)
+get_global_size(2) = 0 (dimension:2)
+get_global_size(3) = 1 (dimension:2)
+
+get_global_size(-1) = 0 (dimension:3)
+get_global_size(0) = 3 (dimension:3)
+get_global_size(1) = 4 (dimension:3)
+get_global_size(2) = 5 (dimension:3)
+get_global_size(3) = 0 (dimension:3)
+get_global_size(4) = 0 (dimension:3)
+
+*/
+#include "utest_helper.hpp"
+static void builtin_global_size(void)
+{
+
+ // Setup kernel and buffers
+ int dim, dim_arg_global, global_size, err;
+ OCL_CREATE_KERNEL("builtin_global_size");
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ globals[0] = 3;
+ globals[1] = 4;
+ globals[2] = 5;
+ locals[0] = 1;
+ locals[1] = 1;
+ locals[2] = 1;
+
+ for( dim=1; dim <= 3; dim++ )
+ {
+
+ for( dim_arg_global = -1; dim_arg_global <= dim + 1; dim_arg_global++ )
+ {
+
+ err = clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, sizeof(int), &dim_arg_global, 0, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to write to source array!\n");
+ exit(1);
+ }
+
+ // Run the kernel
+ OCL_NDRANGE( dim );
+
+ err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &global_size, 0, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to read output array! %d\n", err);
+ exit(1);
+ }
+
+ //printf("get_global_size(%d) = %d (dimension:%d)\n", dim_arg_global, global_size, dim);
+
+ if ( dim_arg_global >= 0 && dim_arg_global < dim)
+ OCL_ASSERT( global_size == dim_arg_global + 3);
+ else
+ {
+ #if defined(CL_VERSION_1_2) || defined(CL_VERSION_1_1)
+ OCL_ASSERT( global_size == 1);
+ #elif defined(CL_VERSION_1_0)
+ OCL_ASSERT( global_size == 0);
+ #else
+ OCL_ASSERT( global_size == 1);
+ #endif
+ }
+ }
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_global_size);
diff --git a/utests/builtin_kernel_max_global_size.cpp b/utests/builtin_kernel_max_global_size.cpp
new file mode 100644
index 0000000..c777564
--- /dev/null
+++ b/utests/builtin_kernel_max_global_size.cpp
@@ -0,0 +1,30 @@
+#include "utest_helper.hpp"
+
+void builtin_kernel_max_global_size(void)
+{
+ char* built_in_kernel_names;
+ size_t built_in_kernels_size;
+ cl_int err = CL_SUCCESS;
+ size_t ret_sz;
+
+
+ OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, &built_in_kernels_size);
+ built_in_kernel_names = (char* )malloc(built_in_kernels_size * sizeof(char) );
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, built_in_kernels_size, (void*)built_in_kernel_names, &ret_sz);
+ OCL_ASSERT(ret_sz == built_in_kernels_size);
+ cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
+ OCL_ASSERT(built_in_prog != NULL);
+ cl_kernel builtin_kernel_1d = clCreateKernel(built_in_prog, "__cl_copy_region_unalign_src_offset", &err);
+ OCL_ASSERT(builtin_kernel_1d != NULL);
+ size_t param_value_size;
+ void* param_value;
+ clGetKernelWorkGroupInfo(builtin_kernel_1d, device, CL_KERNEL_GLOBAL_WORK_SIZE, 0, NULL, ¶m_value_size);
+ param_value = malloc(param_value_size);
+ clGetKernelWorkGroupInfo(builtin_kernel_1d, device, CL_KERNEL_GLOBAL_WORK_SIZE, param_value_size, param_value, 0);
+ OCL_ASSERT(*(size_t*)param_value == 256 * 1024 *1024);
+ clReleaseKernel(builtin_kernel_1d);
+ clReleaseProgram(built_in_prog);
+ free(param_value);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_kernel_max_global_size);
diff --git a/utests/builtin_lgamma.cpp b/utests/builtin_lgamma.cpp
new file mode 100644
index 0000000..876699a
--- /dev/null
+++ b/utests/builtin_lgamma.cpp
@@ -0,0 +1,40 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_lgamma(void) {
+ const int n = 1024;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_lgamma");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ for (int j = 0; j < 1024; j++) {
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((float*) buf_data[0])[i] = (j * n + i + 1) * 0.001f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ float *dst = (float*) buf_data[1];
+ for (int i = 0; i < n; ++i) {
+ float cpu = lgamma(src[i]);
+ float gpu = dst[i];
+ if (fabsf(cpu - gpu) >= 1e-3) {
+ printf("%f %f %f\n", src[i], cpu, gpu);
+ OCL_ASSERT(0);
+ }
+ }
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION (builtin_lgamma);
diff --git a/utests/builtin_lgamma_r.cpp b/utests/builtin_lgamma_r.cpp
new file mode 100644
index 0000000..b6e5d0e
--- /dev/null
+++ b/utests/builtin_lgamma_r.cpp
@@ -0,0 +1,46 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_lgamma_r(void) {
+ const int n = 1024;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_lgamma_r");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ for (int j = 0; j < 1024; j++) {
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((float*) buf_data[0])[i] = (j * n + i + 1) * 0.001f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ float *dst = (float*) buf_data[1];
+ for (int i = 0; i < n; ++i) {
+ int cpu_signp;
+ float cpu = lgamma_r(src[i], &cpu_signp);
+ int gpu_signp = ((int*)buf_data[2])[i];
+ float gpu = dst[i];
+ if (cpu_signp != gpu_signp || fabsf(cpu - gpu) >= 1e-3) {
+ printf("%f %f %f\n", src[i], cpu, gpu);
+ OCL_ASSERT(0);
+ }
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION (builtin_lgamma_r);
diff --git a/utests/builtin_local_id.cpp b/utests/builtin_local_id.cpp
new file mode 100644
index 0000000..1f07615
--- /dev/null
+++ b/utests/builtin_local_id.cpp
@@ -0,0 +1,81 @@
+/*
+According to the OpenCL v1.1 & v1.2 chapter 6.11.
+Now define local and global size as following:
+ globals[0] = 4;
+ globals[1] = 9;
+ globals[2] = 16;
+ locals[0] = 2;
+ locals[1] = 3;
+ locals[2] = 4;
+
+Kernel:
+int id = get_local_id(0) + get_group_id(0)*2 + \
+ get_local_id(1) * 4 + get_group_id(1)*12 +\
+ get_local_id(2) *36 + get_group_id(2)*144;
+
+dimension:1
+ 0 1 2 3
+dimension:2
+ 0 1 2 3 4 5 6 7 8 9 10 11
+12 13 14 15 16 17 18 19 20 21 22 23
+24 25 26 27 28 29 30 31 32 33 34 35
+dimension:3
+ 0 1 2 3 4 5 6 7 ... 139 140 141 142 143
+...
+...
+429 430 431 432 433 434 ... 571 572 573 574 575
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_local_id(void)
+{
+
+ // Setup kernel and buffers
+ int dim, local_id[576], err, i, buf_len=1;
+ OCL_CREATE_KERNEL("builtin_local_id");
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*576, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ for( dim=1; dim <= 3; dim++ )
+ {
+ buf_len = 1;
+ for(i=1; i <= dim; i++)
+ {
+ locals[i - 1] = i + 1;
+ globals[i - 1] = (i + 1) * (i + 1);
+ buf_len *= ((i + 1) * (i + 1));
+ }
+ for(i = dim+1; i <= 3; i++)
+ {
+ globals[i - 1] = 0;
+ locals[i - 1] = 0;
+ }
+
+ // Run the kernel
+ OCL_NDRANGE( dim );
+ clFinish(queue);
+
+ err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int) * buf_len, &local_id, 0, NULL, NULL);
+
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to read output array! %d\n", err);
+ exit(1);
+ }
+
+#if udebug
+ for(i = 0; i < buf_len; i++)
+ {
+ printf("%2d ", local_id[i]);
+ if ((i + 1) % 4 == 0) printf("\n");
+ }
+#endif
+
+ for( i = 0; i < buf_len; i++)
+ OCL_ASSERT( local_id[i] == i);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_local_id);
diff --git a/utests/builtin_local_size.cpp b/utests/builtin_local_size.cpp
new file mode 100644
index 0000000..a9dac2e
--- /dev/null
+++ b/utests/builtin_local_size.cpp
@@ -0,0 +1,88 @@
+/*
+According to the OpenCL v1.1 & v1.2 chapter 6.11, the behavior of function get_local_size should be as following:
+
+ globals[0] = 3;
+ globals[1] = 4;
+ globals[2] = 5;
+ locals[0] = 3;
+ locals[1] = 4;
+ locals[2] = 5;
+
+get_local_size(-1) = 1 (dimension:1)
+get_local_size(0) = 3 (dimension:1)
+get_local_size(1) = 1 (dimension:1)
+get_local_size(2) = 1 (dimension:1)
+
+get_local_size(-1) = 1 (dimension:2)
+get_local_size(0) = 3 (dimension:2)
+get_local_size(1) = 4 (dimension:2)
+get_local_size(2) = 1 (dimension:2)
+get_local_size(3) = 1 (dimension:2)
+
+get_local_size(-1) = 1 (dimension:3)
+get_local_size(0) = 3 (dimension:3)
+get_local_size(1) = 4 (dimension:3)
+get_local_size(2) = 5 (dimension:3)
+get_local_size(3) = 1 (dimension:3)
+get_local_size(4) = 1 (dimension:3)
+
+*/
+#include "utest_helper.hpp"
+#define udebug 0
+
+static void builtin_local_size(void)
+{
+
+ // Setup kernel and buffers
+ int dim, dim_arg_global, local_size, err;
+ OCL_CREATE_KERNEL("builtin_local_size");
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ globals[0] = 3;
+ globals[1] = 4;
+ globals[2] = 5;
+ locals[0] = 3;
+ locals[1] = 4;
+ locals[2] = 5;
+
+ for( dim=1; dim <= 3; dim++ )
+ {
+
+ for( dim_arg_global = -1; dim_arg_global <= dim + 1; dim_arg_global++ )
+ {
+
+ err = clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, sizeof(int), &dim_arg_global, 0, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to write to source array!\n");
+ exit(1);
+ }
+
+ // Run the kernel
+ OCL_NDRANGE( dim );
+
+ err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &local_size, 0, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to read output array! %d\n", err);
+ exit(1);
+ }
+
+#if udebug
+ printf("get_local_size(%d) = %d (dimension:%d)\n", dim_arg_global, local_size, dim);
+#endif
+ if ( dim_arg_global >= 0 && dim_arg_global < dim)
+ OCL_ASSERT( local_size == dim_arg_global + 3);
+ else
+ {
+ OCL_ASSERT( local_size == 1);
+ }
+ }
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_local_size);
diff --git a/utests/builtin_mad_sat.cpp b/utests/builtin_mad_sat.cpp
new file mode 100644
index 0000000..ed9a558
--- /dev/null
+++ b/utests/builtin_mad_sat.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+void builtin_mad_sat(void)
+{
+ const int n = 32;
+ short src1[n], src2[n], src3[n];
+srand(0);
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_mad_sat");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(short), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i) {
+ src1[i] = ((short*)buf_data[0])[i] = rand();
+ src2[i] = ((short*)buf_data[1])[i] = rand();
+ src3[i] = ((short*)buf_data[2])[i] = rand();
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(3);
+ for (int i = 0; i < n; ++i) {
+ int a = (int)src1[i] * (int)src2[i] + (int)src3[i];
+ a = a > 0x7FFF ? 0x7FFF : (a < -0x8000 ? -0x8000 : a);
+ OCL_ASSERT(((short*)buf_data[3])[i] == (short)a);
+ }
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_mad_sat);
diff --git a/utests/builtin_modf.cpp b/utests/builtin_modf.cpp
new file mode 100644
index 0000000..057e95e
--- /dev/null
+++ b/utests/builtin_modf.cpp
@@ -0,0 +1,56 @@
+#include <cmath>
+#include <cstring>
+#include "utest_helper.hpp"
+
+void builtin_modf(void)
+{
+ const int n = 32;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_modf");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ src[0] = INFINITY;
+ src[1] = -INFINITY;
+ src[2] = nanf("");
+ src[3] = 0;
+ src[4] = 1.5f;
+ src[5] = 2.5f;
+ src[6] = -2.5f;
+ src[7] = 20;
+ src[8] = 21;
+ src[9] = 89.5f;
+
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], src, n * sizeof(float));
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ float *dst = (float *)buf_data[1];
+ float *it = (float *)buf_data[2];
+ OCL_ASSERT(dst[0] == 0 && it[0] == INFINITY);
+ OCL_ASSERT(dst[1] == -0.f && it[1] == -INFINITY);
+ OCL_ASSERT(isnanf(dst[2]) && isnanf(it[2]));
+ OCL_ASSERT(dst[3] == 0 && it[3] == 0);
+ OCL_ASSERT(dst[4] == 0.5f && it[4] == 1);
+ OCL_ASSERT(dst[5] == 0.5f && it[5] == 2);
+ OCL_ASSERT(dst[6] == -0.5f && it[6] == -2);
+ OCL_ASSERT(dst[7] == 0 && it[7] == 20);
+ OCL_ASSERT(dst[8] == 0 && it[8] == 21);
+ OCL_ASSERT(dst[9] == 0.5f && it[9] == 89);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_modf);
diff --git a/utests/builtin_nextafter.cpp b/utests/builtin_nextafter.cpp
new file mode 100644
index 0000000..ae95497
--- /dev/null
+++ b/utests/builtin_nextafter.cpp
@@ -0,0 +1,60 @@
+#include <cmath>
+#include <cstring>
+#include "utest_helper.hpp"
+
+static int as_int(float f) {
+ void *p = &f;
+ return *(int *)p;
+}
+
+void builtin_nextafter(void)
+{
+ const int n = 16;
+ float src1[n], src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_nextafter");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ src1[0] = nanf(""), src2[0] = 1.1f;
+ src1[1] = 2.2f, src2[1] = nanf("");
+ src1[2] = nanf(""), src2[2] = nanf("");
+ src1[3] = 123.4f, src2[3] = 123.4f;
+ src1[4] = 0.f, src2[4] = 1.f;
+ src1[5] = -0.f, src2[5] = -1.f;
+ for (int i = 6; i < n; ++i) {
+ src1[i] = (rand() & 255) * 0.1f - 12.8f;
+ src2[i] = (rand() & 255) * 0.1f - 12.8f;
+ }
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src1, n * sizeof(float));
+ memcpy(buf_data[1], src2, n * sizeof(float));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(2);
+ float *dest = (float *)buf_data[2];
+ if (0)
+ for (int i = 0; i < n; ++i)
+ printf("%d %x %x %x %x\n", i, as_int(src1[i]), as_int(src2[i]),
+ as_int(dest[i]), as_int(nextafterf(src1[i], src2[i])));
+ OCL_ASSERT(isnanf(dest[0]));
+ OCL_ASSERT(isnanf(dest[1]));
+ OCL_ASSERT(isnanf(dest[2]));
+ for (int i = 3; i < n; ++i)
+ OCL_ASSERT(dest[i] == nextafterf(src1[i], src2[i]));
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_nextafter);
diff --git a/utests/builtin_num_groups.cpp b/utests/builtin_num_groups.cpp
new file mode 100644
index 0000000..bbff435
--- /dev/null
+++ b/utests/builtin_num_groups.cpp
@@ -0,0 +1,85 @@
+/*
+According to the OpenCL v1.1 & v1.2 chapter 6.11, the behavior of function get_num_groups should be as following:
+
+ globals[0] = 1;
+ globals[1] = 4;
+ globals[2] = 9;
+ locals[0] = 1;
+ locals[1] = 2;
+ locals[2] = 3;
+
+#ifdef CL_VERSION_1_2 | CL_VERSION_1_1:
+get_num_groups(-1) = 1 (dimension:1)
+get_num_groups(0) = 1 (dimension:1)
+get_num_groups(1) = 1 (dimension:1)
+
+get_num_groups(-1) = 1 (dimension:2)
+get_num_groups(0) = 1 (dimension:2)
+get_num_groups(1) = 2 (dimension:2)
+get_num_groups(2) = 1 (dimension:2)
+
+get_num_groups(-1) = 1 (dimension:3)
+get_num_groups(0) = 1 (dimension:3)
+get_num_groups(1) = 2 (dimension:3)
+get_num_groups(2) = 3 (dimension:3)
+get_num_groups(3) = 1 (dimension:3)
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_num_groups(void)
+{
+
+ // Setup kernel and buffers
+ int dim, dim_arg_global, num_groups, err;
+ OCL_CREATE_KERNEL("builtin_num_groups");
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ globals[0] = 1;
+ globals[1] = 4;
+ globals[2] = 9;
+ locals[0] = 1;
+ locals[1] = 2;
+ locals[2] = 3;
+
+ for( dim=1; dim <= 3; dim++ )
+ {
+
+ for( dim_arg_global = -1; dim_arg_global <= dim + 1; dim_arg_global++ )
+ {
+
+ err = clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, sizeof(int), &dim_arg_global, 0, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to write to source array!\n");
+ exit(1);
+ }
+
+ // Run the kernel
+ OCL_NDRANGE( dim );
+
+ err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &num_groups, 0, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: Failed to read output array! %d\n", err);
+ exit(1);
+ }
+
+#if udebug
+ printf("get_num_groups(%d) = %d (dimension:%d)\n", dim_arg_global, num_groups, dim);
+#endif
+ if ( dim_arg_global >= 0 && dim_arg_global < dim)
+ OCL_ASSERT( num_groups == dim_arg_global + 1 );
+ else
+ {
+ OCL_ASSERT( num_groups == 1);
+ }
+ }
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_num_groups);
diff --git a/utests/builtin_pow.cpp b/utests/builtin_pow.cpp
new file mode 100644
index 0000000..8ed17ed
--- /dev/null
+++ b/utests/builtin_pow.cpp
@@ -0,0 +1,92 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+#define udebug 0
+#define printf_c(...) \
+{\
+ printf("\033[1m\033[40;31m");\
+ printf( __VA_ARGS__ );\
+ printf("\033[0m");\
+}
+const float ori_data[] = {-20.5, -1, -0.9, -0.01, 0, 0.01, 0.9, 1.0, 20.5};
+const int count_input_ori = sizeof(ori_data) / sizeof(ori_data[0]);
+const int count_input = count_input_ori * count_input_ori;
+
+float input_data1[count_input];
+float input_data2[count_input];
+const int max_function = 1;
+
+static void cpu_compiler_math(const float *src1, const float *src2, float *dst)
+{
+ dst[0] = powf(src1[0], src2[0]);
+// dst[1] = src1[0];
+}
+
+static void builtin_pow(void)
+{
+ // Setup kernel and buffers
+ int k, i, index_cur;
+ float gpu_data[max_function * count_input] = {0}, cpu_data[max_function * count_input] = {0};
+
+ for(i=0; i<count_input_ori;i++)
+ for(k=0; k<count_input_ori;k++)
+ {
+ input_data1[i*count_input_ori+k] = ori_data[i];
+ input_data2[i*count_input_ori+k] = ori_data[k];
+ }
+
+ OCL_CREATE_KERNEL("builtin_pow");
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[3], CL_MEM_READ_WRITE, sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+
+ globals[0] = count_input;
+ locals[0] = 1;
+
+ clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data1, 0, NULL, NULL);
+ clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, count_input * sizeof(float), input_data2, 0, NULL, NULL);
+ clEnqueueWriteBuffer( queue, buf[3], CL_TRUE, 0, sizeof(int), &max_function, 0, NULL, NULL);
+
+ // Run the kernel
+ OCL_NDRANGE( 1 );
+
+ clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(float) * max_function * count_input, gpu_data, 0, NULL, NULL);
+
+ for (k = 0; (uint)k < count_input; k++)
+ {
+ cpu_compiler_math( input_data1 + k, input_data2 + k, cpu_data + k * max_function);
+
+ for (i = 0; i < max_function; i++)
+ {
+ index_cur = k * max_function + i;
+#if udebug
+ if ( (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])) ||
+ (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])) ||
+ (fabs(gpu_data[index_cur] - cpu_data[index_cur]) > 1e-5f) )
+ {
+ printf_c("%d/%d: x:%f, y:%f -> gpu:%f cpu:%f\n", k, i, input_data1[k], input_data2[k], gpu_data[index_cur], cpu_data[index_cur]);
+ }
+ else
+ printf("%d/%d: x:%f, y:%f -> gpu:%f cpu:%f\n", k, i, input_data1[k], input_data2[k], gpu_data[index_cur], cpu_data[index_cur]);
+#else
+ if (isinf(cpu_data[index_cur]))
+ OCL_ASSERT(isinf(gpu_data[index_cur]));
+ else if (isnan(cpu_data[index_cur]))
+ OCL_ASSERT(isnan(gpu_data[index_cur]));
+ else
+ {
+ OCL_ASSERT(fabs(gpu_data[index_cur] - cpu_data[index_cur]) < 1e-3f);
+ }
+#endif
+ }
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(builtin_pow)
diff --git a/utests/builtin_remquo.cpp b/utests/builtin_remquo.cpp
new file mode 100644
index 0000000..f67be12
--- /dev/null
+++ b/utests/builtin_remquo.cpp
@@ -0,0 +1,65 @@
+#include <cmath>
+#include <cstring>
+#include "utest_helper.hpp"
+
+void builtin_remquo(void)
+{
+ const int n = 16;
+ float src1[n], src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_remquo");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ src1[0] = 1, src2[0] = 0;
+ src1[1] = 1, src2[1] = -0.f;
+ src1[2] = INFINITY, src2[2] = 1;
+ src1[3] = -INFINITY, src2[3] = 1;
+ src1[4] = nanf(""), src2[4] = nanf("");
+ src1[5] = 1.625f, src2[5] = 1;
+ src1[6] = -1.625f, src2[6] = 1;
+ src1[7] = 1.625f, src2[7] = -1;
+ src1[8] = -1.625f, src2[8] = -1;
+ src1[9] = 5, src2[9] = 2;
+ src1[10] = 3, src2[10] = 2;
+ src1[11] = -0.f, src2[11] = 1;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src1, n * sizeof(float));
+ memcpy(buf_data[1], src2, n * sizeof(float));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ float *dest = (float *)buf_data[2];
+ int *quo = (int *)buf_data[3];
+ OCL_ASSERT(isnanf(dest[0]));
+ OCL_ASSERT(isnanf(dest[1]));
+ OCL_ASSERT(isnanf(dest[2]));
+ OCL_ASSERT(isnanf(dest[3]));
+ OCL_ASSERT(isnanf(dest[4]));
+ OCL_ASSERT(dest[5] == -0.375f && quo[5] == 2);
+ OCL_ASSERT(dest[6] == 0.375f && quo[6] == -2);
+ OCL_ASSERT(dest[7] == -0.375f && quo[7] == -2);
+ OCL_ASSERT(dest[8] == 0.375f && quo[8] == 2);
+ OCL_ASSERT(dest[9] == 1 && quo[9] == 2);
+ OCL_ASSERT(dest[10] == -1 && quo[10] == 2);
+ OCL_ASSERT(dest[11] == -0.f && quo[11] == 0);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_remquo);
diff --git a/utests/builtin_shuffle.cpp b/utests/builtin_shuffle.cpp
new file mode 100644
index 0000000..c7fa86b
--- /dev/null
+++ b/utests/builtin_shuffle.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+void builtin_shuffle(void)
+{
+ const int n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_shuffle");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; i ++) {
+ ((float *)(buf_data[0]))[i] = rand();
+ ((float *)(buf_data[1]))[i] = rand();
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (int i = 0; i < n; i ++) {
+ OCL_ASSERT(((float *)(buf_data[0]))[i] == ((float *)(buf_data[3]))[i]);
+ OCL_ASSERT(((float *)(buf_data[1]))[i] == ((float *)(buf_data[2]))[i]);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_shuffle);
diff --git a/utests/builtin_shuffle2.cpp b/utests/builtin_shuffle2.cpp
new file mode 100644
index 0000000..7a9ebd1
--- /dev/null
+++ b/utests/builtin_shuffle2.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+void builtin_shuffle2(void)
+{
+ const int n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_shuffle2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; i ++) {
+ ((float *)(buf_data[0]))[i] = (rand() & 15) * 0.1f;
+ ((float *)(buf_data[1]))[i] = (rand() & 15) * 0.1f;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (int i = 0; i < n; i ++) {
+ OCL_ASSERT(2 * ((float *)(buf_data[0]))[i] == ((float *)(buf_data[3]))[i]);
+ OCL_ASSERT(2 * ((float *)(buf_data[1]))[i] == ((float *)(buf_data[2]))[i]);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_shuffle2);
diff --git a/utests/builtin_sign.cpp b/utests/builtin_sign.cpp
new file mode 100644
index 0000000..426de36
--- /dev/null
+++ b/utests/builtin_sign.cpp
@@ -0,0 +1,47 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_sign(void)
+{
+ const int n = 32;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_sign");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ src[0] = ((float*)buf_data[0])[0] = nanf("");
+ src[1] = ((float*)buf_data[0])[1] = INFINITY;
+ src[2] = ((float*)buf_data[0])[2] = 0.f;
+ src[3] = ((float*)buf_data[0])[3] = -0.f;
+ for (int i = 4; i < n; ++i) {
+ src[i] = ((float*)buf_data[0])[i] = (rand() & 15) * 0.1 - 0.75;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ float *dst = (float*)buf_data[1];
+ OCL_ASSERT(dst[0] == 0);
+ OCL_ASSERT(dst[1] == 1.f);
+ OCL_ASSERT(dst[2] == 0.f);
+ OCL_ASSERT(dst[3] == -0.f);
+ for (int i = 4; i < n; ++i) {
+ if (src[i] == 0.f)
+ OCL_ASSERT(dst[i] == 0.f);
+ else if (src[i] == -0.f)
+ OCL_ASSERT(dst[i] == -0.f);
+ else
+ OCL_ASSERT(dst[i] == (src[i] > 0 ? 1 : -1));
+ }
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_sign);
diff --git a/utests/builtin_sinpi.cpp b/utests/builtin_sinpi.cpp
new file mode 100644
index 0000000..0e11a0d
--- /dev/null
+++ b/utests/builtin_sinpi.cpp
@@ -0,0 +1,104 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static int as_int(float x) {
+ union {float f; int i;} u;
+ u.f = x;
+ return u.i;
+}
+
+static float sinpi(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ float y, z;
+ int n = 0, ix;
+ const float pi = 3.1415927410e+00f;
+
+ ix = as_int(x) & 0x7fffffff;
+
+ if (ix < 0x3e800000)
+ return sinf(pi * x);
+ y = -x;
+ z = floorf(y);
+ if (z != y) {
+ y *= 0.5f;
+ y = 2.f * (y - floorf(y));
+ n = y * 4.f;
+ } else {
+ if (ix >= 0x4b800000) {
+ y = 0;
+ n = 0;
+ } else {
+ if (ix < 0x4b000000)
+ z = y + 8.3886080000e+06f;
+ int n = as_int(z);
+ n &= 1;
+ y = n;
+ n <<= 2;
+ }
+ }
+ switch (n) {
+ case 0:
+ y = sinf(pi * y);
+ break;
+ case 1:
+ case 2:
+ y = cosf(pi * ((float) 0.5 - y));
+ break;
+ case 3:
+ case 4:
+ y = sinf(pi * (1.f - y));
+ break;
+ case 5:
+ case 6:
+ y = -cosf(pi * (y - (float) 1.5));
+ break;
+ default:
+ y = sinf(pi * (y - (float) 2.0));
+ break;
+ }
+ return -y;
+}
+
+void builtin_sinpi(void)
+{
+ const int n = 1024;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_sinpi");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ for (int j = 0; j < 1000; j ++) {
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((float*)buf_data[0])[i] = (j*n + i) * 0.01f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ float *dst = (float*)buf_data[1];
+ for (int i = 0; i < n; ++i) {
+ float cpu = sinpi(src[i]);
+ OCL_ASSERT (fabsf(cpu - dst[i]) < 1e-4);
+ }
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_sinpi);
diff --git a/utests/builtin_tgamma.cpp b/utests/builtin_tgamma.cpp
new file mode 100644
index 0000000..4c824d0
--- /dev/null
+++ b/utests/builtin_tgamma.cpp
@@ -0,0 +1,42 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_tgamma(void)
+{
+ const int n = 1024;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_tgamma");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ for (int j = 0; j < 1024; j ++) {
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((float*)buf_data[0])[i] = (j*n+i+1) * 0.001f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ float *dst = (float*)buf_data[1];
+ for (int i = 0; i < n; ++i) {
+ float cpu = gammaf(src[i]);
+ if (isinf(cpu)) {
+ OCL_ASSERT(isinf(dst[i]));
+ } else if (fabsf(cpu - dst[i]) >= 1e-3) {
+ printf("%f %f %f\n", src[i], cpu, dst[i]);
+ OCL_ASSERT(0);
+ }
+ }
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_tgamma);
diff --git a/utests/cl_create_kernel.cpp b/utests/cl_create_kernel.cpp
new file mode 100644
index 0000000..36a7c38
--- /dev/null
+++ b/utests/cl_create_kernel.cpp
@@ -0,0 +1,16 @@
+#include "utest_helper.hpp"
+
+static void test_create_kernel(void)
+{
+ cl_ulong max_mem_size;
+ cl_int status;
+
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_size), &max_mem_size, NULL);
+ OCL_ASSERT(max_mem_size < (cl_ulong)-1);
+ // increment the size so that following clCreateBuffer() would fail.
+ ++max_mem_size;
+ buf[0] = clCreateBuffer(ctx, 0, max_mem_size, NULL, &status);
+ OCL_ASSERT(status == CL_INVALID_BUFFER_SIZE);
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_create_kernel);
diff --git a/utests/compare_image_2d_and_1d_array.cpp b/utests/compare_image_2d_and_1d_array.cpp
new file mode 100644
index 0000000..f2c828e
--- /dev/null
+++ b/utests/compare_image_2d_and_1d_array.cpp
@@ -0,0 +1,79 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compare_image_2d_and_1d_array(void)
+{
+ const int w = 64;
+ const int h = 32;
+ cl_image_format format;
+ cl_image_desc desc;
+ cl_sampler sampler;
+
+ // Create the 1D array buffer.
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ uint32_t* image_data1 = (uint32_t *)malloc(w * h * sizeof(uint32_t));
+ uint32_t* image_data2 = (uint32_t *)malloc(w * h * sizeof(uint32_t));
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ char a = 0;
+ if (j % 2 == 0)
+ a = (j + 3) & 0x3f;
+
+ image_data2[w * j + i] = image_data1[w * j + i] = a << 24 | a << 16 | a << 8 | a;
+ }
+ }
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = w * sizeof(uint32_t);
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data1);
+
+ // Create the 2D array buffer.
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+ desc.image_width = w;
+ desc.image_array_size = h;
+ desc.image_row_pitch = w * sizeof(uint32_t);
+ OCL_CREATE_IMAGE(buf[1], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data2);
+
+ OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_LINEAR);
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("compare_image_2d_and_1d_array");
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_sampler), &sampler);
+ globals[0] = 32;
+ globals[1] = 16;
+ locals[0] = 32;
+ locals[1] = 16;
+ OCL_NDRANGE(2);
+
+ OCL_MAP_BUFFER_GTT(0);
+ OCL_MAP_BUFFER_GTT(1);
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; i++) {
+ // Because the array index will not join the sample caculation, the result should
+ // be different between the 2D and 1D_array.
+ if (j % 2 == 0)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1])[j * w + i]);
+ }
+ }
+ OCL_UNMAP_BUFFER_GTT(0);
+ OCL_UNMAP_BUFFER_GTT(1);
+
+ OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compare_image_2d_and_1d_array);
diff --git a/utests/compiler_abs.cpp b/utests/compiler_abs.cpp
new file mode 100644
index 0000000..3f477a8
--- /dev/null
+++ b/utests/compiler_abs.cpp
@@ -0,0 +1,254 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+template <typename T, int N>
+struct cl_vec {
+ T ptr[((N+1)/2)*2]; //align to 2 elements.
+
+ typedef cl_vec<T, N> vec_type;
+
+ cl_vec(void) {
+ memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ }
+ cl_vec(vec_type & other) {
+ memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ memcpy (this->ptr, other.ptr, sizeof(T) * N);
+ }
+
+ vec_type& operator= (vec_type & other) {
+ memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ memcpy (this->ptr, other.ptr, sizeof(T) * N);
+ return *this;
+ }
+
+ template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
+ memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ memcpy (this->ptr, other.ptr, sizeof(T) * N);
+ return *this;
+ }
+
+ bool operator== (vec_type & other) {
+ return !memcmp (this->ptr, other.ptr, sizeof(T) * N);
+ }
+
+ void abs(void) {
+ int i = 0;
+ for (; i < N; i++) {
+ T f = ptr[i];
+ f = f < 0 ? -f : f;
+ ptr[i] = f;
+ }
+ }
+};
+
+template <typename T, typename U, int N> static void cpu (int global_id,
+ cl_vec<T, N> *src, cl_vec<U, N> *dst)
+{
+ cl_vec<T, N> v = src[global_id];
+ v.abs();
+ dst[global_id] = v;
+}
+
+template <typename T, typename U> static void cpu(int global_id, T *src, U *dst)
+{
+ T f = src[global_id];
+ f = f < 0 ? -f : f;
+ dst[global_id] = (U)f;
+}
+
+template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
+{
+ int i = 0;
+
+ memset(vect.ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ for (; i < N; i++) {
+ vect.ptr[i] = static_cast<T>((rand() & 63) - 32);
+ }
+}
+
+template <typename T> static void gen_rand_val (T & val)
+{
+ val = static_cast<T>((rand() & 63) - 32);
+}
+
+template <typename T>
+inline static void print_data (T& val)
+{
+ if (std::is_unsigned<T>::value)
+ printf(" %u", val);
+ else
+ printf(" %d", val);
+}
+
+template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* src,
+ cl_vec<U, N>* dst, int n)
+{
+ U* val = reinterpret_cast<U *>(dst);
+
+ n = n*((N+1)/2)*2;
+
+ printf("\nRaw: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((T *)buf_data[0])[i]);
+ }
+
+ printf("\nCPU: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(val[i]);
+ }
+ printf("\nGPU: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((U *)buf_data[1])[i]);
+ }
+}
+
+template <typename T, typename U> static void dump_data (T* src, U* dst, int n)
+{
+ printf("\nRaw: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((T *)buf_data[0])[i]);
+ }
+
+ printf("\nCPU: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(dst[i]);
+ }
+ printf("\nGPU: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((U *)buf_data[1])[i]);
+ }
+}
+
+template <typename T, typename U> static void compiler_abs_with_type(void)
+{
+ const size_t n = 16;
+ U cpu_dst[16];
+ T cpu_src[16];
+
+ // Setup buffers
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+
+ /* Clear the dst buffer to avoid random data. */
+ OCL_MAP_BUFFER(1);
+ memset(buf_data[1], 0, sizeof(U) * n);
+ OCL_UNMAP_BUFFER(1);
+
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ gen_rand_val(cpu_src[i]);
+ }
+
+ memcpy(buf_data[0], cpu_src, sizeof(T) * n);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+
+// dump_data(cpu_src, cpu_dst, n);
+
+ OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+#define ABS_TEST_TYPE_1(TYPE, UTYPE, KEEP_PROGRAM) \
+ static void compiler_abs_##TYPE (void) \
+ { \
+ OCL_CALL (cl_kernel_init, "compiler_abs.cl", "compiler_abs_"#TYPE, SOURCE, NULL); \
+ compiler_abs_with_type<TYPE, UTYPE>(); \
+ } \
+ MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_abs_##TYPE, KEEP_PROGRAM);
+
+#define ABS_TEST_TYPE(TYPE, UTYPE) ABS_TEST_TYPE_1(TYPE, UTYPE, true)
+#define ABS_TEST_TYPE_END(TYPE, UTYPE) ABS_TEST_TYPE_1(TYPE, UTYPE, false)
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+ABS_TEST_TYPE(int, uint)
+ABS_TEST_TYPE(short, ushort)
+ABS_TEST_TYPE(char, uchar)
+ABS_TEST_TYPE(uint, uint)
+ABS_TEST_TYPE(ushort, ushort)
+ABS_TEST_TYPE(uchar, uchar)
+
+
+typedef cl_vec<int, 2> int2;
+typedef cl_vec<int, 3> int3;
+typedef cl_vec<int, 4> int4;
+typedef cl_vec<int, 8> int8;
+typedef cl_vec<int, 16> int16;
+typedef cl_vec<unsigned int, 2> uint2;
+typedef cl_vec<unsigned int, 3> uint3;
+typedef cl_vec<unsigned int, 4> uint4;
+typedef cl_vec<unsigned int, 8> uint8;
+typedef cl_vec<unsigned int, 16> uint16;
+ABS_TEST_TYPE(int2, uint2)
+ABS_TEST_TYPE(int3, uint3)
+ABS_TEST_TYPE(int4, uint4)
+ABS_TEST_TYPE(int8, uint8)
+ABS_TEST_TYPE(int16, uint16)
+ABS_TEST_TYPE(uint2, uint2)
+ABS_TEST_TYPE(uint3, uint3)
+ABS_TEST_TYPE(uint4, uint4)
+ABS_TEST_TYPE(uint8, uint8)
+ABS_TEST_TYPE(uint16, uint16)
+
+
+typedef cl_vec<char, 2> char2;
+typedef cl_vec<char, 3> char3;
+typedef cl_vec<char, 4> char4;
+typedef cl_vec<char, 8> char8;
+typedef cl_vec<char, 16> char16;
+typedef cl_vec<unsigned char, 2> uchar2;
+typedef cl_vec<unsigned char, 3> uchar3;
+typedef cl_vec<unsigned char, 4> uchar4;
+typedef cl_vec<unsigned char, 8> uchar8;
+typedef cl_vec<unsigned char, 16> uchar16;
+ABS_TEST_TYPE(char2, uchar2)
+ABS_TEST_TYPE(char3, uchar3)
+ABS_TEST_TYPE(char4, uchar4)
+ABS_TEST_TYPE(char8, uchar8)
+ABS_TEST_TYPE(char16, uchar16)
+ABS_TEST_TYPE(uchar2, uchar2)
+ABS_TEST_TYPE(uchar3, uchar3)
+ABS_TEST_TYPE(uchar4, uchar4)
+ABS_TEST_TYPE(uchar8, uchar8)
+ABS_TEST_TYPE(uchar16, uchar16)
+
+
+typedef cl_vec<short, 2> short2;
+typedef cl_vec<short, 3> short3;
+typedef cl_vec<short, 4> short4;
+typedef cl_vec<short, 8> short8;
+typedef cl_vec<short, 16> short16;
+typedef cl_vec<unsigned short, 2> ushort2;
+typedef cl_vec<unsigned short, 3> ushort3;
+typedef cl_vec<unsigned short, 4> ushort4;
+typedef cl_vec<unsigned short, 8> ushort8;
+typedef cl_vec<unsigned short, 16> ushort16;
+ABS_TEST_TYPE(short2, ushort2)
+ABS_TEST_TYPE(short3, ushort3)
+ABS_TEST_TYPE(short4, ushort4)
+ABS_TEST_TYPE(short8, ushort8)
+ABS_TEST_TYPE(short16, ushort16)
+ABS_TEST_TYPE(ushort2, ushort2)
+ABS_TEST_TYPE(ushort3, ushort3)
+ABS_TEST_TYPE(ushort4, ushort4)
+ABS_TEST_TYPE(ushort8, ushort8)
+ABS_TEST_TYPE_END(ushort16, ushort16)
diff --git a/utests/compiler_abs_diff.cpp b/utests/compiler_abs_diff.cpp
new file mode 100644
index 0000000..15a1f90
--- /dev/null
+++ b/utests/compiler_abs_diff.cpp
@@ -0,0 +1,295 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+template <typename T, int N>
+struct cl_vec {
+ T ptr[((N+1)/2)*2]; //align to 2 elements.
+
+ typedef cl_vec<T, N> vec_type;
+
+ cl_vec(void) {
+ memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ }
+ cl_vec(vec_type & other) {
+ memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ memcpy (this->ptr, other.ptr, sizeof(T) * N);
+ }
+
+ vec_type& operator= (vec_type & other) {
+ memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ memcpy (this->ptr, other.ptr, sizeof(T) * N);
+ return *this;
+ }
+
+ template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
+ memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ memcpy (this->ptr, other.ptr, sizeof(T) * N);
+ return *this;
+ }
+
+ bool operator== (vec_type & other) {
+ return !memcmp (this->ptr, other.ptr, sizeof(T) * N);
+ }
+
+ void abs_diff(vec_type & other) {
+ int i = 0;
+ for (; i < N; i++) {
+ T a = ptr[i];
+ T b = other.ptr[i];
+ T f = a > b ? (a - b) : (b - a);
+ ptr[i] = f;
+ }
+ }
+};
+
+template <typename T, typename U, int N> static void cpu (int global_id,
+ cl_vec<T, N> *x, cl_vec<T, N> *y, cl_vec<U, N> *diff)
+{
+ cl_vec<T, N> v = x[global_id];
+ v.abs_diff(y[global_id]);
+ diff[global_id] = v;
+}
+
+template <typename T, typename U> static void cpu(int global_id, T *x, T *y, U *diff)
+{
+ T a = x[global_id];
+ T b = y[global_id];
+ U f = a > b ? (a - b) : (b - a);
+ diff[global_id] = f;
+}
+
+template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
+{
+ int i = 0;
+ for (; i < N; i++) {
+ vect.ptr[i] = static_cast<T>((rand() & 63) - 32);
+ }
+}
+
+template <typename T> static void gen_rand_val (T & val)
+{
+ val = static_cast<T>((rand() & 63) - 32);
+}
+
+template <typename T>
+inline static void print_data (T& val)
+{
+ if (std::is_unsigned<T>::value)
+ printf(" %u", val);
+ else
+ printf(" %d", val);
+}
+
+template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* x,
+ cl_vec<T, N>* y, cl_vec<U, N>* diff, int n)
+{
+ U* val = reinterpret_cast<U *>(diff);
+
+ n = n*((N+1)/2)*2;
+
+ printf("\nRaw x: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((T *)buf_data[0])[i]);
+ }
+ printf("\nRaw y: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((T *)buf_data[1])[i]);
+ }
+
+ printf("\nCPU diff: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(val[i]);
+ }
+ printf("\nGPU diff: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((U *)buf_data[2])[i]);
+ }
+}
+
+template <typename T, typename U> static void dump_data (T* x, T* y, U* diff, int n)
+{
+ printf("\nRaw x: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((T *)buf_data[0])[i]);
+ }
+ printf("\nRaw y: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((T *)buf_data[1])[i]);
+ }
+
+ printf("\nCPU diff: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(diff[i]);
+ }
+ printf("\nGPU diff: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((U *)buf_data[2])[i]);
+ }
+}
+
+template <typename T, typename U> static void compiler_abs_diff_with_type(void)
+{
+ const size_t n = 16;
+ U cpu_diff[16];
+ T cpu_x[16];
+ T cpu_y[16];
+
+ // Setup buffers
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(U), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+
+ /* Clear the dst buffer to avoid random data. */
+ OCL_MAP_BUFFER(2);
+ memset(buf_data[2], 0, sizeof(U) * n);
+ OCL_UNMAP_BUFFER(2);
+
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ gen_rand_val(cpu_x[i]);
+ gen_rand_val(cpu_y[i]);
+ }
+
+ memcpy(buf_data[0], cpu_x, sizeof(T) * n);
+ memcpy(buf_data[1], cpu_y, sizeof(T) * n);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu(i, cpu_x, cpu_y, cpu_diff);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+
+// dump_data(cpu_x, cpu_y, cpu_diff, n);
+
+ OCL_ASSERT(!memcmp(buf_data[2], cpu_diff, sizeof(T) * n));
+
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ }
+}
+
+
+#define ABS_TEST_DIFF_TYPE_2(TYPE, CLTYPE, UTYPE, KEEP_PROGRAM) \
+ static void compiler_abs_diff_##CLTYPE (void) \
+ { \
+ OCL_CALL (cl_kernel_init, "compiler_abs_diff.cl", "compiler_abs_diff_"#CLTYPE, SOURCE, NULL); \
+ compiler_abs_diff_with_type<TYPE, UTYPE>(); \
+ } \
+ MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_abs_diff_##CLTYPE, KEEP_PROGRAM);
+
+#define ABS_TEST_DIFF_TYPE(TYPE, UTYPE) ABS_TEST_DIFF_TYPE_2(TYPE, TYPE, UTYPE, true)
+
+#define ABS_TEST_DIFF_TYPE_END(TYPE, UTYPE) ABS_TEST_DIFF_TYPE_2(TYPE, TYPE, UTYPE, false)
+
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef uint64_t ulong64;
+ABS_TEST_DIFF_TYPE(int, uint)
+ABS_TEST_DIFF_TYPE_2(int64_t, long, ulong64, true)
+ABS_TEST_DIFF_TYPE(short, ushort)
+ABS_TEST_DIFF_TYPE(char, uchar)
+ABS_TEST_DIFF_TYPE(uint, uint)
+ABS_TEST_DIFF_TYPE_2(ulong64, ulong, ulong64, true)
+ABS_TEST_DIFF_TYPE(ushort, ushort)
+ABS_TEST_DIFF_TYPE(uchar, uchar)
+
+typedef cl_vec<int, 2> int2;
+typedef cl_vec<int, 3> int3;
+typedef cl_vec<int, 4> int4;
+typedef cl_vec<int, 8> int8;
+typedef cl_vec<int, 16> int16;
+typedef cl_vec<unsigned int, 2> uint2;
+typedef cl_vec<unsigned int, 3> uint3;
+typedef cl_vec<unsigned int, 4> uint4;
+typedef cl_vec<unsigned int, 8> uint8;
+typedef cl_vec<unsigned int, 16> uint16;
+ABS_TEST_DIFF_TYPE(int2, uint2)
+ABS_TEST_DIFF_TYPE(int3, uint3)
+ABS_TEST_DIFF_TYPE(int4, uint4)
+ABS_TEST_DIFF_TYPE(int8, uint8)
+ABS_TEST_DIFF_TYPE(int16, uint16)
+ABS_TEST_DIFF_TYPE(uint2, uint2)
+ABS_TEST_DIFF_TYPE(uint3, uint3)
+ABS_TEST_DIFF_TYPE(uint4, uint4)
+ABS_TEST_DIFF_TYPE(uint8, uint8)
+ABS_TEST_DIFF_TYPE(uint16, uint16)
+
+typedef cl_vec<int64_t, 2> long2;
+typedef cl_vec<int64_t, 3> long3;
+typedef cl_vec<int64_t, 4> long4;
+typedef cl_vec<int64_t, 8> long8;
+typedef cl_vec<int64_t, 16> long16;
+typedef cl_vec<uint64_t, 2> ulong2;
+typedef cl_vec<uint64_t, 3> ulong3;
+typedef cl_vec<uint64_t, 4> ulong4;
+typedef cl_vec<uint64_t, 8> ulong8;
+typedef cl_vec<uint64_t, 16> ulong16;
+ABS_TEST_DIFF_TYPE(long2, ulong2)
+ABS_TEST_DIFF_TYPE(long3, ulong3)
+ABS_TEST_DIFF_TYPE(long4, ulong4)
+ABS_TEST_DIFF_TYPE(long8, ulong8)
+ABS_TEST_DIFF_TYPE(long16, ulong16)
+ABS_TEST_DIFF_TYPE(ulong2, ulong2)
+ABS_TEST_DIFF_TYPE(ulong3, ulong3)
+ABS_TEST_DIFF_TYPE(ulong4, ulong4)
+ABS_TEST_DIFF_TYPE(ulong8, ulong8)
+ABS_TEST_DIFF_TYPE(ulong16, ulong16)
+
+typedef cl_vec<char, 2> char2;
+typedef cl_vec<char, 3> char3;
+typedef cl_vec<char, 4> char4;
+typedef cl_vec<char, 8> char8;
+typedef cl_vec<char, 16> char16;
+typedef cl_vec<unsigned char, 2> uchar2;
+typedef cl_vec<unsigned char, 3> uchar3;
+typedef cl_vec<unsigned char, 4> uchar4;
+typedef cl_vec<unsigned char, 8> uchar8;
+typedef cl_vec<unsigned char, 16> uchar16;
+ABS_TEST_DIFF_TYPE(char2, uchar2)
+ABS_TEST_DIFF_TYPE(char3, uchar3)
+ABS_TEST_DIFF_TYPE(char4, uchar4)
+ABS_TEST_DIFF_TYPE(char8, uchar8)
+ABS_TEST_DIFF_TYPE(char16, uchar16)
+ABS_TEST_DIFF_TYPE(uchar2, uchar2)
+ABS_TEST_DIFF_TYPE(uchar3, uchar3)
+ABS_TEST_DIFF_TYPE(uchar4, uchar4)
+ABS_TEST_DIFF_TYPE(uchar8, uchar8)
+ABS_TEST_DIFF_TYPE(uchar16, uchar16)
+
+
+typedef cl_vec<short, 2> short2;
+typedef cl_vec<short, 3> short3;
+typedef cl_vec<short, 4> short4;
+typedef cl_vec<short, 8> short8;
+typedef cl_vec<short, 16> short16;
+typedef cl_vec<unsigned short, 2> ushort2;
+typedef cl_vec<unsigned short, 3> ushort3;
+typedef cl_vec<unsigned short, 4> ushort4;
+typedef cl_vec<unsigned short, 8> ushort8;
+typedef cl_vec<unsigned short, 16> ushort16;
+ABS_TEST_DIFF_TYPE(short2, ushort2)
+ABS_TEST_DIFF_TYPE(short3, ushort3)
+ABS_TEST_DIFF_TYPE(short4, ushort4)
+ABS_TEST_DIFF_TYPE(short8, ushort8)
+ABS_TEST_DIFF_TYPE(short16, ushort16)
+ABS_TEST_DIFF_TYPE(ushort2, ushort2)
+ABS_TEST_DIFF_TYPE(ushort3, ushort3)
+ABS_TEST_DIFF_TYPE(ushort4, ushort4)
+ABS_TEST_DIFF_TYPE(ushort8, ushort8)
+ABS_TEST_DIFF_TYPE_END(ushort16, ushort16)
diff --git a/utests/compiler_address_space.cpp b/utests/compiler_address_space.cpp
new file mode 100644
index 0000000..89c7a38
--- /dev/null
+++ b/utests/compiler_address_space.cpp
@@ -0,0 +1,10 @@
+#include "utest_helper.hpp"
+
+void compiler_address_space(void)
+{
+ OCL_CREATE_KERNEL("compiler_address_space");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_address_space);
+
+
diff --git a/utests/compiler_argument_structure.cpp b/utests/compiler_argument_structure.cpp
new file mode 100644
index 0000000..22464a5
--- /dev/null
+++ b/utests/compiler_argument_structure.cpp
@@ -0,0 +1,28 @@
+#include "utest_helper.hpp"
+
+struct hop { int x, y; };
+
+void compiler_argument_structure(void)
+{
+ const size_t n = 2048;
+ hop h = {3, 4};
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_argument_structure");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(hop), &h);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 7);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_argument_structure);
+
diff --git a/utests/compiler_argument_structure_indirect.cpp b/utests/compiler_argument_structure_indirect.cpp
new file mode 100644
index 0000000..a4584d5
--- /dev/null
+++ b/utests/compiler_argument_structure_indirect.cpp
@@ -0,0 +1,29 @@
+#include "utest_helper.hpp"
+
+struct hop { int x[16]; };
+
+void compiler_argument_structure_indirect(void)
+{
+ const size_t n = 2048;
+ hop h;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_argument_structure_indirect");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ for (int i = 0; i < 16; ++i) h.x[i] = i;
+ OCL_SET_ARG(1, sizeof(hop), &h);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 7);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_argument_structure_indirect);
+
diff --git a/utests/compiler_arith_shift_right.cpp b/utests/compiler_arith_shift_right.cpp
new file mode 100644
index 0000000..6485571
--- /dev/null
+++ b/utests/compiler_arith_shift_right.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+ dst[global_id] = src[global_id] >> 24;
+}
+
+void compiler_arith_shift_right(void)
+{
+ const size_t n = 16;
+ int cpu_src[16];
+ int cpu_dst[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_arith_shift_right");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((int*)buf_data[0])[i] = 0x80000000 | rand();
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((int *)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_arith_shift_right);
diff --git a/utests/compiler_array.cpp b/utests/compiler_array.cpp
new file mode 100644
index 0000000..8806c99
--- /dev/null
+++ b/utests/compiler_array.cpp
@@ -0,0 +1,28 @@
+#include "utest_helper.hpp"
+
+void compiler_array(void)
+{
+ const size_t n = 16;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_array");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ // First control flow
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+ OCL_UNMAP_BUFFER(0);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 16; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 3);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array);
+
diff --git a/utests/compiler_array0.cpp b/utests/compiler_array0.cpp
new file mode 100644
index 0000000..7cf2bbb
--- /dev/null
+++ b/utests/compiler_array0.cpp
@@ -0,0 +1,54 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+ int i;
+ int final[16];
+ for (i = 0; i < 16; ++i) {
+ int array[16], j;
+ for (j = 0; j < 16; ++j)
+ array[j] = global_id;
+ for (j = 0; j < src[0]; ++j)
+ array[j] = 1+src[j];
+ final[i] = array[i];
+ }
+ dst[global_id] = final[global_id];
+}
+
+void compiler_array0(void)
+{
+ const size_t n = 16;
+ int cpu_dst[16], cpu_src[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_array0");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < 11; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array0);
+
+
diff --git a/utests/compiler_array1.cpp b/utests/compiler_array1.cpp
new file mode 100644
index 0000000..fe1ecec
--- /dev/null
+++ b/utests/compiler_array1.cpp
@@ -0,0 +1,52 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+ int final[16];
+ for (int i = 0; i < 16; ++i) {
+ int array[16];
+ for (int j = 0; j < src[0]; ++j)
+ array[j] = 1+src[0];
+ for (int j = src[0]; j < 16; ++j)
+ array[j] = global_id;
+ final[i] = array[i];
+ }
+ dst[global_id] = final[global_id];
+}
+
+void compiler_array1(void)
+{
+ const size_t n = 16;
+ int cpu_dst[16], cpu_src[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_array1");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < 11; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array1);
+
diff --git a/utests/compiler_array2.cpp b/utests/compiler_array2.cpp
new file mode 100644
index 0000000..61ca9da
--- /dev/null
+++ b/utests/compiler_array2.cpp
@@ -0,0 +1,50 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+ int final[16];
+ int array[16];
+ for (int j = 0; j < 16; ++j) array[j] = j;
+ for (int j = 0; j < 16; ++j) final[j] = j+1;
+ if (global_id == 15)
+ dst[global_id] = final[global_id];
+ else
+ dst[global_id] = array[15 - global_id];
+}
+
+void compiler_array2(void)
+{
+ const size_t n = 16;
+ int cpu_dst[16], cpu_src[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_array2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < 11; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array2);
+
diff --git a/utests/compiler_array3.cpp b/utests/compiler_array3.cpp
new file mode 100644
index 0000000..865b1e5
--- /dev/null
+++ b/utests/compiler_array3.cpp
@@ -0,0 +1,51 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+ int tmp[32];
+ for (int i = 0; i < 16; ++i) {
+ for (int j = 0; j < 16; ++j)
+ tmp[j] = global_id;
+ for (int j = 0; j < src[0]; ++j)
+ tmp[j] = 1+src[j];
+ tmp[16+i] = tmp[i];
+ }
+ dst[global_id] = tmp[16+global_id];
+}
+
+void compiler_array3(void)
+{
+ const size_t n = 16;
+ int cpu_dst[16], cpu_src[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_array3");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < 11; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array3);
+
diff --git a/utests/compiler_async_copy.cpp b/utests/compiler_async_copy.cpp
new file mode 100644
index 0000000..ad661c0
--- /dev/null
+++ b/utests/compiler_async_copy.cpp
@@ -0,0 +1,55 @@
+#include "utest_helper.hpp"
+#include <stdint.h>
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+
+#define DEF(TYPE, KER_TYPE, VEC_SIZE) \
+static void compiler_async_copy_##KER_TYPE##VEC_SIZE(void) \
+{ \
+ const size_t n = 1024; \
+ const size_t local_size = 32; \
+ const int copiesPerWorkItem = 5; \
+\
+ /* Setup kernel and buffers */\
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_async_copy", "compiler_async_copy_" # KER_TYPE # VEC_SIZE); \
+ OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(TYPE) * VEC_SIZE, NULL); \
+ OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(TYPE) * VEC_SIZE, NULL); \
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); \
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); \
+ OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(TYPE)*VEC_SIZE, NULL); \
+ OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem); \
+\
+ OCL_MAP_BUFFER(1); \
+ for (uint32_t i = 0; i < n * copiesPerWorkItem * VEC_SIZE; ++i) \
+ ((TYPE*)buf_data[1])[i] = rand(); \
+ OCL_UNMAP_BUFFER(1); \
+\
+ /* Run the kernel */\
+ globals[0] = n; \
+ locals[0] = local_size; \
+ OCL_NDRANGE(1); \
+ OCL_MAP_BUFFER(0); \
+ OCL_MAP_BUFFER(1); \
+\
+ /* Check results */\
+ TYPE *dst = (TYPE*)buf_data[0]; \
+ TYPE *src = (TYPE*)buf_data[1]; \
+ for (uint32_t i = 0; i < n * copiesPerWorkItem * VEC_SIZE; i++) \
+ OCL_ASSERT(dst[i] == src[i]); \
+ OCL_UNMAP_BUFFER(0); \
+ OCL_UNMAP_BUFFER(1); \
+} \
+\
+MAKE_UTEST_FROM_FUNCTION(compiler_async_copy_##KER_TYPE##VEC_SIZE);
+
+DEF(char, char, 2);
+DEF(uchar, uchar, 2);
+DEF(short, short, 2);
+DEF(ushort, ushort, 2);
+DEF(int, int, 2);
+DEF(uint, uint, 2);
+DEF(int64_t, long, 2);
+DEF(uint64_t, ulong, 2);
+DEF(float, float, 2);
+//DEF(double, double, 2);
diff --git a/utests/compiler_async_copy_and_prefetch.cpp b/utests/compiler_async_copy_and_prefetch.cpp
new file mode 100644
index 0000000..323faf9
--- /dev/null
+++ b/utests/compiler_async_copy_and_prefetch.cpp
@@ -0,0 +1,10 @@
+#include "utest_helper.hpp"
+
+void compiler_async_copy_and_prefetch(void)
+{
+ OCL_CREATE_KERNEL("compiler_async_copy_and_prefetch");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_async_copy_and_prefetch);
+
+
diff --git a/utests/compiler_async_stride_copy.cpp b/utests/compiler_async_stride_copy.cpp
new file mode 100644
index 0000000..2e9eaeb
--- /dev/null
+++ b/utests/compiler_async_stride_copy.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+static void compiler_async_stride_copy(void)
+{
+ const size_t n = 1024;
+ const size_t local_size = 128;
+ const int copiesPerWorkItem = 5;
+ const int stride =3;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_async_stride_copy");
+ OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(char) * 4 * stride, NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(char) * 4 * stride, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(char)*4, NULL);
+ OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem);
+ OCL_SET_ARG(4, sizeof(int), &stride);
+
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n * copiesPerWorkItem * 4 * stride; ++i)
+ ((char*)buf_data[1])[i] = rand() & 0xff;
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = local_size;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+
+ // Check results
+ char *dst = (char*)buf_data[0];
+ char *src = (char*)buf_data[1];
+ for (uint32_t i = 0; i < n * copiesPerWorkItem; i += stride * 4) {
+ OCL_ASSERT(dst[i + 0] == (char)(src[i + 0] + 3));
+ OCL_ASSERT(dst[i + 1] == (char)(src[i + 1] + 3));
+ OCL_ASSERT(dst[i + 2] == (char)(src[i + 2] + 3));
+ OCL_ASSERT(dst[i + 3] == (char)(src[i + 3] + 3));
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_async_stride_copy);
diff --git a/utests/compiler_atomic_functions.cpp b/utests/compiler_atomic_functions.cpp
new file mode 100644
index 0000000..65f1c5a
--- /dev/null
+++ b/utests/compiler_atomic_functions.cpp
@@ -0,0 +1,97 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+#include <string.h>
+
+#define GROUP_NUM 16
+#define LOCAL_SIZE 256
+static void cpu_compiler_atomic(int *dst, int *src)
+{
+ dst[4] = 0xffffffff;
+ int tmp[16] = { 0 };
+ tmp[4] = -1;
+ for(int j=0; j<LOCAL_SIZE; j++) {
+ int i = j % 12;
+
+ switch(i) {
+ case 0: tmp[i] += 1; break;
+ case 1: tmp[i] -= 1; break;
+ case 2: tmp[i] += src[j]; break;
+ case 3: tmp[i] -= src[j]; break;
+ case 4: tmp[i] &= ~(src[j]<<(j>>4)); break;
+ case 5: tmp[i] |= src[j]<<(j>>4); break;
+ case 6: tmp[i] ^= src[j]; break;
+ case 7: tmp[i] = tmp[i] < -src[j] ? tmp[i] : -src[j]; break;
+ case 8: tmp[i] = tmp[i] > src[j] ? tmp[i] : src[j]; break;
+ case 9: tmp[i] = (unsigned int)tmp[i] < (unsigned int)(-src[j]) ? tmp[i] : -src[j]; break;
+ case 10: tmp[i] = (unsigned int)tmp[i] > (unsigned int)(src[j]) ? tmp[i] : src[j]; break;
+ case 11: tmp[i] = src[10]; break;
+ default: break;
+ }
+ }
+
+ for(int k=0; k<GROUP_NUM; k++) {
+ for(int j=0; j<LOCAL_SIZE; j++) {
+ int i = j % 12;
+
+ switch(i) {
+ case 0: dst[i] += 1; break;
+ case 1: dst[i] -= 1; break;
+ case 2: dst[i] += src[j]; break;
+ case 3: dst[i] -= src[j]; break;
+ case 4: dst[i] &= ~(src[j]<<(j>>4)); break;
+ case 5: dst[i] |= src[j]<<(j>>4); break;
+ case 6: dst[i] ^= src[j]; break;
+ case 7: dst[i] = dst[i] < -src[j] ? dst[i] : -src[j]; break;
+ case 8: dst[i] = dst[i] > src[j] ? dst[i] : src[j]; break;
+ case 9: dst[i] = (unsigned int)dst[i] < (unsigned int)(-src[j]) ? dst[i] : -src[j]; break;
+ case 10: dst[i] = (unsigned int)dst[i] > (unsigned int)(src[j]) ? dst[i] : src[j]; break;
+ case 11: dst[i] = src[10]; break;
+ default: break;
+ }
+ }
+ }
+
+ for(int i=0; i<12; i++)
+ dst[i+12] = tmp[i];
+}
+
+static void compiler_atomic_functions(void)
+{
+ const size_t n = GROUP_NUM * LOCAL_SIZE;
+ int cpu_dst[24] = {0}, cpu_src[256];
+
+ globals[0] = n;
+ locals[0] = LOCAL_SIZE;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_atomic_functions");
+ OCL_CREATE_BUFFER(buf[0], 0, 24 * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, locals[0] * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, 16 * sizeof(int), NULL);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+
+ OCL_MAP_BUFFER(0);
+ memset(buf_data[0], 0, 24 * sizeof(int));
+ ((int *)buf_data[0])[4] = -1;
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < locals[0]; ++i)
+ cpu_src[i] = ((int*)buf_data[1])[i] = rand() & 0xff;
+ cpu_compiler_atomic(cpu_dst, cpu_src);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for(int i=0; i<24; i++) {
+ //printf("The dst(%d) gpu(0x%x) cpu(0x%x)\n", i, ((uint32_t *)buf_data[0])[i], cpu_dst[i]);
+ OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
+ }
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_atomic_functions)
diff --git a/utests/compiler_basic_arithmetic.cpp b/utests/compiler_basic_arithmetic.cpp
new file mode 100644
index 0000000..ba05de0
--- /dev/null
+++ b/utests/compiler_basic_arithmetic.cpp
@@ -0,0 +1,115 @@
+#include "utest_helper.hpp"
+
+enum eTestOP {
+ TEST_OP_ADD =0,
+ TEST_OP_SUB,
+ TEST_OP_MUL,
+ TEST_OP_DIV,
+ TEST_OP_REM
+};
+
+template <typename T, eTestOP op>
+static void test_exec(const char* kernel_name)
+{
+ const size_t n = 160;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_basic_arithmetic", kernel_name);
+ buf_data[0] = (T*) malloc(sizeof(T) * n);
+ buf_data[1] = (T*) malloc(sizeof(T) * n);
+ for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[0])[i] = (T) rand();
+ for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[1])[i] = (T) rand();
+ if(op == TEST_OP_DIV || op == TEST_OP_REM) {
+ for (uint32_t i = 0; i < n; ++i) {
+ if(((T*)buf_data[1])[i] == 0)
+ ((T*)buf_data[1])[i] = (T) 1;
+ }
+ }
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[1]);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(2);
+ if(op == TEST_OP_SUB) {
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] - ((T*)buf_data[1])[i]));
+ } else if(op == TEST_OP_ADD) {
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] + ((T*)buf_data[1])[i]));
+ } else if(op == TEST_OP_MUL) {
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] * ((T*)buf_data[1])[i]));
+ } else if(op == TEST_OP_DIV) {
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] / ((T*)buf_data[1])[i]));
+ } else {
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] % ((T*)buf_data[1])[i]));
+ }
+ free(buf_data[0]);
+ free(buf_data[1]);
+ buf_data[0] = buf_data[1] = NULL;
+}
+
+#define DECL_TEST_SUB(type, alias, keep_program) \
+static void compiler_sub_ ##alias(void)\
+{\
+ test_exec<type, TEST_OP_SUB>("compiler_sub_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_sub_ ## alias, keep_program)
+
+#define DECL_TEST_ADD(type, alias, keep_program) \
+static void compiler_add_ ##alias(void)\
+{\
+ test_exec<type, TEST_OP_ADD>("compiler_add_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_add_ ## alias, keep_program)
+
+#define DECL_TEST_MUL(type, alias, keep_program) \
+static void compiler_mul_ ##alias(void)\
+{\
+ test_exec<type, TEST_OP_MUL>("compiler_mul_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_mul_ ## alias, keep_program)
+
+#define DECL_TEST_DIV(type, alias, keep_program) \
+static void compiler_div_ ##alias(void)\
+{\
+ test_exec<type, TEST_OP_DIV>("compiler_div_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_div_ ## alias, keep_program)
+
+#define DECL_TEST_REM(type, alias, keep_program) \
+static void compiler_rem_ ##alias(void)\
+{\
+ test_exec<type, TEST_OP_REM>("compiler_rem_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_rem_ ## alias, keep_program)
+
+#define _DECL_TEST_FOR_ALL_TYPE(op, keep_program) \
+DECL_TEST_##op(int8_t, char, true) \
+DECL_TEST_##op(uint8_t, uchar, true) \
+DECL_TEST_##op(int16_t, short, true) \
+DECL_TEST_##op(uint16_t, ushort, true) \
+DECL_TEST_##op(int32_t, int, true) \
+DECL_TEST_##op(uint32_t, uint, keep_program)
+
+#define DECL_TEST_FOR_ALL_TYPE(op) _DECL_TEST_FOR_ALL_TYPE(op, true)
+
+#define DECL_TEST_FOR_ALL_TYPE_END(op) _DECL_TEST_FOR_ALL_TYPE(op, false)
+
+DECL_TEST_FOR_ALL_TYPE(SUB)
+DECL_TEST_FOR_ALL_TYPE(ADD)
+DECL_TEST_FOR_ALL_TYPE(MUL)
+DECL_TEST_FOR_ALL_TYPE(DIV)
+DECL_TEST_FOR_ALL_TYPE_END(REM)
+#undef DECL_TEST_FOR_ALL_TYPE
diff --git a/utests/compiler_bool_cross_basic_block.cpp b/utests/compiler_bool_cross_basic_block.cpp
new file mode 100644
index 0000000..908edc0
--- /dev/null
+++ b/utests/compiler_bool_cross_basic_block.cpp
@@ -0,0 +1,55 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst, int scale) {
+ bool isRedRow = false;
+ bool isRed;
+ int val = src[global_id];
+ for (int i=0; i<scale; i++, isRedRow = !isRedRow) {
+ if (isRedRow) {
+ isRed= false;
+ for (int j=0; j < scale; j++, isRed=!isRed) {
+ if (isRed) {
+ val++;
+ }
+ }
+ }
+ }
+ dst[global_id] = val;
+}
+
+void compiler_bool_cross_basic_block(void){
+ const size_t n = 16;
+ int cpu_dst[16], cpu_src[16];
+ int scale = 4;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_bool_cross_basic_block");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(int), &scale);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((int*)buf_data[0])[i] = i;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu(i, cpu_src, cpu_dst, scale);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((int *)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_bool_cross_basic_block)
diff --git a/utests/compiler_box_blur.cpp b/utests/compiler_box_blur.cpp
new file mode 100644
index 0000000..e4e053e
--- /dev/null
+++ b/utests/compiler_box_blur.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+#include <cmath>
+
+static int w = 0;
+static int h = 0;
+static int sz = 0;
+static const size_t chunk = 64;
+static int *src = NULL, *dst = NULL;
+
+static void compiler_box_blur()
+{
+ OCL_CREATE_KERNEL("compiler_box_blur");
+
+ /* Load the picture */
+ src = cl_read_bmp("lenna128x128.bmp", &w, &h);
+ sz = w * h * sizeof(int);
+
+ /* Run the kernel */
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, sz, src);
+ OCL_CREATE_BUFFER(buf[1], 0, sz, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(int), &w);
+ OCL_SET_ARG(3, sizeof(int), &h);
+ OCL_SET_ARG(4, sizeof(int), &chunk);
+ globals[0] = size_t(w/4);
+ globals[1] = h/chunk + ((h%chunk)?1:0);
+ locals[0] = 16;
+ locals[1] = 1;
+ free(src);
+ OCL_NDRANGE(2);
+ OCL_MAP_BUFFER(1);
+ dst = (int*) buf_data[1];
+
+ /* Save the image (for debug purpose) */
+ cl_write_bmp(dst, w, h, "compiler_box_blur.bmp");
+
+ /* Compare with the golden image */
+ OCL_CHECK_IMAGE(dst, w, h, "compiler_box_blur_ref.bmp");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_box_blur);
+
diff --git a/utests/compiler_box_blur_float.cpp b/utests/compiler_box_blur_float.cpp
new file mode 100644
index 0000000..a3c97bc
--- /dev/null
+++ b/utests/compiler_box_blur_float.cpp
@@ -0,0 +1,65 @@
+#include "utest_helper.hpp"
+#include <cmath>
+
+static int *tmp = NULL;
+static struct float4 {float x,y,z,w;} *src = NULL, *dst = NULL;
+static int w = 0;
+static int h = 0;
+static int sz = 0;
+static const size_t chunk = 64;
+
+static void compiler_box_blur_float()
+{
+ OCL_CREATE_KERNEL("compiler_box_blur_float");
+
+ /* Load the picture */
+ tmp = cl_read_bmp("lenna128x128.bmp", &w, &h);
+ sz = w * h * sizeof(float[4]);
+ src = (float4*)malloc(sz);
+
+ /* RGBA -> float4 conversion */
+ const int n = w*h;
+ for (int i = 0; i < n; ++i) {
+ src[i].x = (float) (tmp[i] & 0xff);
+ src[i].y = (float) ((tmp[i] >> 8) & 0xff);
+ src[i].z = (float) ((tmp[i] >> 16) & 0xff);
+ src[i].w = 0.f;
+ }
+ free(tmp);
+
+ /* Run the kernel */
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, sz, src);
+ OCL_CREATE_BUFFER(buf[1], 0, sz, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(int), &w);
+ OCL_SET_ARG(3, sizeof(int), &h);
+ OCL_SET_ARG(4, sizeof(int), &chunk);
+ globals[0] = size_t(w);
+ globals[1] = h/chunk + ((h%chunk)?1:0);
+ locals[0] = 16;
+ locals[1] = 1;
+ free(src);
+ OCL_NDRANGE(2);
+ OCL_MAP_BUFFER(1);
+ dst = (float4*) buf_data[1];
+
+ /* Convert back to RGBA and save */
+ int *tmp = (int*) malloc(n*sizeof(int));
+ for (int i = 0; i < n; ++i) {
+ int to = int(std::min(dst[i].x, 255.f));
+ to |= int(std::min(dst[i].y, 255.f)) << 8;
+ to |= int(std::min(dst[i].z, 255.f)) << 16;
+ tmp[i] = to;
+ }
+
+ /* Save the image (for debug purpose) */
+ cl_write_bmp(tmp, w, h, "compiler_box_blur_float.bmp");
+
+ /* Compare with the golden image */
+ OCL_CHECK_IMAGE(tmp, w, h, "compiler_box_blur_float_ref.bmp");
+ free(tmp);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_box_blur_float);
+
diff --git a/utests/compiler_box_blur_image.cpp b/utests/compiler_box_blur_image.cpp
new file mode 100644
index 0000000..d94a97c
--- /dev/null
+++ b/utests/compiler_box_blur_image.cpp
@@ -0,0 +1,52 @@
+#include "utest_helper.hpp"
+
+static void compiler_box_blur_image()
+{
+ int w, h;
+ cl_image_format format = { };
+ cl_image_desc desc = { };
+ size_t origin[3] = { };
+ size_t region[3];
+ int *src, *dst;
+
+ OCL_CREATE_KERNEL("compiler_box_blur_image");
+
+ /* Load the picture */
+ src = cl_read_bmp("lenna128x128.bmp", &w, &h);
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNORM_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_depth = 1;
+ desc.image_row_pitch = w*sizeof(uint32_t);
+
+ /* Run the kernel */
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, src);
+ free(src);
+ desc.image_row_pitch = 0;
+ OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = w;
+ globals[1] = h;
+ locals[0] = 16;
+ locals[1] = 16;
+ OCL_NDRANGE(2);
+ dst = (int*)malloc(w*h*sizeof(uint32_t));
+ region[0] = w;
+ region[1] = h;
+ region[2] = 1;
+ OCL_READ_IMAGE(buf[1], origin, region, dst);
+
+ /* Save the image (for debug purpose) */
+ cl_write_bmp(dst, w, h, "compiler_box_blur_image.bmp");
+
+ /* Compare with the golden image */
+ OCL_CHECK_IMAGE(dst, w, h, "compiler_box_blur_ref.bmp");
+
+ free(dst);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_box_blur_image);
diff --git a/utests/compiler_byte_scatter.cpp b/utests/compiler_byte_scatter.cpp
new file mode 100644
index 0000000..11300da
--- /dev/null
+++ b/utests/compiler_byte_scatter.cpp
@@ -0,0 +1,24 @@
+#include "utest_helper.hpp"
+
+static void compiler_byte_scatter(void)
+{
+ const size_t n = 128;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_byte_scatter");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int8_t), NULL);
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((int8_t*)buf_data[0])[i] == (int8_t) i);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_byte_scatter);
+
diff --git a/utests/compiler_ceil.cpp b/utests/compiler_ceil.cpp
new file mode 100644
index 0000000..29c7551
--- /dev/null
+++ b/utests/compiler_ceil.cpp
@@ -0,0 +1,43 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, float *src, float *dst) {
+ dst[global_id] = ceilf(src[global_id]);
+}
+
+void compiler_ceil(void)
+{
+ const size_t n = 16;
+ float cpu_dst[16], cpu_src[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_ceil");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_ceil);
diff --git a/utests/compiler_cl_finish.cpp b/utests/compiler_cl_finish.cpp
new file mode 100644
index 0000000..7c7dee3
--- /dev/null
+++ b/utests/compiler_cl_finish.cpp
@@ -0,0 +1,50 @@
+#include "utest_helper.hpp"
+#include <sys/time.h>
+
+#define T_GET(t) gettimeofday(&t, NULL);
+#define T_LAPSE(t1, t2) \
+ ((t2.tv_sec+t2.tv_usec*0.000001) - (t1.tv_sec+t1.tv_usec*0.000001))
+
+static void compiler_cl_finish(void)
+{
+ const size_t n = 16*1024*1024;
+ struct timeval t1, t2;
+ float t_fin, t_map_w_fin,t_map_wo_fin;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("test_cl_finish");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+
+ // Run the kernel
+ locals[0] = 64;
+ globals[0] = 32 * locals[0];
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(int), &n);
+ OCL_SET_ARG(3, sizeof(int), &globals[0]);
+
+ // 1st time map after clFinish
+ OCL_NDRANGE(1);
+ T_GET(t1);
+ OCL_FINISH();
+ T_GET(t2);
+ t_fin = T_LAPSE(t1, t2);
+
+ T_GET(t1);
+ OCL_MAP_BUFFER(0);
+ T_GET(t2);
+ t_map_w_fin = T_LAPSE(t1, t2);
+
+ // 2nd time map without clFinish
+ OCL_NDRANGE(1);
+ T_GET(t1);
+ OCL_MAP_BUFFER(0);
+ T_GET(t2);
+ t_map_wo_fin = T_LAPSE(t1, t2);
+
+ OCL_ASSERT(t_fin > t_map_w_fin && t_map_wo_fin > t_map_w_fin);
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_cl_finish);
diff --git a/utests/compiler_clz_int.cpp b/utests/compiler_clz_int.cpp
new file mode 100644
index 0000000..c12cfc6
--- /dev/null
+++ b/utests/compiler_clz_int.cpp
@@ -0,0 +1,31 @@
+#include "utest_helper.hpp"
+
+void compiler_clz_int(void)
+{
+ const int n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_clz_int");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ ((int*)buf_data[0])[0] = 0;
+ for (int32_t i = 1; i < (int32_t) n; ++i)
+ ((int*)buf_data[0])[i] = 0xffffffffu >> i;
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ OCL_ASSERT(((int*)buf_data[1])[0] == 32);
+ for (int i = 1; i < n; ++i)
+ OCL_ASSERT(((int*)buf_data[1])[i] == i);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_clz_int);
diff --git a/utests/compiler_clz_short.cpp b/utests/compiler_clz_short.cpp
new file mode 100644
index 0000000..eb3a370
--- /dev/null
+++ b/utests/compiler_clz_short.cpp
@@ -0,0 +1,31 @@
+#include "utest_helper.hpp"
+
+void compiler_clz_short(void)
+{
+ const size_t n = 16;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_clz_short");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ ((short*)buf_data[0])[0] = 0;
+ for (int32_t i = 1; i < (int32_t) n; ++i)
+ ((short*)buf_data[0])[i] = 0xffffu >> i;
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ OCL_ASSERT(((short*)buf_data[1])[0] == 16);
+ for (unsigned i = 1; i < (unsigned) n; ++i)
+ OCL_ASSERT(((short*)buf_data[1])[i] == (short)i);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_clz_short);
diff --git a/utests/compiler_constant_expr.cpp b/utests/compiler_constant_expr.cpp
new file mode 100644
index 0000000..8bed03b
--- /dev/null
+++ b/utests/compiler_constant_expr.cpp
@@ -0,0 +1,35 @@
+#include "utest_helper.hpp"
+#include <math.h>
+
+static void compiler_constant_expr(void)
+{
+ const size_t n = 48;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_constant_expr");
+ buf_data[0] = (uint32_t*) malloc(sizeof(float) * n);
+ for (uint32_t i = 0; i < n; ++i) ((float*)buf_data[0])[i] = i;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(float), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n; ++i) {
+ float expect = pow(((float*)buf_data[0])[i], (i % 3) + 1);
+ float err = fabs(((float*)buf_data[1])[i] - expect);
+ OCL_ASSERT(err <= 100 * cl_FLT_ULP(expect));
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_constant_expr);
+
diff --git a/utests/compiler_convert_uchar_sat.cpp b/utests/compiler_convert_uchar_sat.cpp
new file mode 100644
index 0000000..da00041
--- /dev/null
+++ b/utests/compiler_convert_uchar_sat.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, float *src, int *dst) {
+ float f = src[global_id];
+ dst[global_id] = f > 255 ? 255 : f < 0 ? 0 : f;
+}
+
+void compiler_convert_uchar_sat(void)
+{
+ const size_t n = 16;
+ float cpu_src[16];
+ int cpu_dst[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_convert_uchar_sat");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((float*)buf_data[0])[i] = (rand() & 1023) / 2;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((int *)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_convert_uchar_sat);
diff --git a/utests/compiler_copy_buffer.cpp b/utests/compiler_copy_buffer.cpp
new file mode 100644
index 0000000..8066efe
--- /dev/null
+++ b/utests/compiler_copy_buffer.cpp
@@ -0,0 +1,32 @@
+#include "utest_helper.hpp"
+
+static void compiler_copy_buffer(void)
+{
+ const size_t n = 8192 * 4;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("test_copy_buffer");
+ //OCL_CREATE_KERNEL("compiler_array");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+ for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = i;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_buffer);
+
diff --git a/utests/compiler_copy_buffer_row.cpp b/utests/compiler_copy_buffer_row.cpp
new file mode 100644
index 0000000..12c0592
--- /dev/null
+++ b/utests/compiler_copy_buffer_row.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+static void compiler_copy_buffer_row(void)
+{
+ uint32_t *src_buffer = NULL;
+ int *data_buffer = NULL;
+ const int row = 8192;
+ const int row_n = 2;
+ const int n = row * row_n;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("test_copy_buffer_row");
+ src_buffer = (uint32_t *) malloc(sizeof(uint32_t) * n);
+ for (int32_t i = 0; i < n; ++i) src_buffer[i] = i;
+ data_buffer = (int *) malloc(sizeof(int) * 2);
+ data_buffer[0] = row;
+ data_buffer[1] = n;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), src_buffer);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], CL_MEM_COPY_HOST_PTR, 2 * sizeof(uint32_t), data_buffer);
+ free(src_buffer);
+ free(data_buffer);
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check results
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_buffer_row);
+
diff --git a/utests/compiler_copy_image.cpp b/utests/compiler_copy_image.cpp
new file mode 100644
index 0000000..150fd8a
--- /dev/null
+++ b/utests/compiler_copy_image.cpp
@@ -0,0 +1,58 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_copy_image(void)
+{
+ const size_t w = 512;
+ const size_t h = 512;
+ cl_image_format format;
+ cl_image_desc desc;
+ cl_sampler sampler;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("test_copy_image");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h);
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ ((uint32_t*)buf_data[0])[j * w + i] = j * w + i;
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = w * sizeof(uint32_t);
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+
+ desc.image_row_pitch = 0;
+ OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+ OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(sampler), &sampler);
+ globals[0] = w;
+ globals[1] = h;
+ locals[0] = 16;
+ locals[1] = 16;
+ OCL_NDRANGE(2);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1])[j * w + i]);
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image);
diff --git a/utests/compiler_copy_image1.cpp b/utests/compiler_copy_image1.cpp
new file mode 100644
index 0000000..659dddc
--- /dev/null
+++ b/utests/compiler_copy_image1.cpp
@@ -0,0 +1,83 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_copy_image1(void)
+{
+ const size_t w = 512;
+ const size_t h = 512;
+ cl_image_format format;
+ cl_image_desc desc;
+ cl_sampler sampler;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("test_copy_image1");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h);
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ ((uint32_t*)buf_data[0])[j * w + i] = j * w + i;
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = w * sizeof(uint32_t);
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+ OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+
+ desc.image_row_pitch = 0;
+ OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+ OCL_CREATE_IMAGE(buf[2], 0, &format, &desc, NULL);
+ OCL_CREATE_IMAGE(buf[3], 0, &format, &desc, NULL);
+ OCL_CREATE_IMAGE(buf[4], 0, &format, &desc, NULL);
+ OCL_CREATE_IMAGE(buf[5], 0, &format, &desc, NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(sampler), &sampler);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(4, sizeof(cl_mem), &buf[3]);
+ OCL_SET_ARG(5, sizeof(cl_mem), &buf[4]);
+ OCL_SET_ARG(6, sizeof(cl_mem), &buf[5]);
+ float w_inv = 1.0/w;
+ float h_inv = 1.0/h;
+ OCL_SET_ARG(7, sizeof(float), &w_inv);
+ OCL_SET_ARG(8, sizeof(float), &h_inv);
+
+ globals[0] = w;
+ globals[1] = h;
+ locals[0] = 16;
+ locals[1] = 16;
+ OCL_NDRANGE(2);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ OCL_MAP_BUFFER(4);
+ OCL_MAP_BUFFER(5);
+
+ for(uint32_t k = 0; k < 5; k++)
+ {
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1 + k])[j * w + i]);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+ OCL_UNMAP_BUFFER(4);
+ OCL_UNMAP_BUFFER(5);
+
+ OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image1);
diff --git a/utests/compiler_copy_image_1d.cpp b/utests/compiler_copy_image_1d.cpp
new file mode 100644
index 0000000..5af6a77
--- /dev/null
+++ b/utests/compiler_copy_image_1d.cpp
@@ -0,0 +1,52 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_copy_image_1d(void)
+{
+ const size_t w = 512;
+ cl_image_format format;
+ cl_image_desc desc;
+ cl_sampler sampler;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("test_copy_image_1d");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w);
+ for (uint32_t i = 0; i < w; i++)
+ ((uint32_t*)buf_data[0])[i] = i;
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+ desc.image_width = w;
+ desc.image_row_pitch = w * sizeof(uint32_t);
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+
+ desc.image_row_pitch = 0;
+ OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+ OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(sampler), &sampler);
+ globals[0] = w;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < w; i++) {
+ //printf (" %x", ((uint32_t*)buf_data[1])[i]);
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image_1d);
diff --git a/utests/compiler_copy_image_3d.cpp b/utests/compiler_copy_image_3d.cpp
new file mode 100644
index 0000000..de7cd45
--- /dev/null
+++ b/utests/compiler_copy_image_3d.cpp
@@ -0,0 +1,77 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+static void compiler_copy_image_3d(void)
+{
+ const size_t w = 512;
+ const size_t h = 512;
+ const size_t depth = 4;
+ cl_image_format format;
+ cl_image_desc desc;
+ cl_sampler sampler;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("test_copy_image_3d");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h * depth);
+ for (uint32_t k = 0; k < depth; k++)
+ for (uint32_t j = 0; j < h; j++)
+ for (uint32_t i = 0; i < w; i++)
+ ((float*)buf_data[0])[k*w*h + j*w + i] = (k << 10) + (j << 10) + i;
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNORM_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_depth = depth;
+ desc.image_row_pitch = 0;
+ desc.image_slice_pitch = 0;
+
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+ OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+ memset(&desc, 0, sizeof(desc));
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_depth = 1;
+ for(uint32_t i = 0; i < depth; i++)
+ OCL_CREATE_IMAGE(buf[2 + i], 0, &format, &desc, NULL);
+
+ OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(sampler), &sampler);
+ for(uint32_t i = 0; i < depth; i++)
+ OCL_SET_ARG(3 + i, sizeof(cl_mem), &buf[2 + i]);
+ globals[0] = w;
+ globals[1] = h;
+ globals[2] = depth;
+ locals[0] = 64;
+ locals[1] = 1;
+ locals[2] = 1;
+ OCL_NDRANGE(3);
+
+ // Check result
+ for(uint32_t i = 0; i < depth + 2; i++)
+ OCL_MAP_BUFFER_GTT(i);
+ for (uint32_t k = 0; k < depth; k++)
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++) {
+ OCL_ASSERT(((float*)buf_data[0])[k*w*((h+1)&-2LL) + j*w + i] == ((float*)buf_data[1])[k*w*((h+1)&-2LL) + j*w + i]);
+ OCL_ASSERT(((float*)buf_data[0])[k*w*((h+1)&-2LL) + j*w + i] == ((float*)buf_data[k + 2])[j * w + i]);
+ }
+
+ for(uint32_t i = 0; i < depth + 2; i++)
+ OCL_UNMAP_BUFFER_GTT(i);
+
+ OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image_3d);
diff --git a/utests/compiler_data_types.cpp b/utests/compiler_data_types.cpp
new file mode 100644
index 0000000..c686cc7
--- /dev/null
+++ b/utests/compiler_data_types.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_data_types(void)
+{
+ OCL_CREATE_KERNEL("compiler_data_types");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_data_types);
+
diff --git a/utests/compiler_degrees.cpp b/utests/compiler_degrees.cpp
new file mode 100644
index 0000000..7a17ca7
--- /dev/null
+++ b/utests/compiler_degrees.cpp
@@ -0,0 +1,32 @@
+#include "utest_helper.hpp"
+
+void compiler_degrees(void)
+{
+ const int n = 32;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_degrees");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((float *)buf_data[0])[i] = rand() * 0.01f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; ++i) {
+ OCL_ASSERT(((float *)buf_data[1])[i] == src[i] * (180 / 3.141592653589793F));
+ }
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_degrees);
diff --git a/utests/compiler_displacement_map_element.cpp b/utests/compiler_displacement_map_element.cpp
new file mode 100644
index 0000000..98041ec
--- /dev/null
+++ b/utests/compiler_displacement_map_element.cpp
@@ -0,0 +1,64 @@
+#include "utest_helper.hpp"
+
+typedef unsigned int uint;
+constexpr int W = 16, H = 16;
+constexpr int SIZE = W * H;
+uint in_1[SIZE];
+uint disp_map[SIZE];
+uint out_1[SIZE];
+
+uint cpu(const int cx, const int cy, const uint *in, const uint *disp_map, int w, int h) {
+ uint c = disp_map[cy * w + cx];
+ int x_pos = cx + c;
+ int y_pos = cy + c;
+ if(0 <= x_pos && x_pos < w && 0 <= y_pos && y_pos < h)
+ return in[y_pos * w + x_pos];
+ else
+ return 0;
+}
+
+void test() {
+ OCL_MAP_BUFFER(2);
+ for(int y=0; y<H; y++)
+ for(int x=0; x<W; x++) {
+ uint out = ((uint*)buf_data[2]) [y * W + x];
+ uint wish = cpu(x, y, in_1, disp_map, W, H);
+ if(out != wish)
+ printf("XXX %d %d %x %x\n", x, y, out, wish);
+ OCL_ASSERT(out == wish);
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+void displacement_map_element(void) {
+ int i, pass;
+
+ OCL_CREATE_KERNEL("compiler_displacement_map_element");
+ OCL_CREATE_BUFFER(buf[0], 0, SIZE * sizeof(uint), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, SIZE * sizeof(uint), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, SIZE * sizeof(uint), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(W), &W);
+ OCL_SET_ARG(3, sizeof(H), &H);
+ OCL_SET_ARG(4, sizeof(cl_mem), &buf[2]);
+ globals[0] = W;
+ globals[1] = H;
+ locals[0] = 16;
+ locals[1] = 16;
+
+ for (pass = 0; pass < 8; pass ++) {
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (i = 0; i < SIZE; i ++) {
+ in_1[i] = ((uint*)buf_data[0])[i] = ((rand() & 0xFFFF) << 16) | (rand() & 0xFFFF);
+ disp_map[i] = ((uint*)buf_data[1])[i] = rand() & 3;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(2);
+ test();
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(displacement_map_element);
diff --git a/utests/compiler_double.cpp b/utests/compiler_double.cpp
new file mode 100644
index 0000000..7c54ddf
--- /dev/null
+++ b/utests/compiler_double.cpp
@@ -0,0 +1,46 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, double *src, double *dst) {
+ double f = src[global_id];
+ double d = 1.234567890123456789;
+ dst[global_id] = global_id < 14 ? (d * (f + d)) : 14;
+}
+
+void compiler_double(void)
+{
+ const size_t n = 16;
+ double cpu_dst[n], cpu_src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_double");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 1; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((double*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(fabs(((double*)buf_data[1])[i] - cpu_dst[i]) < 1e-4);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double);
diff --git a/utests/compiler_double_2.cpp b/utests/compiler_double_2.cpp
new file mode 100644
index 0000000..7e3ae4b
--- /dev/null
+++ b/utests/compiler_double_2.cpp
@@ -0,0 +1,47 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, float *src, double *dst) {
+ float f = src[global_id];
+ float d = 1.234567890123456789;
+ dst[global_id] = global_id < 14 ? d * (d + f) : 14;
+}
+
+void compiler_double_2(void)
+{
+ const size_t n = 16;
+ float cpu_src[n];
+ double cpu_dst[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_double_2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 1; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(fabs(((double*)buf_data[1])[i] - cpu_dst[i]) < 1e-4);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_2);
diff --git a/utests/compiler_double_3.cpp b/utests/compiler_double_3.cpp
new file mode 100644
index 0000000..294950d
--- /dev/null
+++ b/utests/compiler_double_3.cpp
@@ -0,0 +1,46 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, float *src, double *dst) {
+ float d = 1.234567890123456789;
+ dst[global_id] = global_id < 14 ? d : 14;
+}
+
+void compiler_double_3(void)
+{
+ const size_t n = 16;
+ float cpu_src[n];
+ double cpu_dst[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_double_3");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 1; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(fabs(((double*)buf_data[1])[i] - cpu_dst[i]) < 1e-4);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_3);
diff --git a/utests/compiler_double_4.cpp b/utests/compiler_double_4.cpp
new file mode 100644
index 0000000..cb25bd4
--- /dev/null
+++ b/utests/compiler_double_4.cpp
@@ -0,0 +1,40 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void compiler_double_4(void)
+{
+ const size_t n = 16;
+ double cpu_src1[n], cpu_src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_double_4");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_src1[i] = ((double*)buf_data[0])[i] = rand() * 1e-2;
+ cpu_src2[i] = ((double*)buf_data[1])[i] = rand() * 1e-2;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(fabs(((double*)buf_data[2])[i] - cpu_src1[i] - cpu_src2[i]) < 1e-4);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_4);
diff --git a/utests/compiler_double_precision.cpp b/utests/compiler_double_precision.cpp
new file mode 100644
index 0000000..217fd18
--- /dev/null
+++ b/utests/compiler_double_precision.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+#include <math.h>
+
+static void double_precision_check(void)
+{
+ const size_t n = 16; //8192 * 4;
+
+ double d0 = 0.12345678912345678;
+ double d1 = 0.12355678922345678;
+ float cpu_result = d1 - d0;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("double_precision_check");
+ //OCL_CREATE_KERNEL("compiler_array");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+ for (uint32_t i = 0; i < n; ++i) ((float*)buf_data[0])[i] = 0;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ bool precisionOK = true;
+ for (uint32_t i = 0; i < n; ++i) {
+ float error = ((float*)buf_data[1])[i] - cpu_result;
+ if (error != 0)
+ precisionOK = false;
+ OCL_ASSERT((fabs(error) < 1e-4));
+ }
+ if (!precisionOK)
+ printf("\n - WARN: GPU doesn't have correct double precision. Got %.7G, expected %.7G\n", ((float*)buf_data[1])[0], cpu_result);
+}
+
+MAKE_UTEST_FROM_FUNCTION(double_precision_check);
diff --git a/utests/compiler_fabs.cpp b/utests/compiler_fabs.cpp
new file mode 100644
index 0000000..b14f486
--- /dev/null
+++ b/utests/compiler_fabs.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, float *src, float *dst) {
+ float f = src[global_id];
+ f = f < 0 ? -f : f;
+ dst[global_id] = f;
+}
+
+void compiler_fabs(void)
+{
+ const size_t n = 16;
+ float cpu_dst[16], cpu_src[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_fabs");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fabs);
diff --git a/utests/compiler_fill_gl_image.cpp b/utests/compiler_fill_gl_image.cpp
new file mode 100644
index 0000000..87d2fcd
--- /dev/null
+++ b/utests/compiler_fill_gl_image.cpp
@@ -0,0 +1,76 @@
+#include "utest_helper.hpp"
+
+static void read_back(int tex, int width, int height, uint32_t * resultColor)
+{
+ float vertices[8] = {-1, 1, 1, 1, 1, -1, -1, -1};
+ float tex_coords[8] = {0, 0, 1, 0, 1, 1, 0, 1};
+
+ glBindTexture(GL_TEXTURE_2D, tex);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ glEnable(GL_TEXTURE_2D);
+ glDisable(GL_BLEND);
+ glVertexPointer(2, GL_FLOAT, sizeof(float) * 2, vertices);
+ glEnableClientState(GL_VERTEX_ARRAY);
+ glClientActiveTexture(GL_TEXTURE0);
+ glTexCoordPointer(2, GL_FLOAT, sizeof(float) * 2, tex_coords);
+ glEnableClientState(GL_TEXTURE_COORD_ARRAY);
+ glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+ glFlush();
+ OCL_SWAP_EGL_BUFFERS();
+
+ glReadPixels(0, 0, width, height, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, resultColor);
+}
+
+
+static void compiler_fill_gl_image(void)
+{
+ const size_t w = EGL_WINDOW_WIDTH;
+ const size_t h = EGL_WINDOW_HEIGHT;
+ uint32_t color = 0x123456FF;
+ uint32_t *resultColor;
+ GLuint tex;
+
+ if (eglContext == EGL_NO_CONTEXT) {
+ fprintf(stderr, "There is no valid egl context. Ignore this case.\n");
+ return;
+ }
+ // Setup kernel and images
+ glGenTextures(1, &tex);
+ glBindTexture(GL_TEXTURE_2D, tex);
+ // Must set the all filters to GL_NEAREST!
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, w, h, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, NULL);
+
+ OCL_CREATE_KERNEL("test_fill_gl_image");
+ OCL_CREATE_GL_IMAGE(buf[0], 0, GL_TEXTURE_2D, 0, tex);
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(color), &color);
+ globals[0] = w;
+ globals[1] = h;
+ locals[0] = 16;
+ locals[1] = 16;
+ glFinish();
+ OCL_ENQUEUE_ACQUIRE_GL_OBJECTS(0);
+ OCL_NDRANGE(2);
+ OCL_FLUSH();
+
+ // Check result
+ resultColor = new uint32_t[w * h * 4];
+ if (resultColor == NULL)
+ assert(0);
+
+ read_back(tex, w, h, resultColor);
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ OCL_ASSERT(resultColor[j * w + i] == color);
+ OCL_UNMAP_BUFFER(0);
+ delete resultColor;
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_gl_image);
diff --git a/utests/compiler_fill_image.cpp b/utests/compiler_fill_image.cpp
new file mode 100644
index 0000000..5a38b8c
--- /dev/null
+++ b/utests/compiler_fill_image.cpp
@@ -0,0 +1,44 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image(void)
+{
+ const size_t w = 512;
+ const size_t h = 512;
+ uint32_t color = 0x12345678;
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = 0;
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("test_fill_image");
+
+ OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(color), &color);
+ globals[0] = w;
+ globals[1] = h;
+ locals[0] = 16;
+ locals[1] = 16;
+ OCL_NDRANGE(2);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == 0x78563412);
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image);
diff --git a/utests/compiler_fill_image0.cpp b/utests/compiler_fill_image0.cpp
new file mode 100644
index 0000000..e6e0b1d
--- /dev/null
+++ b/utests/compiler_fill_image0.cpp
@@ -0,0 +1,42 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image0(void)
+{
+ const size_t w = 512;
+ const size_t h = 512;
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = 0;
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("test_fill_image0");
+
+ OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ globals[0] = w;
+ globals[1] = h;
+ locals[0] = 16;
+ locals[1] = 16;
+ OCL_NDRANGE(2);
+
+ // Check result
+ OCL_MAP_BUFFER_GTT(0);
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == (i << 16 | j));
+ OCL_UNMAP_BUFFER_GTT(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image0);
diff --git a/utests/compiler_fill_image_1d.cpp b/utests/compiler_fill_image_1d.cpp
new file mode 100644
index 0000000..e644c5f
--- /dev/null
+++ b/utests/compiler_fill_image_1d.cpp
@@ -0,0 +1,50 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image_1d(void)
+{
+ const size_t w = 2048;
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+ desc.image_width = w;
+ desc.image_row_pitch = 0;
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("test_fill_image_1d");
+
+ OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+ OCL_MAP_BUFFER_GTT(0);
+ for (uint32_t i = 0; i < w; i++) {
+ ((uint32_t*)buf_data[0])[i] = 0;
+ }
+ OCL_UNMAP_BUFFER_GTT(0);
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ globals[0] = w/2;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER_GTT(0);
+ //printf("------ The image result is: -------\n");
+ for (uint32_t i = 0; i < w/2; i++) {
+ //printf(" %2x", ((uint32_t *)buf_data[0])[i]);
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 0x03020100);
+ }
+ for (uint32_t i = w/2; i < w; i++) {
+ //printf(" %2x", ((uint32_t *)buf_data[0])[i]);
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 0);
+ }
+ OCL_UNMAP_BUFFER_GTT(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_1d);
diff --git a/utests/compiler_fill_image_3d.cpp b/utests/compiler_fill_image_3d.cpp
new file mode 100644
index 0000000..ec96e80
--- /dev/null
+++ b/utests/compiler_fill_image_3d.cpp
@@ -0,0 +1,50 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image_3d(void)
+{
+ const size_t w = 512;
+ const size_t h = 512;
+ const size_t depth = 5;
+ uint32_t color = 0x12345678;
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_depth = depth;
+ desc.image_row_pitch = 0;
+ desc.image_slice_pitch = 0;
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("test_fill_image_3d");
+
+ OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(color), &color);
+ globals[0] = w;
+ globals[1] = h;
+ globals[2] = depth;
+ locals[0] = 16;
+ locals[1] = 16;
+ locals[2] = 1;
+ OCL_NDRANGE(3);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ for (uint32_t k = 0; k < depth; k++)
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[k*w*h + j*w + i] == 0x78563412);
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_3d);
diff --git a/utests/compiler_fill_image_3d_2.cpp b/utests/compiler_fill_image_3d_2.cpp
new file mode 100644
index 0000000..410ace8
--- /dev/null
+++ b/utests/compiler_fill_image_3d_2.cpp
@@ -0,0 +1,48 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image_3d_2(void)
+{
+ const size_t w = 512;
+ const size_t h = 512;
+ const size_t depth = 5;
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_depth = depth;
+ desc.image_row_pitch = 0;
+ desc.image_slice_pitch = 0;
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("test_fill_image_3d_2");
+
+ OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ globals[0] = w;
+ globals[1] = h;
+ globals[2] = depth;
+ locals[0] = 16;
+ locals[1] = 16;
+ locals[2] = 1;
+ OCL_NDRANGE(3);
+
+ // Check result
+ OCL_MAP_BUFFER_GTT(0);
+ for (uint32_t k = 0; k < depth; k++)
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[k*w*h + j*w + i] == 0x78563412);
+ OCL_UNMAP_BUFFER_GTT(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_3d_2);
diff --git a/utests/compiler_function_argument.cpp b/utests/compiler_function_argument.cpp
new file mode 100644
index 0000000..a39523b
--- /dev/null
+++ b/utests/compiler_function_argument.cpp
@@ -0,0 +1,27 @@
+#include "utest_helper.hpp"
+
+void compiler_function_argument(void)
+{
+ const size_t n = 2048;
+ const int value = 34;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_function_argument");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(int), &value);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((int*)buf_data[0])[i] == value);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument);
+
+
diff --git a/utests/compiler_function_argument0.cpp b/utests/compiler_function_argument0.cpp
new file mode 100644
index 0000000..2e4227e
--- /dev/null
+++ b/utests/compiler_function_argument0.cpp
@@ -0,0 +1,26 @@
+#include "utest_helper.hpp"
+
+void compiler_function_argument0(void)
+{
+ const size_t n = 2048;
+ const short value = 34;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_function_argument0");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(short), &value);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((int*)buf_data[0])[i] == value);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument0);
+
diff --git a/utests/compiler_function_argument1.cpp b/utests/compiler_function_argument1.cpp
new file mode 100644
index 0000000..48a7677
--- /dev/null
+++ b/utests/compiler_function_argument1.cpp
@@ -0,0 +1,31 @@
+#include "utest_helper.hpp"
+
+void compiler_function_argument1(void)
+{
+ const size_t n = 2048;
+ const char value = 34;
+ const short value0 = 31;
+ const int value1 = 3;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_function_argument1");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(char), &value);
+ OCL_SET_ARG(2, sizeof(short), &value0);
+ OCL_SET_ARG(3, sizeof(int), &value1);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((int*)buf_data[0])[i] == value + value0 + value1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument1);
+
+
diff --git a/utests/compiler_function_argument2.cpp b/utests/compiler_function_argument2.cpp
new file mode 100644
index 0000000..c352a9e
--- /dev/null
+++ b/utests/compiler_function_argument2.cpp
@@ -0,0 +1,57 @@
+#include "utest_helper.hpp"
+
+#define VECSIZE 8
+void compiler_function_argument2(void)
+{
+ char arg0[8] = { 0 };
+ unsigned char arg1[8] = { 0 };
+ short arg2[8] = { 0 };
+ unsigned short arg3[8] = { 0 };
+ int arg4[8] = { 0 };
+ unsigned int arg5[8] = { 0 };
+ float arg6[8] = { 0 };
+
+ for (uint32_t i = 0; i < 8; ++i) {
+ arg0[i] = rand();
+ arg1[i] = rand();
+ arg2[i] = rand();
+ arg3[i] = rand();
+ arg4[i] = rand();
+ arg5[i] = rand();
+ arg6[i] = rand();
+ }
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_function_argument2");
+ OCL_CREATE_BUFFER(buf[0], 0, sizeof(float) * 8 * 8, NULL);
+ OCL_SET_ARG(0, sizeof(arg0), arg0);
+ OCL_SET_ARG(1, sizeof(arg1), arg1);
+ OCL_SET_ARG(2, sizeof(arg2), arg2);
+ OCL_SET_ARG(3, sizeof(arg3), arg3);
+ OCL_SET_ARG(4, sizeof(arg4), arg4);
+ OCL_SET_ARG(5, sizeof(arg5), arg5);
+ OCL_SET_ARG(6, sizeof(arg6), arg6);
+ OCL_SET_ARG(7, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = 1;
+ locals[0] = 1;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ /* Check results */
+ float *dst = (float*)buf_data[0];
+
+ for (uint32_t i = 0; i < 8; ++i) {
+ OCL_ASSERT((float)arg0[i] == dst[0*8 + i]);
+ OCL_ASSERT((float)arg1[i] == dst[1*8 + i]);
+ OCL_ASSERT((float)arg2[i] == dst[2*8 + i]);
+ OCL_ASSERT((float)arg3[i] == dst[3*8 + i]);
+ OCL_ASSERT((float)arg4[i] == dst[4*8 + i]);
+ OCL_ASSERT((float)arg5[i] == dst[5*8 + i]);
+ OCL_ASSERT((float)arg6[i] == dst[6*8 + i]);
+ }
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument2);
diff --git a/utests/compiler_function_argument3.cpp b/utests/compiler_function_argument3.cpp
new file mode 100644
index 0000000..e9f5e80
--- /dev/null
+++ b/utests/compiler_function_argument3.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+struct sfloat8 {
+ float a;
+ float b;
+ float c;
+ float d;
+ float e;
+ float f;
+ float g;
+ float h;
+};
+
+void compiler_function_argument3(void)
+{
+ sfloat8 arg6;
+
+ arg6.a = 3.0f;
+ arg6.h = 4.0f;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_function_argument3");
+ OCL_CREATE_BUFFER(buf[0], 0, sizeof(struct sfloat8) * 8, NULL);
+
+ OCL_SET_ARG(0, sizeof(arg6), &arg6);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = 1;
+ locals[0] = 1;
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+
+ /* Check results */
+ sfloat8 *dst = (sfloat8*)buf_data[0];
+
+ OCL_ASSERT(dst[0].a == 3.0f);
+ OCL_ASSERT(dst[0].b == 12.0f);
+ OCL_ASSERT(dst[0].h == 7.0f);
+
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument3);
diff --git a/utests/compiler_function_constant.cpp b/utests/compiler_function_constant.cpp
new file mode 100644
index 0000000..20f0ece
--- /dev/null
+++ b/utests/compiler_function_constant.cpp
@@ -0,0 +1,34 @@
+#include "utest_helper.hpp"
+
+void compiler_function_constant(void)
+{
+ const size_t n = 2048;
+ const uint32_t value = 34;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_function_constant");
+ OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(uint32_t), &value);
+
+ OCL_MAP_BUFFER(0);
+ for(uint32_t i = 0; i < 69; ++i)
+ ((short *)buf_data[0])[i] = i;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(1);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t *)buf_data[1])[i] == (value + i%69));
+
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_constant);
diff --git a/utests/compiler_function_constant0.cpp b/utests/compiler_function_constant0.cpp
new file mode 100644
index 0000000..6fbbd30
--- /dev/null
+++ b/utests/compiler_function_constant0.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+void compiler_function_constant0(void)
+{
+ const size_t n = 2048;
+ const uint32_t value = 34;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_function_constant0");
+ OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(int32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, 1 * sizeof(char), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(uint32_t), &value);
+
+ OCL_MAP_BUFFER(0);
+ for(uint32_t i = 0; i < 69; ++i)
+ ((int32_t *)buf_data[0])[i] = i;
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_MAP_BUFFER(1);
+ ((char *)buf_data[1])[0] = 15;
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t *)buf_data[2])[i] == (value + 15 + i%69));
+
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_constant0);
diff --git a/utests/compiler_function_constant1.cpp b/utests/compiler_function_constant1.cpp
new file mode 100644
index 0000000..b92e6ca
--- /dev/null
+++ b/utests/compiler_function_constant1.cpp
@@ -0,0 +1,47 @@
+#include "utest_helper.hpp"
+
+void compiler_function_constant1(void)
+{
+ const size_t n = 2048;
+ const uint32_t value = 34;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_function_constant");
+ OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(uint32_t), &value);
+
+ OCL_MAP_BUFFER(0);
+ for(uint32_t i = 0; i < 69; ++i)
+ ((short *)buf_data[0])[i] = i;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ OCL_CREATE_BUFFER(buf[2], 0, 101 * sizeof(short), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[2]);
+ OCL_MAP_BUFFER(2);
+ for(uint32_t i = 0; i < 69; ++i)
+ ((short *)buf_data[2])[i] = 2*i;
+ OCL_UNMAP_BUFFER(2);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t *)buf_data[1])[i] == (value + (i%69)*2));
+
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_constant1);
diff --git a/utests/compiler_function_qualifiers.cpp b/utests/compiler_function_qualifiers.cpp
new file mode 100644
index 0000000..622313c
--- /dev/null
+++ b/utests/compiler_function_qualifiers.cpp
@@ -0,0 +1,20 @@
+#include "utest_helper.hpp"
+
+void compiler_function_qualifiers(void)
+{
+ OCL_CREATE_KERNEL("compiler_function_qualifiers");
+
+ size_t param_value_size;
+ void* param_value;
+ cl_int err;
+
+ err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, 0, NULL, ¶m_value_size);
+ OCL_ASSERT(err == CL_SUCCESS);
+ param_value = malloc(param_value_size);
+ err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, param_value_size, param_value, NULL);
+ OCL_ASSERT(err == CL_SUCCESS);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_qualifiers);
+
+
diff --git a/utests/compiler_geometric_builtin.cpp b/utests/compiler_geometric_builtin.cpp
new file mode 100644
index 0000000..a9ccc2c
--- /dev/null
+++ b/utests/compiler_geometric_builtin.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_geometric_builtin(void)
+{
+ OCL_CREATE_KERNEL("compiler_geometric_builtin");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_geometric_builtin);
+
diff --git a/utests/compiler_get_image_info.cpp b/utests/compiler_get_image_info.cpp
new file mode 100644
index 0000000..3b9d132
--- /dev/null
+++ b/utests/compiler_get_image_info.cpp
@@ -0,0 +1,50 @@
+#include "utest_helper.hpp"
+
+static void compiler_get_image_info(void)
+{
+ const size_t w = 256;
+ const size_t h = 512;
+ const size_t depth = 3;
+ cl_image_format format;
+ cl_image_desc desc;
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_depth = depth;
+ desc.image_row_pitch = 0;
+ desc.image_slice_pitch = 0;
+ desc.num_mip_levels = 0;
+ desc.num_samples = 0;
+ desc.buffer = NULL;
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("test_get_image_info");
+
+ OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, 32 * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, 32 * sizeof(int), NULL);
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = 32;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (uint32_t i = 0; i < 32; i++)
+ {
+ OCL_ASSERT(((uint32_t*)buf_data[1])[i] == ((w << 20) | (h << 8) | depth));
+ OCL_ASSERT(((uint32_t*)buf_data[2])[i] == ((CL_UNSIGNED_INT8 << 16) | CL_RGBA));
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_get_image_info);
diff --git a/utests/compiler_get_image_info_array.cpp b/utests/compiler_get_image_info_array.cpp
new file mode 100644
index 0000000..970877d
--- /dev/null
+++ b/utests/compiler_get_image_info_array.cpp
@@ -0,0 +1,64 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_get_image_info_array(void)
+{
+ const int w = 256;
+ const int h = 512;
+ const int array_size1 = 10;
+ const int array_size2 = 3;
+ cl_image_format format;
+ cl_image_desc desc;
+
+ // Create the 1D array buffer.
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+ desc.image_width = w;
+ desc.image_array_size = array_size1;
+ OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+ // Create the 2D array buffer.
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_array_size = array_size2;
+ OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("test_get_image_info_array");
+
+ OCL_CREATE_BUFFER(buf[2], 0, 32 * sizeof(int), NULL);
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = 32;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(2);
+ OCL_ASSERT(((int*)buf_data[2])[0] == w);
+ OCL_ASSERT(((int*)buf_data[2])[1] == array_size1);
+ OCL_ASSERT(((int*)buf_data[2])[2] == CL_UNSIGNED_INT8);
+ OCL_ASSERT(((int*)buf_data[2])[3] == CL_RGBA);
+
+ OCL_ASSERT(((int*)buf_data[2])[4] == w);
+ OCL_ASSERT(((int*)buf_data[2])[5] == h);
+ OCL_ASSERT(((int*)buf_data[2])[6] == array_size2);
+ OCL_ASSERT(((int*)buf_data[2])[7] == CL_UNSIGNED_INT8);
+ OCL_ASSERT(((int*)buf_data[2])[8] == CL_RGBA);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_get_image_info_array);
diff --git a/utests/compiler_getelementptr_bitcast.cpp b/utests/compiler_getelementptr_bitcast.cpp
new file mode 100644
index 0000000..a57ff36
--- /dev/null
+++ b/utests/compiler_getelementptr_bitcast.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+void compiler_getelementptr_bitcast(void)
+{
+ const size_t n = 16;
+ float cpu_dst[16], cpu_src[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_getelementptr_bitcast");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+
+ //must be 1 to pass the test, it is required by the special usage in the kernel
+ locals[0] = 1;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i){
+ unsigned char* c = (unsigned char*)&cpu_src[i];
+ cpu_dst[i] = c[2];
+ }
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i){
+ //printf("src:%f, gpu_dst: %f, cpu_dst: %f\n", cpu_src[i], ((float *)buf_data[1])[i], cpu_dst[i]);
+ OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+ }
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_getelementptr_bitcast);
diff --git a/utests/compiler_global_constant.cpp b/utests/compiler_global_constant.cpp
new file mode 100644
index 0000000..88f9852
--- /dev/null
+++ b/utests/compiler_global_constant.cpp
@@ -0,0 +1,104 @@
+#include "utest_helper.hpp"
+
+void compiler_global_constant(void)
+{
+ const size_t n = 2048;
+ const uint32_t e = 34, r = 77;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_global_constant");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(uint32_t), &e);
+ OCL_SET_ARG(2, sizeof(uint32_t), &r);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ unsigned int m[3] = {71,72,73};
+
+ // Check results
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i)
+// printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], m[i%3] + e + r);
+ OCL_ASSERT(((uint32_t *)buf_data[0])[i] == m[i%3] + e + r);
+ OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_global_constant1(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant", "compiler_global_constant1");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ uint32_t data1[] = {1, 4, 7};
+ uint32_t data2[]= {3, 7, 11};
+
+ // Check results
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i)
+// printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], data1[i%3] + data2[i%3]);
+ OCL_ASSERT(((uint32_t *)buf_data[0])[i] == data1[i%3] + data2[i%3]);
+ OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_global_constant2(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant", "compiler_global_constant2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check results
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i)
+// printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], 6);
+ OCL_ASSERT(((uint32_t *)buf_data[0])[i] == 6);
+ OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_global_constant3(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant", "compiler_global_constant3");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ uint32_t data1[] = {3, 6, 9};
+ char data2[]= {'c', 'f', 'j'};
+ // Check results
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i)
+// printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], data1[i%3] + (int)data2[i%3]);
+ OCL_ASSERT(((uint32_t *)buf_data[0])[i] == data1[i%3] + (uint32_t)data2[i%3]);
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_global_constant, true);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_global_constant1, true);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_global_constant2, true);
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant3);
diff --git a/utests/compiler_global_constant_2.cpp b/utests/compiler_global_constant_2.cpp
new file mode 100644
index 0000000..cbe63ae
--- /dev/null
+++ b/utests/compiler_global_constant_2.cpp
@@ -0,0 +1,59 @@
+#include "utest_helper.hpp"
+
+void compiler_global_constant_2(void)
+{
+ const size_t n = 2048;
+ const uint32_t e = 34, r = 77;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_global_constant_2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(uint32_t), &e);
+ OCL_SET_ARG(2, sizeof(uint32_t), &r);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ unsigned int m[3] = {0x15b,0x25b,0x35b};
+ unsigned int t[5] = {0x45b,0x55b,0x65b,0x75b,0x85b};
+
+ // Check results
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i)
+// std::cout << ((uint32_t *)buf_data[0])[i] << std::endl;
+ OCL_ASSERT(((uint32_t *)buf_data[0])[i] == m[i%3] + t[i%5] + e + r);
+ OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_global_constant_2_long(void)
+{
+ const size_t n = 2048;
+ const uint32_t e = 34, r = 77;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant_2", "compiler_global_constant_2_long");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(uint32_t), &e);
+ OCL_SET_ARG(2, sizeof(uint32_t), &r);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ uint64_t m[3] = {0x15b,0x25b,0xFFFFFFFFF};
+
+ // Check results
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i)
+// std::cout << ((uint64_t *)buf_data[0])[i] << std::endl;
+ OCL_ASSERT(((uint64_t *)buf_data[0])[i] == m[i%3] + e + r);
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant_2);
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant_2_long);
diff --git a/utests/compiler_global_memory_barrier.cpp b/utests/compiler_global_memory_barrier.cpp
new file mode 100644
index 0000000..ea84e72
--- /dev/null
+++ b/utests/compiler_global_memory_barrier.cpp
@@ -0,0 +1,28 @@
+#include "utest_helper.hpp"
+
+static void compiler_global_memory_barrier(void)
+{
+ const size_t n = 16*1024;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_global_memory_barrier");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ // Run the kernel
+ globals[0] = n/2;
+ locals[0] = 256;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ uint32_t *dst = (uint32_t*)buf_data[0];
+ for (uint32_t i = 0; i < n; i+=locals[0])
+ for (uint32_t j = 0; j < locals[0]; ++j)
+ OCL_ASSERT(dst[i+j] == locals[0] - 1 -j);
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_global_memory_barrier);
diff --git a/utests/compiler_group_size.cpp b/utests/compiler_group_size.cpp
new file mode 100644
index 0000000..8ad83f0
--- /dev/null
+++ b/utests/compiler_group_size.cpp
@@ -0,0 +1,141 @@
+#include "utest_helper.hpp"
+#include <string.h>
+
+struct xyz{
+ unsigned short b;
+ unsigned short e;
+ unsigned int o;
+};
+
+void compiler_group_size1(void)
+{
+ const size_t n = 7*32*17;
+
+ int group_size[] = {7, 17, 32};
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_group_size");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ for(int i = 0; i < 3; i++) {
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = group_size[i];
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+void compiler_group_size2(void)
+{
+ const uint32_t n = 4*17*8;
+ int size_x[] = {2, 4, 17};
+ int size_y[] = {2, 4, 4};
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_group_size");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ for(int i = 0; i < 3; i++) {
+ // Run the kernel
+ globals[0] = 4*17;
+ globals[1] = 8;
+ locals[0] = size_x[i];
+ locals[1] = size_y[i];
+ OCL_NDRANGE(2);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+void compiler_group_size3(void)
+{
+ const uint32_t n = 4*17*8*4;
+ int size_x[] = {2, 4, 17};
+ int size_y[] = {2, 4, 4};
+ int size_z[] = {2, 1, 2};
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_group_size");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ for(int i = 0; i < 3; i++) {
+ // Run the kernel
+ globals[0] = 4*17;
+ globals[1] = 8;
+ globals[2] = 4;
+ locals[0] = size_x[i];
+ locals[1] = size_y[i];
+ locals[2] = size_z[i];
+ OCL_NDRANGE(3);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+void compiler_group_size4(void)
+{
+ const size_t n = 16;
+ uint32_t color = 2;
+ uint32_t num = 1;
+ int group_size[] = {1};
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_group_size", "compiler_group_size4");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(struct xyz), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+
+ for(uint32_t i = 0; i < num; i++) {
+ // Run the kernel
+ OCL_MAP_BUFFER(0);
+ ((struct xyz*)buf_data[0])[0].b = 0;
+ ((struct xyz*)buf_data[0])[0].e = 2;
+ ((struct xyz*)buf_data[0])[0].o = 0;
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_MAP_BUFFER(1);
+ memset(((uint32_t*)buf_data[1]), 0x0, sizeof(uint32_t)*n);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_int), &group_size[i]);
+ OCL_SET_ARG(3, sizeof(cl_int), &color);
+
+ globals[0] = group_size[i];
+ locals[0] = group_size[i];
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(1);
+
+ // Check results
+ for (uint32_t j = 0; j < n; ++j) {
+// std::cout <<((uint32_t*)buf_data[1])[j] << " ";
+ if(j >= i && j <= i+2) {
+ OCL_ASSERT(((uint32_t*)buf_data[1])[j] == color);
+ } else {
+ OCL_ASSERT(((uint32_t*)buf_data[1])[j] == 0);
+ }
+
+ }
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_group_size1, true);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_group_size2, true);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_group_size3, true);
+MAKE_UTEST_FROM_FUNCTION(compiler_group_size4);
+
diff --git a/utests/compiler_hadd.cpp b/utests/compiler_hadd.cpp
new file mode 100644
index 0000000..9723702
--- /dev/null
+++ b/utests/compiler_hadd.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+void compiler_hadd(void)
+{
+ const int n = 32;
+ int src1[n], src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_hadd");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; ++i) {
+ src1[i] = ((int*)buf_data[0])[i] = rand();
+ src2[i] = ((int*)buf_data[1])[i] = rand();
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i) {
+ long long a = src1[i];
+ a += src2[i];
+ a >>= 1;
+ OCL_ASSERT(((int*)buf_data[2])[i] == (int)a);
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_hadd);
diff --git a/utests/compiler_if_else.cpp b/utests/compiler_if_else.cpp
new file mode 100644
index 0000000..e38b23f
--- /dev/null
+++ b/utests/compiler_if_else.cpp
@@ -0,0 +1,64 @@
+#include "utest_helper.hpp"
+
+static void compiler_if_else(void)
+{
+ const size_t n = 17;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_if_else");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+ for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // First control flow
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 16; ++i) {
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+ OCL_ASSERT(((int32_t*)buf_data[0])[i] == 1);
+ }
+
+ // Second control flow
+ for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -1;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 16; ++i) {
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
+ OCL_ASSERT(((int32_t*)buf_data[0])[i] == 2);
+ }
+
+ // Third control flow
+ for (uint32_t i = 0; i < 4; ++i) ((int32_t*)buf_data[0])[i] = 2;
+ for (uint32_t i = 4; i < n; ++i) ((int32_t*)buf_data[0])[i] = -1;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 3; ++i) {
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+ OCL_ASSERT(((int32_t*)buf_data[0])[i] == 1);
+ }
+ OCL_ASSERT(((int32_t*)buf_data[1])[3] == -1);
+ OCL_ASSERT(((int32_t*)buf_data[0])[3] == 1);
+ for (uint32_t i = 4; i < 16; ++i) {
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
+ OCL_ASSERT(((int32_t*)buf_data[0])[i] == 2);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_if_else);
+
diff --git a/utests/compiler_insert_to_constant.cpp b/utests/compiler_insert_to_constant.cpp
new file mode 100644
index 0000000..c4f737f
--- /dev/null
+++ b/utests/compiler_insert_to_constant.cpp
@@ -0,0 +1,30 @@
+#include "utest_helper.hpp"
+
+void compiler_insert_to_constant(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_insert_to_constant");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t[4]), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ uint32_t *data = (uint32_t*) buf_data[0];
+ for (uint32_t i = 0; i < n; ++i) {
+ OCL_ASSERT(data[4*i+0] == 0);
+ OCL_ASSERT(data[4*i+1] == 1);
+ OCL_ASSERT(data[4*i+2] == i);
+ OCL_ASSERT(data[4*i+3] == 3);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_insert_to_constant);
+
+
diff --git a/utests/compiler_insert_vector.cpp b/utests/compiler_insert_vector.cpp
new file mode 100644
index 0000000..c7c239f
--- /dev/null
+++ b/utests/compiler_insert_vector.cpp
@@ -0,0 +1,18 @@
+#include "utest_helper.hpp"
+
+void compiler_insert_vector(void)
+{
+ const size_t n = 2048;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_insert_vector");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int) * 4, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_insert_vector);
diff --git a/utests/compiler_insn_selection_masked_min_max.cpp b/utests/compiler_insn_selection_masked_min_max.cpp
new file mode 100644
index 0000000..6a2edcc
--- /dev/null
+++ b/utests/compiler_insn_selection_masked_min_max.cpp
@@ -0,0 +1,42 @@
+#include "utest_helper.hpp"
+#include <algorithm>
+
+static void compiler_insn_selection_masked_min_max(void)
+{
+ const size_t n = 256;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_insn_selection_masked_min_max");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+ for (uint32_t i = 0; i < n; ++i)
+ ((float*)buf_data[0])[i] = float(i);
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ float *dst = (float*)buf_data[1];
+ float *src = (float*)buf_data[0];
+ for (uint32_t i = 0; i < n; ++i) {
+ float cpu_dst;
+ if (i % 16 > 5)
+ cpu_dst = std::max(src[i], src[7]);
+ else
+ cpu_dst = std::min(src[i], src[10]);
+ OCL_ASSERT(dst[i] == cpu_dst);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_insn_selection_masked_min_max)
+
+
diff --git a/utests/compiler_insn_selection_max.cpp b/utests/compiler_insn_selection_max.cpp
new file mode 100644
index 0000000..8552b9f
--- /dev/null
+++ b/utests/compiler_insn_selection_max.cpp
@@ -0,0 +1,37 @@
+#include "utest_helper.hpp"
+#include <algorithm>
+
+static void compiler_insn_selection_max(void)
+{
+ const size_t n = 8192 * 4;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_insn_selection_max");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+ for (uint32_t i = 0; i < n; ++i)
+ ((float*)buf_data[0])[i] = float(i);
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ float *dst = (float*)buf_data[1];
+ float *src = (float*)buf_data[0];
+ for (uint32_t i = 0; i < n; ++i) {
+ OCL_ASSERT(dst[i] == std::max(src[i], src[0]));
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_insn_selection_max)
+
+
diff --git a/utests/compiler_insn_selection_min.cpp b/utests/compiler_insn_selection_min.cpp
new file mode 100644
index 0000000..f5f9d18
--- /dev/null
+++ b/utests/compiler_insn_selection_min.cpp
@@ -0,0 +1,36 @@
+#include "utest_helper.hpp"
+#include <algorithm>
+
+static void compiler_insn_selection_min(void)
+{
+ const size_t n = 8192 * 4;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_insn_selection_min");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+ for (uint32_t i = 0; i < n; ++i)
+ ((float*)buf_data[0])[i] = float(i);
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ float *dst = (float*)buf_data[1];
+ float *src = (float*)buf_data[0];
+ for (uint32_t i = 0; i < n; ++i) {
+ OCL_ASSERT(dst[i] == std::min(src[i], src[0]));
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_insn_selection_min)
+
diff --git a/utests/compiler_integer_builtin.cpp b/utests/compiler_integer_builtin.cpp
new file mode 100644
index 0000000..98ad51b
--- /dev/null
+++ b/utests/compiler_integer_builtin.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_integer_builtin(void)
+{
+ OCL_CREATE_KERNEL("compiler_integer_builtin");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_integer_builtin);
+
diff --git a/utests/compiler_integer_division.cpp b/utests/compiler_integer_division.cpp
new file mode 100644
index 0000000..3898ae1
--- /dev/null
+++ b/utests/compiler_integer_division.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst, int x) {
+ dst[global_id] = src[global_id] / x;
+}
+
+void compiler_integer_division(void)
+{
+ const size_t n = 16;
+ int cpu_dst[16], cpu_src[16];
+ const int x = 7;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_integer_division");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(x), &x);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 1000;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst, x);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < 11; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_integer_division);
diff --git a/utests/compiler_integer_remainder.cpp b/utests/compiler_integer_remainder.cpp
new file mode 100644
index 0000000..100f464
--- /dev/null
+++ b/utests/compiler_integer_remainder.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst, int x) {
+ dst[global_id] = src[global_id] % x;
+}
+
+void compiler_integer_remainder(void)
+{
+ const size_t n = 16;
+ int cpu_dst[16], cpu_src[16];
+ const int x = 7;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_integer_remainder");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(x), &x);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst, x);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < 11; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_integer_remainder);
diff --git a/utests/compiler_load_bool_imm.cpp b/utests/compiler_load_bool_imm.cpp
new file mode 100644
index 0000000..d060daf
--- /dev/null
+++ b/utests/compiler_load_bool_imm.cpp
@@ -0,0 +1,29 @@
+#include "utest_helper.hpp"
+
+static void compiler_load_bool_imm(void)
+{
+ const size_t n = 1024;
+ const size_t local_size = 16;
+ const int copiesPerWorkItem = 5;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_load_bool_imm");
+ OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, local_size*copiesPerWorkItem*sizeof(int), NULL); // 16 x int
+ OCL_SET_ARG(2, sizeof(int), &copiesPerWorkItem); // 16 x int
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = local_size;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ int *dst = (int*)buf_data[0];
+ for (uint32_t i = 0; i < n * copiesPerWorkItem; i++)
+ OCL_ASSERT(dst[i] == copiesPerWorkItem);
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_load_bool_imm);
diff --git a/utests/compiler_local_memory_barrier.cpp b/utests/compiler_local_memory_barrier.cpp
new file mode 100644
index 0000000..6c9c98e
--- /dev/null
+++ b/utests/compiler_local_memory_barrier.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static void compiler_local_memory_barrier(void)
+{
+ const size_t n = 1024;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_local_memory_barrier");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, 64, NULL); // 16 x int
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ uint32_t *dst = (uint32_t*)buf_data[0];
+ for (uint32_t i = 0; i < n; i+=16)
+ for (uint32_t j = 0; j < 16; ++j)
+ OCL_ASSERT(dst[i+j] == 15-j);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_local_memory_barrier);
+
diff --git a/utests/compiler_local_memory_barrier_2.cpp b/utests/compiler_local_memory_barrier_2.cpp
new file mode 100644
index 0000000..4fa090b
--- /dev/null
+++ b/utests/compiler_local_memory_barrier_2.cpp
@@ -0,0 +1,29 @@
+#include "utest_helper.hpp"
+
+static void compiler_local_memory_barrier_2(void)
+{
+ const size_t n = 16*1024;
+
+ globals[0] = n/2;
+ locals[0] = 256;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_local_memory_barrier_2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ //OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, locals[0] * 2 * sizeof(uint32_t), NULL);
+
+ // Run the kernel
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ uint32_t *dst = (uint32_t*)buf_data[0];
+ for (uint32_t i = 0; i < n; i+=locals[0])
+ for (uint32_t j = 0; j < locals[0]; ++j)
+ OCL_ASSERT(dst[i+j] == locals[0] - 1 -j);
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_local_memory_barrier_2);
diff --git a/utests/compiler_local_memory_barrier_wg64.cpp b/utests/compiler_local_memory_barrier_wg64.cpp
new file mode 100644
index 0000000..0cb69f5
--- /dev/null
+++ b/utests/compiler_local_memory_barrier_wg64.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static void compiler_local_memory_barrier_wg64(void)
+{
+ const size_t n = 1024;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_local_memory_barrier_wg64");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, 256, NULL); // 64 x int
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 64;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ uint32_t *dst = (uint32_t*)buf_data[0];
+ for (uint32_t i = 0; i < n; i+=64)
+ for (uint32_t j = 0; j < 64; ++j)
+ OCL_ASSERT(dst[i+j] == 63-j);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_local_memory_barrier_wg64);
+
diff --git a/utests/compiler_local_memory_two_ptr.cpp b/utests/compiler_local_memory_two_ptr.cpp
new file mode 100644
index 0000000..fde5533
--- /dev/null
+++ b/utests/compiler_local_memory_two_ptr.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static void compiler_local_memory_two_ptr(void)
+{
+ const size_t n = 1024;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_local_memory_two_ptr");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, 64, NULL); // 16 x int
+ OCL_SET_ARG(2, 64, NULL); // 16 x int
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ int32_t *dst = (int32_t*)buf_data[0];
+ for (int32_t i = 0; i < (int) n; i+=16)
+ for (int32_t j = 0; j < 16; ++j) {
+ const int gid = i + j;
+ const int tid = j;
+ OCL_ASSERT(dst[i+j] == (gid&~0xf) + 15-tid + 15-tid);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_local_memory_two_ptr);
+
diff --git a/utests/compiler_local_slm.cpp b/utests/compiler_local_slm.cpp
new file mode 100644
index 0000000..3a0c1ed
--- /dev/null
+++ b/utests/compiler_local_slm.cpp
@@ -0,0 +1,33 @@
+#include "utest_helper.hpp"
+
+void compiler_local_slm(void)
+{
+ const size_t n = 32;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_local_slm", "compiler_local_slm");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == (i%16 + 2 + 1+ i/16));
+ OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_local_slm1(void)
+{
+ const size_t n = 2;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_local_slm", "compiler_local_slm1");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ globals[0] = 1;
+ locals[0] = 1;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ uint64_t * ptr = (uint64_t*)buf_data[0];
+ OCL_ASSERT((ptr[1] -ptr[0]) == 4);
+ OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_local_slm);
+MAKE_UTEST_FROM_FUNCTION(compiler_local_slm1);
diff --git a/utests/compiler_long.cpp b/utests/compiler_long.cpp
new file mode 100644
index 0000000..b525694
--- /dev/null
+++ b/utests/compiler_long.cpp
@@ -0,0 +1,60 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long(void)
+{
+ const size_t n = 16;
+ int64_t src1[n], src2[n];
+
+ int64_t zero = 0;
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_long), &zero);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ src1[0] = -1L, src2[0] = -1L;
+ src1[1] = 0x8000000000000000UL, src2[1] = 0x8000000000000000UL;
+ src1[2] = 0x7FFFFFFFFFFFFFFFL, src2[2] = 1L;
+ src1[3] = 0xFFFFFFFEL, src2[3] = 1L;
+ src1[4] = 0x7FFFFFFFL, src2[4] = 0x80000000L;
+ src1[5] = 0, src2[5] = 0;
+ src1[6] = 0, src2[6] = 1;
+ src1[7] = -2L, src2[7] = -1L;
+ src1[8] = 0, src2[8] = 0x8000000000000000UL;
+ for (int32_t i = 9; i < (int32_t) n; ++i) {
+ src1[i] = ((int64_t)rand() << 32) + rand();
+ src2[i] = ((int64_t)rand() << 32) + rand();
+ }
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src1, sizeof(src1));
+ memcpy(buf_data[1], src2, sizeof(src2));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%lx\n", ((int64_t *)buf_data[2])[i]);
+ if (i < 5)
+ OCL_ASSERT(src1[i] + src2[i] == ((int64_t *)buf_data[2])[i]);
+ if (i > 5)
+ OCL_ASSERT(src1[i] - src2[i] == ((int64_t *)buf_data[2])[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long);
diff --git a/utests/compiler_long_2.cpp b/utests/compiler_long_2.cpp
new file mode 100644
index 0000000..6c5da4b
--- /dev/null
+++ b/utests/compiler_long_2.cpp
@@ -0,0 +1,51 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_2(void)
+{
+ const size_t n = 16;
+ int64_t src1[n], src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src1[i] = ((int64_t)rand() << 32) + rand();
+ src2[i] = ((int64_t)rand() << 32) + rand();
+ }
+ src1[4] = 1;
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src1, sizeof(src1));
+ memcpy(buf_data[1], src2, sizeof(src2));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ int64_t *dest = ((int64_t *)buf_data[2]);
+ //for (int32_t i = 0; i < (int32_t) n; ++i)
+ // printf("%lx\n", dest[i]);
+ OCL_ASSERT(0xFEDCBA9876543210UL == (uint64_t)dest[0]);
+ OCL_ASSERT((src1[1] & src2[1]) == dest[1]);
+ OCL_ASSERT((src1[2] | src2[2]) == dest[2]);
+ OCL_ASSERT((src1[3] ^ src2[3]) == dest[3]);
+ OCL_ASSERT(0x1122334455667788L == dest[4]);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_2);
diff --git a/utests/compiler_long_asr.cpp b/utests/compiler_long_asr.cpp
new file mode 100644
index 0000000..0a70a23
--- /dev/null
+++ b/utests/compiler_long_asr.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_asr(void)
+{
+ const size_t n = 64;
+ int64_t src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_asr");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ src[i] = (int64_t)1 << 63;
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], src, sizeof(src));
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ int64_t *dest = ((int64_t *)buf_data[1]);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ if (i > 7)
+ OCL_ASSERT(dest[i] == src[i] >> i);
+ else
+ OCL_ASSERT(dest[i] == src[i] + 1);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_asr);
diff --git a/utests/compiler_long_cmp.cpp b/utests/compiler_long_cmp.cpp
new file mode 100644
index 0000000..35d4c4f
--- /dev/null
+++ b/utests/compiler_long_cmp.cpp
@@ -0,0 +1,122 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_cmp(void)
+{
+ const size_t n = 16;
+ int64_t src1[n], src2[n];
+
+ src1[0] = (int64_t)1 << 63, src2[0] = 0x7FFFFFFFFFFFFFFFll;
+ src1[1] = (int64_t)1 << 63, src2[1] = ((int64_t)1 << 63) | 1;
+ src1[2] = -1ll, src2[2] = 0;
+ src1[3] = ((int64_t)123 << 32) | 0x7FFFFFFF, src2[3] = ((int64_t)123 << 32) | 0x80000000;
+ src1[4] = 0x7FFFFFFFFFFFFFFFll, src2[4] = (int64_t)1 << 63;
+ src1[5] = ((int64_t)1 << 63) | 1, src2[5] = (int64_t)1 << 63;
+ src1[6] = 0, src2[6] = -1ll;
+ src1[7] = ((int64_t)123 << 32) | 0x80000000, src2[7] = ((int64_t)123 << 32) | 0x7FFFFFFF;
+ for(size_t i=8; i<n; i++) {
+ src1[i] = i;
+ src2[i] = i;
+ }
+
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src1, sizeof(src1));
+ memcpy(buf_data[1], src2, sizeof(src2));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_l");
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] < src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+ OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_le");
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] <= src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+ OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_g");
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] > src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+ OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_ge");
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] >= src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+ OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_eq");
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] == src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+ OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_neq");
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] != src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_cmp);
diff --git a/utests/compiler_long_convert.cpp b/utests/compiler_long_convert.cpp
new file mode 100644
index 0000000..ada6926
--- /dev/null
+++ b/utests/compiler_long_convert.cpp
@@ -0,0 +1,158 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+// convert shorter integer to 64-bit integer
+void compiler_long_convert(void)
+{
+ const size_t n = 16;
+ char src1[n];
+ short src2[n];
+ int src3[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_convert");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(char), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[4], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[5], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ OCL_SET_ARG(4, sizeof(cl_mem), &buf[4]);
+ OCL_SET_ARG(5, sizeof(cl_mem), &buf[5]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src1[i] = -i;
+ src2[i] = -i;
+ src3[i] = -i;
+ }
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ memcpy(buf_data[0], src1, sizeof(src1));
+ memcpy(buf_data[1], src2, sizeof(src2));
+ memcpy(buf_data[2], src3, sizeof(src3));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(3);
+ OCL_MAP_BUFFER(4);
+ OCL_MAP_BUFFER(5);
+ int64_t *dst1 = ((int64_t *)buf_data[3]);
+ int64_t *dst2 = ((int64_t *)buf_data[4]);
+ int64_t *dst3 = ((int64_t *)buf_data[5]);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%lx %lx %lx\n", dst1[i], dst2[i], dst3[i]);
+ OCL_ASSERT(dst1[i] == -(int64_t)i);
+ OCL_ASSERT(dst2[i] == -(int64_t)i);
+ OCL_ASSERT(dst3[i] == -(int64_t)i);
+ }
+ OCL_UNMAP_BUFFER(3);
+ OCL_UNMAP_BUFFER(4);
+ OCL_UNMAP_BUFFER(5);
+}
+
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_long_convert, true);
+
+// convert 64-bit integer to shorter integer
+void compiler_long_convert_2(void)
+{
+ const size_t n = 16;
+ int64_t src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_convert", "compiler_long_convert_2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(char), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src[i] = -i;
+ }
+ OCL_MAP_BUFFER(3);
+ memcpy(buf_data[3], src, sizeof(src));
+ OCL_UNMAP_BUFFER(3);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ char *dst1 = ((char *)buf_data[0]);
+ short *dst2 = ((short *)buf_data[1]);
+ int *dst3 = ((int *)buf_data[2]);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%x %x %x\n", dst1[i], dst2[i], dst3[i]);
+ OCL_ASSERT(dst1[i] == -i);
+ OCL_ASSERT(dst2[i] == -i);
+ OCL_ASSERT(dst3[i] == -i);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_long_convert_2, true);
+
+// convert 64-bit integer to 32-bit float
+void compiler_long_convert_to_float(void)
+{
+ const size_t n = 16;
+ int64_t src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_convert", "compiler_long_convert_to_float");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src[i] = -(int64_t)i;
+ }
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[1], src, sizeof(src));
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ float *dst = ((float *)buf_data[0]);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%f\n", dst[i]);
+ OCL_ASSERT(dst[i] == src[i]);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_convert_to_float);
diff --git a/utests/compiler_long_mult.cpp b/utests/compiler_long_mult.cpp
new file mode 100644
index 0000000..06070f7
--- /dev/null
+++ b/utests/compiler_long_mult.cpp
@@ -0,0 +1,49 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_mult(void)
+{
+ const size_t n = 16;
+ int64_t src1[n], src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_mult");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src1[i] = 0x77665544FFEEDDCCLL;
+ src2[i] = ((int64_t)rand() << 32) + rand();
+ }
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src1, sizeof(src1));
+ memcpy(buf_data[1], src2, sizeof(src2));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%lx\n", ((int64_t *)buf_data[2])[i]);
+ if (i < 3)
+ OCL_ASSERT(src1[i] + src2[i] == ((int64_t *)buf_data[2])[i]);
+ else
+ OCL_ASSERT(src1[i] * src2[i] == ((int64_t *)buf_data[2])[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_mult);
diff --git a/utests/compiler_long_shl.cpp b/utests/compiler_long_shl.cpp
new file mode 100644
index 0000000..c8e4624
--- /dev/null
+++ b/utests/compiler_long_shl.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_shl(void)
+{
+ const size_t n = 64;
+ int64_t src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_shl");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ src[i] = 1;
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], src, sizeof(src));
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ int64_t *dest = ((int64_t *)buf_data[1]);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ if (i > 7)
+ OCL_ASSERT(dest[i] == ((int64_t)1) << i);
+ else
+ OCL_ASSERT(dest[i] == src[i] + 1);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_shl);
diff --git a/utests/compiler_long_shr.cpp b/utests/compiler_long_shr.cpp
new file mode 100644
index 0000000..e9fea6a
--- /dev/null
+++ b/utests/compiler_long_shr.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_shr(void)
+{
+ const size_t n = 64;
+ uint64_t src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_shr");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ src[i] = (uint64_t)1 << 63;
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], src, sizeof(src));
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ uint64_t *dest = ((uint64_t *)buf_data[1]);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ if (i > 7)
+ OCL_ASSERT(dest[i] == src[i] >> i);
+ else
+ OCL_ASSERT(dest[i] == src[i] + 1);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_shr);
diff --git a/utests/compiler_lower_return0.cpp b/utests/compiler_lower_return0.cpp
new file mode 100644
index 0000000..0e9dbd0
--- /dev/null
+++ b/utests/compiler_lower_return0.cpp
@@ -0,0 +1,54 @@
+#include "utest_helper.hpp"
+
+static void compiler_lower_return0(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_lower_return0");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+ for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // First control flow
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < 32; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == i);
+
+ // Second control flow
+ for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 32; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
+
+ // Third control flow
+ for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < 8; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == i);
+ for (int32_t i = 8; i < 32; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_lower_return0);
+
+
diff --git a/utests/compiler_lower_return1.cpp b/utests/compiler_lower_return1.cpp
new file mode 100644
index 0000000..b4f1fe3
--- /dev/null
+++ b/utests/compiler_lower_return1.cpp
@@ -0,0 +1,47 @@
+#include "utest_helper.hpp"
+
+static void compiler_lower_return1(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_lower_return1");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+ for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // First control flow
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < 11; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == i);
+ for (int32_t i = 11; i < 16; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+
+ // Second control flow
+ for (uint32_t i = 0; i < 4; ++i) ((int32_t*)buf_data[0])[i] = -2;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < 4; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
+ for (int32_t i = 4; i < 11; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == i);
+ for (int32_t i = 11; i < 16; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_lower_return1);
+
diff --git a/utests/compiler_lower_return2.cpp b/utests/compiler_lower_return2.cpp
new file mode 100644
index 0000000..1e34036
--- /dev/null
+++ b/utests/compiler_lower_return2.cpp
@@ -0,0 +1,48 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+ const int id = global_id;
+ dst[id] = id;
+ while (dst[id] > src[id]) {
+ if (dst[id] > 10) return;
+ dst[id]--;
+ }
+ dst[id] += 2;
+}
+
+static void compiler_lower_return2(void)
+{
+ const size_t n = 16;
+ int cpu_dst[16], cpu_src[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_lower_return2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < 11; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_lower_return2);
+
diff --git a/utests/compiler_mad24.cpp b/utests/compiler_mad24.cpp
new file mode 100644
index 0000000..a3890a1
--- /dev/null
+++ b/utests/compiler_mad24.cpp
@@ -0,0 +1,41 @@
+#include "utest_helper.hpp"
+
+void compiler_mad24(void)
+{
+ const int n = 32;
+ int src1[n], src2[n], src3[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_mad24");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i) {
+ src1[i] = ((int*)buf_data[0])[i] = rand();
+ src2[i] = ((int*)buf_data[1])[i] = rand();
+ src3[i] = ((int*)buf_data[2])[i] = rand();
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(3);
+ for (int i = 0; i < n; ++i)
+ OCL_ASSERT(((int*)buf_data[3])[i] == ((src1[i] << 8) >> 8) * ((src2[i] << 8) >> 8) + src3[i]);
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mad24);
diff --git a/utests/compiler_mad_hi.cpp b/utests/compiler_mad_hi.cpp
new file mode 100644
index 0000000..6f66e7f
--- /dev/null
+++ b/utests/compiler_mad_hi.cpp
@@ -0,0 +1,46 @@
+#include "utest_helper.hpp"
+
+void compiler_mad_hi(void)
+{
+ const int n = 32;
+ int src1[n], src2[n], src3[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_mad_hi");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i) {
+ src1[i] = ((int*)buf_data[0])[i] = rand();
+ src2[i] = ((int*)buf_data[1])[i] = rand();
+ src3[i] = ((int*)buf_data[2])[i] = rand();
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(3);
+ for (int i = 0; i < n; ++i) {
+ long long a = src1[i];
+ a *= src2[i];
+ a >>= 32;
+ a += src3[i];
+ OCL_ASSERT(((int*)buf_data[3])[i] == (int)a);
+ }
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mad_hi);
diff --git a/utests/compiler_mandelbrot.cpp b/utests/compiler_mandelbrot.cpp
new file mode 100644
index 0000000..7758dae
--- /dev/null
+++ b/utests/compiler_mandelbrot.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static int *dst = NULL;
+static const size_t w = 256;
+static const size_t h = 256;
+
+static void compiler_mandelbrot(void)
+{
+ const size_t global[2] = {w, h};
+ const size_t local[2] = {16, 1};
+ const size_t sz = w * h * sizeof(char[4]);
+
+ OCL_CREATE_KERNEL("compiler_mandelbrot");
+
+ OCL_CREATE_BUFFER(buf[0], 0, sz, NULL);
+ OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &buf[0]);
+ OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
+ OCL_MAP_BUFFER(0);
+ dst = (int *) buf_data[0];
+
+ /* Save the image (for debug purpose) */
+ cl_write_bmp(dst, w, h, "compiler_mandelbrot.bmp");
+
+ /* Compare with the golden image */
+ OCL_CHECK_IMAGE(dst, w, h, "compiler_mandelbrot_ref.bmp");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mandelbrot);
+
diff --git a/utests/compiler_mandelbrot_alternate.cpp b/utests/compiler_mandelbrot_alternate.cpp
new file mode 100644
index 0000000..2e5d59f
--- /dev/null
+++ b/utests/compiler_mandelbrot_alternate.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static int *dst = NULL;
+static const size_t w = 256;
+static const size_t h = 256;
+static const float criterium = 4.f;
+
+static void compiler_mandelbrot_alternate(void)
+{
+ const size_t global[2] = {w, h};
+ const size_t local[2] = {16, 1};
+ const size_t sz = w * h * sizeof(char[4]);
+ const float rcpWidth = 1.f / float(w);
+ const float rcpHeight = 1.f / float(h);
+
+ OCL_CREATE_KERNEL("compiler_mandelbrot_alternate");
+
+ OCL_CREATE_BUFFER(buf[0], 0, sz, NULL);
+ OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &buf[0]);
+ OCL_CALL (clSetKernelArg, kernel, 1, sizeof(float), &rcpWidth);
+ OCL_CALL (clSetKernelArg, kernel, 2, sizeof(float), &rcpHeight);
+ OCL_CALL (clSetKernelArg, kernel, 3, sizeof(float), &criterium);
+ OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
+ OCL_MAP_BUFFER(0);
+ dst = (int *) buf_data[0];
+
+ /* Save the image (for debug purpose) */
+ cl_write_bmp(dst, w, h, "compiler_mandelbrot_alternate.bmp");
+
+ /* Compare with the golden image */
+ OCL_CHECK_IMAGE(dst, w, h, "compiler_mandelbrot_alternate_ref.bmp");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mandelbrot_alternate);
+
diff --git a/utests/compiler_math.cpp b/utests/compiler_math.cpp
new file mode 100644
index 0000000..e0c4487
--- /dev/null
+++ b/utests/compiler_math.cpp
@@ -0,0 +1,89 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+static void cpu_compiler_math(float *dst, float *src, int i)
+{
+ const float x = src[i];
+ const float PI = 3.141592653589793f;
+ switch (i) {
+ case 0: dst[i] = cosf(x); break;
+ case 1: dst[i] = sinf(x); break;
+ case 2: dst[i] = log2f(x); break;
+ case 3: dst[i] = sqrtf(x); break;
+ case 4: dst[i] = 1.f/ sqrtf(x); break;
+ case 5: dst[i] = 1.f / x; break;
+ case 6: dst[i] = tanf(x); break;
+ case 7: dst[i] = powf(x, 0.3333333333333333333f); break;
+ case 8: dst[i] = ceilf(x); break;
+ case 9: dst[i] = cosf(PI * x); break;
+ case 10: dst[i] = powf(2, x); break;
+ case 11: dst[i] = powf(10, x); break;
+ case 12: dst[i] = expf(x) - 1; break;
+ case 13: dst[i] = logf(x + 1); break;
+ case 14: dst[i] = floorf(log2f(x)); break;
+ case 15: dst[i] = sinf(PI * x); break;
+ case 16: dst[i] = tanf(PI * x); break;
+ case 17: dst[i] = 2 * roundf(x / 2); break;
+ case 18: dst[i] = sinhf(x); break;
+ case 19: dst[i] = coshf(x); break;
+ case 20: dst[i] = tanhf(x); break;
+ case 21: dst[i] = asinhf(x); break;
+ case 22: dst[i] = acoshf(x); break;
+ case 23: dst[i] = atanhf(x); break;
+ case 24: dst[i] = asinf(x); break;
+ case 25: dst[i] = acosf(x); break;
+ case 26: dst[i] = atanf(x); break;
+ case 27: dst[i] = asinf(x) / PI; break;
+ case 28: dst[i] = acosf(x) / PI; break;
+ case 29: dst[i] = atanf(x) / PI; break;
+ case 30: dst[i] = erff(x); break;
+ case 31: dst[i] = nanf(""); break;
+ default: dst[i] = 1.f; break;
+ };
+}
+
+static void compiler_math(void)
+{
+ const size_t n = 32;
+ float cpu_dst[32], cpu_src[32];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_math");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ int j;
+ for(j = 0; j < 1000; j ++) {
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 32; ++i)
+ cpu_src[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < 16; ++i)
+ cpu_compiler_math(cpu_dst, cpu_src, i);
+ for (int i = 0; i < 16; ++i) {
+ const float cpu = cpu_dst[i];
+ const float gpu = ((float*)buf_data[0])[i];
+ if (isinf(cpu))
+ OCL_ASSERT(isinf(gpu));
+ else if (isnan(cpu))
+ OCL_ASSERT(isnan(gpu));
+ else
+ OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math)
+
+
diff --git a/utests/compiler_math_2op.cpp b/utests/compiler_math_2op.cpp
new file mode 100644
index 0000000..454967d
--- /dev/null
+++ b/utests/compiler_math_2op.cpp
@@ -0,0 +1,80 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+static float rnde(float v) {
+ if(v - floorf(v) > 0.5f)
+ return floorf(v) + 1;
+ if(v - floorf(v) < 0.5f)
+ return floorf(v);
+ if((int)(floorf(v)) & 1)
+ return floorf(v) + 1;
+ return floorf(v);
+}
+
+static void cpu_compiler_math(float *dst, float *src1, float *src2, int i)
+{
+ const float x = src1[i], y = src2[i];
+ switch (i) {
+ case 0: dst[i] = x / y; break;
+ case 1: dst[i] = x > y ? x - y : 0; break;
+ case 2: dst[i] = fminf(x - floorf(x), 0x1.FFFFFep-1F); break;
+ case 3: dst[i] = sqrtf(x*x + y*y); break;
+ case 4: dst[i] = x * powf(2, (int)y); break;
+ case 5: dst[i] = powf(x, (int)y); break;
+ case 6: dst[i] = x - rnde(x/y)*y; break;
+ case 7: dst[i] = powf(x, 1.f/(int)(y+1)); break;
+ case 8: dst[i] = x * y < 0 ? -x : x; break;
+ case 9: dst[i] = fabsf(x) > fabsf(y) ? x : fabsf(y) > fabsf(x) ? y : fmaxf(x, y); break;
+ case 10: dst[i] = fabsf(x) < fabsf(y) ? x : fabsf(y) < fabsf(x) ? y : fminf(x, y); break;
+ default: dst[i] = 1.f; break;
+ };
+}
+
+static void compiler_math_2op(void)
+{
+ const size_t n = 32;
+ float cpu_dst[32], cpu_src1[32], cpu_src2[32];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_math_2op");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ int j;
+ for(j = 0; j < 1000; j ++) {
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (uint32_t i = 0; i < 32; ++i) {
+ cpu_src1[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
+ cpu_src2[i] = ((float*)buf_data[2])[i] = .1f * (rand() & 15);
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_NDRANGE(1);
+
+ for (int i = 0; i < 16; ++i)
+ cpu_compiler_math(cpu_dst, cpu_src1, cpu_src2, i);
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < 16; ++i) {
+ const float cpu = cpu_dst[i];
+ const float gpu = ((float*)buf_data[0])[i];
+ if (isinf(cpu))
+ OCL_ASSERT(isinf(gpu));
+ else if (isnan(cpu))
+ OCL_ASSERT(isnan(gpu));
+ else {
+ OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
+ }
+ }
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math_2op)
diff --git a/utests/compiler_math_3op.cpp b/utests/compiler_math_3op.cpp
new file mode 100644
index 0000000..a382b0a
--- /dev/null
+++ b/utests/compiler_math_3op.cpp
@@ -0,0 +1,64 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+static void cpu_compiler_math(float *dst, float *src1, float *src2, float *src3, int i)
+{
+ const float x = src1[i], y = src2[i], z = src3[i];
+ switch (i) {
+ case 0: dst[i] = x * y + z; break;
+ case 1: dst[i] = x * y + z; break;
+ default: dst[i] = 1.f; break;
+ };
+}
+
+static void compiler_math_3op(void)
+{
+ const size_t n = 32;
+ float cpu_dst[32], cpu_src1[32], cpu_src2[32], cpu_src3[32];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_math_3op");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ for (int j = 0; j < 1000; j ++) {
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (uint32_t i = 0; i < 32; ++i) {
+ cpu_src1[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
+ cpu_src2[i] = ((float*)buf_data[2])[i] = .1f * (rand() & 15);
+ cpu_src3[i] = ((float*)buf_data[3])[i] = .1f * (rand() & 15);
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+ OCL_NDRANGE(1);
+
+ for (int i = 0; i < 16; ++i)
+ cpu_compiler_math(cpu_dst, cpu_src1, cpu_src2, cpu_src3, i);
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < 16; ++i) {
+ const float cpu = cpu_dst[i];
+ const float gpu = ((float*)buf_data[0])[i];
+ if (isinf(cpu))
+ OCL_ASSERT(isinf(gpu));
+ else if (isnan(cpu))
+ OCL_ASSERT(isnan(gpu));
+ else
+ OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
+ }
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math_3op)
diff --git a/utests/compiler_math_builtin.cpp b/utests/compiler_math_builtin.cpp
new file mode 100644
index 0000000..0577e04
--- /dev/null
+++ b/utests/compiler_math_builtin.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_math_builtin(void)
+{
+ OCL_CREATE_KERNEL("compiler_math_builtin");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math_builtin);
+
diff --git a/utests/compiler_math_constants.cpp b/utests/compiler_math_constants.cpp
new file mode 100644
index 0000000..5ec97c9
--- /dev/null
+++ b/utests/compiler_math_constants.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_math_constants(void)
+{
+ OCL_CREATE_KERNEL("compiler_math_constants");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math_constants);
+
diff --git a/utests/compiler_mem_fence.cpp b/utests/compiler_mem_fence.cpp
new file mode 100644
index 0000000..ad7e2f6
--- /dev/null
+++ b/utests/compiler_mem_fence.cpp
@@ -0,0 +1,9 @@
+/* test OpenCL 1.1 Synchronization, explicit memory fence (section 6.11.9, 6.11.10) */
+#include "utest_helper.hpp"
+
+void compiler_mem_fence(void)
+{
+ OCL_CREATE_KERNEL("compiler_mem_fence");
+ OCL_NDRANGE(1);
+}
+
diff --git a/utests/compiler_mixed_pointer.cpp b/utests/compiler_mixed_pointer.cpp
new file mode 100644
index 0000000..9531fb2
--- /dev/null
+++ b/utests/compiler_mixed_pointer.cpp
@@ -0,0 +1,119 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src1, int *src2, int *dst) {
+ int * tmp = NULL;
+
+ switch(global_id) {
+ case 0:
+ case 1:
+ case 4:
+ tmp = src1;
+ break;
+ default:
+ tmp = src2;
+ break;
+ }
+ dst[global_id] = tmp[global_id];
+
+}
+static void cpu1(int global_id, int *src, int *dst1, int *dst2) {
+ int * tmp = global_id < 5 ? dst1 : dst2;
+ tmp[global_id] = src[global_id];
+}
+
+void compiler_mixed_pointer(void)
+{
+ const size_t n = 16;
+ int cpu_dst[16], cpu_src[16], cpu_src1[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_mixed_pointer");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 1; ++pass) {
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_src[i] = ((int32_t*)buf_data[0])[i] = i;
+ cpu_src1[i] = ((int32_t*)buf_data[1])[i] = 65536-i;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_src1, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ for (size_t i = 0; i < n; ++i) {
+// printf(" %d %d\n", cpu_dst[i], ((int32_t*)buf_data[2])[i]);
+ OCL_ASSERT(((int32_t*)buf_data[2])[i] == cpu_dst[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mixed_pointer);
+
+void compiler_mixed_pointer1(void)
+{
+ const size_t n = 16;
+ int cpu_dst1[16], cpu_dst2[16], cpu_src[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_mixed_pointer", "compiler_mixed_pointer1");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 1; ++pass) {
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_src[i] = ((int32_t*)buf_data[0])[i] = i;
+ cpu_dst1[i] = ((int32_t*)buf_data[1])[i] = 0xff;
+ cpu_dst2[i] = ((int32_t*)buf_data[2])[i] = 0xff;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i <(int32_t) n; ++i) cpu1(i, cpu_src, cpu_dst1, cpu_dst2);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (size_t i = 0; i < n; ++i) {
+// printf(" %d %d\n", cpu_dst1[i], ((int32_t*)buf_data[1])[i]);
+// printf(" %d %d\n", ((int32_t*)buf_data[2])[i], cpu_dst2[i]);
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst1[i]);
+ OCL_ASSERT(((int32_t*)buf_data[2])[i] == cpu_dst2[i]);
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mixed_pointer1);
diff --git a/utests/compiler_movforphi_undef.cpp b/utests/compiler_movforphi_undef.cpp
new file mode 100644
index 0000000..8f1e66e
--- /dev/null
+++ b/utests/compiler_movforphi_undef.cpp
@@ -0,0 +1,61 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+static void compiler_movforphi_undef(void)
+{
+ const size_t w = 16;
+ const size_t h = 16;
+ cl_sampler sampler;
+ cl_image_format format;
+ cl_image_desc desc;
+
+ // Setup kernel and images
+ OCL_CREATE_KERNEL("test_movforphi_undef");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h);
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ ((uint32_t*)buf_data[0])[j * w + i] = j * w + i;
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ memset(&desc, 0, sizeof(desc));
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = w * sizeof(uint32_t);
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+
+ desc.image_row_pitch = 0;
+ OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+ OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(sampler), &sampler);
+ globals[0] = w;
+ globals[1] = h;
+ locals[0] = 16;
+ locals[1] = 16;
+ OCL_NDRANGE(2);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ // Just compare the initial 2 data is enough for this case, as the initial 2 data must in the first
+ // tile box and we can just get the correct coords.
+ for (uint32_t j = 0; j < 1; ++j)
+ for (uint32_t i = 0; i < 3; i++)
+ {
+ if (i == 0)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i + 1] == ((uint32_t*)buf_data[1])[j * w + i]);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_movforphi_undef);
diff --git a/utests/compiler_mul24.cpp b/utests/compiler_mul24.cpp
new file mode 100644
index 0000000..8a36947
--- /dev/null
+++ b/utests/compiler_mul24.cpp
@@ -0,0 +1,36 @@
+#include "utest_helper.hpp"
+
+void compiler_mul24(void)
+{
+ const int n = 32;
+ int src1[n], src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_mul24");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; ++i) {
+ src1[i] = ((int*)buf_data[0])[i] = rand();
+ src2[i] = ((int*)buf_data[1])[i] = rand();
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i)
+ OCL_ASSERT(((int*)buf_data[2])[i] == ((src1[i] << 8) >> 8) * ((src2[i] << 8) >> 8));
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mul24);
diff --git a/utests/compiler_mul_hi.cpp b/utests/compiler_mul_hi.cpp
new file mode 100644
index 0000000..5ea6389
--- /dev/null
+++ b/utests/compiler_mul_hi.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+void compiler_mul_hi(void)
+{
+ const int n = 32;
+ int src1[n], src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_mul_hi");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; ++i) {
+ src1[i] = ((int*)buf_data[0])[i] = rand();
+ src2[i] = ((int*)buf_data[1])[i] = rand();
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i) {
+ long long a = src1[i];
+ a *= src2[i];
+ a >>= 32;
+ OCL_ASSERT(((int*)buf_data[2])[i] == (int)a);
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mul_hi);
diff --git a/utests/compiler_multiple_kernels.cpp b/utests/compiler_multiple_kernels.cpp
new file mode 100644
index 0000000..09b4349
--- /dev/null
+++ b/utests/compiler_multiple_kernels.cpp
@@ -0,0 +1,8 @@
+#include "utest_helper.hpp"
+
+static void compiler_multiple_kernels(void)
+{
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_multiple_kernels", "first_kernel");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_multiple_kernels);
\ No newline at end of file
diff --git a/utests/compiler_preprocessor_macros.cpp b/utests/compiler_preprocessor_macros.cpp
new file mode 100644
index 0000000..3cd0272
--- /dev/null
+++ b/utests/compiler_preprocessor_macros.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_preprocessor_macros(void)
+{
+ OCL_CREATE_KERNEL("compiler_preprocessor_macros");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_preprocessor_macros);
+
diff --git a/utests/compiler_private_data_overflow.cpp b/utests/compiler_private_data_overflow.cpp
new file mode 100644
index 0000000..0fa30a0
--- /dev/null
+++ b/utests/compiler_private_data_overflow.cpp
@@ -0,0 +1,15 @@
+#include "utest_helper.hpp"
+
+void compiler_private_data_overflow(void)
+{
+ OCL_CREATE_KERNEL( "compiler_private_data_overflow" );
+ OCL_CREATE_BUFFER( buf[0], 0, sizeof(cl_int4), NULL );
+ OCL_SET_ARG( 0, sizeof(cl_mem), &buf[0] );
+ globals[0] = 64;
+ locals[0] = 32;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_ASSERT( ((uint32_t *)buf_data[0])[0] == 0 );
+ OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION( compiler_private_data_overflow );
diff --git a/utests/compiler_program_objects.cpp b/utests/compiler_program_objects.cpp
new file mode 100644
index 0000000..34ae42a
--- /dev/null
+++ b/utests/compiler_program_objects.cpp
@@ -0,0 +1,64 @@
+/* test OpenCL 1.1 Program Objects (section 5.6)
+ * test creating program objects,
+ * build program executable,
+ * build options
+ * query program objects */
+
+#include "utest_helper.hpp"
+
+void compiler_program_objects(void)
+{
+ OCL_CREATE_KERNEL("empty"); // set up global vars
+ OCL_CALL(clRetainProgram, program);
+ OCL_CALL(clReleaseProgram, program);
+ OCL_CALL(clBuildProgram,
+ program,
+ 1,
+ &device,
+ "-Dname -Dname2=def -ldir "
+ "-cl-opt-disable -cl-strict-aliasing -cl-mad-enable -cl-no-signed-zeros "
+ "-cl-finite-math-only -cl-fast-relaxed-math -cl-unsafe-math-optimizations "
+ "-cl-single-precision-constant -cl-denorms-are-zero "
+ "-w -Werror -cl-std=CL1.1",
+ NULL,
+ NULL);
+ const int pi[] = {CL_PROGRAM_REFERENCE_COUNT,
+ CL_PROGRAM_CONTEXT,
+ CL_PROGRAM_NUM_DEVICES,
+ CL_PROGRAM_DEVICES,
+ CL_PROGRAM_SOURCE,
+ CL_PROGRAM_BINARY_SIZES,
+ CL_PROGRAM_BINARIES,};
+ const int pbi[] = {CL_PROGRAM_BUILD_STATUS,
+ CL_PROGRAM_BUILD_OPTIONS,
+ CL_PROGRAM_BUILD_LOG,};
+ char param_value[1024];
+ size_t pv_size;
+ int i;
+ for(i=0; i<sizeof(pi) / sizeof(pi[0]); i++)
+ OCL_CALL(clGetProgramInfo,
+ program,
+ pi[i],
+ sizeof(param_value),
+ param_value,
+ &pv_size);
+ for(i=0; i<sizeof(pbi) / sizeof(pbi[0]); i++)
+ OCL_CALL(clGetProgramBuildInfo,
+ program,
+ device,
+ pbi[i],
+ sizeof(param_value),
+ param_value,
+ &pv_size);
+ std::cout<<platform<<' '
+ <<device<<' '
+ <<ctx<<' '
+ <<program<<' '
+ <<kernel<<' '
+ <<queue<<std::endl;
+
+ puts("Test clUnloadCompiler");
+ OCL_CALL(clUnloadCompiler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_program_objects);
diff --git a/utests/compiler_radians.cpp b/utests/compiler_radians.cpp
new file mode 100644
index 0000000..882477e
--- /dev/null
+++ b/utests/compiler_radians.cpp
@@ -0,0 +1,32 @@
+#include "utest_helper.hpp"
+
+void compiler_radians(void)
+{
+ const int n = 32;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_radians");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((float *)buf_data[0])[i] = rand() * 0.01f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; ++i) {
+ OCL_ASSERT(((float *)buf_data[1])[i] == src[i] * (3.141592653589793F / 180));
+ }
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_radians);
diff --git a/utests/compiler_relational_builtin.cpp b/utests/compiler_relational_builtin.cpp
new file mode 100644
index 0000000..a9a6eb5
--- /dev/null
+++ b/utests/compiler_relational_builtin.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_relational_builtin(void)
+{
+ OCL_CREATE_KERNEL("compiler_relational_builtin");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_relational_builtin);
+
diff --git a/utests/compiler_rhadd.cpp b/utests/compiler_rhadd.cpp
new file mode 100644
index 0000000..b25c788
--- /dev/null
+++ b/utests/compiler_rhadd.cpp
@@ -0,0 +1,41 @@
+#include "utest_helper.hpp"
+
+void compiler_rhadd(void)
+{
+ const int n = 32;
+ int src1[n], src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_rhadd");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; ++i) {
+ src1[i] = ((int*)buf_data[0])[i] = rand();
+ src2[i] = ((int*)buf_data[1])[i] = rand();
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i) {
+ long long a = src1[i];
+ a += src2[i];
+ a ++;
+ a >>= 1;
+ OCL_ASSERT(((int*)buf_data[2])[i] == (int)a);
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_rhadd);
diff --git a/utests/compiler_rotate.cpp b/utests/compiler_rotate.cpp
new file mode 100644
index 0000000..bf52ca4
--- /dev/null
+++ b/utests/compiler_rotate.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+int cpu(int src, int y) {
+ return (src << y) | (src >> (32 - y));
+}
+
+void compiler_rotate(void)
+{
+ const int n = 32;
+ int src[n], y[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_rotate");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((int*)buf_data[0])[i] = rand();
+ y[i] = ((int*)buf_data[2])[i] = rand() & 31;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(2);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; ++i)
+ OCL_ASSERT(((int*)buf_data[1])[i] == cpu(src[i], y[i]));
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_rotate);
diff --git a/utests/compiler_sampler.cpp b/utests/compiler_sampler.cpp
new file mode 100644
index 0000000..32bf926
--- /dev/null
+++ b/utests/compiler_sampler.cpp
@@ -0,0 +1,41 @@
+/* test OpenCL 1.1 Sampler Objects (section 5.5) */
+#include "utest_helper.hpp"
+
+void compiler_sampler(void)
+{
+ OCL_CREATE_KERNEL("compiler_sampler");
+
+ OCL_ASSERT(ctx != 0);
+ cl_sampler s;
+ cl_int err;
+ int a1[] = {CL_TRUE, CL_FALSE},
+ a2[] = {CL_ADDRESS_MIRRORED_REPEAT,
+ CL_ADDRESS_REPEAT,
+ CL_ADDRESS_CLAMP_TO_EDGE,
+ CL_ADDRESS_CLAMP,
+ CL_ADDRESS_NONE},
+ a3[] = {CL_FILTER_NEAREST, CL_FILTER_LINEAR},
+ a4[] = {CL_SAMPLER_REFERENCE_COUNT,
+ CL_SAMPLER_CONTEXT,
+ CL_SAMPLER_NORMALIZED_COORDS,
+ CL_SAMPLER_ADDRESSING_MODE,
+ CL_SAMPLER_FILTER_MODE};
+ char pv[1000];
+ size_t pv_size;
+ int i, j, k, l;
+ for(i=0; i<2; i++)
+ for(j=0; j<5; j++)
+ for(k=0; k<2; k++) {
+ s = clCreateSampler(ctx, a1[i], a2[j], a3[k], &err);
+ OCL_ASSERT(err == CL_SUCCESS);
+ OCL_CALL(clRetainSampler, s);
+ OCL_CALL(clReleaseSampler, s);
+ for(l=0; l<5; l++)
+ OCL_CALL(clGetSamplerInfo, s, a4[l], 1000, pv, &pv_size);
+ OCL_CALL(clReleaseSampler, s);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_sampler);
+
+
diff --git a/utests/compiler_saturate.cpp b/utests/compiler_saturate.cpp
new file mode 100644
index 0000000..6880df0
--- /dev/null
+++ b/utests/compiler_saturate.cpp
@@ -0,0 +1,114 @@
+#include "utest_helper.hpp"
+
+namespace {
+
+constexpr int n = 16;
+
+// declaration only, we should create each template specification for each type.
+template<typename T>
+T get_data(int idx, int part);
+
+/* the format of test data is as follows:
+ * the first column is A
+ * the second column is B
+ * the third column is the expected result.
+ */
+
+#define DEF_TEMPLATE(TYPE, NAME) \
+template <> \
+TYPE get_data<TYPE>(int idx, int part) \
+{ \
+ static TYPE test_data[n][3] = { \
+ { 0, 0, 0 }, \
+ { 0, 1, 1 }, \
+ { 0, 2, 2 }, \
+ { -1, 1, 0 }, \
+ { 1, -2, -1 }, \
+ { 0, 110, 110 }, \
+ { -10, -10, -20 }, \
+ { CL_##NAME##_MIN, CL_##NAME##_MIN, CL_##NAME##_MIN }, \
+ { CL_##NAME##_MIN, CL_##NAME##_MAX, -1 }, \
+ { CL_##NAME##_MAX, 0, CL_##NAME##_MAX }, \
+ { CL_##NAME##_MAX, 1, CL_##NAME##_MAX }, \
+ { CL_##NAME##_MAX, 2, CL_##NAME##_MAX }, \
+ { CL_##NAME##_MAX, CL_##NAME##_MAX, CL_##NAME##_MAX }, \
+ { CL_##NAME##_MAX/2, CL_##NAME##_MAX/2, CL_##NAME##_MAX-1 }, \
+ { CL_##NAME##_MAX/2, CL_##NAME##_MAX/2+1, CL_##NAME##_MAX }, \
+ { CL_##NAME##_MAX/2+1, CL_##NAME##_MAX/2+1, CL_##NAME##_MAX } \
+ }; \
+ return test_data[idx][part]; \
+} \
+ \
+template <> \
+u##TYPE get_data<u##TYPE>(int idx, int part) \
+{ \
+ static u##TYPE test_data[n][3] = { \
+ { 0, 0, 0 }, \
+ { CL_U##NAME##_MAX, 0, CL_U##NAME##_MAX }, \
+ { CL_U##NAME##_MAX, 1, CL_U##NAME##_MAX }, \
+ { CL_U##NAME##_MAX, 2, CL_U##NAME##_MAX }, \
+ { CL_U##NAME##_MAX, CL_U##NAME##_MAX, CL_U##NAME##_MAX }, \
+ { CL_U##NAME##_MAX/2, CL_U##NAME##_MAX/2, CL_U##NAME##_MAX-1 }, \
+ { CL_U##NAME##_MAX/2, CL_U##NAME##_MAX/2+1, CL_U##NAME##_MAX }, \
+ { CL_U##NAME##_MAX/2+1, CL_U##NAME##_MAX/2+1, CL_U##NAME##_MAX }\
+ }; \
+ return test_data[idx][part]; \
+}
+
+DEF_TEMPLATE(int8_t, CHAR)
+DEF_TEMPLATE(int16_t, SHRT)
+DEF_TEMPLATE(int32_t, INT)
+//DEF_TEMPLATE(int64_t, LONG)
+
+
+template<typename T>
+void test(const char *kernel_name)
+{
+ T C[n] = { 0 };
+ T A[n] = { 0 };
+ T B[n] = { 0 };
+
+ for (int i = 0; i < n; i++) {
+ A[i] = get_data<T>(i, 0);
+ B[i] = get_data<T>(i, 1);
+ }
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_saturate", kernel_name);
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), &C[0]);
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(T), &A[0]);
+ OCL_CREATE_BUFFER(buf[2], CL_MEM_COPY_HOST_PTR, n * sizeof(T), &B[0]);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+ globals[0] = n;
+ locals[0] = n;
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+
+ for (int i = 0; i < n; i++) {
+ OCL_ASSERT(((T*)buf_data[0])[i] == get_data<T>(i, 2));
+ }
+ OCL_UNMAP_BUFFER(0);
+}
+
+}
+
+#define compiler_saturate(type, kernel) \
+static void compiler_saturate_ ##type(void)\
+{\
+ test<type>(# kernel);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_saturate_ ## type);
+
+compiler_saturate(int8_t, test_char)
+compiler_saturate(uint8_t, test_uchar)
+compiler_saturate(int16_t, test_short)
+compiler_saturate(uint16_t, test_ushort)
+compiler_saturate(int32_t, test_int)
+compiler_saturate(uint32_t, test_uint)
+//compiler_saturate(int64_t, test_long)
+//compiler_saturate(uint64_t, test_ulong)
diff --git a/utests/compiler_saturate_sub.cpp b/utests/compiler_saturate_sub.cpp
new file mode 100644
index 0000000..1c95e2d
--- /dev/null
+++ b/utests/compiler_saturate_sub.cpp
@@ -0,0 +1,114 @@
+#include "utest_helper.hpp"
+
+namespace {
+
+constexpr int n = 16;
+
+// declaration only, we should create each template specification for each type.
+template<typename T>
+T get_data(int idx, int part);
+
+/* the format of test data is as follows:
+ * the first column is A
+ * the second column is B
+ * the third column is the expected result.
+ */
+
+#define DEF_TEMPLATE(TYPE, NAME) \
+template <> \
+TYPE get_data<TYPE>(int idx, int part) \
+{ \
+ static TYPE test_data[n][3] = { \
+ { 0, 0, 0 }, \
+ { 0, 1, -1 }, \
+ { CL_##NAME##_MIN, CL_##NAME##_MIN, 0 }, \
+ { CL_##NAME##_MAX, CL_##NAME##_MAX, 0 }, \
+ { -2, CL_##NAME##_MIN, CL_##NAME##_MAX-1 }, \
+ { -1, CL_##NAME##_MIN, CL_##NAME##_MAX }, \
+ { 0, CL_##NAME##_MIN, CL_##NAME##_MAX }, \
+ { 1, CL_##NAME##_MIN, CL_##NAME##_MAX }, \
+ { -2, CL_##NAME##_MAX, CL_##NAME##_MIN }, \
+ { -1, CL_##NAME##_MAX, CL_##NAME##_MIN }, \
+ { 0, CL_##NAME##_MAX, -CL_##NAME##_MAX }, \
+ { 1, CL_##NAME##_MAX, -CL_##NAME##_MAX+1 }, \
+ { CL_##NAME##_MIN, CL_##NAME##_MAX, CL_##NAME##_MIN }, \
+ { CL_##NAME##_MIN, 1, CL_##NAME##_MIN }, \
+ { CL_##NAME##_MIN, -1, CL_##NAME##_MIN+1 }, \
+ { CL_##NAME##_MAX, CL_##NAME##_MIN, CL_##NAME##_MAX }, \
+ }; \
+ return test_data[idx][part]; \
+} \
+ \
+template <> \
+u##TYPE get_data<u##TYPE>(int idx, int part) \
+{ \
+ static u##TYPE test_data[n][3] = { \
+ { 0, 0, 0 }, \
+ { 0, 1, 0 }, \
+ { 1, 1, 0 }, \
+ { 1, 0, 1 }, \
+ { CL_U##NAME##_MAX, CL_U##NAME##_MAX, 0 }, \
+ { 0, CL_U##NAME##_MAX, 0 }, \
+ { 1, CL_U##NAME##_MAX, 0 }, \
+ { CL_U##NAME##_MAX, 0, CL_U##NAME##_MAX }, \
+ }; \
+ return test_data[idx][part]; \
+}
+
+DEF_TEMPLATE(int8_t, CHAR)
+DEF_TEMPLATE(int16_t, SHRT)
+DEF_TEMPLATE(int32_t, INT)
+//DEF_TEMPLATE(int64_t, LONG)
+
+
+template<typename T>
+void test(const char *kernel_name)
+{
+ T C[n] = { 0 };
+ T A[n] = { 0 };
+ T B[n] = { 0 };
+
+ for (int i = 0; i < n; i++) {
+ A[i] = get_data<T>(i, 0);
+ B[i] = get_data<T>(i, 1);
+ }
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_saturate_sub", kernel_name);
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), &C[0]);
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(T), &A[0]);
+ OCL_CREATE_BUFFER(buf[2], CL_MEM_COPY_HOST_PTR, n * sizeof(T), &B[0]);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+ globals[0] = n;
+ locals[0] = n;
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+
+ for (int i = 0; i < n; i++) {
+ OCL_ASSERT(((T*)buf_data[0])[i] == get_data<T>(i, 2));
+ }
+ OCL_UNMAP_BUFFER(0);
+}
+
+}
+
+#define compiler_saturate_sub(type, kernel) \
+static void compiler_saturate_sub_ ##type(void)\
+{\
+ test<type>(# kernel);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_saturate_sub_ ## type);
+
+compiler_saturate_sub(int8_t, test_char)
+compiler_saturate_sub(uint8_t, test_uchar)
+compiler_saturate_sub(int16_t, test_short)
+compiler_saturate_sub(uint16_t, test_ushort)
+compiler_saturate_sub(int32_t, test_int)
+compiler_saturate_sub(uint32_t, test_uint)
+//compiler_saturate_sub(int64_t, test_long)
+//compiler_saturate_sub(uint64_t, test_ulong)
diff --git a/utests/compiler_shader_toy.cpp b/utests/compiler_shader_toy.cpp
new file mode 100644
index 0000000..58bcc6f
--- /dev/null
+++ b/utests/compiler_shader_toy.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/* This is a super simple wrapper for the OpenCL kernels I ported from GLSL code
+ * taken in Inigo's web site:
+ * http://www.iquilezles.org/apps/shadertoy/index.html
+ *
+ * They are pretty cool and rather complex kernels. Just the right thing to have
+ * something a bit more complicated and interesting than unit tests.
+ *
+ * The code here is just to wrap the common code used by all the kernels (to run
+ * the code and assert its correctness)
+ */
+#include "utest_helper.hpp"
+
+static const int dim = 256;
+
+// tricky here 'name' stands for Kernel and Reference
+// 'file' stands for .cl file name and dst image name
+static void run_kernel(int w, int h, const char *file, const char *name)
+{
+ const size_t global[2] = {size_t(w), size_t(h)};
+ const size_t local[2] = {16, 1};
+ const size_t sz = w * h * sizeof(char[4]);
+ const float fx = float(w);
+ const float fy = float(h);
+ char kernel_file[256];
+ char dst_img[256];
+ char ref_img[256];
+
+ snprintf(kernel_file, sizeof(kernel_file), "%s.cl", file);
+ snprintf(dst_img, sizeof(dst_img), "%s.bmp", file);
+ snprintf(ref_img, sizeof(ref_img), "%s_ref.bmp", name);
+ OCL_CALL (cl_kernel_init, kernel_file, name, SOURCE, NULL);
+
+ OCL_CREATE_BUFFER(buf[0], 0, sz, NULL);
+ OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &buf[0]);
+ OCL_CALL (clSetKernelArg, kernel, 1, sizeof(float), &fx);
+ OCL_CALL (clSetKernelArg, kernel, 2, sizeof(float), &fy);
+ OCL_CALL (clSetKernelArg, kernel, 3, sizeof(int), &w);
+ OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
+ OCL_MAP_BUFFER(0);
+ int *dst = (int*) buf_data[0];
+
+ /* Save the image (for debug purpose) */
+ cl_write_bmp(dst, w, h, dst_img);
+
+ /* Compare with the golden image */
+ OCL_CHECK_IMAGE(dst, w, h, ref_img);
+}
+
+#define DECL_SHADER_TOY_TEST(W,H,FILE_NAME, KERNEL_NAME) \
+ static void FILE_NAME(void) { run_kernel(W,H,#FILE_NAME, #KERNEL_NAME); } \
+ MAKE_UTEST_FROM_FUNCTION(FILE_NAME);
+
+DECL_SHADER_TOY_TEST(dim,dim,compiler_clod,compiler_clod);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_ribbon,compiler_ribbon);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_nautilus,compiler_nautilus);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_menger_sponge_no_shadow,compiler_menger_sponge_no_shadow);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_julia,compiler_julia);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_julia_no_break,compiler_julia_no_break);
+// test for function calls
+DECL_SHADER_TOY_TEST(dim,dim,compiler_clod_function_call,compiler_clod);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_julia_function_call,compiler_julia);
+
+// Still issues here for LLVM 3.2
+// DECL_SHADER_TOY_TEST(dim,dim,compiler_chocolux,compiler_chocolux);
+// DECL_SHADER_TOY_TEST(dim,dim,compiler_menger_sponge,compiler_menger_sponge);
+
+#undef DECL_SHADER_TOY_TEST
+
diff --git a/utests/compiler_shift_right.cpp b/utests/compiler_shift_right.cpp
new file mode 100644
index 0000000..b94cc46
--- /dev/null
+++ b/utests/compiler_shift_right.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+typedef unsigned int uint;
+
+static void cpu(int global_id, uint *src, int *dst) {
+ dst[global_id] = src[global_id] >> 24;
+}
+
+void compiler_shift_right(void)
+{
+ const size_t n = 16;
+ uint cpu_src[16];
+ int cpu_dst[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_shift_right");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((uint*)buf_data[0])[i] = 0x80000000 | rand();
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((int *)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_shift_right);
diff --git a/utests/compiler_short_scatter.cpp b/utests/compiler_short_scatter.cpp
new file mode 100644
index 0000000..1746744
--- /dev/null
+++ b/utests/compiler_short_scatter.cpp
@@ -0,0 +1,25 @@
+#include "utest_helper.hpp"
+
+static void compiler_short_scatter(void)
+{
+ const size_t n = 128;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_short_scatter");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int16_t), NULL);
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((int16_t*)buf_data[0])[i] == (int16_t) i);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_short_scatter);
+
+
diff --git a/utests/compiler_simd_all.cpp b/utests/compiler_simd_all.cpp
new file mode 100644
index 0000000..086c54f
--- /dev/null
+++ b/utests/compiler_simd_all.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+
+void compiler_simd_all(void)
+{
+ const size_t n = 40;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_simd_all");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ globals[0] = n;
+ locals[0] = 10;
+
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ ((int*)buf_data[0])[i] = i;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%d %d\n", i, ((int *)buf_data[1])[i]);
+ if (i % 2 == 1) {
+ if (i < (int32_t)locals[0])
+ OCL_ASSERT(((int *)buf_data[1])[i] == 1);
+ else
+ OCL_ASSERT(((int *)buf_data[1])[i] == 2);
+ }
+ else
+ OCL_ASSERT(((int *)buf_data[1])[i] == 3);
+ }
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_simd_all);
diff --git a/utests/compiler_simd_any.cpp b/utests/compiler_simd_any.cpp
new file mode 100644
index 0000000..dcc5ef1
--- /dev/null
+++ b/utests/compiler_simd_any.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+
+void compiler_simd_any(void)
+{
+ const size_t n = 40;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_simd_any");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ globals[0] = n;
+ locals[0] = 10;
+
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ ((int*)buf_data[0])[i] = i;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i){
+ //printf("%d %d\n", i, ((int *)buf_data[1])[i]);
+ if (i % 2 == 1) {
+ if (i < (int32_t)locals[0])
+ OCL_ASSERT(((int *)buf_data[1])[i] == 1);
+ else
+ OCL_ASSERT(((int *)buf_data[1])[i] == 2);
+ }
+ else
+ OCL_ASSERT(((int *)buf_data[1])[i] == 3);
+ }
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_simd_any);
diff --git a/utests/compiler_smoothstep.cpp b/utests/compiler_smoothstep.cpp
new file mode 100644
index 0000000..363ea7e
--- /dev/null
+++ b/utests/compiler_smoothstep.cpp
@@ -0,0 +1,58 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+float cpu(float e0, float e1, float x)
+{
+ x = (x - e0) / (e1 - e0);
+ if (x >= 1)
+ x = 1.f;
+ if (x <= 0)
+ x = 0.f;
+ return x * x * (3 - 2 * x);
+}
+
+void compiler_smoothstep(void)
+{
+ const int n = 32;
+ float src1[n], src2[n], src3[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_smoothstep");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i) {
+ float a = 0.1f * (rand() & 15) - 0.75f;
+ float b = a + 0.1f * (rand() & 15) + 0.1f;
+ float c = 0.1f * (rand() & 15) - 0.75f;
+ src1[i] = ((float*)buf_data[0])[i] = a;
+ src2[i] = ((float*)buf_data[1])[i] = b;
+ src3[i] = ((float*)buf_data[2])[i] = c;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(3);
+ for (int i = 0; i < n; ++i) {
+ float a = ((float*)buf_data[3])[i];
+ float b = cpu(src1[i], src2[i], src3[i]);
+ OCL_ASSERT(fabsf(a - b) < 1e-4f);
+ }
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_smoothstep);
diff --git a/utests/compiler_step.cpp b/utests/compiler_step.cpp
new file mode 100644
index 0000000..b022826
--- /dev/null
+++ b/utests/compiler_step.cpp
@@ -0,0 +1,342 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+template <typename T, int N>
+struct cl_vec {
+ T ptr[((N+1)/2)*2]; //align to 2 elements.
+
+ typedef cl_vec<T, N> vec_type;
+
+ cl_vec(void) {
+ memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ }
+ cl_vec(vec_type & other) {
+ memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ memcpy (this->ptr, other.ptr, sizeof(T) * N);
+ }
+
+ vec_type& operator= (vec_type & other) {
+ memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ memcpy (this->ptr, other.ptr, sizeof(T) * N);
+ return *this;
+ }
+
+ template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
+ memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ memcpy (this->ptr, other.ptr, sizeof(T) * N);
+ return *this;
+ }
+
+ bool operator== (vec_type & other) {
+ return !memcmp (this->ptr, other.ptr, sizeof(T) * N);
+ }
+
+ void step (vec_type & other) {
+ int i = 0;
+ for (; i < N; i++) {
+ T a = ptr[i];
+ T edge = other.ptr[i];
+ T f = a < edge ? 0.0 : 1.0;
+ ptr[i] = f;
+ }
+ }
+
+ void step (float & edge) {
+ int i = 0;
+ for (; i < N; i++) {
+ T a = ptr[i];
+ T f = a < edge ? 0.0 : 1.0;
+ ptr[i] = f;
+ }
+ }
+};
+
+template <typename T, typename U, int N> static void cpu (int global_id,
+ cl_vec<T, N> *edge, cl_vec<T, N> *src, cl_vec<U, N> *dst)
+{
+ cl_vec<T, N> v = src[global_id];
+ v.step(edge[global_id]);
+ dst[global_id] = v;
+}
+
+template <typename T, typename U> static void cpu(int global_id, T *edge, T *src, U *dst)
+{
+ T f = src[global_id];
+ T e = edge[global_id];
+ f = f < e ? 0.0 : 1.0;
+ dst[global_id] = (U)f;
+}
+
+template <typename T, typename U, int N> static void cpu (int global_id,
+ float edge, cl_vec<T, N> *src, cl_vec<U, N> *dst)
+{
+ cl_vec<T, N> v = src[global_id];
+ v.step(edge);
+ dst[global_id] = v;
+}
+
+template <typename T, typename U> static void cpu(int global_id, float edge, T *src, U *dst)
+{
+ T f = src[global_id];
+ f = f < edge ? 0.0 : 1.0;
+ dst[global_id] = (U)f;
+}
+
+template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
+{
+ int i = 0;
+
+ memset(vect.ptr, 0, sizeof(T) * ((N+1)/2)*2);
+ for (; i < N; i++) {
+ vect.ptr[i] = static_cast<T>(.1f * (rand() & 15) - .75f);
+ }
+}
+
+template <typename T> static void gen_rand_val (T & val)
+{
+ val = static_cast<T>(.1f * (rand() & 15) - .75f);
+}
+
+template <typename T>
+inline static void print_data (T& val)
+{
+ if (std::is_unsigned<T>::value)
+ printf(" %u", val);
+ else
+ printf(" %d", val);
+}
+
+inline static void print_data (float& val)
+{
+ printf(" %f", val);
+}
+
+template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* edge,
+ cl_vec<T, N>* src, cl_vec<U, N>* dst, int n)
+{
+ U* val = reinterpret_cast<U *>(dst);
+
+ n = n*((N+1)/2)*2;
+
+ printf("\nEdge: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((T *)buf_data[0])[i]);
+ }
+ printf("\nx: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((T *)buf_data[1])[i]);
+ }
+
+ printf("\nCPU: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(val[i]);
+ }
+ printf("\nGPU: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((U *)buf_data[2])[i]);
+ }
+}
+
+template <typename T, typename U> static void dump_data (T* edge, T* src, U* dst, int n)
+{
+ printf("\nedge: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((T *)buf_data[0])[i]);
+ }
+
+ printf("\nx: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((T *)buf_data[1])[i]);
+ }
+
+ printf("\nCPU: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(dst[i]);
+ }
+ printf("\nGPU: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((U *)buf_data[2])[i]);
+ }
+}
+
+template <typename T, typename U, int N> static void dump_data (float edge,
+ cl_vec<T, N>* src, cl_vec<U, N>* dst, int n)
+{
+ U* val = reinterpret_cast<U *>(dst);
+
+ n = n*((N+1)/2)*2;
+
+ printf("\nEdge: %f\n", edge);
+ printf("\nx: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((T *)buf_data[0])[i]);
+ }
+
+ printf("\nCPU: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(val[i]);
+ }
+ printf("\nGPU: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((U *)buf_data[1])[i]);
+ }
+}
+
+template <typename T, typename U> static void dump_data (float edge, T* src, U* dst, int n)
+{
+ printf("\nedge: %f\n", edge);
+ printf("\nx: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((T *)buf_data[0])[i]);
+ }
+
+ printf("\nCPU: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(dst[i]);
+ }
+ printf("\nGPU: \n");
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ print_data(((U *)buf_data[1])[i]);
+ }
+}
+
+template <typename T> static void compiler_step_with_type(void)
+{
+ const size_t n = 16;
+ T cpu_dst[n], cpu_src[n];
+ T edge[n];
+
+ // Setup buffers
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = n;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+
+ /* Clear the dst buffer to avoid random data. */
+ OCL_MAP_BUFFER(2);
+ memset(buf_data[2], 0, sizeof(T) * n);
+ OCL_UNMAP_BUFFER(2);
+
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ gen_rand_val(cpu_src[i]);
+ gen_rand_val(edge[i]);
+ }
+
+ memcpy(buf_data[1], cpu_src, sizeof(T) * n);
+ memcpy(buf_data[0], edge, sizeof(T) * n);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu(i, edge, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+
+ //dump_data(edge, cpu_src, cpu_dst, n);
+
+ OCL_ASSERT(!memcmp(buf_data[2], cpu_dst, sizeof(T) * n));
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+#define STEP_TEST_TYPE(TYPE) \
+ static void compiler_step_##TYPE (void) \
+ { \
+ OCL_CALL (cl_kernel_init, "compiler_step.cl", "compiler_step_"#TYPE, SOURCE, NULL); \
+ compiler_step_with_type<TYPE>(); \
+ } \
+ MAKE_UTEST_FROM_FUNCTION(compiler_step_##TYPE);
+
+typedef cl_vec<float, 2> float2;
+typedef cl_vec<float, 3> float3;
+typedef cl_vec<float, 4> float4;
+typedef cl_vec<float, 8> float8;
+typedef cl_vec<float, 16> float16;
+STEP_TEST_TYPE(float)
+STEP_TEST_TYPE(float2)
+STEP_TEST_TYPE(float3)
+STEP_TEST_TYPE(float4)
+STEP_TEST_TYPE(float8)
+STEP_TEST_TYPE(float16)
+
+
+template <typename T> static void compiler_stepf_with_type(void)
+{
+ const size_t n = 16;
+ T cpu_dst[n], cpu_src[n];
+ float edge = (float)(.1f * (rand() & 15) - .75f);
+
+ // Setup buffers
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(float), &edge);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = n;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+
+ /* Clear the dst buffer to avoid random data. */
+ OCL_MAP_BUFFER(1);
+ memset(buf_data[1], 0, sizeof(T) * n);
+ OCL_UNMAP_BUFFER(1);
+
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ gen_rand_val(cpu_src[i]);
+ }
+
+ memcpy(buf_data[0], cpu_src, sizeof(T) * n);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu(i, edge, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+
+ //dump_data(edge, cpu_src, cpu_dst, n);
+
+ OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+#define _STEPF_TEST_TYPE(TYPE, keep_program) \
+ static void compiler_stepf_##TYPE (void) \
+ { \
+ OCL_CALL (cl_kernel_init, "compiler_step.cl", "compiler_stepf_"#TYPE, SOURCE, NULL); \
+ compiler_stepf_with_type<TYPE>(); \
+ } \
+ MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_stepf_##TYPE, keep_program);
+
+#define STEPF_TEST_TYPE(TYPE) _STEPF_TEST_TYPE(TYPE, true)
+#define STEPF_TEST_TYPE_END(TYPE) _STEPF_TEST_TYPE(TYPE, false)
+
+
+STEPF_TEST_TYPE(float)
+STEPF_TEST_TYPE(float2)
+STEPF_TEST_TYPE(float3)
+STEPF_TEST_TYPE(float4)
+STEPF_TEST_TYPE(float8)
+STEPF_TEST_TYPE_END(float16)
diff --git a/utests/compiler_structure_attributes.cpp b/utests/compiler_structure_attributes.cpp
new file mode 100644
index 0000000..31656f4
--- /dev/null
+++ b/utests/compiler_structure_attributes.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_structure_attributes(void)
+{
+ OCL_CREATE_KERNEL("compiler_structure_attributes");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_structure_attributes);
+
diff --git a/utests/compiler_switch.cpp b/utests/compiler_switch.cpp
new file mode 100644
index 0000000..6e93309
--- /dev/null
+++ b/utests/compiler_switch.cpp
@@ -0,0 +1,48 @@
+#include "utest_helper.hpp"
+
+static void cpu_compiler_switch(int *dst, int *src, int get_global_id0)
+{
+ switch (get_global_id0) {
+ case 0: dst[get_global_id0] = src[get_global_id0 + 4]; break;
+ case 1: dst[get_global_id0] = src[get_global_id0 + 14]; break;
+ case 2: dst[get_global_id0] = src[get_global_id0 + 13]; break;
+ case 6: dst[get_global_id0] = src[get_global_id0 + 11]; break;
+ case 7: dst[get_global_id0] = src[get_global_id0 + 10]; break;
+ case 10: dst[get_global_id0] = src[get_global_id0 + 9]; break;
+ case 12: dst[get_global_id0] = src[get_global_id0 + 6]; break;
+ default: dst[get_global_id0] = src[get_global_id0 + 8]; break;
+ }
+}
+
+static void compiler_switch(void)
+{
+ const size_t n = 32;
+ int cpu_dst[32], cpu_src[32];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_switch");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 32; ++i)
+ cpu_src[i] = ((int32_t*)buf_data[1])[i] = i;
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < 16; ++i)
+ cpu_compiler_switch(cpu_dst, cpu_src, i);
+ for (int i = 0; i < 16; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[0])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_switch)
+
diff --git a/utests/compiler_type_casting.cpp b/utests/compiler_type_casting.cpp
new file mode 100644
index 0000000..392acf4
--- /dev/null
+++ b/utests/compiler_type_casting.cpp
@@ -0,0 +1,10 @@
+#include "utest_helper.hpp"
+
+void compiler_type_casting(void)
+{
+ OCL_CREATE_KERNEL("compiler_type_casting");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_type_casting);
+
+
diff --git a/utests/compiler_uint16_copy.cpp b/utests/compiler_uint16_copy.cpp
new file mode 100644
index 0000000..1494e81
--- /dev/null
+++ b/utests/compiler_uint16_copy.cpp
@@ -0,0 +1,35 @@
+#include "utest_helper.hpp"
+
+static void compiler_uint16_copy(void)
+{
+ const size_t n = 128;
+
+ // Setup kernel and buffers. Note that uint16 is aligned on 16 bytes
+ // according to the OCL specificatio
+ OCL_CREATE_KERNEL("compiler_uint16_copy");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t[16]) * n);
+ for (uint32_t i = 0; i < n; ++i)
+ for (uint32_t j = 0; j < 16; ++j)
+ ((uint32_t*)buf_data[0])[16*i+j] = 16*i+j;
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t[16]), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t[16]), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 16*n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_uint16_copy);
+
diff --git a/utests/compiler_uint2_copy.cpp b/utests/compiler_uint2_copy.cpp
new file mode 100644
index 0000000..8eb4314
--- /dev/null
+++ b/utests/compiler_uint2_copy.cpp
@@ -0,0 +1,31 @@
+#include "utest_helper.hpp"
+
+static void compiler_uint2_copy(void)
+{
+ const size_t n = 128;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_uint2_copy");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t[2]) * n);
+ for (uint32_t i = 0; i < 2*n; ++i) ((uint32_t*)buf_data[0])[i] = i;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t[2]), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t[2]), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 2*n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_uint2_copy);
+
diff --git a/utests/compiler_uint3_copy.cpp b/utests/compiler_uint3_copy.cpp
new file mode 100644
index 0000000..c4d3cf0
--- /dev/null
+++ b/utests/compiler_uint3_copy.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+static void compiler_uint3_copy(void)
+{
+ const size_t n = 128;
+
+ // Setup kernel and buffers. Note that uint3 is aligned on 16 bytes
+ // according to the OCL specification
+ OCL_CREATE_KERNEL("compiler_uint3_copy");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t[4]) * n);
+ for (uint32_t i = 0; i < n; ++i) {
+ ((uint32_t*)buf_data[0])[4*i+0] = 3*i+0;
+ ((uint32_t*)buf_data[0])[4*i+1] = 3*i+1;
+ ((uint32_t*)buf_data[0])[4*i+2] = 3*i+2;
+ }
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t[4]), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t[4]), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n; ++i) {
+ OCL_ASSERT(((uint32_t*)buf_data[0])[4*i+0] == ((uint32_t*)buf_data[1])[4*i+0]);
+ OCL_ASSERT(((uint32_t*)buf_data[0])[4*i+1] == ((uint32_t*)buf_data[1])[4*i+1]);
+ OCL_ASSERT(((uint32_t*)buf_data[0])[4*i+2] == ((uint32_t*)buf_data[1])[4*i+2]);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_uint3_copy);
+
diff --git a/utests/compiler_uint3_unaligned_copy.cpp b/utests/compiler_uint3_unaligned_copy.cpp
new file mode 100644
index 0000000..d42b4c3
--- /dev/null
+++ b/utests/compiler_uint3_unaligned_copy.cpp
@@ -0,0 +1,42 @@
+#include "utest_helper.hpp"
+
+static void compiler_uint3_unaligned_copy(void)
+{
+ const size_t n = 128;
+
+ // Setup kernel and buffers. Note that uint3 is aligned on 16 bytes
+ // according to the OCL specification
+ OCL_CREATE_KERNEL("compiler_uint3_unaligned_copy");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t[4]) * n);
+ for (uint32_t i = 0; i < n; ++i) {
+ ((uint32_t*)buf_data[0])[3*i+0] = 3*i+0;
+ ((uint32_t*)buf_data[0])[3*i+1] = 3*i+1;
+ ((uint32_t*)buf_data[0])[3*i+2] = 3*i+2;
+ }
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t[4]), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t[4]), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n; ++i) {
+ OCL_ASSERT(((uint32_t*)buf_data[0])[3*i+0] == ((uint32_t*)buf_data[1])[3*i+0]);
+ OCL_ASSERT(((uint32_t*)buf_data[0])[3*i+1] == ((uint32_t*)buf_data[1])[3*i+1]);
+ OCL_ASSERT(((uint32_t*)buf_data[0])[3*i+2] == ((uint32_t*)buf_data[1])[3*i+2]);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_uint3_unaligned_copy);
+
+
+
diff --git a/utests/compiler_uint8_copy.cpp b/utests/compiler_uint8_copy.cpp
new file mode 100644
index 0000000..25dbd58
--- /dev/null
+++ b/utests/compiler_uint8_copy.cpp
@@ -0,0 +1,35 @@
+#include "utest_helper.hpp"
+
+static void compiler_uint8_copy(void)
+{
+ const size_t n = 128;
+
+ // Setup kernel and buffers. Note that uint8 is aligned on 16 bytes
+ // according to the OCL specification
+ OCL_CREATE_KERNEL("compiler_uint8_copy");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t[8]) * n);
+ for (uint32_t i = 0; i < n; ++i)
+ for (uint32_t j = 0; j < 8; ++j)
+ ((uint32_t*)buf_data[0])[8*i+j] = 8*i+j;
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t[8]), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t[8]), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 8*n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_uint8_copy);
+
diff --git a/utests/compiler_unstructured_branch0.cpp b/utests/compiler_unstructured_branch0.cpp
new file mode 100644
index 0000000..128a53e
--- /dev/null
+++ b/utests/compiler_unstructured_branch0.cpp
@@ -0,0 +1,55 @@
+#include "utest_helper.hpp"
+
+static void compiler_unstructured_branch0(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_unstructured_branch0");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+ for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // First control flow
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 16; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+ for (uint32_t i = 16; i < 32; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
+
+ // Second control flow
+ for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 32; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
+
+ // Third control flow
+ for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 8; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+ for (uint32_t i = 8; i < 32; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_unstructured_branch0);
+
diff --git a/utests/compiler_unstructured_branch1.cpp b/utests/compiler_unstructured_branch1.cpp
new file mode 100644
index 0000000..6021f5b
--- /dev/null
+++ b/utests/compiler_unstructured_branch1.cpp
@@ -0,0 +1,54 @@
+#include "utest_helper.hpp"
+
+static void compiler_unstructured_branch1(void)
+{
+ const size_t n = 16;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_unstructured_branch1");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+ for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // First control flow
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+
+ // Second control flow
+ for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3);
+
+ // Third control flow
+ for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
+ for (uint32_t i = 8; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 8; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+ for (uint32_t i = 8; i < n; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_unstructured_branch1);
+
diff --git a/utests/compiler_unstructured_branch2.cpp b/utests/compiler_unstructured_branch2.cpp
new file mode 100644
index 0000000..d61c6b5
--- /dev/null
+++ b/utests/compiler_unstructured_branch2.cpp
@@ -0,0 +1,68 @@
+#include "utest_helper.hpp"
+
+static void compiler_unstructured_branch2(void)
+{
+ const size_t n = 16;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_unstructured_branch2");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+ for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // First control flow
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12);
+
+ // Second control flow
+ for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6);
+
+ // Third control flow
+ for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
+ for (uint32_t i = 8; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 8; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12);
+ for (uint32_t i = 8; i < n; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6);
+
+ // Fourth control flow
+ for (uint32_t i = 0; i < 4; ++i) ((int32_t*)buf_data[0])[i] = 1;
+ for (uint32_t i = 4; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
+ for (uint32_t i = 8; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 8; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12);
+ for (uint32_t i = 8; i < n; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_unstructured_branch2);
+
diff --git a/utests/compiler_unstructured_branch3.cpp b/utests/compiler_unstructured_branch3.cpp
new file mode 100644
index 0000000..0c6992a
--- /dev/null
+++ b/utests/compiler_unstructured_branch3.cpp
@@ -0,0 +1,58 @@
+#include "utest_helper.hpp"
+
+static void compiler_unstructured_branch3(void)
+{
+ const size_t n = 16;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_unstructured_branch3");
+ buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+ for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+
+ // First control flow
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+
+ // Second control flow
+ for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = 0;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3);
+
+ // Third control flow
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
+ for (uint32_t i = 8; i < n; ++i) ((int32_t*)buf_data[0])[i] = 0;
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 8; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+ for (uint32_t i = 8; i < n; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == 3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_unstructured_branch3);
+
diff --git a/utests/compiler_upsample_int.cpp b/utests/compiler_upsample_int.cpp
new file mode 100644
index 0000000..ee912f9
--- /dev/null
+++ b/utests/compiler_upsample_int.cpp
@@ -0,0 +1,37 @@
+#include "utest_helper.hpp"
+
+void compiler_upsample_int(void)
+{
+ const int n = 32;
+ short src1[n];
+ unsigned short src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_upsample_int");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; ++i) {
+ src1[i] = ((short*)buf_data[0])[i] = rand();
+ src2[i] = ((short*)buf_data[1])[i] = rand();
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i)
+ OCL_ASSERT(((int*)buf_data[2])[i] == (int)((src1[i] << 16) | src2[i]));
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_upsample_int);
diff --git a/utests/compiler_upsample_long.cpp b/utests/compiler_upsample_long.cpp
new file mode 100644
index 0000000..b125ff4
--- /dev/null
+++ b/utests/compiler_upsample_long.cpp
@@ -0,0 +1,38 @@
+#include <stdint.h>
+#include "utest_helper.hpp"
+
+void compiler_upsample_long(void)
+{
+ const int n = 32;
+ int src1[n];
+ unsigned int src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_upsample_long");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(unsigned int), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; ++i) {
+ src1[i] = ((int*)buf_data[0])[i] = rand();
+ src2[i] = ((unsigned int*)buf_data[1])[i] = rand();
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i)
+ OCL_ASSERT(((int64_t*)buf_data[2])[i] == (((int64_t)(src1[i]) << 32) | src2[i]));
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_upsample_long);
diff --git a/utests/compiler_vect_compare.cpp b/utests/compiler_vect_compare.cpp
new file mode 100644
index 0000000..e9e45be
--- /dev/null
+++ b/utests/compiler_vect_compare.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+typedef struct {
+ int x;
+ int y;
+ int z;
+ int w;
+} int4;
+
+void compiler_vect_compare(void)
+{
+ const size_t n = 16;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_vect_compare");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int4), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int4), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i) {
+ ((int4*)buf_data[0])[i].x = i & 0x1;
+ ((int4*)buf_data[0])[i].y = i & 0x2;
+ ((int4*)buf_data[0])[i].z = i & 0x4;
+ ((int4*)buf_data[0])[i].w = i & 0x8;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 16; ++i) {
+ OCL_ASSERT(((int4*)buf_data[1])[i].x == (int)((i&0x1)?0xffffffff:0));
+ OCL_ASSERT(((int4*)buf_data[1])[i].y == (int)((i&0x2)?0xffffffff:0));
+ OCL_ASSERT(((int4*)buf_data[1])[i].z == (int)((i&0x4)?0xffffffff:0));
+ OCL_ASSERT(((int4*)buf_data[1])[i].w == (int)((i&0x8)?0xffffffff:0));
+ }
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_vect_compare);
diff --git a/utests/compiler_vector_inc.cpp b/utests/compiler_vector_inc.cpp
new file mode 100644
index 0000000..c44424b
--- /dev/null
+++ b/utests/compiler_vector_inc.cpp
@@ -0,0 +1,46 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_vector_inc(void)
+{
+ const int n = 64;
+ char dst[n];
+ char src[n];
+
+ OCL_CREATE_KERNEL("compiler_vector_inc");
+ OCL_CREATE_BUFFER(buf[0], 0, n, NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n / 2;
+ locals[0] = 16;
+
+ for (int i = 0; i < n; ++i) {
+ dst[i] = i;
+ src[i] = (i / 2) % 4;
+ }
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], dst, n);
+ memcpy(buf_data[1], src, n);
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+ char *dest = ((char *)buf_data[0]);
+ for (int i=0; i<n; ++i) {
+ char wish;
+ if (src[i/2] < 2)
+ wish = dst[i] + 1;
+ else
+ wish = dst[i] - 1;
+ OCL_ASSERT(dest[i] == wish);
+ }
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_vector_inc);
diff --git a/utests/compiler_vector_load_store.cpp b/utests/compiler_vector_load_store.cpp
new file mode 100644
index 0000000..5a1a8d1
--- /dev/null
+++ b/utests/compiler_vector_load_store.cpp
@@ -0,0 +1,63 @@
+#include "utest_helper.hpp"
+#include <string.h>
+template<typename T>
+static void compiler_vector_load_store(int elemNum, const char *kernelName)
+{
+ const size_t n = elemNum * 256;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_vector_load_store", kernelName);
+ buf_data[0] = (T*) malloc(sizeof(T) * n);
+ for (uint32_t i = 0; i < n; ++i)
+ ((T*)buf_data[0])[i] = i;
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n / elemNum;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n; ++i)
+ {
+ int shift = ((i % elemNum) + 1);
+ if (strstr(kernelName, "double") == NULL)
+ OCL_ASSERT(((T*)buf_data[1])[i] == (T)(((T*)buf_data[0])[i] + shift));
+ else
+ OCL_ASSERT((((T*)buf_data[1])[i] - ((T)((T*)buf_data[0])[i] + shift)) < 1e-5);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+}
+
+#define compiler_vector_load_store(type, n, kernel_type, keep_program) \
+static void compiler_vector_ ##kernel_type ##n ##_load_store(void)\
+{\
+ compiler_vector_load_store<type>(n, "test_" #kernel_type #n);\
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_vector_ ## kernel_type ##n ##_load_store, keep_program);
+
+#define test_all_vector(type, kernel_type, keep_program) \
+ compiler_vector_load_store(type, 2, kernel_type, true) \
+ compiler_vector_load_store(type, 3, kernel_type, true) \
+ compiler_vector_load_store(type, 4, kernel_type, true) \
+ compiler_vector_load_store(type, 8, kernel_type, true) \
+ compiler_vector_load_store(type, 16, kernel_type, keep_program)
+
+test_all_vector(int8_t, char, true)
+test_all_vector(uint8_t, uchar, true)
+test_all_vector(int16_t, short, true)
+test_all_vector(uint16_t, ushort, true)
+test_all_vector(int32_t, int, true)
+test_all_vector(uint32_t, uint, true)
+test_all_vector(float, float, true)
+//test_all_vector(double, double, true)
+test_all_vector(int64_t, long, true)
+test_all_vector(uint64_t, ulong, false)
diff --git a/utests/compiler_volatile.cpp b/utests/compiler_volatile.cpp
new file mode 100644
index 0000000..f4fe054
--- /dev/null
+++ b/utests/compiler_volatile.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_volatile(void)
+{
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_volatile");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_volatile);
diff --git a/utests/compiler_workitem_builtin.cpp b/utests/compiler_workitem_builtin.cpp
new file mode 100644
index 0000000..092b0e7
--- /dev/null
+++ b/utests/compiler_workitem_builtin.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_workitem_builtin(void)
+{
+ OCL_CREATE_KERNEL("compiler_workitem_builtin");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_workitem_builtin);
+
diff --git a/utests/compiler_write_only.cpp b/utests/compiler_write_only.cpp
new file mode 100644
index 0000000..3935535
--- /dev/null
+++ b/utests/compiler_write_only.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static void compiler_write_only(void)
+{
+ const size_t n = 2048;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("test_write_only");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_write_only);
+
diff --git a/utests/compiler_write_only_bytes.cpp b/utests/compiler_write_only_bytes.cpp
new file mode 100644
index 0000000..1a13cdb
--- /dev/null
+++ b/utests/compiler_write_only_bytes.cpp
@@ -0,0 +1,23 @@
+#include "utest_helper.hpp"
+
+void compiler_write_only_bytes(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_write_only_bytes");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint8_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint8_t*)buf_data[0])[i] == 2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_write_only_bytes);
diff --git a/utests/compiler_write_only_shorts.cpp b/utests/compiler_write_only_shorts.cpp
new file mode 100644
index 0000000..19988fe
--- /dev/null
+++ b/utests/compiler_write_only_shorts.cpp
@@ -0,0 +1,24 @@
+#include "utest_helper.hpp"
+
+void compiler_write_only_shorts(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_write_only_shorts");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint16_t*)buf_data[0])[i] == 2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_write_only_shorts);
+
diff --git a/utests/enqueue_built_in_kernels.cpp b/utests/enqueue_built_in_kernels.cpp
new file mode 100644
index 0000000..52b8848
--- /dev/null
+++ b/utests/enqueue_built_in_kernels.cpp
@@ -0,0 +1,19 @@
+#include "utest_helper.hpp"
+
+void enqueue_built_in_kernels(void)
+{
+ char* built_in_kernel_names;
+ size_t built_in_kernels_size;
+ cl_int err = CL_SUCCESS;
+ size_t ret_sz;
+
+
+ OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, &built_in_kernels_size);
+ built_in_kernel_names = (char* )malloc(built_in_kernels_size * sizeof(char) );
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, built_in_kernels_size, (void*)built_in_kernel_names, &ret_sz);
+ OCL_ASSERT(ret_sz == built_in_kernels_size);
+ cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
+ OCL_ASSERT(built_in_prog != NULL);
+}
+
+MAKE_UTEST_FROM_FUNCTION(enqueue_built_in_kernels);
diff --git a/utests/enqueue_copy_buf.cpp b/utests/enqueue_copy_buf.cpp
new file mode 100644
index 0000000..b647b7e
--- /dev/null
+++ b/utests/enqueue_copy_buf.cpp
@@ -0,0 +1,66 @@
+#include "utest_helper.hpp"
+
+static void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
+{
+ unsigned int i;
+ OCL_MAP_BUFFER(0);
+
+ for (i=0; i < sz; i++) {
+ ((char*)buf_data[0])[i] = (rand() & 63);
+ }
+
+ OCL_UNMAP_BUFFER(0);
+
+ if (src_off + cb > sz || dst_off + cb > sz) {
+ /* Expect Error. */
+ OCL_ASSERT(clEnqueueCopyBuffer(queue, buf[0], buf[1],
+ src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+ return;
+ }
+
+ OCL_ASSERT(!clEnqueueCopyBuffer(queue, buf[0], buf[1],
+ src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+
+#if 0
+ printf("\n########### Src buffer: \n");
+ for (i = 0; i < cb; ++i)
+ printf(" %2.2u", ((unsigned char*)buf_data[0])[i + src_off]);
+
+ printf("\n########### dst buffer: \n");
+ for (i = 0; i < cb; ++i)
+ printf(" %2.2u", ((unsigned char*)buf_data[1])[i + dst_off]);
+#endif
+
+ // Check results
+ for (i = 0; i < cb; ++i) {
+ if (((char*)buf_data[0])[i + src_off] != ((char*)buf_data[1])[i + dst_off]) {
+ printf ("different index is %d\n", i);
+ OCL_ASSERT(0);
+ }
+ }
+
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+}
+
+void enqueue_copy_buf(void)
+{
+ size_t i;
+ size_t j;
+ const size_t sz = 1024;
+
+ OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
+
+ for (i=0; i<sz; i+=7) {
+ for (j=0; j<sz; j+=10) {
+ test_copy_buf(sz, i, j, sz/2);
+ }
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(enqueue_copy_buf);
diff --git a/utests/enqueue_copy_buf_unaligned.cpp b/utests/enqueue_copy_buf_unaligned.cpp
new file mode 100644
index 0000000..e1bd0aa
--- /dev/null
+++ b/utests/enqueue_copy_buf_unaligned.cpp
@@ -0,0 +1,118 @@
+#include "utest_helper.hpp"
+
+static void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
+{
+ unsigned int i;
+ OCL_MAP_BUFFER(0);
+
+ for (i=0; i < sz; i++) {
+ ((char*)buf_data[0])[i] = (rand() & 31);
+ }
+
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_MAP_BUFFER(1);
+
+ for (i=0; i < sz; i++) {
+ ((char*)buf_data[1])[i] = 64;
+ }
+
+ OCL_UNMAP_BUFFER(1);
+
+ if (src_off + cb > sz || dst_off + cb > sz) {
+ /* Expect Error. */
+ OCL_ASSERT(clEnqueueCopyBuffer(queue, buf[0], buf[1],
+ src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+ return;
+ }
+
+ OCL_ASSERT(!clEnqueueCopyBuffer(queue, buf[0], buf[1],
+ src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+
+#if 0
+ printf ("@@@@@@@@@ cb is %d\n", cb);
+ printf ("@@@@@@@@@ src_off is %d\n", src_off);
+ printf ("@@@@@@@@@ dst_off is %d\n", dst_off);
+ printf("\n########### Src buffer: \n");
+ for (i = 0; i < sz; ++i)
+ printf(" %2.2u", ((unsigned char*)buf_data[0])[i]);
+
+ printf("\n########### dst buffer: \n");
+ for (i = 0; i < sz; ++i)
+ printf(" %2.2u", ((unsigned char*)buf_data[1])[i]);
+#endif
+
+ // Check results
+ for (i = 0; i < cb; ++i) {
+ if (((char*)buf_data[0])[i +src_off] != ((char*)buf_data[1])[i + dst_off]) {
+ printf ("different index is %d\n", i);
+ OCL_ASSERT(0);
+ }
+ }
+
+ for (i = 0; i < dst_off; ++i) {
+ if (((char*)buf_data[1])[i] != 64) {
+ printf ("wrong write, different index is %d\n", i);
+ OCL_ASSERT(0);
+ }
+ }
+
+ for (i = dst_off + cb; i < sz; ++i) {
+ if (((char*)buf_data[1])[i] != 64) {
+ printf ("wrong write, different index is %d\n", i);
+ OCL_ASSERT(0);
+ }
+ }
+
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+}
+
+void enqueue_copy_buf_unaligned(void)
+{
+ size_t i;
+ size_t j;
+ const size_t sz = 1024;
+ int offset = 0;
+
+ OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
+
+#if 1
+ /* Test the same offset cases. */
+ for (i=0; i<sz; i+=32) {
+ for (j=64; j<sz; j+=32) {
+ offset = (rand() & 3);
+ test_copy_buf(sz, i + offset, j + offset, ((rand() & 31) + 1));
+ }
+ }
+#endif
+
+#if 1
+ /* Test the dst small offset cases. */
+ for (i=0; i<sz; i+=32) {
+ for (j=64; j<sz; j+=32) {
+ offset = (rand() & 2);
+ test_copy_buf(sz, i + offset + 1, j + offset, ((rand() & 31) + 1));
+ }
+ }
+#endif
+
+#if 1
+ /* Test the dst big offset cases. */
+ for (i=0; i<sz; i+=32) {
+ for (j=64; j<sz; j+=32) {
+ offset = (rand() & 2);
+ test_copy_buf(sz, i + offset, j + offset + 1, ((rand() & 31) + 1));
+ }
+ }
+#endif
+// test_copy_buf(sz, 0, 1, 17);
+
+}
+
+MAKE_UTEST_FROM_FUNCTION(enqueue_copy_buf_unaligned);
diff --git a/utests/enqueue_fill_buf.cpp b/utests/enqueue_fill_buf.cpp
new file mode 100644
index 0000000..272b81f
--- /dev/null
+++ b/utests/enqueue_fill_buf.cpp
@@ -0,0 +1,90 @@
+#include "utest_helper.hpp"
+#include <string.h>
+
+static char pattern_serials[128];
+
+static void test_fill_buf(size_t sz, size_t offset, size_t size, size_t pattern_sz)
+{
+ unsigned int i;
+ int ret = 0;
+ OCL_MAP_BUFFER(0);
+ memset(((char*)buf_data[0]), 0, sz);
+ OCL_UNMAP_BUFFER(0);
+
+ for (i=0; i < pattern_sz; i++) {
+ pattern_serials[i] = (rand() & 63);
+ }
+
+ if (offset + size > sz) {
+ /* Expect Error. */
+ OCL_ASSERT(clEnqueueFillBuffer(queue, buf[0], pattern_serials,
+ pattern_sz, offset, size, 0, NULL, NULL));
+ return;
+ }
+
+ ret = clEnqueueFillBuffer(queue, buf[0], pattern_serials,
+ pattern_sz, offset, size, 0, NULL, NULL);
+ OCL_ASSERT(!ret);
+
+ OCL_MAP_BUFFER(0);
+
+#if 0
+ printf("\n==== pattern size is %d, offset is %d, size is %d ====\n",
+ pattern_sz, offset, size);
+ printf("\n########### buffer: \n");
+ for (i = 0; i < sz; ++i)
+ printf(" %2.2u", ((unsigned char*)buf_data[0])[i]);
+
+#endif
+
+ // Check results
+ int j = 0;
+ for (i = 0; i < sz; ++i) {
+ if (i < offset || i >= offset + size) {
+ if (((char*)buf_data[0])[i] != 0) {
+ printf ("\nnon zero index is %d\n", i);
+ OCL_ASSERT(0);
+ }
+ continue;
+ }
+
+ if (((char*)buf_data[0])[i] != pattern_serials[j]) {
+ printf ("\ndifferent index is %d\n", i);
+ OCL_ASSERT(0);
+ }
+ j++;
+ if (j == (int)pattern_sz) j = 0;
+ }
+
+ OCL_UNMAP_BUFFER(0);
+
+}
+
+void enqueue_fill_buf(void)
+{
+ size_t offset;
+ size_t pattern_sz;
+ const size_t sz = 1024;
+ size_t size = 0;
+ static int valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
+ unsigned int i = 0;
+
+ OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+
+ for (i = 0; i < sizeof(valid_sz)/sizeof(int); i++) {
+
+ pattern_sz = valid_sz[i];
+ size = ((rand()%1024)/pattern_sz) * pattern_sz;
+ offset = ((rand()%1024)/pattern_sz) * pattern_sz;
+ while (size + offset + 1 > sz) {
+ if (size > offset) {
+ size = size - offset;
+ } else
+ offset = offset - size;
+ }
+
+ test_fill_buf(sz, offset, size, pattern_sz);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(enqueue_fill_buf);
diff --git a/utests/get_arg_info.cpp b/utests/get_arg_info.cpp
new file mode 100644
index 0000000..c1ea1ef
--- /dev/null
+++ b/utests/get_arg_info.cpp
@@ -0,0 +1,85 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+void test_get_arg_info(void)
+{
+ int ret;
+ uint32_t ret_val;
+ cl_kernel_arg_type_qualifier type_qual;
+ size_t ret_sz;
+ char name[64];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("test_get_arg_info");
+
+ //Arg 0
+ ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_ADDRESS_QUALIFIER,
+ sizeof(ret_val), &ret_val, &ret_sz);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_address_qualifier));
+ OCL_ASSERT(ret_val == CL_KERNEL_ARG_ADDRESS_GLOBAL);
+
+ ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_ACCESS_QUALIFIER,
+ sizeof(ret_val), &ret_val, &ret_sz);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_access_qualifier));
+ OCL_ASSERT(ret_val == CL_KERNEL_ARG_ACCESS_NONE);
+
+ ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_TYPE_NAME,
+ sizeof(name), name, &ret_sz);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ OCL_ASSERT(ret_sz == strlen("float*") + 1);
+ OCL_ASSERT(!strcmp(name, "float*"));
+
+ ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_NAME,
+ sizeof(name), name, &ret_sz);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ OCL_ASSERT(ret_sz == strlen("src") + 1);
+ OCL_ASSERT(!strcmp(name, "src"));
+
+ ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_TYPE_QUALIFIER,
+ sizeof(type_qual), &type_qual, &ret_sz);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_type_qualifier));
+ OCL_ASSERT(type_qual == (CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE));
+
+ //Arg 1
+ ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_ADDRESS_QUALIFIER,
+ sizeof(ret_val), &ret_val, &ret_sz);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_address_qualifier));
+ OCL_ASSERT(ret_val == CL_KERNEL_ARG_ADDRESS_LOCAL);
+
+ ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_ACCESS_QUALIFIER,
+ sizeof(ret_val), &ret_val, &ret_sz);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_access_qualifier));
+ OCL_ASSERT(ret_val == CL_KERNEL_ARG_ACCESS_NONE);
+
+ ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_TYPE_NAME,
+ sizeof(name), name, &ret_sz);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ OCL_ASSERT(ret_sz == strlen("int*") + 1);
+ OCL_ASSERT(!strcmp(name, "int*"));
+
+ ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_NAME,
+ sizeof(name), name, &ret_sz);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ OCL_ASSERT(ret_sz == strlen("dst") + 1);
+ OCL_ASSERT(!strcmp(name, "dst"));
+
+ ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_TYPE_QUALIFIER,
+ sizeof(type_qual), &type_qual, &ret_sz);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_type_qualifier));
+ OCL_ASSERT(type_qual == CL_KERNEL_ARG_TYPE_NONE);
+
+ //Arg 2
+ ret = clGetKernelArgInfo(kernel, 2, CL_KERNEL_ARG_TYPE_NAME,
+ sizeof(name), name, &ret_sz);
+ OCL_ASSERT(ret == CL_SUCCESS);
+ OCL_ASSERT(ret_sz == strlen("test_arg_struct") + 1);
+ OCL_ASSERT(!strcmp(name, "test_arg_struct"));
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_get_arg_info);
diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp
new file mode 100644
index 0000000..807739b
--- /dev/null
+++ b/utests/get_cl_info.cpp
@@ -0,0 +1,641 @@
+#include <string.h>
+#include <string>
+#include <map>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* ***************************************************** *
+ * This file to test all the API like: clGetXXXXInfo *
+ * ***************************************************** */
+#define NO_STANDARD_REF 0xFFFFF
+
+template <typename T = cl_uint>
+struct Info_Result {
+ T ret;
+ T refer;
+ int size;
+ typedef T type_value;
+
+ void * get_ret(void) {
+ return (void *)&ret;
+ }
+
+ Info_Result(T other) {
+ refer = other;
+ size = sizeof(T);
+ }
+
+ bool check_result (void) {
+ //printf("The refer is %d, we get result is %d\n", refer, ret);
+ if (ret != refer && refer != (T)NO_STANDARD_REF)
+ return false;
+
+ return true;
+ }
+};
+
+template <>
+struct Info_Result<char *> {
+ char * ret;
+ char * refer;
+ int size;
+ typedef char* type_value;
+
+ Info_Result(const char *other, int sz): refer(NULL) {
+ size = sz;
+ ret = (char *)malloc(sizeof(char) * sz);
+ if (other) {
+ refer = (char *)malloc(sizeof(char) * sz);
+ memcpy(refer, other, sz);
+ }
+ }
+
+ ~Info_Result(void) {
+ free(refer);
+ free(ret);
+ }
+
+ void * get_ret(void) {
+ return (void *)ret;
+ }
+
+ bool check_result (void) {
+ if (refer && ::memcmp(ret, refer, size))
+ return false;
+
+ return true;
+ }
+};
+
+template <> //Used for such as CL_PROGRAM_BINARIES
+struct Info_Result<char **> {
+ char ** ret;
+ char ** refer;
+ int *elt_size;
+ int size;
+ typedef char** type_value;
+
+ Info_Result(char **other, int *sz, int elt_num) {
+ size = elt_num;
+
+ ret = (char **)malloc(elt_num * sizeof(char *));
+ memset(ret, 0, (elt_num * sizeof(char *)));
+ refer = (char **)malloc(elt_num * sizeof(char *));
+ memset(refer, 0, (elt_num * sizeof(char *)));
+ elt_size = (int *)malloc(elt_num * sizeof(int));
+ memset(elt_size, 0, (elt_num * sizeof(int)));
+ if (sz) {
+ int i = 0;
+ for (; i < elt_num; i++) {
+ elt_size[i] = sz[i];
+ ret[i] = (char *)malloc(sz[i] * sizeof(char));
+
+ if (other[i] && elt_size[i] > 0) {
+ refer[i] = (char *)malloc(sz[i] * sizeof(char));
+ memcpy(&refer[i], &other[i], sz[i]);
+ }
+ else
+ refer[i] = NULL;
+ }
+ }
+ }
+
+ ~Info_Result(void) {
+ int i = 0;
+ for (; i < size; i++) {
+ if (refer[i])
+ free(refer[i]);
+ free(ret[i]);
+ }
+ free(ret);
+ free(refer);
+ free(elt_size);
+ }
+
+ void * get_ret(void) {
+ return (void *)ret;
+ }
+
+ bool check_result (void) {
+ int i = 0;
+ for (; i < size; i++) {
+ if (refer[i] && ::memcmp(ret[i], refer[i], elt_size[i]))
+ return false;
+ }
+
+ return true;
+ }
+};
+
+template <typename T1, typename T2>
+struct Traits {
+ static bool Is_Same(void) {
+ return false;
+ };
+};
+
+template <typename T1>
+struct Traits<T1, T1> {
+ static bool Is_Same(void) {
+ return true;
+ };
+};
+
+template <typename T>
+Info_Result<T>* cast_as(void *info)
+{
+ Info_Result<T>* ret;
+ ret = reinterpret_cast<Info_Result<T>*>(info);
+ OCL_ASSERT((Traits<T, typename Info_Result<T>::type_value>::Is_Same()));
+ return ret;
+}
+
+
+#define CALL_INFO_AND_RET(TYPE, FUNC, ...) \
+ do { \
+ cl_int ret; \
+ size_t ret_size; \
+ \
+ Info_Result<TYPE>* info = cast_as<TYPE>(x.second); \
+ ret = FUNC (__VA_ARGS__, x.first, \
+ info->size, info->get_ret(), &ret_size); \
+ OCL_ASSERT((!ret)); \
+ OCL_ASSERT((info->check_result())); \
+ delete info; \
+ } while(0)
+
+/* ***************************************************** *
+ * clGetProgramInfo *
+ * ***************************************************** */
+#define CALL_PROGINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetProgramInfo, program)
+
+void get_program_info(void)
+{
+ map<cl_program_info, void *> maps;
+ int expect_value;
+ char * expect_source;
+ int sz;
+ char *ker_path = (char *)malloc(4096 * sizeof(char));
+ const char *kiss_path = getenv("OCL_KERNEL_PATH");
+ string line;
+ string source_code;
+
+ sprintf(ker_path, "%s/%s", kiss_path, "compiler_if_else.cl");
+
+ ifstream in(ker_path);
+ while (getline(in,line)) {
+ source_code = (source_code == "") ?
+ source_code + line : source_code + "\n" + line;
+ }
+ free(ker_path);
+ //cout<< source_code;
+ source_code = source_code + "\n";
+
+ expect_source = (char *)source_code.c_str();
+
+ OCL_CREATE_KERNEL("compiler_if_else");
+
+ /* First test for clGetProgramInfo. We just have 1 devices now */
+ expect_value = 2;//One program, one kernel.
+ maps.insert(make_pair(CL_PROGRAM_REFERENCE_COUNT,
+ (void *)(new Info_Result<>(((cl_uint)expect_value)))));
+ maps.insert(make_pair(CL_PROGRAM_CONTEXT,
+ (void *)(new Info_Result<cl_context>(ctx))));
+ expect_value = 1;
+ maps.insert(make_pair(CL_PROGRAM_NUM_DEVICES,
+ (void *)(new Info_Result<>(((cl_uint)expect_value)))));
+ maps.insert(make_pair(CL_PROGRAM_DEVICES,
+ (void *)(new Info_Result<cl_device_id>(device))));
+ sz = (strlen(expect_source) + 1);
+ maps.insert(make_pair(CL_PROGRAM_SOURCE,
+ (void *)(new Info_Result<char *>(expect_source, sz))));
+ expect_value = NO_STANDARD_REF;
+ maps.insert(make_pair(CL_PROGRAM_BINARY_SIZES,
+ (void *)(new Info_Result<size_t>((size_t)expect_value))));
+ sz = 4096; //big enough?
+ expect_source = NULL;
+ maps.insert(make_pair(CL_PROGRAM_BINARIES,
+ (void *)(new Info_Result<char **>(&expect_source, &sz, 1))));
+
+ std::for_each(maps.begin(), maps.end(), [](pair<cl_program_info, void *> x) {
+ switch (x.first) {
+ case CL_PROGRAM_REFERENCE_COUNT:
+ case CL_PROGRAM_NUM_DEVICES:
+ CALL_PROGINFO_AND_RET(cl_uint);
+ break;
+ case CL_PROGRAM_CONTEXT:
+ CALL_PROGINFO_AND_RET(cl_context);
+ break;
+ case CL_PROGRAM_DEVICES:
+ CALL_PROGINFO_AND_RET(cl_device_id);
+ break;
+ case CL_PROGRAM_SOURCE:
+ CALL_PROGINFO_AND_RET(char *);
+ break;
+ case CL_PROGRAM_BINARY_SIZES:
+ CALL_PROGINFO_AND_RET(size_t);
+ break;
+ case CL_PROGRAM_BINARIES:
+ CALL_PROGINFO_AND_RET(char **);
+ break;
+ default:
+ break;
+ }
+ });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_program_info);
+
+/* ***************************************************** *
+ * clGetCommandQueueInfo *
+ * ***************************************************** */
+#define CALL_QUEUEINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetCommandQueueInfo, queue)
+
+void get_queue_info(void)
+{
+ /* use the compiler_fabs case to test us. */
+ const size_t n = 16;
+ map<cl_program_info, void *> maps;
+ int expect_ref;
+ cl_command_queue_properties prop;
+
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_KERNEL("compiler_fabs");
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ globals[0] = 16;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ /* Do our test.*/
+ maps.insert(make_pair(CL_QUEUE_CONTEXT,
+ (void *)(new Info_Result<cl_context>(ctx))));
+ maps.insert(make_pair(CL_QUEUE_DEVICE,
+ (void *)(new Info_Result<cl_device_id>(device))));
+
+ expect_ref = 1;
+ maps.insert(make_pair(CL_QUEUE_REFERENCE_COUNT,
+ (void *)(new Info_Result<>(((cl_uint)expect_ref)))));
+
+ prop = 0;
+ maps.insert(make_pair(CL_QUEUE_PROPERTIES,
+ (void *)(new Info_Result<cl_command_queue_properties>(
+ ((cl_command_queue_properties)prop)))));
+
+ std::for_each(maps.begin(), maps.end(), [](pair<cl_program_info, void *> x) {
+ switch (x.first) {
+ case CL_QUEUE_CONTEXT:
+ CALL_QUEUEINFO_AND_RET(cl_context);
+ break;
+ case CL_QUEUE_DEVICE:
+ CALL_QUEUEINFO_AND_RET(cl_device_id);
+ break;
+ case CL_QUEUE_REFERENCE_COUNT:
+ CALL_QUEUEINFO_AND_RET(cl_uint);
+ break;
+ case CL_QUEUE_PROPERTIES:
+ CALL_QUEUEINFO_AND_RET(cl_command_queue_properties);
+ break;
+ default:
+ break;
+ }
+ });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_queue_info);
+
+/* ***************************************************** *
+ * clGetProgramBuildInfo *
+ * ***************************************************** */
+#define CALL_PROG_BUILD_INFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, \
+ clGetProgramBuildInfo, program, device)
+
+void get_program_build_info(void)
+{
+ map<cl_program_info, void *> maps;
+ cl_build_status expect_status;
+ char build_opt[] = "-emit-llvm";
+ char log[] = "";
+ int sz;
+
+ OCL_CALL (cl_kernel_init, "compiler_if_else.cl", "compiler_if_else", SOURCE, build_opt);
+
+ /* Do our test.*/
+ expect_status = CL_BUILD_SUCCESS;
+ maps.insert(make_pair(CL_PROGRAM_BUILD_STATUS,
+ (void *)(new Info_Result<cl_build_status>(expect_status))));
+ sz = strlen(build_opt) + 1;
+ maps.insert(make_pair(CL_PROGRAM_BUILD_OPTIONS,
+ (void *)(new Info_Result<char *>(build_opt, sz))));
+ sz = strlen(log) + 1;
+ maps.insert(make_pair(CL_PROGRAM_BUILD_LOG, /* not supported now, just "" */
+ (void *)(new Info_Result<char *>(log, sz))));
+
+ std::for_each(maps.begin(), maps.end(), [](pair<cl_program_info, void *> x) {
+ switch (x.first) {
+ case CL_PROGRAM_BUILD_STATUS:
+ CALL_PROG_BUILD_INFO_AND_RET(cl_build_status);
+ break;
+ case CL_PROGRAM_BUILD_OPTIONS:
+ CALL_PROG_BUILD_INFO_AND_RET(char *);
+ break;
+ case CL_PROGRAM_BUILD_LOG:
+ CALL_PROG_BUILD_INFO_AND_RET(char *);
+ break;
+ default:
+ break;
+ }
+ });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_program_build_info);
+
+/* ***************************************************** *
+ * clGetContextInfo *
+ * ***************************************************** */
+#define CALL_CONTEXTINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetContextInfo, ctx)
+
+void get_context_info(void)
+{
+ /* use the compiler_fabs case to test us. */
+ const size_t n = 16;
+ map<cl_context_info, void *> maps;
+ int expect_ref;
+
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_KERNEL("compiler_fabs");
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ globals[0] = 16;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ /* Do our test.*/
+ expect_ref = 1;
+ maps.insert(make_pair(CL_CONTEXT_NUM_DEVICES,
+ (void *)(new Info_Result<cl_uint>(expect_ref))));
+ maps.insert(make_pair(CL_CONTEXT_DEVICES,
+ (void *)(new Info_Result<cl_device_id>(device))));
+ // reference count seems depends on the implementation
+ expect_ref = NO_STANDARD_REF;
+ maps.insert(make_pair(CL_CONTEXT_REFERENCE_COUNT,
+ (void *)(new Info_Result<>(((cl_uint)expect_ref)))));
+
+ maps.insert(make_pair(CL_CONTEXT_PROPERTIES,
+ (void *)(new Info_Result<char*>(
+ (const char*)NULL, 100*sizeof(cl_context_properties)))));
+
+ std::for_each(maps.begin(), maps.end(), [](pair<cl_context_info, void *> x) {
+ switch (x.first) {
+ case CL_CONTEXT_NUM_DEVICES:
+ CALL_CONTEXTINFO_AND_RET(cl_uint);
+ break;
+ case CL_CONTEXT_DEVICES:
+ CALL_CONTEXTINFO_AND_RET(cl_device_id);
+ break;
+ case CL_CONTEXT_REFERENCE_COUNT:
+ CALL_CONTEXTINFO_AND_RET(cl_uint);
+ break;
+ case CL_CONTEXT_PROPERTIES:
+ CALL_CONTEXTINFO_AND_RET(char*);
+ break;
+ default:
+ break;
+ }
+ });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_context_info);
+
+/* ***************************************************** *
+ * clGetKernelInfo *
+ * ***************************************************** */
+#define CALL_KERNELINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetKernelInfo, kernel)
+
+void get_kernel_info(void)
+{
+ /* use the compiler_fabs case to test us. */
+ const size_t n = 16;
+ map<cl_kernel_info, void *> maps;
+ int expect_ref;
+
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_KERNEL("compiler_fabs");
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ // Run the kernel on GPU
+
+ maps.insert(make_pair(CL_KERNEL_PROGRAM,
+ (void *)(new Info_Result<cl_program>(program))));
+ maps.insert(make_pair(CL_KERNEL_CONTEXT,
+ (void *)(new Info_Result<cl_context>(ctx))));
+ // reference count seems depends on the implementation
+ expect_ref = NO_STANDARD_REF;
+ maps.insert(make_pair(CL_KERNEL_REFERENCE_COUNT,
+ (void *)(new Info_Result<>(((cl_uint)expect_ref)))));
+
+ expect_ref = 2;
+ maps.insert(make_pair(CL_KERNEL_NUM_ARGS,
+ (void *)(new Info_Result<cl_uint>(expect_ref))));
+
+ const char * expected_name = "compiler_fabs";
+ maps.insert(make_pair(CL_KERNEL_FUNCTION_NAME,
+ (void *)(new Info_Result<char*>(expected_name, strlen(expected_name)+1))));
+
+ std::for_each(maps.begin(), maps.end(), [](pair<cl_kernel_info, void *> x) {
+ switch (x.first) {
+ case CL_KERNEL_PROGRAM:
+ CALL_KERNELINFO_AND_RET(cl_program);
+ break;
+ case CL_KERNEL_CONTEXT:
+ CALL_KERNELINFO_AND_RET(cl_context);
+ break;
+ case CL_KERNEL_REFERENCE_COUNT:
+ CALL_KERNELINFO_AND_RET(cl_uint);
+ break;
+ case CL_KERNEL_NUM_ARGS:
+ CALL_KERNELINFO_AND_RET(cl_uint);
+ break;
+ case CL_KERNEL_FUNCTION_NAME:
+ CALL_KERNELINFO_AND_RET(char*);
+ break;
+ default:
+ break;
+ }
+ });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_kernel_info);
+
+/* ***************************************************** *
+ * clGetImageInfo *
+ * ***************************************************** */
+void get_image_info(void)
+{
+ const size_t w = 512;
+ const size_t h = 512;
+ cl_image_format format;
+ cl_image_desc desc;
+
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = CL_UNSIGNED_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = 0;
+ desc.image_row_pitch = 0;
+ desc.image_slice_pitch = 0;
+ desc.num_mip_levels = 0;
+ desc.num_samples = 0;
+ desc.buffer = NULL;
+
+ OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+ cl_mem image = buf[0];
+
+ cl_image_format ret_format;
+ OCL_CALL(clGetImageInfo, image, CL_IMAGE_FORMAT, sizeof(ret_format), &ret_format, NULL);
+ OCL_ASSERT(format.image_channel_order == ret_format.image_channel_order);
+ OCL_ASSERT(format.image_channel_data_type == ret_format.image_channel_data_type);
+
+ size_t element_size;
+ OCL_CALL(clGetImageInfo, image, CL_IMAGE_ELEMENT_SIZE, sizeof(element_size), &element_size, NULL);
+ OCL_ASSERT(element_size == 4);
+
+ size_t row_pitch;
+ OCL_CALL(clGetImageInfo, image, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
+ OCL_ASSERT(row_pitch == 4 * w);
+
+ size_t slice_pitch;
+ OCL_CALL(clGetImageInfo, image, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
+ OCL_ASSERT(slice_pitch == 0);
+
+ size_t width;
+ OCL_CALL(clGetImageInfo, image, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
+ OCL_ASSERT(width == w);
+
+ size_t height;
+ OCL_CALL(clGetImageInfo, image, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
+ OCL_ASSERT(height == h);
+
+ size_t depth;
+ OCL_CALL(clGetImageInfo, image, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
+ OCL_ASSERT(depth == 0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_image_info);
+
+/* ***************************************************** *
+ * clGetMemObjectInfo *
+ * ***************************************************** */
+#define CALL_GETMEMINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetMemObjectInfo, (buf[0]))
+
+void get_mem_info(void)
+{
+ map<cl_mem_info, void *> maps;
+ int expect_ref;
+ cl_mem sub_buf;
+ cl_int error;
+
+ OCL_CREATE_BUFFER(buf[1], 0, 4096, NULL);
+
+ cl_buffer_region region;
+ region.origin = 1024;
+ region.size = 2048;
+ sub_buf = clCreateSubBuffer(buf[1], 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error );
+ buf[0] = sub_buf;
+ OCL_ASSERT(error == CL_SUCCESS);
+
+ void * map_ptr = clEnqueueMapBuffer(queue, buf[0], 1, CL_MAP_READ, 0, 64, 0, NULL, NULL, NULL);
+
+ expect_ref = CL_MEM_OBJECT_BUFFER;
+ maps.insert(make_pair(CL_MEM_TYPE,
+ (void *)(new Info_Result<cl_mem_object_type>((cl_mem_object_type)expect_ref))));
+ expect_ref = 0;
+ maps.insert(make_pair(CL_MEM_FLAGS,
+ (void *)(new Info_Result<cl_mem_flags>(expect_ref))));
+ expect_ref = 2048;
+ maps.insert(make_pair(CL_MEM_SIZE,
+ (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
+ expect_ref = 1024;
+ maps.insert(make_pair(CL_MEM_HOST_PTR,
+ (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
+ expect_ref = 1;
+ maps.insert(make_pair(CL_MEM_MAP_COUNT,
+ (void *)(new Info_Result<cl_uint>(((cl_uint)expect_ref)))));
+ expect_ref = 1;
+ maps.insert(make_pair(CL_MEM_REFERENCE_COUNT,
+ (void *)(new Info_Result<cl_uint>(((cl_uint)expect_ref)))));
+ maps.insert(make_pair(CL_MEM_CONTEXT,
+ (void *)(new Info_Result<cl_context>(((cl_context)ctx)))));
+ maps.insert(make_pair(CL_MEM_ASSOCIATED_MEMOBJECT,
+ (void *)(new Info_Result<cl_mem>(((cl_mem)buf[1])))));
+ expect_ref = 1024;
+ maps.insert(make_pair(CL_MEM_OFFSET,
+ (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
+
+ std::for_each(maps.begin(), maps.end(), [](pair<cl_mem_info, void *> x) {
+ switch (x.first) {
+ case CL_MEM_TYPE:
+ CALL_GETMEMINFO_AND_RET(cl_mem_object_type);
+ break;
+ case CL_MEM_FLAGS:
+ CALL_GETMEMINFO_AND_RET(cl_mem_flags);
+ break;
+ case CL_MEM_SIZE:
+ CALL_GETMEMINFO_AND_RET(size_t);
+ break;
+ case CL_MEM_HOST_PTR:
+ CALL_GETMEMINFO_AND_RET(size_t);
+ break;
+ case CL_MEM_MAP_COUNT:
+ CALL_GETMEMINFO_AND_RET(cl_uint);
+ break;
+ case CL_MEM_REFERENCE_COUNT:
+ CALL_GETMEMINFO_AND_RET(cl_uint);
+ break;
+ case CL_MEM_CONTEXT:
+ CALL_GETMEMINFO_AND_RET(cl_context);
+ break;
+ case CL_MEM_ASSOCIATED_MEMOBJECT:
+ CALL_GETMEMINFO_AND_RET(cl_mem);
+ break;
+ case CL_MEM_OFFSET:
+ CALL_GETMEMINFO_AND_RET(size_t);
+ break;
+
+ default:
+ break;
+ }
+ });
+
+ clEnqueueUnmapMemObject(queue, buf[0], map_ptr, 0, NULL, NULL);
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_mem_info);
diff --git a/utests/image_1D_buffer.cpp b/utests/image_1D_buffer.cpp
new file mode 100644
index 0000000..d8d761f
--- /dev/null
+++ b/utests/image_1D_buffer.cpp
@@ -0,0 +1,80 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+void image_1D_buffer(void)
+{
+ size_t buffer_sz = 1024;
+ char *buf_content = (char *)malloc(buffer_sz * sizeof(char));
+ int error;
+ cl_image_desc image_desc;
+ cl_image_format image_format;
+ cl_sampler sampler;
+ cl_mem image1, image2;
+ cl_mem ret_mem = NULL;
+
+ OCL_CREATE_KERNEL("image_1D_buffer");
+
+ for (int32_t i = 0; i < (int32_t)buffer_sz; ++i)
+ buf_content[i] = (rand() & 127);
+
+ cl_mem buff = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+ buffer_sz, buf_content, &error);
+ OCL_ASSERT(error == CL_SUCCESS);
+
+ memset(&image_desc, 0x0, sizeof(cl_image_desc));
+ memset(&image_format, 0x0, sizeof(cl_image_format));
+
+ image_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ image_desc.image_row_pitch = buffer_sz;
+ image_desc.image_width = buffer_sz / sizeof(uint32_t); //assume rgba32
+ image_desc.buffer = buff;
+
+ image_format.image_channel_order = CL_RGBA;
+ image_format.image_channel_data_type = CL_UNSIGNED_INT8;
+
+ image1 = clCreateImage(ctx, CL_MEM_READ_ONLY, &image_format,
+ &image_desc, NULL, &error );
+ OCL_ASSERT(error == CL_SUCCESS);
+
+ error = clGetImageInfo(image1, CL_IMAGE_BUFFER, sizeof(ret_mem), &ret_mem, NULL);
+ OCL_ASSERT(error == CL_SUCCESS);
+ OCL_ASSERT(ret_mem == buff);
+
+
+ memset(&image_desc, 0x0, sizeof(cl_image_desc));
+ image_desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+ image_desc.image_width = buffer_sz / sizeof(uint32_t);
+ image2 = clCreateImage(ctx, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
+ &image_format, &image_desc, buf_content, &error);
+ OCL_ASSERT(error == CL_SUCCESS);
+
+ // Create sampler to use
+ sampler = clCreateSampler(ctx, false, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error );
+ OCL_ASSERT(error == CL_SUCCESS);
+
+ cl_mem result_buf = buf[0] = clCreateBuffer(ctx, 0, buffer_sz, NULL, &error);
+ OCL_ASSERT(error == CL_SUCCESS);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &image1);
+ OCL_SET_ARG(1, sizeof(cl_mem), &image2);
+ OCL_SET_ARG(2, sizeof(sampler), &sampler);
+ OCL_SET_ARG(3, sizeof(cl_mem), &result_buf);
+
+ globals[0] = buffer_sz/sizeof(int32_t);
+ locals[0] = 16;
+
+ OCL_NDRANGE(1);
+
+ /* Now check the result. */
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < buffer_sz/sizeof(int32_t); i++)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 1);
+ OCL_UNMAP_BUFFER(0);
+
+ clReleaseSampler(sampler);
+ clReleaseMemObject(image1);
+ clReleaseMemObject(image2);
+ clReleaseMemObject(buff);
+}
+
+MAKE_UTEST_FROM_FUNCTION(image_1D_buffer);
diff --git a/utests/load_program_from_bin_file.cpp b/utests/load_program_from_bin_file.cpp
new file mode 100644
index 0000000..feefacc
--- /dev/null
+++ b/utests/load_program_from_bin_file.cpp
@@ -0,0 +1,77 @@
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+#include <cmath>
+#include <algorithm>
+
+using namespace std;
+
+static void cpu(int global_id, float *src, float *dst) {
+ dst[global_id] = ceilf(src[global_id]);
+}
+
+static void test_load_program_from_bin_file(void)
+{
+ const size_t n = 16;
+ float cpu_dst[16], cpu_src[16];
+ cl_int status;
+ cl_int binary_status;
+ char *ker_path = NULL;
+
+ cl_file_map_t *fm = cl_file_map_new();
+ ker_path = cl_do_kiss_path("compiler_ceil.bin", device);
+ OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
+
+ const unsigned char *src = (const unsigned char *)cl_file_map_begin(fm);
+ const size_t sz = cl_file_map_size(fm);
+
+ program = clCreateProgramWithBinary(ctx, 1,
+ &device, &sz, &src, &binary_status, &status);
+
+ OCL_ASSERT(program && status == CL_SUCCESS);
+
+ /* OCL requires to build the program even if it is created from a binary */
+ OCL_ASSERT(clBuildProgram(program, 1, &device, NULL, NULL, NULL) == CL_SUCCESS);
+
+ kernel = clCreateKernel(program, "compiler_ceil", &status);
+ OCL_ASSERT(status == CL_SUCCESS);
+
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+
+#if 0
+ printf("#### GPU:\n");
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ printf(" %f", ((float *)buf_data[1])[i]);
+ printf("\n#### CPU:\n");
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ printf(" %f", cpu_dst[i]);
+ printf("\n");
+#endif
+
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_load_program_from_bin_file);
diff --git a/utests/load_program_from_gen_bin.cpp b/utests/load_program_from_gen_bin.cpp
new file mode 100644
index 0000000..3db13b2
--- /dev/null
+++ b/utests/load_program_from_gen_bin.cpp
@@ -0,0 +1,93 @@
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+#include <cmath>
+#include <algorithm>
+
+using namespace std;
+
+static void cpu(int global_id, float *src, float *dst) {
+ dst[global_id] = ceilf(src[global_id]);
+}
+
+static void test_load_program_from_gen_bin(void)
+{
+ const size_t n = 16;
+ float cpu_dst[16], cpu_src[16];
+ cl_int status;
+ cl_int binary_status;
+ char *ker_path = NULL;
+
+ cl_file_map_t *fm = cl_file_map_new();
+ ker_path = cl_do_kiss_path("compiler_ceil.cl", device);
+ OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
+
+ const char *src = (const char *)cl_file_map_begin(fm);
+
+ program =clCreateProgramWithSource(ctx, 1, &src, NULL, &status);
+
+ OCL_ASSERT(program && status == CL_SUCCESS);
+
+ /* OCL requires to build the program even if it is created from a binary */
+ OCL_ASSERT(clBuildProgram(program, 1, &device, NULL, NULL, NULL) == CL_SUCCESS);
+
+ size_t binarySize;
+ unsigned char *binary = NULL;
+
+ status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
+ OCL_ASSERT(status == CL_SUCCESS);
+ // Create a buffer and get the gen binary
+ binary = (unsigned char*)malloc(sizeof(unsigned char)*binarySize);
+ OCL_ASSERT(binary != NULL);
+
+ status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof( &binary), &binary, NULL );
+ OCL_ASSERT(status == CL_SUCCESS);
+
+ cl_program bin_program = clCreateProgramWithBinary(ctx, 1,
+ &device, &binarySize, (const unsigned char**)&binary, &binary_status, &status);
+ OCL_ASSERT(bin_program && status == CL_SUCCESS);
+ /* OCL requires to build the program even if it is created from a binary */
+ OCL_ASSERT(clBuildProgram(bin_program, 1, &device, NULL, NULL, NULL) == CL_SUCCESS);
+
+ kernel = clCreateKernel(bin_program, "compiler_ceil", &status);
+ OCL_ASSERT(status == CL_SUCCESS);
+
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+
+#if 0
+ printf("#### GPU:\n");
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ printf(" %f", ((float *)buf_data[1])[i]);
+ printf("\n#### CPU:\n");
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ printf(" %f", cpu_dst[i]);
+ printf("\n");
+#endif
+
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_load_program_from_gen_bin);
diff --git a/utests/my_test.cpp b/utests/my_test.cpp
new file mode 100644
index 0000000..73a4718
--- /dev/null
+++ b/utests/my_test.cpp
@@ -0,0 +1,99 @@
+#include "utest_helper.hpp"
+
+struct seg {
+ unsigned int end, color, offset;
+ seg(int e, int c):end(e), color(c) {}
+};
+typedef struct seg seg;
+
+typedef struct {
+ std::vector<seg> segs;
+} rle_data;
+
+struct rle_image {
+ int width, height;
+ std::vector<rle_data> data;
+ rle_image(int w, int h):width(w), height(h) {}
+};
+typedef struct rle_image rle_image;
+
+static void read_data(const char *filename, rle_image &image)
+{
+ FILE *fp;
+ char line[4096];
+ int i;
+ fp = fopen(filename, "r");
+ for (i = 0; i < image.height; i++) {
+ char *nptr = line, *endptr;
+ rle_data d;
+ int start = 0;
+ if (fgets(line, sizeof(line), fp) == NULL)
+ break;
+ for (;;) {
+ int len = strtol(nptr, &endptr, 10);
+ nptr = endptr;
+ int color = strtol(nptr, &endptr, 10);
+ nptr = endptr;
+ seg s(start + len, color);
+ d.segs.push_back(s);
+ if (*endptr == '\n' || *endptr == 0)
+ break;
+ start += len;
+ }
+ image.data.push_back(d);
+ }
+ fclose(fp);
+}
+
+static void prepare_rle_buffer(rle_image &image, std::vector<int> &rle_buffer, int *offsets)
+{
+ int offset = 0;
+ for (int i = 0; i < image.height; i++) {
+ unsigned int j;
+ rle_data d = image.data[i];
+ for (j = 0; j < d.segs.size(); j++) {
+ rle_buffer.push_back(d.segs[j].end);
+ rle_buffer.push_back(d.segs[j].color);
+ }
+ offsets[i] = offset;
+ offset += j;
+ }
+
+}
+
+static void expand_rle(rle_image &image)
+{
+ std::vector<int> rle_buffer;
+ int offsets[image.height];
+ int w = image.width/16;
+ prepare_rle_buffer(image, rle_buffer, offsets);
+ OCL_CREATE_KERNEL("my_test");
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, 2*sizeof(int)*rle_buffer.size(), &rle_buffer[0]);
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, sizeof(int)*image.height, offsets);
+ OCL_CREATE_BUFFER(buf[2], 0, image.width*image.height, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(w), &w);
+
+ globals[0] = image.height;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+#if 1
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < image.height; i++) {
+ for (int j = 0; j < image.width; j++)
+ printf("%d ", ((unsigned char*)buf_data[2])[i*image.width+j]);
+ printf("\n****\n");
+ }
+ OCL_UNMAP_BUFFER(2);
+#endif
+}
+
+static void my_test(void)
+{
+ rle_image image(256, 256);
+ read_data("new_data.txt", image);
+ expand_rle(image);
+}
+MAKE_UTEST_FROM_FUNCTION(my_test);
diff --git a/utests/new_data.txt b/utests/new_data.txt
new file mode 100644
index 0000000..b12bb13
--- /dev/null
+++ b/utests/new_data.txt
@@ -0,0 +1,256 @@
+6 5 3 4 37 15 10 2 200 3
+156 1 97 200 3 3
+2 1 2 10 128 2 124 25
+5 5 251 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 1
+256 2
+256 3
+256 0
+256 0
+256 0
+256 1
+256 2
+256 3
+256 0
+256 0
+256 0
+256 0
+256 0
+256 4
+256 5
+256 6
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 3
+100 255 100 155 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 253 100 255 56 0
+56 0 20 8 180 9
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+1 253 5 252 150 168 100 254
+150 168 100 254 1 253 5 252
diff --git a/utests/profiling_exec.cpp b/utests/profiling_exec.cpp
new file mode 100644
index 0000000..afa55ba
--- /dev/null
+++ b/utests/profiling_exec.cpp
@@ -0,0 +1,102 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+static void cpu_exec (int n, float* src, float* dst)
+{
+ int i = 0;
+ for (; i < n; i++) {
+ float f = src[i];
+ f = f < 0 ? -f : f;
+ dst[i] = f;
+ }
+}
+
+#define QUEUE_SECONDS_LIMIT 10
+#define SUBMIT_SECONDS_LIMIT 20
+#define COMMAND_SECONDS_LIMIT 10
+
+static void check_profiling_time(cl_ulong queued, cl_ulong submit, cl_ulong start, cl_ulong end)
+{
+ size_t profiling_resolution = 0;
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_PROFILING_TIMER_RESOLUTION,
+ sizeof(profiling_resolution), &profiling_resolution, NULL);
+
+ /* Convert the time to second. */
+ double queue_to_submit = (double)(submit - queued)*1e-9;
+ double submit_to_start = (double)(start - submit)*1e-9;
+ double start_to_end = (double)(end - start)*1e-9;
+
+ //printf("Profiling info:\n");
+ //printf("Time from queue to submit : %fms\n", (double)(queue_to_submit) * 1000.f );
+ //printf( "Time from submit to start : %fms\n", (double)(submit_to_start) * 1000.f );
+ //printf( "Time from start to end: %fms\n", (double)(start_to_end) * 1000.f );
+
+ OCL_ASSERTM(queued <= submit, "Enqueue time is later than submit time, invalid\n");
+ OCL_ASSERTM(submit <= start, "Submit time is later than start time, invalid\n");
+ OCL_ASSERTM(start <= end, "Start time is later than end time, invalid\n");
+
+ OCL_ASSERTM(queue_to_submit <= QUEUE_SECONDS_LIMIT, "Too large time from queue to submit\n");
+ OCL_ASSERTM(submit_to_start <= QUEUE_SECONDS_LIMIT, "Too large time from submit to start\n");
+ OCL_ASSERTM(start_to_end <= QUEUE_SECONDS_LIMIT, "Too large time from start to end\n");
+}
+
+static void profiling_exec(void)
+{
+ const size_t n = 512;
+ cl_int status = CL_SUCCESS;
+ cl_command_queue profiling_queue = NULL;
+ cl_command_queue tmp_queue = NULL;
+ float* cpu_src = (float *)malloc(n*sizeof(float));
+ float* cpu_dst = (float *)malloc(n*sizeof(float));
+ cl_event exec_event;
+ cl_ulong time_queue, time_submit, time_start, time_end;
+
+
+ /* Because the profiling prop, we can not use default queue. */
+ profiling_queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &status);
+ OCL_ASSERT(status == CL_SUCCESS);
+
+ /* save the default queue. */
+ tmp_queue = queue;
+ queue = profiling_queue;
+
+ OCL_CREATE_KERNEL("compiler_fabs");
+
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 256;
+
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+ OCL_UNMAP_BUFFER(0);
+
+ cpu_exec(n, cpu_src, cpu_dst);
+
+ // Run the kernel on GPU
+ OCL_CALL(clEnqueueNDRangeKernel, queue, kernel, 1, NULL, globals, locals, 0, NULL, &exec_event);
+ OCL_CALL(clWaitForEvents, 1, &exec_event);
+
+ OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &time_queue, NULL);
+ OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &time_submit, NULL);
+ OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &time_start, NULL);
+ OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time_end, NULL);
+
+ check_profiling_time(time_queue, time_submit, time_start, time_end);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+
+ queue = tmp_queue;
+ clReleaseCommandQueue(profiling_queue);
+ free(cpu_dst);
+ free(cpu_src);
+}
+
+MAKE_UTEST_FROM_FUNCTION(profiling_exec);
diff --git a/utests/runtime_barrier_list.cpp b/utests/runtime_barrier_list.cpp
new file mode 100644
index 0000000..135996f
--- /dev/null
+++ b/utests/runtime_barrier_list.cpp
@@ -0,0 +1,75 @@
+#include "utest_helper.hpp"
+
+#define BUFFERSIZE 32*1024
+void runtime_barrier_list(void)
+{
+ const size_t n = BUFFERSIZE;
+ cl_int cpu_src[BUFFERSIZE];
+ cl_int cpu_src_2[BUFFERSIZE];
+ cl_event ev[5];
+ cl_int status = 0;
+ cl_int value = 34;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_event");
+ OCL_CREATE_BUFFER(buf[0], 0, BUFFERSIZE*sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, BUFFERSIZE*sizeof(int), NULL);
+
+ for(cl_uint i=0; i<BUFFERSIZE; i++)
+ {
+ cpu_src[i] = 3;
+ cpu_src_2[i] = 5;
+ }
+
+ OCL_CREATE_USER_EVENT(ev[0]);
+
+ clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(int), &value);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 32;
+
+ clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]);
+
+ for (cl_uint i = 0; i < 3; ++i) {
+ clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status >= CL_SUBMITTED);
+ }
+
+
+ buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+
+ clEnqueueBarrierWithWaitList(queue, 0, NULL, &ev[3]);
+
+ clEnqueueWriteBuffer(queue, buf[1], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 0, NULL, &ev[4]);
+
+ OCL_FINISH();
+ clGetEventInfo(ev[4], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status != CL_COMPLETE);
+
+ OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
+
+ clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status == CL_COMPLETE);
+
+ OCL_FINISH();
+
+ for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+ clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status <= CL_COMPLETE);
+ }
+
+ for (uint32_t i = 0; i < n; ++i) {
+ OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
+ }
+ clEnqueueUnmapMemObject(queue, buf[0], buf_data[0], 0, NULL, NULL);
+
+ for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+ clReleaseEvent(ev[i]);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_barrier_list);
diff --git a/utests/runtime_compile_link.cpp b/utests/runtime_compile_link.cpp
new file mode 100644
index 0000000..4a39b6a
--- /dev/null
+++ b/utests/runtime_compile_link.cpp
@@ -0,0 +1,162 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+
+#define BUFFERSIZE 32*1024
+
+int init_program(const char* name, cl_context ctx, cl_program *pg )
+{
+ cl_int err;
+ char* ker_path = cl_do_kiss_path(name, device);
+
+ cl_file_map_t *fm = cl_file_map_new();
+ err = cl_file_map_open(fm, ker_path);
+ if(err != CL_FILE_MAP_SUCCESS)
+ OCL_ASSERT(0);
+ const char *src = cl_file_map_begin(fm);
+
+ *pg = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
+ free(ker_path);
+ cl_file_map_delete(fm);
+ return 0;
+
+}
+
+void runtime_compile_link(void)
+{
+
+ cl_int err;
+
+ const char* header_file_name="runtime_compile_link.h";
+ cl_program foo_pg;
+ init_program(header_file_name, ctx, &foo_pg);
+
+ const char* myinc_file_name="include/runtime_compile_link_inc.h";
+ cl_program myinc_pg;
+ init_program(myinc_file_name, ctx, &myinc_pg);
+
+ const char* file_name_A="runtime_compile_link_a.cl";
+ cl_program program_A;
+ init_program(file_name_A, ctx, &program_A);
+
+ cl_program input_headers[2] = { foo_pg, myinc_pg};
+ const char * input_header_names[2] = {header_file_name, myinc_file_name};
+
+ err = clCompileProgram(program_A,
+ 0, NULL, // num_devices & device_list
+ NULL, // compile_options
+ 2, // num_input_headers
+ input_headers,
+ input_header_names,
+ NULL, NULL);
+
+ OCL_ASSERT(err==CL_SUCCESS);
+ const char* file_name_B="runtime_compile_link_b.cl";
+ cl_program program_B;
+ init_program(file_name_B, ctx, &program_B);
+
+ err = clCompileProgram(program_B,
+ 0, NULL, // num_devices & device_list
+ NULL, // compile_options
+ 2, // num_input_headers
+ input_headers,
+ input_header_names,
+ NULL, NULL);
+
+ OCL_ASSERT(err==CL_SUCCESS);
+ cl_program input_programs[2] = { program_A, program_B};
+ cl_program linked_program = clLinkProgram(ctx, 0, NULL, "-create-library", 2, input_programs, NULL, NULL, &err);
+
+ OCL_ASSERT(linked_program != NULL);
+ OCL_ASSERT(err == CL_SUCCESS);
+ size_t binarySize;
+ unsigned char *binary;
+
+ // Get the size of the resulting binary (only one device)
+ err= clGetProgramInfo( linked_program, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
+ OCL_ASSERT(err==CL_SUCCESS);
+
+ // Create a buffer and get the actual binary
+ binary = (unsigned char*)malloc(sizeof(unsigned char)*binarySize);
+ if (binary == NULL) {
+ OCL_ASSERT(0);
+ return ;
+ }
+
+ unsigned char *buffers[ 1 ] = { binary };
+ // Do another sanity check here first
+ size_t size;
+ cl_int loadErrors[ 1 ];
+ err = clGetProgramInfo( linked_program, CL_PROGRAM_BINARIES, 0, NULL, &size );
+ OCL_ASSERT(err==CL_SUCCESS);
+ if( size != sizeof( buffers ) ){
+ free(binary);
+ return ;
+ }
+
+ err = clGetProgramInfo( linked_program, CL_PROGRAM_BINARIES, sizeof( buffers ), &buffers, NULL );
+ OCL_ASSERT(err==CL_SUCCESS);
+
+ cl_device_id deviceID;
+ err = clGetProgramInfo( linked_program, CL_PROGRAM_DEVICES, sizeof( deviceID), &deviceID, NULL );
+ OCL_ASSERT(err==CL_SUCCESS);
+
+ cl_program program_with_binary = clCreateProgramWithBinary(ctx, 1, &deviceID, &binarySize, (const unsigned char**)buffers, loadErrors, &err);
+ OCL_ASSERT(err==CL_SUCCESS);
+
+ cl_program new_linked_program = clLinkProgram(ctx, 1, &deviceID, NULL, 1, &program_with_binary, NULL, NULL, &err);
+ OCL_ASSERT(err==CL_SUCCESS);
+ // link success, run this kernel.
+
+ const size_t n = 16;
+ int64_t src1[n], src2[n];
+
+ src1[0] = (int64_t)1 << 63, src2[0] = 0x7FFFFFFFFFFFFFFFll;
+ src1[1] = (int64_t)1 << 63, src2[1] = ((int64_t)1 << 63) | 1;
+ src1[2] = -1ll, src2[2] = 0;
+ src1[3] = ((int64_t)123 << 32) | 0x7FFFFFFF, src2[3] = ((int64_t)123 << 32) | 0x80000000;
+ src1[4] = 0x7FFFFFFFFFFFFFFFll, src2[4] = (int64_t)1 << 63;
+ src1[5] = ((int64_t)1 << 63) | 1, src2[5] = (int64_t)1 << 63;
+ src1[6] = 0, src2[6] = -1ll;
+ src1[7] = ((int64_t)123 << 32) | 0x80000000, src2[7] = ((int64_t)123 << 32) | 0x7FFFFFFF;
+ for(size_t i=8; i<n; i++) {
+ src1[i] = i;
+ src2[i] = i;
+ }
+
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src1, sizeof(src1));
+ memcpy(buf_data[1], src2, sizeof(src2));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ kernel = clCreateKernel(new_linked_program, "runtime_compile_link_a", &err);
+
+ OCL_ASSERT(err == CL_SUCCESS);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+ clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 0, NULL, NULL);
+
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] < src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+ OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_compile_link);
diff --git a/utests/runtime_createcontext.cpp b/utests/runtime_createcontext.cpp
new file mode 100644
index 0000000..f08a189
--- /dev/null
+++ b/utests/runtime_createcontext.cpp
@@ -0,0 +1,14 @@
+#include "utest_helper.hpp"
+
+void runtime_createcontextfromtype(void) {
+ cl_int status;
+
+ cl_context ctx;
+ ctx = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
+ if (ctx == NULL) {
+ OCL_THROW_ERROR("runtime_createcontextfromtype", status);
+ }
+ clReleaseContext(ctx);
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_createcontextfromtype);
diff --git a/utests/runtime_event.cpp b/utests/runtime_event.cpp
new file mode 100644
index 0000000..f8170a3
--- /dev/null
+++ b/utests/runtime_event.cpp
@@ -0,0 +1,60 @@
+#include "utest_helper.hpp"
+
+#define BUFFERSIZE 32*1024
+void runtime_event(void)
+{
+ const size_t n = BUFFERSIZE;
+ cl_int cpu_src[BUFFERSIZE];
+ cl_event ev[3];
+ cl_int status = 0;
+ cl_int value = 34;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_event");
+ OCL_CREATE_BUFFER(buf[0], 0, BUFFERSIZE*sizeof(int), NULL);
+
+ for(cl_uint i=0; i<BUFFERSIZE; i++)
+ cpu_src[i] = 3;
+
+ OCL_CREATE_USER_EVENT(ev[0]);
+
+ clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(int), &value);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 32;
+ clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]);
+
+ for (cl_uint i = 0; i < 3; ++i) {
+ clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status >= CL_SUBMITTED);
+ }
+
+ buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+
+ OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
+
+ clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status == CL_COMPLETE);
+
+ OCL_FINISH();
+
+ for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+ clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status <= CL_COMPLETE);
+ }
+
+ for (uint32_t i = 0; i < n; ++i) {
+ OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
+ }
+ clEnqueueUnmapMemObject(queue, buf[0], buf_data[0], 0, NULL, NULL);
+
+ for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+ clReleaseEvent(ev[i]);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_event);
diff --git a/utests/runtime_flat_address_space.cpp b/utests/runtime_flat_address_space.cpp
new file mode 100644
index 0000000..08167c4
--- /dev/null
+++ b/utests/runtime_flat_address_space.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+int
+main(int argc, char *argv[])
+{
+ cl_mem dst[24];
+ int *dst_buffer = NULL;
+ const size_t n = 32 * 1024 * 1024;
+ const size_t global_work_size = n;
+ const size_t local_work_size = 16;
+ int status = 0;
+
+ if ((status = cl_test_init("test_write_only.cl", "test_write_only", SOURCE)) != 0)
+ goto error;
+
+ for (uint32_t j = 0; j < 24; ++j)
+ {
+ // Allocate the two buffers
+ dst[j] = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS) goto error;
+
+ // Set source and destination
+ OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &dst[j]);
+
+ // Run the kernel
+ OCL_CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ // Be sure that everything run fine
+ dst_buffer = (int *) clMapBufferIntel(dst[j], &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ for (uint32_t i = 0; i < n; ++i)
+ if (dst_buffer[i] != int(i)) {
+ fprintf(stderr, "run-time flat address space failed\n");
+ exit(-1);
+ }
+ OCL_CALL (clUnmapBufferIntel, dst[j]);
+ }
+
+ for (uint32_t j = 0; j < 24; ++j) OCL_CALL (clReleaseMemObject, dst[j]);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clReportUnfreedIntel());
+ assert(clReportUnfreedIntel() == 0);
+
+error:
+ return status;
+}
+
diff --git a/utests/runtime_marker_list.cpp b/utests/runtime_marker_list.cpp
new file mode 100644
index 0000000..f64b1d1
--- /dev/null
+++ b/utests/runtime_marker_list.cpp
@@ -0,0 +1,75 @@
+#include "utest_helper.hpp"
+
+#define BUFFERSIZE 32*1024
+void runtime_marker_list(void)
+{
+ const size_t n = BUFFERSIZE;
+ cl_int cpu_src[BUFFERSIZE];
+ cl_int cpu_src_2[BUFFERSIZE];
+ cl_event ev[5];
+ cl_int status = 0;
+ cl_int value = 34;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_event");
+ OCL_CREATE_BUFFER(buf[0], 0, BUFFERSIZE*sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, BUFFERSIZE*sizeof(int), NULL);
+
+ for(cl_uint i=0; i<BUFFERSIZE; i++)
+ {
+ cpu_src[i] = 3;
+ cpu_src_2[i] = 5;
+ }
+
+ OCL_CREATE_USER_EVENT(ev[0]);
+
+ clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(int), &value);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 32;
+
+ clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]);
+
+ for (cl_uint i = 0; i < 3; ++i) {
+ clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status >= CL_SUBMITTED);
+ }
+
+
+ buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+
+ clEnqueueMarkerWithWaitList(queue, 0, NULL, &ev[3]);
+
+ clEnqueueWriteBuffer(queue, buf[1], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 0, NULL, &ev[4]);
+
+ OCL_FINISH();
+ clGetEventInfo(ev[4], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status == CL_COMPLETE);
+
+ OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
+
+ clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status == CL_COMPLETE);
+
+ OCL_FINISH();
+
+ for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+ clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status <= CL_COMPLETE);
+ }
+
+ for (uint32_t i = 0; i < n; ++i) {
+ OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
+ }
+ clEnqueueUnmapMemObject(queue, buf[0], buf_data[0], 0, NULL, NULL);
+
+ for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+ clReleaseEvent(ev[i]);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_marker_list);
diff --git a/utests/runtime_null_kernel_arg.cpp b/utests/runtime_null_kernel_arg.cpp
new file mode 100644
index 0000000..447e345
--- /dev/null
+++ b/utests/runtime_null_kernel_arg.cpp
@@ -0,0 +1,27 @@
+#include "utest_helper.hpp"
+
+void runtime_null_kernel_arg(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("null_kernel_arg");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), NULL);
+ OCL_SET_ARG(2, sizeof(cl_mem), NULL);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ for (uint32_t i = 0; i < n; ++i)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+ OCL_UNMAP_BUFFER(0);
+}
+
+
+MAKE_UTEST_FROM_FUNCTION(runtime_null_kernel_arg);
diff --git a/utests/setenv.sh.in b/utests/setenv.sh.in
new file mode 100644
index 0000000..b0f575f
--- /dev/null
+++ b/utests/setenv.sh.in
@@ -0,0 +1,7 @@
+#!/bin/sh
+#
+export OCL_PCM_PATH=@LOCAL_PCM_OBJECT_DIR@
+export OCL_PCH_PATH=@LOCAL_PCH_OBJECT_DIR@
+export OCL_KERNEL_PATH=@CMAKE_CURRENT_SOURCE_DIR@/../kernels
+export OCL_GBE_PATH=@LOCAL_GBE_OBJECT_DIR@
+export OCL_INTERP_PATH=@LOCAL_INTERP_OBJECT_DIR@
diff --git a/utests/sub_buffer.cpp b/utests/sub_buffer.cpp
new file mode 100644
index 0000000..d32fd65
--- /dev/null
+++ b/utests/sub_buffer.cpp
@@ -0,0 +1,135 @@
+#include "utest_helper.hpp"
+
+void sub_buffer_check(void)
+{
+ cl_int error;
+ cl_ulong max_alloc_size;
+ cl_uint address_align;
+ cl_mem main_buf;
+ cl_mem sub_buf;
+ char *main_buf_content;
+ char sub_buf_content[32];
+
+ error = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_alloc_size), &max_alloc_size, NULL);
+ OCL_ASSERT(error == CL_SUCCESS);
+ error = clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(address_align ), &address_align, NULL );
+ OCL_ASSERT(error == CL_SUCCESS);
+
+ main_buf_content = (char *)malloc(sizeof(char) * max_alloc_size);
+
+ for (cl_ulong i = 0; i < max_alloc_size; i++) {
+ main_buf_content[i] = rand() & 63;
+ }
+
+ main_buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, max_alloc_size, main_buf_content, &error);
+ OCL_ASSERT(error == CL_SUCCESS);
+
+ /* Test read sub buffer. */
+ for (cl_ulong sz = 64; sz < max_alloc_size; sz*=4) {
+ for (cl_ulong off = 0; off < max_alloc_size; off += 1234) {
+ cl_buffer_region region;
+ region.origin = off;
+ region.size = sz;
+
+ sub_buf = clCreateSubBuffer(main_buf, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error );
+
+ /* invalid size, should be failed. */
+ if(off + sz > max_alloc_size) {
+ OCL_ASSERT(error != CL_SUCCESS);
+ continue;
+ }
+ /* invalid align, should be failed. */
+ if(off & ((address_align/8)-1)) {
+ OCL_ASSERT(error != CL_SUCCESS);
+ continue;
+ }
+
+ OCL_ASSERT(error == CL_SUCCESS);
+
+ error = clEnqueueReadBuffer(queue, sub_buf, CL_TRUE, 0, 32, (void *)sub_buf_content, 0, NULL, NULL);
+ OCL_ASSERT(error == CL_SUCCESS);
+
+#if 0
+ printf("\nRead ########### Src buffer: \n");
+ for (int i = 0; i < 32; ++i)
+ printf(" %2.2u", main_buf_content[off + i]);
+
+ printf("\nRead ########### dst buffer: \n");
+ for (int i = 0; i < 32; ++i)
+ printf(" %2.2u", sub_buf_content[i]);
+ printf("\n");
+#endif
+ for (int i = 0; i < 32; ++i) {
+
+ if (main_buf_content[off + i] != sub_buf_content[i]) {
+ printf ("different index is %d\n", i);
+ OCL_ASSERT(0);
+ }
+ }
+
+ }
+ }
+
+
+ for (cl_ulong sz = 64; sz < max_alloc_size; sz*=4) {
+ for (cl_ulong off = 0; off < max_alloc_size; off += 1234) {
+ cl_buffer_region region;
+ region.origin = off;
+ region.size = sz;
+
+ sub_buf = clCreateSubBuffer(main_buf, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error );
+
+ /* invalid size, should be failed. */
+ if(off + sz > max_alloc_size) {
+ OCL_ASSERT(error != CL_SUCCESS);
+ continue;
+ }
+ /* invalid align, should be failed. */
+ if(off & (address_align/8-1)) {
+ OCL_ASSERT(error != CL_SUCCESS);
+ continue;
+ }
+
+ OCL_ASSERT(error == CL_SUCCESS);
+
+ for (int i = 0; i < 32; i++) {
+ sub_buf_content[i] = rand() & 63;
+ }
+
+ error = clEnqueueWriteBuffer(queue, main_buf, CL_TRUE, off, 32, sub_buf_content, 0, NULL, NULL);
+ OCL_ASSERT(error == CL_SUCCESS);
+
+ void * mapped_ptr = clEnqueueMapBuffer(queue, sub_buf, CL_TRUE, (cl_map_flags)( CL_MAP_READ | CL_MAP_WRITE ),
+ 0, 32, 0, NULL, NULL, &error );
+ OCL_ASSERT(error == CL_SUCCESS);
+
+#if 0
+ printf("\nMap ########### Src buffer: \n");
+ for (int i = 0; i < 32; ++i)
+ printf(" %2.2u", sub_buf_content[i]);
+
+ printf("\nMap ########### dst buffer: \n");
+ for (int i = 0; i < 32; ++i)
+ printf(" %2.2u", ((char *)mapped_ptr)[i]);
+ printf("\n");
+#endif
+ for (int i = 0; i < 32; i++) {
+
+ if (((char *)mapped_ptr)[i] != sub_buf_content[i]) {
+ printf ("different index is %d\n", i);
+ OCL_ASSERT(0);
+ }
+ }
+
+ error = clEnqueueUnmapMemObject(queue, sub_buf, mapped_ptr, 0, NULL, NULL );
+ OCL_ASSERT(error == CL_SUCCESS);
+
+ clReleaseMemObject(sub_buf);
+ }
+ }
+
+ clReleaseMemObject(main_buf);
+ free(main_buf_content);
+}
+
+MAKE_UTEST_FROM_FUNCTION(sub_buffer_check);
diff --git a/utests/test_printf.cpp b/utests/test_printf.cpp
new file mode 100644
index 0000000..3601574
--- /dev/null
+++ b/utests/test_printf.cpp
@@ -0,0 +1,18 @@
+#include "utest_helper.hpp"
+
+void test_printf(void)
+{
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("test_printf");
+ globals[0] = 16;
+ locals[0] = 16;
+ globals[1] = 4;
+ locals[1] = 4;
+ globals[2] = 8;
+ locals[2] = 2;
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_printf);
diff --git a/utests/utest.cpp b/utests/utest.cpp
new file mode 100644
index 0000000..b491cae
--- /dev/null
+++ b/utests/utest.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "utest.hpp"
+#include "utest_helper.hpp"
+#include <vector>
+#include <string>
+#include <iostream>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <cstring>
+#include <stdlib.h>
+#include <csignal>
+
+struct signalMap
+{
+ const char* signalName;
+ int signalNum;
+};
+
+using namespace std;
+vector<UTest> *UTest::utestList = NULL;
+// Initialize and declare statistics struct
+RStatistics UTest::retStatistics;
+
+void releaseUTestList(void) { delete UTest::utestList; }
+void runSummaryAtExit(void) {
+ // If case crashes, count it as fail, and accumulate finishrun
+ if(UTest::retStatistics.finishrun != UTest::utestList->size()) {
+ UTest::retStatistics.finishrun++;
+ UTest::retStatistics.failCount++;
+ }
+ printf("\nsummary:\n----------\n");
+ printf(" total: %zu\n",UTest::utestList->size());
+ printf(" run: %zu\n",UTest::retStatistics.finishrun);
+ printf(" pass: %zu\n",UTest::retStatistics.passCount);
+ printf(" fail: %zu\n",UTest::retStatistics.failCount);
+ printf(" pass rate: %f\n",1-(float)UTest::retStatistics.failCount/(float)UTest::utestList->size());
+
+ releaseUTestList();
+}
+
+void signalHandler( int signum )
+{
+ const char* name = NULL;
+
+ signalMap arr[] = {
+ {"SIGILL", SIGILL},
+ {"SIGFPE", SIGFPE},
+ {"SIGABRT", SIGABRT},
+ {"SIGBUS", SIGBUS},
+ {"SIGSEGV", SIGSEGV},
+ {"SIGHUP", SIGHUP},
+ {"SIGINT", SIGINT},
+ {"SIGQUIT", SIGQUIT},
+ {"SIGTERM", SIGTERM},
+ {NULL, -1}
+ };
+
+ for(int i=0; arr[i].signalNum != -1 && arr[i].signalName != NULL; i++) {
+ if(arr[i].signalNum == signum)
+
+ name = arr[i].signalName;
+ }
+
+ printf(" Interrupt signal (%s) received.", name);
+
+ exit(signum);
+}
+
+void catch_signal(void){
+ struct sigaction sa;
+ int sigs[] = {
+ SIGILL, SIGFPE, SIGABRT, SIGBUS,
+ SIGSEGV, SIGHUP, SIGINT, SIGQUIT,
+ SIGTERM
+ };
+
+ sa.sa_handler = signalHandler;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = SA_RESETHAND;
+
+ for(unsigned int i = 0; i < sizeof(sigs)/sizeof(sigs[0]); ++i) {
+ if (sigaction(sigs[i], &sa, NULL) == -1)
+ perror("Could not set signal handler");
+ }
+}
+
+UTest::UTest(Function fn, const char *name, bool haveIssue, bool needDestroyProgram)
+ : fn(fn), name(name), haveIssue(haveIssue), needDestroyProgram(needDestroyProgram) {
+
+ if (utestList == NULL) {
+ utestList = new vector<UTest>;
+
+ catch_signal();
+ atexit(runSummaryAtExit);
+ }
+ utestList->push_back(*this);
+}
+
+
+static bool strequal(const char *s1, const char *s2) {
+ if (strcmp(s1, s2) == 0) return true;
+ return false;
+}
+
+void UTest::do_run(struct UTest utest){
+ // Print function name
+ printf("%s()", utest.name);
+ fflush(stdout);
+
+ // Run one case in utestList, print result [SUCCESS] or [FAILED]
+ (utest.fn)();
+}
+
+void UTest::run(const char *name) {
+ if (name == NULL) return;
+ if (utestList == NULL) return;
+
+ for (; retStatistics.finishrun < utestList->size(); ++retStatistics.finishrun) {
+ const UTest &utest = (*utestList)[retStatistics.finishrun];
+ if (utest.name == NULL || utest.fn == NULL ) continue;
+ if (strequal(utest.name, name)) {
+ do_run(utest);
+ cl_kernel_destroy(true);
+ cl_buffer_destroy();
+ }
+ }
+}
+
+void UTest::runAll(void) {
+ if (utestList == NULL) return;
+
+ for (; retStatistics.finishrun < utestList->size(); ++retStatistics.finishrun) {
+ const UTest &utest = (*utestList)[retStatistics.finishrun];
+ if (utest.fn == NULL) continue;
+ do_run(utest);
+ cl_kernel_destroy(utest.needDestroyProgram);
+ cl_buffer_destroy();
+ }
+}
+
+void UTest::runAllNoIssue(void) {
+ if (utestList == NULL) return;
+
+ for (; retStatistics.finishrun < utestList->size(); ++retStatistics.finishrun) {
+ const UTest &utest = (*utestList)[retStatistics.finishrun];
+ if (utest.fn == NULL || utest.haveIssue) continue;
+ do_run(utest);
+ cl_kernel_destroy(utest.needDestroyProgram);
+ cl_buffer_destroy();
+ }
+}
+
+void UTest::listAllCases()
+{
+ if (utestList == NULL) return;
+ for (size_t i = 0; i < utestList->size(); ++i) {
+ const UTest &utest = (*utestList)[i];
+ if (utest.fn == NULL) continue;
+ std::cout << utest.name << std::endl;
+ }
+}
diff --git a/utests/utest.hpp b/utests/utest.hpp
new file mode 100644
index 0000000..375ef70
--- /dev/null
+++ b/utests/utest.hpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * Provides all unit test capabilites. It is rather rudimentary but it should
+ * do the job
+ */
+#ifndef __UTEST_UTEST_HPP__
+#define __UTEST_UTEST_HPP__
+
+#include "utest_exception.hpp"
+#include <vector>
+#include <iostream>
+
+/*! struct for statistics */
+struct RStatistics
+{
+ size_t passCount;
+ size_t failCount;
+ size_t finishrun;
+};
+
+/*! Quick and dirty unit test system with registration */
+struct UTest
+{
+ /*! A unit test function to run */
+ typedef void (*Function) (void);
+ /*! Empty test */
+ UTest(void);
+ /*! Build a new unit test and append it to the unit test list */
+ UTest(Function fn, const char *name, bool haveIssue = false, bool needDestroyProgram = true);
+ /*! Function to execute */
+ Function fn;
+ /*! Name of the test */
+ const char *name;
+ /*! Indicate whether current test cases has issue to be fixes */
+ bool haveIssue;
+ /*! Indicate whether destroy kernels/program. */
+ bool needDestroyProgram;
+ /*! The tests that are registered */
+ static std::vector<UTest> *utestList;
+ /*! Run the test with the given name */
+ static void run(const char *name);
+ /*! Run all the tests without known issue*/
+ static void runAllNoIssue(void);
+ /*! Run all the tests */
+ static void runAll(void);
+ /*! List all test cases */
+ static void listAllCases(void);
+ /*! Statistics struct */
+ static RStatistics retStatistics;
+ /*! Do run a test case actually */
+ static void do_run(struct UTest utest);
+};
+
+/*! Register a new unit test */
+#define UTEST_REGISTER(FN) static const UTest __##FN##__(FN, #FN);
+
+#define MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(FN, KEEP_PROGRAM) \
+ static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
+ static const UTest __##FN##__(__ANON__##FN##__, #FN, false, !(KEEP_PROGRAM));
+
+
+/*! Turn a function into a unit test */
+#define MAKE_UTEST_FROM_FUNCTION(FN) \
+ static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
+ static const UTest __##FN##__(__ANON__##FN##__, #FN);
+
+/*! Register a test case which has issue to be fixed */
+#define MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(FN) \
+ static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
+ static const UTest __##FN##__(__ANON__##FN##__, #FN, true);
+
+/*! Turn a function into a unit performance test */
+#define MAKE_BENCHMARK_FROM_FUNCTION(FN) \
+ static void __ANON__##FN##__(void) { BENCHMARK(FN()); } \
+ static const UTest __##FN##__(__ANON__##FN##__, #FN);
+
+/*! No assert is expected */
+#define UTEST_EXPECT_SUCCESS(EXPR) \
+ do { \
+ try { \
+ EXPR; \
+ std::cout << " [SUCCESS]" << std::endl; \
+ UTest::retStatistics.passCount += 1; \
+ } \
+ catch (Exception e) { \
+ std::cout << " [FAILED]" << std::endl; \
+ std::cout << " " << e.what() << std::endl; \
+ UTest::retStatistics.failCount++; \
+ } \
+ } while (0)
+
+#define UTEST_EXPECT_FAILED(EXPR) \
+ do { \
+ try { \
+ EXPR; \
+ std::cout << " [FAILED]" << std::endl; \
+ retStatistics.failCount++; \
+ } \
+ catch (gbe::Exception e) { \
+ std::cout << " [SUCCESS]" << std::endl; \
+ retStatistics.passCount++; \
+ } \
+ } while (0)
+
+#define BENCHMARK(EXPR) \
+ do { \
+ int ret = 0; \
+ try { \
+ ret = EXPR; \
+ printf(" %s [SUCCESS] [Result: %d]\n", #EXPR, ret);\
+ } \
+ catch (Exception e) { \
+ std::cout << " " << #EXPR << " [FAILED]" << std::endl; \
+ std::cout << " " << e.what() << std::endl; \
+ } \
+ } while (0)
+#endif /* __UTEST_UTEST_HPP__ */
+
diff --git a/utests/utest_assert.cpp b/utests/utest_assert.cpp
new file mode 100644
index 0000000..f3b9a00
--- /dev/null
+++ b/utests/utest_assert.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file assert.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "utest_assert.hpp"
+#include "utest_exception.hpp"
+#include <cassert>
+#include <cstdlib>
+
+void onFailedAssertion(const char *msg, const char *file, const char *fn, int line)
+{
+ char lineString[256];
+ sprintf(lineString, "%i", line);
+ assert(msg != NULL && file != NULL && fn != NULL);
+ const std::string str = "Error: "
+ + std::string(msg) + "\n at file "
+ + std::string(file)
+ + ", function " + std::string(fn)
+ + ", line " + std::string(lineString);
+ throw Exception(str);
+}
+
diff --git a/utests/utest_assert.hpp b/utests/utest_assert.hpp
new file mode 100644
index 0000000..f93f9ac
--- /dev/null
+++ b/utests/utest_assert.hpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file assert.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __OCL_ASSERT_HPP__
+#define __OCL_ASSERT_HPP__
+
+/*! To ensure that condition truth. Optional message is supported */
+void onFailedAssertion(const char *msg, const char *file, const char *fn, int line);
+
+#define OCL_ASSERT(EXPR) \
+ do { \
+ if (!(EXPR)) \
+ onFailedAssertion(#EXPR, __FILE__, __FUNCTION__, __LINE__); \
+ } while (0)
+
+#define OCL_ASSERTM(EXPR, MSG) \
+ do { \
+ if (!(EXPR)) \
+ onFailedAssertion(MSG, __FILE__, __FUNCTION__, __LINE__); \
+ } while (0)
+
+#endif /* __OCL_ASSERT_HPP__ */
+
diff --git a/utests/utest_error.c b/utests/utest_error.c
new file mode 100644
index 0000000..4582a33
--- /dev/null
+++ b/utests/utest_error.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_error.h"
+#include "CL/cl.h"
+
+const char *err_msg[] = {
+ [-CL_SUCCESS] = "CL_SUCCESS",
+ [-CL_DEVICE_NOT_FOUND] = "CL_DEVICE_NOT_FOUND",
+ [-CL_DEVICE_NOT_AVAILABLE] = "CL_DEVICE_NOT_AVAILABLE",
+ [-CL_COMPILER_NOT_AVAILABLE] = "CL_COMPILER_NOT_AVAILABLE",
+ [-CL_MEM_OBJECT_ALLOCATION_FAILURE] = "CL_MEM_OBJECT_ALLOCATION_FAILURE",
+ [-CL_OUT_OF_RESOURCES] = "CL_OUT_OF_RESOURCES",
+ [-CL_OUT_OF_HOST_MEMORY] = "CL_OUT_OF_HOST_MEMORY",
+ [-CL_PROFILING_INFO_NOT_AVAILABLE] = "CL_PROFILING_INFO_NOT_AVAILABLE",
+ [-CL_MEM_COPY_OVERLAP] = "CL_MEM_COPY_OVERLAP",
+ [-CL_IMAGE_FORMAT_MISMATCH] = "CL_IMAGE_FORMAT_MISMATCH",
+ [-CL_IMAGE_FORMAT_NOT_SUPPORTED] = "CL_IMAGE_FORMAT_NOT_SUPPORTED",
+ [-CL_BUILD_PROGRAM_FAILURE] = "CL_BUILD_PROGRAM_FAILURE",
+ [-CL_MAP_FAILURE] = "CL_MAP_FAILURE",
+ [-CL_MISALIGNED_SUB_BUFFER_OFFSET] = "CL_MISALIGNED_SUB_BUFFER_OFFSET",
+ [-CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST] = "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST",
+ [-CL_INVALID_VALUE] = "CL_INVALID_VALUE",
+ [-CL_INVALID_DEVICE_TYPE] = "CL_INVALID_DEVICE_TYPE",
+ [-CL_INVALID_PLATFORM] = "CL_INVALID_PLATFORM",
+ [-CL_INVALID_DEVICE] = "CL_INVALID_DEVICE",
+ [-CL_INVALID_CONTEXT] = "CL_INVALID_CONTEXT",
+ [-CL_INVALID_QUEUE_PROPERTIES] = "CL_INVALID_QUEUE_PROPERTIES",
+ [-CL_INVALID_COMMAND_QUEUE] = "CL_INVALID_COMMAND_QUEUE",
+ [-CL_INVALID_HOST_PTR] = "CL_INVALID_HOST_PTR",
+ [-CL_INVALID_MEM_OBJECT] = "CL_INVALID_MEM_OBJECT",
+ [-CL_INVALID_IMAGE_FORMAT_DESCRIPTOR] = "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
+ [-CL_INVALID_IMAGE_SIZE] = "CL_INVALID_IMAGE_SIZE",
+ [-CL_INVALID_SAMPLER] = "CL_INVALID_SAMPLER",
+ [-CL_INVALID_BINARY] = "CL_INVALID_BINARY",
+ [-CL_INVALID_BUILD_OPTIONS] = "CL_INVALID_BUILD_OPTIONS",
+ [-CL_INVALID_PROGRAM] = "CL_INVALID_PROGRAM",
+ [-CL_INVALID_PROGRAM_EXECUTABLE] = "CL_INVALID_PROGRAM_EXECUTABLE",
+ [-CL_INVALID_KERNEL_NAME] = "CL_INVALID_KERNEL_NAME",
+ [-CL_INVALID_KERNEL_DEFINITION] = "CL_INVALID_KERNEL_DEFINITION",
+ [-CL_INVALID_KERNEL] = "CL_INVALID_KERNEL",
+ [-CL_INVALID_ARG_INDEX] = "CL_INVALID_ARG_INDEX",
+ [-CL_INVALID_ARG_VALUE] = "CL_INVALID_ARG_VALUE",
+ [-CL_INVALID_ARG_SIZE] = "CL_INVALID_ARG_SIZE",
+ [-CL_INVALID_KERNEL_ARGS] = "CL_INVALID_KERNEL_ARGS",
+ [-CL_INVALID_WORK_DIMENSION] = "CL_INVALID_WORK_DIMENSION",
+ [-CL_INVALID_WORK_GROUP_SIZE] = "CL_INVALID_WORK_GROUP_SIZE",
+ [-CL_INVALID_WORK_ITEM_SIZE] = "CL_INVALID_WORK_ITEM_SIZE",
+ [-CL_INVALID_GLOBAL_OFFSET] = "CL_INVALID_GLOBAL_OFFSET",
+ [-CL_INVALID_EVENT_WAIT_LIST] = "CL_INVALID_EVENT_WAIT_LIST",
+ [-CL_INVALID_EVENT] = "CL_INVALID_EVENT",
+ [-CL_INVALID_OPERATION] = "CL_INVALID_OPERATION",
+ [-CL_INVALID_GL_OBJECT] = "CL_INVALID_GL_OBJECT",
+ [-CL_INVALID_BUFFER_SIZE] = "CL_INVALID_BUFFER_SIZE",
+ [-CL_INVALID_MIP_LEVEL] = "CL_INVALID_MIP_LEVEL",
+ [-CL_INVALID_GLOBAL_WORK_SIZE] = "CL_INVALID_GLOBAL_WORK_SIZE",
+ [-CL_INVALID_PROPERTY] = "CL_INVALID_PROPERTY"
+};
+const size_t err_msg_n = sizeof(err_msg) / sizeof(err_msg[0]);
+
diff --git a/utests/utest_error.h b/utests/utest_error.h
new file mode 100644
index 0000000..2da29b0
--- /dev/null
+++ b/utests/utest_error.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __UTEST_ERROR_H__
+#define __UTEST_ERROR_H__
+#include <stdlib.h>
+extern const char *err_msg[];
+extern const size_t err_msg_n;
+#endif /* __UTEST_ERROR_H__ */
+
diff --git a/utests/utest_exception.hpp b/utests/utest_exception.hpp
new file mode 100644
index 0000000..e19141f
--- /dev/null
+++ b/utests/utest_exception.hpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file exception.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __UTEST_EXCEPTION_HPP__
+#define __UTEST_EXCEPTION_HPP__
+
+#include <string>
+#include <exception>
+
+/*! Exception are only used while using unit tests */
+class Exception : public std::exception
+{
+public:
+ Exception(const std::string &msg) throw() : msg(msg) {}
+ Exception(const Exception &other) throw() : msg(other.msg) {}
+ ~Exception(void) throw() {}
+ Exception &operator= (const Exception &other) throw() {
+ this->msg = other.msg;
+ return *this;
+ }
+ const char *what(void) const throw() { return msg.c_str(); }
+private:
+ std::string msg; //!< String message
+};
+
+#endif /* __UTEST_EXCEPTION_HPP__ */
+
diff --git a/utests/utest_file_map.cpp b/utests/utest_file_map.cpp
new file mode 100644
index 0000000..55b7771
--- /dev/null
+++ b/utests/utest_file_map.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_file_map.hpp"
+#include "CL/cl.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+
+int
+cl_file_map_init(cl_file_map_t *fm)
+{
+ assert(fm);
+ memset(fm,0,sizeof(*fm));
+ return CL_SUCCESS;
+}
+
+void
+cl_file_map_destroy(cl_file_map_t *fm)
+{
+ if (fm->mapped) {
+ munmap(fm->start, fm->size);
+ fm->start = fm->stop = 0;
+ fm->size = 0;
+ fm->mapped = CL_FALSE;
+ }
+ if(fm->fd) {
+ close(fm->fd);
+ fm->fd = 0;
+ }
+ free(fm->name);
+ memset(fm,0,sizeof(*fm));
+}
+
+void
+cl_file_map_delete(cl_file_map_t *fm)
+{
+ if (fm == NULL)
+ return;
+ cl_file_map_destroy(fm);
+ free(fm);
+}
+
+cl_file_map_t*
+cl_file_map_new(void)
+{
+ cl_file_map_t *fm = NULL;
+
+ if ((fm = (cl_file_map_t *) calloc(1, sizeof(cl_file_map_t))) == NULL)
+ goto error;
+ if (cl_file_map_init(fm) != CL_SUCCESS)
+ goto error;
+
+exit:
+ return fm;
+error:
+ cl_file_map_delete(fm);
+ fm = NULL;
+ goto exit;
+}
+
+int
+cl_file_map_open(cl_file_map_t *fm, const char *name)
+{
+ int err = CL_FILE_MAP_SUCCESS;
+
+ /* Open the file */
+ fm->fd = open(name, O_RDONLY);
+ if(fm->fd < 0) {
+ err = CL_FILE_MAP_FILE_NOT_FOUND;
+ goto error;
+ }
+ if ((fm->name = (char*) calloc(strlen(name) + 1, sizeof(char))) == NULL)
+ goto error;
+ sprintf(fm->name, "%s", name);
+
+ /* Map it */
+ fm->size = lseek(fm->fd, 0, SEEK_END);
+ lseek(fm->fd, 0, SEEK_SET);
+ fm->start = mmap(0, fm->size, PROT_READ, MAP_SHARED, fm->fd, 0);
+ if(fm->start == NULL) {
+ err = CL_FILE_MAP_FAILED_TO_MMAP;
+ goto error;
+ }
+
+ fm->stop = ((char *) fm->start) + fm->size;
+ fm->mapped = CL_TRUE;
+
+exit:
+ return err;
+error:
+ cl_file_map_destroy(fm);
+ goto exit;
+}
+
diff --git a/utests/utest_file_map.hpp b/utests/utest_file_map.hpp
new file mode 100644
index 0000000..83d79ea
--- /dev/null
+++ b/utests/utest_file_map.hpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file assert.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __UTEST_FILE_MAP_HPP__
+#define __UTEST_FILE_MAP_HPP__
+
+#include "CL/cl.h"
+#include <cstdlib>
+
+/* Map a file into memory for direct / cached / simple accesses */
+typedef struct cl_file_map {
+ void *start, *stop; /* First character and last one */
+ size_t size; /* Total size of the file */
+ int fd; /* Posix file descriptor */
+ cl_bool mapped; /* Indicate if a file was mapped or not */
+ char *name; /* File itself */
+} cl_file_map_t;
+
+/* Report information about an open temptative */
+enum {
+ CL_FILE_MAP_SUCCESS = 0,
+ CL_FILE_MAP_FILE_NOT_FOUND = 1,
+ CL_FILE_MAP_FAILED_TO_MMAP = 2
+};
+
+/* Allocate and Initialize a file mapper (but do not map any file */
+extern cl_file_map_t *cl_file_map_new(void);
+
+/* Initialize a file mapper (but do not map any file */
+extern int cl_file_map_init(cl_file_map_t *fm);
+
+/* Destroy but do not deallocate a file map */
+extern void cl_file_map_destroy(cl_file_map_t *fm);
+
+/* Destroy and free it */
+extern void cl_file_map_delete(cl_file_map_t *fm);
+
+/* Open a file and returns the error code */
+extern int cl_file_map_open(cl_file_map_t *fm, const char *name);
+
+static inline cl_bool
+cl_file_map_is_mapped(const cl_file_map_t *fm) {
+ return fm->mapped;
+}
+
+static inline const char*
+cl_file_map_begin(const cl_file_map_t *fm) {
+ return (const char*) fm->start;
+}
+
+static inline const char*
+cl_file_map_end(const cl_file_map_t *fm) {
+ return (const char*) fm->stop;
+}
+
+static inline size_t
+cl_file_map_size(const cl_file_map_t *fm) {
+ return fm->size;
+}
+
+#endif /* __UTEST_FILE_MAP_HPP__ */
+
diff --git a/utests/utest_generator.py b/utests/utest_generator.py
new file mode 100644
index 0000000..7522001
--- /dev/null
+++ b/utests/utest_generator.py
@@ -0,0 +1,387 @@
+#!/usr/bin/python
+import os,sys,re
+
+FLT_MAX_POSI='0x1.fffffep127f'
+FLT_MIN_NEGA='-0x1.fffffep127f'
+FLT_MIN_POSI='0x1.0p-126f'
+FLT_MAX_NEGA='-0x1.0p-126f'
+
+paraTypeList={'float':'%e','int':'%d','double':'%lf','uint':'%d','string':'%s'}
+
+
+def ulpUnit(ulpSize):
+ return re.findall(r"([a-zA-Z_]+)",ulpSize)[0]
+
+def ulpNum(ulpSize):
+ return re.findall(r"([0-9]+)",ulpSize)[0]
+
+def udebug(ulpSize,returnType):
+ #ulpUnit=re.findall(r"([a-zA-Z_]+)",ulpSize)[0]
+ #ulpNum=re.findall(r"([0-9]+)",ulpSize)[0]
+ text='''
+ static const char* INFORNAN;
+ static %s ULPSIZE, ULPSIZE_FACTOR;
+
+ const char* env_strict = getenv("OCL_STRICT_CONFORMANCE");
+
+ if (env_strict == NULL || strcmp(env_strict, "0") == 0)
+ ULPSIZE_FACTOR = 1000;
+ else
+ ULPSIZE_FACTOR = 1;
+
+ if (isinf(cpu_data[index])){
+ INFORNAN="INF";
+ }
+ else if (isnan(cpu_data[index])){
+ INFORNAN="NAN";
+ }
+ else{
+ ULPSIZE=ULPSIZE_FACTOR * cl_%s((cpu_data[index] == 0) ? 1 : cpu_data[index])
+ * ((ULPSIZE_FACTOR == 1) ? %s : ( (%s == 0) ? 1 : %s));
+ }
+
+#if udebug
+ if (isinf(cpu_data[index])){
+ if (isinf(gpu_data[index]))
+ printf("%s expect:%s\\n", log, INFORNAN);
+ else
+ printf_c("%s expect:%s\\n", log, INFORNAN);
+ }
+ else if (isnan(cpu_data[index])){
+ if (isnan(gpu_data[index]))
+ printf("%s expect:%s\\n", log, INFORNAN);
+ else
+ printf_c("%s expect:%s\\n", log, INFORNAN);
+ }
+ else if (diff <= ULPSIZE){
+ printf("%s expect:%s\\n", log, ULPSIZE);
+ }
+ else
+ printf_c("%s expect:%s\\n", log, ULPSIZE);
+#else
+ if (isinf(cpu_data[index])){
+ sprintf(log, "%s expect:%s\\n", log, INFORNAN);
+ OCL_ASSERTM(isinf(gpu_data[index]),log);
+ }
+ else if (isnan(cpu_data[index])){
+ sprintf(log, "%s expect:%s\\n", log, INFORNAN);
+ OCL_ASSERTM(isnan(gpu_data[index]),log);
+ }
+ else{
+ sprintf(log, "%s expect:%s\\n", log, ULPSIZE);
+ OCL_ASSERTM(fabs(gpu_data[index]-cpu_data[index]) <= ULPSIZE, log);
+ }
+#endif
+ }
+}\n'''%(returnType,\
+ ulpUnit(ulpSize),ulpNum(ulpSize),\
+ ulpNum(ulpSize), ulpNum(ulpSize),\
+ paraTypeList['string'],paraTypeList['string'],\
+ paraTypeList['string'],paraTypeList['string'],\
+ paraTypeList['string'],paraTypeList['string'],\
+ paraTypeList['string'],paraTypeList['string'],\
+ paraTypeList['string'],paraTypeList['%s'%(returnType)],\
+ paraTypeList['string'],paraTypeList['%s'%(returnType)],\
+ paraTypeList['string'],paraTypeList['string'],\
+ paraTypeList['string'],paraTypeList['string'],\
+ paraTypeList['string'],paraTypeList['%s'%(returnType)])
+
+ return text
+
+def gene2ValuesLoop(values1,values2,inputValues):
+ values2=values2+inputValues*len(inputValues)
+
+ for i in inputValues:
+ for j in range(0,len(inputValues)):
+ values1 += [i]
+
+ return values1,values2
+
+def gene3ValuesLoop(values1,values2,values3,inputValues):
+ for i in inputValues:
+ for j in range(0,len(inputValues)):
+ for k in range(0,len(inputValues)):
+ values1 += [i]
+
+ for i in inputValues:
+ for j in inputValues:
+ for k in range(0,len(inputValues)):
+ values2 += [j]
+
+ values3=inputValues*(len(inputValues)**2)
+ return values1,values2,values3
+
+class func:
+ """ This class will define all needed instance attribute in fundation a c programing file. """
+
+ def __init__(self,name,cpuFuncName,inputType,outputType,values,ulp, cpu_func=''):
+ self.funcName = name
+ self.cpuFuncName = cpuFuncName
+ self.fileName = 'builtin_'+name
+ self.inputtype = inputType
+ self.outputtype = outputType
+ self.values = values
+ self.ulp = ulp
+ self.cpufunc=cpu_func
+ self.cpplines = []
+
+#####cpp file required information:
+ self.Head='''/*
+This file is generated by utest_generator.py.
+Usually you need NOT modify this file manually.
+But when any bug occured, you can change the value of udebug from 0 to 1,
+which can print more values and information to assist debuging the issue.
+*/
+
+#include "utest_helper.hpp"
+#include <stdio.h>
+#include <math.h>
+#include <algorithm>
+#include <string.h>
+
+#define udebug 0
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define INT_ULP 0
+
+#define printf_c(...) \\
+{\\
+ printf("\\033[1m\\033[40;31m");\\
+ printf( __VA_ARGS__ );\\
+ printf("\\033[0m");\\
+}
+'''
+ #########Execute class itself
+ self.geneProcess()
+
+#####Computer vector && argument type:
+ def argtype(self,paraN,index):
+ return re.findall(r"[a-zA-Z_]+",self.inputtype[paraN][index])[0]
+
+ def argvector(self,paraN,index):
+ vector=re.findall(r"[0-9]+",self.inputtype[paraN][index])
+ if vector:
+ vector=vector[0]
+ else:
+ vector=1
+ return vector
+
+ def returnVector(self,index):
+ returnVector=re.findall(r"[0-9]+",self.outputtype[index])
+ if returnVector:
+ returnVector=returnVector[0]
+ else:
+ returnVector=1
+ return returnVector
+
+ def retType(self,index):
+ return re.findall("[a-zA-Z_]+",self.outputtype[index])[0]
+
+ def inputNumFormat(self,paraN,index):
+ return paraTypeList['%s'%(self.argtype(paraN,index))]
+
+ def outputNumFormat(self,index):
+ return paraTypeList['%s'%(self.retType(index))]
+
+#####Cpu values analyse
+ def GenInputValues(self,index):
+ #namesuffix=self.inputtype[0][index]
+ for i in range(0,self.values.__len__()):
+ self.cpplines += [ "const %s input_data%d[] = {%s};" %(self.argtype(i,index),i+1,str(self.values[i]).strip('[]').replace('\'','')) ]
+ self.cpplines += [ "const int count_input = sizeof(input_data1) / sizeof(input_data1[0]);" ]
+ self.cpplines += [ "const int vector = %s;\n"%(self.argvector(self.inputtype.__len__()-1,index)) ]
+
+#####Cpu Function
+ def GenCpuCompilerMath(self,index):
+ #namesuffix=self.inputtype[0][index]
+ defline='static void cpu_compiler_math(%s *dst, '%(self.retType(index))
+ cpufunargs='('
+ funcline = ['{']
+ vectorargs=[]
+
+ if (self.returnVector(index) == 1 and self.argvector(0,index) != 1):
+ for i in range(0,self.values.__len__()):
+ defline += 'const %s *src%d'%(self.argtype(i,index),i+1)
+ defline += ( i == self.values.__len__()-1 ) and ')' or ','
+ vectorargs.append('(')
+ for i in range(0,self.values.__len__()):
+ for j in range(0,self.vector):
+ vectorargs += "x%d%d"%(i+1,j+1)
+ vectorargs += ( j == self.vector-1 ) and ');' or ','
+ funcline += [" const %s x%d%d = *(src%d+%d);"%(self.argtype(i,index),i+1,j+1,i+1,j)]
+
+ return 0
+
+ for i in range(0,self.values.__len__()):
+ defline += 'const %s *src%d'%(self.argtype(i,index),i+1)
+ defline += ( i == self.values.__len__()-1 ) and ')' or ','
+ cpufunargs += "x%d"%(i+1)
+ cpufunargs += ( i == self.values.__len__()-1 ) and ');' or ','
+ funcline += [" const %s x%d = *src%d;"%(self.argtype(i,index),i+1,i+1)]
+
+ funcline += [ " dst[0] = %s%s"%(self.cpuFuncName, cpufunargs) ]
+ funcline += [ '}']
+
+ funcline = [defline] + funcline
+
+ self.cpplines += funcline
+# self.writeCPP( '\n'.join(funcline), 'a', namesuffix)
+
+ def writeCPP(self,content,authority,namesuffix):
+ file_object = open("generated/%s_%s.cpp"%(self.fileName,namesuffix),authority)
+ file_object.writelines(content)
+ file_object.close()
+
+ def writeCL(self,content,authority,namesuffix):
+ file_object = open(os.getcwd()+"/../kernels/%s_%s.cl"%(self.fileName,namesuffix),authority)
+ file_object.writelines(content)
+ file_object.close()
+
+ def nameForCmake(self,content,namesuffix):
+ print("generated/%s_%s.cpp"%(self.fileName,namesuffix)),
+
+ def utestFunc(self,index):
+ funcLines=[]
+ namesuffix=self.inputtype[0][index]
+ funcline=[]
+ funchead='''
+static void %s_%s(void)
+{
+ int index;
+ %s gpu_data[count_input] = {0}, cpu_data[count_input] = {0}, diff=0.0;
+ char log[1024] = {0};
+
+ OCL_CREATE_KERNEL(\"%s_%s\");
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * sizeof(%s), NULL);
+
+ globals[0] = count_input;
+ locals[0] = 1;
+ '''%(self.fileName,namesuffix,\
+ self.retType(index),\
+ self.fileName, namesuffix,\
+ self.retType(index))
+
+ funcline += [funchead]
+ for i in range(1,self.values.__len__()+1):
+ funcline += [" OCL_CREATE_BUFFER(buf[%d], CL_MEM_READ_WRITE, count_input * sizeof(%s), NULL);"%(i,self.argtype(i-1,index))]
+ funcline += [" clEnqueueWriteBuffer( queue, buf[%d], CL_TRUE, 0, count_input * sizeof(%s), input_data%d, 0, NULL, NULL);"%(i,self.argtype(i-1,index),i)]
+
+ funcline += [" OCL_CREATE_BUFFER(buf[%d], CL_MEM_READ_WRITE, sizeof(int), NULL);"%(self.inputtype.__len__()+1)]
+ funcline += [" clEnqueueWriteBuffer( queue, buf[%d], CL_TRUE, 0, sizeof(int), &vector, 0, NULL, NULL);"%(self.inputtype.__len__()+1)]
+
+ #0=output 1=input1 2=input2 ... len+2=output
+ for i in range(0,self.values.__len__()+2):
+ funcline += [" OCL_SET_ARG(%d, sizeof(cl_mem), &buf[%d]);"%(i,i)]
+
+ funcrun='''
+ // Run the kernel:
+ OCL_NDRANGE( 1 );
+ clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(%s) * count_input, gpu_data, 0, NULL, NULL);
+'''%(self.inputtype.__len__()+1)
+ funcline += [ funcrun ]
+
+ funcsprintfa=' sprintf(log, \"'
+ funcsprintfb=''
+ if (self.returnVector(index) == 1 and self.argvector(0,index) != 1):
+ funccompare='''
+ for (index = 0; index < count_input/vector; index++)
+ {
+ cpu_compiler_math( cpu_data + index, '''
+ else:
+ funccompare='''
+ for (index = 0; index < count_input; index++)
+ {
+ cpu_compiler_math( cpu_data + index,'''
+
+ for i in range(0,self.values.__len__()):
+ funccompare += " input_data%d + index"%(i+1)
+ funccompare += (self.values.__len__() - 1 == i) and ');' or ','
+
+ funcsprintfa += "input_data%d:"%(i+1)
+ funcsprintfa += "%s "%(self.inputNumFormat(i,index))
+ funcsprintfb += " input_data%d[index],"%(i+1)
+
+ funcline += [ funccompare ]
+
+ funcsprintfa += " -> gpu:%s cpu:%s diff:%s\","%(self.outputNumFormat(index),self.outputNumFormat(index),self.outputNumFormat(index))#,self.outputNumFormat(index))
+ funcsprintfb += " gpu_data[index], cpu_data[index], diff);"#%(ulpUnit(self.ulp),ulpNum(self.ulp))
+
+ #funcdiff = " diff = fabs((gpu_data[index]-cpu_data[index])"
+ #funcdiff += (self.retType(index) == "int") and ');' or '/(cpu_data[index]>1?cpu_data[index]:1));'
+ valuejudge = " if (std::fpclassify(gpu_data[index]) == FP_SUBNORMAL){ gpu_data[index] = 0; }\n"
+ valuejudge += " if (std::fpclassify(cpu_data[index]) == FP_SUBNORMAL){ cpu_data[index] = 0; }\n"
+ funcdiff = " diff = fabs((gpu_data[index]-cpu_data[index]));"
+ funcline += [ valuejudge ]
+ funcline += [ funcdiff ]
+ funcline += [ funcsprintfa + funcsprintfb ]
+
+ self.cpplines += funcline
+
+ self.cpplines += [ udebug(self.ulp,self.retType(index)) ]
+ self.cpplines += [ "MAKE_UTEST_FROM_FUNCTION(%s_%s)"%(self.fileName,namesuffix) ]
+
+ def genCL(self,index):
+ namesuffix=self.inputtype[0][index]
+ clLine = []
+ clhead = '__kernel void %s_%s(__global %s *dst, '%(self.fileName,namesuffix,self.retType(index))
+ clvalueDef=''
+ clcomputer=''
+ tmp=''
+
+ for i in range(0,self.values.__len__()):
+ clhead += ' __global %s *src%d,'%(self.argtype(i,index),i+1)
+ clvalueDef += ' %s x%d = (%s) ('%(self.inputtype[i][index],i+1,self.inputtype[i][index])
+ tmp = 'src%d[i * (*vector) + '%(i+1)
+ for j in range(0,int(self.argvector(i,index))):
+ clvalueDef += tmp + ((int(self.argvector(i-1,index)) == j+1 ) and '%d]);\n'%(j) or '%d],'%(j))
+ clcomputer += (self.values.__len__() == i+1) and 'x%d);'%(i+1) or 'x%d,'%(i+1)
+
+ clhead += ' __global int *vector) {\n'
+ clhead += ' int i = get_global_id(0);'
+ clLine += [ clhead ]
+ clLine += [ clvalueDef ]
+ clLine += [ ' %s ret;'%(self.outputtype[index]) ]
+ clLine += [ ' ret = %s('%(self.funcName) + clcomputer ]
+
+ if (int(self.returnVector(index)) == 1):
+ clLine += [ ' dst[i] = ret;' ]
+ else:
+ for i in range(0,int(self.returnVector(index))):
+ clLine += [ ' dst[i * (*vector) + %d] = ret[%d];'%(i,i) ]
+ clLine += [ '};' ]
+
+ self.writeCL('\n'.join(clLine),'w',namesuffix)
+
+ def geneProcess(self):
+ for i in range(0,self.inputtype[0].__len__()):
+##########Write Cpp file
+ namesuffix=self.inputtype[0][i]
+ self.cpplines = []
+ #The head:
+ self.cpplines += [self.Head]
+
+ #Parameters:
+ self.GenInputValues(i)
+
+ #cpu function generator:
+ self.cpplines += [self.cpufunc]
+
+ #Cpu function:
+ self.GenCpuCompilerMath(i)
+
+ #utest function
+ self.utestFunc(i)
+
+ #kernel cl
+ self.genCL(i)
+
+ #CMakelists.txt
+ self.nameForCmake(self.fileName,namesuffix)
+
+ self.writeCPP( '\n'.join(self.cpplines) ,'w',namesuffix)
+#########End
+
+#def main():
+#
+#if __name__ == "__main__":
+# main()
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
new file mode 100644
index 0000000..cb4dd66
--- /dev/null
+++ b/utests/utest_helper.cpp
@@ -0,0 +1,674 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_file_map.hpp"
+#include "utest_helper.hpp"
+#include "utest_error.h"
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+
+#include <cstdio>
+#include <cstdint>
+#include <cstring>
+#include <cassert>
+#include <cmath>
+
+#define FATAL(...) \
+do { \
+ fprintf(stderr, "error: "); \
+ fprintf(stderr, __VA_ARGS__); \
+ fprintf(stderr, "\n");\
+ assert(0); \
+ exit(-1); \
+} while (0)
+
+#define FATAL_IF(COND, ...) \
+do { \
+ if (COND) FATAL(__VA_ARGS__); \
+} while (0)
+
+cl_platform_id platform = NULL;
+cl_device_id device = NULL;
+cl_context ctx = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_command_queue queue = NULL;
+cl_mem buf[MAX_BUFFER_N] = {};
+void *buf_data[MAX_BUFFER_N] = {};
+size_t globals[3] = {};
+size_t locals[3] = {};
+
+#ifdef HAS_EGL
+Display *xDisplay;
+EGLDisplay eglDisplay;
+EGLContext eglContext = NULL;
+EGLSurface eglSurface;
+Window xWindow;
+
+void cl_ocl_destroy_egl_window() {
+ eglMakeCurrent(eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
+ eglDestroyContext(eglDisplay, eglContext);
+ eglDestroySurface(eglDisplay, eglSurface);
+ XDestroyWindow(xDisplay, xWindow);
+ XCloseDisplay(xDisplay);
+}
+
+bool init_egl_window(int width, int height) {
+ XSetWindowAttributes swa;
+ Window win, root;
+ EGLint attr[] = { // some attributes to set up our egl-interface
+ EGL_BUFFER_SIZE, 16,
+ EGL_RENDERABLE_TYPE,
+ EGL_OPENGL_BIT,
+ EGL_NONE
+ };
+ //// egl-contexts collect all state descriptions needed required for operation
+ EGLint ctxattr[] = {
+ #if 0
+ EGL_CONTEXT_CLIENT_VERSION, 2,
+ #endif
+ EGL_NONE
+ };
+
+ EGLConfig ecfg;
+ EGLint numConfig;
+
+ eglContext = EGL_NO_CONTEXT;
+ xDisplay = XOpenDisplay(NULL);
+ if (xDisplay == NULL) {
+ fprintf(stderr, "Failed to open DISPLAY.\n");
+ return false;
+ }
+ root = DefaultRootWindow(xDisplay);
+ swa.event_mask = ExposureMask | PointerMotionMask | KeyPressMask;
+
+ win = XCreateWindow(
+ xDisplay, root, 0, 0, width, height, 0,
+ CopyFromParent, InputOutput,
+ CopyFromParent, CWEventMask,
+ &swa);
+ xWindow = win;
+
+ /////// the egl part //////////////////////////////////////////////////////////////////
+ // egl provides an interface to connect the graphics related functionality of openGL ES
+ // with the windowing interface and functionality of the native operation system (X11
+ // in our case.
+
+ eglDisplay = eglGetDisplay( (EGLNativeDisplayType) xDisplay );
+ if ( eglDisplay == EGL_NO_DISPLAY ) {
+ fprintf(stderr, "Got no EGL display.\n");
+ return false;
+ }
+ eglBindAPI(EGL_OPENGL_API);
+ int m,n;
+ if ( !eglInitialize( eglDisplay, &m, &n ) ) {
+ fprintf(stderr, "Unable to initialize EGL\n");
+ return false;
+ }
+ if ( !eglChooseConfig( eglDisplay, attr, &ecfg, 1, &numConfig ) ) {
+ fprintf(stderr, "Failed to choose config (eglError: %d)\n", eglGetError());
+ return false;
+ }
+ if ( numConfig != 1 ) {
+ fprintf(stderr, "Didn't get exactly one config, but %d", numConfig);
+ return false;
+ }
+ eglSurface = eglCreateWindowSurface ( eglDisplay, ecfg, win, NULL );
+ if ( eglSurface == EGL_NO_SURFACE ) {
+ fprintf(stderr, "Unable to create EGL surface (eglError: %d)\n", eglGetError());
+ return false;
+ }
+ eglContext = eglCreateContext ( eglDisplay, ecfg, EGL_NO_CONTEXT, ctxattr );
+ if ( eglContext == EGL_NO_CONTEXT ) {
+ fprintf(stderr, "Unable to create EGL context (eglError: %d)\n", eglGetError());
+ return false;
+ }
+ //// associate the egl-context with the egl-surface
+ eglMakeCurrent( eglDisplay, eglSurface, eglSurface, eglContext);
+
+ glClearColor(1.0, 1.0, 1.0, 1.0);
+ glClear(GL_COLOR_BUFFER_BIT);
+ glFinish();
+ eglSwapBuffers(eglDisplay, eglSurface);
+ return true;
+}
+#endif
+
+static const char*
+cl_test_channel_order_string(cl_channel_order order)
+{
+ switch(order) {
+#define DECL_ORDER(WHICH) case CL_##WHICH: return "CL_"#WHICH
+ DECL_ORDER(R);
+ DECL_ORDER(A);
+ DECL_ORDER(RG);
+ DECL_ORDER(RA);
+ DECL_ORDER(RGB);
+ DECL_ORDER(RGBA);
+ DECL_ORDER(BGRA);
+ DECL_ORDER(ARGB);
+ DECL_ORDER(INTENSITY);
+ DECL_ORDER(LUMINANCE);
+ DECL_ORDER(Rx);
+ DECL_ORDER(RGx);
+ DECL_ORDER(RGBx);
+#undef DECL_ORDER
+ default: return "Unsupported image channel order";
+ };
+}
+
+static const char*
+cl_test_channel_type_string(cl_channel_type type)
+{
+ switch(type) {
+#define DECL_TYPE(WHICH) case CL_##WHICH: return "CL_"#WHICH
+ DECL_TYPE(SNORM_INT8);
+ DECL_TYPE(SNORM_INT16);
+ DECL_TYPE(UNORM_INT8);
+ DECL_TYPE(UNORM_INT16);
+ DECL_TYPE(UNORM_SHORT_565);
+ DECL_TYPE(UNORM_SHORT_555);
+ DECL_TYPE(UNORM_INT_101010);
+ DECL_TYPE(SIGNED_INT8);
+ DECL_TYPE(SIGNED_INT16);
+ DECL_TYPE(SIGNED_INT32);
+ DECL_TYPE(UNSIGNED_INT8);
+ DECL_TYPE(UNSIGNED_INT16);
+ DECL_TYPE(UNSIGNED_INT32);
+ DECL_TYPE(HALF_FLOAT);
+ DECL_TYPE(FLOAT);
+#undef DECL_TYPE
+ default: return "Unsupported image channel type";
+ };
+}
+
+static void
+clpanic(const char *msg, int rval)
+{
+ printf("Failed: %s (%d)\n", msg, rval);
+ exit(-1);
+}
+
+char*
+cl_do_kiss_path(const char *file, cl_device_id device)
+{
+ cl_int ver;
+ const char *sub_path = NULL;
+ char *ker_path = NULL;
+ const char *kiss_path = getenv("OCL_KERNEL_PATH");
+ size_t sz = strlen(file);
+
+ if (device == NULL)
+ sub_path = "";
+ else {
+ if (clGetGenVersionIntel(device, &ver) != CL_SUCCESS)
+ clpanic("Unable to get Gen version", -1);
+ sub_path = "";
+ }
+
+ if (kiss_path == NULL)
+ clpanic("set OCL_KERNEL_PATH. This is where the kiss kernels are", -1);
+ sz += strlen(kiss_path) + strlen(sub_path) + 2; /* +1 for end of string, +1 for '/' */
+ if ((ker_path = (char*) malloc(sz)) == NULL)
+ clpanic("Allocation failed", -1);
+ sprintf(ker_path, "%s/%s%s", kiss_path, sub_path, file);
+ return ker_path;
+}
+
+int
+cl_kernel_init(const char *file_name, const char *kernel_name, int format, const char * build_opt)
+{
+ cl_file_map_t *fm = NULL;
+ char *ker_path = NULL;
+ cl_int status = CL_SUCCESS;
+ static const char *prevFileName = NULL;
+
+ /* Load the program and build it */
+ if (!program || (program && (!prevFileName || strcmp(prevFileName, file_name)))) {
+ if (program) clReleaseProgram(program);
+ ker_path = cl_do_kiss_path(file_name, device);
+ if (format == LLVM)
+ program = clCreateProgramWithLLVMIntel(ctx, 1, &device, ker_path, &status);
+ else if (format == SOURCE) {
+ cl_file_map_t *fm = cl_file_map_new();
+ FATAL_IF (cl_file_map_open(fm, ker_path) != CL_FILE_MAP_SUCCESS,
+ "Failed to open file \"%s\" with kernel \"%s\". Did you properly set OCL_KERNEL_PATH variable?",
+ file_name, kernel_name);
+ const char *src = cl_file_map_begin(fm);
+ const size_t sz = cl_file_map_size(fm);
+ program = clCreateProgramWithSource(ctx, 1, &src, &sz, &status);
+ cl_file_map_delete(fm);
+ } else
+ FATAL("Not able to create program from binary");
+
+ if (status != CL_SUCCESS) {
+ fprintf(stderr, "error calling clCreateProgramWithBinary\n");
+ goto error;
+ }
+ prevFileName = file_name;
+ }
+ /* OCL requires to build the program even if it is created from a binary */
+ OCL_CALL (clBuildProgram, program, 1, &device, build_opt, NULL, NULL);
+
+ /* Create a kernel from the program */
+ if (kernel)
+ clReleaseKernel(kernel);
+ kernel = clCreateKernel(program, kernel_name, &status);
+ if (status != CL_SUCCESS) {
+ fprintf(stderr, "error calling clCreateKernel\n");
+ goto error;
+ }
+
+exit:
+ free(ker_path);
+ cl_file_map_delete(fm);
+ return status;
+error:
+ prevFileName = NULL;
+ goto exit;
+}
+
+#define GET_PLATFORM_STR_INFO(LOWER_NAME, NAME) \
+ { \
+ size_t param_value_size; \
+ OCL_CALL (clGetPlatformInfo, platform, CL_PLATFORM_##NAME, 0, 0, ¶m_value_size); \
+ std::vector<char> param_value(param_value_size); \
+ OCL_CALL (clGetPlatformInfo, platform, CL_PLATFORM_##NAME, \
+ param_value_size, param_value.empty() ? NULL : ¶m_value.front(), \
+ ¶m_value_size); \
+ std::string str; \
+ if (!param_value.empty()) \
+ str = std::string(¶m_value.front(), param_value_size-1); \
+ printf("platform_" #LOWER_NAME " \"%s\"\n", str.c_str()); \
+ }
+
+#include <cstring>
+#define GET_DEVICE_STR_INFO(LOWER_NAME, NAME) \
+ std::string LOWER_NAME ##Str; \
+ OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_##NAME, 0, 0, ¶m_value_size); \
+ { \
+ std::vector<char> param_value(param_value_size); \
+ OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_##NAME, \
+ param_value_size, param_value.empty() ? NULL : ¶m_value.front(), \
+ ¶m_value_size); \
+ if (!param_value.empty()) \
+ LOWER_NAME ##Str = std::string(¶m_value.front(), param_value_size-1); \
+ } \
+ printf("device_" #LOWER_NAME " \"%s\"\n", LOWER_NAME ##Str.c_str());
+
+int
+cl_ocl_init(void)
+{
+ cl_int status = CL_SUCCESS;
+ cl_uint platform_n;
+ size_t i;
+#ifdef HAS_EGL
+ bool hasGLExt = false;
+#endif
+ cl_context_properties *props = NULL;
+
+ /* Get the platform number */
+ OCL_CALL (clGetPlatformIDs, 0, NULL, &platform_n);
+ printf("platform number %u\n", platform_n);
+ assert(platform_n >= 1);
+
+ /* Get a valid platform */
+ OCL_CALL (clGetPlatformIDs, 1, &platform, &platform_n);
+ GET_PLATFORM_STR_INFO(profile, PROFILE);
+ GET_PLATFORM_STR_INFO(name, NAME);
+ GET_PLATFORM_STR_INFO(vendor, VENDOR);
+ GET_PLATFORM_STR_INFO(version, VERSION);
+ GET_PLATFORM_STR_INFO(extensions, EXTENSIONS);
+
+ /* Get the device (only GPU device is supported right now) */
+ try {
+ OCL_CALL (clGetDeviceIDs, platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+ {
+ size_t param_value_size;
+ GET_DEVICE_STR_INFO(profile, PROFILE);
+ GET_DEVICE_STR_INFO(name, NAME);
+ GET_DEVICE_STR_INFO(vendor, VENDOR);
+ GET_DEVICE_STR_INFO(version, VERSION);
+ GET_DEVICE_STR_INFO(extensions, EXTENSIONS);
+ GET_DEVICE_STR_INFO(opencl_c_version, OPENCL_C_VERSION);
+#ifdef HAS_EGL
+ if (std::strstr(extensionsStr.c_str(), "cl_khr_gl_sharing")) {
+ hasGLExt = true;
+ }
+#endif
+ }
+ } catch (...) {
+ fprintf(stderr, "error calling clGetDeviceIDs\n");
+ status = CL_DEVICE_NOT_FOUND;
+ goto error;
+ }
+
+#ifdef HAS_EGL
+ if (hasGLExt) {
+ int i = 0;
+ props = new cl_context_properties[7];
+ props[i++] = CL_CONTEXT_PLATFORM;
+ props[i++] = (cl_context_properties)platform;
+ if (init_egl_window(EGL_WINDOW_WIDTH, EGL_WINDOW_HEIGHT)) {
+ props[i++] = CL_EGL_DISPLAY_KHR;
+ props[i++] = (cl_context_properties)eglGetCurrentDisplay();
+ props[i++] = CL_GL_CONTEXT_KHR;
+ props[i++] = (cl_context_properties)eglGetCurrentContext();
+ }
+ props[i++] = 0;
+ }
+#endif
+ /* Now create a context */
+ ctx = clCreateContext(props, 1, &device, NULL, NULL, &status);
+ if (status != CL_SUCCESS) {
+ fprintf(stderr, "error calling clCreateContext\n");
+ goto error;
+ }
+
+ /* All image types currently supported by the context */
+ cl_image_format fmt[256];
+ cl_uint fmt_n;
+ clGetSupportedImageFormats(ctx, 0, CL_MEM_OBJECT_IMAGE2D, 256, fmt, &fmt_n);
+ printf("%u image formats are supported\n", fmt_n);
+ for (i = 0; i < fmt_n; ++i)
+ printf("[%s %s]\n",
+ cl_test_channel_order_string(fmt[i].image_channel_order),
+ cl_test_channel_type_string(fmt[i].image_channel_data_type));
+
+ /* We are going to push NDRange kernels here */
+ queue = clCreateCommandQueue(ctx, device, 0, &status);
+ if (status != CL_SUCCESS) {
+ fprintf(stderr, "error calling clCreateCommandQueue\n");
+ goto error;
+ }
+
+error:
+ if (props)
+ delete props;
+ return status;
+}
+
+int
+cl_test_init(const char *file_name, const char *kernel_name, int format)
+{
+ cl_int status = CL_SUCCESS;
+
+ /* Initialize OCL */
+ if ((status = cl_ocl_init()) != CL_SUCCESS)
+ goto error;
+
+ /* Load the kernel */
+ if ((status = cl_kernel_init(file_name, kernel_name, format, NULL)) != CL_SUCCESS)
+ goto error;
+
+error:
+ return status;
+}
+
+void
+cl_kernel_destroy(bool needDestroyProgram)
+{
+ if (kernel) {
+ clReleaseKernel(kernel);
+ kernel = NULL;
+ }
+ if (needDestroyProgram && program) {
+ clReleaseProgram(program);
+ program = NULL;
+ }
+}
+
+void
+cl_ocl_destroy(void)
+{
+ clReleaseCommandQueue(queue);
+ clReleaseContext(ctx);
+#ifdef HAS_EGL
+ if (eglContext != NULL) {
+ cl_ocl_destroy_egl_window();
+ eglContext = NULL;
+ }
+#endif
+}
+
+void
+cl_test_destroy(void)
+{
+ cl_kernel_destroy();
+ cl_ocl_destroy();
+ printf("%i memory leaks\n", clReportUnfreedIntel());
+ assert(clReportUnfreedIntel() == 0);
+}
+
+void
+cl_buffer_destroy(void)
+{
+ int i;
+ for (i = 0; i < MAX_BUFFER_N; ++i) {
+ if (buf_data[i] != NULL) {
+ clUnmapBufferIntel(buf[i]);
+ buf_data[i] = NULL;
+ }
+ if (buf[i] != NULL) {
+ clReleaseMemObject(buf[i]);
+ buf[i] = NULL;
+ }
+ }
+}
+
+void
+cl_report_perf_counters(cl_mem perf)
+{
+ cl_int status = CL_SUCCESS;
+ uint32_t *start = NULL, *end = NULL;
+ uint32_t i;
+ if (perf == NULL)
+ return;
+ start = (uint32_t*) clMapBufferIntel(perf, &status);
+ assert(status == CL_SUCCESS && start != NULL);
+ end = start + 128;
+
+ printf("BEFORE\n");
+ for (i = 0; i < 6*8; ++i) {
+ if (i % 8 == 0) printf("\n");
+ printf("[%3u 0x%8x] ", i, start[i]);
+ }
+ printf("\n\n");
+
+ printf("AFTER\n");
+ for (i = 0; i < 6*8; ++i) {
+ if (i % 8 == 0) printf("\n");
+ printf("[%3u 0x%8x] ", i, end[i]);
+ }
+ printf("\n\n");
+
+ printf("DIFF\n");
+ for (i = 0; i < 6*8; ++i) {
+ if (i % 8 == 0) printf("\n");
+ printf("[%3u %8i] ", i, end[i] - start[i]);
+ }
+ printf("\n\n");
+
+ clUnmapBufferIntel(perf);
+}
+
+struct bmphdr {
+ // 2 bytes of magic here, "BM", total header size is 54 bytes!
+ int filesize; // 4 total file size incl header
+ short as0, as1; // 8 app specific
+ int bmpoffset; // 12 ofset of bmp data
+ int headerbytes; // 16 bytes in header from this point (40 actually)
+ int width; // 20
+ int height; // 24
+ short nplanes; // 26 no of color planes
+ short bpp; // 28 bits/pixel
+ int compression; // 32 BI_RGB = 0 = no compression
+ int sizeraw; // 36 size of raw bmp file, excluding header, incl padding
+ int hres; // 40 horz resolutions pixels/meter
+ int vres; // 44
+ int npalcolors; // 48 No of colors in palette
+ int nimportant; // 52 No of important colors
+ // raw b, g, r data here, dword aligned per scan line
+};
+
+int *cl_read_bmp(const char *filename, int *width, int *height)
+{
+ struct bmphdr hdr;
+ char *bmppath = cl_do_kiss_path(filename, device);
+ FILE *fp = fopen(bmppath, "rb");
+ assert(fp);
+
+ char magic[2];
+ int ret;
+ ret = fread(&magic[0], 1, 2, fp);
+ ret = ret;
+ assert(2 == ret);
+ assert(magic[0] == 'B' && magic[1] == 'M');
+
+ ret = fread(&hdr, sizeof(hdr), 1, fp);
+ assert(1 == ret);
+
+ assert(hdr.width > 0 && hdr.height > 0 && hdr.nplanes == 1 && hdr.compression == 0);
+
+ int *rgb32 = (int *) malloc(hdr.width * hdr.height * sizeof(int));
+ assert(rgb32);
+ int x, y;
+
+ int *dst = rgb32;
+ for (y = 0; y < hdr.height; y++) {
+ for (x = 0; x < hdr.width; x++) {
+ assert(!feof(fp));
+ int b = (getc(fp) & 0x0ff);
+ int g = (getc(fp) & 0x0ff);
+ int r = (getc(fp) & 0x0ff);
+ *dst++ = (r | (g << 8) | (b << 16) | 0xff000000); /* abgr */
+ }
+ while (x & 3) {
+ getc(fp);
+ x++;
+ } // each scanline padded to dword
+ // printf("read row %d\n", y);
+ // fflush(stdout);
+ }
+ fclose(fp);
+ *width = hdr.width;
+ *height = hdr.height;
+ free(bmppath);
+ return rgb32;
+}
+
+void cl_write_bmp(const int *data, int width, int height, const char *filename)
+{
+ int x, y;
+
+ FILE *fp = fopen(filename, "wb");
+ assert(fp);
+
+ char *raw = (char *) malloc(width * height * sizeof(int)); // at most
+ assert(raw);
+ char *p = raw;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ int c = *data++;
+ *p++ = ((c >> 16) & 0xff);
+ *p++ = ((c >> 8) & 0xff);
+ *p++ = ((c >> 0) & 0xff);
+ }
+ while (x & 3) {
+ *p++ = 0;
+ x++;
+ } // pad to dword
+ }
+ int sizeraw = p - raw;
+ int scanline = (width * 3 + 3) & ~3;
+ assert(sizeraw == scanline * height);
+
+ struct bmphdr hdr;
+
+ hdr.filesize = scanline * height + sizeof(hdr) + 2;
+ hdr.as0 = 0;
+ hdr.as1 = 0;
+ hdr.bmpoffset = sizeof(hdr) + 2;
+ hdr.headerbytes = 40;
+ hdr.width = width;
+ hdr.height = height;
+ hdr.nplanes = 1;
+ hdr.bpp = 24;
+ hdr.compression = 0;
+ hdr.sizeraw = sizeraw;
+ hdr.hres = 0; // 2834;
+ hdr.vres = 0; // 2834;
+ hdr.npalcolors = 0;
+ hdr.nimportant = 0;
+
+ /* Now write bmp file */
+ char magic[2] = { 'B', 'M' };
+ fwrite(&magic[0], 1, 2, fp);
+ fwrite(&hdr, 1, sizeof(hdr), fp);
+ fwrite(raw, 1, hdr.sizeraw, fp);
+
+ fclose(fp);
+ free(raw);
+}
+
+static const float pixel_threshold = 0.05f;
+static const float max_error_ratio = 0.001f;
+
+int cl_check_image(const int *img, int w, int h, const char *bmp)
+{
+ int refw, refh;
+ int *ref = cl_read_bmp(bmp, &refw, &refh);
+ if (ref == NULL || refw != w || refh != h) return 0;
+ const int n = w*h;
+ int discrepancy = 0;
+ for (int i = 0; i < n; ++i) {
+ const float r = (float) (img[i] & 0xff);
+ const float g = (float) ((img[i] >> 8) & 0xff);
+ const float b = (float) ((img[i] >> 16) & 0xff);
+ const float rr = (float) (ref[i] & 0xff);
+ const float rg = (float) ((ref[i] >> 8) & 0xff);
+ const float rb = (float) ((ref[i] >> 16) & 0xff);
+ const float dr = fabs(r-rr) / (1.f/255.f + std::max(r,rr));
+ const float dg = fabs(g-rg) / (1.f/255.f + std::max(g,rg));
+ const float db = fabs(b-rb) / (1.f/255.f + std::max(b,rb));
+ const float err = sqrtf(dr*dr+dg*dg+db*db);
+ if (err > pixel_threshold) discrepancy++;
+ }
+ free(ref);
+ return (float(discrepancy) / float(n) > max_error_ratio) ? 0 : 1;
+}
+
+const float cl_FLT_ULP(float float_number)
+{
+ SF floatBin, ulpBin, ulpBinBase;
+ floatBin.f = float_number;
+
+ ulpBin.spliter.sign = ulpBinBase.spliter.sign = 0;
+ ulpBin.spliter.exponent = ulpBinBase.spliter.exponent = floatBin.spliter.exponent;
+ ulpBin.spliter.mantissa = 0x1;
+ ulpBinBase.spliter.mantissa = 0x0;
+
+ return ulpBin.f - ulpBinBase.f;
+}
+
+const int cl_INT_ULP(int int_number)
+{
+ return 0;
+}
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
new file mode 100644
index 0000000..de4d277
--- /dev/null
+++ b/utests/utest_helper.hpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest_helper.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __UTEST_HELPER_HPP__
+#define __UTEST_HELPER_HPP__
+
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+#include "utest.hpp"
+#include "utest_assert.hpp"
+#include "utest_error.h"
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+
+#ifdef HAS_EGL
+#define EGL_WINDOW_WIDTH 256
+#define EGL_WINDOW_HEIGHT 256
+#include <GL/gl.h>
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#include <CL/cl_gl.h>
+
+extern EGLDisplay eglDisplay;
+extern EGLContext eglContext;
+extern EGLSurface eglSurface;
+#endif
+
+#define OCL_THROW_ERROR(FN, STATUS) \
+ do { \
+ char msg[2048]; \
+ sprintf(msg, "error calling %s with error %s \n", #FN, err_msg[-STATUS]); \
+ OCL_ASSERTM(false, msg); \
+ } while (0)
+
+#define OCL_CALL(FN, ...) \
+ do { \
+ int status = FN(__VA_ARGS__); \
+ if (status != CL_SUCCESS) OCL_THROW_ERROR(FN, status); \
+ } while (0)
+
+#define OCL_CREATE_KERNEL(NAME) \
+ do { \
+ OCL_CALL (cl_kernel_init, NAME".cl", NAME, SOURCE, NULL); \
+ } while (0)
+
+#define OCL_DESTROY_KERNEL_KEEP_PROGRAM(KEEP_PROGRAM) \
+ do { \
+ cl_kernel_destroy(!(KEEP_PROGRAM)); \
+ } while(0)
+
+#define OCL_CREATE_KERNEL_FROM_FILE(FILE_NAME, KERNEL_NAME) \
+ do { \
+ OCL_CALL(cl_kernel_init, FILE_NAME".cl", KERNEL_NAME, SOURCE, NULL); \
+ } while (0)
+
+#define OCL_FLUSH() \
+ do { \
+ OCL_CALL(clFlush, queue); \
+ } while(0)
+
+#define OCL_FINISH() \
+ do { \
+ OCL_CALL(clFinish, queue); \
+ } while(0)
+
+#define OCL_CALL2(FN, RET, ...) \
+ do { \
+ cl_int status; \
+ RET = FN(__VA_ARGS__, &status);\
+ if (status != CL_SUCCESS) OCL_THROW_ERROR(FN, status); \
+ } while (0)
+
+#define OCL_CREATE_BUFFER(BUFFER, FLAGS, SIZE, DATA) \
+ OCL_CALL2(clCreateBuffer, BUFFER, ctx, FLAGS, SIZE, DATA)
+
+#define OCL_CREATE_USER_EVENT(EVENT) \
+ OCL_CALL2(clCreateUserEvent, EVENT, ctx)
+
+#define OCL_SET_USER_EVENT_STATUS(EVENT, STATUS) \
+ OCL_CALL(clSetUserEventStatus, EVENT, STATUS)
+
+#define OCL_CREATE_IMAGE(IMAGE, FLAGS, FORMAT, DESC, DATA) \
+ OCL_CALL2(clCreateImage, IMAGE, ctx, FLAGS, FORMAT, DESC, DATA)
+
+#define OCL_READ_IMAGE(IMAGE, ORIGIN, REGION, DATA) \
+ OCL_CALL(clEnqueueReadImage, queue, IMAGE, CL_TRUE, ORIGIN, REGION, 0, 0, DATA, 0, NULL, NULL)
+
+#define OCL_WRITE_IMAGE(IMAGE, ORIGIN, REGION, DATA) \
+ OCL_CALL(clEnqueueWriteImage, queue, IMAGE, CL_TRUE, ORIGIN, REGION, 0, 0, DATA, 0, NULL, NULL)
+
+#define OCL_CREATE_GL_IMAGE(IMAGE, FLAGS, TARGET, LEVEL, TEXTURE) \
+ OCL_CALL2(clCreateFromGLTexture, IMAGE, ctx, FLAGS, TARGET, LEVEL, TEXTURE)
+
+#define OCL_ENQUEUE_ACQUIRE_GL_OBJECTS(ID) \
+ OCL_CALL(clEnqueueAcquireGLObjects, queue, 1, &buf[ID], 0, 0, 0)
+
+#define OCL_SWAP_EGL_BUFFERS() \
+ eglSwapBuffers(eglDisplay, eglSurface);
+
+#define OCL_CREATE_SAMPLER(SAMPLER, ADDRESS_MODE, FILTER_MODE) \
+ OCL_CALL2(clCreateSampler, SAMPLER, ctx, 0, ADDRESS_MODE, FILTER_MODE)
+
+#define OCL_MAP_BUFFER(ID) \
+ OCL_CALL2(clMapBufferIntel, buf_data[ID], buf[ID])
+
+#define OCL_UNMAP_BUFFER(ID) \
+ do { \
+ if (buf[ID] != NULL) { \
+ OCL_CALL (clUnmapBufferIntel, buf[ID]); \
+ buf_data[ID] = NULL; \
+ } \
+ } while (0)
+
+#define OCL_MAP_BUFFER_GTT(ID) \
+ OCL_CALL2(clMapBufferGTTIntel, buf_data[ID], buf[ID])
+
+#define OCL_UNMAP_BUFFER_GTT(ID) \
+ do { \
+ if (buf[ID] != NULL) { \
+ OCL_CALL (clUnmapBufferGTTIntel, buf[ID]); \
+ buf_data[ID] = NULL; \
+ } \
+ } while (0)
+
+#define OCL_NDRANGE(DIM_N) \
+ OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, DIM_N, NULL, globals, locals, 0, NULL, NULL)
+
+#define OCL_SET_ARG(ID, SIZE, ARG) \
+ OCL_CALL (clSetKernelArg, kernel, ID, SIZE, ARG)
+
+#define OCL_CHECK_IMAGE(DATA, W, H, FILENAME) \
+ if (cl_check_image(DATA, W, H, FILENAME) == 0) \
+ OCL_ASSERTM(false, "image mismatch")
+
+enum { MAX_BUFFER_N = 16 };
+extern cl_platform_id platform;
+extern cl_device_id device;
+extern cl_context ctx;
+extern cl_program program;
+extern cl_kernel kernel;
+extern cl_command_queue queue;
+extern cl_mem buf[MAX_BUFFER_N];
+extern void* buf_data[MAX_BUFFER_N];
+extern size_t globals[3];
+extern size_t locals[3];
+
+enum {
+ SOURCE = 0,
+ LLVM = 1,
+ BIN = 2
+};
+
+/* The SF is float type spliter*/
+typedef struct
+{
+ unsigned int mantissa:23;
+ unsigned int exponent:8;
+ unsigned int sign:1;
+} FLOAT;
+
+typedef union
+{
+ float f;
+ unsigned int i;
+ FLOAT spliter;
+} SF;
+
+/* Init OpenCL */
+extern int cl_ocl_init(void);
+
+/* Init program and kernel for the test */
+extern int cl_kernel_init(const char *file_name,
+ const char *kernel_name, int format, const char * build_opt);
+
+/* Get the file path */
+extern char* cl_do_kiss_path(const char *file, cl_device_id device);
+
+/* init the bunch of global varaibles here */
+extern int cl_test_init(const char *file_name, const char *kernel_name, int format);
+
+/* Unmap and release all the created buffers */
+extern void cl_buffer_destroy(void);
+
+/* Release OCL queue, context and device */
+extern void cl_ocl_destroy(void);
+
+/* Release kernel and program */
+extern void cl_kernel_destroy(bool needDestroyProgram = true);
+
+/* Release everything allocated in cl_test_init */
+extern void cl_test_destroy(void);
+
+/* Nicely output the performance counters */
+extern void cl_report_perf_counters(cl_mem perf);
+
+/* Read a bmp from file */
+extern int *cl_read_bmp(const char *filename, int *width, int *height);
+
+/* Write a bmp to a file */
+extern void cl_write_bmp(const int *data, int width, int height, const char *filename);
+
+/* Check data from img against bmp file located at "bmp" */
+extern int cl_check_image(const int *img, int w, int h, const char *bmp);
+
+/* Calculator ULP of each FLOAT value */
+extern const float cl_FLT_ULP(float float_number);
+
+/* Calculator ULP of each INT value */
+extern const int cl_INT_ULP(int int_number);
+
+#endif /* __UTEST_HELPER_HPP__ */
+
diff --git a/utests/utest_math_gen.py b/utests/utest_math_gen.py
new file mode 100755
index 0000000..30a9b24
--- /dev/null
+++ b/utests/utest_math_gen.py
@@ -0,0 +1,577 @@
+#!/usr/bin/python
+from utest_generator import *
+import os,sys
+
+#base_input_values = [80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
+#extend_input_values = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
+
+#func:
+# gpufuncName
+# cpuFuncName
+# fileName: 'builtin_'+name
+# inputtype: a 2-D list because there're more than one input data
+# outputtype: a list
+# values
+# ulp
+
+# reduce pi*x limitation to [-pi,pi]
+reduce1='''
+static float reduce1( float x )
+{
+ SF fx, fy;
+ fx.f = fy.f = x;
+ int n;
+
+ fy.spliter.exponent = fx.spliter.exponent - 1;
+ n = (int)fy.f;
+
+ fx.f = fx.f - 2.0 * n;
+
+ // reduce to [-1.0, 1.0]
+ fx.f = (fx.f < -1)?(fx.f + 2.0):((fx.f > 1)?(fx.f - 2.0):fx.f);
+
+ return fx.f;
+}
+'''
+# define fuction: cospi
+cospi='''
+static float cospi(float x){
+ float r = x;
+ if ( x > 1 || x < -1) r = reduce1(x);
+
+ // reduce to [0.0, 1.0]
+ if (r < 0)
+ r = fabs(r);
+
+ if (r >= 0 && r <= 0.25)
+ return cosf(r * M_PI);
+ else if (r > 0.25 && r <= 0.5)
+ return sinf((0.5 - r) * M_PI);
+ else if (r > 0.5 && r <= 0.75)
+ return sinf(-(r-0.5) * M_PI);
+ else if (r > 0.75 && r <= 1.0){
+ return -cosf((1 - r) * M_PI);}
+
+ // Error return
+ return 0xffffffff;
+}
+'''
+# define function: sinpi
+sinpi='''
+static float sinpi(float x){
+ float r = x;
+ if ( x > 1 || x < -1) r = reduce1(x);
+
+ // reduce to [-0.5, 0.5]
+ if (r < -0.5)
+ r = -1 - r;
+ else if (r > 0.5)
+ r = 1 - r;
+
+ if (r > 0.25 && r <= 0.5)
+ return cosf((0.5 - r) * M_PI);
+ else if (r >= 0 && r <= 0.25)
+ return sinf(r * M_PI);
+ else if (r >= -0.25 && r < 0)
+ return -sinf(r * -M_PI);
+ else if (r >= -0.5 && r < -0.25){
+ return -cosf((0.5 + r) * M_PI);}
+
+ // Error return
+ return 0xffffffff;
+}
+'''
+
+base_input_values = [ 0, 1, 3.14, 5.15, 6.01, 7.89]
+base_input_values1 = [ 1, 3.14, 5.15, 6.01, 7.89]
+def main():
+ ##### gentype acos(gentype)
+ acos_input_values = base_input_values
+ acos_input_type = ['float','float2','float4','float8','float16']
+ acos_output_type = ['float','float2','float4','float8','float16']
+ acosUtests = func('acos','acos',[acos_input_type],acos_output_type,[acos_input_values],'4 * FLT_ULP')
+
+ ##### gentype acosh(gentype)
+ acosh_input_values = base_input_values
+ acosh_input_type = ['float','float2','float4','float8','float16']
+ acosh_output_type = ['float','float2','float4','float8','float16']
+ acoshUtests = func('acosh','acosh',[acosh_input_type],acosh_output_type,[acosh_input_values],'4 * FLT_ULP')
+
+ ##### gentype acospi(gentype x)
+ acospi_input_values = base_input_values
+ acospi_input_type = ['float','float2','float4','float8','float16']
+ acospi_output_type = ['float','float2','float4','float8','float16']
+ acospi_cpu_func='''
+static float acospi(float x){
+ return acos(x)/M_PI;
+} '''
+ acospiUtests = func('acospi','acospi',[acospi_input_type],acospi_output_type,[acospi_input_values],'4 * FLT_ULP',acospi_cpu_func)
+
+ ##### gentype asin(gentype)
+ asin_input_values = base_input_values
+ asin_input_type = ['float','float2','float4','float8','float16']
+ asin_output_type = ['float','float2','float4','float8','float16']
+ asinUtests = func('asin','asin',[asin_input_type],asin_output_type,[asin_input_values],'4 * FLT_ULP')
+
+ ##### gentype asinh(gentype)
+ asinh_input_values = base_input_values
+ asinh_input_type = ['float','float2','float4','float8','float16']
+ asinh_output_type = ['float','float2','float4','float8','float16']
+ asinhUtests = func('asinh','asinh',[asinh_input_type],asinh_output_type,[asinh_input_values],'4 * FLT_ULP')
+
+ ##### gentype asinpi(gentype x)
+ asinpi_input_values = base_input_values
+ asinpi_input_type = ['float','float2','float4','float8','float16']
+ asinpi_output_type = ['float','float2','float4','float8','float16']
+ asinpi_cpu_func='''
+static float asinpi(float x){
+ return asin(x)/M_PI;
+} '''
+ asinpiUtests = func('asinpi','asinpi',[asinpi_input_type],asinpi_output_type,[asinpi_input_values],'4 * FLT_ULP',asinpi_cpu_func)
+
+ ##### gentype atan(gentype y_over_x)
+ atan_input_values = base_input_values
+ atan_input_type = ['float','float2','float4','float8','float16']
+ atan_output_type = ['float','float2','float4','float8','float16']
+ atanUtests = func('atan','atan',[atan_input_type],atan_output_type,[atan_input_values],'5 * FLT_ULP')
+
+ ##### gentype atan2(gentype y, gentype x)
+ atan2_base_values = base_input_values
+ atan2_input_values1 = []
+ atan2_input_values2 = []
+ atan2_input_values1,atan2_input_values2=gene2ValuesLoop(atan2_input_values1,atan2_input_values2,atan2_base_values)
+ atan2_input_type1 = ['float','float2','float4','float8','float16']
+ atan2_input_type2 = ['float','float2','float4','float8','float16']
+ atan2_output_type = ['float','float2','float4','float8','float16']
+ atan2Utests = func('atan2','atan2',[atan2_input_type1,atan2_input_type2],atan2_output_type,[atan2_input_values1,atan2_input_values2],'6 * FLT_ULP')
+
+ ##### gentype atanh(gentype)
+ atanh_input_values = base_input_values
+ atanh_input_type = ['float','float2','float4','float8','float16']
+ atanh_output_type = ['float','float2','float4','float8','float16']
+ atanhUtests = func('atanh','atanh',[atanh_input_type],atanh_output_type,[atanh_input_values],'5 * FLT_ULP')
+
+ ##### gentype atanpi(gentype x)
+ atanpi_input_values = base_input_values
+ atanpi_input_type = ['float','float2','float4','float8','float16']
+ atanpi_output_type = ['float','float2','float4','float8','float16']
+ atanpi_cpu_func='''
+static float atanpi(float x){
+ return atan(x)/M_PI;
+} '''
+ atanpiUtests = func('atanpi','atanpi',[atanpi_input_type],atanpi_output_type,[atanpi_input_values],'4 * FLT_ULP',atanpi_cpu_func)
+
+# ##### gentype atan2pi(gentype y, gentype x)
+# atan2pi_base_values = base_input_values
+# atan2pi_input_values1 = []
+# atan2pi_input_values2 = []
+# atan2pi_input_values1,atan2pi_input_values2=gene2ValuesLoop(atan2pi_input_values1,atan2pi_input_values2,atan2pi_base_values)
+# atan2pi_input_type1 = ['float','float2','float4','float8','float16']
+# atan2pi_input_type2 = ['float','float2','float4','float8','float16']
+# atan2pi_output_type = ['float','float2','float4','float8','float16']
+# atan2pi_cpu_func='''
+#static float atan2pi(float y, float x){
+# return atan2(y,x)/M_PI;
+#} '''
+# atan2piUtests = func('atan2pi','atan2pi',[atan2pi_input_type1,atan2pi_input_type2],atan2pi_output_type,[atan2pi_input_values1,atan2pi_input_values2],'6 * FLT_ULP',atan2pi_cpu_func)
+
+ ##### gentype cbrt(gentype)
+ cbrt_input_values = base_input_values
+ cbrt_input_type = ['float','float2','float4','float8','float16']
+ cbrt_output_type = ['float','float2','float4','float8','float16']
+ cbrtUtests = func('cbrt','cbrt',[cbrt_input_type],cbrt_output_type,[cbrt_input_values],'4 * FLT_ULP')
+
+ ##### gentype ceil(gentype)
+ ceil_input_values = base_input_values
+ ceil_input_type = ['float','float2','float4','float8','float16']
+ ceil_output_type = ['float','float2','float4','float8','float16']
+ ceilUtests = func('ceil','ceil',[ceil_input_type],ceil_output_type,[ceil_input_values],'0 * FLT_ULP')
+
+ ##### gentype copysign(gentype x, gentype y)
+ copysign_base_values = base_input_values
+ copysign_input_values1 = []
+ copysign_input_values2 = []
+ copysign_input_values1,copysign_input_values2=gene2ValuesLoop(copysign_input_values1,copysign_input_values2,copysign_base_values)
+ copysign_input_type1 = ['float','float2','float4','float8','float16']
+ copysign_input_type2 = ['float','float2','float4','float8','float16']
+ copysign_output_type = ['float','float2','float4','float8','float16']
+ copysignUtests = func('copysign','copysign',[copysign_input_type1,copysign_input_type2],copysign_output_type,[copysign_input_values1,copysign_input_values2],'0 * FLT_ULP')
+
+ ##### gentype cos(gentype)
+ cos_input_values = base_input_values
+ cos_input_type = ['float','float2','float4','float8','float16']
+ cos_output_type = ['float','float2','float4','float8','float16']
+ cosUtests = func('cos','cos',[cos_input_type],cos_output_type,[cos_input_values],'4 * FLT_ULP')
+
+ ##### gentype cosh(gentype)
+ cosh_input_values = base_input_values
+ cosh_input_type = ['float','float2','float4','float8','float16']
+ cosh_output_type = ['float','float2','float4','float8','float16']
+ coshUtests = func('cosh','cosh',[cosh_input_type],cosh_output_type,[cosh_input_values],'4 * FLT_ULP')
+
+ ##### gentype cospi(gentype x)
+ cospi_input_values = base_input_values
+ cospi_input_type = ['float','float2','float4','float8','float16']
+ cospi_output_type = ['float','float2','float4','float8','float16']
+ cospi_cpu_func=reduce1+cospi
+ cospiUtests = func('cospi','cospi',[cospi_input_type],cospi_output_type,[cospi_input_values],'2 * FLT_ULP',cospi_cpu_func)
+
+# ##### gentype erf(gentype)
+# erf_input_values = base_input_values
+# erf_input_type = ['float','float2','float4','float8','float16']
+# erf_output_type = ['float','float2','float4','float8','float16']
+# erfUtests = func('erf','erf',[erf_input_type],erf_output_type,[erf_input_values],'16 * FLT_ULP')
+
+# ##### gentype erfc(gentype)
+# erfc_input_values = base_input_values
+# erfc_input_type = ['float','float2','float4','float8','float16']
+# erfc_output_type = ['float','float2','float4','float8','float16']
+# erfcUtests = func('erfc','erfc',[erfc_input_type],erfc_output_type,[erfc_input_values],'16 * FLT_ULP')
+
+ ##### gentype exp(gentype x)
+ exp_input_values = base_input_values
+ exp_input_type = ['float','float2','float4','float8','float16']
+ exp_output_type = ['float','float2','float4','float8','float16']
+ expUtests = func('exp','exp',[exp_input_type],exp_output_type,[exp_input_values],'4 * FLT_ULP')
+
+ ##### gentype exp2(gentype)
+ exp2_input_values = base_input_values
+ exp2_input_type = ['float','float2','float4','float8','float16']
+ exp2_output_type = ['float','float2','float4','float8','float16']
+ exp2Utests = func('exp2','exp2',[exp2_input_type],exp2_output_type,[exp2_input_values],'4 * FLT_ULP')
+
+ ##### gentype exp10(gentype)
+ exp10_input_values = base_input_values
+ exp10_input_type = ['float','float2','float4','float8','float16']
+ exp10_output_type = ['float','float2','float4','float8','float16']
+ exp10Utests = func('exp10','exp10',[exp10_input_type],exp10_output_type,[exp10_input_values],'4 * FLT_ULP')
+
+ ##### gentype expm1(gentype x)
+ expm1_input_values = base_input_values
+ expm1_input_type = ['float','float2','float4','float8','float16']
+ expm1_output_type = ['float','float2','float4','float8','float16']
+ expm1Utests = func('expm1','expm1',[expm1_input_type],expm1_output_type,[expm1_input_values],'4 * FLT_ULP')
+
+ ##### gentype fabs(gentype)
+ fabs_input_values = base_input_values
+ fabs_input_type = ['float','float2','float4','float8','float16']
+ fabs_output_type = ['float','float2','float4','float8','float16']
+ fabsUtests = func('fabs','fabs',[fabs_input_type],fabs_output_type,[fabs_input_values],'0 * FLT_ULP')
+
+ ##### gentype fdim(gentype x, gentype y)
+ fdim_base_values = base_input_values
+ fdim_input_values1 = []
+ fdim_input_values2 = []
+ fdim_input_values1,fdim_input_values2=gene2ValuesLoop(fdim_input_values1,fdim_input_values2,fdim_base_values)
+ fdim_input_type1 = ['float','float2','float4','float8','float16']
+ fdim_input_type2 = ['float','float2','float4','float8','float16']
+ fdim_output_type = ['float','float2','float4','float8','float16']
+ fdimUtests = func('fdim','fdim',[fdim_input_type1,fdim_input_type2],fdim_output_type,[fdim_input_values1,fdim_input_values2],'0 * FLT_ULP')
+
+ ##### gentype floor(gentype)
+ floor_input_values = base_input_values
+ floor_input_type = ['float','float2','float4','float8','float16']
+ floor_output_type = ['float','float2','float4','float8','float16']
+ floorUtests = func('floor','floor',[floor_input_type],floor_output_type,[floor_input_values],'0 * FLT_ULP')
+
+ ##### gentype fmax(gentype x, gentype y)
+ fmax_base_values = base_input_values
+ fmax_input_values1 = []
+ fmax_input_values2 = []
+ fmax_input_values1,fmax_input_values2=gene2ValuesLoop(fmax_input_values1,fmax_input_values2,fmax_base_values)
+ fmax_input_type1 = ['float','float2','float4','float8','float16']
+ fmax_input_type2 = ['float','float2','float4','float8','float16']
+ fmax_output_type = ['float','float2','float4','float8','float16']
+ fmaxUtests = func('fmax','fmax',[fmax_input_type1,fmax_input_type2],fmax_output_type,[fmax_input_values1,fmax_input_values2],'0 * FLT_ULP')
+
+ ##### gentypef fmax(gentypef x, float y)
+# fmax_gentypef_base_values = base_input_values
+# fmax_gentypef_input_values1 = []
+# fmax_gentypef_input_values2 = []
+# fmax_gentypef_input_values2,fmax_gentypef_input_values1=gene2ValuesLoop(fmax_gentypef_input_values1,fmax_gentypef_input_values2,fmax_gentypef_base_values)
+# fmax_gentypef_input_type1 = ['float','float2','float4','float8','float16']
+# fmax_gentypef_input_type2 = ['float','float','float','float','float']
+# fmax_gentypef_output_type = ['float','float2','float4','float8','float16']
+# ##### gentypef fmax(gentypef x, float y)
+# fmax_gentypefUtests = func('gentypef_fmax','gentypef_fmax',[fmax_gentypef_input_type1,fmax_gentypef_input_type2],fmax_gentypef_output_type,[fmax_gentypef_input_values1,fmax_gentypef_input_values2],'0 * FLT_ULP')
+
+ ##### gentype fmin(gentype x, gentype y)
+ fmin_base_values = base_input_values
+ fmin_input_values1 = []
+ fmin_input_values2 = []
+ fmin_input_values1,fmin_input_values2=gene2ValuesLoop(fmin_input_values1,fmin_input_values2,fmin_base_values)
+ fmin_input_type1 = ['float','float2','float4','float8','float16']
+ fmin_input_type2 = ['float','float2','float4','float8','float16']
+ fmin_output_type = ['float','float2','float4','float8','float16']
+ fminUtests = func('fmin','fmin',[fmin_input_type1,fmin_input_type2],fmin_output_type,[fmin_input_values1,fmin_input_values2],'0 * FLT_ULP')
+
+# ##### gentypef fmin(gentypef x, float y)
+# fmin_gentypef_base_values = base_input_values
+# fmin_gentypef_input_values1 = []
+# fmin_gentypef_input_values2 = []
+# fmin_gentypef_input_values2,fmin_gentypef_input_values1=gene2ValuesLoop(fmin_gentypef_input_values1,fmin_gentypef_input_values2,fmin_gentypef_base_values)
+# fmin_gentypef_input_type1 = ['float','float2','float4','float8','float16']
+# fmin_gentypef_input_type2 = ['float','float','float','float','float']
+# fmin_gentypef_output_type = ['float','float2','float4','float8','float16']
+# ##### gentypef fmin(gentypef x, float y)
+# fmin_gentypefUtests = func('gentypef_fmin','gentypef_fmin',[fmin_gentypef_input_type1,fmin_gentypef_input_type2],fmin_gentypef_output_type,[fmin_gentypef_input_values1,fmin_gentypef_input_values2],'0 * FLT_ULP')
+#
+ ##### gentype fmod(gentype x, gentype y)
+ fmod_base_values = base_input_values
+ fmod_input_values1 = []
+ fmod_input_values2 = []
+ fmod_input_values1,fmod_input_values2=gene2ValuesLoop(fmod_input_values1,fmod_input_values2,fmod_base_values)
+ fmod_input_type1 = ['float','float2','float4','float8','float16']
+ fmod_input_type2 = ['float','float2','float4','float8','float16']
+ fmod_output_type = ['float','float2','float4','float8','float16']
+ fmodUtests = func('fmod','fmod',[fmod_input_type1,fmod_input_type2],fmod_output_type,[fmod_input_values1,fmod_input_values2],'0 * FLT_ULP')
+
+ ##### gentype hypot(gentype x, gentype y)
+ hypot_base_values = base_input_values
+ hypot_input_values1 = []
+ hypot_input_values2 = []
+ hypot_input_values1,hypot_input_values2=gene2ValuesLoop(hypot_input_values1,hypot_input_values2,hypot_base_values)
+ hypot_input_type1 = ['float','float2','float4','float8','float16']
+ hypot_input_type2 = ['float','float2','float4','float8','float16']
+ hypot_output_type = ['float','float2','float4','float8','float16']
+ hypotUtests = func('hypot','hypot',[hypot_input_type1,hypot_input_type2],hypot_output_type,[hypot_input_values1,hypot_input_values2],'4 * FLT_ULP')
+
+ ##### intn ilogb(floartn x)
+ ilogb_input_values = base_input_values
+ ilogb_input_type = ['float','float2','float4','float8','float16']
+ ilogb_output_type = ['int','int2','int4','int8','int16']
+ ilogbUtests = func('ilogb','ilogb',[ilogb_input_type],ilogb_output_type,[ilogb_input_values],'0 * INT_ULP')
+
+ ##### gentype lgamma(gentype x)
+ lgamma_input_values = base_input_values
+ lgamma_input_type = ['float','float2','float4','float8','float16']
+ lgamma_output_type = ['float','float2','float4','float8','float16']
+ lgammaUtests = func('lgamma','lgamma',[lgamma_input_type],lgamma_output_type,[lgamma_input_values],'4 * FLT_ULP')
+
+ ##### gentype log(gentype)
+ log_input_values = base_input_values
+ log_input_type = ['float','float2','float4','float8','float16']
+ log_output_type = ['float','float2','float4','float8','float16']
+ logUtests = func('log','log',[log_input_type],log_output_type,[log_input_values],'4 * FLT_ULP')
+
+ ##### gentype log2(gentype)
+ log2_input_values = base_input_values
+ log2_input_type = ['float','float2','float4','float8','float16']
+ log2_output_type = ['float','float2','float4','float8','float16']
+ log2Utests = func('log2','log2',[log2_input_type],log2_output_type,[log2_input_values],'4 * FLT_ULP')
+
+ ##### gentype log10(gentype)
+ log10_input_values = base_input_values
+ log10_input_type = ['float','float2','float4','float8','float16']
+ log10_output_type = ['float','float2','float4','float8','float16']
+ log10Utests = func('log10','log10',[log10_input_type],log10_output_type,[log10_input_values],'4 * FLT_ULP')
+
+ ##### gentype log1p(gentype x)
+ log1p_input_values = base_input_values
+ log1p_input_type = ['float','float2','float4','float8','float16']
+ log1p_output_type = ['float','float2','float4','float8','float16']
+ log1pUtests = func('log1p','log1p',[log1p_input_type],log1p_output_type,[log1p_input_values],'4 * FLT_ULP')
+
+ ##### gentype logb(gentype x)
+ logb_input_values = base_input_values
+ logb_input_type = ['float','float2','float4','float8','float16']
+ logb_output_type = ['float','float2','float4','float8','float16']
+ logbUtests = func('logb','logb',[logb_input_type],logb_output_type,[logb_input_values],'0 * FLT_ULP')
+
+ ##### gentype maxmag(gentype x, gentype y)
+ maxmag_base_values = base_input_values
+ maxmag_input_values1 = []
+ maxmag_input_values2 = []
+ maxmag_input_values1,maxmag_input_values2=gene2ValuesLoop(maxmag_input_values1,maxmag_input_values2,maxmag_base_values)
+ maxmag_input_type1 = ['float','float2','float4','float8','float16']
+ maxmag_input_type2 = ['float','float2','float4','float8','float16']
+ maxmag_output_type = ['float','float2','float4','float8','float16']
+ maxmag_cpu_func='''
+static float maxmag(float x, float y){
+ if(fabs(x) > fabs(y))
+ return x;
+ else if (fabs(x) < fabs(y))
+ return y;
+ else
+ return fmax(x,y);
+} '''
+ maxmagUtests = func('maxmag','maxmag',[maxmag_input_type1,maxmag_input_type2],maxmag_output_type,[maxmag_input_values1,maxmag_input_values2],'0 * FLT_ULP',maxmag_cpu_func)
+
+ ##### gentype minmag(gentype x, gentype y)
+ minmag_base_values = base_input_values
+ minmag_input_values1 = []
+ minmag_input_values2 = []
+ minmag_input_values1,minmag_input_values2=gene2ValuesLoop(minmag_input_values1,minmag_input_values2,minmag_base_values)
+ minmag_input_type1 = ['float','float2','float4','float8','float16']
+ minmag_input_type2 = ['float','float2','float4','float8','float16']
+ minmag_output_type = ['float','float2','float4','float8','float16']
+ minmag_cpu_func='''
+static float minmag(float x, float y){
+ if(fabs(x) < fabs(y))
+ return x;
+ else if (fabs(x) > fabs(y))
+ return y;
+ else
+ return fmin(x,y);
+} '''
+ minmagUtests = func('minmag','minmag',[minmag_input_type1,minmag_input_type2],minmag_output_type,[minmag_input_values1,minmag_input_values2],'0 * FLT_ULP',minmag_cpu_func)
+
+# ##### floatn nan(uintn nancode)
+# nan_input_values = base_input_values
+# nan_input_type = ['uint','uint2','uint4','uint8','uint16']
+# nan_output_type = ['float','float2','float4','float8','float16']
+# nanUtests = func('nan','nan',[nan_input_type],nan_output_type,[nan_input_values],'0 * FLT_ULP')
+
+ ##### gentype nextafter(gentype x, gentype y)
+ nextafter_base_values = base_input_values
+ nextafter_input_values1 = []
+ nextafter_input_values2 = []
+ nextafter_input_values1,nextafter_input_values2=gene2ValuesLoop(nextafter_input_values1,nextafter_input_values2,nextafter_base_values)
+ nextafter_input_type1 = ['float','float2','float4','float8','float16']
+ nextafter_input_type2 = ['float','float2','float4','float8','float16']
+ nextafter_output_type = ['float','float2','float4','float8','float16']
+ nextafterUtests = func('nextafter','nextafterf',[nextafter_input_type1,nextafter_input_type2],nextafter_output_type,[nextafter_input_values1,nextafter_input_values2],'0 * FLT_ULP')
+
+ ##### gentype pow(gentype x, gentype y)
+ pow_base_values = base_input_values1
+ pow_input_values1 = []
+ pow_input_values2 = []
+ pow_input_values1,pow_input_values2=gene2ValuesLoop(pow_input_values1,pow_input_values2,pow_base_values)
+ pow_input_type1 = ['float','float2','float4','float8','float16']
+ pow_input_type2 = ['float','float2','float4','float8','float16']
+ pow_output_type = ['float','float2','float4','float8','float16']
+ powUtests = func('pow','powf',[pow_input_type1,pow_input_type2],pow_output_type,[pow_input_values1,pow_input_values2],'16 * FLT_ULP')
+
+ ##### floatn pown(floatn x, intn y)
+ pown_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, 0.5, 1, 0.0,1500.24,-1500.24]
+ pown_input_values2 = [-1,-2,-3,4,5,6,7,8,10,12,14,16,12]
+ pown_input_type1 = ['float','float2','float4','float8','float16']
+ pown_input_type2 = ['int','int2','int4','int8','int16']
+ pown_output_type = ['float','float2','float4','float8','float16']
+ pown_cpu_func='''
+static float pown(float x, int y){
+ return pow(x,y);
+} '''
+ pownUtests = func('pown','pown',[pown_input_type1,pown_input_type2],pown_output_type,[pown_input_values1,pown_input_values2],'16 * FLT_ULP', pown_cpu_func)
+
+ ##### gentype powr(gentype x, gentype y)
+ powr_input_values1 = [80, -80, 3.14, -3.14, 0.5, 1, -1, 0.0,6,1500.24,-1500.24]
+ powr_input_values2 = [5,6,7,8,10,11,12,13,14,0,12]
+ powr_input_type1 = ['float','float2','float4','float8','float16']
+ powr_input_type2 = ['float','float2','float4','float8','float16']
+ powr_output_type = ['float','float2','float4','float8','float16']
+ powr_cpu_func='''
+static float powr(float x, int y){
+ return powf(x,y);
+} '''
+ powrUtests = func('powr','powr',[powr_input_type1,powr_input_type2],powr_output_type,[powr_input_values1,powr_input_values2],'16 * FLT_ULP', powr_cpu_func)
+
+ ##### gentype remainder(gentype x, gentype y)
+ remainder_base_values = base_input_values
+ remainder_input_values1 = []
+ remainder_input_values2 = []
+ remainder_input_values1,remainder_input_values2=gene2ValuesLoop(remainder_input_values1,remainder_input_values2,remainder_base_values)
+ remainder_input_type1 = ['float','float2','float4','float8','float16']
+ remainder_input_type2 = ['float','float2','float4','float8','float16']
+ remainder_output_type = ['float','float2','float4','float8','float16']
+ remainderUtests = func('remainder','remainder',[remainder_input_type1,remainder_input_type2],remainder_output_type,[remainder_input_values1,remainder_input_values2],'0 * FLT_ULP')
+
+ ##### gentype rint(gentype x)
+ rint_input_values = base_input_values
+ rint_input_type = ['float','float2','float4','float8','float16']
+ rint_output_type = ['float','float2','float4','float8','float16']
+ rintUtests = func('rint','rint',[rint_input_type],rint_output_type,[rint_input_values],'0 * FLT_ULP')
+
+ ##### floatn rootn(floatn x, intn y)
+ rootn_input_values1 = [0.0, 0.0012, 0.5, 1, 3.14, 12345]
+ rootn_input_values2 = [-1, 1, -20, 20, -123, 456]
+ rootn_input_type1 = ['float','float2','float4','float8','float16']
+ rootn_input_type2 = ['int','int2','int4','int8','int16']
+ rootn_output_type = ['float','float2','float4','float8','float16']
+ rootn_cpu_func='''
+static float rootn(float x, int y){
+ return pow(x,1.0/y);
+} '''
+ rootnUtests = func('rootn','rootn',[rootn_input_type1,rootn_input_type2],rootn_output_type,[rootn_input_values1,rootn_input_values2],'4 * FLT_ULP',rootn_cpu_func)
+
+ ##### gentype round(gentype x)
+ round_input_values = base_input_values
+ round_input_type = ['float','float2','float4','float8','float16']
+ round_output_type = ['float','float2','float4','float8','float16']
+ roundUtests = func('round','round',[round_input_type],round_output_type,[round_input_values],'0 * FLT_ULP')
+
+ ##### gentype rsqrt(gentype)
+ rsqrt_input_values = base_input_values
+ rsqrt_input_type = ['float','float2','float4','float8','float16']
+ rsqrt_output_type = ['float','float2','float4','float8','float16']
+ rsqrt_cpu_func='''
+static float rsqrt(float x)
+{ return 1/sqrt(x);} '''
+ rsqrtUtests = func('rsqrt','rsqrt',[rsqrt_input_type],rsqrt_output_type,[rsqrt_input_values],'4 * FLT_ULP', rsqrt_cpu_func)
+
+
+ ##### gentype sin(gentype)
+ sin_input_values = base_input_values
+ sin_input_type = ['float','float2','float4','float8','float16']
+ sin_output_type = ['float','float2','float4','float8','float16']
+ sinUtests = func('sin','sin',[sin_input_type],sin_output_type,[sin_input_values],'4 * FLT_ULP')
+
+# ##### gentype sincos(gentype)
+# sincos_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
+# sincos_input_values2 = []
+# sincos_input_type1 = ['float','float2','float4','float8','float16']
+# sincos_input_type2 = ['float','float2','float4','float8','float16']
+# sincos_output_type = ['float','float2','float4','float8','float16']
+# ###### gentype sincos(gentype)
+# # sincosUtests = func('sincos','sincos',[sincos_input_type1,sincos_input_type2],sincos_output_type,[sincos_input_values1,sincos_input_values2],'4 * FLT_ULP')
+
+ ##### gentype sinh(gentype)
+ sinh_input_values = base_input_values
+ sinh_input_type = ['float','float2','float4','float8','float16']
+ sinh_output_type = ['float','float2','float4','float8','float16']
+ sinhUtests = func('sinh','sinh',[sinh_input_type],sinh_output_type,[sinh_input_values],'4 * FLT_ULP')
+
+ ##### gentype sinpi(gentype x)
+ sinpi_input_values = [0, 1, 3.14, -0.88, -0.12, -0.5, 0.5, -0.49, 0.49, 0.51, -0.51, -0.1, 0.1]
+ sinpi_input_type = ['float','float2','float4','float8','float16']
+ sinpi_output_type = ['float','float2','float4','float8','float16']
+ sinpi_cpu_func=reduce1+sinpi
+ sinpiUtests = func('sinpi','sinpi',[sinpi_input_type],sinpi_output_type,[sinpi_input_values],'4 * FLT_ULP',sinpi_cpu_func)
+
+ ##### gentype sqrt(gentype)
+ sqrt_input_values = base_input_values
+ sqrt_input_type = ['float','float2','float4','float8','float16']
+ sqrt_output_type = ['float','float2','float4','float8','float16']
+ sqrtUtests = func('sqrt','sqrt',[sqrt_input_type],sqrt_output_type,[sqrt_input_values],'4 * FLT_ULP')
+
+ ##### gentype tan(gentype)
+ tan_input_values = base_input_values
+ tan_input_type = ['float','float2','float4','float8','float16']
+ tan_output_type = ['float','float2','float4','float8','float16']
+ tanUtests = func('tan','tan',[tan_input_type],tan_output_type,[tan_input_values],'5 * FLT_ULP')
+
+ ##### gentype tanh(gentype)
+ tanh_input_values = base_input_values
+ tanh_input_type = ['float','float2','float4','float8','float16']
+ tanh_output_type = ['float','float2','float4','float8','float16']
+ tanhUtests = func('tanh','tanh',[tanh_input_type],tanh_output_type,[tanh_input_values],'5 * FLT_ULP')
+
+ ##### gentype tanpi(gentype x)
+ tanpi_input_values = [ 0, 3.14, 5.15, 6.01, 7.89]
+ tanpi_input_type = ['float','float2','float4','float8','float16']
+ tanpi_output_type = ['float','float2','float4','float8','float16']
+ tanpi_cpu_func=reduce1+sinpi+cospi+'''
+static float tanpi(float x){
+ return sinpi(x)/cospi(x);
+}
+'''
+ tanpiUtests = func('tanpi','tanpi',[tanpi_input_type],tanpi_output_type,[tanpi_input_values],'400 * FLT_ULP',tanpi_cpu_func)
+
+ ##### gentype trunc(gentype)
+ trunc_input_values = base_input_values
+ trunc_input_type = ['float','float2','float4','float8','float16']
+ trunc_output_type = ['float','float2','float4','float8','float16']
+ truncUtests = func('trunc','trunc',[trunc_input_type],trunc_output_type,[trunc_input_values],'0 * FLT_ULP')
+
+if __name__ == "__main__":
+ main()
diff --git a/utests/utest_run.cpp b/utests/utest_run.cpp
new file mode 100644
index 0000000..cd4356a
--- /dev/null
+++ b/utests/utest_run.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest_run.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * Just run the unit tests. The user can possibly provides the subset of it
+ */
+#include "utest_helper.hpp"
+#include "utest_exception.hpp"
+#include <iostream>
+#include <getopt.h>
+
+static const char *shortopts = "c:lanh";
+struct option longopts[] = {
+{"casename", required_argument, NULL, 'c'},
+{"list", no_argument, NULL, 'l'},
+{"all", no_argument, NULL, 'a'},
+{"allnoissue", no_argument, NULL, 'n'},
+{"help", no_argument, NULL, 'h'},
+{0, 0, 0, 0},
+};
+
+void usage()
+{
+ std::cout << "\
+Usage:\n\
+ ./utest_run <option>\n\
+\n\
+ option:\n\
+ -c <casename>: run sub-case named 'casename'\n\
+ -l : list all the available case name\n\
+ -a : run all test cases\n\
+ -n : run all test cases without known issue (default option)\n\
+ -h : display this usage\n\
+\
+ "<< std::endl;
+}
+
+int main(int argc, char *argv[])
+{
+
+ int c = 0;
+ cl_ocl_init();
+
+ c = getopt_long (argc, argv, shortopts, longopts, NULL);
+
+ if (argc == 1)
+ c = 'n';
+ if (argc == 2 && c < 1 ){
+ c = 'c';
+ optarg = argv[1];
+ }
+
+ do {
+ switch (c)
+ {
+ case 'c':
+ try {
+ UTest::run(optarg);
+ }
+ catch (Exception e){
+ std::cout << " " << e.what() << " [SUCCESS]" << std::endl;
+ }
+
+ break;
+
+ case 'l':
+ UTest::listAllCases();
+ break;
+
+ case 'a':
+ try {
+ UTest::runAll();
+ }
+ catch (Exception e){
+ std::cout << " " << e.what() << " [SUCCESS]" << std::endl;
+ }
+
+ break;
+
+ case 'n':
+ try {
+ UTest::runAllNoIssue();
+ }
+ catch (Exception e){
+ std::cout << " " << e.what() << " [SUCCESS]" << std::endl;
+ }
+
+ break;
+
+ case 'h':
+ default:
+ usage();
+ exit(1);
+ }
+ } while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1);
+
+ cl_ocl_destroy();
+}
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git
More information about the Pkg-opencl-devel
mailing list