Upgrade to PRO for Only $50/Year—Limited-Time Offer! 🔥

C++でシェーダを書く

Fadis
November 09, 2024

 C++でシェーダを書く

何とかして近代的なC++の言語機能を使ってシェーダを書けないか、色々な方法を試します
これは2024年11月9日に行われた Kernel/VM探検隊@北陸 Part 7 での発表動画です
発表動画 : https://youtu.be/flH2Y6XoDZY
ソースコード : https://github.com/Fadis/gct/tree/kernelvm_20241109

Fadis

November 09, 2024
Tweet

More Decks by Fadis

Other Decks in Programming

Transcript

  1. for( int i = 0; i != 24; ++i )

    { c[ i ] = a[ i ] + b[ i ]; } a b c Ϩδελ "-6
  2. "-6 "-6 "-6 "-6 "-6 "-6 a b c SIMD

    Ϩδελʹෳ਺ͷ஋Λࡌͤͯ ୔ࢁͷALUͰҰؾʹܭࢉ͢Ε͹଎͍
  3. add.f32 r2, r1, r0; r0ʹ͋Δ32ݸͷ୯ਫ਼౓ුಈখ਺఺਺ͱ r1ʹ͋Δ32ݸͷ୯ਫ਼౓ුಈখ਺఺਺ͷ ͦΕͧΕͷཁૉΛ଍ͨ݁͠ՌΛr2ʹు͘ r0 r1 r2

    0 1024 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
  4. HLSL StructuredBuffer<float> input_buffer : register(t0); RWStructuredBuffer<float> output_buffer : register(u0); [numthreads(1,

    1, 1)] void main( uint3 index : SV_DispatchThreadID ) { output_buffer[ index.x ] = input_buffer[ index.x ] * 2.0; } GLSL #version 460 layout( binding=0 ) buffer InputBuffer { float input_buffer[]; }; layout( binding=1 ) buffer OutputBuffer { float output_buffer[]; }; layout( local_size_x=1, local_size_y=1, local_size_z=1 ) in; void main() { output_buffer[ gl_GlobalInvocationID.x ] = input_buffer[ gl_GlobalInvocationID.x ] * 2.0; }
  5. CUDA __global__ void find( float *input_buffer, float *output_buffer ) {

    const int index = blockIdx.x * blockDim.x + threadIdx.x; output_buffer[ index ] = input_buffer[ index ] * 2.0f; } OpenCL C __kernel void func( __global float *input_buffer, __global float *output_buffer ) { unsigned int index = get_global_id( 0 ); output_buffer[ index ] = input_buffer[ index ] * 2.0f; }
  6. #version 460 layout(local_size_x = 16, local_size_y = 16 ) in;

    vec3 diffuse( vec3 surface_pos, vec3 light_pos, vec3 normal, vec3 albedo, float metallicness, vec3 light_energy ) { const vec3 relative_light_pos = light_pos - surface_pos; const vec3 L = normalize( relative_light_pos ); const float light_distance2 = dot( relative_light_pos, relative_light_pos ); const float diffuse = max( dot( L, normal ), 0.0f ) / 3.141592653589793f; const vec3 linear = ( 1 - metallicness ) * diffuse * albedo * light_energy / light_distance2; return linear; } void main() { ... } GLSLͱͯ͠ਖ਼͍͠ίʔυ
  7. # define BOOST_TEST_MAIN #include <boost/test/included/unit_test.hpp> #include <glm/vec3.hpp> #include <glm/geometric.hpp> using

    namespace glm; vec3 diffuse( vec3 surface_pos, vec3 light_pos, vec3 normal, vec3 albedo, float metallicness, vec3 light_energy ) { const vec3 relative_light_pos = light_pos - surface_pos; const vec3 L = normalize( relative_light_pos ); const float light_distance2 = dot( relative_light_pos, relative_light_pos ); const float diffuse = max( dot( L, normal ), 0.0f ) / 3.141592653589793f; const vec3 linear = ( 1 - metallicness ) * diffuse * albedo * light_energy / light_distance2; return linear; } BOOST_AUTO_TEST_CASE(max_diffuse) { vec3 l = diffuse( vec3( 0.0f, 0.0f, 1.0f ), vec3( 1.0f, 0.0f, 1.0f ), vec3( 1.0f, 0.0f, 0.0f ), vec3( 1.0f, 1.0f, 1.0f ), 0.0f, vec3( 1.0f, 1.0f, 1.0f ) ); BOOST_CHECK_CLOSE( l.x, 1.f/3.141592653589793f, 0.0001f ); } ͦͷ··C++ͱͯ͠ਖ਼͍͠ίʔυ
  8. جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ܧঝ

    ςϯϓϨʔτ ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++03ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ GLSL͕஀ੜͨ͠ࠒͷঢ়گ ڞ༻ମ
  9. جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ςϯϓϨʔτ

    ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++11ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর nullptr εϨουϩʔΧϧ ετϨʔδ range based for scoped enum ܧঝ ڞ༻ମ variadic templates
  10. جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ςϯϓϨʔτ

    ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++14ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর εϨουϩʔΧϧ ετϨʔδ range based for scoped enum generic lambda ໭Γ஋ܕਪ࿦ ू੒ମ ܧঝ ڞ༻ମ nullptr variadic templates
  11. جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ςϯϓϨʔτ

    ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++17ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর εϨουϩʔΧϧ ετϨʔδ range based for scoped enum ू੒ମ if constexpr constexpr lambda ߏ଄Խଋറ ৞ΈࠐΈࣜ ඪ४ଐੑ ܧঝ ڞ༻ମ generic lambda ໭Γ஋ܕਪ࿦ nullptr variadic templates
  12. جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ςϯϓϨʔτ

    ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++20ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর εϨουϩʔΧϧ ετϨʔδ range based for scoped enum ू੒ମ constexpr lambda ߏ଄Խଋറ ඪ४ଐੑ ࡾํൺֱԋࢉࢠ ࢦࣔ෇͖ ॳظԽ consteval ίϧʔνϯ Ϟδϡʔϧ ίϯηϓτ ܧঝ ڞ༻ମ if constexpr ৞ΈࠐΈࣜ generic lambda ໭Γ஋ܕਪ࿦ nullptr variadic templates
  13. جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ςϯϓϨʔτ

    ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++23ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর εϨουϩʔΧϧ ετϨʔδ range based for scoped enum ू੒ମ constexpr lambda ඪ४ଐੑ ࡾํൺֱԋࢉࢠ consteval ίϧʔνϯ Ϟδϡʔϧ ίϯηϓτ ଟ࣍ݩ ఴࣈԋࢉࢠ bfloat16 deducing this decay copy if consteval ςϯϓϨʔτ Ҿ਺ਪ࿦ ͭΒ͍ ܧঝ ڞ༻ମ ࢦࣔ෇͖ ॳظԽ if constexpr ৞ΈࠐΈࣜ generic lambda ໭Γ஋ܕਪ࿦ variadic templates nullptr ߏ଄Խଋറ
  14. OpenGL 4.5 DirectX 11 DirectX 12 OpenGL 4.6 OpenCL CUDA

    Vulkan GLSLͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ HLSLͰ͍ͩ͘͞ DXILͰ͍ͩ͘͞ OpenCL CͰ͍ͩ͘͞ C++ͷίʔυʹҰॹʹॻ͍͍ͯͩ͘͞ GPUͰ࣮ߦ͢Δίʔυͷ౉͠ํ
  15. OpenGL 4.5 DirectX 11 DirectX 12 OpenGL 4.6 OpenCL CUDA

    Vulkan GLSLͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ HLSLͰ͍ͩ͘͞ DXILͰ͍ͩ͘͞ OpenCL CͰ͍ͩ͘͞ C++ͷίʔυʹҰॹʹॻ͍͍ͯͩ͘͞ GPUͰ࣮ߦ͢Δίʔυͷ౉͠ํ void main() { vec3 normal = normalize( inpu t_normal.xyz ); vec3 pos = input_position. xyz; vec3 N = normal; ߴڃݴޠ a b × + 3 AST ࣮ߦՄೳόΠφϦ γϦΞϥΠζ ࣈ۟ղੳ ߏจղੳ SPIR-V ߴڃݴޠͷASTΛ όΠφϦܗࣜͰγϦΞϥΠζͨ͠෺ ߴڃݴޠΑΓߴ଎ʹύʔεͰ͖Δ
  16. OpenGL 4.5 DirectX 11 DirectX 12 OpenGL 4.6 OpenCL CUDA

    Vulkan GLSLͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ HLSLͰ͍ͩ͘͞ DXILͰ͍ͩ͘͞ OpenCL CͰ͍ͩ͘͞ C++ͷίʔυʹҰॹʹॻ͍͍ͯͩ͘͞ GPUͰ࣮ߦ͢Δίʔυͷ౉͠ํ Ͱ΋GLSLͰͳͯ͘΋SPIR-VʹͰ͖ΔߴڃݴޠͳΒ Vulkanʹ৯΂ͤ͞Δࣄ͕Ͱ͖Δ GLSLΛSPIR-Vʹ͢ΔϦϑΝϨϯείϯύΠϥ glslc͕ଘࡏ͢Δ
  17. HLSL 2021ͷมߋ఺ • C++ͷΑ͏ͳςϯϓϨʔτؔ਺ͱςϯϓϨʔτΫϥε • C++ͷΑ͏ͳԋࢉࢠΦʔόʔϩʔυ • C++ͷΑ͏ͳϏοτϑΟʔϧυ • Ϣʔβఆٛܕͷ҉໧ͷΩϟετͷৼΔ෣͍ΛC++ʹ߹Θͤͨ

    • ࿦ཧԋࢉͷ୹བྷධՁͷৼΔ෣͍ΛC++ʹ߹Θͤͨ • forϧʔϓ಺Ͱએݴ͞Εͨม਺ͷείʔϓΛC++ʹ߹Θͤͨ HLSL͕C++ʹد͖ͤͯͨ
  18. কདྷͷHLSLʹ޲͚ͯఏҊ͞Ε͍ͯΔػೳ • C++ͷΑ͏ͳඪ४ଐੑ • C++ͷΑ͏ͳnumeric_limits • C++ͷΑ͏ͳڞ༻ମ • C++ͷΑ͏ͳॳظԽࢠϦετ •

    C++ͷΑ͏ͳࢀরܕ • C++ͷΑ͏ͳconstϝϯόؔ਺ • C++ͷΑ͏ͳඇϝϯόԋࢉࢠΦʔόʔϩʔυ • C++ͷΑ͏ͳautoɺdecltypeɺconstexprɺ scoped enumɺvariadic templatesɺϢʔβ ఆٛϦςϥϧɺϥϜμࣜɺstatic assertɺ range-based for https://github.com/microsoft/hlsl-specs/tree/main/proposals
  19. RWStructuredBuffer<float> a; RWStructuredBuffer<float> b; template<typename T> T add( T x,

    T y ) { return x + y; } [numthreads(256, 1, 1)] void main( uint2 id : SV_DispatchThreadID ) { uint index = id.x; a[index] = add(b[index], b[index + 1]); } GPUͷϝϞϦ্ʹ֬อͨ͠ όοϑΝ͕͜͜ʹϦϯΫ͞ΕΔ ςϯϓϨʔτؔ਺ 256x1x1εϨουΛ local workgroupͱ͢Δ T=floatͰςϯϓϨʔτؔ਺Λݺͼग़͠
  20. function(add_shader TARGET SHADER) set(current-shader-path ${CMAKE_CURRENT_SOURCE_DIR}/${SHADER}) set(current-output-path ${CMAKE_CURRENT_BINARY_DIR}/${SHADER}.spv) get_filename_component(current-output-dir ${current-output-path} DIRECTORY)

    file(MAKE_DIRECTORY ${current-output-dir}) if( "${SHADER}" MATCHES "\.comp\.hlsl$" ) string( REPLACE ".comp.hlsl.spv" ".comp.spv" current-output-path "${current-output-path}" ) add_custom_command( OUTPUT ${current-output-path} COMMAND ${DXC} -spirv -fspv-target-env=vulkan1.3 ${current-shader-path} -T cs_6_8 -Fo ${current-output-path} DEPENDS ${current-shader-path} IMPLICIT_DEPENDS CXX ${current-shader-path} VERBATIM) elseif( "${SHADER}" MATCHES "\.vert\.hlsl$" ) ϑΝΠϧ໊͕.hlslͰऴΘ͍ͬͯͨΒ glslcͰ͸ͳ͘dxcͰίϯύΠϧ͢Δ ϧʔϧΛCMakeʹ௥Ճ DXCʹVulkan 1.3༻ͷSPIR-VΛ ग़ྗ͢ΔΑ͏ཁٻ
  21. $ spirv-reflect ./src/example/hlsl/test.comp.spv generator : Google spiregg source lang :

    HLSL source lang ver : 680 source file : entry point : main (stage=CS) local size : (256, 1, 1) Input variables: 1 0: spirv id : 2 location : (built-in) GlobalInvocationId type : uint3 semantic : name : qualifier : Descriptor bindings: 2 Binding 0.0 spirv id : 3 set : 0 ੜ੒͞ΕͨSPIR-VͷϦϑϨΫγϣϯ৘ใΛμϯϓ
  22. Descriptor bindings: 2 Binding 0.0 spirv id : 3 set

    : 0 binding : 0 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : a (type.RWStructuredBuffer.float) // size = 0, padded size = 0 struct type.RWStructuredBuffer.float { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 } a; Binding 0.1 spirv id : 4 set : 0 binding : 1 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : b (type.RWStructuredBuffer.float) // size = 0, padded size = 0 γΣʔμ͸"a"ͱ͍͏໊લͷ֎෦ͷfloatܕͷ഑ྻΛಡΈॻ͖͢Δ ࣮ߦ͢Δࡍʹ͸GPU্ʹϝϞϦΛ֬อͯ͠ "a"ʹ݁ͼ͚ͭΔඞཁ͕͋Δ RWStructuredBuffer<float> a; RWStructuredBuffer<float> b;
  23. } a; Binding 0.1 spirv id : 4 set :

    0 binding : 1 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : b (type.RWStructuredBuffer.float) // size = 0, padded size = 0 struct type.RWStructuredBuffer.float { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 } b; γΣʔμ͸"b"ͱ͍͏໊લͷ֎෦ͷfloatܕͷ഑ྻΛಡΈॻ͖͢Δ ࣮ߦ͢Δࡍʹ͸GPU্ʹϝϞϦΛ֬อͯ͠ "b"ʹ݁ͼ͚ͭΔඞཁ͕͋Δ RWStructuredBuffer<float> a; RWStructuredBuffer<float> b;
  24. const auto input_data = res.allocator->create_mappable_buffer( sizeof( float ) * 260,

    vk::BufferUsageFlagBits::eStorageBuffer ); const auto output_data = res.allocator->create_mappable_buffer( sizeof( float ) * 260, vk::BufferUsageFlagBits::eStorageBuffer ); gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache( res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) .add_resource( { "a", output_data } ) .add_resource( { "b", input_data } ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; GPUͷϝϞϦʹόοϑΝinput_dataͱoutput_dataΛ֬อ
  25. sizeof( float ) * 260, vk::BufferUsageFlagBits::eStorageBuffer ); const auto output_data

    = res.allocator->create_mappable_buffer( sizeof( float ) * 260, vk::BufferUsageFlagBits::eStorageBuffer ); gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache( res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) .add_resource( { "a", output_data } ) .add_resource( { "b", input_data } ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } auto command_buffer = res.queue->get_command_pool()->allocate(); ໊લ͕"a"ͷσεΫϦϓλʹoutput_dataΛ݁ͼ͚ͭΔ ໊લ͕"b"ͷσεΫϦϓλʹinput_dataΛ݁ͼ͚ͭΔ ઌ΄ͲͷHLSL͔Β࡞ͬͨSPIR-VΛ࣮ߦ͢Δҝͷ ίϯϐϡʔτύΠϓϥΠϯΛ࡞Δ
  26. ); gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool )

    .set_pipeline_cache( res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) .add_resource( { "a", output_data } ) .add_resource( { "b", input_data } ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); comp( rec, 0, 256, 1, 1 ); CPU্ͷεςʔδϯάόοϑΝʹinput_dataʹॻ͘஋Λฒ΂Δ
  27. mapped[ i ] = i; } } auto command_buffer =

    res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); } const auto end = std::chrono::high_resolution_clock::now( CPU্ͷεςʔδϯάόοϑΝͷ಺༰ΛGPU্ͷinput_dataʹసૹ ίϯϐϡʔτύΠϓϥΠϯΛ256εϨουͰ࣮ߦ GPU্ͷoutput_dataͷ಺༰ΛCPU্ͷεςʔδϯάόοϑΝʹసૹ
  28. $ ./src/example/hlsl/hlsl 1 3 5 7 ... 507 509 511

    0 0 0 0 RWStructuredBuffer<float> a; RWStructuredBuffer<float> b; template<typename T> T add( T x, T y ) { return x + y; } [numthreads(256, 1, 1)] void main( uint2 id : SV_DispatchThreadID ) { uint index = id.x; a[index] = add(b[index], b[index + 1]); } HLSL 2021Λ࢖͏ͱGLSLΑΓগ͚ͩ͠ C++ͱڞ௨ͷίʔυΛॻ͖΍͘͢ͳΔ
  29. https://www.khronos.org/opencl/assets/CXX_for_OpenCL.html C++ for OpenCL C++ for OpenCL 1.0 OpenCL C

    2.0ʹC++03ͷػೳͷ͏ͪ ༰қʹ௥ՃͰ͖Δ෺Λ௥Ճͨ͠෺ C++ for OpenCL 2021 OpenCL C 3.0ʹC++17ͷػೳͷ͏ͪ ༰қʹ௥ՃͰ͖Δ෺Λ௥Ճͨ͠෺
  30. __kernel void test(__global float *a, __global float *b) { const

    auto index = get_global_id( 0 ); const auto add = []( auto x, auto y ) __private { return x + y; }; a[index] = add( b[index], b[index + 1] ); } Generic Lambda [ C++14 ]
  31. IMPLICIT_DEPENDS CXX ${current-shader-path} VERBATIM) elseif( "${SHADER}" MATCHES "\.cl$" ) string(

    REPLACE ".cl.spv" ".comp.spv" current-output-path "${current-output-path}" ) add_custom_command( OUTPUT ${current-output-path} COMMAND ${CLSPV} --cl-std=CLC++2021 -inline-entry-points -- spv-version=1.6 ${current-shader-path} -o ${current-output-path} DEPENDS ${current-shader-path} IMPLICIT_DEPENDS CXX ${current-shader-path} VERBATIM) else() add_custom_command( OUTPUT ${current-output-path} COMMAND ${GLSLC} -o ${current-output-path} ${current-shader- ϑΝΠϧ໊͕.clͰऴΘ͍ͬͯͨΒ glslcͰ͸ͳ͘clspvͰίϯύΠϧ͢Δ ϧʔϧΛCMakeʹ௥Ճ clspvʹ C++ for OpenCL 2021Λ Vulkan 1.3༻ͷSPIR-Vʹ ͢ΔΑ͏ཁٻ
  32. $ spirv-reflect ./src/example/cl/test.comp.spv generator : ??? source lang : OpenCL_CPP

    source lang ver : 202100 source file : entry point : test (stage=CS) local size : (0, 0, 0) Input variables: 1 0: spirv id : 7 location : (built-in) GlobalInvocationId type : uint3 semantic : name : qualifier : Push constant blocks: 1 0: spirv id : 5 ੜ੒͞ΕͨSPIR-VͷϦϑϨΫγϣϯ৘ใΛμϯϓ
  33. Push constant blocks: 1 0: spirv id : 5 name

    : <unnamed> // size = 16, padded size = 16 struct <unnamed> { uint3 ; // abs offset = 0, rel offset = 0, size = 12, padded size = 16 } <unnamed>; Descriptor bindings: 2 Binding 0.0 spirv id : 18 set : 0 binding : 0 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : <unnamed> // size = 0, padded size = 0 struct <unnamed> { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 push constantͰූ߸ͳ͠੔਺3ཁૉͷϕΫτϧΛ౉͢ඞཁ͕͋Δ OpenCLͷίʔυଆͰget_global_idΛݺͿͱ ίϯϐϡʔτύΠϓϥΠϯ͔Β౉ͬͯ͘ΔεϨουIDʹpush constantͷ஋Λ ଍ͨ͠஋͕ฦͬͯ͘Δ Ұ౓ʹ࣮ߦͰ͖ͳ͍͘Β͍େྔͷεϨουΛ ෳ਺ճʹ෼ׂ࣮ͯ͠ߦ͢Δͷʹศར
  34. Descriptor bindings: 2 Binding 0.0 spirv id : 18 set

    : 0 binding : 0 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : <unnamed> // size = 0, padded size = 0 struct <unnamed> { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 } <unnamed>; Binding 0.1 spirv id : 19 set : 0 binding : 1 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : <unnamed> // size = 0, padded size = 0 __kernel void test(__global float *a, __global float *b) { __kernelͳؔ਺ͷҾ਺͕σεΫϦϓλʹͳΔ 0൪໨ͷҾ਺͕binding=0ʹͳΔ ม਺໊ΛϦϑϨΫγϣϯ৘ใʹೖΕͯ͘Εͳ͍ͷͰ binding൪߸Ͱ݁ͼ͚ͭΔඞཁ͕͋Δ
  35. } <unnamed>; Binding 0.1 spirv id : 19 set :

    0 binding : 1 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : <unnamed> // size = 0, padded size = 0 struct <unnamed> { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 } <unnamed>; __kernel void test(__global float *a, __global float *b) { 1൪໨ͷҾ਺΋ಉ༷
  36. $ ./src/dump_spirv -i src/example/cl/test.comp.spv { "descriptor_binding_count": 2, "descriptor_bindings": [ ...

    "spec_constants": [ { "constant_id": 0, "spirv_id": 8 }, { "constant_id": 1, "spirv_id": 9 }, { "constant_id": 2, "spirv_id": 10 } ], "spirv_execution_model": "GLCompute" } ಛघԽఆ਺͕3ؚͭ·Ε͍ͯΔ ಛघԽఆ਺ͱ͸: γΣʔμͷίϯύΠϧ࣌ʹ஋ΛܾΊΒΕΔఆ਺ ίʔυʹఆ਺͕௚ॻ͖͞Ε͍ͯΔͷͱ ಉ౳ͷ࠷దԽ͕ߦΘΕΔ OpenCLͰ͸LocalWorkgroupͷαΠζΛ ίʔυ্ʹॻ͔ͳ͍ͷͰ ಛघԽఆ਺Ͱࢦఆ͢Δ
  37. gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache(

    res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) .add_resource( { 0, output_data } ) .add_resource( { 1, input_data } ) .add_spec( 0, 256 ) .add_spec( 1, 1 ) .add_spec( 2, 1 ) .set_dim( 256, 1, 1 ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); binding=0ͷσεΫϦϓλʹoutput_dataΛ݁ͼ͚ͭΔ binding=1ͷσεΫϦϓλʹinput_dataΛ݁ͼ͚ͭΔ ಛघԽఆ਺ʹ(256,1,1)Ληοτ͢Δ ϦϑϨΫγϣϯ͔ΒLocalWorkgroupͷαΠζ͕औΕͳ͍ͷͰ खಈͰ(256,1,1)Ληοτ͢Δ
  38. } auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now();

    { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); glm::ivec3 offset( 0, 0, 0 ); rec->pushConstants( **comp.get_pipeline()->get_props().get_layout(), comp.get_pipeline()->get_props().get_layout()- >get_props().get_push_constant_range()[ 0 ].stageFlags, 0u, sizeof( glm::ivec3 ), &offset ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); push constantʹεϨουIDͷΦϑηοτ(0,0,0)Ληοτ͢Δ
  39. auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer()

    }, {} ); glm::ivec3 offset( 0, 0, 0 ); rec->pushConstants( **comp.get_pipeline()->get_props().get_layout(), comp.get_pipeline()->get_props().get_layout()- >get_props().get_push_constant_range()[ 0 ].stageFlags, 0u, sizeof( glm::ivec3 ), &offset ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); } const auto end = std::chrono::high_resolution_clock::now(); ࣮ߦ
  40. $ ./src/example/cl/cl 1 3 5 7 ... 507 509 511

    0 0 0 0 __kernel void test(__global float *a, __global float *b) { const auto index = get_global_id( 0 ); const auto add = []( auto x, auto y ) __private { return x + y; }; a[index] = add( b[index], b[index + 1] ); } C++ for OpenCL 2021Λ࢖͏ͱ HLSL 2021ΑΓଟ͘ͷC++ͷػೳ͕࢖͑Δ ͨͩ͠ίϯϐϡʔτύΠϓϥΠϯઐ༻
  41. __global__ void test( float *a, float *b ) { const

    int index = blockIdx.x * blockDim.x + threadIdx.x; const auto add = []( auto x, auto y ) { return x + y; }; a[index] = add( b[index], b[index + 1] ); } __kernel void test(__global float *a, __global float *b) { const auto index = get_global_id( 0 ); const auto add = []( auto x, auto y ) __private { return x + y; }; a[index] = add( b[index], b[index + 1] ); } CUDA ੍ݶ෇͖C++17 C++ for OpenCL 2021 ੍ݶ෇͖C++17
  42. ͜ͷϢχοτ1ݸʹ͖ͭϓϩάϥϜΧ΢ϯλ͕1ݸ __kernel void test(__global float *a, __global float *b) {

    const auto index = get_global_id( 0 ); if( index % 2 == 1 ) { a[ index ] = b[ index ]; } else { a[ index ] = 0.0f; } } ෼ذͨ͘͠ͳͬͨΒͲ͏͢Δ? μΠόʔδΣϯε
  43. __kernel void test(__global float *a, __global float *b) { const

    auto index = get_global_id( 0 ); if( index % 2 == 1 ) { a[ index ] = b[ index ]; } else { a[ index ] = 0.0f; } }        ⋯ index        ⋯ index % 2 a[ index ] = b[ index ] B<> B<> B<> ⋯ a[ index ] = 0.0f     ⋯  B<>  B<>  B<>  ෼ذ৚݅ͰϚεΫΛ࡞Δ ྆ํͷϒϩοΫΛ࣮ߦ͢Δ ϚεΫʹج͍ͮͯͲͪΒͷ݁ՌΛ࠾༻͢Δ͔ܾΊΔ a[ index ]
  44. ؒ઀෼ذͩͬͨΒ? __device__ float func1() { return 3.0f; } __device__ float

    func2() { return 5.0f; } __global__ void find( float *input_buffer, float *output_buffer ) { const int index = blockIdx.x * blockDim.x + threadIdx.x; const auto func = ( index % 2 ) ? &func1 : func2; output_buffer[ index ] = func(); }
  45. __device__ float func1() { return 3.0f; } __device__ float func2()

    { return 5.0f; } __global__ void find( float *input_buffer, float *output_buffer ) { const int index = blockIdx.x * blockDim.x + threadIdx.x; const auto func = ( index % 2 ) ? &func1 : func2; output_buffer[ index ] = func(); } GVOD GVOD GVOD GVOD GVOD GVOD GVOD ⋯ func = ඈͼઌͷΞυϨε͔ΒϚεΫΛ࡞Γɺશͯͷؔ਺ΛݺͿ func1()ΛݺͿ func2()ΛݺͿ        ⋯        ⋯        ⋯ ϚεΫͰ݁ՌΛબͿ
  46. #include <shady.h> using namespace vcc; extern "C" compute_shader local_size(256, 1,

    1) void main( float *a, float *b ) { a[ gl_GlobalInvocationID.x ] = b[ gl_GlobalInvocationID.x ] + b[ gl_GlobalInvocationID.x + 1 ]; } #define vertex_shader __attribute__((annotate("shady::entry_point::Vertex"))) #define fragment_shader __attribute__((annotate("shady::entry_point::Fragment"))) #define compute_shader __attribute__((annotate("shady::entry_point::Compute"))) #define location(i) __attribute__((annotate("shady::location::"#i))) #define descriptor_set(i) __attribute__((annotate("shady::descriptor_set::"#i))) #define descriptor_binding(i) __attribute__((annotate("shady::descriptor_binding::"#i))) #define local_size(x, y, z) __attribute__((annotate("shady::workgroup_size::"#x"::"#y"::"#z))) ϚΫϩ͕ల։͞Εͯ __attribute__ʹͳΓ ίϯύΠϥʹSPIR-VΛుͨ͘Ίʹ ඞཁͳ৘ใ͕ૹΒΕΔ
  47. $ spirv-reflect test.comp.spv generator : ??? source lang : Unknown

    source lang ver : 0 source file : entry point : main (stage=CS) local size : (256, 1, 1) Input variables: 1 0: spirv id : 25 location : (built-in) GlobalInvocationId type : uint3 semantic : name : _ZN3vcc21gl_GlobalInvocationIDE qualifier : Push constant blocks: 1 ੜ੒͞ΕͨSPIR-VͷϦϑϨΫγϣϯ৘ใΛμϯϓ
  48. Push constant blocks: 1 0: spirv id : 5 name

    : __main_args // size = 16, padded size = 16 struct <unnamed> { uint ; // abs offset = 0, rel offset = 0, size = 8, padded size = 8 uint ; // abs offset = 8, rel offset = 8, size = 8, padded size = 8 } __main_args; mainؔ਺ͷҾ਺͕push constantʹͳ͍ͬͯΔ 64bitͷϙΠϯλ2ͭͳͷͰ όοϑΝͷGPUଆ͔ΒݟͨΞυϨεΛͱͬͯ push constantʹಥͬࠐΊ͹ྑ͍
  49. gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache(

    res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } const auto input_data_addr = input_data->get_buffer()->get_address(); const auto output_data_addr = output_data->get_buffer()->get_address(); auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); σεΫϦϓλͳ͠ͰίϯϐϡʔτύΠϓϥΠϯΛ࡞Δ
  50. gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache(

    res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } const auto input_data_addr = input_data->get_buffer()->get_address(); const auto output_data_addr = output_data->get_buffer()->get_address(); auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); ೖྗ஋༻ͷόοϑΝͱग़ྗ஋༻ͷόοϑΝͷ GPU͔Βݟͨઌ಄ΞυϨεΛऔಘ͢Δ
  51. { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( {

    input_data->get_buffer() }, {} ); std::array< vk::DeviceAddress, 2u > addrs{ *output_data_addr, *input_data_addr }; rec->pushConstants( **comp.get_pipeline()->get_props().get_layout(), comp.get_pipeline()->get_props().get_layout()- >get_props().get_push_constant_range()[ 0 ].stageFlags, 0u, sizeof( vk::DeviceAddress ) * addrs.size(), addrs.data() ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); } push constantʹόοϑΝͷΞυϨεΛॻ͘
  52. rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); std::array<

    vk::DeviceAddress, 2u > addrs{ *output_data_addr, *input_data_addr }; rec->pushConstants( **comp.get_pipeline()->get_props().get_layout(), comp.get_pipeline()->get_props().get_layout()- >get_props().get_push_constant_range()[ 0 ].stageFlags, 0u, sizeof( vk::DeviceAddress ) * addrs.size(), addrs.data() ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); } const auto end = std::chrono::high_resolution_clock::now(); ࣮ߦ
  53. $ ./src/example/vcc/vcc 1 3 5 7 ... 507 509 511

    0 0 0 0 #include <shady.h> using namespace vcc; extern "C" compute_shader local_size(256, 1, 1) void main( float *a, float *b ) { a[ gl_GlobalInvocationID.x ] = b[ gl_GlobalInvocationID.x ] + b[ gl_GlobalInvocationID.x + 1 ]; }
  54. vcc͸C++ΛLLVM IRʹͨ͠ޙCPSม׵Λߦ͏ int f( int a ) { a +=

    1; if( a == 3 ) { a = 0; } else { a = g( a ); } return a; } f0 f1 f2 f3 ؔ਺Λbasic blockຖʹผͷؔ਺ʹ͢Δ ֤ؔ਺͸࣍ʹ࣮ߦ͍ͨؔ͠਺Λ ΩϡʔʹੵΉ ϩʔΧϧม਺ ͸ڞ༗ϝϞϦͰ࣍ͷؔ਺ʹ౉͞ΕΔ ͸ ͱ ΛΩϡʔʹੵΉ Ωϡʔʹੵ·Εͨؔ਺͸શ෦࣮ߦ͢Δ f0 f1 f2 ͱ ͸ڞʹ ΛΩϡʔʹੵΉͷͰ ͜͜Ͱ྆ऀ͔Βདྷͨม਺Λ౷߹͢Δ f1 f2 f3
  55. vcc͸C++ΛLLVM IRʹͨ͠ޙCPSม׵Λߦ͏ int f( int a ) { a +=

    1; if( a == 3 ) { a = 0; } else { a = g( a ); } return a; } f0 f1 f2 f3 ؔ਺Λbasic blockຖʹผͷؔ਺ʹ͢Δ ֤ؔ਺͸࣍ʹ࣮ߦ͍ͨؔ͠਺Λ ΩϡʔʹੵΉ ϩʔΧϧม਺ ͸ڞ༗ϝϞϦͰ࣍ͷؔ਺ʹ౉͞ΕΔ ͸ ͱ ΛΩϡʔʹੵΉ Ωϡʔʹੵ·Εͨؔ਺͸શ෦࣮ߦ͢Δ f0 f1 f2 ͱ ͸ڞʹ ΛΩϡʔʹੵΉͷͰ ͜͜Ͱ྆ऀ͔Βདྷͨม਺Λ݁߹͢Δ f1 f2 f3 ར఺1: NVIDIAͷಛڐΛճආͯؒ͠઀෼ذ͕Ͱ͖Δ ར఺2: ͲΜͳʹܹ͘͠μΠόʔδΣϯε͕ੜͯ͡΋ ੑೳΛ٘ਜ਼ʹ࣮ͯ͠ߦͰ͖Δ
  56. #include <shady.h> using namespace vcc; extern "C" compute_shader local_size(256, 1,

    1) void main( float *a, float *b ) { const auto index = gl_GlobalInvocationID.x; const auto add = []< typename T, typename U >( T x, U y ) { return x + y; }; a[index] = add( b[index], b[index + 1] ); } Familiar template syntax for generic lambdas [ C++20 ] C++ΛLLVM IRʹ͢Δॴ·Ͱ͸ͨͩͷclangͳͷͰ ͜ΜͳίʔυΛॻ͍ͯ΋ίϯύΠϧ͸௨Δ
  57. $ ./src/example/vcc/vcc terminate called after throwing an instance of 'vk::DeviceLostError'

    what(): wait_for_executed failed.: ErrorDeviceLost தࢭ (ίΞμϯϓ) #include <shady.h> using namespace vcc; extern "C" compute_shader local_size(256, 1, 1) void main( float *a, float *b ) { const auto index = gl_GlobalInvocationID.x; const auto add = []< typename T, typename U >( T x, U y ) { return x + y; }; a[index] = add( b[index], b[index + 1] ); } ͳ͔ͥ ແݶϧʔϓʹ ؕΔ
  58. #include <shady.h> using namespace vcc; descriptor_set(0) descriptor_binding(1) uniform_constant extern float

    a[]; descriptor_set(0) descriptor_binding(2) uniform_constant extern float b[]; extern "C" compute_shader local_size(256, 1, 1) void main() { a[ gl_GlobalInvocationID.x ] = b[ gl_GlobalInvocationID.x ] + b[ gl_GlobalInvocationID.x + 1 ]; } σεΫϦϓλΛ࢖͏ͱ
  59. ·ͱΊ CPUͱGPUͰಉ͡ίʔυΛ࣮ߦ͍ͨ࣌͠GPUଆͰ࢖͏ݴޠ GLSL HLSL 2021 C++ for OpenCL 2021 CUDA

    C++ GLSLΑΓϚγ άϥϑΟΫεύΠϓϥΠϯ΋OK ϕϯμʔඇґଘͰίϯϐϡʔτύΠϓϥΠϯ͚͕ͩ ಈ͚͹ྑ͍ͳΒ͜Ε NVIDIAͷGPUͰίϯϐϡʔτύΠϓϥΠϯ͚͕ͩ ಈ͚͹ྑ͍ͳΒ͜Ε ͭΒ͍ ਓྨͷເɺάϥϑΟΫεύΠϓϥΠϯ΋OK ͔͠͠ݱঢ়ίϯύΠϥͷಈ͖͕ո͍͠