C++でシェーダを書く

C++ͰγΣʔμΛॻ͘ NAOMASA MATSUBAYASHI

for( int i = 0; i != 24; ++i )
{ c[ i ] = a[ i ] + b[ i ]; } a b c Ϩδελ "-6

"-6 "-6 "-6 "-6 "-6 "-6 a b c SIMD
Ϩδελʹෳ਺ͷ஋Λࡌͤͯ ୔ࢁͷALUͰҰؾʹܭࢉ͢Ε͹଎͍

Tensor Core ϩʔυετΞ σΟεύον໋ྩΩϟογϡ ϨδελόϯΫ ALU 1024bitͷϨδελʹ 32bitුಈখ਺఺਺Λ32ݸฒ΂ͯ 1໋ྩͰ32ݸͷ஋Λܭࢉ͠Α͏

͜ͷϓϩηοαΛ େྔʹฒ΂ͨΒ େྔͷσʔλΛߴ଎ʹॲཧͰ͖Δ GPU

1໋ྩσίʔυͨ͠Β 32ݸͷԋࢉث͕ ܭࢉΛߦ͏ Tensor Core ϩʔυετΞ σΟεύον໋ྩΩϟογϡ ϨδελόϯΫ

add.f32 r2, r1, r0; r0ʹ͋Δ32ݸͷ୯ਫ਼౓ුಈখ਺఺਺ͱ r1ʹ͋Δ32ݸͷ୯ਫ਼౓ුಈখ਺఺਺ͷ ͦΕͧΕͷཁૉΛ଍ͨ݁͠ՌΛr2ʹు͘ r0 r1 r2
0 1024 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =

ΞηϯϒϦΛ௚઀ॻ͘ͷ͸ਏ͍ͷͰ ߴڃݴޠ͕ཉ͍͠

HLSL StructuredBuffer<float> input_buffer : register(t0); RWStructuredBuffer<float> output_buffer : register(u0); [numthreads(1,
1, 1)] void main( uint3 index : SV_DispatchThreadID ) { output_buffer[ index.x ] = input_buffer[ index.x ] * 2.0; } GLSL #version 460 layout( binding=0 ) buffer InputBuffer { float input_buffer[]; }; layout( binding=1 ) buffer OutputBuffer { float output_buffer[]; }; layout( local_size_x=1, local_size_y=1, local_size_z=1 ) in; void main() { output_buffer[ gl_GlobalInvocationID.x ] = input_buffer[ gl_GlobalInvocationID.x ] * 2.0; }

CUDA __global__ void find( float *input_buffer, float *output_buffer ) {
const int index = blockIdx.x * blockDim.x + threadIdx.x; output_buffer[ index ] = input_buffer[ index ] * 2.0f; } OpenCL C __kernel void func( __global float *input_buffer, __global float *output_buffer ) { unsigned int index = get_global_id( 0 ); output_buffer[ index ] = input_buffer[ index ] * 2.0f; }

GPUݻ༗ͷ੍໿ʹ߹ΘͤΔҝ CPUͱ͸ҟͳΔߴڃݴޠ͕༻͍ΒΕΔ

GPU޲͚ͷߴڃݴޠͱCPU޲͚ͷߴڃݴޠ͸ҧ͏͚Ͳ GPU޲͚ʹॻ͍ͨίʔυΛCPUͰ࣮ߦ͍ͨ͠ GPUͰ࣮ߦ͢Δؔ਺ʹର͢ΔϢχοτςετΛॻ͖͍ͨ GPU༻ͷ࣮૷ͱCPU༻ͷ࣮૷ͷ ίʔυϕʔεΛڞ௨ʹ͍ͨ͠

#version 460 layout(local_size_x = 16, local_size_y = 16 ) in;
vec3 diffuse( vec3 surface_pos, vec3 light_pos, vec3 normal, vec3 albedo, float metallicness, vec3 light_energy ) { const vec3 relative_light_pos = light_pos - surface_pos; const vec3 L = normalize( relative_light_pos ); const float light_distance2 = dot( relative_light_pos, relative_light_pos ); const float diffuse = max( dot( L, normal ), 0.0f ) / 3.141592653589793f; const vec3 linear = ( 1 - metallicness ) * diffuse * albedo * light_energy / light_distance2; return linear; } void main() { ... } GLSLͱͯ͠ਖ਼͍͠ίʔυ

# define BOOST_TEST_MAIN #include <boost/test/included/unit_test.hpp> #include <glm/vec3.hpp> #include <glm/geometric.hpp> using
namespace glm; vec3 diffuse( vec3 surface_pos, vec3 light_pos, vec3 normal, vec3 albedo, float metallicness, vec3 light_energy ) { const vec3 relative_light_pos = light_pos - surface_pos; const vec3 L = normalize( relative_light_pos ); const float light_distance2 = dot( relative_light_pos, relative_light_pos ); const float diffuse = max( dot( L, normal ), 0.0f ) / 3.141592653589793f; const vec3 linear = ( 1 - metallicness ) * diffuse * albedo * light_energy / light_distance2; return linear; } BOOST_AUTO_TEST_CASE(max_diffuse) { vec3 l = diffuse( vec3( 0.0f, 0.0f, 1.0f ), vec3( 1.0f, 0.0f, 1.0f ), vec3( 1.0f, 0.0f, 0.0f ), vec3( 1.0f, 1.0f, 1.0f ), 0.0f, vec3( 1.0f, 1.0f, 1.0f ) ); BOOST_CHECK_CLOSE( l.x, 1.f/3.141592653589793f, 0.0001f ); } ͦͷ··C++ͱͯ͠ਖ਼͍͠ίʔυ

جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ܧঝ
ςϯϓϨʔτ ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++03ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ GLSL͕஀ੜͨ͠ࠒͷঢ়گ ڞ༻ମ

جຊతͳԋࢉ ؔ਺ Φʔόʔϩʔυ ߏ଄ମ ϓϦϓϩηοα ਺஋ܕ αϯϓϥʔ ਫฏԋࢉ ಉظ ςϯϓϨʔτ
ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++11ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর nullptr εϨουϩʔΧϧ ετϨʔδ range based for scoped enum ܧঝ ڞ༻ମ variadic templates

ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++14ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর εϨουϩʔΧϧ ετϨʔδ range based for scoped enum generic lambda ໭Γ஋ܕਪ࿦ ू੒ମ ܧঝ ڞ༻ମ nullptr variadic templates

ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++17ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর εϨουϩʔΧϧ ετϨʔδ range based for scoped enum ू੒ମ if constexpr constexpr lambda ߏ଄Խଋറ ৞ΈࠐΈࣜ ඪ४ଐੑ ܧঝ ڞ༻ମ generic lambda ໭Γ஋ܕਪ࿦ nullptr variadic templates

ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++20ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর εϨουϩʔΧϧ ετϨʔδ range based for scoped enum ू੒ମ constexpr lambda ߏ଄Խଋറ ඪ४ଐੑ ࡾํൺֱԋࢉࢠ ࢦࣔ෇͖ ॳظԽ consteval ίϧʔνϯ Ϟδϡʔϧ ίϯηϓτ ܧঝ ڞ༻ମ if constexpr ৞ΈࠐΈࣜ generic lambda ໭Γ஋ܕਪ࿦ nullptr variadic templates

ඪ४ ϥΠϒϥϦ ੍ޚߏจ new/delete ྫ֎ ࣮ߦ࣌ܕ৘ใ C++23ͷ ػೳ GLSLͷ ػೳ ͜ͷ෦෼͚ͩΛ ࢖ͬͯίʔυΛॻ͘ constexpr ϥϜμࣜ auto ӈล஋ࢀর εϨουϩʔΧϧ ετϨʔδ range based for scoped enum ू੒ମ constexpr lambda ඪ४ଐੑ ࡾํൺֱԋࢉࢠ consteval ίϧʔνϯ Ϟδϡʔϧ ίϯηϓτ ଟ࣍ݩ ఴࣈԋࢉࢠ bfloat16 deducing this decay copy if consteval ςϯϓϨʔτ Ҿ਺ਪ࿦ ͭΒ͍ ܧঝ ڞ༻ମ ࢦࣔ෇͖ ॳظԽ if constexpr ৞ΈࠐΈࣜ generic lambda ໭Γ஋ܕਪ࿦ variadic templates nullptr ߏ଄Խଋറ

GPU༻ͷߴڃݴޠ͕΋͏গ͠ ݱ୅ͷC++ʹ͍ۙػೳΛඋ͑ͯཉ͍͠

OpenGL 4.5 DirectX 11 DirectX 12 OpenGL 4.6 OpenCL CUDA
Vulkan GLSLͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ HLSLͰ͍ͩ͘͞ DXILͰ͍ͩ͘͞ OpenCL CͰ͍ͩ͘͞ C++ͷίʔυʹҰॹʹॻ͍͍ͯͩ͘͞ GPUͰ࣮ߦ͢Δίʔυͷ౉͠ํ

Vulkan GLSLͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ HLSLͰ͍ͩ͘͞ DXILͰ͍ͩ͘͞ OpenCL CͰ͍ͩ͘͞ C++ͷίʔυʹҰॹʹॻ͍͍ͯͩ͘͞ GPUͰ࣮ߦ͢Δίʔυͷ౉͠ํ void main() { vec3 normal = normalize( inpu t_normal.xyz ); vec3 pos = input_position. xyz; vec3 N = normal; ߴڃݴޠ a b × + 3 AST ࣮ߦՄೳόΠφϦ γϦΞϥΠζ ࣈ۟ղੳ ߏจղੳ SPIR-V ߴڃݴޠͷASTΛ όΠφϦܗࣜͰγϦΞϥΠζͨ͠෺ ߴڃݴޠΑΓߴ଎ʹύʔεͰ͖Δ

Vulkan GLSLͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ SPIR-VͰ͍ͩ͘͞ HLSLͰ͍ͩ͘͞ DXILͰ͍ͩ͘͞ OpenCL CͰ͍ͩ͘͞ C++ͷίʔυʹҰॹʹॻ͍͍ͯͩ͘͞ GPUͰ࣮ߦ͢Δίʔυͷ౉͠ํ Ͱ΋GLSLͰͳͯ͘΋SPIR-VʹͰ͖ΔߴڃݴޠͳΒ Vulkanʹ৯΂ͤ͞Δࣄ͕Ͱ͖Δ GLSLΛSPIR-Vʹ͢ΔϦϑΝϨϯείϯύΠϥ glslc͕ଘࡏ͢Δ

DXC https://github.com/microsoft/DirectXShaderCompiler HLSLΛDXILʹ͢Δ ίϯύΠϥ HLSLΛSPIR-Vʹ ͢Δࣄ΋Ͱ͖Δ clang-3.7Λforkͯ͠ ࡞ΒΕ͍ͯΔ

https://devblogs.microsoft.com/directx/announcing-hlsl-2021/ HLSL 2021 2021೥ʹϦϦʔε͞Εͨ HLSLͷ৽͍͠όʔδϣϯ DXC 1.8.2405Ҏ߱Ͱ ίϯύΠϧͰ͖Δ

HLSL 2021ͷมߋ఺ • C++ͷΑ͏ͳςϯϓϨʔτؔ਺ͱςϯϓϨʔτΫϥε • C++ͷΑ͏ͳԋࢉࢠΦʔόʔϩʔυ • C++ͷΑ͏ͳϏοτϑΟʔϧυ • Ϣʔβఆٛܕͷ҉໧ͷΩϟετͷৼΔ෣͍ΛC++ʹ߹Θͤͨ
• ࿦ཧԋࢉͷ୹བྷධՁͷৼΔ෣͍ΛC++ʹ߹Θͤͨ • forϧʔϓ಺Ͱએݴ͞Εͨม਺ͷείʔϓΛC++ʹ߹Θͤͨ HLSL͕C++ʹد͖ͤͯͨ

কདྷͷHLSLʹ޲͚ͯఏҊ͞Ε͍ͯΔػೳ • C++ͷΑ͏ͳඪ४ଐੑ • C++ͷΑ͏ͳnumeric_limits • C++ͷΑ͏ͳڞ༻ମ • C++ͷΑ͏ͳॳظԽࢠϦετ •
C++ͷΑ͏ͳࢀরܕ • C++ͷΑ͏ͳconstϝϯόؔ਺ • C++ͷΑ͏ͳඇϝϯόԋࢉࢠΦʔόʔϩʔυ • C++ͷΑ͏ͳautoɺdecltypeɺconstexprɺ scoped enumɺvariadic templatesɺϢʔβ ఆٛϦςϥϧɺϥϜμࣜɺstatic assertɺ range-based for https://github.com/microsoft/hlsl-specs/tree/main/proposals

RWStructuredBuffer<float> a; RWStructuredBuffer<float> b; template<typename T> T add( T x,
T y ) { return x + y; } [numthreads(256, 1, 1)] void main( uint2 id : SV_DispatchThreadID ) { uint index = id.x; a[index] = add(b[index], b[index + 1]); } GPUͷϝϞϦ্ʹ֬อͨ͠ όοϑΝ͕͜͜ʹϦϯΫ͞ΕΔ ςϯϓϨʔτؔ਺ 256x1x1εϨουΛ local workgroupͱ͢Δ T=floatͰςϯϓϨʔτؔ਺Λݺͼग़͠

function(add_shader TARGET SHADER) set(current-shader-path ${CMAKE_CURRENT_SOURCE_DIR}/${SHADER}) set(current-output-path ${CMAKE_CURRENT_BINARY_DIR}/${SHADER}.spv) get_filename_component(current-output-dir ${current-output-path} DIRECTORY)
file(MAKE_DIRECTORY ${current-output-dir}) if( "${SHADER}" MATCHES "\.comp\.hlsl$" ) string( REPLACE ".comp.hlsl.spv" ".comp.spv" current-output-path "${current-output-path}" ) add_custom_command( OUTPUT ${current-output-path} COMMAND ${DXC} -spirv -fspv-target-env=vulkan1.3 ${current-shader-path} -T cs_6_8 -Fo ${current-output-path} DEPENDS ${current-shader-path} IMPLICIT_DEPENDS CXX ${current-shader-path} VERBATIM) elseif( "${SHADER}" MATCHES "\.vert\.hlsl$" ) ϑΝΠϧ໊͕.hlslͰऴΘ͍ͬͯͨΒ glslcͰ͸ͳ͘dxcͰίϯύΠϧ͢Δ ϧʔϧΛCMakeʹ௥Ճ DXCʹVulkan 1.3༻ͷSPIR-VΛ ग़ྗ͢ΔΑ͏ཁٻ

$ spirv-reflect ./src/example/hlsl/test.comp.spv generator : Google spiregg source lang :
HLSL source lang ver : 680 source file : entry point : main (stage=CS) local size : (256, 1, 1) Input variables: 1 0: spirv id : 2 location : (built-in) GlobalInvocationId type : uint3 semantic : name : qualifier : Descriptor bindings: 2 Binding 0.0 spirv id : 3 set : 0 ੜ੒͞ΕͨSPIR-VͷϦϑϨΫγϣϯ৘ใΛμϯϓ

Descriptor bindings: 2 Binding 0.0 spirv id : 3 set
: 0 binding : 0 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : a (type.RWStructuredBuffer.float) // size = 0, padded size = 0 struct type.RWStructuredBuffer.float { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 } a; Binding 0.1 spirv id : 4 set : 0 binding : 1 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : b (type.RWStructuredBuffer.float) // size = 0, padded size = 0 γΣʔμ͸"a"ͱ͍͏໊લͷ֎෦ͷfloatܕͷ഑ྻΛಡΈॻ͖͢Δ ࣮ߦ͢Δࡍʹ͸GPU্ʹϝϞϦΛ֬อͯ͠ "a"ʹ݁ͼ͚ͭΔඞཁ͕͋Δ RWStructuredBuffer<float> a; RWStructuredBuffer<float> b;

} a; Binding 0.1 spirv id : 4 set :
0 binding : 1 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : b (type.RWStructuredBuffer.float) // size = 0, padded size = 0 struct type.RWStructuredBuffer.float { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 } b; γΣʔμ͸"b"ͱ͍͏໊લͷ֎෦ͷfloatܕͷ഑ྻΛಡΈॻ͖͢Δ ࣮ߦ͢Δࡍʹ͸GPU্ʹϝϞϦΛ֬อͯ͠ "b"ʹ݁ͼ͚ͭΔඞཁ͕͋Δ RWStructuredBuffer<float> a; RWStructuredBuffer<float> b;

const auto input_data = res.allocator->create_mappable_buffer( sizeof( float ) * 260,
vk::BufferUsageFlagBits::eStorageBuffer ); const auto output_data = res.allocator->create_mappable_buffer( sizeof( float ) * 260, vk::BufferUsageFlagBits::eStorageBuffer ); gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache( res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) .add_resource( { "a", output_data } ) .add_resource( { "b", input_data } ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; GPUͷϝϞϦʹόοϑΝinput_dataͱoutput_dataΛ֬อ

sizeof( float ) * 260, vk::BufferUsageFlagBits::eStorageBuffer ); const auto output_data
= res.allocator->create_mappable_buffer( sizeof( float ) * 260, vk::BufferUsageFlagBits::eStorageBuffer ); gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache( res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) .add_resource( { "a", output_data } ) .add_resource( { "b", input_data } ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } auto command_buffer = res.queue->get_command_pool()->allocate(); ໊લ͕"a"ͷσεΫϦϓλʹoutput_dataΛ݁ͼ͚ͭΔ ໊લ͕"b"ͷσεΫϦϓλʹinput_dataΛ݁ͼ͚ͭΔ ઌ΄ͲͷHLSL͔Β࡞ͬͨSPIR-VΛ࣮ߦ͢Δҝͷ ίϯϐϡʔτύΠϓϥΠϯΛ࡞Δ

); gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool )
.set_pipeline_cache( res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) .add_resource( { "a", output_data } ) .add_resource( { "b", input_data } ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); comp( rec, 0, 256, 1, 1 ); CPU্ͷεςʔδϯάόοϑΝʹinput_dataʹॻ͘஋Λฒ΂Δ

mapped[ i ] = i; } } auto command_buffer =
res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); } const auto end = std::chrono::high_resolution_clock::now( CPU্ͷεςʔδϯάόοϑΝͷ಺༰ΛGPU্ͷinput_dataʹసૹ ίϯϐϡʔτύΠϓϥΠϯΛ256εϨουͰ࣮ߦ GPU্ͷoutput_dataͷ಺༰ΛCPU্ͷεςʔδϯάόοϑΝʹసૹ

$ ./src/example/hlsl/hlsl 1 3 5 7 ... 507 509 511
0 0 0 0 RWStructuredBuffer<float> a; RWStructuredBuffer<float> b; template<typename T> T add( T x, T y ) { return x + y; } [numthreads(256, 1, 1)] void main( uint2 id : SV_DispatchThreadID ) { uint index = id.x; a[index] = add(b[index], b[index + 1]); } HLSL 2021Λ࢖͏ͱGLSLΑΓগ͚ͩ͠ C++ͱڞ௨ͷίʔυΛॻ͖΍͘͢ͳΔ

https://github.com/google/clspv clspv OpenCL CΛ SPIR-Vʹ ίϯύΠϧ͢Δ ຊՈͷclangʹ࣮૷ͨ͠ OpenCL Cͷύʔαͱ ຊՈͷllvmʹ࣮૷ͨ͠
SPIR-VόοΫΤϯυΛ ૊Έ߹Θͤͯಈ͘

https://www.khronos.org/opencl/assets/CXX_for_OpenCL.html C++ for OpenCL C++ for OpenCL 1.0 OpenCL C
2.0ʹC++03ͷػೳͷ͏ͪ ༰қʹ௥ՃͰ͖Δ෺Λ௥Ճͨ͠෺ C++ for OpenCL 2021 OpenCL C 3.0ʹC++17ͷػೳͷ͏ͪ ༰қʹ௥ՃͰ͖Δ෺Λ௥Ճͨ͠෺

__kernel void test(__global float *a, __global float *b) { const
auto index = get_global_id( 0 ); const auto add = []( auto x, auto y ) __private { return x + y; }; a[index] = add( b[index], b[index + 1] ); } Generic Lambda [ C++14 ]

IMPLICIT_DEPENDS CXX ${current-shader-path} VERBATIM) elseif( "${SHADER}" MATCHES "\.cl$" ) string(
REPLACE ".cl.spv" ".comp.spv" current-output-path "${current-output-path}" ) add_custom_command( OUTPUT ${current-output-path} COMMAND ${CLSPV} --cl-std=CLC++2021 -inline-entry-points -- spv-version=1.6 ${current-shader-path} -o ${current-output-path} DEPENDS ${current-shader-path} IMPLICIT_DEPENDS CXX ${current-shader-path} VERBATIM) else() add_custom_command( OUTPUT ${current-output-path} COMMAND ${GLSLC} -o ${current-output-path} ${current-shader- ϑΝΠϧ໊͕.clͰऴΘ͍ͬͯͨΒ glslcͰ͸ͳ͘clspvͰίϯύΠϧ͢Δ ϧʔϧΛCMakeʹ௥Ճ clspvʹ C++ for OpenCL 2021Λ Vulkan 1.3༻ͷSPIR-Vʹ ͢ΔΑ͏ཁٻ

$ spirv-reflect ./src/example/cl/test.comp.spv generator : ??? source lang : OpenCL_CPP
source lang ver : 202100 source file : entry point : test (stage=CS) local size : (0, 0, 0) Input variables: 1 0: spirv id : 7 location : (built-in) GlobalInvocationId type : uint3 semantic : name : qualifier : Push constant blocks: 1 0: spirv id : 5 ੜ੒͞ΕͨSPIR-VͷϦϑϨΫγϣϯ৘ใΛμϯϓ

Push constant blocks: 1 0: spirv id : 5 name
: <unnamed> // size = 16, padded size = 16 struct <unnamed> { uint3 ; // abs offset = 0, rel offset = 0, size = 12, padded size = 16 } <unnamed>; Descriptor bindings: 2 Binding 0.0 spirv id : 18 set : 0 binding : 0 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : <unnamed> // size = 0, padded size = 0 struct <unnamed> { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 push constantͰූ߸ͳ͠੔਺3ཁૉͷϕΫτϧΛ౉͢ඞཁ͕͋Δ OpenCLͷίʔυଆͰget_global_idΛݺͿͱ ίϯϐϡʔτύΠϓϥΠϯ͔Β౉ͬͯ͘ΔεϨουIDʹpush constantͷ஋Λ ଍ͨ͠஋͕ฦͬͯ͘Δ Ұ౓ʹ࣮ߦͰ͖ͳ͍͘Β͍େྔͷεϨουΛ ෳ਺ճʹ෼ׂ࣮ͯ͠ߦ͢Δͷʹศར

Descriptor bindings: 2 Binding 0.0 spirv id : 18 set
: 0 binding : 0 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : <unnamed> // size = 0, padded size = 0 struct <unnamed> { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 } <unnamed>; Binding 0.1 spirv id : 19 set : 0 binding : 1 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : <unnamed> // size = 0, padded size = 0 __kernel void test(__global float *a, __global float *b) { __kernelͳؔ਺ͷҾ਺͕σεΫϦϓλʹͳΔ 0൪໨ͷҾ਺͕binding=0ʹͳΔ ม਺໊ΛϦϑϨΫγϣϯ৘ใʹೖΕͯ͘Εͳ͍ͷͰ binding൪߸Ͱ݁ͼ͚ͭΔඞཁ͕͋Δ

} <unnamed>; Binding 0.1 spirv id : 19 set :
0 binding : 1 type : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER (UAV) count : 1 accessed : true name : <unnamed> // size = 0, padded size = 0 struct <unnamed> { float ; // abs offset = 0, rel offset = 0, size = 0, padded size = 0 } <unnamed>; __kernel void test(__global float *a, __global float *b) { 1൪໨ͷҾ਺΋ಉ༷

$ ./src/dump_spirv -i src/example/cl/test.comp.spv { "descriptor_binding_count": 2, "descriptor_bindings": [ ...
"spec_constants": [ { "constant_id": 0, "spirv_id": 8 }, { "constant_id": 1, "spirv_id": 9 }, { "constant_id": 2, "spirv_id": 10 } ], "spirv_execution_model": "GLCompute" } ಛघԽఆ਺͕3ؚͭ·Ε͍ͯΔ ಛघԽఆ਺ͱ͸: γΣʔμͷίϯύΠϧ࣌ʹ஋ΛܾΊΒΕΔఆ਺ ίʔυʹఆ਺͕௚ॻ͖͞Ε͍ͯΔͷͱ ಉ౳ͷ࠷దԽ͕ߦΘΕΔ OpenCLͰ͸LocalWorkgroupͷαΠζΛ ίʔυ্ʹॻ͔ͳ͍ͷͰ ಛघԽఆ਺Ͱࢦఆ͢Δ

gct::compute comp( gct::compute_create_info() .set_allocator( res.allocator ) .set_descriptor_pool( res.descriptor_pool ) .set_pipeline_cache(
res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) .add_resource( { 0, output_data } ) .add_resource( { 1, input_data } ) .add_spec( 0, 256 ) .add_spec( 1, 1 ) .add_spec( 2, 1 ) .set_dim( 256, 1, 1 ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); binding=0ͷσεΫϦϓλʹoutput_dataΛ݁ͼ͚ͭΔ binding=1ͷσεΫϦϓλʹinput_dataΛ݁ͼ͚ͭΔ ಛघԽఆ਺ʹ(256,1,1)Ληοτ͢Δ ϦϑϨΫγϣϯ͔ΒLocalWorkgroupͷαΠζ͕औΕͳ͍ͷͰ खಈͰ(256,1,1)Ληοτ͢Δ

} auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now();
{ { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); glm::ivec3 offset( 0, 0, 0 ); rec->pushConstants( **comp.get_pipeline()->get_props().get_layout(), comp.get_pipeline()->get_props().get_layout()- >get_props().get_push_constant_range()[ 0 ].stageFlags, 0u, sizeof( glm::ivec3 ), &offset ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); push constantʹεϨουIDͷΦϑηοτ(0,0,0)Ληοτ͢Δ

auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer()
}, {} ); glm::ivec3 offset( 0, 0, 0 ); rec->pushConstants( **comp.get_pipeline()->get_props().get_layout(), comp.get_pipeline()->get_props().get_layout()- >get_props().get_push_constant_range()[ 0 ].stageFlags, 0u, sizeof( glm::ivec3 ), &offset ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); } const auto end = std::chrono::high_resolution_clock::now(); ࣮ߦ

$ ./src/example/cl/cl 1 3 5 7 ... 507 509 511
0 0 0 0 __kernel void test(__global float *a, __global float *b) { const auto index = get_global_id( 0 ); const auto add = []( auto x, auto y ) __private { return x + y; }; a[index] = add( b[index], b[index + 1] ); } C++ for OpenCL 2021Λ࢖͏ͱ HLSL 2021ΑΓଟ͘ͷC++ͷػೳ͕࢖͑Δ ͨͩ͠ίϯϐϡʔτύΠϓϥΠϯઐ༻

https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VK_NV_cuda_kernel_launch.html VK_NV_cuda_kernel_launch ͜ͷ֦ு͕αϙʔτ͞Ε͍ͯΔ৔߹ nvcc -ptx Ͱ࡞ͬͨPTX͔Β VkCudaModuleNVΛ࡞Γ Vulkan͔Β CUDAͷ__global__ͳؔ਺Λ ࣮ߦͰ͖Δ
NVIDIA͔͠αϙʔτ͍ͯ͠ͳ͍

__global__ void test( float *a, float *b ) { const
int index = blockIdx.x * blockDim.x + threadIdx.x; const auto add = []( auto x, auto y ) { return x + y; }; a[index] = add( b[index], b[index + 1] ); } __kernel void test(__global float *a, __global float *b) { const auto index = get_global_id( 0 ); const auto add = []( auto x, auto y ) __private { return x + y; }; a[index] = add( b[index], b[index + 1] ); } CUDA ੍ݶ෇͖C++17 C++ for OpenCL 2021 ੍ݶ෇͖C++17

ಉ͡C++17ͷαϒηοτͰ΋੍ݶ͕ҧ͏ CUDA C++ for OpenCL 2021 ྫ֎ Ծ૝ؔ਺ ؔ਺ ϙΠϯλ
σʔλ ϙΠϯλ ࢀর =࢖͑Δ

͜ͷϢχοτ1ݸʹ͖ͭϓϩάϥϜΧ΢ϯλ͕1ݸ

͜ͷϢχοτ1ݸʹ͖ͭϓϩάϥϜΧ΢ϯλ͕1ݸ __kernel void test(__global float *a, __global float *b) {
const auto index = get_global_id( 0 ); if( index % 2 == 1 ) { a[ index ] = b[ index ]; } else { a[ index ] = 0.0f; } } ෼ذͨ͘͠ͳͬͨΒͲ͏͢Δ? μΠόʔδΣϯε

__kernel void test(__global float *a, __global float *b) { const
auto index = get_global_id( 0 ); if( index % 2 == 1 ) { a[ index ] = b[ index ]; } else { a[ index ] = 0.0f; } } ⋯ index ⋯ index % 2 a[ index ] = b[ index ] B<> B<> B<> ⋯ a[ index ] = 0.0f ⋯ B<> B<> B<> ෼ذ৚݅ͰϚεΫΛ࡞Δ ྆ํͷϒϩοΫΛ࣮ߦ͢Δ ϚεΫʹج͍ͮͯͲͪΒͷ݁ՌΛ࠾༻͢Δ͔ܾΊΔ a[ index ]

ؒ઀෼ذͩͬͨΒ? __device__ float func1() { return 3.0f; } __device__ float
func2() { return 5.0f; } __global__ void find( float *input_buffer, float *output_buffer ) { const int index = blockIdx.x * blockDim.x + threadIdx.x; const auto func = ( index % 2 ) ? &func1 : func2; output_buffer[ index ] = func(); }

__device__ float func1() { return 3.0f; } __device__ float func2()
{ return 5.0f; } __global__ void find( float *input_buffer, float *output_buffer ) { const int index = blockIdx.x * blockDim.x + threadIdx.x; const auto func = ( index % 2 ) ? &func1 : func2; output_buffer[ index ] = func(); } GVOD GVOD GVOD GVOD GVOD GVOD GVOD ⋯ func = ඈͼઌͷΞυϨε͔ΒϚεΫΛ࡞Γɺશͯͷؔ਺ΛݺͿ func1()ΛݺͿ func2()ΛݺͿ ⋯ ⋯ ⋯ ϚεΫͰ݁ՌΛબͿ

https://patents.google.com/patent/US7761697 ถࠃಛڐ 7761697B1 SIMDͰؒ઀෼ذ͢Δํ๏ ಛڐݖऀ NVIDIA 2026೥11݄21೔·Ͱ

Ծ૝ؔ਺͸vtableͷΞυϨεΛ࢖ͬͨ ؒ઀෼ذͳͷͰαϙʔτͰ͖ͳ͍

https://shady-gang.github.io/vcc/ Vulkan Clang Compiler C++ΛSPIR-Vʹ ίϯύΠϧ͢ΔίϯύΠϥ ྫ֎ͱ࣮ߦ࣌ܕ৘ใΛআ͘ શͯͷݴޠػೳΛαϙʔτ gotoΛؚΉ ͋ΒΏΔίʔυϑϩʔʹରԠ

#include <shady.h> using namespace vcc; extern "C" compute_shader local_size(256, 1,
1) void main( float *a, float *b ) { a[ gl_GlobalInvocationID.x ] = b[ gl_GlobalInvocationID.x ] + b[ gl_GlobalInvocationID.x + 1 ]; } #define vertex_shader __attribute__((annotate("shady::entry_point::Vertex"))) #define fragment_shader __attribute__((annotate("shady::entry_point::Fragment"))) #define compute_shader __attribute__((annotate("shady::entry_point::Compute"))) #define location(i) __attribute__((annotate("shady::location::"#i))) #define descriptor_set(i) __attribute__((annotate("shady::descriptor_set::"#i))) #define descriptor_binding(i) __attribute__((annotate("shady::descriptor_binding::"#i))) #define local_size(x, y, z) __attribute__((annotate("shady::workgroup_size::"#x"::"#y"::"#z))) ϚΫϩ͕ల։͞Εͯ __attribute__ʹͳΓ ίϯύΠϥʹSPIR-VΛుͨ͘Ίʹ ඞཁͳ৘ใ͕ૹΒΕΔ

$ spirv-reflect test.comp.spv generator : ??? source lang : Unknown
source lang ver : 0 source file : entry point : main (stage=CS) local size : (256, 1, 1) Input variables: 1 0: spirv id : 25 location : (built-in) GlobalInvocationId type : uint3 semantic : name : _ZN3vcc21gl_GlobalInvocationIDE qualifier : Push constant blocks: 1 ੜ੒͞ΕͨSPIR-VͷϦϑϨΫγϣϯ৘ใΛμϯϓ

Push constant blocks: 1 0: spirv id : 5 name
: __main_args // size = 16, padded size = 16 struct <unnamed> { uint ; // abs offset = 0, rel offset = 0, size = 8, padded size = 8 uint ; // abs offset = 8, rel offset = 8, size = 8, padded size = 8 } __main_args; mainؔ਺ͷҾ਺͕push constantʹͳ͍ͬͯΔ 64bitͷϙΠϯλ2ͭͳͷͰ όοϑΝͷGPUଆ͔ΒݟͨΞυϨεΛͱͬͯ push constantʹಥͬࠐΊ͹ྑ͍

res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } const auto input_data_addr = input_data->get_buffer()->get_address(); const auto output_data_addr = output_data->get_buffer()->get_address(); auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); σεΫϦϓλͳ͠ͰίϯϐϡʔτύΠϓϥΠϯΛ࡞Δ

res.pipeline_cache ) .set_shader( CMAKE_CURRENT_BINARY_DIR "/test.comp.spv" ) ); { auto mapped = input_data->map< float >(); for( unsigned int i = 0u; i != 260; ++i ) { mapped[ i ] = i; } } const auto input_data_addr = input_data->get_buffer()->get_address(); const auto output_data_addr = output_data->get_buffer()->get_address(); auto command_buffer = res.queue->get_command_pool()->allocate(); const auto begin = std::chrono::high_resolution_clock::now(); { { auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); ೖྗ஋༻ͷόοϑΝͱग़ྗ஋༻ͷόοϑΝͷ GPU͔Βݟͨઌ಄ΞυϨεΛऔಘ͢Δ

{ auto rec = command_buffer->begin(); rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( {
input_data->get_buffer() }, {} ); std::array< vk::DeviceAddress, 2u > addrs{ *output_data_addr, *input_data_addr }; rec->pushConstants( **comp.get_pipeline()->get_props().get_layout(), comp.get_pipeline()->get_props().get_layout()- >get_props().get_push_constant_range()[ 0 ].stageFlags, 0u, sizeof( vk::DeviceAddress ) * addrs.size(), addrs.data() ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); } push constantʹόοϑΝͷΞυϨεΛॻ͘

rec.sync_to_device( input_data ); rec.transfer_to_compute_barrier( { input_data->get_buffer() }, {} ); std::array<
vk::DeviceAddress, 2u > addrs{ *output_data_addr, *input_data_addr }; rec->pushConstants( **comp.get_pipeline()->get_props().get_layout(), comp.get_pipeline()->get_props().get_layout()- >get_props().get_push_constant_range()[ 0 ].stageFlags, 0u, sizeof( vk::DeviceAddress ) * addrs.size(), addrs.data() ); comp( rec, 0, 256, 1, 1 ); rec.sync_to_host( output_data ); } command_buffer->execute_and_wait(); } const auto end = std::chrono::high_resolution_clock::now(); ࣮ߦ

$ ./src/example/vcc/vcc 1 3 5 7 ... 507 509 511
0 0 0 0 #include <shady.h> using namespace vcc; extern "C" compute_shader local_size(256, 1, 1) void main( float *a, float *b ) { a[ gl_GlobalInvocationID.x ] = b[ gl_GlobalInvocationID.x ] + b[ gl_GlobalInvocationID.x + 1 ]; }

vcc͸C++ΛLLVM IRʹͨ͠ޙCPSม׵Λߦ͏ int f( int a ) { a +=
1; if( a == 3 ) { a = 0; } else { a = g( a ); } return a; } f0 f1 f2 f3 ؔ਺Λbasic blockຖʹผͷؔ਺ʹ͢Δ ֤ؔ਺͸࣍ʹ࣮ߦ͍ͨؔ͠਺Λ ΩϡʔʹੵΉ ϩʔΧϧม਺ ͸ڞ༗ϝϞϦͰ࣍ͷؔ਺ʹ౉͞ΕΔ ͸ ͱ ΛΩϡʔʹੵΉ Ωϡʔʹੵ·Εͨؔ਺͸શ෦࣮ߦ͢Δ f0 f1 f2 ͱ ͸ڞʹ ΛΩϡʔʹੵΉͷͰ ͜͜Ͱ྆ऀ͔Βདྷͨม਺Λ౷߹͢Δ f1 f2 f3

vcc͸C++ΛLLVM IRʹͨ͠ޙCPSม׵Λߦ͏ int f( int a ) { a +=
1; if( a == 3 ) { a = 0; } else { a = g( a ); } return a; } f0 f1 f2 f3 ؔ਺Λbasic blockຖʹผͷؔ਺ʹ͢Δ ֤ؔ਺͸࣍ʹ࣮ߦ͍ͨؔ͠਺Λ ΩϡʔʹੵΉ ϩʔΧϧม਺ ͸ڞ༗ϝϞϦͰ࣍ͷؔ਺ʹ౉͞ΕΔ ͸ ͱ ΛΩϡʔʹੵΉ Ωϡʔʹੵ·Εͨؔ਺͸શ෦࣮ߦ͢Δ f0 f1 f2 ͱ ͸ڞʹ ΛΩϡʔʹੵΉͷͰ ͜͜Ͱ྆ऀ͔Βདྷͨม਺Λ݁߹͢Δ f1 f2 f3 ར఺1: NVIDIAͷಛڐΛճආͯؒ͠઀෼ذ͕Ͱ͖Δ ར఺2: ͲΜͳʹܹ͘͠μΠόʔδΣϯε͕ੜͯ͡΋ ੑೳΛ٘ਜ਼ʹ࣮ͯ͠ߦͰ͖Δ

#include <shady.h> using namespace vcc; extern "C" compute_shader local_size(256, 1,
1) void main( float *a, float *b ) { const auto index = gl_GlobalInvocationID.x; const auto add = []< typename T, typename U >( T x, U y ) { return x + y; }; a[index] = add( b[index], b[index + 1] ); } Familiar template syntax for generic lambdas [ C++20 ] C++ΛLLVM IRʹ͢Δॴ·Ͱ͸ͨͩͷclangͳͷͰ ͜ΜͳίʔυΛॻ͍ͯ΋ίϯύΠϧ͸௨Δ

$ ./src/example/vcc/vcc terminate called after throwing an instance of 'vk::DeviceLostError'
what(): wait_for_executed failed.: ErrorDeviceLost தࢭ (ίΞμϯϓ) #include <shady.h> using namespace vcc; extern "C" compute_shader local_size(256, 1, 1) void main( float *a, float *b ) { const auto index = gl_GlobalInvocationID.x; const auto add = []< typename T, typename U >( T x, U y ) { return x + y; }; a[index] = add( b[index], b[index + 1] ); } ͳ͔ͥ ແݶϧʔϓʹ ؕΔ

#include <shady.h> using namespace vcc; descriptor_set(0) descriptor_binding(1) uniform_constant extern float
a[]; descriptor_set(0) descriptor_binding(2) uniform_constant extern float b[]; extern "C" compute_shader local_size(256, 1, 1) void main() { a[ gl_GlobalInvocationID.x ] = b[ gl_GlobalInvocationID.x ] + b[ gl_GlobalInvocationID.x + 1 ]; } σεΫϦϓλΛ࢖͏ͱ

$ spirv-reflect test.comp.spv Segmentation fault (ίΞμϯϓ) ܕ৘ใ͕յΕ͍ͯͯಡΊͳ͍ ϦϑϨΫγϣϯ͕ੜ੒͞ΕΔ ख๏͸໘ന͍͕ ݱঢ়ͷ࣮૷͸͙͢ʹ࢖͑Δ඼࣭Ͱ͸ͳ͍

·ͱΊ CPUͱGPUͰಉ͡ίʔυΛ࣮ߦ͍ͨ࣌͠GPUଆͰ࢖͏ݴޠ GLSL HLSL 2021 C++ for OpenCL 2021 CUDA
C++ GLSLΑΓϚγ άϥϑΟΫεύΠϓϥΠϯ΋OK ϕϯμʔඇґଘͰίϯϐϡʔτύΠϓϥΠϯ͚͕ͩ ಈ͚͹ྑ͍ͳΒ͜Ε NVIDIAͷGPUͰίϯϐϡʔτύΠϓϥΠϯ͚͕ͩ ಈ͚͹ྑ͍ͳΒ͜Ε ͭΒ͍ ਓྨͷເɺάϥϑΟΫεύΠϓϥΠϯ΋OK ͔͠͠ݱঢ়ίϯύΠϥͷಈ͖͕ո͍͠

https://fadis.booth.pm/ ٕज़ॻయͰຊग़ͯ͠·͢ /&8

C++でシェーダを書く

C++でシェーダを書く

More Decks by Fadis

Other Decks in Programming

Featured

Transcript