CrossPlatformSettings_piece_all.glsl"

/// Workgroup scratch: one (min, max) pair per thread of the 4x4x4 group.
shared float2 g_minMaxValues[4u * 4u * 4u];
/// Workgroup scratch: the merged index bits of each of the group's 16 blocks, as two 32-bit words.
shared uint2 g_mask[4u * 4u];

layout( location = 0 ) uniform uint2 params;

/// Source channel selector: 0 reads .x, 1 reads .y, any other value reads .w
#define p_channelIdx params.x
/// Non-zero packs the endpoints with packSnorm4x8, zero packs them with packUnorm4x8
#define p_useSNorm params.y

uniform sampler2D srcTex;

layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

layout( local_size_x = 4, local_size_y = 4, local_size_z = 4 ) in;

/// Compresses one channel of srcTex into 4x4 blocks: two 8-bit endpoints (max first, then min)
/// followed by sixteen 3-bit indices, written as a single rgba16ui texel per block.
///
/// Work distribution:
///  - A block is 16 pixels and a thread handles 4 of them (one row), so a block takes 4 threads.
///  - Local x is the row inside the block; y and z select the block. A 4x4x4 workgroup
///    therefore covers 16 blocks.
///  - Every thread builds 2 partial mask words (8 per block). These are OR-ed together in
///    shared memory into 2 words, and the thread with local x == 0 writes the result.
///
/// **Q: Why 4 pixels per thread? Why not 1, 2 or 16?**
///
/// A: It is a sweet spot.
///  - Very short threads cannot fill expensive GPUs with enough work (dispatch bound).
///  - Many threads mean a lot of synchronization overhead (evaluating min/max, merging masks)
///    and more LDS usage, which reduces occupancy.
///  - Long threads (e.g. 1 thread per block) miss parallelism opportunities.
void main()
{
    // Local x picks the row inside the block; the y/z components pick the block itself.
    const uint rowInBlock = gl_LocalInvocationID.x;
    const uint2 blockOrigin = gl_GlobalInvocationID.yz << 2u;

    // Slot of this block in g_mask. Its four threads own the four consecutive
    // g_minMaxValues entries that start at minMaxIdxBase.
    const uint maskIdxBase = gl_LocalInvocationID.z * 4u + gl_LocalInvocationID.y;
    const uint minMaxIdxBase = maskIdxBase * 4u;

    // Fetch this thread's row of 4 texels, keep the selected channel and scale it by 255.
    float4 rowValues;
    for( uint col = 0u; col < 4u; ++col )
    {
        const uint2 texelPos = blockOrigin + uint2( col, rowInBlock );
        const float4 texel = OGRE_Load2D( srcTex, int2( texelPos ), 0 ).xyzw;

        float channel;
        if( p_channelIdx == 0u )
            channel = texel.x;
        else if( p_channelIdx == 1u )
            channel = texel.y;
        else
            channel = texel.w;

        rowValues[col] = channel * 255.0f;
    }

    // Extremes of this row only
    float lo = min( min3( rowValues.x, rowValues.y, rowValues.z ), rowValues.w );
    float hi = max( max3( rowValues.x, rowValues.y, rowValues.z ), rowValues.w );

    g_minMaxValues[minMaxIdxBase + rowInBlock] = float2( lo, hi );
    // Every thread of the block stores the same zero, so the duplicate writes agree.
    g_mask[maskIdxBase] = uint2( 0u, 0u );

    // NOTE(review): macro comes from the included header; presumably a workgroup barrier that
    // also makes the shared-memory writes above visible. Confirm against its definition.
    __sharedOnlyBarrier;

    // Widen the row extremes to block extremes using what all 4 threads of the block published
    for( uint t = 0u; t < 4u; ++t )
    {
        const float2 published = g_minMaxValues[minMaxIdxBase + t];
        lo = min( published.x, lo );
        hi = max( published.y, hi );
    }

    // Determine the bias and emit the indices.
    // Given the choice of hi/lo as endpoints, these indices are optimal:
    // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
    const float range = hi - lo;
    const float range2 = range * 2.0f;
    const float range4 = range * 4.0f;

    float bias;
    if( range < 8.0f )
        bias = range - 1.0f;
    else
        bias = trunc( range * 0.5f ) + 2.0f;
    bias -= lo * 7.0f;

    uint maskLo = 0u;
    uint maskHi = 0u;

    for( uint col = 0u; col < 4u; ++col )
    {
        float scaled = rowValues[col] * 7.0f + bias;

        // Binary search for the "linear scale" step: 0 means value == lo, 7 means value == hi
        int step = 0;
        if( scaled >= range4 )
        {
            step = 4;
            scaled -= range4;
        }
        if( scaled >= range2 )
        {
            step += 2;
            scaled -= range2;
        }
        if( scaled >= range )
            step += 1;

        // Turn the linear scale into a DXT index, where 0 and 1 are the two endpoints:
        // linear 0 -> 1, linear 7 -> 0, and linear k in 1..6 -> 8 - k
        step = ( -step ) & 7;
        if( step < 2 )
            step ^= 1;

        // Append the 3-bit index. Index bits start at bit 16 of the 64-bit mask, and the
        // pixel's position inside the block is rowInBlock * 4 + col.
        const uint code = uint( step );
        const uint bitPos = 16u + ( rowInBlock * 4u + col ) * 3u;
        if( bitPos >= 32u )
        {
            maskHi |= code << ( bitPos - 32u );
        }
        else
        {
            maskLo |= code << bitPos;
            // An index that straddles the 32-bit boundary spills its top bits into the high word
            if( bitPos + 3u > 32u )
                maskHi |= code >> ( 32u - bitPos );
        }
    }

    // Merge this thread's partial words into the block's mask; skip no-op atomics
    if( maskLo != 0u )
        atomicOr( g_mask[maskIdxBase].x, maskLo );
    if( maskHi != 0u )
        atomicOr( g_mask[maskIdxBase].y, maskHi );

    __sharedOnlyBarrier;

    // One thread per block writes the finished block
    if( rowInBlock == 0u )
    {
        // Endpoints rescaled from the 255-based working range
        const float2 endpoints = float2( hi, lo ) * ( 1.0f / 255.0f );

        uint packedEndpoints;
        if( p_useSNorm != 0u )
            packedEndpoints = packSnorm4x8( float4( endpoints * 2.0f - 1.0f, 0.0f, 0.0f ) );
        else
            packedEndpoints = packUnorm4x8( float4( endpoints, 0.0f, 0.0f ) );

        const uint2 mergedMask = g_mask[maskIdxBase];

        // x: both endpoints; y, z, w: the 48 index bits split into three 16-bit pieces
        const uint4 outputBytes = uint4( packedEndpoints,       //
                                         mergedMask.x >> 16u,   //
                                         mergedMask.y & 0xFFFFu,  //
                                         mergedMask.y >> 16u );

        imageStore( dstTexture, int2( gl_GlobalInvocationID.yz ), outputBytes );
    }
}