CrossPlatformSettings_piece_all.glsl"

// Work-group scratch: one (min, max) pair per invocation of the 4x4x4 work group.
// main() indexes it as (local.z << 4) + (local.y << 2) + local.x.
shared float2 g_minMaxValues[4u * 4u * 4u];
// Work-group scratch: one pair of 32-bit index masks per 4x4 pixel block, indexed as
// (local.z << 2) + local.y. The 4 threads of a block OR their partial masks into it.
shared uint2 g_mask[4u * 4u];

// Packed shader parameters; see the p_* aliases below.
layout( location = 0 ) uniform uint2 params;

// Source channel to compress: 0 reads .x, 1 reads .y, any other value reads .w
#define p_channelIdx params.x
// Non-zero: endpoints are remapped with *2-1 and packed via packSnorm4x8.
// Zero: endpoints are packed via packUnorm4x8.
#define p_useSNorm params.y

// Input texture, fetched per pixel through OGRE_Load2D
// (the trailing 0 argument is presumably the mip level -- confirm against the macro)
uniform sampler2D srcTex;

// Output image: main() stores one rgba16ui texel (4 x 16 bits) per 4x4 source block
layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

// x: thread index inside a block (each thread handles one row of 4 pixels)
// y, z: which block of the work group; gl_GlobalInvocationID.yz is the output texel coordinate
layout( local_size_x = 4,  //
		local_size_y = 4,  //
		local_size_z = 4 ) in;

/// Compresses one channel of srcTex into DXT5-alpha-style blocks (2 endpoints + 16 3-bit indices).
///
/// Work layout:
///  - A block covers 4x4 = 16 pixels and produces one rgba16ui texel in dstTexture
///  - Each thread works on one row of 4 pixels, therefore each block needs 4 threads
///  - Every thread builds 2 partial index masks (8 per block); these are OR-ed together in
///    shared memory into the final 2, and the first thread of the block writes the output
///
/// **Q: Why 4 pixels per thread? Why not 1, 2 or 16?**
///
/// A: It is the sweet spot.
///  - Very short threads cannot fill expensive GPUs with enough work (dispatch bound)
///  - Lots of threads means lots of synchronization overhead (e.g. evaluating min/max,
///    merging masks) and also more LDS usage, which reduces occupancy
///  - Long threads (e.g. 1 thread per block) miss parallelism opportunities
void main()
{
	// x selects the row inside the 4x4 block; y/z select the block itself
	const uint rowInBlock = gl_LocalInvocationID.x;
	const uint2 blockOrigin = gl_GlobalInvocationID.yz << 2u;

	// Fetch this thread's row, keep only the requested channel and scale it by 255
	float4 texels;
	for( uint col = 0u; col < 4u; ++col )
	{
		const float4 rgba = OGRE_Load2D( srcTex, int2( blockOrigin + uint2( col, rowInBlock ) ), 0 );

		float chosen;
		if( p_channelIdx == 0u )
			chosen = rgba.x;
		else if( p_channelIdx == 1u )
			chosen = rgba.y;
		else
			chosen = rgba.w;

		texels[col] = chosen * 255.0f;
	}

	// Extremes of this thread's own row
	float lowest = min( min3( texels.x, texels.y, texels.z ), texels.w );
	float highest = max( max3( texels.x, texels.y, texels.z ), texels.w );

	// Shared-memory slots: 4 consecutive min/max entries and a single mask entry per block
	const uint ldsMinMaxBase = ( gl_LocalInvocationID.z << 4u ) + ( gl_LocalInvocationID.y << 2u );
	const uint ldsMaskSlot = ( gl_LocalInvocationID.z << 2u ) + gl_LocalInvocationID.y;

	g_minMaxValues[ldsMinMaxBase + rowInBlock] = float2( lowest, highest );
	// All 4 threads of the block clear the same slot with the same value
	g_mask[ldsMaskSlot] = uint2( 0u, 0u );

	__sharedOnlyBarrier;

	// Every thread of the block folds the 4 per-row results into the block-wide min/max
	for( uint row = 0u; row < 4u; ++row )
	{
		const float2 rowRange = g_minMaxValues[ldsMinMaxBase + row];
		lowest = min( rowRange.x, lowest );
		highest = max( rowRange.y, highest );
	}

	// Bias and thresholds for the index search. Given the choice of max/min endpoints these
	// indices are optimal:
	// http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
	const float span = highest - lowest;
	const float span2 = span * 2.0f;
	const float span4 = span * 4.0f;

	float bias;
	if( span < 8.0f )
		bias = span - 1.0f;
	else
		bias = trunc( span * 0.5f ) + 2.0f;
	bias -= lowest * 7.0f;

	// .x holds bits 0..31 and .y bits 32..63 of the block; indices start at bit 16
	uint2 partialMask = uint2( 0u, 0u );

	for( uint col = 0u; col < 4u; ++col )
	{
		float scaled = texels[col] * 7.0f + bias;

		// Binary search for the "linear scale" level: 0 (value == min) up to 7 (value == max)
		int level = 0;
		if( scaled >= span4 )
		{
			level = 4;
			scaled -= span4;
		}
		if( scaled >= span2 )
		{
			level += 2;
			scaled -= span2;
		}
		if( scaled >= span )
			level += 1;

		// Convert the linear level into the DXT index (indices 0 and 1 are the two endpoints)
		int dxtIndex = ( -level ) & 7;
		if( dxtIndex < 2 )
			dxtIndex ^= 1;
		const uint code = uint( dxtIndex );

		// 3 bits per pixel, pixel number = row * 4 + col
		const uint bitPos = 16u + ( ( rowInBlock << 2u ) + col ) * 3u;
		if( bitPos >= 32u )
		{
			partialMask.y |= code << ( bitPos - 32u );
		}
		else
		{
			partialMask.x |= code << bitPos;
			// The index straddles the 32-bit boundary: spill its upper bits into the high word
			if( bitPos > 29u )
				partialMask.y |= code >> ( 32u - bitPos );
		}
	}

	// Merge this row's bits into the block's shared mask; skip the atomic when nothing is set
	if( partialMask.x != 0u )
		atomicOr( g_mask[ldsMaskSlot].x, partialMask.x );
	if( partialMask.y != 0u )
		atomicOr( g_mask[ldsMaskSlot].y, partialMask.y );

	__sharedOnlyBarrier;

	// Only the first thread of each block writes the result
	if( rowInBlock != 0u )
		return;

	const uint2 blockMask = g_mask[ldsMaskSlot];

	// Endpoints go into the first two bytes: max first, then min
	const float2 endpoints = float2( highest, lowest ) * ( 1.0f / 255.0f );

	uint4 packedBlock;
	if( p_useSNorm != 0u )
		packedBlock.x = packSnorm4x8( float4( endpoints * 2.0f - 1.0f, 0.0f, 0.0f ) );
	else
		packedBlock.x = packUnorm4x8( float4( endpoints, 0.0f, 0.0f ) );

	// The 48 index bits are split into three 16-bit channels
	packedBlock.y = blockMask.x >> 16u;
	packedBlock.z = blockMask.y & 0xFFFFu;
	packedBlock.w = blockMask.y >> 16u;

	imageStore( dstTexture, int2( gl_GlobalInvocationID.yz ), packedBlock );
}