rows_per_invocation;
// NOTE(review): the statement ending on the line above begins outside this
// excerpt. first_row, rows_per_invocation, columns_per_invocation,
// subgroup_sums and gid are all declared elsewhere; the $...$ tokens are
// presumably template placeholders expanded by the host-side code generator
// -- confirm.

// First column handled by this invocation, derived from its global x index.
const uint first_col = gl_GlobalInvocationID.x*columns_per_invocation;

// Exclusive end of this invocation's row/column ranges, clamped to
// $input_data_0_h$ / $input_data_0_w$ (presumably the input height and
// width). If first_row or first_col already lies at or beyond the clamp
// value, the matching loop below runs zero times and `value` stays zero.
const uint last_row_exclusive = min(first_row+rows_per_invocation, $input_data_0_h$);
const uint last_column_exclusive = min(first_col+columns_per_invocation, $input_data_0_w$);

// Per-invocation partial sum over rows [first_row, last_row_exclusive) and
// columns [first_col, last_column_exclusive) at index gid.z.
vec4 value = vec4(0);
for (uint h = first_row; h < last_row_exclusive; ++h) {
  for (uint w = first_col; w < last_column_exclusive; ++w) {
    value += $input_data_0[w, h, gid.z]$;
  }
}

// Stage 1: add the partial sums of all invocations in this subgroup, then
// have one elected invocation store the subgroup's total in the slot indexed
// by gl_SubgroupID.
highp vec4 subgroup_sum = subgroupAdd(value);
if(subgroupElect()) {
  subgroup_sums[gl_SubgroupID] = subgroup_sum;
}

// Every subgroup must have stored its total before any invocation reads
// subgroup_sums below.
// NOTE(review): memoryBarrierShared() implies subgroup_sums is a `shared`
// array, but its declaration is not visible here -- confirm.
memoryBarrierShared();
barrier();

// Stage 2: do the final reduction in the first subgroup.
if(gl_SubgroupID == 0) {
  // Each invocation of subgroup 0 picks up at most one stored subgroup total;
  // invocations whose index is not below gl_NumSubgroups contribute zero.
  // NOTE(review): this only covers every slot when gl_NumSubgroups does not
  // exceed the subgroup size; otherwise the excess totals are never read.
  // Confirm the dispatching code guarantees that.
  highp vec4 subtotal = vec4(0);
  if (gl_SubgroupInvocationID < gl_NumSubgroups) {
    subtotal = subgroup_sums[gl_SubgroupInvocationID];
  }
  highp vec4 grand_total = subgroupAdd(subtotal);

  // A single elected invocation divides the grand total by
  // $input_data_0_w$ and then by $input_data_0_h$ and writes the result to
  // output position (0, 0, gid.z).
  if(subgroupElect()) {
    highp vec4 result = grand_total / $input_data_0_w$ / $input_data_0_h$;
    $output_data_0[0, 0, gid.z] = result$;
  }
}