// HistogramCompute.compute (3.1 KB)
#include "UnityCG.cginc"

// Histogram storage: one uint4 per bin, with x/y/z/w holding the red, green,
// blue and luminance counts. The gather and scale kernels address bins 0..255.
RWStructuredBuffer<uint4> _Histogram;
// Image the histogram is computed from; read per dispatch thread in KHistogramGather.
Texture2D<float4> _Source;

CBUFFER_START (Params)
    // When > 0, the gather kernel converts the fetched color with
    // LinearToGammaSpace before binning.
    uint _IsLinear;
    // xy bound the dispatch thread ids that are processed (gather and clear);
    // y is also the value the tallest bin is scaled to in KHistogramScale.
    float4 _Res;
    // Per-channel enable flags (> 0 means enabled): x = red, y = green,
    // z = blue, w = luminance.
    uint4 _Channels;
CBUFFER_END

// Per-thread-group partial histogram, merged into _Histogram at the end of
// KHistogramGather.
groupshared uint4 gs_histogram[256];

// Edge length of a thread group; GROUP_SIZE * GROUP_SIZE = 256 threads, which
// matches the 256 entries of gs_histogram.
#define GROUP_SIZE 16

// Gather pass: every thread bins one texel of _Source into a group-shared
// histogram, then the group adds its partial counts to _Histogram.
//   dispatchThreadId - texel coordinate handled by this thread.
//   groupThreadId    - thread position inside its group, used to pick the
//                      bin this thread clears and merges.
#pragma kernel KHistogramGather
[numthreads(GROUP_SIZE, GROUP_SIZE,1)]
void KHistogramGather(uint2 dispatchThreadId : SV_DispatchThreadID, uint2 groupThreadId : SV_GroupThreadID)
{
    // Flatten the 2D in-group position; each thread is responsible for one bin.
    const uint bin = groupThreadId.x + groupThreadId.y * GROUP_SIZE;
    const bool ownsBin = bin < 256;

    // Reset the group-local histogram
    if (ownsBin)
    {
        gs_histogram[bin] = uint4(0, 0, 0, 0);
    }

    // All bins must be zeroed before any thread starts counting.
    GroupMemoryBarrierWithGroupSync();

    // Threads that fall outside the source area contribute nothing, but they
    // still have to reach the barriers below.
    const bool insideSource = dispatchThreadId.x < (uint)_Res.x && dispatchThreadId.y < (uint)_Res.y;

    if (insideSource)
    {
        // The histogram is built in gamma space (like Photoshop & all)
        float3 gammaColor = saturate(_Source[dispatchThreadId].xyz);

        if (_IsLinear > 0)
        {
            gammaColor = LinearToGammaSpace(gammaColor);
        }

        // Map color & luminance to bin indices
        const float luminance = dot(gammaColor.rgb, float3(0.2125, 0.7154, 0.0721));
        const uint3 colorBins = (uint3)(round(gammaColor * 255.0));
        const uint lumaBin = (uint)(round(luminance * 255.0));

        // Count into the group-shared histogram, enabled channels only
        if (_Channels.x > 0u) { InterlockedAdd(gs_histogram[colorBins.x].x, 1); } // Red
        if (_Channels.y > 0u) { InterlockedAdd(gs_histogram[colorBins.y].y, 1); } // Green
        if (_Channels.z > 0u) { InterlockedAdd(gs_histogram[colorBins.z].z, 1); } // Blue
        if (_Channels.w > 0u) { InterlockedAdd(gs_histogram[lumaBin].w, 1); }     // Luminance
    }

    // Wait until the whole group has finished counting.
    GroupMemoryBarrierWithGroupSync();

    // Merge this group's counts into the output buffer; empty bins are skipped
    // so no atomic is issued for them.
    if (ownsBin)
    {
        const uint4 partialCount = gs_histogram[bin];

        if (_Channels.x > 0u && partialCount.x > 0) { InterlockedAdd(_Histogram[bin].x, partialCount.x); } // Red
        if (_Channels.y > 0u && partialCount.y > 0) { InterlockedAdd(_Histogram[bin].y, partialCount.y); } // Green
        if (_Channels.z > 0u && partialCount.z > 0) { InterlockedAdd(_Histogram[bin].z, partialCount.z); } // Blue
        if (_Channels.w > 0u && partialCount.w > 0) { InterlockedAdd(_Histogram[bin].w, partialCount.w); } // Luminance
    }
}

// Scaling pass
// Scratch copy of the histogram used by the max-reduction below, one slot per bin.
groupshared uint4 gs_pyramid[256];

// Rescales every channel of _Histogram so that its tallest bin equals _Res.y.
//   groupThreadId - thread position inside the group; flattened to the bin
//                   index this thread reads and rewrites.
// NOTE(review): only the in-group thread id is used, so this looks like it is
// meant to be dispatched as a single 16x16 group - confirm against the caller.
#pragma kernel KHistogramScale
[numthreads(16,16,1)]
void KHistogramScale(uint2 groupThreadId : SV_GroupThreadID)
{
    const uint localThreadId = groupThreadId.y * 16 + groupThreadId.x;
    gs_pyramid[localThreadId] = _Histogram[localThreadId];

    // Every slot must be populated before the reduction reads its neighbours.
    GroupMemoryBarrierWithGroupSync();

    // Parallel reduction to find the max value: each step folds the upper half
    // of the active range into the lower half, leaving the per-channel maximum
    // in gs_pyramid[0].
    UNITY_UNROLL
    for(uint i = 256 >> 1; i > 0; i >>= 1)
    {
        if(localThreadId < i)
            gs_pyramid[localThreadId] = max(gs_pyramid[localThreadId], gs_pyramid[localThreadId + i]);

        GroupMemoryBarrierWithGroupSync();
    }

    // Actual scaling
    // A channel that is disabled or received no samples has a maximum of 0.
    // Dividing by it would give an infinite factor and 0 * inf = NaN, whose
    // conversion to uint is not something to rely on across graphics APIs.
    // Clamping the divisor to 1 keeps such channels at exactly 0 (all of their
    // bins are 0 anyway) and leaves non-empty channels untouched.
    const uint4 peak = max(gs_pyramid[0], uint4(1u, 1u, 1u, 1u));
    float4 factor = _Res.y / (float4)peak;
    _Histogram[localThreadId] = (uint4)round(_Histogram[localThreadId] * factor);
}

// Clear pass: zeroes the _Histogram entry addressed by this thread's
// row-major position inside the _Res.xy area.
//   dispatchThreadId - global thread coordinate.
#pragma kernel KHistogramClear
[numthreads(GROUP_SIZE, GROUP_SIZE, 1)]
void KHistogramClear(uint2 dispatchThreadId : SV_DispatchThreadID)
{
    // Threads outside the _Res.xy area have nothing to clear.
    const bool outsideX = dispatchThreadId.x >= (uint)_Res.x;
    const bool outsideY = dispatchThreadId.y >= (uint)_Res.y;
    if (outsideX || outsideY)
    {
        return;
    }

    // _Res is a float4, so the flat index is evaluated in floating point and
    // truncated when it is used to address the buffer.
    const uint slot = (uint)(dispatchThreadId.y * _Res.x + dispatchThreadId.x);
    _Histogram[slot] = uint4(0u, 0u, 0u, 0u);
}