HistogramCompute.compute
3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#include "UnityCG.cginc"
RWStructuredBuffer<uint4> _Histogram;
Texture2D<float4> _Source;
CBUFFER_START (Params)
uint _IsLinear;
float4 _Res;
uint4 _Channels;
CBUFFER_END
groupshared uint4 gs_histogram[256];
#define GROUP_SIZE 16
#pragma kernel KHistogramGather
[numthreads(GROUP_SIZE, GROUP_SIZE,1)]
void KHistogramGather(uint2 dispatchThreadId : SV_DispatchThreadID, uint2 groupThreadId : SV_GroupThreadID)
{
const uint localThreadId = groupThreadId.y * GROUP_SIZE + groupThreadId.x;
if (localThreadId < 256)
gs_histogram[localThreadId] = uint4(0, 0, 0, 0);
GroupMemoryBarrierWithGroupSync();
if (dispatchThreadId.x < (uint)_Res.x && dispatchThreadId.y < (uint)_Res.y)
{
// We want a gamma histogram (like Photoshop & all)
float3 color = saturate(_Source[dispatchThreadId].xyz);
if (_IsLinear > 0)
color = LinearToGammaSpace(color);
// Convert color & luminance to histogram bin
uint3 idx_c = (uint3)(round(color * 255.0));
uint idx_l = (uint)(round(dot(color.rgb, float3(0.2125, 0.7154, 0.0721)) * 255.0));
// Fill the group shared histogram
if (_Channels.x > 0u) InterlockedAdd(gs_histogram[idx_c.x].x, 1); // Red
if (_Channels.y > 0u) InterlockedAdd(gs_histogram[idx_c.y].y, 1); // Green
if (_Channels.z > 0u) InterlockedAdd(gs_histogram[idx_c.z].z, 1); // Blue
if (_Channels.w > 0u) InterlockedAdd(gs_histogram[idx_l].w, 1); // Luminance
}
GroupMemoryBarrierWithGroupSync();
// Merge
if (localThreadId < 256)
{
uint4 h = gs_histogram[localThreadId];
if (_Channels.x > 0u && h.x > 0) InterlockedAdd(_Histogram[localThreadId].x, h.x); // Red
if (_Channels.y > 0u && h.y > 0) InterlockedAdd(_Histogram[localThreadId].y, h.y); // Green
if (_Channels.z > 0u && h.z > 0) InterlockedAdd(_Histogram[localThreadId].z, h.z); // Blue
if (_Channels.w > 0u && h.w > 0) InterlockedAdd(_Histogram[localThreadId].w, h.w); // Luminance
}
}
// Scaling pass
groupshared uint4 gs_pyramid[256];
#pragma kernel KHistogramScale
[numthreads(16,16,1)]
void KHistogramScale(uint2 groupThreadId : SV_GroupThreadID)
{
const uint localThreadId = groupThreadId.y * 16 + groupThreadId.x;
gs_pyramid[localThreadId] = _Histogram[localThreadId];
GroupMemoryBarrierWithGroupSync();
// Parallel reduction to find the max value
UNITY_UNROLL
for(uint i = 256 >> 1; i > 0; i >>= 1)
{
if(localThreadId < i)
gs_pyramid[localThreadId] = max(gs_pyramid[localThreadId], gs_pyramid[localThreadId + i]);
GroupMemoryBarrierWithGroupSync();
}
// Actual scaling
float4 factor = _Res.y / (float4)gs_pyramid[0];
_Histogram[localThreadId] = (uint4)round(_Histogram[localThreadId] * factor);
}
#pragma kernel KHistogramClear
[numthreads(GROUP_SIZE, GROUP_SIZE, 1)]
void KHistogramClear(uint2 dispatchThreadId : SV_DispatchThreadID)
{
if (dispatchThreadId.x < (uint)_Res.x && dispatchThreadId.y < (uint)_Res.y)
_Histogram[dispatchThreadId.y * _Res.x + dispatchThreadId.x] = uint4(0u, 0u, 0u, 0u);
}