843 lines
25 KiB
GLSL
843 lines
25 KiB
GLSL
/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
|
|
*
|
|
* Permission is hereby granted, free of charge,
|
|
* to any person obtaining a copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation the rights to
|
|
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
|
|
* and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
|
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
#if defined(FFT_FP16) && defined(GL_ES)
|
|
precision mediump float;
|
|
#endif
|
|
|
|
#define BINDING_SSBO_IN 0
|
|
#define BINDING_SSBO_OUT 1
|
|
#define BINDING_SSBO_AUX 2
|
|
#define BINDING_UBO 3
|
|
#define BINDING_TEXTURE0 4
|
|
#define BINDING_TEXTURE1 5
|
|
#define BINDING_IMAGE 6
|
|
|
|
layout(std140, binding = BINDING_UBO) uniform UBO
|
|
{
|
|
uvec4 p_stride_padding;
|
|
vec4 texture_offset_scale;
|
|
} constant_data;
|
|
#define uStride constant_data.p_stride_padding.y
|
|
|
|
// cfloat is the "generic" type used to hold complex data.
|
|
// GLFFT supports vec2, vec4 and "vec8" for its complex data
|
|
// to be able to work on 1, 2 and 4 complex values in a single vector.
|
|
// FFT_VEC2, FFT_VEC4, FFT_VEC8 defines which type we're using.
|
|
// The shaders are compiled on-demand.
|
|
|
|
// FP16 values are packed as 2xfp16 in a uint.
|
|
// packHalf2x16 and unpackHalf2x16 are used to bitcast between these formats.
|
|
|
|
// The complex number format is (real, imag, real, imag, ...) in an interleaved fashion.
|
|
// For complex-to-real or real-to-complex transforms, we consider two adjacent real samples to be a complex number as-is.
|
|
// Separate "resolve" passes are added to make the transform correct.
|
|
|
|
#if defined(FFT_VEC2)
|
|
#define cfloat vec2
|
|
#define cfloat_buffer_fp16 uint
|
|
#elif defined(FFT_VEC4)
|
|
#define cfloat vec4
|
|
#define cfloat_buffer_fp16 uvec2
|
|
#elif defined(FFT_VEC8)
|
|
#if !defined(FFT_INPUT_FP16) || !defined(FFT_OUTPUT_FP16) || !defined(FFT_FP16)
|
|
#error FFT_VEC8 must use FP16 everywhere.
|
|
#endif
|
|
#define cfloat uvec4
|
|
#define cfloat_buffer_fp16 uvec4
|
|
#else
|
|
#error FFT_VEC2, FFT_VEC4 or FFT_VEC8 must be defined.
|
|
#endif
|
|
|
|
#ifdef FFT_INPUT_FP16
|
|
#define cfloat_buffer_in cfloat_buffer_fp16
|
|
#else
|
|
#define cfloat_buffer_in cfloat
|
|
#endif
|
|
|
|
#ifdef FFT_OUTPUT_FP16
|
|
#define cfloat_buffer_out cfloat_buffer_fp16
|
|
#else
|
|
#define cfloat_buffer_out cfloat
|
|
#endif
|
|
|
|
// Normally this would be sqrt(1 / radix), but we'd have to apply normalization
|
|
// for every pass instead of just half of them. Also, 1 / 2^n is "lossless" in FP math.
|
|
#ifdef FFT_NORMALIZE
|
|
#define FFT_NORM_FACTOR (1.0 / float(FFT_RADIX))
|
|
#endif
|
|
|
|
// FFT_CVECTOR_SIZE defines an interleaving stride for the first pass.
|
|
// The first FFT pass with stockham autosort needs to do some shuffling around if we're processing
|
|
// more than one complex value per vector.
|
|
// This is only needed for horizontal transforms since we vectorize horizontally and different elements
|
|
// in the vector are from different transforms when we do vertical transforms.
|
|
|
|
#if defined(FFT_P1) && !defined(FFT_DUAL) && defined(FFT_HORIZ) && defined(FFT_VEC8)
|
|
#define FFT_CVECTOR_SIZE 4
|
|
#elif defined(FFT_P1) && ((!defined(FFT_DUAL) && defined(FFT_HORIZ) && defined(FFT_VEC4)) || (defined(FFT_DUAL) && defined(FFT_HORIZ) && defined(FFT_VEC8)))
|
|
#define FFT_CVECTOR_SIZE 2
|
|
#else
|
|
#define FFT_CVECTOR_SIZE 1
|
|
#endif
|
|
|
|
#ifdef GL_ES
|
|
#define FFT_HIGHP highp
|
|
#else
|
|
#define FFT_HIGHP
|
|
#endif
|
|
|
|
#ifdef FFT_VEC8
|
|
|
|
// Currently unlikely to be useful.
|
|
uvec4 PADD(uvec4 a, uvec4 b)
|
|
{
|
|
return uvec4(
|
|
packHalf2x16(unpackHalf2x16(a.x) + unpackHalf2x16(b.x)),
|
|
packHalf2x16(unpackHalf2x16(a.y) + unpackHalf2x16(b.y)),
|
|
packHalf2x16(unpackHalf2x16(a.z) + unpackHalf2x16(b.z)),
|
|
packHalf2x16(unpackHalf2x16(a.w) + unpackHalf2x16(b.w)));
|
|
}
|
|
|
|
uvec4 PSUB(uvec4 a, uvec4 b)
|
|
{
|
|
return uvec4(
|
|
packHalf2x16(unpackHalf2x16(a.x) - unpackHalf2x16(b.x)),
|
|
packHalf2x16(unpackHalf2x16(a.y) - unpackHalf2x16(b.y)),
|
|
packHalf2x16(unpackHalf2x16(a.z) - unpackHalf2x16(b.z)),
|
|
packHalf2x16(unpackHalf2x16(a.w) - unpackHalf2x16(b.w)));
|
|
}
|
|
|
|
uvec4 PMUL(uvec4 a, uvec4 b)
|
|
{
|
|
return uvec4(
|
|
packHalf2x16(unpackHalf2x16(a.x) * unpackHalf2x16(b.x)),
|
|
packHalf2x16(unpackHalf2x16(a.y) * unpackHalf2x16(b.y)),
|
|
packHalf2x16(unpackHalf2x16(a.z) * unpackHalf2x16(b.z)),
|
|
packHalf2x16(unpackHalf2x16(a.w) * unpackHalf2x16(b.w)));
|
|
}
|
|
|
|
uvec4 CONJ_SWIZZLE(uvec4 v)
|
|
{
|
|
return uvec4(
|
|
packHalf2x16(unpackHalf2x16(v.x).yx),
|
|
packHalf2x16(unpackHalf2x16(v.y).yx),
|
|
packHalf2x16(unpackHalf2x16(v.z).yx),
|
|
packHalf2x16(unpackHalf2x16(v.w).yx));
|
|
}
|
|
|
|
uvec4 LDUP_SWIZZLE(uvec4 v)
|
|
{
|
|
return uvec4(
|
|
packHalf2x16(unpackHalf2x16(v.x).xx),
|
|
packHalf2x16(unpackHalf2x16(v.y).xx),
|
|
packHalf2x16(unpackHalf2x16(v.z).xx),
|
|
packHalf2x16(unpackHalf2x16(v.w).xx));
|
|
}
|
|
|
|
uvec4 HDUP_SWIZZLE(uvec4 v)
|
|
{
|
|
return uvec4(
|
|
packHalf2x16(unpackHalf2x16(v.x).yy),
|
|
packHalf2x16(unpackHalf2x16(v.y).yy),
|
|
packHalf2x16(unpackHalf2x16(v.z).yy),
|
|
packHalf2x16(unpackHalf2x16(v.w).yy));
|
|
}
|
|
|
|
// Sign-flip. Works for the cases we're interested in.
|
|
uvec4 cmul_minus_j(uvec4 v)
|
|
{
|
|
return uvec4(0x80000000u) ^ CONJ_SWIZZLE(v);
|
|
}
|
|
|
|
uvec4 cmul_plus_j(uvec4 v)
|
|
{
|
|
return uvec4(0x00008000u) ^ CONJ_SWIZZLE(v);
|
|
}
|
|
|
|
uvec4 cmul(uvec4 a, uvec4 b)
|
|
{
|
|
uvec4 r3 = CONJ_SWIZZLE(a);
|
|
uvec4 r1 = LDUP_SWIZZLE(b);
|
|
uvec4 R0 = PMUL(a, r1);
|
|
uvec4 r2 = HDUP_SWIZZLE(b);
|
|
uvec4 R1 = PMUL(r2, r3);
|
|
return PADD(R0, uvec4(0x8000u) ^ R1);
|
|
}
|
|
|
|
void butterfly(inout uvec4 a, inout uvec4 b, uvec4 w)
|
|
{
|
|
uvec4 t = cmul(b, w);
|
|
b = PSUB(a, t);
|
|
a = PADD(a, t);
|
|
}
|
|
|
|
void butterfly(inout uvec4 a, inout uvec4 b, vec4 w)
|
|
{
|
|
uvec4 t = cmul(b, uvec2(packHalf2x16(w.xy), packHalf2x16(w.zw)).xxyy);
|
|
b = PSUB(a, t);
|
|
a = PADD(a, t);
|
|
}
|
|
|
|
void butterfly(inout uvec4 a, inout uvec4 b, vec2 w)
|
|
{
|
|
uvec4 t = cmul(b, uvec4(packHalf2x16(w)));
|
|
b = PSUB(a, t);
|
|
a = PADD(a, t);
|
|
}
|
|
|
|
void butterfly_p1(inout uvec4 a, inout uvec4 b)
|
|
{
|
|
uvec4 t = b;
|
|
b = PSUB(a, t);
|
|
a = PADD(a, t);
|
|
}
|
|
|
|
void butterfly_p1_minus_j(inout uvec4 a, inout uvec4 b)
|
|
{
|
|
uvec4 t = b;
|
|
b = uvec4(0x80000000u) ^ (PSUB(CONJ_SWIZZLE(a), CONJ_SWIZZLE(t)));
|
|
a = PADD(a, t);
|
|
}
|
|
|
|
void butterfly_p1_plus_j(inout uvec4 a, inout uvec4 b)
|
|
{
|
|
uvec4 t = b;
|
|
b = uvec4(0x00008000u) ^ (PSUB(CONJ_SWIZZLE(a), CONJ_SWIZZLE(t)));
|
|
a = PADD(a, t);
|
|
}
|
|
#endif
|
|
|
|
// Complex multiply.
|
|
vec4 cmul(vec4 a, vec4 b)
|
|
{
|
|
vec4 r3 = a.yxwz;
|
|
vec4 r1 = b.xxzz;
|
|
vec4 R0 = a * r1;
|
|
vec4 r2 = b.yyww;
|
|
vec4 R1 = r2 * r3;
|
|
return R0 + vec4(-R1.x, R1.y, -R1.z, R1.w);
|
|
}
|
|
|
|
vec2 cmul(vec2 a, vec2 b)
|
|
{
|
|
vec2 r3 = a.yx;
|
|
vec2 r1 = b.xx;
|
|
vec2 R0 = a * r1;
|
|
vec2 r2 = b.yy;
|
|
vec2 R1 = r2 * r3;
|
|
return R0 + vec2(-R1.x, R1.y);
|
|
}
|
|
|
|
#ifdef FFT_INPUT_TEXTURE
|
|
|
|
#ifndef FFT_P1
|
|
#error Input texture can only be used when P == 1.
|
|
#endif
|
|
|
|
#ifdef GL_ES
|
|
#if defined(FFT_INPUT_FP16) || defined(FFT_FP16)
|
|
precision mediump sampler2D;
|
|
#else
|
|
precision highp sampler2D;
|
|
#endif
|
|
#endif
|
|
|
|
#define uTexelOffset constant_data.texture_offset_scale.xy
|
|
#define uTexelScale constant_data.texture_offset_scale.zw
|
|
|
|
layout(binding = BINDING_TEXTURE0) uniform sampler2D uTexture;
|
|
#ifdef FFT_CONVOLVE
|
|
layout(binding = BINDING_TEXTURE1) uniform sampler2D uTexture2;
|
|
#endif
|
|
|
|
cfloat load_texture(sampler2D sampler, uvec2 coord)
|
|
{
|
|
FFT_HIGHP vec2 uv = vec2(coord) * uTexelScale + uTexelOffset;
|
|
|
|
// Quite messy, this :)
|
|
#if defined(FFT_VEC8)
|
|
#if defined(FFT_INPUT_REAL)
|
|
return uvec4(
|
|
packHalf2x16(vec2(textureLodOffset(sampler, uv, 0.0, ivec2(0, 0)).x, textureLodOffset(sampler, uv, 0.0, ivec2(1, 0)).x)),
|
|
packHalf2x16(vec2(textureLodOffset(sampler, uv, 0.0, ivec2(2, 0)).x, textureLodOffset(sampler, uv, 0.0, ivec2(3, 0)).x)),
|
|
packHalf2x16(vec2(textureLodOffset(sampler, uv, 0.0, ivec2(4, 0)).x, textureLodOffset(sampler, uv, 0.0, ivec2(5, 0)).x)),
|
|
packHalf2x16(vec2(textureLodOffset(sampler, uv, 0.0, ivec2(6, 0)).x, textureLodOffset(sampler, uv, 0.0, ivec2(7, 0)).x)));
|
|
#elif defined(FFT_DUAL)
|
|
vec4 c0 = textureLodOffset(sampler, uv, 0.0, ivec2(0, 0));
|
|
vec4 c1 = textureLodOffset(sampler, uv, 0.0, ivec2(1, 0));
|
|
return uvec4(packHalf2x16(c0.xy), packHalf2x16(c0.zw), packHalf2x16(c1.xy), packHalf2x16(c1.zw));
|
|
#else
|
|
return uvec4(
|
|
packHalf2x16(textureLodOffset(sampler, uv, 0.0, ivec2(0, 0)).xy),
|
|
packHalf2x16(textureLodOffset(sampler, uv, 0.0, ivec2(1, 0)).xy),
|
|
packHalf2x16(textureLodOffset(sampler, uv, 0.0, ivec2(2, 0)).xy),
|
|
packHalf2x16(textureLodOffset(sampler, uv, 0.0, ivec2(3, 0)).xy));
|
|
#endif
|
|
#elif defined(FFT_VEC4)
|
|
#if defined(FFT_INPUT_REAL)
|
|
return vec4(
|
|
textureLodOffset(sampler, uv, 0.0, ivec2(0, 0)).x,
|
|
textureLodOffset(sampler, uv, 0.0, ivec2(1, 0)).x,
|
|
textureLodOffset(sampler, uv, 0.0, ivec2(2, 0)).x,
|
|
textureLodOffset(sampler, uv, 0.0, ivec2(3, 0)).x);
|
|
#elif defined(FFT_DUAL)
|
|
return textureLod(sampler, uv, 0.0);
|
|
#else
|
|
return vec4(
|
|
textureLodOffset(sampler, uv, 0.0, ivec2(0, 0)).xy,
|
|
textureLodOffset(sampler, uv, 0.0, ivec2(1, 0)).xy);
|
|
#endif
|
|
#elif defined(FFT_VEC2)
|
|
#if defined(FFT_INPUT_REAL)
|
|
return vec2(
|
|
textureLodOffset(sampler, uv, 0.0, ivec2(0, 0)).x,
|
|
textureLodOffset(sampler, uv, 0.0, ivec2(1, 0)).x);
|
|
#else
|
|
return textureLod(sampler, uv, 0.0).xy;
|
|
#endif
|
|
#endif
|
|
}
|
|
|
|
cfloat load_texture(uvec2 coord)
|
|
{
|
|
#ifdef FFT_CONVOLVE
|
|
// Convolution in frequency domain is multiplication.
|
|
cfloat c0 = load_texture(uTexture, coord);
|
|
cfloat c1 = load_texture(uTexture2, coord);
|
|
return cmul(c0, c1);
|
|
#else
|
|
return load_texture(uTexture, coord);
|
|
#endif
|
|
}
|
|
|
|
// Implement a dummy load_global, or we have to #ifdef out lots of dead code elsewhere.
|
|
#ifdef FFT_VEC8
|
|
cfloat load_global(uint offset)
|
|
{
|
|
return cfloat(0u);
|
|
}
|
|
#else
|
|
cfloat load_global(uint offset)
|
|
{
|
|
return cfloat(0.0);
|
|
}
|
|
#endif
|
|
|
|
#else
|
|
|
|
layout(std430, binding = BINDING_SSBO_IN) readonly buffer Block
|
|
{
|
|
cfloat_buffer_in data[];
|
|
} fft_in;
|
|
|
|
#ifdef FFT_CONVOLVE
|
|
layout(std430, binding = BINDING_SSBO_AUX) readonly buffer Block2
|
|
{
|
|
cfloat_buffer_in data[];
|
|
} fft_in2;
|
|
|
|
cfloat load_global(uint offset)
|
|
{
|
|
// Convolution in frequency domain is multiplication.
|
|
#if defined(FFT_INPUT_FP16) && defined(FFT_VEC2)
|
|
return cmul(unpackHalf2x16(fft_in.data[offset]), unpackHalf2x16(fft_in2.data[offset]));
|
|
#elif defined(FFT_INPUT_FP16) && defined(FFT_VEC4)
|
|
uvec2 data = fft_in.data[offset];
|
|
uvec2 data2 = fft_in2.data[offset];
|
|
return cmul(vec4(unpackHalf2x16(data.x), unpackHalf2x16(data.y)), vec4(unpackHalf2x16(data2.x), unpackHalf2x16(data2.y)));
|
|
#else
|
|
return cmul(fft_in.data[offset], fft_in2.data[offset]);
|
|
#endif
|
|
}
|
|
#else
|
|
cfloat load_global(uint offset)
|
|
{
|
|
#if defined(FFT_INPUT_FP16) && defined(FFT_VEC2)
|
|
return unpackHalf2x16(fft_in.data[offset]);
|
|
#elif defined(FFT_INPUT_FP16) && defined(FFT_VEC4)
|
|
uvec2 data = fft_in.data[offset];
|
|
return vec4(unpackHalf2x16(data.x), unpackHalf2x16(data.y));
|
|
#else
|
|
return fft_in.data[offset];
|
|
#endif
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
#ifndef FFT_OUTPUT_IMAGE
|
|
layout(std430, binding = BINDING_SSBO_OUT) writeonly buffer BlockOut
|
|
{
|
|
cfloat_buffer_out data[];
|
|
} fft_out;
|
|
|
|
void store_global(uint offset, cfloat v)
|
|
{
|
|
#ifdef FFT_NORM_FACTOR
|
|
#ifdef FFT_VEC8
|
|
v = PMUL(uvec4(packHalf2x16(vec2(FFT_NORM_FACTOR))), v);
|
|
#else
|
|
v *= FFT_NORM_FACTOR;
|
|
#endif
|
|
#endif
|
|
|
|
#if defined(FFT_OUTPUT_FP16) && defined(FFT_VEC2)
|
|
fft_out.data[offset] = packHalf2x16(v);
|
|
#elif defined(FFT_OUTPUT_FP16) && defined(FFT_VEC4)
|
|
fft_out.data[offset] = uvec2(packHalf2x16(v.xy), packHalf2x16(v.zw));
|
|
#else
|
|
fft_out.data[offset] = v;
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#ifdef FFT_OUTPUT_IMAGE
|
|
|
|
#ifdef GL_ES
|
|
#ifdef FFT_OUTPUT_REAL
|
|
precision highp image2D;
|
|
#else
|
|
precision mediump image2D;
|
|
#endif
|
|
precision highp uimage2D;
|
|
#endif
|
|
|
|
//#ifdef FFT_P1
|
|
//#error FFT_OUTPUT_IMAGE is not supported in first pass.
|
|
//#endif
|
|
|
|
// Currently, GLFFT only supports outputing to "fixed" formats like these.
|
|
// Should be possible to add options for this to at least choose between FP16/FP32 output,
|
|
// and maybe rgba8_unorm for FFT_DUAL case.
|
|
#if defined(FFT_DUAL)
|
|
layout(rgba16f, binding = BINDING_IMAGE) uniform writeonly image2D uImage;
|
|
#elif defined(FFT_OUTPUT_REAL)
|
|
layout(r32f, binding = BINDING_IMAGE) uniform writeonly image2D uImage;
|
|
#else
|
|
// GLES 3.1 doesn't support rg16f layout for some reason, so work around it ...
|
|
layout(r32ui, binding = BINDING_IMAGE) uniform writeonly uimage2D uImage;
|
|
#endif
|
|
|
|
void store(ivec2 coord, vec4 value)
|
|
{
|
|
#ifdef FFT_NORM_FACTOR
|
|
value *= FFT_NORM_FACTOR;
|
|
#endif
|
|
|
|
#if defined(FFT_DUAL)
|
|
imageStore(uImage, coord, value);
|
|
#elif defined(FFT_HORIZ)
|
|
#ifdef FFT_OUTPUT_REAL
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(0, 0), value.xxxx);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(1, 0), value.yyyy);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(2, 0), value.zzzz);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(3, 0), value.wwww);
|
|
#else
|
|
imageStore(uImage, coord + ivec2(0, 0), uvec4(packHalf2x16(value.xy)));
|
|
imageStore(uImage, coord + ivec2(1, 0), uvec4(packHalf2x16(value.zw)));
|
|
#endif
|
|
#elif defined(FFT_VERT)
|
|
#ifdef FFT_OUTPUT_REAL
|
|
imageStore(uImage, coord * ivec2(4, 1) + ivec2(0, 0), value.xxxx);
|
|
imageStore(uImage, coord * ivec2(4, 1) + ivec2(1, 0), value.yyyy);
|
|
imageStore(uImage, coord * ivec2(4, 1) + ivec2(2, 0), value.zzzz);
|
|
imageStore(uImage, coord * ivec2(4, 1) + ivec2(3, 0), value.wwww);
|
|
#else
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(0, 0), uvec4(packHalf2x16(value.xy)));
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(1, 0), uvec4(packHalf2x16(value.zw)));
|
|
#endif
|
|
#else
|
|
#error Inconsistent defines.
|
|
#endif
|
|
}
|
|
|
|
#ifndef FFT_DUAL
|
|
void store(ivec2 coord, vec2 value)
|
|
{
|
|
#ifdef FFT_NORM_FACTOR
|
|
value *= FFT_NORM_FACTOR;
|
|
#endif
|
|
|
|
#if defined(FFT_HORIZ)
|
|
#ifdef FFT_OUTPUT_REAL
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(0, 0), value.xxxx);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(1, 0), value.yyyy);
|
|
#else
|
|
imageStore(uImage, coord, uvec4(packHalf2x16(value.xy)));
|
|
#endif
|
|
#elif defined(FFT_VERT)
|
|
#ifdef FFT_OUTPUT_REAL
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(0, 0), value.xxxx);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(1, 0), value.yyyy);
|
|
#else
|
|
imageStore(uImage, coord, uvec4(packHalf2x16(value.xy)));
|
|
#endif
|
|
#else
|
|
#error Inconsistent defines.
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#ifdef FFT_VEC8
|
|
void store(ivec2 coord, uvec4 value)
|
|
{
|
|
#ifdef FFT_NORM_FACTOR
|
|
value = PMUL(value, uvec4(packHalf2x16(vec2(FFT_NORM_FACTOR))));
|
|
#endif
|
|
|
|
#if defined(FFT_DUAL)
|
|
#if defined(FFT_HORIZ)
|
|
imageStore(uImage, coord + ivec2(0, 0), vec4(unpackHalf2x16(value.x), unpackHalf2x16(value.y)));
|
|
imageStore(uImage, coord + ivec2(1, 0), vec4(unpackHalf2x16(value.z), unpackHalf2x16(value.w)));
|
|
#else
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(0, 0), vec4(unpackHalf2x16(value.x), unpackHalf2x16(value.y)));
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(1, 0), vec4(unpackHalf2x16(value.z), unpackHalf2x16(value.w)));
|
|
#endif
|
|
#elif defined(FFT_HORIZ)
|
|
#ifdef FFT_OUTPUT_REAL
|
|
vec2 value0 = unpackHalf2x16(value.x);
|
|
vec2 value1 = unpackHalf2x16(value.y);
|
|
vec2 value2 = unpackHalf2x16(value.z);
|
|
vec2 value3 = unpackHalf2x16(value.w);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(0, 0), value0.xxxx);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(1, 0), value0.yyyy);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(2, 0), value1.xxxx);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(3, 0), value1.yyyy);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(4, 0), value2.xxxx);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(5, 0), value2.yyyy);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(6, 0), value3.xxxx);
|
|
imageStore(uImage, coord * ivec2(2, 1) + ivec2(7, 0), value3.yyyy);
|
|
#else
|
|
imageStore(uImage, coord + ivec2(0, 0), value.xxxx);
|
|
imageStore(uImage, coord + ivec2(1, 0), value.yyyy);
|
|
imageStore(uImage, coord + ivec2(2, 0), value.zzzz);
|
|
imageStore(uImage, coord + ivec2(3, 0), value.wwww);
|
|
#endif
|
|
#elif defined(FFT_VERT)
|
|
#ifdef FFT_OUTPUT_REAL
|
|
vec2 value0 = unpackHalf2x16(value.x);
|
|
vec2 value1 = unpackHalf2x16(value.y);
|
|
vec2 value2 = unpackHalf2x16(value.z);
|
|
vec2 value3 = unpackHalf2x16(value.w);
|
|
imageStore(uImage, coord * ivec2(8, 1) + ivec2(0, 0), value0.xxxx);
|
|
imageStore(uImage, coord * ivec2(8, 1) + ivec2(1, 0), value0.yyyy);
|
|
imageStore(uImage, coord * ivec2(8, 1) + ivec2(2, 0), value1.xxxx);
|
|
imageStore(uImage, coord * ivec2(8, 1) + ivec2(3, 0), value1.yyyy);
|
|
imageStore(uImage, coord * ivec2(8, 1) + ivec2(4, 0), value2.xxxx);
|
|
imageStore(uImage, coord * ivec2(8, 1) + ivec2(5, 0), value2.yyyy);
|
|
imageStore(uImage, coord * ivec2(8, 1) + ivec2(6, 0), value3.xxxx);
|
|
imageStore(uImage, coord * ivec2(8, 1) + ivec2(7, 0), value3.yyyy);
|
|
#else
|
|
imageStore(uImage, coord * ivec2(4, 1) + ivec2(0, 0), value.xxxx);
|
|
imageStore(uImage, coord * ivec2(4, 1) + ivec2(1, 0), value.yyyy);
|
|
imageStore(uImage, coord * ivec2(4, 1) + ivec2(2, 0), value.zzzz);
|
|
imageStore(uImage, coord * ivec2(4, 1) + ivec2(3, 0), value.wwww);
|
|
#endif
|
|
#else
|
|
#error Inconsistent defines.
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
#endif
|
|
|
|
#define PI 3.14159265359
|
|
#define SQRT_1_2 0.70710678118
|
|
|
|
#ifdef FFT_INVERSE
|
|
#define PI_DIR (+PI)
|
|
#else
|
|
#define PI_DIR (-PI)
|
|
#endif
|
|
|
|
// Some GLES implementations have lower trancendental precision than desired which
|
|
// significantly affects the overall FFT precision.
|
|
// For these implementations it might make sense to add a LUT UBO with twiddle factors,
|
|
// which can be used here.
|
|
|
|
// 4-component FP16 twiddles, pack in uvec4.
|
|
#if !defined(FFT_DUAL) && defined(FFT_HORIZ) && defined(FFT_VEC8)
|
|
#define FFT_OUTPUT_STEP 4u
|
|
#define FFT_OUTPUT_SHIFT 2u
|
|
#define ctwiddle uvec4
|
|
ctwiddle twiddle(uint k, uint p)
|
|
{
|
|
// Trancendentals should always be done in highp.
|
|
FFT_HIGHP vec4 angles = PI_DIR * (float(k) + vec4(0.0, 1.0, 2.0, 3.0)) / float(p);
|
|
FFT_HIGHP vec4 cos_a = cos(angles);
|
|
FFT_HIGHP vec4 sin_a = sin(angles);
|
|
return ctwiddle(
|
|
packHalf2x16(vec2(cos_a.x, sin_a.x)),
|
|
packHalf2x16(vec2(cos_a.y, sin_a.y)),
|
|
packHalf2x16(vec2(cos_a.z, sin_a.z)),
|
|
packHalf2x16(vec2(cos_a.w, sin_a.w)));
|
|
}
|
|
|
|
#ifdef FFT_INVERSE
|
|
#define TWIDDLE_1_8 (uvec4(packHalf2x16(vec2(+SQRT_1_2, +SQRT_1_2))))
|
|
#define TWIDDLE_3_8 (uvec4(packHalf2x16(vec2(-SQRT_1_2, +SQRT_1_2))))
|
|
#else
|
|
#define TWIDDLE_1_8 (uvec4(packHalf2x16(vec2(+SQRT_1_2, -SQRT_1_2))))
|
|
#define TWIDDLE_3_8 (uvec4(packHalf2x16(vec2(-SQRT_1_2, -SQRT_1_2))))
|
|
#endif
|
|
|
|
// 2-component twiddles, pack in vec4.
|
|
#elif (!defined(FFT_DUAL) && defined(FFT_HORIZ) && defined(FFT_VEC4)) || (defined(FFT_DUAL) && defined(FFT_HORIZ) && defined(FFT_VEC8))
|
|
#define FFT_OUTPUT_STEP 2u
|
|
#define FFT_OUTPUT_SHIFT 1u
|
|
#define ctwiddle vec4
|
|
ctwiddle twiddle(uint k, uint p)
|
|
{
|
|
// Trancendentals should always be done in highp.
|
|
FFT_HIGHP vec2 angles = PI_DIR * (float(k) + vec2(0.0, 1.0)) / float(p);
|
|
FFT_HIGHP vec2 cos_a = cos(angles);
|
|
FFT_HIGHP vec2 sin_a = sin(angles);
|
|
return ctwiddle(cos_a.x, sin_a.x, cos_a.y, sin_a.y);
|
|
}
|
|
|
|
#ifdef FFT_INVERSE
|
|
#define TWIDDLE_1_8 (vec2(+SQRT_1_2, +SQRT_1_2).xyxy)
|
|
#define TWIDDLE_3_8 (vec2(-SQRT_1_2, +SQRT_1_2).xyxy)
|
|
#else
|
|
#define TWIDDLE_1_8 (vec2(+SQRT_1_2, -SQRT_1_2).xyxy)
|
|
#define TWIDDLE_3_8 (vec2(-SQRT_1_2, -SQRT_1_2).xyxy)
|
|
#endif
|
|
|
|
// 1-component twiddle, pack in vec2.
|
|
#else
|
|
|
|
#define FFT_OUTPUT_STEP 1u
|
|
#define FFT_OUTPUT_SHIFT 0u
|
|
#define ctwiddle vec2
|
|
ctwiddle twiddle(uint k, uint p)
|
|
{
|
|
// Trancendentals should always be done in highp.
|
|
FFT_HIGHP float angle = PI_DIR * float(k) / float(p);
|
|
return ctwiddle(cos(angle), sin(angle));
|
|
}
|
|
|
|
#ifdef FFT_INVERSE
|
|
#define TWIDDLE_1_8 (vec2(+SQRT_1_2, +SQRT_1_2))
|
|
#define TWIDDLE_3_8 (vec2(-SQRT_1_2, +SQRT_1_2))
|
|
#else
|
|
#define TWIDDLE_1_8 (vec2(+SQRT_1_2, -SQRT_1_2))
|
|
#define TWIDDLE_3_8 (vec2(-SQRT_1_2, -SQRT_1_2))
|
|
#endif
|
|
|
|
#endif
|
|
|
|
// Complex multiply by v * -j. Trivial case which can avoid mul/add.
|
|
vec4 cmul_minus_j(vec4 v)
|
|
{
|
|
return vec4(v.y, -v.x, v.w, -v.z);
|
|
}
|
|
|
|
vec2 cmul_minus_j(vec2 v)
|
|
{
|
|
return vec2(v.y, -v.x);
|
|
}
|
|
|
|
// Complex multiply by v * +j. Trivial case which can avoid mul/add.
|
|
vec4 cmul_plus_j(vec4 v)
|
|
{
|
|
return vec4(-v.y, v.x, -v.w, v.z);
|
|
}
|
|
|
|
vec2 cmul_plus_j(vec2 v)
|
|
{
|
|
return vec2(-v.y, v.x);
|
|
}
|
|
|
|
#ifdef FFT_INVERSE
|
|
#define cmul_dir_j(v) cmul_plus_j(v)
|
|
#else
|
|
#define cmul_dir_j(v) cmul_minus_j(v)
|
|
#endif
|
|
|
|
// Calculate an in-place butterfly with twiddle factors.
|
|
// a ----------- a + wb
|
|
// \ /
|
|
// \ /
|
|
// X
|
|
// / \
|
|
// / \
|
|
// w * b ------- a - wb
|
|
//
|
|
void butterfly(inout vec4 a, inout vec4 b, vec4 w)
|
|
{
|
|
vec4 t = cmul(b, w);
|
|
b = a - t;
|
|
a = a + t;
|
|
}
|
|
|
|
// Computes butterflies, but the twiddle factors for the two butterflies are
|
|
// identical.
|
|
void butterfly(inout vec4 a, inout vec4 b, vec2 w)
|
|
{
|
|
butterfly(a, b, w.xyxy);
|
|
}
|
|
|
|
void butterfly(inout vec2 a, inout vec2 b, vec2 w)
|
|
{
|
|
vec2 t = cmul(b, w);
|
|
b = a - t;
|
|
a = a + t;
|
|
}
|
|
|
|
// First pass butterfly, special case where w = 1.
|
|
void butterfly_p1(inout vec4 a, inout vec4 b)
|
|
{
|
|
vec4 t = b;
|
|
b = a - t;
|
|
a = a + t;
|
|
}
|
|
|
|
// First pass butterfly, but also multiply in a twiddle factor of -j to b afterwards.
|
|
// Used in P == 1 transforms for radix-4, radix-8 etc.
|
|
void butterfly_p1_minus_j(inout vec4 a, inout vec4 b)
|
|
{
|
|
vec4 t = b;
|
|
b = vec4(1.0, -1.0, 1.0, -1.0) * (a.yxwz - t.yxwz);
|
|
a = a + t;
|
|
}
|
|
|
|
void butterfly_p1_plus_j(inout vec4 a, inout vec4 b)
|
|
{
|
|
vec4 t = b;
|
|
b = vec4(-1.0, 1.0, -1.0, 1.0) * (a.yxwz - t.yxwz);
|
|
a = a + t;
|
|
}
|
|
|
|
void butterfly_p1(inout vec2 a, inout vec2 b)
|
|
{
|
|
vec2 t = b;
|
|
b = a - t;
|
|
a = a + t;
|
|
}
|
|
|
|
void butterfly_p1_minus_j(inout vec2 a, inout vec2 b)
|
|
{
|
|
vec2 t = b;
|
|
b = vec2(1.0, -1.0) * (a.yx - t.yx);
|
|
a = a + t;
|
|
}
|
|
|
|
void butterfly_p1_plus_j(inout vec2 a, inout vec2 b)
|
|
{
|
|
vec2 t = b;
|
|
b = vec2(-1.0, 1.0) * (a.yx - t.yx);
|
|
a = a + t;
|
|
}
|
|
|
|
#ifdef FFT_INVERSE
|
|
#define butterfly_p1_dir_j(a, b) butterfly_p1_plus_j(a, b)
|
|
#else
|
|
#define butterfly_p1_dir_j(a, b) butterfly_p1_minus_j(a, b)
|
|
#endif
|
|
|
|
#ifdef FFT_RESOLVE_REAL_TO_COMPLEX
|
|
vec2 r2c_twiddle(uint i, uint p)
|
|
{
|
|
vec2 w = -twiddle(i, p);
|
|
return vec2(-w.y, w.x);
|
|
}
|
|
|
|
// See http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM for
|
|
// how the real-to-complex and complex-to-real resolve passes work.
|
|
// The final real-to-complex transform pass is done by extracting two interleaved FFTs by conjugate symmetry.
|
|
|
|
// If we have a real sequence:
|
|
// (r0, r1, r2, r3, r4, ...), we merge two adjacent real values to a sequence of complex numbers.
|
|
// We take the FFT of this complex sequence as normal.
|
|
// What we end up with really is:
|
|
// FFT((r0, r2, r4, r6, ...)) + FFT(j * (r1, r3, r5, r7, ...)).
|
|
// If we know the individual FFTs of the even and the odds we can complete the FFT by a single decimation-in-frequency stage.
|
|
// By conjugate symmetry, we can extract the even and odd FFTs and complex our transform.
|
|
// Complex-to-real is just the same thing, but in reverse.
|
|
|
|
void FFT_real_to_complex(uvec2 i)
|
|
{
|
|
uint stride = gl_NumWorkGroups.x * gl_WorkGroupSize.x;
|
|
uint offset = i.y * stride;
|
|
|
|
if (i.x == 0u)
|
|
{
|
|
#ifdef FFT_INPUT_TEXTURE
|
|
vec2 x = load_texture(i);
|
|
#else
|
|
vec2 x = load_global(offset);
|
|
#endif
|
|
|
|
#ifdef FFT_OUTPUT_IMAGE
|
|
store(ivec2(i), vec2(x.x + x.y, 0.0));
|
|
store(ivec2(i) + ivec2(stride, 0), vec2(x.x - x.y, 0.0));
|
|
#else
|
|
store_global(2u * offset, vec2(x.x + x.y, 0.0));
|
|
store_global(2u * offset + stride, vec2(x.x - x.y, 0.0));
|
|
#endif
|
|
}
|
|
else
|
|
{
|
|
#ifdef FFT_INPUT_TEXTURE
|
|
vec2 a = load_texture(i);
|
|
vec2 b = load_texture(uvec2(stride - i.x, i.y));
|
|
#else
|
|
vec2 a = load_global(offset + i.x);
|
|
vec2 b = load_global(offset + stride - i.x);
|
|
#endif
|
|
b = vec2(b.x, -b.y);
|
|
vec2 fe = a + b;
|
|
vec2 fo = cmul(a - b, r2c_twiddle(i.x, stride));
|
|
|
|
#ifdef FFT_OUTPUT_IMAGE
|
|
store(ivec2(i), 0.5 * (fe + fo));
|
|
#else
|
|
store_global(2u * offset + i.x, 0.5 * (fe + fo));
|
|
#endif
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#ifdef FFT_RESOLVE_COMPLEX_TO_REAL
|
|
vec2 c2r_twiddle(uint i, uint p)
|
|
{
|
|
vec2 w = twiddle(i, p);
|
|
return vec2(-w.y, w.x);
|
|
}
|
|
|
|
void FFT_complex_to_real(uvec2 i)
|
|
{
|
|
uint stride = gl_NumWorkGroups.x * gl_WorkGroupSize.x;
|
|
uint offset = i.y * stride;
|
|
|
|
#ifdef FFT_INPUT_TEXTURE
|
|
vec2 a = load_texture(i);
|
|
vec2 b = load_texture(uvec2(stride - i.x, i.y));
|
|
#else
|
|
vec2 a = load_global(2u * offset + i.x);
|
|
vec2 b = load_global(2u * offset + stride - i.x);
|
|
#endif
|
|
b = vec2(b.x, -b.y);
|
|
vec2 even = a + b;
|
|
vec2 odd = cmul(a - b, c2r_twiddle(i.x, stride));
|
|
|
|
store_global(offset + i.x, even + odd);
|
|
}
|
|
#endif
|
|
|