// glava-ridged/glfft/glfft.cpp

/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
*
* Permission is hereby granted, free of charge,
* to any person obtaining a copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "glfft.hpp"
#include <algorithm>
#include <stdexcept>
#include <fstream>
#include <sstream>
#include <numeric>
#include <assert.h>
#include <cmath>
#ifdef GLFFT_CLI_ASYNC
#include "glfft_cli.hpp"
#endif
/* GLava addition: should be defined by meson */
#if defined(GLAVA_STANDALONE) || !defined(SHADER_INSTALL_PATH)
#undef SHADER_INSTALL_PATH
#define SHADER_INSTALL_PATH "../shaders/glava"
#endif
using namespace std;
using namespace GLFFT;
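// Binding points used by the compute shaders.
// These must line up with the layout(binding = N) qualifiers in the GLSL sources.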
enum Bindings
{
BindingSSBOIn = 0,
BindingSSBOOut = 1,
BindingSSBOAux = 2,
BindingUBO = 3,
BindingTexture0 = 4,
BindingTexture1 = 5,
BindingImage = 6
};
struct WorkGroupSize
{
unsigned x, y, z;
};
struct Radix
{
WorkGroupSize size;
unsigned num_workgroups_x;
unsigned num_workgroups_y;
unsigned radix;
unsigned vector_size;
bool shared_banked;
};
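// Round v up to the next power of two, e.g. next_pow2(48) == 64, next_pow2(64) == 64.
// The descending shift order smears the MSB into every lower bit just like the usual
// ascending order does, since every offset 1..31 is a sum of distinct shifts
// from {16, 8, 4, 2, 1}.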
static unsigned next_pow2(unsigned v)
{
v--;
v |= v >> 16;
v |= v >> 8;
v |= v >> 4;
v |= v >> 2;
v |= v >> 1;
return v + 1;
}
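// Fold as much of `divisor` as possible into `wg_size`, leaving the remainder in
// `divisor`; e.g. wg_size = 4, divisor = 8 yields wg_size = 1, divisor = 2.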
static void reduce(unsigned &wg_size, unsigned &divisor)
{
if (divisor > 1 && wg_size >= divisor)
{
wg_size /= divisor;
divisor = 1;
}
else if (divisor > 1 && wg_size < divisor)
{
divisor /= wg_size;
wg_size = 1;
}
}
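// Composite radices are decomposed via shared memory (see build_program below):
// radix-16 runs as 4x4 over four Z-slices, radix-64 as 8x8 over eight.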
static unsigned radix_to_wg_z(unsigned radix)
{
switch (radix)
{
case 16:
return 4;
case 64:
return 8;
default:
return 1;
}
}
static Radix build_radix(unsigned Nx, unsigned Ny,
Mode mode, unsigned vector_size, bool shared_banked, unsigned radix,
WorkGroupSize size,
bool pow2_stride)
{
unsigned wg_x = 0, wg_y = 0;
if (Ny == 1 && size.y > 1)
{
throw logic_error("WorkGroupSize.y must be 1, when Ny == 1.\n");
}
// To avoid too many threads per workgroup due to workgroup_size_z,
// try to divide workgroup_size_y first, then workgroup_size_x.
// TODO: Make a better constraint solver which takes cache line sizes,
// image swizzling patterns, etc. into account. Not that critical though, since the
// wisdom interface will find the optimal options despite this.
unsigned divisor = size.z;
reduce(size.y, divisor);
reduce(size.x, divisor);
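// E.g. radix-64 (size.z == 8) with size = {64, 1, 8}: y stays 1, x shrinks to 8,
// keeping the total at 8 * 1 * 8 = 64 threads per workgroup.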
switch (mode)
{
case Vertical:
// If we have pow2_stride, we need to transform 2^n + 1 elements horizontally,
// so just add a single workgroup in X.
// We pad by going up to a pow2 stride anyway.
// We will transform some garbage,
// but it's better than transforming close to double the amount.
wg_x = (2 * Nx) / (vector_size * size.x) + pow2_stride;
wg_y = Ny / (size.y * radix);
break;
case VerticalDual:
vector_size = max(vector_size, 4u);
wg_x = (4 * Nx) / (vector_size * size.x);
wg_y = Ny / (size.y * radix);
break;
case Horizontal:
wg_x = (2 * Nx) / (vector_size * radix * size.x);
wg_y = Ny / size.y;
break;
case HorizontalDual:
vector_size = max(vector_size, 4u);
wg_x = (4 * Nx) / (vector_size * radix * size.x);
wg_y = Ny / size.y;
break;
default:
assert(0);
}
return { size, wg_x, wg_y, radix, vector_size, shared_banked };
}
// Resolve radices are simpler, and don't yet support different vector sizes, etc.
static Radix build_resolve_radix(unsigned Nx, unsigned Ny, WorkGroupSize size)
{
return { size, Nx / size.x, Ny / size.y, 2, 2, false };
}
// Smaller FFTs with larger workgroups are not always possible to create.
static bool is_radix_valid(unsigned Nx, unsigned Ny,
Mode mode, unsigned vector_size, unsigned radix,
WorkGroupSize size,
bool pow2_stride)
{
auto res = build_radix(Nx, Ny,
mode, vector_size, false, radix,
size,
pow2_stride);
return res.num_workgroups_x > 0 && res.num_workgroups_y > 0;
}
static double find_cost(unsigned Nx, unsigned Ny, Mode mode, unsigned radix,
const FFTOptions &options, const FFTWisdom &wisdom)
{
auto opt = wisdom.find_optimal_options(Nx, Ny, radix, mode, SSBO, SSBO, options.type);
// Return a very rough estimate if we cannot find a cost.
// The estimates generated here are deliberately huge,
// always much larger than any true measured cost.
// The purpose of this is to give a strong bias towards radices we have wisdom for.
// We also give a bias towards larger radices, since they are generally more BW efficient.
return opt ? opt->first.cost : Nx * Ny * (log2(float(radix)) + 2.0f);
}
struct CostPropagate
{
CostPropagate() = default;
CostPropagate(double cost, vector<unsigned> radices)
: cost(cost), radices(move(radices)) {}
void merge_if_better(const CostPropagate &a, const CostPropagate &b)
{
double new_cost = a.cost + b.cost;
if ((cost == 0.0 || new_cost < cost) && a.cost != 0.0 && b.cost != 0.0)
{
cost = new_cost;
radices = a.radices;
radices.insert(end(radices), begin(b.radices), end(b.radices));
}
}
double cost = 0.0;
vector<unsigned> radices;
};
static vector<Radix> split_radices(unsigned Nx, unsigned Ny, Mode mode, Target input_target, Target output_target,
const FFTOptions &options,
bool pow2_stride, const FFTWisdom &wisdom, double &accumulate_cost)
{
unsigned N;
switch (mode)
{
case Vertical:
case VerticalDual:
N = Ny;
break;
case Horizontal:
case HorizontalDual:
N = Nx;
break;
default:
return {};
}
// N == 1 is for things like Nx1 transforms where we don't do any vertical transforms.
if (N == 1)
{
return {};
}
// Treat cost 0.0 as invalid.
double cost_table[8] = {0.0};
CostPropagate cost_propagate[32];
// Fill the table with the fastest known ways to do radix 4, 8, 16 and 64.
// We'll then find the subdivision with the lowest additive cost.
cost_table[2] = find_cost(Nx, Ny, mode, 4, options, wisdom);
cost_table[3] = find_cost(Nx, Ny, mode, 8, options, wisdom);
cost_table[4] = find_cost(Nx, Ny, mode, 16, options, wisdom);
cost_table[6] = find_cost(Nx, Ny, mode, 64, options, wisdom);
auto is_valid = [&](unsigned radix) -> bool {
unsigned workgroup_size_z = radix_to_wg_z(radix);
auto &opt = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode, SSBO, SSBO, options);
// We don't want pow2_stride to round up a very inefficient work group and make the is_valid test pass.
return is_radix_valid(Nx, Ny,
mode, opt.vector_size, radix,
{ opt.workgroup_size_x, opt.workgroup_size_y, workgroup_size_z },
false);
};
// If our work-space is too small to allow certain radices, we disable them from consideration here.
for (unsigned i = 2; i <= 6; i++)
{
// Don't check the composite radix.
if (i == 5)
{
continue;
}
if (is_valid(1 << i))
{
cost_propagate[i] = CostPropagate(cost_table[i], { 1u << i });
}
}
// Now bubble this up all the way to N, starting from radix 16.
for (unsigned i = 4; (1u << i) <= N; i++)
{
auto &target = cost_propagate[i];
for (unsigned r = 2; i - r >= r; r++)
{
target.merge_if_better(cost_propagate[r], cost_propagate[i - r]);
}
if ((1u << i) == N && target.cost == 0.0)
{
throw logic_error("There is no possible subdivision ...\n");
}
}
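// E.g. for N = 128 the loop above ends up comparing 16 * 8 against 8 * 4 * 4
// (exponent 7 splits as 3 + 4 or as 2 + (2 + 3)); the cheaper split wins.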
// Ensure that the radix splits are sensible.
// A radix-N pass that is not the first (p != 1) requires that its p factor is at least N,
// so sort the splits with larger radices first.
// Composite radices like 16 and 64 are built from 4x4 and 8x8 sub-passes,
// so they only need p factors of at least 4 and 8 respectively.
// The total cost doesn't depend on the order in which we apply the radices.
auto &cost = cost_propagate[unsigned(log2(float(N)))];
auto radices = move(cost.radices);
sort(begin(radices), end(radices), greater<unsigned>());
if (accumulate(begin(radices), end(radices), 1u, multiplies<unsigned>()) != N)
{
throw logic_error("Radix splits are invalid.");
}
vector<Radix> radices_out;
radices_out.reserve(radices.size());
// Fill in the structs with all information.
for (auto radix : radices)
{
bool first = radices_out.empty();
bool last = radices_out.size() + 1 == radices.size();
// Use known performance options as a fallback.
// We used SSBO -> SSBO cost functions to find the optimal radix splits,
// but replace first and last options with Image -> SSBO / SSBO -> Image cost functions if appropriate.
auto &orig_opt = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode, SSBO, SSBO, options);
auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode,
first ? input_target : SSBO,
last ? output_target : SSBO,
{ orig_opt, options.type });
radices_out.push_back(build_radix(Nx, Ny,
mode, opts.vector_size, opts.shared_banked, radix,
{ opts.workgroup_size_x, opts.workgroup_size_y, radix_to_wg_z(radix) },
pow2_stride));
}
accumulate_cost += cost.cost;
return radices_out;
}
Program* ProgramCache::find_program(const Parameters &parameters) const
{
auto itr = programs.find(parameters);
if (itr != end(programs))
{
return itr->second.get();
}
else
{
return nullptr;
}
}
void ProgramCache::insert_program(const Parameters &parameters, std::unique_ptr<Program> program)
{
programs[parameters] = move(program);
}
Program* FFT::get_program(const Parameters &params)
{
Program *prog = cache->find_program(params);
if (!prog)
{
auto newprog = build_program(params);
if (!newprog)
{
throw runtime_error("Failed to compile shader.\n");
}
prog = newprog.get();
cache->insert_program(params, move(newprog));
}
return prog;
}
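// Number of scalar components a single input element carries in a given mode;
// vector_size / components then gives how many texels each thread reads in X.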
static inline unsigned mode_to_input_components(Mode mode)
{
switch (mode)
{
case HorizontalDual:
case VerticalDual:
return 4;
case Horizontal:
case Vertical:
case ResolveComplexToReal:
return 2;
case ResolveRealToComplex:
return 1;
default:
return 0;
}
}
FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
unsigned radix, unsigned p,
Mode mode, Target input_target, Target output_target,
std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options)
: context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny)
{
set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);
if (!Nx || !Ny || (Nx & (Nx - 1)) || (Ny & (Ny - 1)))
{
throw logic_error("FFT size is not POT.");
}
if (p != 1 && input_target != SSBO)
{
throw logic_error("P != 1 only supported with SSBO as input.");
}
if (p < radix && output_target != SSBO)
{
throw logic_error("P < radix only supported with SSBO as output.");
}
// We don't really care about transform direction since it's just a matter of sign-flipping twiddles,
// but we have to obey some fundamental assumptions of resolve passes.
Direction direction = mode == ResolveComplexToReal ? Inverse : Forward;
Radix res;
if (mode == ResolveRealToComplex || mode == ResolveComplexToReal)
{
res = build_resolve_radix(Nx, Ny, { options.performance.workgroup_size_x, options.performance.workgroup_size_y, 1 });
}
else
{
res = build_radix(Nx, Ny,
mode, options.performance.vector_size, options.performance.shared_banked, radix,
{ options.performance.workgroup_size_x, options.performance.workgroup_size_y, radix_to_wg_z(radix) },
false);
}
const Parameters params = {
res.size.x,
res.size.y,
res.size.z,
res.radix,
res.vector_size,
direction,
mode,
input_target,
output_target,
p == 1,
res.shared_banked,
options.type.fp16, options.type.input_fp16, options.type.output_fp16,
options.type.normalize,
};
if (res.num_workgroups_x == 0 || res.num_workgroups_y == 0)
{
throw logic_error("Invalid workgroup sizes for this radix.");
}
unsigned uv_scale_x = res.vector_size / mode_to_input_components(mode);
const Pass pass = {
params,
res.num_workgroups_x, res.num_workgroups_y,
uv_scale_x,
next_pow2(res.num_workgroups_x * params.workgroup_size_x),
get_program(params),
};
passes.push_back(pass);
}
static inline void print_radix_splits(Context *context, const vector<Radix> radices[2])
{
for (unsigned transform = 0; transform < 2; transform++)
{
context->log("Transform #%u\n", transform + 1);
for (auto &radix : radices[transform])
{
context->log(" Size: (%u, %u, %u)\n",
radix.size.x, radix.size.y, radix.size.z);
context->log(" Dispatch: (%u, %u)\n",
radix.num_workgroups_x, radix.num_workgroups_y);
context->log(" Radix: %u\n",
radix.radix);
context->log(" VectorSize: %u\n\n",
radix.vector_size);
}
}
}
static inline unsigned type_to_input_components(Type type)
{
switch (type)
{
case ComplexToComplex:
case ComplexToReal:
return 2;
case RealToComplex:
return 1;
case ComplexToComplexDual:
return 4;
default:
return 0;
}
}
FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
Type type, Direction direction, Target input_target, Target output_target,
std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom)
: context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny)
{
set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);
size_t temp_buffer_size = Nx * Ny * sizeof(float) * (type == ComplexToComplexDual ? 4 : 2);
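// FP16 output halves the required scratch memory (shifting by the bool divides by two).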
temp_buffer_size >>= options.type.output_fp16;
temp_buffer = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
if (output_target != SSBO)
{
temp_buffer_image = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
}
bool expand = false;
if (type == ComplexToReal || type == RealToComplex)
{
// If we're doing C2R or R2C, we'll need double the scratch memory,
// so make sure we're dividing Nx *after* allocating.
Nx /= 2;
expand = true;
}
// Sanity checks.
if (!Nx || !Ny || (Nx & (Nx - 1)) || (Ny & (Ny - 1)))
{
throw logic_error("FFT size is not POT.");
}
if (type == ComplexToReal && direction == Forward)
{
throw logic_error("ComplexToReal transforms requires inverse transform.");
}
if (type == RealToComplex && direction != Forward)
{
throw logic_error("RealToComplex transforms requires forward transform.");
}
if (type == RealToComplex && input_target == Image)
{
throw logic_error("Input real-to-complex must use ImageReal target.");
}
if (type == ComplexToReal && output_target == Image)
{
throw logic_error("Output complex-to-real must use ImageReal target.");
}
vector<Radix> radices[2];
Mode modes[2];
Target targets[4];
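// targets[0..1] are the first 1D transform's input/output, targets[2..3] the second's;
// intermediate results always go through SSBOs when both transforms run.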
switch (direction)
{
case Forward:
modes[0] = type == ComplexToComplexDual ? HorizontalDual : Horizontal;
modes[1] = type == ComplexToComplexDual ? VerticalDual : Vertical;
targets[0] = input_target;
targets[1] = Ny > 1 ? SSBO : output_target;
targets[2] = targets[1];
targets[3] = output_target;
radices[0] = split_radices(Nx, Ny, modes[0], targets[0], targets[1], options, false, wisdom, cost);
radices[1] = split_radices(Nx, Ny, modes[1], targets[2], targets[3], options, expand, wisdom, cost);
break;
case Inverse:
case InverseConvolve:
modes[0] = type == ComplexToComplexDual ? VerticalDual : Vertical;
modes[1] = type == ComplexToComplexDual ? HorizontalDual : Horizontal;
targets[0] = input_target;
targets[1] = Ny > 1 ? SSBO : input_target;
targets[2] = targets[1];
targets[3] = output_target;
radices[0] = split_radices(Nx, Ny, modes[0], targets[0], targets[1], options, expand, wisdom, cost);
radices[1] = split_radices(Nx, Ny, modes[1], targets[2], targets[3], options, false, wisdom, cost);
break;
}
#if 0
print_radix_splits(context, radices);
#endif
passes.reserve(radices[0].size() + radices[1].size() + expand);
unsigned index = 0;
unsigned last_index = (radices[1].empty() && !expand) ? 0 : 1;
for (auto &radix_direction : radices)
{
unsigned p = 1;
unsigned i = 0;
for (auto &radix : radix_direction)
{
// If this is the last pass and we're writing to an image, use a special shader variant.
bool last_pass = index == last_index && i == radix_direction.size() - 1;
bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
Target out_target = last_pass ? output_target : SSBO;
Target in_target = passes.empty() ? input_target : SSBO;
Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
unsigned uv_scale_x = radix.vector_size / type_to_input_components(type);
const Parameters params = {
radix.size.x,
radix.size.y,
radix.size.z,
radix.radix,
radix.vector_size,
dir,
modes[index],
in_target,
out_target,
p == 1,
radix.shared_banked,
options.type.fp16, input_fp16, options.type.output_fp16,
options.type.normalize,
};
const Pass pass = {
params,
radix.num_workgroups_x, radix.num_workgroups_y,
uv_scale_x,
next_pow2(radix.num_workgroups_x * params.workgroup_size_x),
get_program(params),
};
passes.push_back(pass);
p *= radix.radix;
i++;
}
// After the first transform direction, inject either a real-to-complex or a complex-to-real resolve pass.
// This way, we avoid needing special-purpose transforms for every FFT variant.
if (index == 0 && (type == ComplexToReal || type == RealToComplex))
{
bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
bool last_pass = radices[1].empty();
Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
Target in_target = passes.empty() ? input_target : SSBO;
Target out_target = last_pass ? output_target : SSBO;
Mode mode = type == ComplexToReal ? ResolveComplexToReal : ResolveRealToComplex;
unsigned uv_scale_x = 1;
auto base_opts = options;
base_opts.type.input_fp16 = input_fp16;
auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, 2, mode, in_target, out_target, base_opts);
auto res = build_resolve_radix(Nx, Ny, { opts.workgroup_size_x, opts.workgroup_size_y, 1 });
const Parameters params = {
res.size.x,
res.size.y,
res.size.z,
res.radix,
res.vector_size,
dir,
mode,
in_target,
out_target,
true,
false,
base_opts.type.fp16, base_opts.type.input_fp16, base_opts.type.output_fp16,
base_opts.type.normalize,
};
const Pass pass = {
params,
Nx / res.size.x,
Ny / res.size.y,
uv_scale_x,
next_pow2(Nx),
get_program(params),
};
passes.push_back(pass);
}
index++;
}
}
string FFT::load_shader_string(const char *path)
{
ifstream file(path);
if (!file.good())
{
throw runtime_error("Failed to load shader file from disk.\n");
}
stringstream buf;
buf << file.rdbuf();
return buf.str();
}
void FFT::store_shader_string(const char *path, const string &source)
{
ofstream file(path);
file.write(source.data(), source.size());
}
unique_ptr<Program> FFT::build_program(const Parameters &params)
{
string str;
str.reserve(16 * 1024);
#if 0
context->log("Building program:\n");
context->log(
" WG_X: %u\n"
" WG_Y: %u\n"
" WG_Z: %u\n"
" P1: %u\n"
" Radix: %u\n"
" Dir: %d\n"
" Mode: %u\n"
" InTarget: %u\n"
" OutTarget: %u\n"
" FP16: %u\n"
" InFP16: %u\n"
" OutFP16: %u\n"
" Norm: %u\n",
params.workgroup_size_x,
params.workgroup_size_y,
params.workgroup_size_z,
params.p1,
params.radix,
params.direction,
params.mode,
params.input_target,
params.output_target,
params.fft_fp16,
params.input_fp16,
params.output_fp16,
params.fft_normalize);
#endif
if (params.p1)
{
str += "#define FFT_P1\n";
}
if (params.fft_fp16)
{
str += "#define FFT_FP16\n";
}
if (params.input_fp16)
{
str += "#define FFT_INPUT_FP16\n";
}
if (params.output_fp16)
{
str += "#define FFT_OUTPUT_FP16\n";
}
if (params.fft_normalize)
{
str += "#define FFT_NORMALIZE\n";
}
if (params.direction == InverseConvolve)
{
str += "#define FFT_CONVOLVE\n";
}
str += params.shared_banked ? "#define FFT_SHARED_BANKED 1\n" : "#define FFT_SHARED_BANKED 0\n";
str += params.direction == Forward ? "#define FFT_FORWARD\n" : "#define FFT_INVERSE\n";
str += string("#define FFT_RADIX ") + to_string(params.radix) + "\n";
unsigned vector_size = params.vector_size;
switch (params.mode)
{
case VerticalDual:
str += "#define FFT_DUAL\n";
str += "#define FFT_VERT\n";
break;
case Vertical:
str += "#define FFT_VERT\n";
break;
case HorizontalDual:
str += "#define FFT_DUAL\n";
str += "#define FFT_HORIZ\n";
break;
case Horizontal:
str += "#define FFT_HORIZ\n";
break;
case ResolveRealToComplex:
str += "#define FFT_RESOLVE_REAL_TO_COMPLEX\n";
str += "#define FFT_HORIZ\n";
vector_size = 2;
break;
case ResolveComplexToReal:
str += "#define FFT_RESOLVE_COMPLEX_TO_REAL\n";
str += "#define FFT_HORIZ\n";
vector_size = 2;
break;
}
switch (params.input_target)
{
case ImageReal:
str += "#define FFT_INPUT_REAL\n";
// Fallthrough
case Image:
str += "#define FFT_INPUT_TEXTURE\n";
break;
default:
break;
}
switch (params.output_target)
{
case ImageReal:
str += "#define FFT_OUTPUT_REAL\n";
// Fallthrough
case Image:
str += "#define FFT_OUTPUT_IMAGE\n";
break;
default:
break;
}
switch (vector_size)
{
case 2:
str += "#define FFT_VEC2\n";
break;
case 4:
str += "#define FFT_VEC4\n";
break;
case 8:
str += "#define FFT_VEC8\n";
break;
}
str += string("layout(local_size_x = ") +
to_string(params.workgroup_size_x) +
", local_size_y = " +
to_string(params.workgroup_size_y) +
", local_size_z = " +
to_string(params.workgroup_size_z) +
") in;\n";
str += load_shader_string("glfft/glsl/fft_common.comp");
switch (params.radix)
{
case 4:
str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix4.comp");
break;
case 8:
str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix8.comp");
break;
case 16:
str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix4.comp");
str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_shared.comp");
str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix16.comp");
break;
case 64:
str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix8.comp");
str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_shared.comp");
str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix64.comp");
break;
}
str += load_shader_string("glfft/glsl/fft_main.comp");
auto prog = context->compile_compute_shader(str.c_str());
if (!prog)
{
puts(str.c_str());
}
#if 0
char shader_path[1024];
snprintf(shader_path, sizeof(shader_path), "glfft_shader_radix%u_first%u_mode%u_in_target%u_out_target%u.comp.src",
params.radix, params.p1, params.mode, unsigned(params.input_target), unsigned(params.output_target));
store_shader_string(shader_path, str);
#endif
return prog;
}
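// Returns the average wall-clock time per process() call in seconds, measured over up to
// `iterations` submissions of `dispatches_per_iteration` dispatches each, after
// `warmup_iterations` untimed warmup dispatches, stopping early once `max_time` elapses.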
double FFT::bench(Context *context, Resource *output, Resource *input,
unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration, double max_time)
{
context->wait_idle();
auto *cmd = context->request_command_buffer();
for (unsigned i = 0; i < warmup_iterations; i++)
{
process(cmd, output, input);
}
context->submit_command_buffer(cmd);
context->wait_idle();
unsigned runs = 0;
double start_time = context->get_time();
double total_time = 0.0;
for (unsigned i = 0; i < iterations && (((context->get_time() - start_time) < max_time) || i == 0); i++)
{
#ifdef GLFFT_CLI_ASYNC
check_async_cancel();
#endif
auto *cmd = context->request_command_buffer();
double iteration_start = context->get_time();
for (unsigned d = 0; d < dispatches_per_iteration; d++)
{
process(cmd, output, input);
cmd->barrier();
runs++;
}
context->submit_command_buffer(cmd);
context->wait_idle();
double iteration_end = context->get_time();
total_time += iteration_end - iteration_start;
}
return total_time / runs;
}
void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resource *input_aux)
{
if (passes.empty())
{
return;
}
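// Ping-pong between two resources: buffers[0] is read and buffers[1] written each pass.
// With an odd pass count the first write goes straight to the final destination
// (temp_buffer_image when the final target is an image); with an even count it goes
// to temp_buffer, so the last pass always lands in the real output.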
Resource *buffers[2] = {
input,
passes.size() & 1 ?
(passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output) :
temp_buffer.get(),
};
if (input_aux != 0)
{
if (passes.front().parameters.input_target != SSBO)
{
cmd->bind_texture(BindingTexture1, static_cast<Texture*>(input_aux));
cmd->bind_sampler(BindingTexture1, texture.samplers[1]);
}
else
{
if (ssbo.input_aux.size != 0)
{
cmd->bind_storage_buffer_range(BindingSSBOAux,
ssbo.input_aux.offset, ssbo.input_aux.size, static_cast<Buffer*>(input_aux));
}
else
{
cmd->bind_storage_buffer(BindingSSBOAux, static_cast<Buffer*>(input_aux));
}
}
}
Program *current_program = nullptr;
unsigned p = 1;
unsigned pass_index = 0;
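// Push-constant/UBO payload sent to every pass; the two padding words keep the
// offset/scale vectors 16-byte aligned, presumably to match the shader's std140 layout.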
struct FFTConstantData
{
uint32_t p;
uint32_t stride;
uint32_t padding[2];
float offset_x, offset_y;
float scale_x, scale_y;
};
for (auto &pass : passes)
{
if (pass.program != current_program)
{
cmd->bind_program(pass.program);
current_program = pass.program;
}
if (pass.parameters.p1)
{
p = 1;
}
FFTConstantData constant_data;
constant_data.p = p;
constant_data.stride = pass.stride;
p *= pass.parameters.radix;
if (pass.parameters.input_target != SSBO)
{
cmd->bind_texture(BindingTexture0, static_cast<Texture*>(buffers[0]));
cmd->bind_sampler(BindingTexture0, texture.samplers[0]);
// If one compute thread reads multiple texels in X dimension, scale this accordingly.
float scale_x = texture.scale_x * pass.uv_scale_x;
constant_data.offset_x = texture.offset_x;
constant_data.offset_y = texture.offset_y;
constant_data.scale_x = scale_x;
constant_data.scale_y = texture.scale_y;
}
else
{
if (buffers[0] == input && ssbo.input.size != 0)
{
cmd->bind_storage_buffer_range(BindingSSBOIn,
ssbo.input.offset, ssbo.input.size, static_cast<Buffer*>(buffers[0]));
}
else if (buffers[0] == output && ssbo.output.size != 0)
{
cmd->bind_storage_buffer_range(BindingSSBOIn,
ssbo.output.offset, ssbo.output.size, static_cast<Buffer*>(buffers[0]));
}
else
{
cmd->bind_storage_buffer(BindingSSBOIn, static_cast<Buffer*>(buffers[0]));
}
}
if (pass.parameters.output_target != SSBO)
{
Format format = FormatUnknown;
// TODO: Make this more flexible, would require shader variants per-format though.
if (pass.parameters.output_target == ImageReal)
{
format = FormatR32Float;
}
else
{
switch (pass.parameters.mode)
{
case VerticalDual:
case HorizontalDual:
format = FormatR16G16B16A16Float;
break;
case Vertical:
case Horizontal:
case ResolveRealToComplex:
format = FormatR32Uint;
break;
default:
break;
}
}
cmd->bind_storage_texture(BindingImage, static_cast<Texture*>(output), format);
}
else
{
if (buffers[1] == output && ssbo.output.size != 0)
{
cmd->bind_storage_buffer_range(BindingSSBOOut,
ssbo.output.offset, ssbo.output.size, static_cast<Buffer*>(buffers[1]));
}
else
{
cmd->bind_storage_buffer(BindingSSBOOut, static_cast<Buffer*>(buffers[1]));
}
}
cmd->push_constant_data(BindingUBO, &constant_data, sizeof(constant_data));
cmd->dispatch(pass.workgroups_x, pass.workgroups_y, 1);
// For the last pass we don't know how our resource will be used afterwards,
// so leave barrier decisions up to the API user.
if (pass_index + 1 < passes.size())
{
cmd->barrier(static_cast<Buffer*>(buffers[1]));
}
if (pass_index == 0)
{
buffers[0] = passes.size() & 1 ?
temp_buffer.get() :
(passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output);
}
swap(buffers[0], buffers[1]);
pass_index++;
}
}