glava-ridged/glfft/glfft.cpp

/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
 *
 * Permission is hereby granted, free of charge,
 * to any person obtaining a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "glfft.hpp"
#include <algorithm>
#include <stdexcept>
#include <fstream>
#include <sstream>
#include <numeric>
#include <assert.h>
#include <cmath>

#ifdef GLFFT_CLI_ASYNC
#include "glfft_cli.hpp"
#endif

/* GLava addition: should be defined by meson */
#if defined(GLAVA_STANDALONE) || !defined(SHADER_INSTALL_PATH)
#undef SHADER_INSTALL_PATH
#define SHADER_INSTALL_PATH "../shaders/glava"
#endif

using namespace std;
using namespace GLFFT;

enum Bindings
{
    BindingSSBOIn = 0,
    BindingSSBOOut = 1,
    BindingSSBOAux = 2,
    BindingUBO = 3,
    BindingTexture0 = 4,
    BindingTexture1 = 5,
    BindingImage = 6
};

struct WorkGroupSize
{
    unsigned x, y, z;
};

struct Radix
{
    WorkGroupSize size;
    unsigned num_workgroups_x;
    unsigned num_workgroups_y;
    unsigned radix;
    unsigned vector_size;
    bool shared_banked;
};

static unsigned next_pow2(unsigned v)
{
    v--;
    v |= v >> 16;
    v |= v >> 8;
    v |= v >> 4;
    v |= v >> 2;
    v |= v >> 1;
    return v + 1;
}

static void reduce(unsigned &wg_size, unsigned &divisor)
{
    if (divisor > 1 && wg_size >= divisor)
    {
        wg_size /= divisor;
        divisor = 1;
    }
    else if (divisor > 1 && wg_size < divisor)
    {
        divisor /= wg_size;
        wg_size = 1;
    }
}

static unsigned radix_to_wg_z(unsigned radix)
{
    switch (radix)
    {
        case 16:
            return 4;

        case 64:
            return 8;

        default:
            return 1;
    }
}

static Radix build_radix(unsigned Nx, unsigned Ny,
        Mode mode, unsigned vector_size, bool shared_banked, unsigned radix,
        WorkGroupSize size,
        bool pow2_stride)
{
    unsigned wg_x = 0, wg_y = 0;

    if (Ny == 1 && size.y > 1)
    {
        throw logic_error("WorkGroupSize.y must be 1, when Ny == 1.\n");
    }

    // To avoid too many threads per workgroup due to workgroup_size_z,
    // try to divide workgroup_size_y, then workgroup_size_x.
    // TODO: Make a better constraint solver which takes into account cache line sizes,
    // and image swizzling patterns, etc ... Not that critical though, since wisdom interface
    // will find the optimal options despite this.
    unsigned divisor = size.z;
    reduce(size.y, divisor);
    reduce(size.x, divisor);

    switch (mode)
    {
        case Vertical:
            // If we have pow2_stride, we need to transform 2^n + 1 elements horizontally,
            // so just add a single workgroup in X.
            // We pad by going up to pow2 stride anyways.
            // We will transform some garbage,
            // but it's better than transforming close to double the amount.
            wg_x = (2 * Nx) / (vector_size * size.x) + pow2_stride;
            wg_y = Ny / (size.y * radix);
            break;

        case VerticalDual:
            vector_size = max(vector_size, 4u);
            wg_x = (4 * Nx) / (vector_size * size.x);
            wg_y = Ny / (size.y * radix);
            break;

        case Horizontal:
            wg_x = (2 * Nx) / (vector_size * radix * size.x);
            wg_y = Ny / size.y;
            break;

        case HorizontalDual:
            vector_size = max(vector_size, 4u);
            wg_x = (4 * Nx) / (vector_size * radix * size.x);
            wg_y = Ny / size.y;
            break;

        default:
            assert(0);
    }

    return { size, wg_x, wg_y, radix, vector_size, shared_banked };
}

// Resolve radices are simpler, and don't yet support different vector sizes, etc.
static Radix build_resolve_radix(unsigned Nx, unsigned Ny, WorkGroupSize size)
{
    return { size, Nx / size.x, Ny / size.y, 2, 2, false };
}

// Smaller FFT with larger workgroups are not always possible to create.
static bool is_radix_valid(unsigned Nx, unsigned Ny,
        Mode mode, unsigned vector_size, unsigned radix,
        WorkGroupSize size,
        bool pow2_stride)
{
    auto res = build_radix(Nx, Ny,
            mode, vector_size, false, radix,
            size,
            pow2_stride);

    return res.num_workgroups_x > 0 && res.num_workgroups_y > 0;
}

static double find_cost(unsigned Nx, unsigned Ny, Mode mode, unsigned radix,
        const FFTOptions &options, const FFTWisdom &wisdom)
{
    auto opt = wisdom.find_optimal_options(Nx, Ny, radix, mode, SSBO, SSBO, options.type);

    // Return a very rough estimate if we cannot find cost.
    // The cost functions generated here are expected to be huge,
    // always much larger than true cost functions.
    // The purpose of this is to give a strong bias towards radices we have wisdom for.
    // We also give a bias towards larger radices, since they are generally more BW efficient.
    return opt ? opt->first.cost : Nx * Ny * (log2(float(radix)) + 2.0f);
}

struct CostPropagate
{
    CostPropagate() = default;
    CostPropagate(double cost, vector<unsigned> radices)
        : cost(cost), radices(move(radices)) {}

    void merge_if_better(const CostPropagate &a, const CostPropagate &b)
    {
        double new_cost = a.cost + b.cost;

        if ((cost == 0.0 || new_cost < cost) && a.cost != 0.0 && b.cost != 0.0)
        {
            cost = new_cost;
            radices = a.radices;
            radices.insert(end(radices), begin(b.radices), end(b.radices));
        }
    }

    double cost = 0.0;
    vector<unsigned> radices;
};

static vector<Radix> split_radices(unsigned Nx, unsigned Ny, Mode mode, Target input_target, Target output_target,
        const FFTOptions &options,
        bool pow2_stride, const FFTWisdom &wisdom, double &accumulate_cost)
{
    unsigned N;
    switch (mode)
    {
        case Vertical:
        case VerticalDual:
            N = Ny;
            break;

        case Horizontal:
        case HorizontalDual:
            N = Nx;
            break;

        default:
            return {};
    }

    // N == 1 is for things like Nx1 transforms where we don't do any vertical transforms.
    if (N == 1)
    {
        return {};
    }

    // Treat cost 0.0 as invalid.
    double cost_table[8] = {0.0};
    CostPropagate cost_propagate[32];

    // Fill table with fastest known ways to do radix 4, radix 8, radix 16, and 64.
    // We'll then find the optimal subdivision which has the lowest additive cost.
    cost_table[2] = find_cost(Nx, Ny, mode,  4, options, wisdom);
    cost_table[3] = find_cost(Nx, Ny, mode,  8, options, wisdom);
    cost_table[4] = find_cost(Nx, Ny, mode, 16, options, wisdom);
    cost_table[6] = find_cost(Nx, Ny, mode, 64, options, wisdom);

    auto is_valid = [&](unsigned radix) -> bool {
        unsigned workgroup_size_z = radix_to_wg_z(radix);
        auto &opt = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode, SSBO, SSBO, options);

        // We don't want pow2_stride to round up a very inefficient work group and make the is_valid test pass.
        return is_radix_valid(Nx, Ny,
                mode, opt.vector_size, radix,
                { opt.workgroup_size_x, opt.workgroup_size_y, workgroup_size_z },
                false);
    };

    // If our work-space is too small to allow certain radices, we disable them from consideration here.
    for (unsigned i = 2; i <= 6; i++)
    {
        // Don't check the composite radix.
        if (i == 5)
        {
            continue;
        }

        if (is_valid(1 << i))
        {
            cost_propagate[i] = CostPropagate(cost_table[i], { 1u << i });
        }
    }

    // Now start bubble this up all the way to N, starting from radix 16.
    for (unsigned i = 4; (1u << i) <= N; i++)
    {
        auto &target = cost_propagate[i];

        for (unsigned r = 2; i - r >= r; r++)
        {
            target.merge_if_better(cost_propagate[r], cost_propagate[i - r]);
        }

        if ((1u << i) == N && target.cost == 0.0)
        {
            throw logic_error("There is no possible subdivision ...\n");
        }
    }

    // Ensure that the radix splits are sensible.
    // A radix-N non p-1 transform mandates that p factor is at least N.
    // Sort the splits so that larger radices come first.
    // For composite radices like 16 and 64, they are built with 4x4 and 8x8, so we only
    // need p factors for 4 and 8 for those cases.
    // The cost function doesn't depend in which order we split the radices.
    auto &cost = cost_propagate[unsigned(log2(float(N)))];
    auto radices = move(cost.radices);

    sort(begin(radices), end(radices), greater<unsigned>());

    if (accumulate(begin(radices), end(radices), 1u, multiplies<unsigned>()) != N)
    {
        throw logic_error("Radix splits are invalid.");
    }

    vector<Radix> radices_out;
    radices_out.reserve(radices.size());

    // Fill in the structs with all information.
    for (auto radix : radices)
    {
        bool first = radices_out.empty();
        bool last = radices_out.size() + 1 == radices.size();

        // Use known performance options as a fallback.
        // We used SSBO -> SSBO cost functions to find the optimal radix splits,
        // but replace first and last options with Image -> SSBO / SSBO -> Image cost functions if appropriate.
        auto &orig_opt = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode, SSBO, SSBO, options);
        auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, radix, mode,
                first ? input_target : SSBO,
                last ? output_target : SSBO,
                { orig_opt, options.type });

        radices_out.push_back(build_radix(Nx, Ny,
                    mode, opts.vector_size, opts.shared_banked, radix,
                    { opts.workgroup_size_x, opts.workgroup_size_y, radix_to_wg_z(radix) },
                    pow2_stride));
    }

    accumulate_cost += cost.cost;
    return radices_out;
}

Program* ProgramCache::find_program(const Parameters &parameters) const
{
    auto itr = programs.find(parameters);
    if (itr != end(programs))
    {
        return itr->second.get();
    }
    else
    {
        return nullptr;
    }
}

void ProgramCache::insert_program(const Parameters &parameters, std::unique_ptr<Program> program)
{
    programs[parameters] = move(program);
}

Program* FFT::get_program(const Parameters &params)
{
    Program *prog = cache->find_program(params);
    if (!prog)
    {
        auto newprog = build_program(params);
        if (!newprog)
        {
            throw runtime_error("Failed to compile shader.\n");
        }
        prog = newprog.get();
        cache->insert_program(params, move(newprog));
    }
    return prog;
}

static inline unsigned mode_to_input_components(Mode mode)
{
    switch (mode)
    {
        case HorizontalDual:
        case VerticalDual:
            return 4;

        case Horizontal:
        case Vertical:
        case ResolveComplexToReal:
            return 2;

        case ResolveRealToComplex:
            return 1;

        default:
            return 0;
    }
}

FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
        unsigned radix, unsigned p,
        Mode mode, Target input_target, Target output_target,
        std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options)
    : context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny)
{
    set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);

    if (!Nx || !Ny || (Nx & (Nx - 1)) || (Ny & (Ny - 1)))
    {
        throw logic_error("FFT size is not POT.");
    }

    if (p != 1 && input_target != SSBO)
    {
        throw logic_error("P != 1 only supported with SSBO as input.");
    }

    if (p < radix && output_target != SSBO)
    {
        throw logic_error("P < radix only supported with SSBO as output.");
    }

    // We don't really care about transform direction since it's just a matter of sign-flipping twiddles,
    // but we have to obey some fundamental assumptions of resolve passes.
    Direction direction = mode == ResolveComplexToReal ? Inverse : Forward;

    Radix res;
    if (mode == ResolveRealToComplex || mode == ResolveComplexToReal)
    {
        res = build_resolve_radix(Nx, Ny, { options.performance.workgroup_size_x, options.performance.workgroup_size_y, 1 });
    }
    else
    {
        res = build_radix(Nx, Ny,
                mode, options.performance.vector_size, options.performance.shared_banked, radix,
                { options.performance.workgroup_size_x, options.performance.workgroup_size_y, radix_to_wg_z(radix) },
                false);
    }

    const Parameters params = {
        res.size.x,
        res.size.y,
        res.size.z,
        res.radix,
        res.vector_size,
        direction,
        mode,
        input_target,
        output_target,
        p == 1,
        res.shared_banked,
        options.type.fp16, options.type.input_fp16, options.type.output_fp16,
        options.type.normalize,
    };

    if (res.num_workgroups_x == 0 || res.num_workgroups_y == 0)
    {
        throw logic_error("Invalid workgroup sizes for this radix.");
    }

    unsigned uv_scale_x = res.vector_size / mode_to_input_components(mode);
    const Pass pass = {
        params,
        res.num_workgroups_x, res.num_workgroups_y,
        uv_scale_x,
        next_pow2(res.num_workgroups_x * params.workgroup_size_x),
        get_program(params),
    };

    passes.push_back(pass);
}

static inline void print_radix_splits(Context *context, const vector<Radix> radices[2])
{
    context->log("Transform #1\n");
    for (auto &radix : radices[0])
    {
        context->log("  Size: (%u, %u, %u)\n",
                radix.size.x, radix.size.y, radix.size.z);
        context->log("  Dispatch: (%u, %u)\n",
                radix.num_workgroups_x, radix.num_workgroups_y);
        context->log("  Radix: %u\n",
                radix.radix);
        context->log("  VectorSize: %u\n\n",
                radix.vector_size);
    }

    context->log("Transform #2\n");
    for (auto &radix : radices[1])
    {
        context->log("  Size: (%u, %u, %u)\n",
                radix.size.x, radix.size.y, radix.size.z);
        context->log("  Dispatch: (%u, %u)\n",
                radix.num_workgroups_x, radix.num_workgroups_y);
        context->log("  Radix: %u\n",
                radix.radix);
        context->log("  VectorSize: %u\n\n",
                radix.vector_size);
    }
}

static inline unsigned type_to_input_components(Type type)
{
    switch (type)
    {
        case ComplexToComplex:
        case ComplexToReal:
            return 2;

        case RealToComplex:
            return 1;

        case ComplexToComplexDual:
            return 4;

        default:
            return 0;
    }
}

FFT::FFT(Context *context, unsigned Nx, unsigned Ny,
        Type type, Direction direction, Target input_target, Target output_target,
        std::shared_ptr<ProgramCache> program_cache, const FFTOptions &options, const FFTWisdom &wisdom)
    : context(context), cache(move(program_cache)), size_x(Nx), size_y(Ny)
{
    set_texture_offset_scale(0.5f / Nx, 0.5f / Ny, 1.0f / Nx, 1.0f / Ny);

    size_t temp_buffer_size = Nx * Ny * sizeof(float) * (type == ComplexToComplexDual ? 4 : 2);
    temp_buffer_size >>= options.type.output_fp16;

    temp_buffer = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
    if (output_target != SSBO)
    {
        temp_buffer_image = context->create_buffer(nullptr, temp_buffer_size, AccessStreamCopy);
    }

    bool expand = false;
    if (type == ComplexToReal || type == RealToComplex)
    {
        // If we're doing C2R or R2C, we'll need double the scratch memory,
        // so make sure we're dividing Nx *after* allocating.
        Nx /= 2;
        expand = true;
    }

    // Sanity checks.
    if (!Nx || !Ny || (Nx & (Nx - 1)) || (Ny & (Ny - 1)))
    {
        throw logic_error("FFT size is not POT.");
    }

    if (type == ComplexToReal && direction == Forward)
    {
        throw logic_error("ComplexToReal transforms requires inverse transform.");
    }

    if (type == RealToComplex && direction != Forward)
    {
        throw logic_error("RealToComplex transforms requires forward transform.");
    }

    if (type == RealToComplex && input_target == Image)
    {
        throw logic_error("Input real-to-complex must use ImageReal target.");
    }

    if (type == ComplexToReal && output_target == Image)
    {
        throw logic_error("Output complex-to-real must use ImageReal target.");
    }

    vector<Radix> radices[2];
    Mode modes[2];
    Target targets[4];

    switch (direction)
    {
        case Forward:
            modes[0] = type == ComplexToComplexDual ? HorizontalDual : Horizontal;
            modes[1] = type == ComplexToComplexDual ? VerticalDual : Vertical;

            targets[0] = input_target;
            targets[1] = Ny > 1 ? SSBO : output_target;
            targets[2] = targets[1];
            targets[3] = output_target;

            radices[0] = split_radices(Nx, Ny, modes[0], targets[0], targets[1], options, false, wisdom, cost);
            radices[1] = split_radices(Nx, Ny, modes[1], targets[2], targets[3], options, expand, wisdom, cost);
            break;

        case Inverse:
        case InverseConvolve:
            modes[0] = type == ComplexToComplexDual ? VerticalDual : Vertical;
            modes[1] = type == ComplexToComplexDual ? HorizontalDual : Horizontal;

            targets[0] = input_target;
            targets[1] = Ny > 1 ? SSBO : input_target;
            targets[2] = targets[1];
            targets[3] = output_target;

            radices[0] = split_radices(Nx, Ny, modes[0], targets[0], targets[1], options, expand, wisdom, cost);
            radices[1] = split_radices(Nx, Ny, modes[1], targets[2], targets[3], options, false, wisdom, cost);
            break;
    }

#if 0
    print_radix_splits(context, radices);
#endif

    passes.reserve(radices[0].size() + radices[1].size() + expand);

    unsigned index = 0;
    unsigned last_index = (radices[1].empty() && !expand) ? 0 : 1;

    for (auto &radix_direction : radices)
    {
        unsigned p = 1;
        unsigned i = 0;
        
        for (auto &radix : radix_direction)
        {
            // If this is the last pass and we're writing to an image, use a special shader variant.
            bool last_pass = index == last_index && i == radix_direction.size() - 1;

            bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
            Target out_target = last_pass ? output_target : SSBO;
            Target in_target = passes.empty() ? input_target : SSBO;
            Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
            unsigned uv_scale_x = radix.vector_size / type_to_input_components(type);

            const Parameters params = {
                radix.size.x,
                radix.size.y,
                radix.size.z,
                radix.radix,
                radix.vector_size,
                dir,
                modes[index],
                in_target,
                out_target,
                p == 1,
                radix.shared_banked,
                options.type.fp16, input_fp16, options.type.output_fp16,
                options.type.normalize,
            };

            const Pass pass = {
                params,
                radix.num_workgroups_x, radix.num_workgroups_y,
                uv_scale_x,
                next_pow2(radix.num_workgroups_x * params.workgroup_size_x),
                get_program(params),
            };

            passes.push_back(pass);

            p *= radix.radix;
            i++;
        }

        // After the first transform direction, inject either a real-to-complex resolve or complex-to-real resolve.
        // This way, we avoid having special purpose transforms for all FFT variants.
        if (index == 0 && (type == ComplexToReal || type == RealToComplex))
        {
            bool input_fp16 = passes.empty() ? options.type.input_fp16 : options.type.output_fp16;
            bool last_pass = radices[1].empty();
            Direction dir = direction == InverseConvolve && !passes.empty() ? Inverse : direction;
            Target in_target = passes.empty() ? input_target : SSBO;
            Target out_target = last_pass ? output_target : SSBO;
            Mode mode = type == ComplexToReal ? ResolveComplexToReal : ResolveRealToComplex;
            unsigned uv_scale_x = 1;

            auto base_opts = options;
            base_opts.type.input_fp16 = input_fp16;

            auto &opts = wisdom.find_optimal_options_or_default(Nx, Ny, 2, mode, in_target, out_target, base_opts);
            auto res = build_resolve_radix(Nx, Ny, { opts.workgroup_size_x, opts.workgroup_size_y, 1 });

            const Parameters params = {
                res.size.x,
                res.size.y,
                res.size.z,
                res.radix,
                res.vector_size,
                dir,
                mode,
                in_target,
                out_target,
                true,
                false,
                base_opts.type.fp16, base_opts.type.input_fp16, base_opts.type.output_fp16,
                base_opts.type.normalize,
            };

            const Pass pass = {
                params,
                Nx / res.size.x,
                Ny / res.size.y,
                uv_scale_x,
                next_pow2(Nx),
                get_program(params),
            };

            passes.push_back(pass);
        }

        index++;
    }
}

string FFT::load_shader_string(const char *path)
{
    ifstream file(path);
    if (!file.good())
    {
        throw runtime_error("Failed to load shader file from disk.\n");
    }
    stringstream buf;
    buf << file.rdbuf();
    return buf.str();
}

void FFT::store_shader_string(const char *path, const string &source)
{
    ofstream file(path);
    file.write(source.data(), source.size());
}

unique_ptr<Program> FFT::build_program(const Parameters &params)
{
    string str;
    str.reserve(16 * 1024);

#if 0
    context->log("Building program:\n");
    context->log(
            "   WG_X:      %u\n"
            "   WG_Y:      %u\n"
            "   WG_Z:      %u\n"
            "   P1:        %u\n"
            "   Radix:     %u\n"
            "   Dir:       %d\n"
            "   Mode:      %u\n"
            "   InTarget:  %u\n"
            "   OutTarget: %u\n"
            "   FP16:      %u\n"
            "   InFP16:    %u\n"
            "   OutFP16:   %u\n"
            "   Norm:      %u\n",
            params.workgroup_size_x,
            params.workgroup_size_y,
            params.workgroup_size_z,
            params.p1,
            params.radix,
            params.direction,
            params.mode,
            params.input_target,
            params.output_target,
            params.fft_fp16,
            params.input_fp16,
            params.output_fp16,
            params.fft_normalize);
#endif

    if (params.p1)
    {
        str += "#define FFT_P1\n";
    }

    if (params.fft_fp16)
    {
        str += "#define FFT_FP16\n";
    }

    if (params.input_fp16)
    {
        str += "#define FFT_INPUT_FP16\n";
    }

    if (params.output_fp16)
    {
        str += "#define FFT_OUTPUT_FP16\n";
    }

    if (params.fft_normalize)
    {
        str += "#define FFT_NORMALIZE\n";
    }

    if (params.direction == InverseConvolve)
    {
        str += "#define FFT_CONVOLVE\n";
    }

    str += params.shared_banked ? "#define FFT_SHARED_BANKED 1\n" : "#define FFT_SHARED_BANKED 0\n";

    str += params.direction == Forward ? "#define FFT_FORWARD\n" : "#define FFT_INVERSE\n";
    str += string("#define FFT_RADIX ") + to_string(params.radix) + "\n";
    
    unsigned vector_size = params.vector_size;
    switch (params.mode)
    {
        case VerticalDual:
            str += "#define FFT_DUAL\n";
            str += "#define FFT_VERT\n";
            break;

        case Vertical:
            str += "#define FFT_VERT\n";
            break;

        case HorizontalDual:
            str += "#define FFT_DUAL\n";
            str += "#define FFT_HORIZ\n";
            break;

        case Horizontal:
            str += "#define FFT_HORIZ\n";
            break;

        case ResolveRealToComplex:
            str += "#define FFT_RESOLVE_REAL_TO_COMPLEX\n";
            str += "#define FFT_HORIZ\n";
            vector_size = 2;
            break;

        case ResolveComplexToReal:
            str += "#define FFT_RESOLVE_COMPLEX_TO_REAL\n";
            str += "#define FFT_HORIZ\n";
            vector_size = 2;
            break;
    }

    switch (params.input_target)
    {
        case ImageReal:
            str += "#define FFT_INPUT_REAL\n";
            // Fallthrough
        case Image:
            str += "#define FFT_INPUT_TEXTURE\n";
            break;

        default:
            break;
    }

    switch (params.output_target)
    {
        case ImageReal:
            str += "#define FFT_OUTPUT_REAL\n";
            // Fallthrough
        case Image:
            str += "#define FFT_OUTPUT_IMAGE\n";
            break;

        default:
            break;
    }

    switch (vector_size)
    {
        case 2:
            str += "#define FFT_VEC2\n";
            break;

        case 4:
            str += "#define FFT_VEC4\n";
            break;

        case 8:
            str += "#define FFT_VEC8\n";
            break;
    }

    str += string("layout(local_size_x = ") +
        to_string(params.workgroup_size_x) +
        ", local_size_y = " +
        to_string(params.workgroup_size_y) +
        ", local_size_z = " +
        to_string(params.workgroup_size_z) +
        ") in;\n";
    
    str += load_shader_string("glfft/glsl/fft_common.comp");
    switch (params.radix)
    {
        case 4:
            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix4.comp");
            break;

        case 8:
            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix8.comp");
            break;

        case 16:
            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix4.comp");
            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_shared.comp");
            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix16.comp");
            break;

        case 64:
            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix8.comp");
            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_shared.comp");
            str += load_shader_string(SHADER_INSTALL_PATH "/util/fft_radix64.comp");
            break;
    }
    str += load_shader_string("glfft/glsl/fft_main.comp");

    auto prog = context->compile_compute_shader(str.c_str());
    if (!prog)
    {
        puts(str.c_str());
    }

#if 0
    char shader_path[1024];
    snprintf(shader_path, sizeof(shader_path), "glfft_shader_radix%u_first%u_mode%u_in_target%u_out_target%u.comp.src",
            params.radix, params.p1, params.mode, unsigned(params.input_target), unsigned(params.output_target));
    store_shader_string(shader_path, str);
#endif

    return prog;
}

double FFT::bench(Context *context, Resource *output, Resource *input,
        unsigned warmup_iterations, unsigned iterations, unsigned dispatches_per_iteration, double max_time)
{
    context->wait_idle();
    auto *cmd = context->request_command_buffer();
    for (unsigned i = 0; i < warmup_iterations; i++)
    {
        process(cmd, output, input);
    }
    context->submit_command_buffer(cmd);
    context->wait_idle();

    unsigned runs = 0;
    double start_time = context->get_time();
    double total_time = 0.0;

    for (unsigned i = 0; i < iterations && (((context->get_time() - start_time) < max_time) || i == 0); i++)
    {
#ifdef GLFFT_CLI_ASYNC
        check_async_cancel();
#endif

        auto *cmd = context->request_command_buffer();

        double iteration_start = context->get_time();
        for (unsigned d = 0; d < dispatches_per_iteration; d++)
        {
            process(cmd, output, input);
            cmd->barrier();
            runs++;
        }

        context->submit_command_buffer(cmd);
        context->wait_idle();

        double iteration_end = context->get_time();
        total_time += iteration_end - iteration_start;
    }

    return total_time / runs;
}

void FFT::process(CommandBuffer *cmd, Resource *output, Resource *input, Resource *input_aux)
{
    if (passes.empty())
    {
        return;
    }

    Resource *buffers[2] = {
        input,
        passes.size() & 1 ?
            (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output) :
            temp_buffer.get(),
    };

    if (input_aux != 0)
    {
        if (passes.front().parameters.input_target != SSBO)
        {
            cmd->bind_texture(BindingTexture1, static_cast<Texture*>(input_aux));
            cmd->bind_sampler(BindingTexture1, texture.samplers[1]);
        }
        else
        {
            if (ssbo.input_aux.size != 0)
            {
                cmd->bind_storage_buffer_range(BindingSSBOAux,
                        ssbo.input_aux.offset, ssbo.input_aux.size, static_cast<Buffer*>(input_aux));
            }
            else
            {
                cmd->bind_storage_buffer(BindingSSBOAux, static_cast<Buffer*>(input_aux));
            }
        }
    }

    Program *current_program = nullptr;
    unsigned p = 1;
    unsigned pass_index = 0;

    struct FFTConstantData
    {
        uint32_t p;
        uint32_t stride;
        uint32_t padding[2];
        float offset_x, offset_y;
        float scale_x, scale_y;
    };

    for (auto &pass : passes)
    {
        if (pass.program != current_program)
        {
            cmd->bind_program(pass.program);
            current_program = pass.program;
        }

        if (pass.parameters.p1)
        {
            p = 1;
        }

        FFTConstantData constant_data;
        constant_data.p = p;
        constant_data.stride = pass.stride;
        p *= pass.parameters.radix;

        if (pass.parameters.input_target != SSBO)
        {
            cmd->bind_texture(BindingTexture0, static_cast<Texture*>(buffers[0]));
            cmd->bind_sampler(BindingTexture0, texture.samplers[0]);

            // If one compute thread reads multiple texels in X dimension, scale this accordingly.
            float scale_x = texture.scale_x * pass.uv_scale_x;

            constant_data.offset_x = texture.offset_x;
            constant_data.offset_y = texture.offset_y;
            constant_data.scale_x = scale_x;
            constant_data.scale_y = texture.scale_y;
        }
        else
        {
            if (buffers[0] == input && ssbo.input.size != 0)
            {
                cmd->bind_storage_buffer_range(BindingSSBOIn,
                        ssbo.input.offset, ssbo.input.size, static_cast<Buffer*>(buffers[0]));
            }
            else if (buffers[0] == output && ssbo.output.size != 0)
            {
                cmd->bind_storage_buffer_range(BindingSSBOIn,
                        ssbo.output.offset, ssbo.output.size, static_cast<Buffer*>(buffers[0]));
            }
            else
            {
                cmd->bind_storage_buffer(BindingSSBOIn, static_cast<Buffer*>(buffers[0]));
            }
        }

        if (pass.parameters.output_target != SSBO)
        {
            Format format = FormatUnknown;

            // TODO: Make this more flexible, would require shader variants per-format though.
            if (pass.parameters.output_target == ImageReal)
            {
                format = FormatR32Float;
            }
            else
            {
                switch (pass.parameters.mode)
                {
                    case VerticalDual:
                    case HorizontalDual:
                        format = FormatR16G16B16A16Float;
                        break;

                    case Vertical:
                    case Horizontal:
                    case ResolveRealToComplex:
                        format = FormatR32Uint;
                        break;

                    default:
                        break;
                }
            }
            cmd->bind_storage_texture(BindingImage, static_cast<Texture*>(output), format);
        }
        else
        {
            if (buffers[1] == output && ssbo.output.size != 0)
            {
                cmd->bind_storage_buffer_range(BindingSSBOOut,
                        ssbo.output.offset, ssbo.output.size, static_cast<Buffer*>(buffers[1]));
            }
            else
            {
                cmd->bind_storage_buffer(BindingSSBOOut, static_cast<Buffer*>(buffers[1]));
            }
        }

        cmd->push_constant_data(BindingUBO, &constant_data, sizeof(constant_data));
        cmd->dispatch(pass.workgroups_x, pass.workgroups_y, 1);

        // For last pass, we don't know how our resource will be used afterwards,
        // so let barrier decisions be up to the API user.
        if (pass_index + 1 < passes.size())
        {
            cmd->barrier(static_cast<Buffer*>(buffers[1]));
        }

        if (pass_index == 0)
        {
            buffers[0] = passes.size() & 1 ?
                temp_buffer.get() :
                (passes.back().parameters.output_target != SSBO ? temp_buffer_image.get() : output);
        }

        swap(buffers[0], buffers[1]);
        pass_index++;
    }
}