glava-ridged/glfft/glfft_wisdom.cpp

/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
*
* Permission is hereby granted, free of charge,
* to any person obtaining a copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "glfft_wisdom.hpp"
#include "glfft_interface.hpp"
#include "glfft.hpp"
#include <utility>
/* GLAVA NOTICE: automatic wisdom serialization support may be added at a later date */
#ifdef GLFFT_SERIALIZATION
#include "rapidjson/reader.h"
#include "rapidjson/prettywriter.h"
#include "rapidjson/stringbuffer.h"
#include "rapidjson/document.h"
using namespace rapidjson;
#endif
#ifdef GLFFT_CLI_ASYNC
#include "glfft_cli.hpp"
#endif
using namespace std;
using namespace GLFFT;

FFTStaticWisdom FFTWisdom::get_static_wisdom_from_renderer(Context *context)
{
FFTStaticWisdom res;
const char *renderer = context->get_renderer_string();
unsigned threads = context->get_max_work_group_threads();
if (strstr(renderer, "GeForce") || strstr(renderer, "Quadro"))
{
context->log("Detected GeForce/Quadro GPU.\n");
res.min_workgroup_size = 32; // Warp threads.
res.min_workgroup_size_shared = 32;
res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good.
res.min_vector_size = 2;
res.max_vector_size = 2;
res.shared_banked = FFTStaticWisdom::True;
}
else if (strstr(renderer, "Radeon"))
{
context->log("Detected Radeon GPU.\n");
res.min_workgroup_size = 64; // Wavefront threads (GCN).
res.min_workgroup_size_shared = 128;
res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good.
// TODO: Find out whether we can always restrict this to 2 or 4.
res.min_vector_size = 2;
res.max_vector_size = 4;
res.shared_banked = FFTStaticWisdom::True;
}
else if (strstr(renderer, "Mali"))
{
context->log("Detected Mali GPU.\n");
res.min_workgroup_size = 4;
res.min_workgroup_size_shared = 4;
res.max_workgroup_size = 64; // Going beyond 64 threads per WG is not a good idea.
res.min_vector_size = 4;
res.max_vector_size = 4;
res.shared_banked = FFTStaticWisdom::False;
}
// TODO: Add more GPUs.
return res;
}
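/* Usage sketch for the detection above. set_static_wisdom() is assumed to be
 * the setter declared in glfft_wisdom.hpp; the rest is this file's own API:
 *
 *   FFTWisdom wisdom;
 *   wisdom.set_static_wisdom(FFTWisdom::get_static_wisdom_from_renderer(context));
 *   // study() now prunes its search space with the detected per-vendor limits.
 */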
pair<double, FFTOptions::Performance> FFTWisdom::learn_optimal_options(
Context *context, unsigned Nx, unsigned Ny, unsigned radix,
Mode mode, Target input_target, Target output_target,
const FFTOptions::Type &type)
{
WisdomPass pass = {
{
Nx, Ny, radix, mode, input_target, output_target,
type,
},
0.0,
};
auto itr = library.find(pass);
if (itr != end(library))
{
return make_pair(itr->first.cost, itr->second);
}
else
{
auto result = study(context, pass, type);
pass.cost = result.first;
library[pass] = result.second;
return result;
}
}
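/* Note: results are memoized per WisdomPass key (sizes, radix, mode, targets
 * and type), so repeated calls for the same pass return the cached cost and
 * options without re-benchmarking; only the first call pays for study(). */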
void FFTWisdom::learn_optimal_options_exhaustive(Context *context,
unsigned Nx, unsigned Ny,
Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type)
{
bool learn_resolve = type == ComplexToReal || type == RealToComplex;
Mode vertical_mode = type == ComplexToComplexDual ? VerticalDual : Vertical;
Mode horizontal_mode = type == ComplexToComplexDual ? HorizontalDual : Horizontal;
// Create wisdom for horizontal and vertical transforms.
static const unsigned radices[] = { 4, 8, 16, 64 };
for (auto radix : radices)
{
try
{
// Learn plain SSBO -> SSBO transforms first. If we're doing
// SSBO -> Image or Image -> SSBO, wisdom for those variants is created below.
if (Ny > 1)
{
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, SSBO, fft_type);
}
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, SSBO, fft_type);
// Learn the first/last pass transforms. These can matter a fair amount,
// since texture access favors block interleaving and larger WG_Y sizes.
if (input_target != SSBO)
{
if (Ny > 1)
{
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, input_target, SSBO, fft_type);
}
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, input_target, SSBO, fft_type);
}
if (output_target != SSBO)
{
if (Ny > 1)
{
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, output_target, fft_type);
}
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, output_target, fft_type);
}
}
#ifdef GLFFT_CLI_ASYNC
catch (const AsyncCancellation &)
{
throw;
}
#endif
catch (...)
{
// If our default options cannot successfully create the radix pass (i.e. it throws),
// just ignore it for the purposes of creating wisdom.
}
}
auto resolve_type = fft_type;
resolve_type.input_fp16 = resolve_type.output_fp16;
Mode resolve_mode = type == ComplexToReal ? ResolveComplexToReal : ResolveRealToComplex;
Target resolve_input_target = SSBO;
// For a C2R Nx1 transform, the resolve is the first pass, so it uses the caller's input type and target.
if (type == ComplexToReal && Ny == 1)
{
resolve_type = fft_type;
resolve_input_target = input_target;
}
// If we need to do a resolve pass, train this case as well.
if (learn_resolve)
{
try
{
// If Ny == 1 and we're doing RealToComplex, this will be the last pass, so use output_target as target.
if (Ny == 1 && resolve_mode == ResolveRealToComplex)
{
learn_optimal_options(context, Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, output_target, resolve_type);
}
else
{
learn_optimal_options(context, Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, SSBO, resolve_type);
}
}
#ifdef GLFFT_CLI_ASYNC
catch (const AsyncCancellation &)
{
throw;
}
#endif
catch (...)
{
// If our default options cannot successfully create the radix pass (i.e. it throws),
// just ignore it for the purposes of creating wisdom.
}
}
}
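/* Usage sketch: train wisdom for a 1024x1024 complex-to-complex FP32
 * transform held in SSBOs (a sketch only; the zero-initialized Type and the
 * ComplexToComplex enumerator are assumptions from glfft.hpp):
 *
 *   FFTOptions::Type type = {};
 *   wisdom.learn_optimal_options_exhaustive(context, 1024, 1024,
 *                                           ComplexToComplex, SSBO, SSBO, type);
 */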
double FFTWisdom::bench(Context *context, Resource *output, Resource *input,
const WisdomPass &pass, const FFTOptions &options, const shared_ptr<ProgramCache> &cache) const
{
FFT fft(context, pass.pass.Nx, pass.pass.Ny, pass.pass.radix, pass.pass.input_target != SSBO ? 1 : pass.pass.radix,
pass.pass.mode, pass.pass.input_target, pass.pass.output_target,
cache, options);
return fft.bench(context,
output, input, params.warmup, params.iterations, params.dispatches, params.timeout);
}
static inline unsigned mode_to_size(Mode mode)
{
switch (mode)
{
case VerticalDual:
case HorizontalDual:
case ResolveRealToComplex:
case ResolveComplexToReal:
return 4;
default:
return 2;
}
}

std::pair<double, FFTOptions::Performance> FFTWisdom::study(Context *context, const WisdomPass &pass, FFTOptions::Type type) const
{
auto cache = make_shared<ProgramCache>();
unique_ptr<Resource> output;
unique_ptr<Resource> input;
unsigned mode_size = mode_to_size(pass.pass.mode);
vector<float> tmp(mode_size * pass.pass.Nx * pass.pass.Ny);
if (pass.pass.input_target == SSBO)
{
input = context->create_buffer(tmp.data(), tmp.size() * sizeof(float) >> type.input_fp16, AccessStaticCopy);
}
else
{
Format format = FormatUnknown;
unsigned Nx = pass.pass.Nx;
unsigned Ny = pass.pass.Ny;
switch (pass.pass.mode)
{
case VerticalDual:
case HorizontalDual:
format = FormatR32G32B32A32Float;
break;
case Vertical:
case Horizontal:
format = FormatR32G32Float;
break;
case ResolveComplexToReal:
format = FormatR32G32Float;
Nx *= 2;
break;
default:
throw logic_error("Invalid input mode.\n");
}
input = context->create_texture(tmp.data(), Nx, Ny, format);
}
if (pass.pass.output_target == SSBO)
{
output = context->create_buffer(nullptr, tmp.size() * sizeof(float) >> type.output_fp16, AccessStreamCopy);
}
else
{
Format format = FormatUnknown;
unsigned Nx = pass.pass.Nx;
unsigned Ny = pass.pass.Ny;
switch (pass.pass.mode)
{
case VerticalDual:
case HorizontalDual:
format = FormatR32G32B32A32Float;
break;
case Vertical:
case Horizontal:
format = FormatR32G32Float;
break;
case ResolveRealToComplex:
format = FormatR32G32Float;
Nx *= 2;
break;
default:
throw logic_error("Invalid output mode.\n");
}
output = context->create_texture(nullptr, Nx, Ny, format);
}
// Exhaustive search: try every sensible combination and keep the fastest parameters.
// Get the initial best cost with default options.
FFTOptions::Performance best_perf;
double minimum_cost = bench(context, output.get(), input.get(), pass, { best_perf, type }, cache);
static const FFTStaticWisdom::Tristate shared_banked_values[] = { FFTStaticWisdom::False, FFTStaticWisdom::True };
static const unsigned vector_size_values[] = { 2, 4, 8 };
static const unsigned workgroup_size_x_values[] = { 4, 8, 16, 32, 64, 128, 256 };
static const unsigned workgroup_size_y_values[] = { 1, 2, 4, 8, };
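// The unfiltered grid is 2 * 3 * 7 * 4 = 168 combinations; the static-wisdom
// and mode-specific checks below prune it before anything is benchmarked.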
bool test_resolve = pass.pass.mode == ResolveComplexToReal || pass.pass.mode == ResolveRealToComplex;
bool test_dual = pass.pass.mode == VerticalDual || pass.pass.mode == HorizontalDual;
unsigned bench_count = 0;
for (auto shared_banked : shared_banked_values)
{
// Useless test, since shared banked is only relevant for radix 16/64.
if (pass.pass.radix < 16 && shared_banked)
{
continue;
}
bool fair_shared_banked = (pass.pass.radix < 16) ||
(static_wisdom.shared_banked == FFTStaticWisdom::DontCare) ||
(shared_banked == static_wisdom.shared_banked);
if (!fair_shared_banked)
{
continue;
}
for (auto vector_size : vector_size_values)
{
// Resolve passes currently only support vector size 2. Shared banked makes no sense either.
if (test_resolve && (vector_size != 2 || shared_banked))
{
continue;
}
// We can only use vector_size 8 with FP16.
if (vector_size == 8 && (!type.fp16 || !type.input_fp16 || !type.output_fp16))
{
continue;
}
// Makes little sense to test, since vector_size will be bumped to 4 anyway.
if (test_dual && vector_size < 4)
{
continue;
}
for (auto workgroup_size_x : workgroup_size_x_values)
{
for (auto workgroup_size_y : workgroup_size_y_values)
{
unsigned workgroup_size = workgroup_size_x * workgroup_size_y;
unsigned min_workgroup_size = pass.pass.radix >= 16 ? static_wisdom.min_workgroup_size_shared :
static_wisdom.min_workgroup_size;
unsigned min_vector_size = test_dual ? max(4u, static_wisdom.min_vector_size) : static_wisdom.min_vector_size;
unsigned max_vector_size = test_dual ? max(4u, static_wisdom.max_vector_size) : static_wisdom.max_vector_size;
bool fair_workgroup_size = workgroup_size <= static_wisdom.max_workgroup_size &&
workgroup_size >= min_workgroup_size;
if (pass.pass.Ny == 1 && workgroup_size_y > 1)
{
fair_workgroup_size = false;
}
if (!fair_workgroup_size)
{
continue;
}
// Resolve passes are pinned to vector size 2 above, so exempt them from the
// min/max vector size limits (dual modes already bumped those to at least 4).
bool fair_vector_size = test_resolve || (vector_size <= max_vector_size &&
vector_size >= min_vector_size);
if (!fair_vector_size)
{
continue;
}
FFTOptions::Performance perf;
perf.shared_banked = shared_banked;
perf.vector_size = vector_size;
perf.workgroup_size_x = workgroup_size_x;
perf.workgroup_size_y = workgroup_size_y;
try
{
// If workgroup sizes are too big for our test, this will throw.
double cost = bench(context, output.get(), input.get(), pass, { perf, type }, cache);
bench_count++;
#if 1
context->log("\nWisdom run (mode = %u, radix = %u):\n", pass.pass.mode, pass.pass.radix);
context->log(" Width: %4u\n", pass.pass.Nx);
context->log(" Height: %4u\n", pass.pass.Ny);
context->log(" Shared banked: %3s\n", shared_banked ? "yes" : "no");
context->log(" Vector size: %u\n", vector_size);
context->log(" Workgroup size: (%u, %u)\n", workgroup_size_x, workgroup_size_y);
context->log(" Cost: %8.3g\n", cost);
#endif
if (cost < minimum_cost)
{
#if 1
context->log(" New optimal solution! (%g -> %g)\n", minimum_cost, cost);
#endif
best_perf = perf;
minimum_cost = cost;
}
}
#ifdef GLFFT_CLI_ASYNC
catch (const AsyncCancellation &)
{
throw;
}
#endif
catch (...)
{
// If we pass in bogus parameters,
// FFT will throw and we just ignore this.
}
}
}
}
}
context->log("Tested %u variants!\n", bench_count);
return make_pair(minimum_cost, best_perf);
}

const pair<const WisdomPass, FFTOptions::Performance>* FFTWisdom::find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix,
Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type) const
{
WisdomPass pass = {
{
Nx, Ny, radix, mode, input_target, output_target,
type,
},
0.0,
};
auto itr = library.find(pass);
return itr != end(library) ? (&(*itr)) : nullptr;
}

const FFTOptions::Performance& FFTWisdom::find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix,
Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const
{
WisdomPass pass = {
{
Nx, Ny, radix, mode, input_target, output_target,
base_options.type,
},
0.0,
};
auto itr = library.find(pass);
#if 0
if (itr == end(library))
{
context->log("Didn't find options for (%u x %u, radix %u, mode %u, input_target %u, output_target %u)\n",
Nx, Ny, radix, unsigned(mode), unsigned(input_target), unsigned(output_target));
}
#endif
return itr != end(library) ? itr->second : base_options.performance;
}
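/* Usage sketch (hedged; `opts` stands for a caller-owned FFTOptions whose
 * .type field is already filled in):
 *
 *   opts.performance = wisdom.find_optimal_options_or_default(
 *           Nx, Ny, radix, Horizontal, SSBO, SSBO, opts);
 *   // opts.performance now holds the learned parameters, or the caller's
 *   // own defaults if this exact pass was never studied.
 */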
#ifdef GLFFT_SERIALIZATION
std::string FFTWisdom::archive() const
{
StringBuffer s;
PrettyWriter<StringBuffer> writer{s};
writer.StartObject();
writer.String("library");
// Serialize all wisdom accumulated to a string.
writer.StartArray();
for (auto &entry : library)
{
writer.StartObject();
writer.String("scenario");
writer.StartObject();
writer.String("nx");
writer.Uint(entry.first.pass.Nx);
writer.String("ny");
writer.Uint(entry.first.pass.Ny);
writer.String("radix");
writer.Uint(entry.first.pass.radix);
writer.String("mode");
writer.Uint(entry.first.pass.mode);
writer.String("input_target");
writer.Uint(entry.first.pass.input_target);
writer.String("output_target");
writer.Uint(entry.first.pass.output_target);
writer.EndObject();
writer.String("type");
writer.StartObject();
writer.String("fp16");
writer.Bool(entry.first.pass.type.fp16);
writer.String("input_fp16");
writer.Bool(entry.first.pass.type.input_fp16);
writer.String("output_fp16");
writer.Bool(entry.first.pass.type.output_fp16);
writer.String("normalize");
writer.Bool(entry.first.pass.type.normalize);
writer.EndObject();
writer.String("performance");
writer.StartObject();
writer.String("shared_banked");
writer.Bool(entry.second.shared_banked);
writer.String("vector_size");
writer.Uint(entry.second.vector_size);
writer.String("workgroup_size_x");
writer.Uint(entry.second.workgroup_size_x);
writer.String("workgroup_size_y");
writer.Uint(entry.second.workgroup_size_y);
writer.EndObject();
writer.String("cost");
writer.Double(entry.first.cost);
writer.EndObject();
}
writer.EndArray();
writer.EndObject();
return s.GetString();
}
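/* The archived JSON has this shape (values here are illustrative only):
 *
 * {
 *   "library": [
 *     {
 *       "scenario":    { "nx": 1024, "ny": 1024, "radix": 8, "mode": 1,
 *                        "input_target": 0, "output_target": 0 },
 *       "type":        { "fp16": false, "input_fp16": false,
 *                        "output_fp16": false, "normalize": true },
 *       "performance": { "shared_banked": true, "vector_size": 2,
 *                        "workgroup_size_x": 64, "workgroup_size_y": 1 },
 *       "cost": 0.123
 *     }
 *   ]
 * }
 */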
void FFTWisdom::extract(const char *json)
{
Document document;
document.Parse(json);
// Exception safety: parse into a temporary map so that a throw mid-loop
// cannot leave the library in a broken state.
unordered_map<WisdomPass, FFTOptions::Performance> new_library;
auto &lib = document["library"];
// y u no begin(), end() :(
for (Value::ConstValueIterator itr = lib.Begin(); itr != lib.End(); ++itr)
{
auto &v = *itr;
WisdomPass pass;
FFTOptions::Performance perf;
pass.cost = v["cost"].GetDouble();
auto &scenario = v["scenario"];
pass.pass.Nx = scenario["nx"].GetUint();
pass.pass.Ny = scenario["ny"].GetUint();
pass.pass.radix = scenario["radix"].GetUint();
pass.pass.mode = static_cast<Mode>(scenario["mode"].GetUint());
pass.pass.input_target = static_cast<Target>(scenario["input_target"].GetUint());
pass.pass.output_target = static_cast<Target>(scenario["output_target"].GetUint());
auto &type = v["type"];
pass.pass.type.fp16 = type["fp16"].GetBool();
pass.pass.type.input_fp16 = type["input_fp16"].GetBool();
pass.pass.type.output_fp16 = type["output_fp16"].GetBool();
pass.pass.type.normalize = type["normalize"].GetBool();
auto &performance = v["performance"];
perf.shared_banked = performance["shared_banked"].GetBool();
perf.vector_size = performance["vector_size"].GetUint();
perf.workgroup_size_x = performance["workgroup_size_x"].GetUint();
perf.workgroup_size_y = performance["workgroup_size_y"].GetUint();
new_library[pass] = perf;
}
// Exception safe.
swap(library, new_library);
}
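/* Round-trip sketch. Persisting to disk is the caller's job; the fstream
 * usage below is illustrative, not part of this API:
 *
 *   std::string json = wisdom.archive();
 *   std::ofstream("wisdom.json") << json;
 *   ...
 *   std::stringstream ss;
 *   ss << std::ifstream("wisdom.json").rdbuf();
 *   wisdom.extract(ss.str().c_str());
 */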
#endif