glava-ridged/glfft/glfft_wisdom.cpp

/* Copyright (C) 2015 Hans-Kristian Arntzen <maister@archlinux.us>
*
* Permission is hereby granted, free of charge,
* to any person obtaining a copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "glfft_wisdom.hpp"
#include "glfft_interface.hpp"
#include "glfft.hpp"
#include <utility>
/* GLAVA NOTICE: automatic wisdom serialization support may be added at a later date */
#ifdef GLFFT_SERIALIZATION
#include "rapidjson/reader.h"
#include "rapidjson/prettywriter.h"
#include "rapidjson/stringbuffer.h"
#include "rapidjson/document.h"
using namespace rapidjson;
#endif
#ifdef GLFFT_CLI_ASYNC
#include "glfft_cli.hpp"
#endif
using namespace std;
using namespace GLFFT;

FFTStaticWisdom FFTWisdom::get_static_wisdom_from_renderer(Context *context)
{
FFTStaticWisdom res;
const char *renderer = context->get_renderer_string();
unsigned threads = context->get_max_work_group_threads();
if (strstr(renderer, "GeForce") || strstr(renderer, "Quadro"))
{
context->log("Detected GeForce/Quadro GPU.\n");
res.min_workgroup_size = 32; // Warp threads.
res.min_workgroup_size_shared = 32;
res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good.
res.min_vector_size = 2;
res.max_vector_size = 2;
res.shared_banked = FFTStaticWisdom::True;
}
else if (strstr(renderer, "Radeon"))
{
context->log("Detected Radeon GPU.\n");
res.min_workgroup_size = 64; // Wavefront threads (GCN).
res.min_workgroup_size_shared = 128;
res.max_workgroup_size = min(threads, 256u); // Very unlikely that more than 256 threads will do anything good.
// TODO: Find out whether we can always restrict this to 2 or 4.
res.min_vector_size = 2;
res.max_vector_size = 4;
res.shared_banked = FFTStaticWisdom::True;
}
else if (strstr(renderer, "Mali"))
{
context->log("Detected Mali GPU.\n");
res.min_workgroup_size = 4;
res.min_workgroup_size_shared = 4;
res.max_workgroup_size = 64; // Going beyond 64 threads per WG is not a good idea.
res.min_vector_size = 4;
res.max_vector_size = 4;
res.shared_banked = FFTStaticWisdom::False;
}
// TODO: Add more GPUs.
return res;
}
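/* Usage sketch for the detection above. set_static_wisdom() is assumed to be
 * the setter declared in glfft_wisdom.hpp; the rest is this file's own API:
 *
 *   FFTWisdom wisdom;
 *   wisdom.set_static_wisdom(FFTWisdom::get_static_wisdom_from_renderer(context));
 *   // study() now prunes its search space with the detected per-vendor limits.
 */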
pair<double, FFTOptions::Performance> FFTWisdom::learn_optimal_options(
Context *context, unsigned Nx, unsigned Ny, unsigned radix,
Mode mode, Target input_target, Target output_target,
const FFTOptions::Type &type)
{
WisdomPass pass = {
{
Nx, Ny, radix, mode, input_target, output_target,
type,
},
0.0,
};
auto itr = library.find(pass);
if (itr != end(library))
{
return make_pair(itr->first.cost, itr->second);
}
else
{
auto result = study(context, pass, type);
pass.cost = result.first;
library[pass] = result.second;
return result;
}
}
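/* Note: results are memoized per WisdomPass key (sizes, radix, mode, targets
 * and type), so repeated calls for the same pass return the cached cost and
 * options without re-benchmarking; only the first call pays for study(). */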
void FFTWisdom::learn_optimal_options_exhaustive(Context *context,
unsigned Nx, unsigned Ny,
Type type, Target input_target, Target output_target, const FFTOptions::Type &fft_type)
{
bool learn_resolve = type == ComplexToReal || type == RealToComplex;
Mode vertical_mode = type == ComplexToComplexDual ? VerticalDual : Vertical;
Mode horizontal_mode = type == ComplexToComplexDual ? HorizontalDual : Horizontal;
// Create wisdom for horizontal and vertical transforms.
static const unsigned radices[] = { 4, 8, 16, 64 };
for (auto radix : radices)
{
try
{
// Learn plain SSBO -> SSBO transforms first. If we're doing
// SSBO -> Image or Image -> SSBO, wisdom for those variants is created below.
if (Ny > 1)
{
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, SSBO, fft_type);
}
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, SSBO, fft_type);
// Learn the first/last pass transforms. These can matter a fair amount,
// since texture access favors block interleaving and larger WG_Y sizes.
if (input_target != SSBO)
{
if (Ny > 1)
{
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, input_target, SSBO, fft_type);
}
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, input_target, SSBO, fft_type);
}
if (output_target != SSBO)
{
if (Ny > 1)
{
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, vertical_mode, SSBO, output_target, fft_type);
}
learn_optimal_options(context, Nx >> learn_resolve, Ny, radix, horizontal_mode, SSBO, output_target, fft_type);
}
}
#ifdef GLFFT_CLI_ASYNC
catch (const AsyncCancellation &)
{
throw;
}
#endif
catch (...)
{
// If our default options cannot successfully create the radix pass (i.e. it throws),
// just ignore it for the purposes of creating wisdom.
}
}
auto resolve_type = fft_type;
resolve_type.input_fp16 = resolve_type.output_fp16;
Mode resolve_mode = type == ComplexToReal ? ResolveComplexToReal : ResolveRealToComplex;
Target resolve_input_target = SSBO;
// For a C2R Nx1 transform, the resolve is the first pass, so it uses the caller's input type and target.
if (type == ComplexToReal && Ny == 1)
{
resolve_type = fft_type;
resolve_input_target = input_target;
}
// If we need to do a resolve pass, train this case as well.
if (learn_resolve)
{
try
{
// If Ny == 1 and we're doing RealToComplex, this will be the last pass, so use output_target as target.
if (Ny == 1 && resolve_mode == ResolveRealToComplex)
{
learn_optimal_options(context, Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, output_target, resolve_type);
}
else
{
learn_optimal_options(context, Nx >> learn_resolve, Ny, 2, resolve_mode, resolve_input_target, SSBO, resolve_type);
}
}
#ifdef GLFFT_CLI_ASYNC
catch (const AsyncCancellation &)
{
throw;
}
#endif
catch (...)
{
// If our default options cannot successfully create the radix pass (i.e. it throws),
// just ignore it for the purposes of creating wisdom.
}
}
}
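/* Usage sketch: train wisdom for a 1024x1024 complex-to-complex FP32
 * transform held in SSBOs (a sketch only; the zero-initialized Type and the
 * ComplexToComplex enumerator are assumptions from glfft.hpp):
 *
 *   FFTOptions::Type type = {};
 *   wisdom.learn_optimal_options_exhaustive(context, 1024, 1024,
 *                                           ComplexToComplex, SSBO, SSBO, type);
 */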
double FFTWisdom::bench(Context *context, Resource *output, Resource *input,
const WisdomPass &pass, const FFTOptions &options, const shared_ptr<ProgramCache> &cache) const
{
FFT fft(context, pass.pass.Nx, pass.pass.Ny, pass.pass.radix, pass.pass.input_target != SSBO ? 1 : pass.pass.radix,
pass.pass.mode, pass.pass.input_target, pass.pass.output_target,
cache, options);
return fft.bench(context,
output, input, params.warmup, params.iterations, params.dispatches, params.timeout);
}
static inline unsigned mode_to_size(Mode mode)
{
switch (mode)
{
case VerticalDual:
case HorizontalDual:
case ResolveRealToComplex:
case ResolveComplexToReal:
return 4;
default:
return 2;
}
}

std::pair<double, FFTOptions::Performance> FFTWisdom::study(Context *context, const WisdomPass &pass, FFTOptions::Type type) const
{
auto cache = make_shared<ProgramCache>();
unique_ptr<Resource> output;
unique_ptr<Resource> input;
unsigned mode_size = mode_to_size(pass.pass.mode);
vector<float> tmp(mode_size * pass.pass.Nx * pass.pass.Ny);
if (pass.pass.input_target == SSBO)
{
input = context->create_buffer(tmp.data(), tmp.size() * sizeof(float) >> type.input_fp16, AccessStaticCopy);
}
else
{
Format format = FormatUnknown;
unsigned Nx = pass.pass.Nx;
unsigned Ny = pass.pass.Ny;
switch (pass.pass.mode)
{
case VerticalDual:
case HorizontalDual:
format = FormatR32G32B32A32Float;
break;
case Vertical:
case Horizontal:
format = FormatR32G32Float;
break;
case ResolveComplexToReal:
format = FormatR32G32Float;
Nx *= 2;
break;
default:
throw logic_error("Invalid input mode.\n");
}
input = context->create_texture(tmp.data(), Nx, Ny, format);
}
if (pass.pass.output_target == SSBO)
{
output = context->create_buffer(nullptr, tmp.size() * sizeof(float) >> type.output_fp16, AccessStreamCopy);
}
else
{
Format format = FormatUnknown;
unsigned Nx = pass.pass.Nx;
unsigned Ny = pass.pass.Ny;
switch (pass.pass.mode)
{
case VerticalDual:
case HorizontalDual:
format = FormatR32G32B32A32Float;
break;
case Vertical:
case Horizontal:
format = FormatR32G32Float;
break;
case ResolveRealToComplex:
format = FormatR32G32Float;
Nx *= 2;
break;
default:
throw logic_error("Invalid output mode.\n");
}
output = context->create_texture(nullptr, Nx, Ny, format);
}
// Exhaustive search: try every sensible combination and keep the fastest parameters.
// Get the initial best cost with default options.
FFTOptions::Performance best_perf;
double minimum_cost = bench(context, output.get(), input.get(), pass, { best_perf, type }, cache);
static const FFTStaticWisdom::Tristate shared_banked_values[] = { FFTStaticWisdom::False, FFTStaticWisdom::True };
static const unsigned vector_size_values[] = { 2, 4, 8 };
static const unsigned workgroup_size_x_values[] = { 4, 8, 16, 32, 64, 128, 256 };
static const unsigned workgroup_size_y_values[] = { 1, 2, 4, 8, };
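// The unfiltered grid is 2 * 3 * 7 * 4 = 168 combinations; the static-wisdom
// and mode-specific checks below prune it before anything is benchmarked.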
bool test_resolve = pass.pass.mode == ResolveComplexToReal || pass.pass.mode == ResolveRealToComplex;
bool test_dual = pass.pass.mode == VerticalDual || pass.pass.mode == HorizontalDual;
unsigned bench_count = 0;
for (auto shared_banked : shared_banked_values)
{
// Useless test, since shared banked is only relevant for radix 16/64.
if (pass.pass.radix < 16 && shared_banked)
{
continue;
}
bool fair_shared_banked = (pass.pass.radix < 16) ||
(static_wisdom.shared_banked == FFTStaticWisdom::DontCare) ||
(shared_banked == static_wisdom.shared_banked);
if (!fair_shared_banked)
{
continue;
}
for (auto vector_size : vector_size_values)
{
// Resolve passes currently only support vector size 2. Shared banked makes no sense either.
if (test_resolve && (vector_size != 2 || shared_banked))
{
continue;
}
// We can only use vector_size 8 with FP16.
if (vector_size == 8 && (!type.fp16 || !type.input_fp16 || !type.output_fp16))
{
continue;
}
// Makes little sense to test, since vector_size will be bumped to 4 anyway.
if (test_dual && vector_size < 4)
{
continue;
}
for (auto workgroup_size_x : workgroup_size_x_values)
{
for (auto workgroup_size_y : workgroup_size_y_values)
{
unsigned workgroup_size = workgroup_size_x * workgroup_size_y;
unsigned min_workgroup_size = pass.pass.radix >= 16 ? static_wisdom.min_workgroup_size_shared :
static_wisdom.min_workgroup_size;
unsigned min_vector_size = test_dual ? max(4u, static_wisdom.min_vector_size) : static_wisdom.min_vector_size;
unsigned max_vector_size = test_dual ? max(4u, static_wisdom.max_vector_size) : static_wisdom.max_vector_size;
bool fair_workgroup_size = workgroup_size <= static_wisdom.max_workgroup_size &&
workgroup_size >= min_workgroup_size;
if (pass.pass.Ny == 1 && workgroup_size_y > 1)
{
fair_workgroup_size = false;
}
if (!fair_workgroup_size)
{
continue;
}
// Resolve passes are pinned to vector size 2 above, so exempt them from the
// min/max vector size limits (dual modes already bumped those to at least 4).
bool fair_vector_size = test_resolve || (vector_size <= max_vector_size &&
vector_size >= min_vector_size);
if (!fair_vector_size)
{
continue;
}
FFTOptions::Performance perf;
perf.shared_banked = shared_banked;
perf.vector_size = vector_size;
perf.workgroup_size_x = workgroup_size_x;
perf.workgroup_size_y = workgroup_size_y;
try
{
// If workgroup sizes are too big for our test, this will throw.
double cost = bench(context, output.get(), input.get(), pass, { perf, type }, cache);
bench_count++;
#if 1
context->log("\nWisdom run (mode = %u, radix = %u):\n", pass.pass.mode, pass.pass.radix);
context->log(" Width: %4u\n", pass.pass.Nx);
context->log(" Height: %4u\n", pass.pass.Ny);
context->log(" Shared banked: %3s\n", shared_banked ? "yes" : "no");
context->log(" Vector size: %u\n", vector_size);
context->log(" Workgroup size: (%u, %u)\n", workgroup_size_x, workgroup_size_y);
context->log(" Cost: %8.3g\n", cost);
#endif
if (cost < minimum_cost)
{
#if 1
context->log(" New optimal solution! (%g -> %g)\n", minimum_cost, cost);
#endif
best_perf = perf;
minimum_cost = cost;
}
}
#ifdef GLFFT_CLI_ASYNC
catch (const AsyncCancellation &)
{
throw;
}
#endif
catch (...)
{
// If we pass in bogus parameters,
// FFT will throw and we just ignore this.
}
}
}
}
}
context->log("Tested %u variants!\n", bench_count);
return make_pair(minimum_cost, best_perf);
}

const pair<const WisdomPass, FFTOptions::Performance>* FFTWisdom::find_optimal_options(unsigned Nx, unsigned Ny, unsigned radix,
Mode mode, Target input_target, Target output_target, const FFTOptions::Type &type) const
{
WisdomPass pass = {
{
Nx, Ny, radix, mode, input_target, output_target,
type,
},
0.0,
};
auto itr = library.find(pass);
return itr != end(library) ? (&(*itr)) : nullptr;
}

const FFTOptions::Performance& FFTWisdom::find_optimal_options_or_default(unsigned Nx, unsigned Ny, unsigned radix,
Mode mode, Target input_target, Target output_target, const FFTOptions &base_options) const
{
WisdomPass pass = {
{
Nx, Ny, radix, mode, input_target, output_target,
base_options.type,
},
0.0,
};
auto itr = library.find(pass);
#if 0
if (itr == end(library))
{
context->log("Didn't find options for (%u x %u, radix %u, mode %u, input_target %u, output_target %u)\n",
Nx, Ny, radix, unsigned(mode), unsigned(input_target), unsigned(output_target));
}
#endif
return itr != end(library) ? itr->second : base_options.performance;
}
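/* Usage sketch (hedged; `opts` stands for a caller-owned FFTOptions whose
 * .type field is already filled in):
 *
 *   opts.performance = wisdom.find_optimal_options_or_default(
 *           Nx, Ny, radix, Horizontal, SSBO, SSBO, opts);
 *   // opts.performance now holds the learned parameters, or the caller's
 *   // own defaults if this exact pass was never studied.
 */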
#ifdef GLFFT_SERIALIZATION
std::string FFTWisdom::archive() const
{
StringBuffer s;
PrettyWriter<StringBuffer> writer{s};
writer.StartObject();
writer.String("library");
// Serialize all wisdom accumulated to a string.
writer.StartArray();
for (auto &entry : library)
{
writer.StartObject();
writer.String("scenario");
writer.StartObject();
writer.String("nx");
writer.Uint(entry.first.pass.Nx);
writer.String("ny");
writer.Uint(entry.first.pass.Ny);
writer.String("radix");
writer.Uint(entry.first.pass.radix);
writer.String("mode");
writer.Uint(entry.first.pass.mode);
writer.String("input_target");
writer.Uint(entry.first.pass.input_target);
writer.String("output_target");
writer.Uint(entry.first.pass.output_target);
writer.EndObject();
writer.String("type");
writer.StartObject();
writer.String("fp16");
writer.Bool(entry.first.pass.type.fp16);
writer.String("input_fp16");
writer.Bool(entry.first.pass.type.input_fp16);
writer.String("output_fp16");
writer.Bool(entry.first.pass.type.output_fp16);
writer.String("normalize");
writer.Bool(entry.first.pass.type.normalize);
writer.EndObject();
writer.String("performance");
writer.StartObject();
writer.String("shared_banked");
writer.Bool(entry.second.shared_banked);
writer.String("vector_size");
writer.Uint(entry.second.vector_size);
writer.String("workgroup_size_x");
writer.Uint(entry.second.workgroup_size_x);
writer.String("workgroup_size_y");
writer.Uint(entry.second.workgroup_size_y);
writer.EndObject();
writer.String("cost");
writer.Double(entry.first.cost);
writer.EndObject();
}
writer.EndArray();
writer.EndObject();
return s.GetString();
}
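/* The archived JSON has this shape (values here are illustrative only):
 *
 * {
 *   "library": [
 *     {
 *       "scenario":    { "nx": 1024, "ny": 1024, "radix": 8, "mode": 1,
 *                        "input_target": 0, "output_target": 0 },
 *       "type":        { "fp16": false, "input_fp16": false,
 *                        "output_fp16": false, "normalize": true },
 *       "performance": { "shared_banked": true, "vector_size": 2,
 *                        "workgroup_size_x": 64, "workgroup_size_y": 1 },
 *       "cost": 0.123
 *     }
 *   ]
 * }
 */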
void FFTWisdom::extract(const char *json)
{
Document document;
document.Parse(json);
// Exception safety: parse into a temporary map so that a throw mid-loop
// cannot leave the library in a broken state.
unordered_map<WisdomPass, FFTOptions::Performance> new_library;
auto &lib = document["library"];
// y u no begin(), end() :(
for (Value::ConstValueIterator itr = lib.Begin(); itr != lib.End(); ++itr)
{
auto &v = *itr;
WisdomPass pass;
FFTOptions::Performance perf;
pass.cost = v["cost"].GetDouble();
auto &scenario = v["scenario"];
pass.pass.Nx = scenario["nx"].GetUint();
pass.pass.Ny = scenario["ny"].GetUint();
pass.pass.radix = scenario["radix"].GetUint();
pass.pass.mode = static_cast<Mode>(scenario["mode"].GetUint());
pass.pass.input_target = static_cast<Target>(scenario["input_target"].GetUint());
pass.pass.output_target = static_cast<Target>(scenario["output_target"].GetUint());
auto &type = v["type"];
pass.pass.type.fp16 = type["fp16"].GetBool();
pass.pass.type.input_fp16 = type["input_fp16"].GetBool();
pass.pass.type.output_fp16 = type["output_fp16"].GetBool();
pass.pass.type.normalize = type["normalize"].GetBool();
auto &performance = v["performance"];
perf.shared_banked = performance["shared_banked"].GetBool();
perf.vector_size = performance["vector_size"].GetUint();
perf.workgroup_size_x = performance["workgroup_size_x"].GetUint();
perf.workgroup_size_y = performance["workgroup_size_y"].GetUint();
new_library[pass] = perf;
}
// Exception safe.
swap(library, new_library);
}
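/* Round-trip sketch. Persisting to disk is the caller's job; the fstream
 * usage below is illustrative, not part of this API:
 *
 *   std::string json = wisdom.archive();
 *   std::ofstream("wisdom.json") << json;
 *   ...
 *   std::stringstream ss;
 *   ss << std::ifstream("wisdom.json").rdbuf();
 *   wisdom.extract(ss.str().c_str());
 */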
#endif