Quantcast
Channel: Active questions tagged gcc - Stack Overflow
Viewing all articles
Browse latest Browse all 22016

How did moving calculations from run-time to compile-time make my code slower?

$
0
0

I'm writing a function to calculate the fast Fourier transform using the Stockham FFT algorithm and discovered that all of the "twiddle factors" for the calculation can be pre-calculated at compile time if the length of the FFT is a power of two.

In FFT calculations, computing the twiddle factors usually takes up a large portion of the overall time, so in theory doing this should result in a drastic improvement in the performance of the algorithm.

I spent all day yesterday reimplementing my algorithm on a new compiler (gcc 10) so I could use the C++20 consteval feature to pre-calculate all of the twiddle factors at compile time. I was successfully able to do it, but in the end the code that pre-calculates all of the twiddle factors at compile time actually runs slower!

Here is the code that performs all of the calculations at runtime:

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <complex>
#include <iostream>
#include <vector>

// NOTE(review): file-scope using-directive; every unqualified standard
// library name below (vector, complex, cos, ...) relies on it.
using namespace std;

// Forward declaration: FFT() calls StockhamFFT() before its definition.
static vector<complex<double>> StockhamFFT(const vector<complex<double>>& x);

// Reports whether `value` is an exact power of two (1, 2, 4, 8, ...).
// Zero is rejected up front because 0 & (0 - 1) is also zero and would
// otherwise pass the single-bit test below.
constexpr bool IsPowerOf2(const size_t value)
{
    if (value == 0)
    {
        return false;
    }

    // Clearing the lowest set bit leaves nothing only if one bit was set.
    return (value & (value - 1)) == 0;
}

// Computes the FFT of a real-valued signal.
//
// x: real input samples. The length must be a power of two; this is checked
//    with assert, i.e. only in builds without NDEBUG.
// Returns whatever StockhamFFT() produces for the input widened to complex
// samples with a zero imaginary part.
vector<complex<double>> FFT(const vector<double>& x)
{
    assert(IsPowerOf2(x.size()));

    // complex<double> is implicitly constructible from a double, so the
    // range constructor performs the real -> complex widening directly.
    const vector<complex<double>> x_p(x.begin(), x.end());

    return StockhamFFT(x_p);
}

// C++ implementation of the Stockham FFT algorithm.
//
// x: complex input samples; the length is asserted to be a power of two.
// Returns a new vector of the same length holding the result of the final
// stage. The input itself is never modified.
static vector<complex<double>> StockhamFFT(const vector<complex<double>>& x)
{
    const size_t length = x.size();
    assert(IsPowerOf2(length));
    const size_t half = length / 2;

    // Twiddle factor table, built at runtime:
    // twiddles[k] = (cos(-2*pi*k/length), sin(-2*pi*k/length))
    vector<complex<double>> twiddles(half);
    const double step = 2.0 * M_PI / length;
    for (size_t k = 0; k != half; ++k)
    {
        const double angle = -step * k;
        twiddles[k] = complex<double>(cos(angle), sin(angle));
    }

    // Two buffers that trade roles after every stage: `src` is read,
    // `dst` is written.
    vector<complex<double>> src(x);
    vector<complex<double>> dst(length);

    // One pass per stage. The butterfly span doubles while the spacing
    // between the twiddle factors in use halves.
    size_t twiddleStep = half;
    for (size_t span = 1; span < length; span *= 2, twiddleStep /= 2)
    {
        // Each `base` selects one of the individual FFTs of this stage
        for (size_t base = 0; base < half; base += span)
        {
            for (size_t k = 0; k < span; ++k)
            {
                // Inputs sit half a buffer apart
                const size_t inLow = base + k;
                const size_t inHigh = inLow + half;

                // Outputs sit one span apart
                const size_t outLow = 2 * base + k;
                const size_t outHigh = outLow + span;

                const auto even = src[inLow];
                const auto odd = twiddles[k * twiddleStep] * src[inHigh];

                // Butterfly: sum and difference share the same product
                dst[outLow] = even + odd;
                dst[outHigh] = even - odd;
            }
        }

        // What was just written becomes the input of the next stage
        src.swap(dst);
    }

    return src;
}

// Benchmark driver: fills a 2^18-sample buffer with a sine wave, runs FFT()
// on it repeatedly and prints the mean wall-clock time of one call.
int main()
{
    // Integer shift rather than pow(2, 18): exact, and avoids a
    // double -> size_t conversion.
    const size_t N = size_t{1} << 18;
    vector<double> x(N);

    const int f_s = 1000;           // sample rate used to derive t_s
    const double t_s = 1.0 / f_s;   // time between consecutive samples

    for (size_t n = 0; n < N; ++n)
    {
        x[n] = sin(2 * M_PI * 100 * n * t_s);
    }

    // The run count and the divisor share one constant so that the mean is
    // taken over exactly the runs that were timed.
    constexpr int kRuns = 100;

    // Accumulate as a chrono duration (64-bit rep) instead of a plain int.
    chrono::microseconds total{0};
    for (int i = 0; i < kRuns; ++i)
    {
        // steady_clock is monotonic, which is what interval timing needs.
        const auto start = chrono::steady_clock::now();
        auto X = FFT(x);
        const auto stop = chrono::steady_clock::now();
        total += chrono::duration_cast<chrono::microseconds>(stop - start);
    }
    const auto average = total.count() / kRuns;

    std::cout << "duration "<< average << " microseconds."<< std::endl;
}

With this as a starting point I was able to extract out the twiddle factor calculations from the StockhamFFT function and perform them at compile-time using a consteval function. This is what the code looks like after:

#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cmath>
#include <complex>
#include <iostream>
#include <vector>

// NOTE(review): file-scope using-directive; every unqualified standard
// library name below (vector, array, complex, cos, ...) relies on it.
using namespace std;

// Forward declaration: FFT() calls StockhamFFT() before its definition.
static vector<complex<double>> StockhamFFT(const vector<complex<double>>& x);

// True exactly when `value` has a single bit set, i.e. is a power of two.
// The explicit non-zero test is needed because 0 & (0 - 1) is zero as well.
constexpr bool IsPowerOf2(const size_t value)
{
    return value != 0 && (value & (value - 1)) == 0;
}

// Builds, entirely at compile time, the table of the first N/2 FFT twiddle
// factors: entry k is (cos(-2*pi*k/N), sin(-2*pi*k/N)).
// N must be a power of two; any other value is rejected by the static_assert.
template <size_t N>
static consteval array<complex<double>, N/2> CalculateTwiddleFactors()
{
    static_assert(IsPowerOf2(N), "N must be a power of 2.");

    constexpr size_t count = N/2;
    const double step = 2.0*M_PI/N;

    array<complex<double>, count> table;
    for (size_t k = 0; k != count; ++k)
    {
        const double angle = -step*k;
        table[k] = complex<double>(cos(angle), sin(angle));
    }

    return table;
}

// Twiddle-factor table for a 2^18-point FFT, evaluated at compile time.
// The length is written as an integer shift rather than pow(2, 18): std::pow
// is not constexpr in C++20, so using it as a template argument only compiles
// where the compiler folds the call as an extension.
constexpr auto W = CalculateTwiddleFactors<(size_t{1} << 18)>();

// Computes the FFT of a real-valued signal.
//
// x: real input samples. The length must be a power of two; this is checked
//    with assert, i.e. only in builds without NDEBUG.
// Returns whatever StockhamFFT() produces for the input widened to complex
// samples with a zero imaginary part.
vector<complex<double>> FFT(const vector<double>& x)
{
    assert(IsPowerOf2(x.size()));

    // complex<double> is implicitly constructible from a double, so the
    // range constructor performs the real -> complex widening directly.
    const vector<complex<double>> x_p(x.begin(), x.end());

    return StockhamFFT(x_p);
}

// C++ implementation of the Stockham FFT algorithm, using the twiddle-factor
// table W that is evaluated at compile time.
//
// x: complex input samples. x.size() must be a power of two and no larger
//    than 2 * W.size(); both conditions are checked with assert.
// Returns a new vector of the same length holding the result of the final
// stage. The input itself is never modified.
static vector<complex<double>> StockhamFFT(const vector<complex<double>>& x)
{
    const auto N = x.size();
    assert(IsPowerOf2(N));
    const auto NOver2 = N/2;

    // W holds (cos(-2*pi*k/M), sin(-2*pi*k/M)) for k in [0, M/2), where
    // M = 2*W.size(). A transform longer than M cannot be served from it.
    assert(N <= 2*W.size());

    // The Stockham algorithm requires one vector for input/output data and
    // another as a temporary workspace
    vector<complex<double>> a(x);
    vector<complex<double>> b(N);

    // Spacing between the twiddle factors used at the first stage, measured
    // in table entries. An N-point transform needs every (M/N)-th entry of
    // W, so the first-stage spacing is (N/2)*(M/N) = M/2 = W.size() for every
    // valid N. Deriving it from N alone is only right when N == M.
    size_t WStride = W.size();

    // Loop through each stage of the FFT
    for (size_t stride = 1; stride < N; stride *= 2)
    {
        // Loop through the individual FFTs of each stage
        for (size_t m = 0; m < NOver2; m += stride)
        {
            const auto mTimes2 = m*2;

            // Perform each individual FFT
            for (size_t n = 0; n < stride; ++n)
            {
                // Calculate the input indexes
                const auto aIndex1 = n + m;
                const auto aIndex2 = aIndex1 + NOver2;

                // Calculate the output indexes
                const auto bIndex1 = n + mTimes2;
                const auto bIndex2 = bIndex1 + stride;

                // Perform the FFT
                const auto tmp1 = a[aIndex1];
                const auto tmp2 = W[n*WStride]*a[aIndex2];

                // Sum the results (the difference reuses the same product)
                b[bIndex1] = tmp1 + tmp2;
                b[bIndex2] = tmp1 - tmp2;
            }
        }

        // Spacing between twiddle factors is half for the next stage
        WStride /= 2;

        // Swap the data (output of this stage is input of the next)
        a.swap(b);
    }

    return a;
}

// Benchmark driver: fills a 2^18-sample buffer with a sine wave, runs FFT()
// on it repeatedly and prints the mean wall-clock time of one call.
int main()
{
    // Integer shift rather than pow(2, 18): exact, and avoids a
    // double -> size_t conversion.
    const size_t N = size_t{1} << 18;
    vector<double> x(N);

    const int f_s = 1000;           // sample rate used to derive t_s
    const double t_s = 1.0 / f_s;   // time between consecutive samples

    for (size_t n = 0; n < N; ++n)
    {
        x[n] = sin(2 * M_PI * 100 * n * t_s);
    }

    // The run count and the divisor share one constant so that the mean is
    // taken over exactly the runs that were timed.
    constexpr int kRuns = 100;

    // Accumulate as a chrono duration (64-bit rep) instead of a plain int.
    chrono::microseconds total{0};
    for (int i = 0; i < kRuns; ++i)
    {
        // steady_clock is monotonic, which is what interval timing needs.
        const auto start = chrono::steady_clock::now();
        auto X = FFT(x);
        const auto stop = chrono::steady_clock::now();
        total += chrono::duration_cast<chrono::microseconds>(stop - start);
    }
    const auto average = total.count() / kRuns;

    std::cout << "duration "<< average << " microseconds."<< std::endl;
}

Both versions were compiled on Ubuntu 19.10 with gcc 10.0.1:

g++ -std=c++2a -o main main.cpp

Note that gcc is specifically required here because it is the only compiler that supports constexpr versions of sin and cos.

The "run-time" example yields the following:

duration 292854 microseconds.

The "compile-time" example yields the following:

duration 295230 microseconds.

The compile-time version did take notably longer to compile, but still somehow takes longer to run even though most of the calculations are done before the program even begins! How is this possible? Am I missing something critical here?


Viewing all articles
Browse latest Browse all 22016

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>