Fractal rendering on the GPU with CUDA

Question

I am writing a fractal renderer using CUDA, SFML and C++. I recently optimized it to use less memory, and now I want to optimize the fractal computation itself, because according to the profiler it is by far the biggest bottleneck (see the nvprof output below ↓).

 GPU activities:
69.95%  8.85657s       899  9.8516ms  2.5720ms  48.582ms  fractal_rendering(unsigned char*, __int64, int, int, float, float, float, float, sf::Color*, int, float, bool*)
24.60%  3.11495s       728  4.2788ms  1.3658ms  19.410ms  fractal_rendering(unsigned char*, __int64, int, int, float, float, float, float, sf::Color*, int, float, bool*, float, float)

(nvprof results)
I am also experiencing some stalls around CUDA events, but I plan to fix those later with atomics and a separate thread.

Here is the actual code:

main.cpp

#include "CUDA files/FractalClass.cuh"
#include <SFML/Graphics.hpp>
#include <iostream>
#include <thread>

void main_thread() {
    render_state curr_qual = render_state::bad;
    render_state prev_qual = render_state::bad;
    render_state prev_qual_jul = render_state::good;
    render_state curr_qual_jul = render_state::good;

   

    sf::RenderWindow window(sf::VideoMode({ 1920, 1080 }), "Fractals");
    sf::RenderTexture buffer({1920, 1080});

    sf::Font font;
    bool state = font.openFromFile("C:\\Windows\\Fonts\\ariblk.ttf");
    if (!state)
        return;
    sf::Text text(font);
    text.setString("0");


    double zx, zy;

    bool drawen = false;

    sf::Clock timer;
    sf::Clock timer_julia;
    sf::Clock fps_clock;

    float fps = 0;

    bool is_dragging = false;

    bool julia_render = false;

    FractalBase<fractals::mandelbrot> mandelbrot;
    FractalBase<fractals::julia> julia_set;
    julia_set.setPosition({float(window.getSize().x - 800), 0.f});
    double max_iters = mandelbrot.get_max_iters();

    bool block_julia = false;
    
    bool mouse_moved = true;

    window.setFramerateLimit(60);

    sf::Vector2i mouse;
    while (window.isOpen()) {
        while (const auto event = window.pollEvent()) {
            if (const auto* mm = event->getIf<sf::Event::MouseMoved>()) {
                if(mouse.x < 800 && mouse.y < 600)
                    mouse_moved = true;
                mouse = mm->position;
            } 

            if (event->is<sf::Event::Closed>()) {
                window.close();
            }

            if (const auto* button = event->getIf<sf::Event::KeyPressed>()) {
                if (button->scancode == sf::Keyboard::Scancode::Escape) {
                    window.close();
                }
                else if (button->scancode == sf::Keyboard::Scancode::R) {
                    if (mouse.x < 800 && mouse.y < 600)
                        mandelbrot.set_max_iters(mandelbrot.get_max_iters() * 2);
                }
                else if(button->scancode == sf::Keyboard::Scancode::S) {
                    if (mouse.x < 800 && mouse.y < 600)
                        block_julia = !block_julia;
                }
            }
            
            if (const auto* mouseWheelScrolled = event->getIf<sf::Event::MouseWheelScrolled>()) {
                if (mouse.x < 800 && mouse.y < 600) {
                    curr_qual = render_state::good;
                    mandelbrot.handleZoom(mouseWheelScrolled->delta, mouse);
                }
                else if (mouse.x > window.getSize().x - 800 && mouse.y < 600) {
                    julia_set.handleZoom(mouseWheelScrolled->delta, mouse);
                    curr_qual_jul = render_state::good;
                }
            }

            if (const auto* mouseButtonPressed = event->getIf<sf::Event::MouseButtonPressed>()) {
                if (mouseButtonPressed->button == sf::Mouse::Button::Left) {
                    if (mouse.x < 800 && mouse.y < 600) {
                        mouse_moved = true;
                        curr_qual = render_state::good;
                        mandelbrot.start_dragging(mouse);
                    }
                    else if (mouse.x > window.getSize().x - 800 && mouse.y < 600) {
                        julia_set.start_dragging(mouse);
                        curr_qual_jul = render_state::good;
                    }
                }
            }

            if (const auto* mouseButtonReleased = event->getIf<sf::Event::MouseButtonReleased>()) {
                if (mouseButtonReleased->button == sf::Mouse::Button::Left) {
                    if (mouse.x < 800 && mouse.y < 600)
                        mandelbrot.stop_dragging();
                    else if (mouse.x > window.getSize().x - 800 && mouse.y < 600) {
                        julia_set.stop_dragging();
                    }
                }
            }

            if (const auto* mouseButtonReleased = event->getIf<sf::Event::MouseMoved>()) {
                if (mouse.x < 800 && mouse.y < 600) {
                    mouse_moved = true;
                    if (mandelbrot.get_is_dragging()) {
                        curr_qual = render_state::good;
                        mandelbrot.dragging({ mouse.x, mouse.y });
                    }
                }
                else if(mouse.x > window.getSize().x - 800 && mouse.y < 600 && julia_set.get_is_dragging()) {
                    if (julia_set.get_is_dragging()) {
                        julia_set.dragging({ mouse.x, mouse.y });
                        curr_qual_jul = render_state::good;
                    }
                }
            }
        }
        ++fps;


        if (curr_qual != render_state::best || prev_qual != render_state::best) {
            prev_qual = curr_qual;
            std::string quality = curr_qual == render_state::best ? "Best" : curr_qual == render_state::good ? "Good" : "Bad";
            auto start = std::chrono::high_resolution_clock::now();
            mandelbrot.render(curr_qual);
            auto end = std::chrono::high_resolution_clock::now();
            auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
            std::cout << "Time needed to render: " << diff.count() << "\n";

            auto startdrawing = std::chrono::high_resolution_clock::now();
            buffer.clear();
            window.clear();
            buffer.draw(mandelbrot);
            if (drawen) {
                buffer.draw(julia_set);
            }

            buffer.display();
            window.draw(sf::Sprite(buffer.getTexture()));
            window.draw(text);

            auto time = timer.restart();;
            window.display();
            auto enddrawing = std::chrono::high_resolution_clock::now();
            auto diffdrawing = std::chrono::duration_cast<std::chrono::milliseconds>(enddrawing - startdrawing);
            std::cout << "Time needed to apply SSAA4 and draw to window: " << diffdrawing.count() << "\n";
            std::cout << "Mandelbrot set " << "(" << quality << ")" << " was drew in : " << time.asMilliseconds() << std::endl;
            if (quality == "Bad") {
                curr_qual = render_state::good;
            }

            if (quality == "Good") {
                curr_qual = render_state::best;
            }
        }

        if(mouse_moved && !block_julia) {
            drawen = true;
            zx = -(mandelbrot.get_x_offset() - (mouse.x / mandelbrot.get_zoom_x()));

            zy = -(mandelbrot.get_y_offset() - (mouse.y / mandelbrot.get_zoom_y()));
             
            julia_set.render(render_state::good, zx, zy);


            julia_set.setPosition({ float(window.getSize().x - 800), 0 });

            buffer.clear();
            window.clear();

            buffer.draw(mandelbrot);
            buffer.draw(julia_set);

            buffer.display();

            window.draw(sf::Sprite(buffer.getTexture()));
            window.draw(text);

            window.display();

            auto time = timer_julia.restart();
            

            mouse_moved = false;
        }

        else if (curr_qual_jul != render_state::best || prev_qual_jul != render_state::best) {
            prev_qual_jul = curr_qual_jul;
            std::string quality = curr_qual_jul == render_state::best ? "Best" : "Good";

            julia_set.render(curr_qual_jul, zx, zy);
            julia_set.setPosition({ float(window.getSize().x - 800), 0 });

            buffer.clear();
            window.clear();
            buffer.draw(mandelbrot);
            if (drawen) {
                buffer.draw(julia_set);
            }

            buffer.display();
            window.draw(sf::Sprite(buffer.getTexture()));
            window.draw(text);
            window.display();
            

            std::cout << "julia set " << "(" << quality << ")\n";

            if (quality == "Good") {
                curr_qual_jul = render_state::best;
            }
        }
        else {
            window.clear();
            window.draw(sf::Sprite(buffer.getTexture()));
            window.draw(text);
            window.display();
        }

        if(fps_clock.getElapsedTime().asSeconds() > 0.2f) {
            float elapsed = fps_clock.getElapsedTime().asSeconds();
            text.setString(std::to_string(fps / elapsed));
            text.setPosition({ 10, 10 });
            fps = 0;
            fps_clock.restart();
        }

    }
}



int main() {
    std::thread mainThread(main_thread);
    mainThread.join();
}

CUDA_ComputationFunctions.cu

#include "CUDA_ComputationFunctions.cuh"
#include <cuda_runtime.h>  
#include <vector>
#include <cmath>
#include <iostream>

inline static void cudaCheckError() {
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(error) << " in " << __FILE__ << ":" << __LINE__ << std::endl;
        exit(EXIT_FAILURE);
    }
}


/**
 * @brief Converts an HSV color to RGB with channels in the range 0..255.
 *
 * The function has external linkage to agree with the `extern` declaration in
 * CUDA_ComputationFunctions.cuh.
 *
 * @param h Hue in degrees. Any finite value is wrapped into [0, 360)
 *          (so -120 is treated as 240); NaN or infinity is treated as 0.
 * @param s Saturation; values outside [0, 1] (and NaN) are clamped into range.
 * @param v Value (brightness); values outside [0, 1] (and NaN) are clamped into range.
 * @param r Receives the red channel (0..255).
 * @param g Receives the green channel (0..255).
 * @param b Receives the blue channel (0..255).
 */
void HSVtoRGB(double h, double s, double v, unsigned int& r, unsigned int& g, unsigned int& b) {
    // fmod keeps the sign of its first argument, so negative hues need one
    // extra turn; NaN (also produced by fmod(inf, 360)) falls back to 0.
    h = std::fmod(h, 360.0);
    if (!(h >= 0.0)) {
        h = (h < 0.0) ? h + 360.0 : 0.0;
    }

    // fmax/fmin return the non-NaN operand, so NaN inputs collapse to 0.
    s = std::fmin(std::fmax(s, 0.0), 1.0);
    v = std::fmin(std::fmax(v, 0.0), 1.0);

    const double c = v * s;                                              // chroma
    const double x = c * (1 - std::fabs(std::fmod(h / 60.0, 2) - 1));    // second-largest component
    const double m = v - c;                                              // offset that lifts all channels to the requested value

    double r_ = 0, g_ = 0, b_ = 0;
    if (h < 60)       { r_ = c; g_ = x; b_ = 0; }
    else if (h < 120) { r_ = x; g_ = c; b_ = 0; }
    else if (h < 180) { r_ = 0; g_ = c; b_ = x; }
    else if (h < 240) { r_ = 0; g_ = x; b_ = c; }
    else if (h < 300) { r_ = x; g_ = 0; b_ = c; }
    else              { r_ = c; g_ = 0; b_ = x; }

    r = static_cast<unsigned int>((r_ + m) * 255);
    g = static_cast<unsigned int>((g_ + m) * 255);
    b = static_cast<unsigned int>((b_ + m) * 255);
}

/**
 * @brief Creates a color palette using HSV color space and converts it to RGB.
 * The palette is designed to smoothly transition colors for visualization of the Mandelbrot set.
 * @param numColors The number of colors in the palette.
 * @return std::vector<sf::Color> A vector containing the generated color palette.
 **/
[[assume(numColors > 500)]] std::vector<sf::Color> createHSVPalette(int numColors) {
    std::vector<sf::Color> palette;
    for (int i = 0; i < numColors; ++i) {
        double t = static_cast<double>(i) / numColors;

        double hue;
        double saturation = 1.0;
        double value;

        hue = std::pow((t - 0.2) / 0.8, 0.5) * 360.0;
        value = 1.0;

        unsigned int r, g, b;
        HSVtoRGB(hue, saturation, value, r, g, b);
        palette.push_back(sf::Color(r, g, b, 255));
    }
    return palette;
}

std::vector<sf::Color> CreateBlackOWhitePalette(int numColors) {
    std::vector<sf::Color> palette;
    for (int i = 0; i < numColors; ++i) {
        double t = static_cast<double>(i) / numColors;
        unsigned int r, g, b;
        r = static_cast<unsigned int>(t * 255);
        g = static_cast<unsigned int>(t * 255);
        b = static_cast<unsigned int>(t * 255);
        palette.push_back(sf::Color(r, g, b, 255));
    }
    return palette;
}

/**
 * @brief Resolves a 2x-supersampled RGBA image into the output image (4x SSAA).
 *
 * Both images use 4 bytes per pixel. Each thread handles one destination pixel
 * (x, y) taken from its 2D block/thread index and averages the R, G and B
 * channels of the source pixels (2x..2x+1, 2y..2y+1). Source pixels outside
 * the source image are skipped and the average is taken over the samples that
 * were actually read, so border pixels are not darkened. Alpha is written as 255.
 *
 * Threads whose (x, y) falls outside the destination image do nothing, so the
 * launch may cover more threads than destination pixels.
 *
 * @param src         Source image, src_width * src_height * 4 bytes.
 * @param dest        Destination image, dest_width * dest_height * 4 bytes.
 * @param src_width   Source width in pixels.
 * @param src_height  Source height in pixels.
 * @param dest_width  Destination width in pixels.
 * @param dest_height Destination height in pixels.
 */
__global__
void ANTIALIASING_SSAA4(unsigned char* src, unsigned char* dest, int src_width, int src_height, int dest_width, int dest_height) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= dest_width || y >= dest_height) {
        return;
    }

    int r = 0, g = 0, b = 0;
    int samples = 0;
    for (int dy = 0; dy < 2; ++dy) {
        const int src_y = y * 2 + dy;
        if (src_y >= src_height) {
            continue;
        }
        for (int dx = 0; dx < 2; ++dx) {
            const int src_x = x * 2 + dx;
            if (src_x >= src_width) {
                continue;
            }
            const int src_index = (src_y * src_width + src_x) * 4;
            r += src[src_index];
            g += src[src_index + 1];
            b += src[src_index + 2];
            ++samples;
        }
    }

    // With no samples the sums are all zero; avoid dividing by zero.
    if (samples == 0) {
        samples = 1;
    }

    const int dest_index = (y * dest_width + x) * 4;
    dest[dest_index]     = static_cast<unsigned char>(r / samples);
    dest[dest_index + 1] = static_cast<unsigned char>(g / samples);
    dest[dest_index + 2] = static_cast<unsigned char>(b / samples);
    dest[dest_index + 3] = 255;
}

CUDA_ComputationFunctions.cuh

#pragma once
#include <cuda_runtime.h>
#include <TGUI/Backend/SFML-Graphics.hpp>

// Device functions (inline definitions)
/// Real part of the complex product (real1 + i*imag1) * (real2 + i*imag2).
inline __device__ float complex_mult_real(float real1, float imag1, float real2, float imag2) {
    const float real_product = real1 * real2;
    const float imag_product = imag1 * imag2;
    return real_product - imag_product;
}

/// Imaginary part of the complex product (real1 + i*imag1) * (real2 + i*imag2).
///
/// The previous body returned 2 * real1 * imag2, which ignores imag1 and real2
/// and is only correct when both operands are the same number (squaring). For
/// that case this general formula yields the same mathematical value.
inline __device__ float complex_mult_imag(float real1, float imag1, float real2, float imag2) {
    return real1 * imag2 + imag1 * real2;
}

/// Absolute value of x: returns x when x >= 0 and -x otherwise.
inline __device__ float dev_abs(float x) {
    if (x >= 0) {
        return x;
    }
    return -x;
}

/// Base-2 logarithm of x.
///
/// Uses the single-precision log2f() directly instead of computing
/// log(x) / log(2), which costs two logarithms and a division per call.
inline __device__ float dev_log2(float x) {
    return log2f(x);
}

/// Single-precision square root of x.
inline __device__ float dev_sqrt(float x) {
    const float root = sqrtf(x);
    return root;
}

/// Squared magnitude of the complex number real + i*imag (no square root taken).
inline __device__ float complex_abs2(float real, float imag) {
    const float real_squared = real * real;
    const float imag_squared = imag * imag;
    return real_squared + imag_squared;
}

/// Maps an iteration count to a gradient position.
///
/// Returns 0 when current_iteration has reached max_iter; otherwise returns
/// sqrt(current_iteration) / sqrt(max_iter). Taking the square roots spreads
/// the colors further around the set than a plain linear ratio would.
inline __device__ float Gradient(float current_iteration, float max_iter) {
    if (current_iteration >= max_iter) {
        return 0.0f;
    }
    const float root_current = dev_sqrt(current_iteration);
    const float root_limit = dev_sqrt(max_iter);
    return root_current / root_limit;
}

/// Looks up palette[index], clamping index into [0, paletteSize - 1] first.
inline __device__ sf::Color getPaletteColor(int index, int paletteSize, sf::Color* palette) {
    if (index < 0) {
        return palette[0];
    }
    if (index >= paletteSize) {
        return palette[paletteSize - 1];
    }
    return palette[index];
}

// Kernel that averages 2x2 blocks of a 4-bytes-per-pixel source image into a
// destination image; one thread per destination pixel. Defined in
// CUDA_ComputationFunctions.cu.
__global__ void ANTIALIASING_SSAA4(unsigned char* src, unsigned char* dest, int src_width, int src_height, int dest_width, int dest_height);

// Host utility functions (declarations only); the definitions live in
// CUDA_ComputationFunctions.cu.
// cudaCheckError: reports the error returned by cudaGetLastError() and exits
// the process when it is not cudaSuccess.
extern void cudaCheckError();
// HSVtoRGB: converts hue (degrees), saturation and value (both 0..1) into
// red/green/blue channels scaled to 0..255, written through r, g and b.
extern void HSVtoRGB(double h, double s, double v, unsigned int& r, unsigned int& g, unsigned int& b);

FractalClass.cu

#include "FractalClass.cuh"
#include <TGUI/Backend/SFML-Graphics.hpp>
#include <iostream>
#include <thread>
#include <functional>

// Global flag, declared `extern` in FractalClass.cuh. Within the code shown in
// this file it is only referenced by the commented-out checkEventAndSetFlag().
// NOTE(review): presumably meant to signal that a render is in flight on
// another thread — confirm before relying on it; it is a plain bool, not atomic.
bool running_other_core = false;

/**
 * @brief Resizes an image with nearest-neighbor sampling.
 *
 * Every target pixel (col, row) copies the source pixel at
 * (floor(col * srcWidth / targetWidth), floor(row * srcHeight / targetHeight)).
 *
 * @param source       Image to sample from.
 * @param targetWidth  Width of the result in pixels.
 * @param targetHeight Height of the result in pixels.
 * @return A new targetWidth x targetHeight image.
 */
sf::Image stretchImageNearestNeighbor(const sf::Image& source, unsigned int targetWidth, unsigned int targetHeight) {
    sf::Image stretched({ targetWidth, targetHeight }, sf::Color::Black);

    // Source pixels advanced per target pixel along each axis.
    const float stepX = static_cast<float>(source.getSize().x) / targetWidth;
    const float stepY = static_cast<float>(source.getSize().y) / targetHeight;

    for (unsigned int row = 0; row < targetHeight; ++row) {
        const unsigned int sourceRow = static_cast<unsigned int>(row * stepY);
        for (unsigned int col = 0; col < targetWidth; ++col) {
            const unsigned int sourceCol = static_cast<unsigned int>(col * stepX);
            stretched.setPixel({ col, row }, source.getPixel({ sourceCol, sourceRow }));
        }
    }

    return stretched;
}

/**
 * @brief Sets up the default view, builds the color palette and allocates all
 *        CUDA resources used by render() and draw().
 *
 * The Julia specialisation gets a 5000-entry grayscale palette and a different
 * horizontal offset; every other fractal gets a 20000-entry HSV palette.
 *
 * Buffers hold RGBA8 frames (4 bytes per pixel): the supersampled frame is
 * 1600x1200 and the resolved/preview frame is 800x600. The previous code
 * multiplied those byte counts by an extra sizeof(uint32_t)/sizeof(char4),
 * allocating four times more device and pinned host memory than any visible
 * kernel or copy touches.
 *
 * CUDA failures are reported on std::cerr; the affected pointer stays null so
 * the destructor can still run safely.
 */
template <typename Derived>
FractalBase<Derived>::FractalBase()
    : max_iterations(300), basic_zoom_x(240.0), basic_zoom_y(240.0),
    zoom_x(basic_zoom_x), zoom_y(basic_zoom_y),
    x_offset(3.0), y_offset(1.85),
    zoom_factor(1.0), zoom_speed(0.1),
    zoom_scale(1.0), width(400), height(300)
{
    constexpr size_t kBytesPerPixel = 4;                                        // RGBA8
    constexpr size_t kFullResBytes  = size_t{1600} * 1200 * kBytesPerPixel;     // supersampled frame
    constexpr size_t kResolvedBytes = size_t{800} * 600 * kBytesPerPixel;       // on-screen frame

    // Logs a failed CUDA call; allocation continues so the object stays destructible.
    const auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            std::cerr << "FractalBase: " << what << " failed: " << cudaGetErrorString(status) << "\n";
        }
    };

    // Known state for every pointer, even if an allocation below fails.
    d_palette = nullptr;
    d_pixels = nullptr;
    pixels = nullptr;
    ssaa_buffer = nullptr;
    compressed = nullptr;
    stopFlagDevice = nullptr;

    if constexpr (std::is_same<Derived, fractals::julia>::value) {
        x_offset = 2.5;
        //y_offset = 1.25;
        palette = CreateBlackOWhitePalette(5000);
    }
    else {
        palette = createHSVPalette(20000);
    }
    paletteSize = static_cast<int>(palette.size());

    // Device-side stop flag, mirrored on the host in stopFlagCpu.
    bool flag = true;
    check(cudaMalloc(&stopFlagDevice, sizeof(bool)), "cudaMalloc(stopFlagDevice)");
    check(cudaMemcpy(stopFlagDevice, &flag, sizeof(bool), cudaMemcpyHostToDevice), "cudaMemcpy(stopFlagDevice)");
    stopFlagCpu.store(flag);

    // Palette copy for the kernels.
    const size_t paletteBytes = palette.size() * sizeof(sf::Color);
    check(cudaMalloc(&d_palette, paletteBytes), "cudaMalloc(d_palette)");
    check(cudaMemcpy(d_palette, palette.data(), paletteBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_palette)");

    // Frame the fractal kernel writes into (up to 1600x1200).
    check(cudaMallocManaged(&d_pixels, kFullResBytes), "cudaMallocManaged(d_pixels)");

    // Pinned host copy of an unresolved frame.
    check(cudaMallocHost(&pixels, kFullResBytes), "cudaMallocHost(pixels)");

    // Device buffer holding the SSAA-resolved 800x600 frame.
    check(cudaMalloc(&ssaa_buffer, kResolvedBytes), "cudaMalloc(ssaa_buffer)");

    // Pinned host copy of the SSAA-resolved frame.
    check(cudaMallocHost(&compressed, kResolvedBytes), "cudaMallocHost(compressed)");

    check(cudaStreamCreate(&stream), "cudaStreamCreate");

    check(cudaEventCreate(&start_rendering), "cudaEventCreate(start_rendering)");
    check(cudaEventCreate(&stop_rendering), "cudaEventCreate(stop_rendering)");
}

/**
 * @brief Releases every CUDA resource acquired by the constructor.
 *
 * The stream is drained first so that no queued kernel or asynchronous copy
 * is still using a buffer when it is freed. `ssaa_buffer` is freed here as
 * well; it was previously allocated in the constructor but never released.
 */
template <typename Derived>
FractalBase<Derived>::~FractalBase() {
    cudaStreamSynchronize(stream);

    cudaFree(d_palette);
    cudaFree(d_pixels);
    cudaFree(ssaa_buffer);
    cudaFree(stopFlagDevice);
    cudaFreeHost(pixels);
    cudaFreeHost(compressed);

    cudaEventDestroy(start_rendering);
    cudaEventDestroy(stop_rendering);
    cudaStreamDestroy(stream);
}

/// @return The current iteration limit.
template <typename Derived>
unsigned int FractalBase<Derived>::get_max_iters() {
    return this->max_iterations;
}

/// @return True between start_dragging() and stop_dragging().
template <typename Derived>
bool FractalBase<Derived>::get_is_dragging() {
    return this->is_dragging;
}

/// Replaces the iteration limit with max_iters.
template <typename Derived>
void FractalBase<Derived>::set_max_iters(unsigned int max_iters) {
    this->max_iterations = max_iters;
}

/// @return The current horizontal view offset.
template <typename Derived>
double FractalBase<Derived>::get_x_offset() {
    return this->x_offset;
}

/// @return The current vertical view offset.
template <typename Derived>
double FractalBase<Derived>::get_y_offset() {
    return this->y_offset;
}

/// @return The current horizontal zoom (basic_zoom_x scaled by the zoom factor).
template <typename Derived>
double FractalBase<Derived>::get_zoom_x() {
    return this->zoom_x;
}

/// @return The current vertical zoom (basic_zoom_y scaled by the zoom factor).
template <typename Derived>
double FractalBase<Derived>::get_zoom_y() {
    return this->zoom_y;
}

/// @return The resolution scale of the last render (1.0 or 2.0 as set by render()).
template <typename Derived>
double FractalBase<Derived>::get_zoom_scale() {
    return this->zoom_scale;
}


// Superseded: the polling helper below served well in the past, but it is being
// replaced by a better version built on atomic operations. Kept for reference.
//template <typename Derived>
//void FractalBase<Derived>::checkEventAndSetFlag(cudaEvent_t event) {
//    while (cudaEventQuery(event) == cudaErrorNotReady) {
//        std::this_thread::sleep_for(std::chrono::milliseconds(1));
//    }
//    bool flag = false;
//    running_other_core = false;
//    cudaMemcpy(stopFlagDevice, &flag, sizeof(bool), cudaMemcpyHostToDevice);
//}

/**
 * @brief Queues a Mandelbrot render on this object's CUDA stream.
 *
 * `render_state::good` renders an 800x600 preview without anti-aliasing. Any
 * other quality (including `bad`) renders a 1600x1200 frame that draw()
 * later resolves with SSAA. When the resolution changes, the offsets are
 * recomputed from the old and new frame extents.
 *
 * The kernel launch is asynchronous; this function returns as soon as the
 * work has been queued. Launch errors are reported on std::cout.
 *
 * @note The view parameters are converted to float for the kernel, so the
 *       usable zoom depth is limited by single precision.
 *
 * @param quality Requested quality level.
 */
template <>
void FractalBase<fractals::mandelbrot>::render(render_state quality) {
    const bool preview = (quality == render_state::good);
    const unsigned int new_width = preview ? 800u : 1600u;
    const unsigned int new_height = preview ? 600u : 1200u;
    const double new_zoom_scale = preview ? 1.0 : 2.0;
    antialiasing = !preview;

    if (width != new_width || height != new_height) {
        const double center_x = x_offset + (width / (zoom_x * zoom_scale)) / 2.0;
        const double center_y = y_offset + (height / (zoom_y * zoom_scale)) / 2.0;

        zoom_scale = new_zoom_scale;
        width = new_width;
        height = new_height;

        x_offset = center_x - (width / (zoom_x * zoom_scale)) / 2.0;
        y_offset = center_y - (height / (zoom_y * zoom_scale)) / 2.0;
    }

    const double render_zoom_x = zoom_x * zoom_scale;
    const double render_zoom_y = zoom_y * zoom_scale;

    // The launch syntax is <<<grid, block>>>. The geometry is unchanged from
    // the previous code (which passed the same values under swapped names):
    // a fixed 50x50 grid whose blocks are sized to tile the frame, i.e.
    // 16x12 threads for 800x600 and 32x24 threads for 1600x1200.
    const dim3 grid(50, 50);
    const dim3 block(
        (width + grid.x - 1) / grid.x,
        (height + grid.y - 1) / grid.y
    );

    const size_t len = static_cast<size_t>(width) * height * 4;
    cudaEventRecord(start_rendering, stream);
    fractal_rendering<<<grid, block, 0, stream>>>(
        d_pixels, len, width, height, float(render_zoom_x), float(render_zoom_y),
        float(x_offset), float(y_offset), d_palette, paletteSize,
        float(max_iterations), stopFlagDevice
        );

    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cout << "fractal mandelbrot: " << cudaGetErrorString(err) << "\n";
    }
    ++counter;
}


/**
 * @brief Queues a Julia-set render for the seed zx + i*zy on this object's
 *        CUDA stream.
 *
 * `render_state::good` renders an 800x600 preview without anti-aliasing. Any
 * other quality renders a 1600x1200 frame that draw() later resolves with
 * SSAA. When the resolution changes, the offsets are recomputed from the old
 * and new frame extents.
 *
 * The kernel launch is asynchronous; this function returns as soon as the
 * work has been queued. Launch errors are reported on std::cout.
 *
 * @note The view parameters and the seed are converted to float for the
 *       kernel, so the usable zoom depth is limited by single precision.
 *
 * @param quality Requested quality level.
 * @param zx      Real part of the Julia seed.
 * @param zy      Imaginary part of the Julia seed.
 */
template <>
void FractalBase<fractals::julia>::render(
    render_state quality,
    double zx, double zy
) {
    const bool preview = (quality == render_state::good);
    const unsigned int new_width = preview ? 800u : 1600u;
    const unsigned int new_height = preview ? 600u : 1200u;
    const double new_zoom_scale = preview ? 1.0 : 2.0;
    antialiasing = !preview;

    if (width != new_width || height != new_height) {
        const double center_x = x_offset + (width / (zoom_x * zoom_scale)) / 2.0;
        const double center_y = y_offset + (height / (zoom_y * zoom_scale)) / 2.0;

        zoom_scale = new_zoom_scale;
        width = new_width;
        height = new_height;

        x_offset = center_x - (width / (zoom_x * zoom_scale)) / 2.0;
        y_offset = center_y - (height / (zoom_y * zoom_scale)) / 2.0;
    }

    const double render_zoom_x = zoom_x * zoom_scale;
    const double render_zoom_y = zoom_y * zoom_scale;

    // The launch syntax is <<<grid, block>>>. The geometry is unchanged from
    // the previous code (which passed the same values under swapped names):
    // a fixed 64x64 grid whose blocks are sized to cover the frame, i.e.
    // 13x10 threads for 800x600 and 25x19 threads for 1600x1200. The kernel's
    // own bounds check discards the threads that fall outside the frame.
    const dim3 grid(64, 64);
    const dim3 block(
        (width + grid.x - 1) / grid.x,
        (height + grid.y - 1) / grid.y
    );

    const size_t len = static_cast<size_t>(width) * height * 4;
    cudaEventRecord(start_rendering, stream);
    fractal_rendering<<<grid, block, 0, stream>>>(
        d_pixels, len, width, height, float(render_zoom_x), float(render_zoom_y),
        float(x_offset), float(y_offset), d_palette, paletteSize,
        float(max_iterations), stopFlagDevice, float(zx), float(zy)
        );

    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cout << "fractal julia: " << cudaGetErrorString(err) << "\n";
    }
    ++counter;
}


/**
 * @brief Copies the most recently rendered frame to the host and draws it as
 *        an 800x600 sprite using this object's transform.
 *
 * With anti-aliasing enabled, the 1600x1200 device frame is first resolved to
 * 800x600 by ANTIALIASING_SSAA4 and the resolved frame is copied back.
 * Otherwise the device frame is copied back as-is.
 *
 * In both paths the stream is synchronised before the host buffer is read:
 * cudaMemcpyAsync only queues the copy, so reading the pinned buffer without
 * waiting would show a stale or partially written frame.
 *
 * @param target Render target to draw onto.
 * @param states Render states; this object's transform is appended to them.
 */
template <typename Derived>
void FractalBase<Derived>::draw(sf::RenderTarget& target, sf::RenderStates states) const {
    if (antialiasing) {
        // One thread per output pixel: a 50x50 grid of 16x12-thread blocks
        // covers the 800x600 destination exactly. (Launch syntax is
        // <<<grid, block>>>; the previous launch was sized for the 1600x1200
        // source, so three quarters of its threads did nothing.)
        const dim3 grid(50, 50);
        const dim3 block(
            (800 + grid.x - 1) / grid.x,
            (600 + grid.y - 1) / grid.y
        );

        // Note: this interval only measures the time needed to queue the
        // kernel, not the kernel's execution time.
        auto start = std::chrono::high_resolution_clock::now();
        ANTIALIASING_SSAA4<<<grid, block, 0, stream>>>(d_pixels, ssaa_buffer, 1600, 1200, 800, 600);
        auto end = std::chrono::high_resolution_clock::now();

        auto start_copying = std::chrono::high_resolution_clock::now();
        cudaMemcpyAsync(compressed, ssaa_buffer, 800 * 600 * 4 * sizeof(unsigned char), cudaMemcpyDeviceToHost, stream);

        std::cout << "SSAA4 time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
        cudaStreamSynchronize(stream);

        sf::Image image({ 800, 600 }, compressed);
        sf::Texture texture(image);
        sf::Sprite sprite(texture);

        auto end_copying = std::chrono::high_resolution_clock::now();
        std::cout << "SSAA4 copying time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end_copying - start_copying).count() << " ms" << std::endl;

        sprite.setPosition({ 0, 0 });
        states.transform *= getTransform();
        target.draw(sprite, states);
    }
    else {
        cudaEventRecord(stop_rendering, stream);
        cudaMemcpyAsync(pixels, d_pixels, static_cast<size_t>(width) * height * 4 * sizeof(unsigned char), cudaMemcpyDeviceToHost, stream);

        // Wait for the kernel and the copy before the CPU reads `pixels`.
        cudaStreamSynchronize(stream);

        sf::Image image({ 800, 600 }, pixels);
        sf::Texture texture(image);
        sf::Sprite sprite(texture);

        sprite.setPosition({ 0, 0 });
        states.transform *= getTransform();
        target.draw(sprite, states);
    }
}
/**
 * @brief Zooms the view in or out around the mouse cursor.
 *
 * The zoom factor is multiplied by (1 + wheel_delta * zoom_speed) and clamped
 * to [0.01, 1e14]. The offsets are then adjusted so that the point under the
 * cursor stays where it is.
 *
 * The cursor is converted from window coordinates to view coordinates by
 * subtracting this object's position (as set with setPosition()). This
 * replaces the previous hard-coded `1920 - 800` shift for the Julia view and
 * yields the same value for the layout used in main.cpp.
 *
 * @param wheel_delta Direction and magnitude of the mouse wheel scroll.
 * @param mouse_pos   Cursor position in window coordinates.
 */
template <typename Derived>
void FractalBase<Derived>::handleZoom(float wheel_delta, const sf::Vector2i mouse_pos) {
    const double old_zoom_x = zoom_x;
    const double old_zoom_y = zoom_y;

    const double zoom_change = 1.0 + wheel_delta * zoom_speed;
    zoom_factor *= zoom_change;
    zoom_factor = std::max(std::min(zoom_factor, 100000000000000.0), 0.01);

    zoom_x = basic_zoom_x * zoom_factor;
    zoom_y = basic_zoom_y * zoom_factor;

    // Cursor position relative to the top-left corner of this view.
    const sf::Vector2f origin = getPosition();
    const double image_mouse_x = mouse_pos.x - static_cast<double>(origin.x);
    const double image_mouse_y = mouse_pos.y - static_cast<double>(origin.y);

    // Keep the point under the cursor fixed while the scale changes.
    x_offset += image_mouse_x / zoom_x - image_mouse_x / old_zoom_x;
    y_offset += image_mouse_y / zoom_y - image_mouse_y / old_zoom_y;
}

/// Begins a drag gesture and remembers mouse_pos as its reference point.
template <typename Derived>
void FractalBase<Derived>::start_dragging(sf::Vector2i mouse_pos) {
    drag_start_pos = mouse_pos;
    is_dragging = true;
}


/**
 * @brief Pans the view by the distance the cursor moved since the last call.
 *
 * Does nothing unless a drag is in progress. The cursor delta is in screen
 * pixels, and one screen pixel spans 1 / zoom units regardless of the
 * resolution the last frame was rendered at, so the delta is divided by the
 * zoom only. This matches handleZoom() and the seed computation in main.cpp;
 * the previous code also divided by zoom_scale, which halved the pan distance
 * whenever the last render was the 2x supersampled one.
 *
 * @param mouse_pos Current cursor position; becomes the new drag reference point.
 */
template <typename Derived>
void FractalBase<Derived>::dragging(sf::Vector2i mouse_pos) {
    if (!is_dragging) return;

    const sf::Vector2i delta_pos = mouse_pos - drag_start_pos;
    const double delta_real = static_cast<double>(delta_pos.x) / zoom_x;
    const double delta_imag = static_cast<double>(delta_pos.y) / zoom_y;

    x_offset += delta_real;
    y_offset += delta_imag;
    drag_start_pos = mouse_pos;
}

/// Ends the current drag gesture; later dragging() calls are ignored until start_dragging() is called again.
template <typename Derived>
void FractalBase<Derived>::stop_dragging()
{
    this->is_dragging = false;
}

/// Shifts the view offsets by `offset`, divided per axis by (zoom * zoom_scale).
template <typename Derived>
void FractalBase<Derived>::move_fractal(sf::Vector2i offset) {
    const double scale_x = zoom_x * zoom_scale;
    const double scale_y = zoom_y * zoom_scale;

    x_offset += offset.x / scale_x;
    y_offset += offset.y / scale_y;
}

template class FractalBase<fractals::mandelbrot>;
template class FractalBase<fractals::julia>;

FractalClass.cuh

#pragma once
#include "fractals/mandelbrot.cuh"
#include "fractals/julia.cuh"
#include "CUDA_ComputationFunctions.cuh"
#include <TGUI/Backend/SFML-Graphics.hpp>
#include <cuda_runtime.h>
#include <atomic>
#include <vector>

/// Empty tag types used as the template argument of FractalBase to select a fractal.
namespace fractals {
    /// Selects the Mandelbrot set.
    struct mandelbrot {
    };

    /// Selects the Julia set.
    struct julia {
    };

    /// Tag for the burning ship fractal.
    struct burning_ship {
    };
}

extern bool running_other_core;

/// Quality levels a fractal view can be rendered at, in increasing order.
enum class render_state {
    bad  = 0,  ///< Lowest level; the initial state of the Mandelbrot view.
    good = 1,  ///< Preview level.
    best = 2   ///< Final level; a view that has reached it is not re-rendered until input arrives.
};

/**
 * @brief A zoomable, pannable fractal view rendered on the GPU and drawn as an
 *        SFML drawable.
 *
 * @tparam Derived Tag type from namespace `fractals` selecting which fractal
 *                 this view renders.
 *
 * The object owns CUDA device buffers, pinned host buffers, a stream and two
 * events. It is therefore not copyable; the copy operations are deleted
 * explicitly (they were already unusable because of the std::atomic member).
 */
template <typename Derived>
class FractalBase : public sf::Transformable, public sf::Drawable {
protected:
    unsigned int max_iterations;

    // Zoom at zoom_factor == 1.
    double basic_zoom_x;
    double basic_zoom_y;

    // Current zoom: basic zoom multiplied by zoom_factor.
    double zoom_x;
    double zoom_y;

    double x_offset;
    double y_offset;

    double zoom_factor;
    // Fraction by which one wheel step changes zoom_factor.
    double zoom_speed;

    sf::Vector2i drag_start_pos;
    bool is_dragging = false;

    // Device copy of `palette`.
    sf::Color* d_palette = nullptr;

    std::vector<sf::Color> palette;
    int paletteSize = 0;

    // Frame written by the fractal kernel.
    unsigned char* d_pixels = nullptr;
    size_t ssaa_buffer_size = 0;

    // Pinned host copy of an unresolved frame.
    unsigned char* pixels = nullptr;
    size_t ssaa_limit_dst = 0;

    // Device buffer holding the SSAA-resolved frame.
    unsigned char* ssaa_buffer = nullptr;

    // Pinned host copy of the SSAA-resolved frame.
    unsigned char* compressed = nullptr;

    bool is_paused = false;

    render_state state = render_state::good;
    bool antialiasing = true;

    // Device-side stop flag passed to the fractal kernel, and its host mirror.
    bool* stopFlagDevice = nullptr;
    std::atomic<bool> stopFlagCpu;

    // Resolution scale of the current frame (set by render()).
    double zoom_scale;

    // Size in pixels of the current frame (set by render()).
    unsigned int width;
    unsigned int height;

    cudaStream_t stream = nullptr;
    cudaEvent_t start_rendering = nullptr, stop_rendering = nullptr;

    // Number of render() calls, wrapping at 256.
    unsigned char counter = 0;
public:
    FractalBase();
    ~FractalBase();

    // Owns raw CUDA resources: copying would free them twice.
    FractalBase(const FractalBase&) = delete;
    FractalBase& operator=(const FractalBase&) = delete;

    /*@returns amount of max_iterations*/
    unsigned int get_max_iters();



    /*@returns mouse currently dragging state*/
    bool get_is_dragging();

    /*@returns current horizontal view offset*/
    double get_x_offset();

    /*@returns current vertical view offset*/
    double get_y_offset();

    /*@returns current horizontal zoom*/
    double get_zoom_x();

    /*@returns current vertical zoom*/
    double get_zoom_y();

    /*@returns resolution scale of the current frame*/
    double get_zoom_scale();


    /*@brief sets max_iterations to new given number
    **@param max_iters
    */
    void set_max_iters(unsigned int max_iters);

    /**
     * @brief Queues a render of the fractal on the GPU.
     * Defined for the Mandelbrot specialisation. The frame is rendered at
     * the resolution selected by `quality` and stays on the device until
     * draw() copies it back.
     *
     * @param quality Requested quality level.
     */
    void render(render_state quality);

    /**
     * @brief Queues a render of the Julia set for a given seed on the GPU.
     * Defined for the Julia specialisation.
     *
     * @param quality Requested quality level.
     * @param mouse_x Real part of the Julia seed.
     * @param mouse_y Imaginary part of the Julia seed.
     */
    void render(
        render_state quality,
        double mouse_x, double mouse_y
    );

    /**
     * @brief Draws the rendered fractal image onto the SFML render target.
     * @param target The SFML render target (such as a window) where the image is drawn.
     * @param states SFML render states used to control how the drawing is done.
     */
    void draw(sf::RenderTarget& target, sf::RenderStates states) const override;

    /**
     * @brief Handles zoom functionality based on mouse wheel input.
     * @param wheel_delta The direction and magnitude of the mouse wheel scroll.
     * @param mouse_pos The current mouse position in window coordinates.
     */
    void handleZoom(float wheel_delta, const sf::Vector2i mouse_pos);

    /**
     * @brief Starts the dragging operation when the mouse button is pressed.
     * @param mouse_pos The current mouse position.
     */
    void start_dragging(sf::Vector2i mouse_pos);

    /**
     * @brief Handles the dragging operation while the mouse is moved and the button is held down.
     * @param mouse_pos The current mouse position.
     */
    void dragging(sf::Vector2i mouse_pos);

    /**
     * @brief Stops the dragging operation when the mouse button is released.
     */
    void stop_dragging();

    /*@brief moves fractal by given coords
    * @param offset: x, y
    */
    void move_fractal(sf::Vector2i offset);
};

julia.cu

#include "julia.cuh"
#include <iostream>

/**
 * @brief CUDA kernel that renders a Julia set into an RGBA pixel buffer.
 *
 * Each thread handles the pixel (x, y) derived from its block and thread index.
 * Starting from the pixel's own coordinate it repeatedly applies z <- z*z + c
 * (through the complex_mult_* helpers) until |z|^2 reaches 4 or the iteration
 * budget is used up, then writes the colour as four bytes (R, G, B, 255).
 *
 * @param pixels          Output buffer; 4 bytes are written per pixel.
 * @param size_of_pixels  Size of @p pixels in bytes. Used to derive how many
 *                        consecutive pixels each thread fills and to keep
 *                        writes inside the buffer.
 * @param width           Image width in pixels.
 * @param height          Image height in pixels.
 * @param zoom_x          Divisor applied to the pixel x coordinate.
 * @param zoom_y          Divisor applied to the pixel y coordinate.
 * @param x_offset        Value subtracted from the scaled x coordinate.
 * @param y_offset        Value subtracted from the scaled y coordinate.
 * @param d_palette       Colour palette passed to getPaletteColor.
 * @param paletteSize     Number of entries in @p d_palette.
 * @param maxIterations   Iteration budget per pixel.
 * @param stopFlagDevice  Cancellation flag; a thread that sees it set returns
 *                        without writing its pixel.
 * @param cReal           Real part of the constant added on every iteration.
 * @param cImaginary      Imaginary part of the constant added on every iteration.
 */
__global__ void fractal_rendering(
    unsigned char* pixels, size_t size_of_pixels, int width, int height,
    float zoom_x, float zoom_y, float x_offset, float y_offset,
    sf::Color* d_palette, int paletteSize, float maxIterations, bool* stopFlagDevice,
    float cReal, float cImaginary) {

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Only thread (0,0) clears the flag. Other threads are not synchronised
    // with this store, so the reset is best-effort.
    if (x == 0 && y == 0)
        *stopFlagDevice = false;

    // Computed in size_t so a large image cannot overflow int arithmetic.
    const size_t expected_size = static_cast<size_t>(width) * height * 4;

    // Ratio of the real buffer size to the size implied by width * height.
    const float scale_factor = (float)size_of_pixels / expected_size;

    if (x < width && y < height) {
        float new_real, new_imag;
        float z_real = x / zoom_x - x_offset;
        float z_imag = y / zoom_y - y_offset;
        float current_iteration = 0;

        while (complex_abs2(z_real, z_imag) < 4 && current_iteration < maxIterations) {
            new_real = complex_mult_real(z_real, z_imag, z_real, z_imag) + cReal;
            new_imag = complex_mult_imag(z_real, z_imag, z_real, z_imag) + cImaginary;
            z_real = new_real;
            z_imag = new_imag;
            current_iteration++;
            // NOTE(review): the flag is read through a plain (non-volatile)
            // pointer on every iteration; confirm that cancellation is actually
            // observed and that this read is not a hot-loop bottleneck.
            if (*stopFlagDevice) {
                // x is an int, so it must be printed with %d (was %f).
                printf("Rendering: width=%d, height=%d, x=%d, y=%d\n", width, height, x, y);
                return;
            }
        }

        unsigned char r, g, b;
        // ">=" rather than "==": the counter is a float that steps by 1, so it
        // only lands exactly on maxIterations when that value is a whole number.
        if (current_iteration >= maxIterations) {
            r = g = b = 0;
        }
        else {
            // Smooth iteration count
            current_iteration = current_iteration + 1 - dev_log2(dev_log2(dev_abs(dev_sqrt(complex_abs2(z_real, z_imag)))));
            // Calculate gradient value
            float gradient = Gradient(current_iteration, maxIterations);
            // Map gradient to palette index
            int index = static_cast<int>(gradient * (paletteSize - 1));
            sf::Color color = getPaletteColor(index, paletteSize, d_palette);
            r = color.r;
            g = color.g;
            b = color.b;
        }

        const size_t base_index = (static_cast<size_t>(y) * width + x) * 4;
        const float byte_span = scale_factor * 4;
        // NOTE(review): when the buffer is larger than width * height * 4 this
        // loop also fills the slots that follow this pixel, which belong to
        // neighbouring threads — confirm that this is intended.
        for (int i = 0; i < byte_span; i += 4) {
            const size_t index = base_index + static_cast<size_t>(i);
            // Never write past the end of the buffer.
            if (index + 3 >= size_of_pixels)
                break;
            pixels[index] = r;
            pixels[index + 1] = g;
            pixels[index + 2] = b;
            pixels[index + 3] = 255;
        }
    }
    if (x == 0 && y == 0)
        *stopFlagDevice = false;

}

mandelbrot.cu

#include "mandelbrot.cuh"
#include <iostream>

/**
 * @brief CUDA kernel function to calculate and render the Mandelbrot set.
 * This kernel is executed in parallel by multiple threads on the GPU.
 * Each thread calculates the color of a single pixel based on its position
 * and the Mandelbrot set algorithm: starting from z = 0 it repeatedly applies
 * z <- z*z + c (through the complex_mult_* helpers), where c is the pixel's
 * coordinate, until |z|^2 reaches 4 or the iteration budget is used up.
 *
 * @param pixels Pointer to the pixel data buffer in device memory; 4 bytes
 *               (R, G, B, 255) are written per pixel.
 * @param size_of_pixels Size of the pixel buffer in bytes. Used to derive how
 *               many consecutive pixels each thread fills and to keep writes
 *               inside the buffer.
 * @param width Image width.
 * @param height Image height.
 * @param zoom_x Zoom level along the x-axis.
 * @param zoom_y Zoom level along the y-axis.
 * @param x_offset Offset in the x-direction to move the view.
 * @param y_offset Offset in the y-direction to move the view.
 * @param d_palette Color palette in device memory to color the Mandelbrot set.
 * @param paletteSize Size of the color palette.
 * @param maxIterations Maximum iterations for Mandelbrot calculation.
 * @param stopFlagDevice Cancellation flag; a thread that sees it set returns
 *               without writing its pixel.
 */
__global__ void fractal_rendering(
    unsigned char* pixels, size_t size_of_pixels, int width, int height,
    float zoom_x, float zoom_y, float x_offset, float y_offset,
    sf::Color* d_palette, int paletteSize, float maxIterations, bool* stopFlagDevice) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Only thread (0,0) clears the flag. Other threads are not synchronised
    // with this store, so the reset is best-effort.
    if (x == 0 && y == 0)
        *stopFlagDevice = false;

    // Computed in size_t so a large image cannot overflow int arithmetic.
    const size_t expected_size = static_cast<size_t>(width) * height * 4;

    // Ratio of the real buffer size to the size implied by width * height.
    const float scale_factor = (float)size_of_pixels / expected_size;

    if (x < width && y < height) {
        // The pixel's coordinate is the constant c of the iteration.
        const float real = x / zoom_x - x_offset;
        const float imag = y / zoom_y - y_offset;
        float new_real, new_imag;
        float z_real = 0.0f;
        float z_imag = 0.0f;
        float current_iteration = 0;

        while (complex_abs2(z_real, z_imag) < 4 && current_iteration < maxIterations) {
            new_real = complex_mult_real(z_real, z_imag, z_real, z_imag) + real;
            new_imag = complex_mult_imag(z_real, z_imag, z_real, z_imag) + imag;
            z_real = new_real;
            z_imag = new_imag;
            current_iteration++;
            // NOTE(review): the flag is read through a plain (non-volatile)
            // pointer on every iteration; confirm that cancellation is actually
            // observed and that this read is not a hot-loop bottleneck.
            if (*stopFlagDevice) {
                printf("Rendering: width=%i, height=%i, x=%i, y=%i\n", width, height, x, y);
                return;
            }
        }

        unsigned char r, g, b;
        // ">=" rather than "==": the counter is a float that steps by 1, so it
        // only lands exactly on maxIterations when that value is a whole number.
        if (current_iteration >= maxIterations) {
            r = g = b = 0;
        }
        else {
            // Smooth iteration count
            current_iteration = current_iteration + 1 - dev_log2(dev_log2(dev_abs(dev_sqrt(complex_abs2(z_real, z_imag)))));
            // Calculate gradient value
            float gradient = Gradient(current_iteration, maxIterations);
            // Map gradient to palette index
            int index = static_cast<int>(gradient * (paletteSize - 1));
            sf::Color color = getPaletteColor(index, paletteSize, d_palette);
            r = color.r;
            g = color.g;
            b = color.b;
        }

        const size_t base_index = (static_cast<size_t>(y) * width + x) * 4;
        const float byte_span = scale_factor * 4;
        // NOTE(review): when the buffer is larger than width * height * 4 this
        // loop also fills the slots that follow this pixel, which belong to
        // neighbouring threads — confirm that this is intended.
        for (int i = 0; i < byte_span; i += 4) {
            const size_t index = base_index + static_cast<size_t>(i);
            // Never write past the end of the buffer.
            if (index + 3 >= size_of_pixels)
                break;
            pixels[index] = r;
            pixels[index + 1] = g;
            pixels[index + 2] = b;
            pixels[index + 3] = 255;
        }
    }
    if (x == 0 && y == 0)
        *stopFlagDevice = false;
}

(Yes, I know that writing to a shared value from multiple threads in a kernel is bad, but it is just a demo for now; if it works, it works. I will fix it later.)

I would also like not only a review of the code, but optimization suggestions as well.

UPD1: Updated the rendering functions so that only the first global thread sets the flag to false, which decreases the chance of a data race and the number of times the variable is accessed in the kernel.

UPD2: As it turned out, GPUs are really slow with double-precision types. I changed double to float, and the average rendering time went down to 3 ms.

UPD3: I completely removed all synchronization, and now I have stable 60-70 FPS. However, the rendering becomes desynchronized and responds to dragging/zooming with a 5-10 frame delay on hard parts, with a lot of black zones. I prefer slightly lower FPS but real-time interaction.

Is there a way to track when the kernel finishes? Or, in my case, after every 5 kernels? I would synchronize the stream only if the delay exceeds 3 frames to maintain responsiveness.

github page

Stack Exchange Network

FractalRendering on GPU with CUDA

0

You must log in to answer this question.

Hot Network Questions

FractalRendering on GPU with CUDA

0

You must log in to answer this question.

Related

Hot Network Questions