Implement a multi-threaded C++ renderer for a decent speed boost.

2024-11-27 23:50:47 +01:00 · 2021-05-20 03:51:43 +00:00 · 2021-05-20 03:51:43 +00:00 · 0d648f1371
commit 0d648f1371
parent f15ba3c718
3 changed files with 156 additions and 19 deletions
--- a/bemani/format/afp/blend.py
+++ b/bemani/format/afp/blend.py
@ -153,7 +153,7 @@ except ImportError:
        miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
        maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
-        if maxx <= 0 or maxy <= 0:
+        if maxx <= minx or maxy <= miny:
            # This image is entirely off the screen!
            return img
--- a/bemani/format/afp/blendalt.pyx
+++ b/bemani/format/afp/blendalt.pyx
@ -1,3 +1,4 @@
 import multiprocessing
 from PIL import Image  # type: ignore
 from typing import Tuple
@ -43,7 +44,7 @@ cdef extern int affine_composite_fast(
    unsigned char *texdata,
    unsigned int texwidth,
    unsigned int texheight,
-    int single_threaded
+    unsigned int threads
 )
 def affine_composite(
@ -89,7 +90,7 @@ def affine_composite(
    miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
    maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
-    if maxx <= 0 or maxy <= 0:
+    if maxx <= minx or maxy <= miny:
        # This image is entirely off the screen!
        return img
@ -102,6 +103,7 @@ def affine_composite(
    cdef floatcolor_t c_multcolor = floatcolor_t(r=mult_color.r, g=mult_color.g, b=mult_color.b, a=mult_color.a)
    cdef matrix_t c_inverse = matrix_t(a=inverse.a, b=inverse.b, c=inverse.c, d=inverse.d, tx=inverse.tx, ty=inverse.ty)
    cdef point_t c_origin = point_t(x=origin.x, y=origin.y)
    cdef unsigned int threads = 1 if single_threaded else multiprocessing.cpu_count()
    # Call the C++ function.
    errors = affine_composite_fast(
@ -120,7 +122,7 @@ def affine_composite(
        texbytes,
        texwidth,
        texheight,
-        single_threaded,
+        threads,
    )
    if errors != 0:
        raise Exception("Error raised in C++!")
--- a/bemani/format/afp/blendaltimpl.cxx
+++ b/bemani/format/afp/blendaltimpl.cxx
@ -1,5 +1,9 @@
 #include <stdio.h>
 #include <math.h>
 #include <pthread.h>
 #include <list>
 #define MIN_THREAD_WORK 10
 extern "C"
 {
@ -45,6 +49,24 @@ extern "C"
        }
    } matrix_t;
    // Describes one rectangular chunk of compositing work. A work_t is filled
    // in by affine_composite_fast and consumed by chunk_composite_fast, either
    // called directly or through the chunk_composite_worker pthread entry point.
    typedef struct work {
        // Destination image pixels, indexed as imgx + (imgy * imgwidth).
        intcolor_t *imgdata;
        // Width of the destination image in pixels (the row stride of imgdata).
        unsigned int imgwidth;
        // Horizontal bounds of the chunk; minx is inclusive, maxx is exclusive.
        unsigned int minx;
        unsigned int maxx;
        // Vertical bounds of the chunk; miny is inclusive, maxy is exclusive.
        unsigned int miny;
        unsigned int maxy;
        // Source texture pixels, indexed as texx + (texy * texwidth).
        intcolor_t *texdata;
        // Texture dimensions, used to bounds-check the computed texture location.
        unsigned int texwidth;
        unsigned int texheight;
        // Matrix applied to each image coordinate to find the texture coordinate.
        matrix_t inverse;
        // Offset added to the transformed point before it is rounded.
        point_t origin;
        // Color and blend parameters forwarded unchanged to blend_point.
        intcolor_t add_color;
        floatcolor_t mult_color;
        int blendfunc;
        // Handle of the pthread running this chunk; NULL for the chunk that the
        // scheduling thread runs itself.
        pthread_t *thread;
    } work_t;
    // Round a floating point color component to the nearest integer and
    // limit it to the range 0-255 that fits in a single byte.
    inline unsigned char clamp(float color) {
        const float rounded = roundf(color);
        // Apply the lower bound first, then the upper bound.
        const double floored = fmax(0.0, rounded);
        const double limited = fmin(floored, 255.0);
        return limited;
    }
@ -184,6 +206,35 @@ extern "C"
        return blend_normal(dest_color, src_color);
    }
    // Composite the texture onto the image for every pixel inside the
    // rectangle [minx, maxx) x [miny, maxy) described by the work unit.
    // Each image pixel is mapped through the inverse matrix plus origin to
    // find its texture pixel; pixels that land outside the texture are left
    // untouched.
    void chunk_composite_fast(work_t *work) {
        for (unsigned int row = work->miny; row < work->maxy; row++) {
            // Offset of the start of this row within the image buffer.
            const unsigned int rowstart = row * work->imgwidth;

            for (unsigned int col = work->minx; col < work->maxx; col++) {
                // Map this image coordinate back into texture space.
                point_t texloc = work->inverse.multiply_point((point_t){(float)col, (float)row}).add(work->origin);
                int texx = roundf(texloc.x);
                int texy = roundf(texloc.y);

                // Only pixels that land inside the texture get blended.
                const bool inside = texx >= 0 && texy >= 0 && texx < (int)work->texwidth && texy < (int)work->texheight;
                if (inside) {
                    intcolor_t *dest = &work->imgdata[col + rowstart];
                    const unsigned int texoff = texx + (texy * work->texwidth);
                    *dest = blend_point(work->add_color, work->mult_color, work->texdata[texoff], *dest, work->blendfunc);
                }
            }
        }
    }
    void *chunk_composite_worker(void *arg) {
        work_t *work = (work_t *)arg;
        chunk_composite_fast(work);
        return NULL;
    }
    int affine_composite_fast(
        unsigned char *imgbytes,
        unsigned int imgwidth,
@ -200,31 +251,115 @@ extern "C"
        unsigned char *texbytes,
        unsigned int texwidth,
        unsigned int texheight,
-        int single_threaded
+        unsigned int threads
    ) {
        // Cast to a usable type.
        intcolor_t *imgdata = (intcolor_t *)imgbytes;
        intcolor_t *texdata = (intcolor_t *)texbytes;
-        for (unsigned int imgy = miny; imgy < maxy; imgy++) {
+        if (threads == 1 || (maxy - miny) < (MIN_THREAD_WORK * 2)) {
-            for (unsigned int imgx = minx; imgx < maxx; imgx++) {
+            // Just create a local work structure so we can call the common function.
-                // Determine offset.
+            work_t work;
-                unsigned int imgoff = imgx + (imgy * imgwidth);
+            work.imgdata = imgdata;
            work.imgwidth = imgwidth;
            work.minx = minx;
            work.maxx = maxx;
            work.miny = miny;
            work.maxy = maxy;
            work.texdata = texdata;
            work.texwidth = texwidth;
            work.texheight = texheight;
            work.inverse = inverse;
            work.origin = origin;
            work.add_color = add_color;
            work.mult_color = mult_color;
            work.blendfunc = blendfunc;
-                // Calculate what texture pixel data goes here.
+            chunk_composite_fast(&work);
-                point_t texloc = inverse.multiply_point((point_t){(float)imgx, (float)imgy}).add(origin);
+        } else {
-                int texx = roundf(texloc.x);
+            std::list<work_t *> workers;
-                int texy = roundf(texloc.y);
+            work_t *mywork = NULL;
-
+            unsigned int imgy = miny;
-                // If we're out of bounds, don't update.
+            unsigned int step = (maxy - miny) / threads;
-                if (texx < 0 or texy < 0 or texx >= (int)texwidth or texy >= (int)texheight) {
+            if (step < MIN_THREAD_WORK) {
-                    continue;
+                step = MIN_THREAD_WORK;
            }
-                // Blend it.
+            for (unsigned int worker = 0; worker < threads; worker++) {
-                unsigned int texoff = texx + (texy * texwidth);
+                // We are slightly different if this is the last worker, because
-                imgdata[imgoff] = blend_point(add_color, mult_color, texdata[texoff], imgdata[imgoff], blendfunc);
+                // it's going to this thread. Make sure it consumes the rest of the
                // work, as well as not getting a pthread. Make sure each thread
                // has a minimum amount of work so we don't waste pthread overhead
                // starting and stopping it. Because of this, make sure that the
                // last chunk we create is always our own.
                unsigned int me = 0;
                if (worker == (threads - 1) || (imgy + step) >= maxy) {
                    me = 1;
                }
                // Create storage for this worker.
                pthread_t *thread = me ? NULL : (pthread_t *)malloc(sizeof(pthread_t));
                work_t *work = (work_t *)malloc(sizeof(work_t));
                // Pass to it all of the params it needs.
                work->imgdata = imgdata;
                work->imgwidth = imgwidth;
                work->minx = minx;
                work->maxx = maxx;
                work->miny = imgy;
                work->maxy = me ? maxy : imgy + step;
                work->texdata = texdata;
                work->texwidth = texwidth;
                work->texheight = texheight;
                work->inverse = inverse;
                work->origin = origin;
                work->add_color = add_color;
                work->mult_color = mult_color;
                work->blendfunc = blendfunc;
                work->thread = thread;
                if (me)
                {
                    // This is the row for this thread.
                    mywork = work;
                    // Always exit here, we might not have actually scheduled
                    // the maximum permitted threads.
                    break;
                }
                else
                {
                    // Kick off the thread.
                    pthread_create(thread, NULL, chunk_composite_worker, work);
                    // Save the row so we can access it for scheduling.
                    workers.push_back(work);
                    // The next chunk of work is the next step.
                    imgy += step;
                }
            }
            // Now, run my own work.
            chunk_composite_fast(mywork);
            // Join on all threads once they're finished.
            std::list<work_t *>::iterator work = workers.begin();
            while(work != workers.end()) {
                // Join the thread.
                pthread_join(*((*work)->thread), NULL);
                // Free the memory we allocated.
                free((*work)->thread);
                free((*work));
                // Remove it from our bookkeeping.
                work = workers.erase(work);
            }
            // Free the memory we allocated.
            free(mywork);
        }
        return 0;