Implement a multi-threaded C++ renderer for a decent speed boost.

2025-01-31 12:13:49 +01:00 · 2021-05-20 03:51:43 +00:00 · 2021-05-20 03:51:43 +00:00 · 0d648f1371
commit 0d648f1371
parent f15ba3c718
3 changed files with 156 additions and 19 deletions
--- a/bemani/format/afp/blend.py
+++ b/bemani/format/afp/blend.py
@ -153,7 +153,7 @@ except ImportError:
        miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
        maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)

-        if maxx <= 0 or maxy <= 0:
+        if maxx <= minx or maxy <= miny:
            # This image is entirely off the screen!
            return img

--- a/bemani/format/afp/blendalt.pyx
+++ b/bemani/format/afp/blendalt.pyx
@ -1,3 +1,4 @@
+import multiprocessing
 from PIL import Image  # type: ignore
 from typing import Tuple

@ -43,7 +44,7 @@ cdef extern int affine_composite_fast(
    unsigned char *texdata,
    unsigned int texwidth,
    unsigned int texheight,
-    int single_threaded
+    unsigned int threads
 )

 def affine_composite(
@ -89,7 +90,7 @@ def affine_composite(
    miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
    maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)

-    if maxx <= 0 or maxy <= 0:
+    if maxx <= minx or maxy <= miny:
        # This image is entirely off the screen!
        return img

@ -102,6 +103,7 @@ def affine_composite(
    cdef floatcolor_t c_multcolor = floatcolor_t(r=mult_color.r, g=mult_color.g, b=mult_color.b, a=mult_color.a)
    cdef matrix_t c_inverse = matrix_t(a=inverse.a, b=inverse.b, c=inverse.c, d=inverse.d, tx=inverse.tx, ty=inverse.ty)
    cdef point_t c_origin = point_t(x=origin.x, y=origin.y)
+    cdef unsigned int threads = 1 if single_threaded else multiprocessing.cpu_count()

    # Call the C++ function.
    errors = affine_composite_fast(
@ -120,7 +122,7 @@ def affine_composite(
        texbytes,
        texwidth,
        texheight,
-        single_threaded,
+        threads,
    )
    if errors != 0:
        raise Exception("Error raised in C++!")
--- a/bemani/format/afp/blendaltimpl.cxx
+++ b/bemani/format/afp/blendaltimpl.cxx
@ -1,5 +1,9 @@
 #include <stdio.h>
 #include <math.h>
+#include <pthread.h>
+#include <list>
+
+#define MIN_THREAD_WORK 10

 extern "C"
 {
@ -45,6 +49,24 @@ extern "C"
        }
    } matrix_t;

+    typedef struct work { // Everything one call to chunk_composite_fast needs to composite a band of rows.
+        intcolor_t *imgdata; // destination pixels; read and overwritten in place
+        unsigned int imgwidth; // destination row stride in pixels, used to compute flat offsets
+        unsigned int minx; // first destination column to process (inclusive)
+        unsigned int maxx; // one past the last destination column to process
+        unsigned int miny; // first destination row of this chunk (inclusive)
+        unsigned int maxy; // one past the last destination row of this chunk
+        intcolor_t *texdata; // source texture pixels; only read
+        unsigned int texwidth; // texture row stride in pixels and horizontal bound
+        unsigned int texheight; // texture vertical bound
+        matrix_t inverse; // applied to each destination coordinate to find the texture coordinate
+        point_t origin; // added to the transformed point before rounding
+        intcolor_t add_color; // forwarded unchanged to blend_point
+        floatcolor_t mult_color; // forwarded unchanged to blend_point
+        int blendfunc; // forwarded unchanged to blend_point
+        pthread_t *thread; // handle of the pthread running this chunk; NULL for the chunk run by the calling thread
+    } work_t;
+
    inline unsigned char clamp(float color) {
        return fmin(fmax(0.0, roundf(color)), 255.0);
    }
@ -184,6 +206,35 @@ extern "C"
        return blend_normal(dest_color, src_color);
    }

+    void chunk_composite_fast(work_t *work) { // Composite the texture onto destination rows [miny, maxy) and columns [minx, maxx) described by work.
+        for (unsigned int imgy = work->miny; imgy < work->maxy; imgy++) {
+            for (unsigned int imgx = work->minx; imgx < work->maxx; imgx++) {
+                // Determine the flat offset of this pixel in the destination buffer.
+                unsigned int imgoff = imgx + (imgy * work->imgwidth);
+
+                // Map the destination coordinate through the inverse matrix and add the origin to find the texture pixel that lands here.
+                point_t texloc = work->inverse.multiply_point((point_t){(float)imgx, (float)imgy}).add(work->origin);
+                int texx = roundf(texloc.x); // round to the nearest texture column; a single texel is sampled
+                int texy = roundf(texloc.y); // round to the nearest texture row
+
+                // If the texture coordinate is out of bounds, leave the destination pixel untouched.
+                if (texx < 0 or texy < 0 or texx >= (int)work->texwidth or texy >= (int)work->texheight) {
+                    continue;
+                }
+
+                // Combine the texture pixel with the existing destination pixel via blend_point and store the result in place.
+                unsigned int texoff = texx + (texy * work->texwidth);
+                work->imgdata[imgoff] = blend_point(work->add_color, work->mult_color, work->texdata[texoff], work->imgdata[imgoff], work->blendfunc);
+            }
+        }
+    }
+
+    void *chunk_composite_worker(void *arg) { // pthread start routine; arg is the work_t describing this thread's chunk.
+        work_t *work = (work_t *)arg; // pthread_create hands the work item over as an untyped pointer
+        chunk_composite_fast(work);
+        return NULL; // nothing to report back; results are written through work->imgdata
+    }
+
    int affine_composite_fast(
        unsigned char *imgbytes,
        unsigned int imgwidth,
@ -200,31 +251,115 @@ extern "C"
        unsigned char *texbytes,
        unsigned int texwidth,
        unsigned int texheight,
-        int single_threaded
+        unsigned int threads
    ) {
        // Cast to a usable type.
        intcolor_t *imgdata = (intcolor_t *)imgbytes;
        intcolor_t *texdata = (intcolor_t *)texbytes;

-        for (unsigned int imgy = miny; imgy < maxy; imgy++) {
-            for (unsigned int imgx = minx; imgx < maxx; imgx++) {
-                // Determine offset.
-                unsigned int imgoff = imgx + (imgy * imgwidth);
+        if (threads == 1 || (maxy - miny) < (MIN_THREAD_WORK * 2)) {
+            // Just create a local work structure so we can call the common function.
+            work_t work;
+            work.imgdata = imgdata;
+            work.imgwidth = imgwidth;
+            work.minx = minx;
+            work.maxx = maxx;
+            work.miny = miny;
+            work.maxy = maxy;
+            work.texdata = texdata;
+            work.texwidth = texwidth;
+            work.texheight = texheight;
+            work.inverse = inverse;
+            work.origin = origin;
+            work.add_color = add_color;
+            work.mult_color = mult_color;
+            work.blendfunc = blendfunc;

-                // Calculate what texture pixel data goes here.
-                point_t texloc = inverse.multiply_point((point_t){(float)imgx, (float)imgy}).add(origin);
-                int texx = roundf(texloc.x);
-                int texy = roundf(texloc.y);
+            chunk_composite_fast(&work);
+        } else {
+            std::list<work_t *> workers;
+            work_t *mywork = NULL;
+            unsigned int imgy = miny;
+            unsigned int step = (maxy - miny) / threads;
+            if (step < MIN_THREAD_WORK) {
+                step = MIN_THREAD_WORK;
+            }

-                // If we're out of bounds, don't update.
-                if (texx < 0 or texy < 0 or texx >= (int)texwidth or texy >= (int)texheight) {
-                    continue;
+            for (unsigned int worker = 0; worker < threads; worker++) {
+                // The last chunk is handled differently: it runs on the calling
+                // thread instead of a pthread, and it consumes all remaining rows.
+                // Every chunk must also carry a minimum amount of work so that we
+                // don't waste pthread start/stop overhead on tiny chunks. Because
+                // of this we may stop before reaching the thread limit, so make
+                // sure the last chunk we create is always our own.
+                unsigned int me = 0;
+                if (worker == (threads - 1) || (imgy + step) >= maxy) {
+                    me = 1;
                }

-                // Blend it.
-                unsigned int texoff = texx + (texy * texwidth);
-                imgdata[imgoff] = blend_point(add_color, mult_color, texdata[texoff], imgdata[imgoff], blendfunc);
+                // Create storage for this worker.
+                pthread_t *thread = me ? NULL : (pthread_t *)malloc(sizeof(pthread_t));
+                work_t *work = (work_t *)malloc(sizeof(work_t));
+
+                // Pass to it all of the params it needs.
+                work->imgdata = imgdata;
+                work->imgwidth = imgwidth;
+                work->minx = minx;
+                work->maxx = maxx;
+                work->miny = imgy;
+                work->maxy = me ? maxy : imgy + step;
+                work->texdata = texdata;
+                work->texwidth = texwidth;
+                work->texheight = texheight;
+                work->inverse = inverse;
+                work->origin = origin;
+                work->add_color = add_color;
+                work->mult_color = mult_color;
+                work->blendfunc = blendfunc;
+                work->thread = thread;
+
+                if (me)
+                {
+                    // This is the row for this thread.
+                    mywork = work;
+
+                    // Always exit here, we might not have actually scheduled
+                    // the maximum permitted threads.
+                    break;
+                }
+                else
+                {
+                    // Kick off the thread.
+                    pthread_create(thread, NULL, chunk_composite_worker, work);
+
+                    // Save the work item so we can join its thread and free it later.
+                    workers.push_back(work);
+
+                    // The next chunk of work is the next step.
+                    imgy += step;
+                }
            }
+
+            // Now, run my own work.
+            chunk_composite_fast(mywork);
+
+            // Join on all threads once they're finished.
+            std::list<work_t *>::iterator work = workers.begin();
+
+            while(work != workers.end()) {
+                // Join the thread.
+                pthread_join(*((*work)->thread), NULL);
+
+                // Free the memory we allocated.
+                free((*work)->thread);
+                free((*work));
+
+                // Remove it from our bookkeeping.
+                work = workers.erase(work);
+            }
+
+            // Free the memory we allocated.
+            free(mywork);
        }

        return 0;