From 0d648f13713aa0b41a7da33e9f4d4b7db816f872 Mon Sep 17 00:00:00 2001
From: Jennifer Taylor <dragonminded@dragonminded.com>
Date: Thu, 20 May 2021 03:51:43 +0000
Subject: [PATCH] Implement a multi-threaded C++ renderer for a decent speed
 boost.

---
 bemani/format/afp/blend.py         |   2 +-
 bemani/format/afp/blendalt.pyx     |   8 +-
 bemani/format/afp/blendaltimpl.cxx | 165 ++++++++++++++++++++++++++---
 3 files changed, 156 insertions(+), 19 deletions(-)
diff --git a/bemani/format/afp/blend.py b/bemani/format/afp/blend.py
index 9694068..851c29f 100644
--- a/bemani/format/afp/blend.py
+++ b/bemani/format/afp/blend.py
@@ -153,7 +153,7 @@ except ImportError:
         miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
         maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
 
-        if maxx <= 0 or maxy <= 0:
+        if maxx <= minx or maxy <= miny:
             # This image is entirely off the screen!
             return img
 
diff --git a/bemani/format/afp/blendalt.pyx b/bemani/format/afp/blendalt.pyx
index ad4afb2..dd388c8 100644
--- a/bemani/format/afp/blendalt.pyx
+++ b/bemani/format/afp/blendalt.pyx
@@ -1,3 +1,4 @@
+import multiprocessing
 from PIL import Image  # type: ignore
 from typing import Tuple
 
@@ -43,7 +44,7 @@ cdef extern int affine_composite_fast(
     unsigned char *texdata,
     unsigned int texwidth,
     unsigned int texheight,
-    int single_threaded
+    unsigned int threads
 )
 
 def affine_composite(
@@ -89,7 +90,7 @@ def affine_composite(
     miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
     maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
 
-    if maxx <= 0 or maxy <= 0:
+    if maxx <= minx or maxy <= miny:
         # This image is entirely off the screen!
         return img
 
@@ -102,6 +103,7 @@ def affine_composite(
     cdef floatcolor_t c_multcolor = floatcolor_t(r=mult_color.r, g=mult_color.g, b=mult_color.b, a=mult_color.a)
     cdef matrix_t c_inverse = matrix_t(a=inverse.a, b=inverse.b, c=inverse.c, d=inverse.d, tx=inverse.tx, ty=inverse.ty)
     cdef point_t c_origin = point_t(x=origin.x, y=origin.y)
+    cdef unsigned int threads = 1 if single_threaded else multiprocessing.cpu_count()
 
     # Call the C++ function.
     errors = affine_composite_fast(
@@ -120,7 +122,7 @@ def affine_composite(
         texbytes,
         texwidth,
         texheight,
-        single_threaded,
+        threads,
     )
     if errors != 0:
         raise Exception("Error raised in C++!")
diff --git a/bemani/format/afp/blendaltimpl.cxx b/bemani/format/afp/blendaltimpl.cxx
index 9a4c8c9..a788da6 100644
--- a/bemani/format/afp/blendaltimpl.cxx
+++ b/bemani/format/afp/blendaltimpl.cxx
@@ -1,5 +1,9 @@
 #include <stdio.h>
 #include <math.h>
+#include <pthread.h>
+#include <list>
+
+#define MIN_THREAD_WORK 10
 
 extern "C"
 {
@@ -45,6 +49,24 @@ extern "C"
         }
     } matrix_t;
 
+    typedef struct work {  // Parameters describing one rectangular chunk of compositing work.
+        intcolor_t *imgdata;  // Destination pixels, indexed as x + y * imgwidth.
+        unsigned int imgwidth;  // Destination row stride, in pixels.
+        unsigned int minx;  // First destination column to process (inclusive).
+        unsigned int maxx;  // End destination column (exclusive).
+        unsigned int miny;  // First destination row of this chunk (inclusive).
+        unsigned int maxy;  // End destination row of this chunk (exclusive).
+        intcolor_t *texdata;  // Source texture pixels, indexed as x + y * texwidth.
+        unsigned int texwidth;  // Texture width in pixels; also used as its row stride.
+        unsigned int texheight;  // Texture height in pixels.
+        matrix_t inverse;  // Applied to each destination (x, y) to find the texture location.
+        point_t origin;  // Added to the transformed point before rounding.
+        intcolor_t add_color;  // Passed through unchanged to blend_point().
+        floatcolor_t mult_color;  // Passed through unchanged to blend_point().
+        int blendfunc;  // Blend mode selector passed through to blend_point().
+        pthread_t *thread;  // Worker thread handle; NULL for the caller's own chunk, unset on the single-threaded path.
+    } work_t;
+
     inline unsigned char clamp(float color) {
         return fmin(fmax(0.0, roundf(color)), 255.0);
     }
@@ -184,6 +206,35 @@ extern "C"
         return blend_normal(dest_color, src_color);
     }
 
+    void chunk_composite_fast(work_t *work) {  // Blend the texture into rows [miny, maxy) and columns [minx, maxx) of work->imgdata.
+        for (unsigned int imgy = work->miny; imgy < work->maxy; imgy++) {
+            for (unsigned int imgx = work->minx; imgx < work->maxx; imgx++) {
+                // Offset of this destination pixel in imgdata (row stride is imgwidth).
+                unsigned int imgoff = imgx + (imgy * work->imgwidth);
+
+                // Map the destination pixel through the inverse matrix, add origin, and round to a texel coordinate.
+                point_t texloc = work->inverse.multiply_point((point_t){(float)imgx, (float)imgy}).add(work->origin);
+                int texx = roundf(texloc.x);
+                int texy = roundf(texloc.y);
+
+                // The texel falls outside the texture: leave the destination pixel untouched.
+                if (texx < 0 or texy < 0 or texx >= (int)work->texwidth or texy >= (int)work->texheight) {
+                    continue;
+                }
+
+                // Blend the texel with the existing destination pixel and store the result in place.
+                unsigned int texoff = texx + (texy * work->texwidth);
+                work->imgdata[imgoff] = blend_point(work->add_color, work->mult_color, work->texdata[texoff], work->imgdata[imgoff], work->blendfunc);
+            }
+        }
+    }
+
+    void *chunk_composite_worker(void *arg) {  // pthread start routine: arg is the work_t * handed to pthread_create().
+        work_t *work = (work_t *)arg;  // Recover the typed work description from the opaque argument.
+        chunk_composite_fast(work);
+        return NULL;  // No result is reported back through the thread's return value.
+    }
+
     int affine_composite_fast(
         unsigned char *imgbytes,
         unsigned int imgwidth,
@@ -200,31 +251,115 @@ extern "C"
         unsigned char *texbytes,
         unsigned int texwidth,
         unsigned int texheight,
-        int single_threaded
+        unsigned int threads
     ) {
         // Cast to a usable type.
         intcolor_t *imgdata = (intcolor_t *)imgbytes;
         intcolor_t *texdata = (intcolor_t *)texbytes;
 
-        for (unsigned int imgy = miny; imgy < maxy; imgy++) {
-            for (unsigned int imgx = minx; imgx < maxx; imgx++) {
-                // Determine offset.
-                unsigned int imgoff = imgx + (imgy * imgwidth);
+        if (threads == 1 || (maxy - miny) < (MIN_THREAD_WORK * 2)) {
+            // Just create a local work structure so we can call the common function.
+            work_t work;
+            work.imgdata = imgdata;
+            work.imgwidth = imgwidth;
+            work.minx = minx;
+            work.maxx = maxx;
+            work.miny = miny;
+            work.maxy = maxy;
+            work.texdata = texdata;
+            work.texwidth = texwidth;
+            work.texheight = texheight;
+            work.inverse = inverse;
+            work.origin = origin;
+            work.add_color = add_color;
+            work.mult_color = mult_color;
+            work.blendfunc = blendfunc;
 
-                // Calculate what texture pixel data goes here.
-                point_t texloc = inverse.multiply_point((point_t){(float)imgx, (float)imgy}).add(origin);
-                int texx = roundf(texloc.x);
-                int texy = roundf(texloc.y);
+            chunk_composite_fast(&work);
+        } else {
+            std::list<work_t *> workers;
+            work_t *mywork = NULL;
+            unsigned int imgy = miny;
+            unsigned int step = (maxy - miny) / threads;
+            if (step < MIN_THREAD_WORK) {
+                step = MIN_THREAD_WORK;
+            }
 
-                // If we're out of bounds, don't update.
-                if (texx < 0 or texy < 0 or texx >= (int)texwidth or texy >= (int)texheight) {
-                    continue;
+            for (unsigned int worker = 0; worker < threads; worker++) {
+                // We are slightly different if this is the last worker, because
+                // it's going to run on this thread. Make sure it consumes the rest of the
+                // work, as well as not getting a pthread. Make sure each thread
+                // has a minimum amount of work so we don't waste pthread overhead
+                // starting and stopping it. Because of this, make sure that the
+                // last chunk we create is always our own.
+                unsigned int me = 0;
+                if (worker == (threads - 1) || (imgy + step) >= maxy) {
+                    me = 1;
                 }
 
-                // Blend it.
-                unsigned int texoff = texx + (texy * texwidth);
-                imgdata[imgoff] = blend_point(add_color, mult_color, texdata[texoff], imgdata[imgoff], blendfunc);
+                // Create storage for this worker.
+                pthread_t *thread = me ? NULL : (pthread_t *)malloc(sizeof(pthread_t));
+                work_t *work = (work_t *)malloc(sizeof(work_t));
+
+                // Pass to it all of the params it needs.
+                work->imgdata = imgdata;
+                work->imgwidth = imgwidth;
+                work->minx = minx;
+                work->maxx = maxx;
+                work->miny = imgy;
+                work->maxy = me ? maxy : imgy + step;
+                work->texdata = texdata;
+                work->texwidth = texwidth;
+                work->texheight = texheight;
+                work->inverse = inverse;
+                work->origin = origin;
+                work->add_color = add_color;
+                work->mult_color = mult_color;
+                work->blendfunc = blendfunc;
+                work->thread = thread;
+
+                if (me)
+                {
+                    // This is the chunk of rows handled by the calling thread.
+                    mywork = work;
+
+                    // Always exit here, we might not have actually scheduled
+                    // the maximum permitted threads.
+                    break;
+                }
+                else
+                {
+                    // Kick off the thread.
+                    pthread_create(thread, NULL, chunk_composite_worker, work);
+
+                    // Save the work item so we can join its thread and free it later.
+                    workers.push_back(work);
+
+                    // The next chunk of work is the next step.
+                    imgy += step;
+                }
             }
+
+            // Now, run my own work.
+            chunk_composite_fast(mywork);
+
+            // Join on all threads once they're finished.
+            std::list<work_t *>::iterator work = workers.begin();
+
+            while(work != workers.end()) {
+                // Join the thread.
+                pthread_join(*((*work)->thread), NULL);
+
+                // Free the memory we allocated.
+                free((*work)->thread);
+                free((*work));
+
+                // Remove it from our bookkeeping.
+                work = workers.erase(work);
+            }
+
+            // Free the memory we allocated.
+            free(mywork);
         }
 
         return 0;