1
0
mirror of synced 2024-11-24 06:20:12 +01:00

Implement a multi-threaded C++ renderer for a decent speed boost.

This commit is contained in:
Jennifer Taylor 2021-05-20 03:51:43 +00:00
parent f15ba3c718
commit 0d648f1371
3 changed files with 156 additions and 19 deletions

View File

@ -153,7 +153,7 @@ except ImportError:
miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
if maxx <= 0 or maxy <= 0:
if maxx <= minx or maxy <= miny:
# This image is entirely off the screen!
return img

View File

@ -1,3 +1,4 @@
import multiprocessing
from PIL import Image # type: ignore
from typing import Tuple
@ -43,7 +44,7 @@ cdef extern int affine_composite_fast(
unsigned char *texdata,
unsigned int texwidth,
unsigned int texheight,
int single_threaded
unsigned int threads
)
def affine_composite(
@ -89,7 +90,7 @@ def affine_composite(
miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
if maxx <= 0 or maxy <= 0:
if maxx <= minx or maxy <= miny:
# This image is entirely off the screen!
return img
@ -102,6 +103,7 @@ def affine_composite(
cdef floatcolor_t c_multcolor = floatcolor_t(r=mult_color.r, g=mult_color.g, b=mult_color.b, a=mult_color.a)
cdef matrix_t c_inverse = matrix_t(a=inverse.a, b=inverse.b, c=inverse.c, d=inverse.d, tx=inverse.tx, ty=inverse.ty)
cdef point_t c_origin = point_t(x=origin.x, y=origin.y)
cdef unsigned int threads = 1 if single_threaded else multiprocessing.cpu_count()
# Call the C++ function.
errors = affine_composite_fast(
@ -120,7 +122,7 @@ def affine_composite(
texbytes,
texwidth,
texheight,
single_threaded,
threads,
)
if errors != 0:
raise Exception("Error raised in C++!")

View File

@ -1,5 +1,9 @@
#include <stdio.h>
#include <math.h>
#include <pthread.h>
#include <list>
#define MIN_THREAD_WORK 10
extern "C"
{
@ -45,6 +49,24 @@ extern "C"
}
} matrix_t;
typedef struct work {
intcolor_t *imgdata;
unsigned int imgwidth;
unsigned int minx;
unsigned int maxx;
unsigned int miny;
unsigned int maxy;
intcolor_t *texdata;
unsigned int texwidth;
unsigned int texheight;
matrix_t inverse;
point_t origin;
intcolor_t add_color;
floatcolor_t mult_color;
int blendfunc;
pthread_t *thread;
} work_t;
inline unsigned char clamp(float color) {
return fmin(fmax(0.0, roundf(color)), 255.0);
}
@ -184,6 +206,35 @@ extern "C"
return blend_normal(dest_color, src_color);
}
void chunk_composite_fast(work_t *work) {
for (unsigned int imgy = work->miny; imgy < work->maxy; imgy++) {
for (unsigned int imgx = work->minx; imgx < work->maxx; imgx++) {
// Determine offset.
unsigned int imgoff = imgx + (imgy * work->imgwidth);
// Calculate what texture pixel data goes here.
point_t texloc = work->inverse.multiply_point((point_t){(float)imgx, (float)imgy}).add(work->origin);
int texx = roundf(texloc.x);
int texy = roundf(texloc.y);
// If we're out of bounds, don't update.
if (texx < 0 or texy < 0 or texx >= (int)work->texwidth or texy >= (int)work->texheight) {
continue;
}
// Blend it.
unsigned int texoff = texx + (texy * work->texwidth);
work->imgdata[imgoff] = blend_point(work->add_color, work->mult_color, work->texdata[texoff], work->imgdata[imgoff], work->blendfunc);
}
}
}
void *chunk_composite_worker(void *arg) {
work_t *work = (work_t *)arg;
chunk_composite_fast(work);
return NULL;
}
int affine_composite_fast(
unsigned char *imgbytes,
unsigned int imgwidth,
@ -200,31 +251,115 @@ extern "C"
unsigned char *texbytes,
unsigned int texwidth,
unsigned int texheight,
int single_threaded
unsigned int threads
) {
// Cast to a usable type.
intcolor_t *imgdata = (intcolor_t *)imgbytes;
intcolor_t *texdata = (intcolor_t *)texbytes;
for (unsigned int imgy = miny; imgy < maxy; imgy++) {
for (unsigned int imgx = minx; imgx < maxx; imgx++) {
// Determine offset.
unsigned int imgoff = imgx + (imgy * imgwidth);
if (threads == 1 || (maxy - miny) < (MIN_THREAD_WORK * 2)) {
// Just create a local work structure so we can call the common function.
work_t work;
work.imgdata = imgdata;
work.imgwidth = imgwidth;
work.minx = minx;
work.maxx = maxx;
work.miny = miny;
work.maxy = maxy;
work.texdata = texdata;
work.texwidth = texwidth;
work.texheight = texheight;
work.inverse = inverse;
work.origin = origin;
work.add_color = add_color;
work.mult_color = mult_color;
work.blendfunc = blendfunc;
// Calculate what texture pixel data goes here.
point_t texloc = inverse.multiply_point((point_t){(float)imgx, (float)imgy}).add(origin);
int texx = roundf(texloc.x);
int texy = roundf(texloc.y);
chunk_composite_fast(&work);
} else {
std::list<work_t *> workers;
work_t *mywork = NULL;
unsigned int imgy = miny;
unsigned int step = (maxy - miny) / threads;
if (step < MIN_THREAD_WORK) {
step = MIN_THREAD_WORK;
}
// If we're out of bounds, don't update.
if (texx < 0 or texy < 0 or texx >= (int)texwidth or texy >= (int)texheight) {
continue;
for (unsigned int worker = 0; worker < threads; worker++) {
// We are slightly different if this is the last worker, because
// its going to this thread. Make sure it consumes the rest of the
// work, as well as not getting a pthread. Make sure each thread
// has a minimum amount of work so we don't waste pthread overhead
// starting and stopping it. Because of this, make sure that the
// last chunk we create is always our own.
unsigned int me = 0;
if (worker == (threads - 1) || (imgy + step) >= maxy) {
me = 1;
}
// Blend it.
unsigned int texoff = texx + (texy * texwidth);
imgdata[imgoff] = blend_point(add_color, mult_color, texdata[texoff], imgdata[imgoff], blendfunc);
// Create storage for this worker.
pthread_t *thread = me ? NULL : (pthread_t *)malloc(sizeof(pthread_t));
work_t *work = (work_t *)malloc(sizeof(work_t));
// Pass to it all of the params it needs.
work->imgdata = imgdata;
work->imgwidth = imgwidth;
work->minx = minx;
work->maxx = maxx;
work->miny = imgy;
work->maxy = me ? maxy : imgy + step;
work->texdata = texdata;
work->texwidth = texwidth;
work->texheight = texheight;
work->inverse = inverse;
work->origin = origin;
work->add_color = add_color;
work->mult_color = mult_color;
work->blendfunc = blendfunc;
work->thread = thread;
if (me)
{
// This is the row for this thread.
mywork = work;
// Always exit here, we might not have actually scheduled
// the maximum permitted threads.
break;
}
else
{
// Kick off the thread.
pthread_create(thread, NULL, chunk_composite_worker, work);
// Save the row so we can access it for scheduling.
workers.push_back(work);
// The next chunk of work is the next step.
imgy += step;
}
}
// Now, run my own work.
chunk_composite_fast(mywork);
// Join on all threads once they're finished.
std::list<work_t *>::iterator work = workers.begin();
while(work != workers.end()) {
// Join the thread.
pthread_join(*((*work)->thread), NULL);
// Free the memory we allocated.
free((*work)->thread);
free((*work));
// Remove it from our bookkeeping.
work = workers.erase(work);
}
// Free the memory we allocated.
free(mywork);
}
return 0;