Implement a multi-threaded C++ renderer for a decent speed boost.
This commit is contained in:
parent
f15ba3c718
commit
0d648f1371
@ -153,7 +153,7 @@ except ImportError:
|
|||||||
miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
|
miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
|
||||||
maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
|
maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
|
||||||
|
|
||||||
if maxx <= 0 or maxy <= 0:
|
if maxx <= minx or maxy <= miny:
|
||||||
# This image is entirely off the screen!
|
# This image is entirely off the screen!
|
||||||
return img
|
return img
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import multiprocessing
|
||||||
from PIL import Image # type: ignore
|
from PIL import Image # type: ignore
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
|
||||||
@ -43,7 +44,7 @@ cdef extern int affine_composite_fast(
|
|||||||
unsigned char *texdata,
|
unsigned char *texdata,
|
||||||
unsigned int texwidth,
|
unsigned int texwidth,
|
||||||
unsigned int texheight,
|
unsigned int texheight,
|
||||||
int single_threaded
|
unsigned int threads
|
||||||
)
|
)
|
||||||
|
|
||||||
def affine_composite(
|
def affine_composite(
|
||||||
@ -89,7 +90,7 @@ def affine_composite(
|
|||||||
miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
|
miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
|
||||||
maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
|
maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
|
||||||
|
|
||||||
if maxx <= 0 or maxy <= 0:
|
if maxx <= minx or maxy <= miny:
|
||||||
# This image is entirely off the screen!
|
# This image is entirely off the screen!
|
||||||
return img
|
return img
|
||||||
|
|
||||||
@ -102,6 +103,7 @@ def affine_composite(
|
|||||||
cdef floatcolor_t c_multcolor = floatcolor_t(r=mult_color.r, g=mult_color.g, b=mult_color.b, a=mult_color.a)
|
cdef floatcolor_t c_multcolor = floatcolor_t(r=mult_color.r, g=mult_color.g, b=mult_color.b, a=mult_color.a)
|
||||||
cdef matrix_t c_inverse = matrix_t(a=inverse.a, b=inverse.b, c=inverse.c, d=inverse.d, tx=inverse.tx, ty=inverse.ty)
|
cdef matrix_t c_inverse = matrix_t(a=inverse.a, b=inverse.b, c=inverse.c, d=inverse.d, tx=inverse.tx, ty=inverse.ty)
|
||||||
cdef point_t c_origin = point_t(x=origin.x, y=origin.y)
|
cdef point_t c_origin = point_t(x=origin.x, y=origin.y)
|
||||||
|
cdef unsigned int threads = 1 if single_threaded else multiprocessing.cpu_count()
|
||||||
|
|
||||||
# Call the C++ function.
|
# Call the C++ function.
|
||||||
errors = affine_composite_fast(
|
errors = affine_composite_fast(
|
||||||
@ -120,7 +122,7 @@ def affine_composite(
|
|||||||
texbytes,
|
texbytes,
|
||||||
texwidth,
|
texwidth,
|
||||||
texheight,
|
texheight,
|
||||||
single_threaded,
|
threads,
|
||||||
)
|
)
|
||||||
if errors != 0:
|
if errors != 0:
|
||||||
raise Exception("Error raised in C++!")
|
raise Exception("Error raised in C++!")
|
||||||
|
@ -1,5 +1,9 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
#include <pthread.h>
|
||||||
|
#include <list>
|
||||||
|
|
||||||
|
#define MIN_THREAD_WORK 10
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
{
|
{
|
||||||
@ -45,6 +49,24 @@ extern "C"
|
|||||||
}
|
}
|
||||||
} matrix_t;
|
} matrix_t;
|
||||||
|
|
||||||
|
typedef struct work {
|
||||||
|
intcolor_t *imgdata;
|
||||||
|
unsigned int imgwidth;
|
||||||
|
unsigned int minx;
|
||||||
|
unsigned int maxx;
|
||||||
|
unsigned int miny;
|
||||||
|
unsigned int maxy;
|
||||||
|
intcolor_t *texdata;
|
||||||
|
unsigned int texwidth;
|
||||||
|
unsigned int texheight;
|
||||||
|
matrix_t inverse;
|
||||||
|
point_t origin;
|
||||||
|
intcolor_t add_color;
|
||||||
|
floatcolor_t mult_color;
|
||||||
|
int blendfunc;
|
||||||
|
pthread_t *thread;
|
||||||
|
} work_t;
|
||||||
|
|
||||||
// Round a floating point color channel to the nearest integer and saturate
// it to the 0-255 range of an unsigned char. NaN falls through both
// comparisons and yields 0.
inline unsigned char clamp(float color) {
    const float rounded = roundf(color);

    if (rounded >= 255.0f) {
        return 255;
    }
    if (rounded > 0.0f) {
        return (unsigned char)rounded;
    }
    return 0;
}
|
||||||
@ -184,6 +206,35 @@ extern "C"
|
|||||||
return blend_normal(dest_color, src_color);
|
return blend_normal(dest_color, src_color);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Composite one band of rows of the texture onto the destination image.
//
// For each destination pixel with x in [work->minx, work->maxx) and y in
// [work->miny, work->maxy), the pixel is mapped through work->inverse and
// offset by work->origin to find the nearest texture pixel, which is then
// blended into work->imgdata via blend_point(). Destination pixels that map
// outside the texture are left untouched.
void chunk_composite_fast(work_t *work) {
    for (unsigned int imgy = work->miny; imgy < work->maxy; imgy++) {
        // Offset of this row's first pixel. Computed in size_t so that
        // imgy * imgwidth cannot wrap around for very large images.
        const size_t rowoff = (size_t)imgy * work->imgwidth;

        for (unsigned int imgx = work->minx; imgx < work->maxx; imgx++) {
            // Calculate what texture pixel data goes here.
            point_t texloc = work->inverse.multiply_point((point_t){(float)imgx, (float)imgy}).add(work->origin);
            const float texx = roundf(texloc.x);
            const float texy = roundf(texloc.y);

            // If we're out of bounds, don't update. The test is done on the
            // rounded floating point values, before any integer conversion:
            // converting a NaN or out-of-range float to an integer is
            // undefined behavior. Written as a negated "in bounds" check so
            // that NaN (for which every comparison is false) is skipped too.
            const bool inside =
                texx >= 0.0f &&
                texy >= 0.0f &&
                (double)texx < (double)work->texwidth &&
                (double)texy < (double)work->texheight;
            if (!inside) {
                continue;
            }

            // Blend it.
            const size_t texoff = (size_t)texx + ((size_t)texy * work->texwidth);
            const size_t imgoff = rowoff + imgx;
            work->imgdata[imgoff] = blend_point(work->add_color, work->mult_color, work->texdata[texoff], work->imgdata[imgoff], work->blendfunc);
        }
    }
}
|
||||||
|
|
||||||
|
void *chunk_composite_worker(void *arg) {
|
||||||
|
work_t *work = (work_t *)arg;
|
||||||
|
chunk_composite_fast(work);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
int affine_composite_fast(
|
int affine_composite_fast(
|
||||||
unsigned char *imgbytes,
|
unsigned char *imgbytes,
|
||||||
unsigned int imgwidth,
|
unsigned int imgwidth,
|
||||||
@ -200,31 +251,115 @@ extern "C"
|
|||||||
unsigned char *texbytes,
|
unsigned char *texbytes,
|
||||||
unsigned int texwidth,
|
unsigned int texwidth,
|
||||||
unsigned int texheight,
|
unsigned int texheight,
|
||||||
int single_threaded
|
unsigned int threads
|
||||||
) {
|
) {
|
||||||
// Cast to a usable type.
|
// Cast to a usable type.
|
||||||
intcolor_t *imgdata = (intcolor_t *)imgbytes;
|
intcolor_t *imgdata = (intcolor_t *)imgbytes;
|
||||||
intcolor_t *texdata = (intcolor_t *)texbytes;
|
intcolor_t *texdata = (intcolor_t *)texbytes;
|
||||||
|
|
||||||
for (unsigned int imgy = miny; imgy < maxy; imgy++) {
|
if (threads == 1 || (maxy - miny) < (MIN_THREAD_WORK * 2)) {
|
||||||
for (unsigned int imgx = minx; imgx < maxx; imgx++) {
|
// Just create a local work structure so we can call the common function.
|
||||||
// Determine offset.
|
work_t work;
|
||||||
unsigned int imgoff = imgx + (imgy * imgwidth);
|
work.imgdata = imgdata;
|
||||||
|
work.imgwidth = imgwidth;
|
||||||
|
work.minx = minx;
|
||||||
|
work.maxx = maxx;
|
||||||
|
work.miny = miny;
|
||||||
|
work.maxy = maxy;
|
||||||
|
work.texdata = texdata;
|
||||||
|
work.texwidth = texwidth;
|
||||||
|
work.texheight = texheight;
|
||||||
|
work.inverse = inverse;
|
||||||
|
work.origin = origin;
|
||||||
|
work.add_color = add_color;
|
||||||
|
work.mult_color = mult_color;
|
||||||
|
work.blendfunc = blendfunc;
|
||||||
|
|
||||||
// Calculate what texture pixel data goes here.
|
chunk_composite_fast(&work);
|
||||||
point_t texloc = inverse.multiply_point((point_t){(float)imgx, (float)imgy}).add(origin);
|
} else {
|
||||||
int texx = roundf(texloc.x);
|
std::list<work_t *> workers;
|
||||||
int texy = roundf(texloc.y);
|
work_t *mywork = NULL;
|
||||||
|
unsigned int imgy = miny;
|
||||||
// If we're out of bounds, don't update.
|
unsigned int step = (maxy - miny) / threads;
|
||||||
if (texx < 0 or texy < 0 or texx >= (int)texwidth or texy >= (int)texheight) {
|
if (step < MIN_THREAD_WORK) {
|
||||||
continue;
|
step = MIN_THREAD_WORK;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Blend it.
|
for (unsigned int worker = 0; worker < threads; worker++) {
|
||||||
unsigned int texoff = texx + (texy * texwidth);
|
// We are slightly different if this is the last worker, because
|
||||||
imgdata[imgoff] = blend_point(add_color, mult_color, texdata[texoff], imgdata[imgoff], blendfunc);
|
// it's going to this thread. Make sure it consumes the rest of the
|
||||||
|
// work, as well as not getting a pthread. Make sure each thread
|
||||||
|
// has a minimum amount of work so we don't waste pthread overhead
|
||||||
|
// starting and stopping it. Because of this, make sure that the
|
||||||
|
// last chunk we create is always our own.
|
||||||
|
unsigned int me = 0;
|
||||||
|
if (worker == (threads - 1) || (imgy + step) >= maxy) {
|
||||||
|
me = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Create storage for this worker.
|
||||||
|
pthread_t *thread = me ? NULL : (pthread_t *)malloc(sizeof(pthread_t));
|
||||||
|
work_t *work = (work_t *)malloc(sizeof(work_t));
|
||||||
|
|
||||||
|
// Pass to it all of the params it needs.
|
||||||
|
work->imgdata = imgdata;
|
||||||
|
work->imgwidth = imgwidth;
|
||||||
|
work->minx = minx;
|
||||||
|
work->maxx = maxx;
|
||||||
|
work->miny = imgy;
|
||||||
|
work->maxy = me ? maxy : imgy + step;
|
||||||
|
work->texdata = texdata;
|
||||||
|
work->texwidth = texwidth;
|
||||||
|
work->texheight = texheight;
|
||||||
|
work->inverse = inverse;
|
||||||
|
work->origin = origin;
|
||||||
|
work->add_color = add_color;
|
||||||
|
work->mult_color = mult_color;
|
||||||
|
work->blendfunc = blendfunc;
|
||||||
|
work->thread = thread;
|
||||||
|
|
||||||
|
if (me)
|
||||||
|
{
|
||||||
|
// This is the row for this thread.
|
||||||
|
mywork = work;
|
||||||
|
|
||||||
|
// Always exit here, we might not have actually scheduled
|
||||||
|
// the maximum permitted threads.
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Kick off the thread.
|
||||||
|
pthread_create(thread, NULL, chunk_composite_worker, work);
|
||||||
|
|
||||||
|
// Save the row so we can access it for scheduling.
|
||||||
|
workers.push_back(work);
|
||||||
|
|
||||||
|
// The next chunk of work is the next step.
|
||||||
|
imgy += step;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now, run my own work.
|
||||||
|
chunk_composite_fast(mywork);
|
||||||
|
|
||||||
|
// Join on all threads once they're finished.
|
||||||
|
std::list<work_t *>::iterator work = workers.begin();
|
||||||
|
|
||||||
|
while(work != workers.end()) {
|
||||||
|
// Join the thread.
|
||||||
|
pthread_join(*((*work)->thread), NULL);
|
||||||
|
|
||||||
|
// Free the memory we allocated.
|
||||||
|
free((*work)->thread);
|
||||||
|
free((*work));
|
||||||
|
|
||||||
|
// Remove it from our bookkeeping.
|
||||||
|
work = workers.erase(work);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Free the memory we allocated.
|
||||||
|
free(mywork);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
Loading…
Reference in New Issue
Block a user