#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#include <list>
#include <vector>

// Minimum number of image rows handed to a single worker thread.
#define MIN_THREAD_WORK 10

// Anti-aliasing modes accepted in work_t::aa_mode / composite_fast(aa_mode).
#define AA_MODE_NONE 0
#define AA_MODE_UNSCALED_SSAA_ONLY 1
#define AA_MODE_SSAA_ONLY 2
#define AA_MODE_SSAA_OR_BILINEAR 3

extern "C"
{
    // One pixel with 8-bit channels, laid out as r, g, b, a.
    typedef struct intcolor {
        unsigned char r;
        unsigned char g;
        unsigned char b;
        unsigned char a;
    } intcolor_t;

    // A color expressed as per-channel floating point factors/offsets.
    typedef struct floatcolor {
        double r;
        double g;
        double b;
        double a;
    } floatcolor_t;

    // A 3D point. Omitted trailing members of an initializer are zero.
    typedef struct point {
        double x;
        double y;
        double z;

        // Component-wise sum of this point and another.
        struct point add(struct point other) const {
            return (struct point){
                x + other.x,
                y + other.y,
                z + other.z,
            };
        };
    } point_t;

    // A 4x3 transform: three rows of linear coefficients plus a translation row (a41..a43).
    typedef struct matrix {
        double a11;
        double a12;
        double a13;
        double a21;
        double a22;
        double a23;
        double a31;
        double a32;
        double a33;
        double a41;
        double a42;
        double a43;

        // Transform a point as a row vector: result = point * M + (a41, a42, a43).
        point_t multiply_point(point_t point) const {
            return (point_t){
                (a11 * point.x) + (a21 * point.y) + (a31 * point.z) + a41,
                (a12 * point.x) + (a22 * point.y) + (a32 * point.z) + a42,
                (a13 * point.x) + (a23 * point.y) + (a33 * point.z) + a43,
            };
        }
    } matrix_t;

    // Everything a worker needs to composite the rectangle [minx, maxx) x [miny, maxy)
    // of the destination image from the texture.
    typedef struct work {
        intcolor_t *imgdata;
        unsigned char *maskdata;
        unsigned int imgwidth;
        unsigned int imgheight;
        unsigned int minx;
        unsigned int maxx;
        unsigned int miny;
        unsigned int maxy;
        intcolor_t *texdata;
        unsigned int texwidth;
        unsigned int texheight;
        double xscale;
        double yscale;
        matrix_t inverse;
        int use_perspective;
        floatcolor_t add_color;
        floatcolor_t mult_color;
        int blendfunc;
        pthread_t *thread;
        int aa_mode;
    } work_t;

    // Round a channel value and saturate it into 0..255. NaN becomes 0 because
    // fmax() returns its non-NaN argument. Note that roundf() narrows the value
    // to float before rounding.
    inline unsigned char clamp(double color) {
        return fmin(fmax(0.0, roundf(color)), 255.0);
    }

    intcolor_t blend_normal(
        intcolor_t dest,
        intcolor_t src
    ) {
        // "Normal" blend mode, which is just alpha blending. Various games use the DX
        // equation Src * As + Dst * (1 - As). We premultiply Dst by Ad as well, since
        // we are blitting onto a destination that could have transparency. Once we are
        // done, we divide out the premultiplied Ad in order to put the pixels back to
        // their full blended values since we are not setting the destination alpha to 1.0.
        // This enables partial transparent backgrounds to work properly.

        // Short circuit for speed.
        if (src.a == 0) {
            return dest;
        }
        if (src.a == 255) {
            return src;
        }

        // Calculate alpha blending. Since src.a is nonzero here, new_alpha is also
        // nonzero, so the divisions below are safe.
        double srcpercent = src.a / 255.0;
        double destpercent = dest.a / 255.0;
        double srcremainder = 1.0 - srcpercent;
        double new_alpha = fmin(fmax(0.0, srcpercent + destpercent * srcremainder), 1.0);
        return (intcolor_t){
            clamp(((dest.r * destpercent * srcremainder) + (src.r * srcpercent)) / new_alpha),
            clamp(((dest.g * destpercent * srcremainder) + (src.g * srcpercent)) / new_alpha),
            clamp(((dest.b * destpercent * srcremainder) + (src.b * srcpercent)) / new_alpha),
            clamp(255 * new_alpha)
        };
    }

    intcolor_t blend_addition(
        intcolor_t dest,
        intcolor_t src
    ) {
        // "Addition" blend mode, which is used for fog/clouds/etc. Various games use the DX
        // equation Src * As + Dst * 1. It appears jubeat does not premultiply the source
        // by its alpha component.

        // Short circuit for speed.
        if (src.a == 0) {
            return dest;
        }

        // Calculate final color blending.
        double srcpercent = src.a / 255.0;
        return (intcolor_t){
            clamp(dest.r + (src.r * srcpercent)),
            clamp(dest.g + (src.g * srcpercent)),
            clamp(dest.b + (src.b * srcpercent)),
            // Additive blending doesn't actually make sense on semi-transparent destinations,
            // as that implies that the semi-transparent pixel will be later displayed on top
            // of something else. That doesn't work since additive blending needs to non-linearly
            // mix with the destination. So, in reality, we should be doing what subtractive
            // blending does and keeping the destination alpha (which should always be 255),
            // but if somebody renders an animation with additive blending meant to go over a
            // background onto a transparent or semi-transparent background this will make the
            // resulting graphic look more correct.
            clamp(dest.a + (255 * srcpercent)),
        };
    }

    intcolor_t blend_subtraction(
        intcolor_t dest,
        intcolor_t src
    ) {
        // "Subtraction" blend mode, used for darkening an image. Various games use the DX
        // equation Dst * 1 - Src * As. It appears jubeat does not premultiply the source
        // by its alpha component much like the "additive" blend above.

        // Short circuit for speed.
        if (src.a == 0) {
            return dest;
        }

        // Calculate final color blending. The destination alpha is kept untouched.
        double srcpercent = src.a / 255.0;
        return (intcolor_t){
            clamp(dest.r - (src.r * srcpercent)),
            clamp(dest.g - (src.g * srcpercent)),
            clamp(dest.b - (src.b * srcpercent)),
            dest.a,
        };
    }

    intcolor_t blend_multiply(
        intcolor_t dest,
        intcolor_t src
    ) {
        // "Multiply" blend mode, used for darkening an image. Various games use the DX
        // equation Src * 0 + Dst * Src. It appears jubeat uses the alternative formula
        // Src * Dst + Dst * (1 - As) which reduces to the first equation as long as the
        // source alpha is always 255.

        // Calculate final color blending. The destination alpha is kept untouched.
        double src_alpha = src.a / 255.0;
        double src_remainder = 1.0 - src_alpha;
        return (intcolor_t){
            clamp((255 * ((dest.r / 255.0) * (src.r / 255.0) * src_alpha)) + (dest.r * src_remainder)),
            clamp((255 * ((dest.g / 255.0) * (src.g / 255.0) * src_alpha)) + (dest.g * src_remainder)),
            clamp((255 * ((dest.b / 255.0) * (src.b / 255.0) * src_alpha)) + (dest.b * src_remainder)),
            dest.a,
        };
    }

    intcolor_t blend_mask_create(
        intcolor_t dest,
        intcolor_t src
    ) {
        // Mask creating just allows a pixel to be drawn if the source image has a nonzero
        // alpha, according to the SWF spec. The destination is ignored entirely.
        if (src.a != 0) {
            return (intcolor_t){255, 0, 0, 255};
        } else {
            return (intcolor_t){0, 0, 0, 0};
        }
    }

    intcolor_t blend_mask_combine(
        intcolor_t dest,
        intcolor_t src
    ) {
        // Mask blending just takes the source and destination and ands them together, making
        // a final mask that is the intersection of the original mask and the new mask. The
        // reason we even have a color component to this is for debugging visibility.
        if (dest.a != 0 && src.a != 0) {
            return (intcolor_t){255, 0, 0, 255};
        } else {
            return (intcolor_t){0, 0, 0, 0};
        }
    }

    // Apply the multiplicative and additive color transform to src_color, then blend the
    // result onto dest_color using the blend routine selected by blendfunc. Any blendfunc
    // value without a dedicated routine below falls back to normal alpha blending.
    intcolor_t blend_point(
        floatcolor_t add_color,
        floatcolor_t mult_color,
        intcolor_t src_color,
        intcolor_t dest_color,
        int blendfunc
    ) {
        // Calculate multiplicative and additive colors against the source.
        src_color = (intcolor_t){
            clamp((src_color.r * mult_color.r) + (255 * add_color.r)),
            clamp((src_color.g * mult_color.g) + (255 * add_color.g)),
            clamp((src_color.b * mult_color.b) + (255 * add_color.b)),
            clamp((src_color.a * mult_color.a) + (255 * add_color.a)),
        };

        if (blendfunc == 3) {
            return blend_multiply(dest_color, src_color);
        }
        // TODO: blend mode 4, which is "screen" blending according to SWF references. I've only seen this
        // in Jubeat and it implements it using OpenGL equation Src * (1 - Dst) + Dst * 1.
        // TODO: blend mode 5, which is "lighten" blending according to SWF references. Jubeat does not
        // premultiply by alpha, but the GL/DX equation is max(Src * As, Dst * 1).
        // TODO: blend mode 6, which is "darken" blending according to SWF references. Jubeat does not
        // premultiply by alpha, but the GL/DX equation is min(Src * As, Dst * 1).
        // TODO: blend mode 10, which is "invert" according to SWF references. The only game I could find
        // that implemented this had equation Src * (1 - Dst) + Dst * (1 - As).
        // TODO: blend mode 13, which is "overlay" according to SWF references. The equation seems to be
        // Src * Dst + Dst * Src but Jubeat thinks it should be Src * Dst + Dst * (1 - As).
        if (blendfunc == 8) {
            return blend_addition(dest_color, src_color);
        }
        if (blendfunc == 9 || blendfunc == 70) {
            return blend_subtraction(dest_color, src_color);
        }
        if (blendfunc == 256) {
            return blend_mask_combine(dest_color, src_color);
        }
        if (blendfunc == 257) {
            return blend_mask_create(dest_color, src_color);
        }
        // TODO: blend mode 75, which is not in the SWF spec and appears to have the equation
        // Src * (1 - Dst) + Dst * (1 - Src).
        return blend_normal(dest_color, src_color);
    }

    // Map an image-space location through the inverse transform into texture space.
    // With use_perspective set, the result is divided by the transformed z, and the
    // location is rejected (returns 0, outputs untouched) unless that z is positive.
    // Returns 1 and fills *texx / *texy on success.
    static inline int texture_location(const work_t *work, double x, double y, double *texx, double *texy) {
        point_t loc = work->inverse.multiply_point((point_t){x, y});
        if (work->use_perspective) {
            if (!(loc.z > 0.0)) {
                return 0;
            }
            *texx = loc.x / loc.z;
            *texy = loc.y / loc.z;
        } else {
            *texx = loc.x;
            *texy = loc.y;
        }
        return 1;
    }

    // Bilinearly interpolate the texture at (fx, fy), weighting each texel by its alpha.
    // The caller must guarantee that the texel at (int)fx, (int)fy and its right, lower
    // and lower-right neighbors are all inside the texture.
    static intcolor_t sample_bilinear(const work_t *work, double fx, double fy) {
        // Calculate the pixel we're after, and what percentage into the pixel we are.
        int aax = (int)fx;
        int aay = (int)fy;
        double aaxrem = fx - (double)aax;
        double aayrem = fy - (double)aay;

        // Find the four pixels that we can interpolate from. The first number is the x, and second is y.
        unsigned int tex00 = aax + (aay * work->texwidth);
        unsigned int tex10 = tex00 + 1;
        unsigned int tex01 = aax + ((aay + 1) * work->texwidth);
        unsigned int tex11 = tex01 + 1;
        const intcolor_t p00 = work->texdata[tex00];
        const intcolor_t p10 = work->texdata[tex10];
        const intcolor_t p01 = work->texdata[tex01];
        const intcolor_t p11 = work->texdata[tex11];

        // Calculate various scaling factors based on alpha and percentage.
        double tex00percent = p00.a / 255.0;
        double tex10percent = p10.a / 255.0;
        double tex01percent = p01.a / 255.0;
        double tex11percent = p11.a / 255.0;

        double y0percent = (tex00percent * (1.0 - aaxrem)) + (tex10percent * aaxrem);
        double y1percent = (tex01percent * (1.0 - aaxrem)) + (tex11percent * aaxrem);
        double finalpercent = (y0percent * (1.0 - aayrem)) + (y1percent * aayrem);

        if (finalpercent <= 0.0) {
            // This pixel would be blank, so we avoid dividing by zero.
            return (intcolor_t){255, 255, 255, 0};
        }

        // Interpolate in the X direction on both Y axis.
        double y0r = ((p00.r * tex00percent * (1.0 - aaxrem)) + (p10.r * tex10percent * aaxrem));
        double y0g = ((p00.g * tex00percent * (1.0 - aaxrem)) + (p10.g * tex10percent * aaxrem));
        double y0b = ((p00.b * tex00percent * (1.0 - aaxrem)) + (p10.b * tex10percent * aaxrem));

        double y1r = ((p01.r * tex01percent * (1.0 - aaxrem)) + (p11.r * tex11percent * aaxrem));
        double y1g = ((p01.g * tex01percent * (1.0 - aaxrem)) + (p11.g * tex11percent * aaxrem));
        double y1b = ((p01.b * tex01percent * (1.0 - aaxrem)) + (p11.b * tex11percent * aaxrem));

        // Now interpolate the Y direction to get the final pixel value.
        return (intcolor_t){
            (unsigned char)(((y0r * (1.0 - aayrem)) + (y1r * aayrem)) / finalpercent),
            (unsigned char)(((y0g * (1.0 - aayrem)) + (y1g * aayrem)) / finalpercent),
            (unsigned char)(((y0b * (1.0 - aayrem)) + (y1b * aayrem)) / finalpercent),
            (unsigned char)(finalpercent * 255),
        };
    }

    // Supersample the texture around image pixel (imgx, imgy), stepping half a swing at a
    // time across [0.5 - swing, 0.5 + swing] on each axis. Returns 0 when no sample
    // contributed any color (the destination pixel must then be left alone), otherwise
    // stores the averaged, un-premultiplied color in *average and returns 1.
    static int sample_ssaa(
        const work_t *work,
        unsigned int imgx,
        unsigned int imgy,
        double xswing,
        double yswing,
        intcolor_t *average
    ) {
        int r = 0;
        int g = 0;
        int b = 0;
        int a = 0;
        int count = 0;
        int denom = 0;

        for (double addy = 0.5 - yswing; addy <= 0.5 + yswing; addy += yswing / 2.0) {
            for (double addx = 0.5 - xswing; addx <= 0.5 + xswing; addx += xswing / 2.0) {
                double xloc = (double)imgx + addx;
                double yloc = (double)imgy + addy;

                // Samples that fall outside the destination image are ignored completely.
                if (xloc < 0.0 || yloc < 0.0 || xloc >= (double)work->imgwidth || yloc >= (double)work->imgheight) {
                    continue;
                }

                int aax = -1;
                int aay = -1;
                double tx;
                double ty;
                if (texture_location(work, xloc, yloc, &tx, &ty)) {
                    aax = (int)tx;
                    aay = (int)ty;
                }

                // If we're out of bounds, don't update. Factor this in, however, so we can get partial
                // transparency to the pixel that is already there.
                denom++;
                if (aax < 0 || aay < 0 || aax >= (int)work->texwidth || aay >= (int)work->texheight) {
                    continue;
                }

                // Grab the values to average, for SSAA. Make sure to factor in alpha as a poor-man's
                // blend to ensure that partial transparency pixel values don't unnecessarily factor
                // into average calculations.
                const intcolor_t texel = work->texdata[aax + (aay * work->texwidth)];

                // If this is a fully transparent pixel, the below formulas work out to adding nothing
                // so we should skip this altogether.
                if (texel.a == 0) {
                    continue;
                }

                double apercent = texel.a / 255.0;
                r += (int)(texel.r * apercent);
                g += (int)(texel.g * apercent);
                b += (int)(texel.b * apercent);
                a += texel.a;
                count++;
            }
        }

        if (count == 0) {
            // None of the samples existed in-bounds.
            return 0;
        }

        // Average the pixels. Make sure to divide out the alpha in preparation for blending.
        // denom is at least count here, so it is nonzero.
        unsigned char alpha = (unsigned char)(a / denom);
        if (alpha == 0) {
            // Samples existed in bounds, but the averaged alpha rounds down to zero.
            *average = (intcolor_t){255, 255, 255, 0};
        } else {
            // Samples existed in bounds, with some alpha component, un-premultiply it.
            double apercent = alpha / 255.0;
            *average = (intcolor_t){
                (unsigned char)((r / denom) / apercent),
                (unsigned char)((g / denom) / apercent),
                (unsigned char)((b / denom) / apercent),
                alpha,
            };
        }
        return 1;
    }

    // Composite the texture onto the destination rectangle described by work, honoring the
    // optional mask, the selected anti-aliasing mode and the blend function.
    void chunk_composite_fast(work_t *work) {
        // Regardless of AA work, calculate the transform matrix for determining the stride for AA pixel lookups, since it
        // costs us almost nothing. Essentially what we're doing here is calculating the scale, clamping it at 1.0 as the
        // minimum and then setting the AA sample swing accordingly. This has the effect of anti-aliasing scaled up images
        // a bit softer than would otherwise be achieved.
        double xswing;
        double yswing;

        if (work->aa_mode == AA_MODE_UNSCALED_SSAA_ONLY) {
            xswing = 0.5;
            yswing = 0.5;
        } else {
            xswing = 0.5 * fmax(1.0, work->xscale);
            yswing = 0.5 * fmax(1.0, work->yscale);
        }

        for (unsigned int imgy = work->miny; imgy < work->maxy; imgy++) {
            for (unsigned int imgx = work->minx; imgx < work->maxx; imgx++) {
                // Determine offset.
                unsigned int imgoff = imgx + (imgy * work->imgwidth);

                // If we are masked off, don't do any other calculations.
                if (work->maskdata != NULL && work->maskdata[imgoff] == 0) {
                    // This pixel is masked off!
                    continue;
                }

                intcolor_t source;
                if (work->aa_mode != AA_MODE_NONE) {
                    // First, figure out if we can use bilinear resampling. It is only used when
                    // scaling up, and only when the texel actually being sampled (after the
                    // perspective divide, if any) has all four neighbors inside the texture.
                    // Checking the same coordinates that get sampled keeps the lookups in
                    // sample_bilinear() in bounds.
                    int bilinear = 0;
                    double fx = 0.0;
                    double fy = 0.0;
                    if (
                        work->aa_mode == AA_MODE_SSAA_OR_BILINEAR &&
                        work->xscale >= 1.0 &&
                        work->yscale >= 1.0 &&
                        texture_location(work, (double)(imgx + 0.5), (double)(imgy + 0.5), &fx, &fy)
                    ) {
                        int aax = (int)fx;
                        int aay = (int)fy;
                        if (aax > 0 && aay > 0 && aax < ((int)work->texwidth - 1) && aay < ((int)work->texheight - 1)) {
                            bilinear = 1;
                        }
                    }

                    // Now perform the desired AA operation.
                    if (bilinear) {
                        source = sample_bilinear(work, fx, fy);
                    } else if (!sample_ssaa(work, imgx, imgy, xswing, yswing, &source)) {
                        // Nothing to draw for this pixel.
                        continue;
                    }
                } else {
                    // Grab the center of the pixel to get the color.
                    double tx;
                    double ty;
                    if (!texture_location(work, (double)imgx + (double)0.5, (double)imgy + (double)0.5, &tx, &ty)) {
                        continue;
                    }
                    int texx = (int)tx;
                    int texy = (int)ty;

                    // If we're out of bounds, don't update.
                    if (texx < 0 || texy < 0 || texx >= (int)work->texwidth || texy >= (int)work->texheight) {
                        continue;
                    }

                    source = work->texdata[texx + (texy * work->texwidth)];
                }

                // Blend it.
                work->imgdata[imgoff] = blend_point(work->add_color, work->mult_color, source, work->imgdata[imgoff], work->blendfunc);
            }
        }
    }

    // pthread entry point: arg is the work_t describing the chunk to composite.
    void *chunk_composite_worker(void *arg) {
        work_t *work = (work_t *)arg;
        chunk_composite_fast(work);
        return NULL;
    }

    // Composite the texture in texbytes onto the rectangle [minx, maxx) x [miny, maxy) of the
    // image in imgbytes. Both buffers are interpreted as arrays of intcolor_t (four bytes per
    // pixel); maskbytes may be NULL, otherwise it holds one byte per image pixel and a zero
    // byte leaves that pixel untouched. When threads is greater than one and there are at
    // least MIN_THREAD_WORK * 2 rows, the rows are split into bands that are rendered by
    // pthreads, with the final band rendered on the calling thread. Always returns 0.
    int composite_fast(
        unsigned char *imgbytes,
        unsigned char *maskbytes,
        unsigned int imgwidth,
        unsigned int imgheight,
        unsigned int minx,
        unsigned int maxx,
        unsigned int miny,
        unsigned int maxy,
        floatcolor_t add_color,
        floatcolor_t mult_color,
        double xscale,
        double yscale,
        matrix_t inverse,
        int use_perspective,
        int blendfunc,
        unsigned char *texbytes,
        unsigned int texwidth,
        unsigned int texheight,
        unsigned int threads,
        unsigned int aa_mode
    ) {
        // An empty or inverted rectangle has no pixels to draw. Bailing out here also keeps
        // the unsigned row math below from wrapping around.
        if (maxy <= miny || maxx <= minx) {
            return 0;
        }

        // Describe the whole job once; per-thread chunks are copies with their own row range.
        work_t whole;
        whole.imgdata = (intcolor_t *)imgbytes;
        whole.maskdata = maskbytes;
        whole.imgwidth = imgwidth;
        whole.imgheight = imgheight;
        whole.minx = minx;
        whole.maxx = maxx;
        whole.miny = miny;
        whole.maxy = maxy;
        whole.texdata = (intcolor_t *)texbytes;
        whole.texwidth = texwidth;
        whole.texheight = texheight;
        whole.xscale = xscale;
        whole.yscale = yscale;
        whole.inverse = inverse;
        whole.use_perspective = use_perspective;
        whole.add_color = add_color;
        whole.mult_color = mult_color;
        whole.blendfunc = blendfunc;
        whole.thread = NULL;
        whole.aa_mode = aa_mode;

        // A thread count of zero is treated like one so that we never divide by it.
        const unsigned int rows = maxy - miny;
        if (threads <= 1 || rows < (MIN_THREAD_WORK * 2)) {
            chunk_composite_fast(&whole);
            return 0;
        }

        // Make sure each thread has a minimum amount of work so we don't waste pthread
        // overhead starting and stopping it.
        unsigned int step = rows / threads;
        if (step < MIN_THREAD_WORK) {
            step = MIN_THREAD_WORK;
        }

        // The last band always belongs to the calling thread and consumes the rest of the
        // rows, so we spawn one thread fewer than the number of bands. We might not need
        // the maximum permitted threads if the minimum band size limits us first.
        unsigned int bands = (rows / step) + ((rows % step) != 0 ? 1 : 0);
        if (bands > threads) {
            bands = threads;
        }
        const unsigned int spawned = bands - 1;

        // Both vectors are sized up front and never resized, so the element addresses
        // handed to the threads stay valid until we join them.
        std::vector<work_t> chunks(spawned, whole);
        std::vector<pthread_t> handles(spawned);

        unsigned int imgy = miny;
        for (unsigned int i = 0; i < spawned; i++) {
            chunks[i].miny = imgy;
            chunks[i].maxy = imgy + step;
            imgy += step;

            // Kick off the thread. If it cannot be created, remember that by clearing the
            // handle pointer so the band is rendered on this thread instead of being lost.
            chunks[i].thread = &handles[i];
            if (pthread_create(&handles[i], NULL, chunk_composite_worker, &chunks[i]) != 0) {
                chunks[i].thread = NULL;
            }
        }

        // Now, run my own work: everything from the end of the last spawned band onward.
        work_t mine = whole;
        mine.miny = imgy;
        chunk_composite_fast(&mine);

        // Bands never overlap, so any band whose thread failed to start can safely be
        // rendered here while the remaining threads are still running.
        for (unsigned int i = 0; i < spawned; i++) {
            if (chunks[i].thread == NULL) {
                chunk_composite_fast(&chunks[i]);
            }
        }

        // Join on all threads once they're finished.
        for (unsigned int i = 0; i < spawned; i++) {
            if (chunks[i].thread != NULL) {
                pthread_join(*(chunks[i].thread), NULL);
            }
        }

        return 0;
    }
}