1
0
mirror of synced 2025-01-18 22:24:04 +01:00

Initial implementation of C++ affine renderer for massive speed boost.

This commit is contained in:
Jennifer Taylor 2021-05-19 16:25:13 +00:00
parent b02c4292be
commit 48e9c59513
6 changed files with 670 additions and 325 deletions

View File

@ -6,203 +6,279 @@ from typing import Any, List, Sequence, Tuple
from .types.generic import Color, Matrix, Point
def clamp(color: float) -> int:
    # Round the channel to the nearest integer, then pin it into the valid
    # 0-255 byte range so it can be stored as a color component.
    rounded = round(color)
    if rounded < 0:
        return 0
    if rounded > 255:
        return 255
    return rounded
# If we compiled the faster cython code, we can use it instead!
try:
from .blendalt import affine_composite
except ImportError:
def clamp(color: float) -> int:
return min(max(0, round(color)), 255)
def blend_normal(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
) -> Sequence[int]:
# "Normal" blend mode, which is just alpha blending. Various games use the DX
# equation Src * As + Dst * (1 - As). We premultiply Dst by Ad as well, since
# we are blitting onto a destination that could have transparency. Once we are
# done, we divide out the premultiplied Ad in order to put the pixes back to
# their full blended values since we are not setting the destination alpha to 1.0.
# This enables partial transparent backgrounds to work properly.
def blend_normal(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
# A pre-scaled color where all values are 0.0-1.0, used to calculate the final color.
mult_color: Color,
# A RGBA color tuple where all values are 0-255, used to calculate the final color.
add_color: Tuple[int, int, int, int],
) -> Sequence[int]:
# "Normal" blend mode, which is just alpha blending. Various games use the DX
# equation Src * As + Dst * (1 - As). We premultiply Dst by Ad as well, since
# we are blitting onto a destination that could have transparency. Once we are
# done, we divide out the premultiplied Ad in order to put the pixes back to
# their full blended values since we are not setting the destination alpha to 1.0.
# This enables partial transparent backgrounds to work properly.
# Short circuit for speed.
if src[3] == 0:
return dest
if src[3] == 255:
return src
# Calculate multiplicative and additive colors against the source.
src = (
clamp((src[0] * mult_color.r) + add_color[0]),
clamp((src[1] * mult_color.g) + add_color[1]),
clamp((src[2] * mult_color.b) + add_color[2]),
clamp((src[3] * mult_color.a) + add_color[3]),
)
# Calculate alpha blending.
srcpercent = src[3] / 255.0
destpercent = dest[3] / 255.0
srcremaineder = 1.0 - srcpercent
new_alpha = (srcpercent + destpercent * srcremaineder)
return (
clamp(((dest[0] * destpercent * srcremaineder) + (src[0] * srcpercent)) / new_alpha),
clamp(((dest[1] * destpercent * srcremaineder) + (src[1] * srcpercent)) / new_alpha),
clamp(((dest[2] * destpercent * srcremaineder) + (src[2] * srcpercent)) / new_alpha),
clamp(255 * new_alpha)
)
# Short circuit for speed.
if src[3] == 0:
return dest
if src[3] == 255:
return src
def blend_addition(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
) -> Sequence[int]:
# "Addition" blend mode, which is used for fog/clouds/etc. Various games use the DX
# equation Src * As + Dst * 1. It appears jubeat does not premultiply the source
# by its alpha component.
# Calculate alpha blending.
srcpercent = src[3] / 255.0
destpercent = dest[3] / 255.0
srcremaineder = 1.0 - srcpercent
new_alpha = (srcpercent + destpercent * srcremaineder)
return (
clamp(((dest[0] * destpercent * srcremaineder) + (src[0] * srcpercent)) / new_alpha),
clamp(((dest[1] * destpercent * srcremaineder) + (src[1] * srcpercent)) / new_alpha),
clamp(((dest[2] * destpercent * srcremaineder) + (src[2] * srcpercent)) / new_alpha),
clamp(255 * new_alpha)
)
# Short circuit for speed.
if src[3] == 0:
return dest
# Calculate final color blending.
srcpercent = src[3] / 255.0
return (
clamp(dest[0] + (src[0] * srcpercent)),
clamp(dest[1] + (src[1] * srcpercent)),
clamp(dest[2] + (src[2] * srcpercent)),
dest[3],
)
def blend_addition(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
# A pre-scaled color where all values are 0.0-1.0, used to calculate the final color.
mult_color: Color,
# A RGBA color tuple where all values are 0-255, used to calculate the final color.
add_color: Tuple[int, int, int, int],
) -> Sequence[int]:
# "Addition" blend mode, which is used for fog/clouds/etc. Various games use the DX
# equation Src * As + Dst * 1. It appears jubeat does not premultiply the source
# by its alpha component.
def blend_subtraction(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
) -> Sequence[int]:
# "Subtraction" blend mode, used for darkening an image. Various games use the DX
# equation Dst * 1 - Src * As. It appears jubeat does not premultiply the source
# by its alpha component much like the "additive" blend above..
# Calculate multiplicative and additive colors against the source.
src = (
clamp((src[0] * mult_color.r) + add_color[0]),
clamp((src[1] * mult_color.g) + add_color[1]),
clamp((src[2] * mult_color.b) + add_color[2]),
clamp((src[3] * mult_color.a) + add_color[3]),
)
# Short circuit for speed.
if src[3] == 0:
return dest
# Short circuit for speed.
if src[3] == 0:
return dest
# Calculate final color blending.
srcpercent = src[3] / 255.0
return (
clamp(dest[0] - (src[0] * srcpercent)),
clamp(dest[1] - (src[1] * srcpercent)),
clamp(dest[2] - (src[2] * srcpercent)),
dest[3],
)
# Calculate final color blending.
srcpercent = src[3] / 255.0
return (
clamp(dest[0] + (src[0] * srcpercent)),
clamp(dest[1] + (src[1] * srcpercent)),
clamp(dest[2] + (src[2] * srcpercent)),
dest[3],
)
def blend_multiply(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
) -> Sequence[int]:
# "Multiply" blend mode, used for darkening an image. Various games use the DX
# equation Src * 0 + Dst * Src. It appears jubeat uses the alternative formula
# Src * Dst + Dst * (1 - As) which reduces to the first equation as long as the
# source alpha is always 255.
# Calculate final color blending.
return (
clamp(255 * ((dest[0] / 255.0) * (src[0] / 255.0))),
clamp(255 * ((dest[1] / 255.0) * (src[1] / 255.0))),
clamp(255 * ((dest[2] / 255.0) * (src[2] / 255.0))),
dest[3],
)
def blend_subtraction(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
# A pre-scaled color where all values are 0.0-1.0, used to calculate the final color.
mult_color: Color,
# A RGBA color tuple where all values are 0-255, used to calculate the final color.
add_color: Tuple[int, int, int, int],
) -> Sequence[int]:
# "Subtraction" blend mode, used for darkening an image. Various games use the DX
# equation Dst * 1 - Src * As. It appears jubeat does not premultiply the source
# by its alpha component much like the "additive" blend above..
def affine_composite(
img: Image.Image,
add_color: Tuple[int, int, int, int],
mult_color: Color,
transform: Matrix,
origin: Point,
blendfunc: int,
texture: Image.Image,
single_threaded: bool = False,
) -> Image.Image:
# Calculate the inverse so we can map canvas space back to texture space.
try:
inverse = transform.inverse()
except ZeroDivisionError:
# If this happens, that means one of the scaling factors was zero, making
# this object invisible. We can ignore this since the object should not
# be drawn.
print(f"WARNING: Transform Matrix {transform} has zero scaling factor, making it non-invertible!")
return img
# Calculate multiplicative and additive colors against the source.
src = (
clamp((src[0] * mult_color.r) + add_color[0]),
clamp((src[1] * mult_color.g) + add_color[1]),
clamp((src[2] * mult_color.b) + add_color[2]),
clamp((src[3] * mult_color.a) + add_color[3]),
)
# Warn if we have an unsupported blend.
if blendfunc not in {0, 2, 3, 8, 9, 70}:
print(f"WARNING: Unsupported blend {blendfunc}")
return img
# Short circuit for speed.
if src[3] == 0:
return dest
# These are calculated properties and caching them outside of the loop
# speeds things up a bit.
imgwidth = img.width
imgheight = img.height
texwidth = texture.width
texheight = texture.height
# Calculate final color blending.
srcpercent = src[3] / 255.0
return (
clamp(dest[0] - (src[0] * srcpercent)),
clamp(dest[1] - (src[1] * srcpercent)),
clamp(dest[2] - (src[2] * srcpercent)),
dest[3],
)
# Calculate the maximum range of update this texture can possibly reside in.
pix1 = transform.multiply_point(Point.identity().subtract(origin))
pix2 = transform.multiply_point(Point.identity().subtract(origin).add(Point(texwidth, 0)))
pix3 = transform.multiply_point(Point.identity().subtract(origin).add(Point(0, texheight)))
pix4 = transform.multiply_point(Point.identity().subtract(origin).add(Point(texwidth, texheight)))
# Map this to the rectangle we need to sweep in the rendering image.
minx = max(int(min(pix1.x, pix2.x, pix3.x, pix4.x)), 0)
maxx = min(int(max(pix1.x, pix2.x, pix3.x, pix4.x)) + 1, imgwidth)
miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
def blend_multiply(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
# A pre-scaled color where all values are 0.0-1.0, used to calculate the final color.
mult_color: Color,
# A RGBA color tuple where all values are 0-255, used to calculate the final color.
add_color: Tuple[int, int, int, int],
) -> Sequence[int]:
# "Multiply" blend mode, used for darkening an image. Various games use the DX
# equation Src * 0 + Dst * Src. It appears jubeat uses the alternative formula
# Src * Dst + Dst * (1 - As) which reduces to the first equation as long as the
# source alpha is always 255.
if maxx <= 0 or maxy <= 0:
# This image is entirely off the screen!
return img
# Calculate multiplicative and additive colors against the source.
src = (
clamp((src[0] * mult_color.r) + add_color[0]),
clamp((src[1] * mult_color.g) + add_color[1]),
clamp((src[2] * mult_color.b) + add_color[2]),
clamp((src[3] * mult_color.a) + add_color[3]),
)
cores = multiprocessing.cpu_count()
if single_threaded or cores < 2:
# Get the data in an easier to manipulate and faster to update fashion.
imgmap = list(img.getdata())
texmap = list(texture.getdata())
# Calculate final color blending.
return (
clamp(255 * ((dest[0] / 255.0) * (src[0] / 255.0))),
clamp(255 * ((dest[1] / 255.0) * (src[1] / 255.0))),
clamp(255 * ((dest[2] / 255.0) * (src[2] / 255.0))),
dest[3],
)
# We don't have enough CPU cores to bother multiprocessing.
for imgy in range(miny, maxy):
for imgx in range(minx, maxx):
# Determine offset
imgoff = imgx + (imgy * imgwidth)
# Calculate what texture pixel data goes here.
texloc = inverse.multiply_point(Point(float(imgx), float(imgy))).add(origin)
texx, texy = texloc.as_tuple()
def affine_composite(
img: Image.Image,
add_color: Tuple[int, int, int, int],
mult_color: Color,
transform: Matrix,
inverse: Matrix,
origin: Point,
blendfunc: int,
texture: Image.Image,
single_threaded: bool = False,
) -> Image.Image:
# Warn if we have an unsupported blend.
if blendfunc not in {0, 2, 3, 8, 9, 70}:
print(f"WARNING: Unsupported blend {blendfunc}")
# If we're out of bounds, don't update.
if texx < 0 or texy < 0 or texx >= texwidth or texy >= texheight:
continue
# These are calculated properties and caching them outside of the loop
# speeds things up a bit.
imgwidth = img.width
imgheight = img.height
texwidth = texture.width
texheight = texture.height
# Blend it.
texoff = texx + (texy * texwidth)
imgmap[imgoff] = blend_point(add_color, mult_color, texmap[texoff], imgmap[imgoff], blendfunc)
# Calculate the maximum range of update this texture can possibly reside in.
pix1 = transform.multiply_point(Point.identity().subtract(origin))
pix2 = transform.multiply_point(Point.identity().subtract(origin).add(Point(texwidth, 0)))
pix3 = transform.multiply_point(Point.identity().subtract(origin).add(Point(0, texheight)))
pix4 = transform.multiply_point(Point.identity().subtract(origin).add(Point(texwidth, texheight)))
img.putdata(imgmap)
else:
imgbytes = img.tobytes('raw', 'RGBA')
texbytes = texture.tobytes('raw', 'RGBA')
# Map this to the rectangle we need to sweep in the rendering image.
minx = max(int(min(pix1.x, pix2.x, pix3.x, pix4.x)), 0)
maxx = min(int(max(pix1.x, pix2.x, pix3.x, pix4.x)) + 1, imgwidth)
miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
# Let's spread the load across multiple processors.
procs: List[multiprocessing.Process] = []
work: multiprocessing.Queue = multiprocessing.Queue()
results: multiprocessing.Queue = multiprocessing.Queue()
expected: int = 0
interrupted: bool = False
cores = multiprocessing.cpu_count()
if single_threaded or cores < 2:
# Get the data in an easier to manipulate and faster to update fashion.
imgmap = list(img.getdata())
texmap = list(texture.getdata())
def ctrlc(sig: Any, frame: Any) -> None:
nonlocal interrupted
interrupted = True
# We don't have enough CPU cores to bother multiprocessing.
for imgy in range(miny, maxy):
for imgx in range(minx, maxx):
original_handler = signal.getsignal(signal.SIGINT)
signal.signal(signal.SIGINT, ctrlc)
for _ in range(cores):
proc = multiprocessing.Process(
target=pixel_renderer,
args=(
work,
results,
minx,
maxx,
imgwidth,
texwidth,
texheight,
inverse,
origin,
add_color,
mult_color,
blendfunc,
imgbytes,
texbytes,
),
)
procs.append(proc)
proc.start()
for imgy in range(miny, maxy):
work.put(imgy)
expected += 1
lines: List[bytes] = [
imgbytes[x:(x + (imgwidth * 4))]
for x in range(
0,
imgwidth * imgheight * 4,
imgwidth * 4,
)
]
for _ in range(expected):
imgy, result = results.get()
lines[imgy] = result
for proc in procs:
work.put(None)
for proc in procs:
proc.join()
signal.signal(signal.SIGINT, original_handler)
if interrupted:
raise KeyboardInterrupt()
img = Image.frombytes('RGBA', (imgwidth, imgheight), b''.join(lines))
return img
def pixel_renderer(
work: multiprocessing.Queue,
results: multiprocessing.Queue,
minx: int,
maxx: int,
imgwidth: int,
texwidth: int,
texheight: int,
inverse: Matrix,
origin: Point,
add_color: Tuple[int, int, int, int],
mult_color: Color,
blendfunc: int,
imgbytes: bytes,
texbytes: bytes,
) -> None:
while True:
imgy = work.get()
if imgy is None:
return
result: List[Sequence[int]] = []
for imgx in range(imgwidth):
# Determine offset
imgoff = imgx + (imgy * imgwidth)
if imgx < minx or imgx >= maxx:
result.append(imgbytes[(imgoff * 4):((imgoff + 1) * 4)])
continue
# Calculate what texture pixel data goes here.
texloc = inverse.multiply_point(Point(float(imgx), float(imgy))).add(origin)
@ -210,155 +286,50 @@ def affine_composite(
# If we're out of bounds, don't update.
if texx < 0 or texy < 0 or texx >= texwidth or texy >= texheight:
result.append(imgbytes[(imgoff * 4):((imgoff + 1) * 4)])
continue
# Blend it.
texoff = texx + (texy * texwidth)
imgmap[imgoff] = affine_blend_impl(add_color, mult_color, texmap[texoff], imgmap[imgoff], blendfunc)
result.append(blend_point(add_color, mult_color, texbytes[(texoff * 4):((texoff + 1) * 4)], imgbytes[(imgoff * 4):((imgoff + 1) * 4)], blendfunc))
img.putdata(imgmap)
else:
imgbytes = img.tobytes('raw', 'RGBA')
texbytes = texture.tobytes('raw', 'RGBA')
linebytes = bytes([channel for pixel in result for channel in pixel])
results.put((imgy, linebytes))
# Let's spread the load across multiple processors.
procs: List[multiprocessing.Process] = []
work: multiprocessing.Queue = multiprocessing.Queue()
results: multiprocessing.Queue = multiprocessing.Queue()
expected: int = 0
interrupted: bool = False
def blend_point(
add_color: Tuple[int, int, int, int],
mult_color: Color,
# This should be a sequence of exactly 4 values, either bytes or a tuple.
src_color: Sequence[int],
# This should be a sequence of exactly 4 values, either bytes or a tuple.
dest_color: Sequence[int],
blendfunc: int,
) -> Sequence[int]:
# Calculate multiplicative and additive colors against the source.
src_color = (
clamp((src_color[0] * mult_color.r) + add_color[0]),
clamp((src_color[1] * mult_color.g) + add_color[1]),
clamp((src_color[2] * mult_color.b) + add_color[2]),
clamp((src_color[3] * mult_color.a) + add_color[3]),
)
def ctrlc(sig: Any, frame: Any) -> None:
nonlocal interrupted
interrupted = True
original_handler = signal.getsignal(signal.SIGINT)
signal.signal(signal.SIGINT, ctrlc)
for _ in range(cores):
proc = multiprocessing.Process(
target=pixel_renderer,
args=(
work,
results,
minx,
maxx,
imgwidth,
texwidth,
texheight,
inverse,
origin,
add_color,
mult_color,
blendfunc,
imgbytes,
texbytes,
),
)
procs.append(proc)
proc.start()
for imgy in range(miny, maxy):
work.put(imgy)
expected += 1
lines: List[bytes] = [
imgbytes[x:(x + (imgwidth * 4))]
for x in range(
0,
imgwidth * imgheight * 4,
imgwidth * 4,
)
]
for _ in range(expected):
imgy, result = results.get()
lines[imgy] = result
for proc in procs:
work.put(None)
for proc in procs:
proc.join()
signal.signal(signal.SIGINT, original_handler)
if interrupted:
raise KeyboardInterrupt()
img = Image.frombytes('RGBA', (imgwidth, imgheight), b''.join(lines))
return img
def pixel_renderer(
    work: multiprocessing.Queue,
    results: multiprocessing.Queue,
    minx: int,
    maxx: int,
    imgwidth: int,
    texwidth: int,
    texheight: int,
    inverse: Matrix,
    origin: Point,
    add_color: Tuple[int, int, int, int],
    mult_color: Color,
    blendfunc: int,
    imgbytes: bytes,
    texbytes: bytes,
) -> None:
    # Worker loop: repeatedly pulls a row index off the "work" queue, renders
    # that entire row of the destination image, and pushes (row index, row bytes)
    # onto the "results" queue. A value of None on the work queue tells the
    # worker to exit.
    #
    # imgbytes and texbytes are indexed four bytes per pixel below; the caller
    # is expected to supply raw RGBA data for the destination and the texture.
    while True:
        imgy = work.get()
        if imgy is None:
            # Sentinel received, no more rows to render.
            return

        # One 4-channel entry per pixel in this row, in left-to-right order.
        result: List[Sequence[int]] = []
        for imgx in range(imgwidth):
            # Determine offset
            imgoff = imgx + (imgy * imgwidth)

            # Columns outside of the [minx, maxx) sweep range cannot be touched by
            # the texture, so copy the existing destination pixel through untouched.
            if imgx < minx or imgx >= maxx:
                result.append(imgbytes[(imgoff * 4):((imgoff + 1) * 4)])
                continue

            # Calculate what texture pixel data goes here.
            texloc = inverse.multiply_point(Point(float(imgx), float(imgy))).add(origin)
            # NOTE(review): texx/texy are used as slice indices below, so as_tuple()
            # presumably returns integer coordinates -- confirm against Point.
            texx, texy = texloc.as_tuple()

            # If we're out of bounds, don't update.
            if texx < 0 or texy < 0 or texx >= texwidth or texy >= texheight:
                result.append(imgbytes[(imgoff * 4):((imgoff + 1) * 4)])
                continue

            # Blend it.
            texoff = texx + (texy * texwidth)
            result.append(affine_blend_impl(add_color, mult_color, texbytes[(texoff * 4):((texoff + 1) * 4)], imgbytes[(imgoff * 4):((imgoff + 1) * 4)], blendfunc))

        # Flatten the per-pixel channel sequences into one contiguous bytes object
        # for the row and hand it back tagged with the row index it belongs to.
        linebytes = bytes([channel for pixel in result for channel in pixel])
        results.put((imgy, linebytes))
def affine_blend_impl(
    add_color: Tuple[int, int, int, int],
    mult_color: Color,
    # This should be a sequence of exactly 4 values, either bytes or a tuple.
    src_color: Sequence[int],
    # This should be a sequence of exactly 4 values, either bytes or a tuple.
    dest_color: Sequence[int],
    blendfunc: int,
) -> Sequence[int]:
    # Pick the blend routine matching the requested blend mode, then apply it to
    # the destination/source pixel pair. Any mode without a dedicated routine
    # falls through to normal alpha blending.
    if blendfunc == 3:
        blender = blend_multiply
    # TODO: blend mode 4, which is "screen" blending according to SWF references. I've only seen this
    # in Jubeat and it implements it using OpenGL equation Src * (1 - Dst) + Dst * 1.
    # TODO: blend mode 5, which is "lighten" blending according to SWF references. Jubeat does not
    # premultiply by alpha, but the GL/DX equation is max(Src * As, Dst * 1).
    # TODO: blend mode 6, which is "darken" blending according to SWF references. Jubeat does not
    # premultiply by alpha, but the GL/DX equation is min(Src * As, Dst * 1).
    # TODO: blend mode 10, which is "invert" according to SWF references. The only game I could find
    # that implemented this had equation Src * (1 - Dst) + Dst * (1 - As).
    # TODO: blend mode 13, which is "overlay" according to SWF references. The equation seems to be
    # Src * Dst + Dst * Src but Jubeat thinks it should be Src * Dst + Dst * (1 - As).
    elif blendfunc == 8:
        blender = blend_addition
    elif blendfunc in (9, 70):
        blender = blend_subtraction
    # TODO: blend mode 75, which is not in the SWF spec and appears to have the equation
    # Src * (1 - Dst) + Dst * (1 - Src).
    else:
        blender = blend_normal

    return blender(dest_color, src_color, mult_color, add_color)
if blendfunc == 3:
return blend_multiply(dest_color, src_color)
# TODO: blend mode 4, which is "screen" blending according to SWF references. I've only seen this
# in Jubeat and it implements it using OpenGL equation Src * (1 - Dst) + Dst * 1.
# TODO: blend mode 5, which is "lighten" blending according to SWF references. Jubeat does not
# premultiply by alpha, but the GL/DX equation is max(Src * As, Dst * 1).
# TODO: blend mode 6, which is "darken" blending according to SWF references. Jubeat does not
# premultiply by alpha, but the GL/DX equation is min(Src * As, Dst * 1).
# TODO: blend mode 10, which is "invert" according to SWF references. The only game I could find
# that implemented this had equation Src * (1 - Dst) + Dst * (1 - As).
# TODO: blend mode 13, which is "overlay" according to SWF references. The equation seems to be
# Src * Dst + Dst * Src but Jubeat thinks it should be Src * Dst + Dst * (1 - As).
elif blendfunc == 8:
return blend_addition(dest_color, src_color)
elif blendfunc == 9 or blendfunc == 70:
return blend_subtraction(dest_color, src_color)
# TODO: blend mode 75, which is not in the SWF spec and appears to have the equation
# Src * (1 - Dst) + Dst * (1 - Src).
else:
return blend_normal(dest_color, src_color)

View File

@ -0,0 +1,16 @@
from PIL import Image # type: ignore
from typing import Tuple
from .types.generic import Color, Matrix, Point
def affine_composite(
    img: Image.Image,
    add_color: Tuple[int, int, int, int],
    mult_color: Color,
    transform: Matrix,
    origin: Point,
    blendfunc: int,
    texture: Image.Image,
    single_threaded: bool = False,
) -> Image.Image:
    # Signature-only declaration: the body is intentionally elided.
    # NOTE(review): presumably this is the typing stub for the compiled
    # "blendalt" extension module -- confirm against the build configuration.
    ...

View File

@ -0,0 +1,129 @@
from PIL import Image # type: ignore
from typing import Tuple
from .types.generic import Color, Matrix, Point
# RGBA color with one unsigned byte per channel, declared here so Cython can
# build and pass the struct defined on the C++ side.
cdef extern struct intcolor_t:
    unsigned char r;
    unsigned char g;
    unsigned char b;
    unsigned char a;
# RGBA color with one float per channel, declared here so Cython can build
# and pass the struct defined on the C++ side.
cdef extern struct floatcolor_t:
    float r;
    float g;
    float b;
    float a;
# Six-float affine matrix: a/b/c/d are the linear coefficients and tx/ty the
# translation, declared here so Cython can build and pass the C++ struct.
cdef extern struct matrix_t:
    float a;
    float b;
    float c;
    float d;
    float tx;
    float ty;
# 2D point with float coordinates, declared here so Cython can build and pass
# the struct defined on the C++ side.
cdef extern struct point_t:
    float x;
    float y;
# Externally implemented compositing routine. It takes the destination pixel
# buffer and its dimensions, the [minx, maxx) x [miny, maxy) rectangle to sweep,
# the additive and multiplicative colors, the inverse transform and origin used
# to map canvas pixels back to texture pixels, the blend mode, and the texture
# buffer with its dimensions. The int return value is treated by the caller as
# an error code where zero means success.
cdef extern int affine_composite_fast(
    unsigned char *imgdata,
    unsigned int imgwidth,
    unsigned int imgheight,
    unsigned int minx,
    unsigned int maxx,
    unsigned int miny,
    unsigned int maxy,
    intcolor_t add_color,
    floatcolor_t mult_color,
    matrix_t inverse,
    point_t origin,
    int blendfunc,
    unsigned char *texdata,
    unsigned int texwidth,
    unsigned int texheight,
    int single_threaded
)
def affine_composite(
    img: Image.Image,
    add_color: Tuple[int, int, int, int],
    mult_color: Color,
    transform: Matrix,
    origin: Point,
    blendfunc: int,
    texture: Image.Image,
    single_threaded: bool = False,
) -> Image.Image:
    # Composite "texture" onto "img" through the affine "transform" using the
    # native renderer, returning the resulting image. When there is nothing to
    # draw (non-invertible transform, unsupported blend mode, or a texture that
    # lands entirely outside of the canvas) the original "img" is returned as-is.
    #
    # Raises Exception if the native renderer reports a non-zero error code.

    # Calculate the inverse so we can map canvas space back to texture space.
    try:
        inverse = transform.inverse()
    except ZeroDivisionError:
        # If this happens, that means one of the scaling factors was zero, making
        # this object invisible. We can ignore this since the object should not
        # be drawn.
        print(f"WARNING: Transform Matrix {transform} has zero scaling factor, making it non-invertible!")
        return img

    # Warn if we have an unsupported blend.
    if blendfunc not in {0, 2, 3, 8, 9, 70}:
        print(f"WARNING: Unsupported blend {blendfunc}")
        return img

    # These are calculated properties and caching them outside of the loop
    # speeds things up a bit.
    imgwidth = img.width
    imgheight = img.height
    texwidth = texture.width
    texheight = texture.height

    # Calculate the maximum range of update this texture can possibly reside in.
    pix1 = transform.multiply_point(Point.identity().subtract(origin))
    pix2 = transform.multiply_point(Point.identity().subtract(origin).add(Point(texwidth, 0)))
    pix3 = transform.multiply_point(Point.identity().subtract(origin).add(Point(0, texheight)))
    pix4 = transform.multiply_point(Point.identity().subtract(origin).add(Point(texwidth, texheight)))

    # Map this to the rectangle we need to sweep in the rendering image.
    minx = max(int(min(pix1.x, pix2.x, pix3.x, pix4.x)), 0)
    maxx = min(int(max(pix1.x, pix2.x, pix3.x, pix4.x)) + 1, imgwidth)
    miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
    maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)

    if minx >= maxx or miny >= maxy:
        # This image is entirely off the screen! Because minx/miny are clamped
        # to zero, this also covers the maxx <= 0 / maxy <= 0 case, and it keeps
        # negative bounds from ever reaching the unsigned C parameters.
        return img

    # Grab the raw image data. The destination is blitted in-place by the C++
    # code, so it must live in a mutable buffer: bytes objects are immutable
    # and writing through a pointer into one violates that contract.
    imgbytes = bytearray(img.tobytes('raw', 'RGBA'))
    texbytes = texture.tobytes('raw', 'RGBA')

    # Convert classes to C structs.
    cdef intcolor_t c_addcolor = intcolor_t(r=add_color[0], g=add_color[1], b=add_color[2], a=add_color[3])
    cdef floatcolor_t c_multcolor = floatcolor_t(r=mult_color.r, g=mult_color.g, b=mult_color.b, a=mult_color.a)
    cdef matrix_t c_inverse = matrix_t(a=inverse.a, b=inverse.b, c=inverse.c, d=inverse.d, tx=inverse.tx, ty=inverse.ty)
    cdef point_t c_origin = point_t(x=origin.x, y=origin.y)

    # Call the C++ function.
    errors = affine_composite_fast(
        imgbytes,
        imgwidth,
        imgheight,
        minx,
        maxx,
        miny,
        maxy,
        c_addcolor,
        c_multcolor,
        c_inverse,
        c_origin,
        blendfunc,
        texbytes,
        texwidth,
        texheight,
        single_threaded,
    )
    if errors != 0:
        raise Exception("Error raised in C++!")

    # We blitted in-place, so rebuild an image from the updated buffer.
    return Image.frombytes('RGBA', (imgwidth, imgheight), bytes(imgbytes))

View File

@ -0,0 +1,232 @@
#include <stdio.h>
#include <math.h>
extern "C"
{
// RGBA color stored as one unsigned byte per channel, in r, g, b, a order.
typedef struct intcolor {
    unsigned char r, g, b, a;
} intcolor_t;
// RGBA color stored as one float per channel, in r, g, b, a order.
typedef struct floatcolor {
    float r, g, b, a;
} floatcolor_t;
// 2D point with float coordinates.
typedef struct point {
    float x;
    float y;

    // Return the component-wise sum of this point and "other".
    struct point add(struct point other) {
        struct point sum = {x + other.x, y + other.y};
        return sum;
    }
} point_t;
// Affine transform: a/b/c/d are the linear coefficients, tx/ty the translation.
typedef struct matrix {
    float a, b;
    float c, d;
    float tx, ty;

    // Apply this transform to "point" and return the mapped point.
    point_t multiply_point(point_t point) {
        point_t mapped;
        mapped.x = (a * point.x) + (c * point.y) + tx;
        mapped.y = (b * point.x) + (d * point.y) + ty;
        return mapped;
    }
} matrix_t;
// Round a channel value to the nearest integer and pin it into 0-255.
inline unsigned char clamp(float color) {
    // Apply the lower bound first and the upper bound second; the order of
    // the fmax/fmin calls is kept as-is so every input maps to the same byte.
    const double floored = fmax(0.0, roundf(color));
    const double capped = fmin(floored, 255.0);
    return (unsigned char)capped;
}
intcolor_t blend_normal(
    intcolor_t dest,
    intcolor_t src
) {
    // "Normal" blend mode, which is plain alpha blending using the DX equation
    // Src * As + Dst * (1 - As). The destination is premultiplied by its own
    // alpha too, because the canvas we blit onto may itself be transparent. The
    // premultiplied destination alpha is divided back out at the end since the
    // output alpha is not forced to 1.0, which keeps partially transparent
    // backgrounds working properly.

    // Short circuit for speed: fully transparent sources change nothing, and
    // fully opaque sources replace the destination outright.
    if (src.a == 0) {
        return dest;
    }
    if (src.a == 255) {
        return src;
    }

    // Calculate alpha blending.
    float src_weight = src.a / 255.0;
    float dest_weight = dest.a / 255.0;
    float src_remainder = 1.0 - src_weight;
    float out_alpha = (src_weight + dest_weight * src_remainder);

    intcolor_t blended;
    blended.r = clamp(((dest.r * dest_weight * src_remainder) + (src.r * src_weight)) / out_alpha);
    blended.g = clamp(((dest.g * dest_weight * src_remainder) + (src.g * src_weight)) / out_alpha);
    blended.b = clamp(((dest.b * dest_weight * src_remainder) + (src.b * src_weight)) / out_alpha);
    blended.a = clamp(255 * out_alpha);
    return blended;
}
intcolor_t blend_addition(
    intcolor_t dest,
    intcolor_t src
) {
    // "Addition" blend mode, which is used for fog/clouds/etc. Various games use the DX
    // equation Src * As + Dst * 1. It appears jubeat does not premultiply the source
    // by its alpha component.

    // Short circuit for speed: a fully transparent source adds nothing.
    if (src.a == 0) {
        return dest;
    }

    // Calculate final color blending. The destination alpha is left untouched.
    float src_weight = src.a / 255.0;

    intcolor_t blended;
    blended.r = clamp(dest.r + (src.r * src_weight));
    blended.g = clamp(dest.g + (src.g * src_weight));
    blended.b = clamp(dest.b + (src.b * src_weight));
    blended.a = dest.a;
    return blended;
}
intcolor_t blend_subtraction(
    intcolor_t dest,
    intcolor_t src
) {
    // "Subtraction" blend mode, used for darkening an image. Various games use the DX
    // equation Dst * 1 - Src * As. It appears jubeat does not premultiply the source
    // by its alpha component, much like the "additive" blend.

    // Short circuit for speed: a fully transparent source removes nothing.
    if (src.a == 0) {
        return dest;
    }

    // Calculate final color blending. The destination alpha is left untouched.
    float src_weight = src.a / 255.0;

    intcolor_t blended;
    blended.r = clamp(dest.r - (src.r * src_weight));
    blended.g = clamp(dest.g - (src.g * src_weight));
    blended.b = clamp(dest.b - (src.b * src_weight));
    blended.a = dest.a;
    return blended;
}
intcolor_t blend_multiply(
    intcolor_t dest,
    intcolor_t src
) {
    // "Multiply" blend mode, used for darkening an image. Various games use the DX
    // equation Src * 0 + Dst * Src. It appears jubeat uses the alternative formula
    // Src * Dst + Dst * (1 - As) which reduces to the first equation as long as the
    // source alpha is always 255.

    // Calculate final color blending: scale both channels into 0.0-1.0, multiply
    // them, and scale the product back up. The destination alpha is left untouched.
    intcolor_t blended;
    blended.r = clamp(255 * ((dest.r / 255.0) * (src.r / 255.0)));
    blended.g = clamp(255 * ((dest.g / 255.0) * (src.g / 255.0)));
    blended.b = clamp(255 * ((dest.b / 255.0) * (src.b / 255.0)));
    blended.a = dest.a;
    return blended;
}
intcolor_t blend_point(
    intcolor_t add_color,
    floatcolor_t mult_color,
    intcolor_t src_color,
    intcolor_t dest_color,
    int blendfunc
) {
    // Tint the source pixel with the multiplicative and additive colors, then
    // blend it onto the destination pixel using the requested blend mode.

    // Calculate multiplicative and additive colors against the source.
    intcolor_t tinted;
    tinted.r = clamp((src_color.r * mult_color.r) + add_color.r);
    tinted.g = clamp((src_color.g * mult_color.g) + add_color.g);
    tinted.b = clamp((src_color.b * mult_color.b) + add_color.b);
    tinted.a = clamp((src_color.a * mult_color.a) + add_color.a);

    switch (blendfunc) {
        case 3:
            return blend_multiply(dest_color, tinted);
        // TODO: blend mode 4, which is "screen" blending according to SWF references. I've only seen this
        // in Jubeat and it implements it using OpenGL equation Src * (1 - Dst) + Dst * 1.
        // TODO: blend mode 5, which is "lighten" blending according to SWF references. Jubeat does not
        // premultiply by alpha, but the GL/DX equation is max(Src * As, Dst * 1).
        // TODO: blend mode 6, which is "darken" blending according to SWF references. Jubeat does not
        // premultiply by alpha, but the GL/DX equation is min(Src * As, Dst * 1).
        // TODO: blend mode 10, which is "invert" according to SWF references. The only game I could find
        // that implemented this had equation Src * (1 - Dst) + Dst * (1 - As).
        // TODO: blend mode 13, which is "overlay" according to SWF references. The equation seems to be
        // Src * Dst + Dst * Src but Jubeat thinks it should be Src * Dst + Dst * (1 - As).
        case 8:
            return blend_addition(dest_color, tinted);
        case 9:
        case 70:
            return blend_subtraction(dest_color, tinted);
        // TODO: blend mode 75, which is not in the SWF spec and appears to have the equation
        // Src * (1 - Dst) + Dst * (1 - Src).
        default:
            // Everything else falls back to normal alpha blending.
            return blend_normal(dest_color, tinted);
    }
}
int affine_composite_fast(
unsigned char *imgbytes,
unsigned int imgwidth,
unsigned int imgheight,
unsigned int minx,
unsigned int maxx,
unsigned int miny,
unsigned int maxy,
intcolor_t add_color,
floatcolor_t mult_color,
matrix_t inverse,
point_t origin,
int blendfunc,
unsigned char *texbytes,
unsigned int texwidth,
unsigned int texheight,
int single_threaded
) {
// Cast to a usable type.
intcolor_t *imgdata = (intcolor_t *)imgbytes;
intcolor_t *texdata = (intcolor_t *)texbytes;
for (unsigned int imgy = miny; imgy < maxy; imgy++) {
for (unsigned int imgx = minx; imgx < maxx; imgx++) {
// Determine offset.
unsigned int imgoff = imgx + (imgy * imgwidth);
// Calculate what texture pixel data goes here.
point_t texloc = inverse.multiply_point((point_t){(float)imgx, (float)imgy}).add(origin);
int texx = roundf(texloc.x);
int texy = roundf(texloc.y);
// If we're out of bounds, don't update.
if (texx < 0 or texy < 0 or texx >= (int)texwidth or texy >= (int)texheight) {
continue;
}
// Blend it.
unsigned int texoff = texx + (texy * texwidth);
imgdata[imgoff] = blend_point(add_color, mult_color, texdata[texoff], imgdata[imgoff], blendfunc);
}
}
return 0;
}
}

View File

@ -374,16 +374,6 @@ class AFPRenderer(VerboseOutput):
# Compute the affine transformation matrix for this object.
transform = parent_transform.multiply(renderable.transform)
# Calculate the inverse so we can map canvas space back to texture space.
try:
inverse = transform.inverse()
except ZeroDivisionError:
# If this happens, that means one of the scaling factors was zero, making
# this object invisible. We can ignore this since the object should not
# be drawn.
print(f"WARNING: Transform Matrix {transform} has zero scaling factor, making it non-invertible!")
return img
# Render individual shapes if this is a sprite.
if isinstance(renderable, PlacedClip):
# This is a sprite placement reference.
@ -458,7 +448,7 @@ class AFPRenderer(VerboseOutput):
img.alpha_composite(texture, cutin.as_tuple(), cutoff.as_tuple())
else:
# We can't, so do the slow render that's correct.
img = affine_composite(img, add_color, mult_color, transform, inverse, origin, blend, texture, single_threaded=self.__single_threaded)
img = affine_composite(img, add_color, mult_color, transform, origin, blend, texture, single_threaded=self.__single_threaded)
else:
raise Exception(f"Unknown placed object type to render {renderable}!")

View File

@ -123,6 +123,13 @@ setup(
"bemani/format/afp/blend.py",
]
),
Extension(
"bemani.format.afp.blendalt",
[
"bemani/format/afp/blendalt.pyx",
"bemani/format/afp/blendaltimpl.cpp",
]
),
Extension(
"bemani.format.afp.types.generic",
[