1
0
Mirror of the upstream repository, synced 2024-09-23 19:08:21 +02:00

Initial implementation of C++ affine renderer for massive speed boost.

This commit is contained in:
Jennifer Taylor 2021-05-19 16:25:13 +00:00
parent b02c4292be
commit 48e9c59513
6 changed files with 670 additions and 325 deletions

View File

@ -6,203 +6,279 @@ from typing import Any, List, Sequence, Tuple
from .types.generic import Color, Matrix, Point
def clamp(color: float) -> int:
    """Round *color* to the nearest integer and clamp it into the valid channel range [0, 255]."""
    rounded = round(color)
    if rounded < 0:
        return 0
    return 255 if rounded > 255 else rounded
# If we compiled the faster cython code, we can use it instead!
try:
from .blendalt import affine_composite
except ImportError:
def clamp(color: float) -> int:
return min(max(0, round(color)), 255)
def blend_normal(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
) -> Sequence[int]:
# "Normal" blend mode, which is just alpha blending. Various games use the DX
# equation Src * As + Dst * (1 - As). We premultiply Dst by Ad as well, since
# we are blitting onto a destination that could have transparency. Once we are
# done, we divide out the premultiplied Ad in order to put the pixes back to
# their full blended values since we are not setting the destination alpha to 1.0.
# This enables partial transparent backgrounds to work properly.
def blend_normal(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
# A pre-scaled color where all values are 0.0-1.0, used to calculate the final color.
mult_color: Color,
# A RGBA color tuple where all values are 0-255, used to calculate the final color.
add_color: Tuple[int, int, int, int],
) -> Sequence[int]:
# "Normal" blend mode, which is just alpha blending. Various games use the DX
# equation Src * As + Dst * (1 - As). We premultiply Dst by Ad as well, since
# we are blitting onto a destination that could have transparency. Once we are
# done, we divide out the premultiplied Ad in order to put the pixes back to
# their full blended values since we are not setting the destination alpha to 1.0.
# This enables partial transparent backgrounds to work properly.
# Short circuit for speed.
if src[3] == 0:
return dest
if src[3] == 255:
return src
# Calculate multiplicative and additive colors against the source.
src = (
clamp((src[0] * mult_color.r) + add_color[0]),
clamp((src[1] * mult_color.g) + add_color[1]),
clamp((src[2] * mult_color.b) + add_color[2]),
clamp((src[3] * mult_color.a) + add_color[3]),
)
# Calculate alpha blending.
srcpercent = src[3] / 255.0
destpercent = dest[3] / 255.0
srcremaineder = 1.0 - srcpercent
new_alpha = (srcpercent + destpercent * srcremaineder)
return (
clamp(((dest[0] * destpercent * srcremaineder) + (src[0] * srcpercent)) / new_alpha),
clamp(((dest[1] * destpercent * srcremaineder) + (src[1] * srcpercent)) / new_alpha),
clamp(((dest[2] * destpercent * srcremaineder) + (src[2] * srcpercent)) / new_alpha),
clamp(255 * new_alpha)
)
# Short circuit for speed.
if src[3] == 0:
return dest
if src[3] == 255:
return src
def blend_addition(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
) -> Sequence[int]:
# "Addition" blend mode, which is used for fog/clouds/etc. Various games use the DX
# equation Src * As + Dst * 1. It appears jubeat does not premultiply the source
# by its alpha component.
# Calculate alpha blending.
srcpercent = src[3] / 255.0
destpercent = dest[3] / 255.0
srcremaineder = 1.0 - srcpercent
new_alpha = (srcpercent + destpercent * srcremaineder)
return (
clamp(((dest[0] * destpercent * srcremaineder) + (src[0] * srcpercent)) / new_alpha),
clamp(((dest[1] * destpercent * srcremaineder) + (src[1] * srcpercent)) / new_alpha),
clamp(((dest[2] * destpercent * srcremaineder) + (src[2] * srcpercent)) / new_alpha),
clamp(255 * new_alpha)
)
# Short circuit for speed.
if src[3] == 0:
return dest
# Calculate final color blending.
srcpercent = src[3] / 255.0
return (
clamp(dest[0] + (src[0] * srcpercent)),
clamp(dest[1] + (src[1] * srcpercent)),
clamp(dest[2] + (src[2] * srcpercent)),
dest[3],
)
def blend_addition(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
# A pre-scaled color where all values are 0.0-1.0, used to calculate the final color.
mult_color: Color,
# A RGBA color tuple where all values are 0-255, used to calculate the final color.
add_color: Tuple[int, int, int, int],
) -> Sequence[int]:
# "Addition" blend mode, which is used for fog/clouds/etc. Various games use the DX
# equation Src * As + Dst * 1. It appears jubeat does not premultiply the source
# by its alpha component.
def blend_subtraction(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
) -> Sequence[int]:
# "Subtraction" blend mode, used for darkening an image. Various games use the DX
# equation Dst * 1 - Src * As. It appears jubeat does not premultiply the source
# by its alpha component much like the "additive" blend above..
# Calculate multiplicative and additive colors against the source.
src = (
clamp((src[0] * mult_color.r) + add_color[0]),
clamp((src[1] * mult_color.g) + add_color[1]),
clamp((src[2] * mult_color.b) + add_color[2]),
clamp((src[3] * mult_color.a) + add_color[3]),
)
# Short circuit for speed.
if src[3] == 0:
return dest
# Short circuit for speed.
if src[3] == 0:
return dest
# Calculate final color blending.
srcpercent = src[3] / 255.0
return (
clamp(dest[0] - (src[0] * srcpercent)),
clamp(dest[1] - (src[1] * srcpercent)),
clamp(dest[2] - (src[2] * srcpercent)),
dest[3],
)
# Calculate final color blending.
srcpercent = src[3] / 255.0
return (
clamp(dest[0] + (src[0] * srcpercent)),
clamp(dest[1] + (src[1] * srcpercent)),
clamp(dest[2] + (src[2] * srcpercent)),
dest[3],
)
def blend_multiply(
    # RGBA color tuple representing what's already at the dest.
    dest: Sequence[int],
    # RGBA color tuple representing the source we want to blend to the dest.
    src: Sequence[int],
) -> Sequence[int]:
    # "Multiply" blend mode, used for darkening an image. Various games use the DX
    # equation Src * 0 + Dst * Src. It appears jubeat uses the alternative formula
    # Src * Dst + Dst * (1 - As) which reduces to the first equation as long as the
    # source alpha is always 255.
    # Multiply each RGB channel pair in normalized space, keeping the dest alpha.
    red = clamp(255 * ((dest[0] / 255.0) * (src[0] / 255.0)))
    green = clamp(255 * ((dest[1] / 255.0) * (src[1] / 255.0)))
    blue = clamp(255 * ((dest[2] / 255.0) * (src[2] / 255.0)))
    return (red, green, blue, dest[3])
def blend_subtraction(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
# A pre-scaled color where all values are 0.0-1.0, used to calculate the final color.
mult_color: Color,
# A RGBA color tuple where all values are 0-255, used to calculate the final color.
add_color: Tuple[int, int, int, int],
) -> Sequence[int]:
# "Subtraction" blend mode, used for darkening an image. Various games use the DX
# equation Dst * 1 - Src * As. It appears jubeat does not premultiply the source
# by its alpha component much like the "additive" blend above..
def affine_composite(
img: Image.Image,
add_color: Tuple[int, int, int, int],
mult_color: Color,
transform: Matrix,
origin: Point,
blendfunc: int,
texture: Image.Image,
single_threaded: bool = False,
) -> Image.Image:
# Calculate the inverse so we can map canvas space back to texture space.
try:
inverse = transform.inverse()
except ZeroDivisionError:
# If this happens, that means one of the scaling factors was zero, making
# this object invisible. We can ignore this since the object should not
# be drawn.
print(f"WARNING: Transform Matrix {transform} has zero scaling factor, making it non-invertible!")
return img
# Calculate multiplicative and additive colors against the source.
src = (
clamp((src[0] * mult_color.r) + add_color[0]),
clamp((src[1] * mult_color.g) + add_color[1]),
clamp((src[2] * mult_color.b) + add_color[2]),
clamp((src[3] * mult_color.a) + add_color[3]),
)
# Warn if we have an unsupported blend.
if blendfunc not in {0, 2, 3, 8, 9, 70}:
print(f"WARNING: Unsupported blend {blendfunc}")
return img
# Short circuit for speed.
if src[3] == 0:
return dest
# These are calculated properties and caching them outside of the loop
# speeds things up a bit.
imgwidth = img.width
imgheight = img.height
texwidth = texture.width
texheight = texture.height
# Calculate final color blending.
srcpercent = src[3] / 255.0
return (
clamp(dest[0] - (src[0] * srcpercent)),
clamp(dest[1] - (src[1] * srcpercent)),
clamp(dest[2] - (src[2] * srcpercent)),
dest[3],
)
# Calculate the maximum range of update this texture can possibly reside in.
pix1 = transform.multiply_point(Point.identity().subtract(origin))
pix2 = transform.multiply_point(Point.identity().subtract(origin).add(Point(texwidth, 0)))
pix3 = transform.multiply_point(Point.identity().subtract(origin).add(Point(0, texheight)))
pix4 = transform.multiply_point(Point.identity().subtract(origin).add(Point(texwidth, texheight)))
# Map this to the rectangle we need to sweep in the rendering image.
minx = max(int(min(pix1.x, pix2.x, pix3.x, pix4.x)), 0)
maxx = min(int(max(pix1.x, pix2.x, pix3.x, pix4.x)) + 1, imgwidth)
miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
def blend_multiply(
# RGBA color tuple representing what's already at the dest.
dest: Sequence[int],
# RGBA color tuple representing the source we want to blend to the dest.
src: Sequence[int],
# A pre-scaled color where all values are 0.0-1.0, used to calculate the final color.
mult_color: Color,
# A RGBA color tuple where all values are 0-255, used to calculate the final color.
add_color: Tuple[int, int, int, int],
) -> Sequence[int]:
# "Multiply" blend mode, used for darkening an image. Various games use the DX
# equation Src * 0 + Dst * Src. It appears jubeat uses the alternative formula
# Src * Dst + Dst * (1 - As) which reduces to the first equation as long as the
# source alpha is always 255.
if maxx <= 0 or maxy <= 0:
# This image is entirely off the screen!
return img
# Calculate multiplicative and additive colors against the source.
src = (
clamp((src[0] * mult_color.r) + add_color[0]),
clamp((src[1] * mult_color.g) + add_color[1]),
clamp((src[2] * mult_color.b) + add_color[2]),
clamp((src[3] * mult_color.a) + add_color[3]),
)
cores = multiprocessing.cpu_count()
if single_threaded or cores < 2:
# Get the data in an easier to manipulate and faster to update fashion.
imgmap = list(img.getdata())
texmap = list(texture.getdata())
# Calculate final color blending.
return (
clamp(255 * ((dest[0] / 255.0) * (src[0] / 255.0))),
clamp(255 * ((dest[1] / 255.0) * (src[1] / 255.0))),
clamp(255 * ((dest[2] / 255.0) * (src[2] / 255.0))),
dest[3],
)
# We don't have enough CPU cores to bother multiprocessing.
for imgy in range(miny, maxy):
for imgx in range(minx, maxx):
# Determine offset
imgoff = imgx + (imgy * imgwidth)
# Calculate what texture pixel data goes here.
texloc = inverse.multiply_point(Point(float(imgx), float(imgy))).add(origin)
texx, texy = texloc.as_tuple()
def affine_composite(
img: Image.Image,
add_color: Tuple[int, int, int, int],
mult_color: Color,
transform: Matrix,
inverse: Matrix,
origin: Point,
blendfunc: int,
texture: Image.Image,
single_threaded: bool = False,
) -> Image.Image:
# Warn if we have an unsupported blend.
if blendfunc not in {0, 2, 3, 8, 9, 70}:
print(f"WARNING: Unsupported blend {blendfunc}")
# If we're out of bounds, don't update.
if texx < 0 or texy < 0 or texx >= texwidth or texy >= texheight:
continue
# These are calculated properties and caching them outside of the loop
# speeds things up a bit.
imgwidth = img.width
imgheight = img.height
texwidth = texture.width
texheight = texture.height
# Blend it.
texoff = texx + (texy * texwidth)
imgmap[imgoff] = blend_point(add_color, mult_color, texmap[texoff], imgmap[imgoff], blendfunc)
# Calculate the maximum range of update this texture can possibly reside in.
pix1 = transform.multiply_point(Point.identity().subtract(origin))
pix2 = transform.multiply_point(Point.identity().subtract(origin).add(Point(texwidth, 0)))
pix3 = transform.multiply_point(Point.identity().subtract(origin).add(Point(0, texheight)))
pix4 = transform.multiply_point(Point.identity().subtract(origin).add(Point(texwidth, texheight)))
img.putdata(imgmap)
else:
imgbytes = img.tobytes('raw', 'RGBA')
texbytes = texture.tobytes('raw', 'RGBA')
# Map this to the rectangle we need to sweep in the rendering image.
minx = max(int(min(pix1.x, pix2.x, pix3.x, pix4.x)), 0)
maxx = min(int(max(pix1.x, pix2.x, pix3.x, pix4.x)) + 1, imgwidth)
miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)
# Let's spread the load across multiple processors.
procs: List[multiprocessing.Process] = []
work: multiprocessing.Queue = multiprocessing.Queue()
results: multiprocessing.Queue = multiprocessing.Queue()
expected: int = 0
interrupted: bool = False
cores = multiprocessing.cpu_count()
if single_threaded or cores < 2:
# Get the data in an easier to manipulate and faster to update fashion.
imgmap = list(img.getdata())
texmap = list(texture.getdata())
def ctrlc(sig: Any, frame: Any) -> None:
nonlocal interrupted
interrupted = True
# We don't have enough CPU cores to bother multiprocessing.
for imgy in range(miny, maxy):
for imgx in range(minx, maxx):
original_handler = signal.getsignal(signal.SIGINT)
signal.signal(signal.SIGINT, ctrlc)
for _ in range(cores):
proc = multiprocessing.Process(
target=pixel_renderer,
args=(
work,
results,
minx,
maxx,
imgwidth,
texwidth,
texheight,
inverse,
origin,
add_color,
mult_color,
blendfunc,
imgbytes,
texbytes,
),
)
procs.append(proc)
proc.start()
for imgy in range(miny, maxy):
work.put(imgy)
expected += 1
lines: List[bytes] = [
imgbytes[x:(x + (imgwidth * 4))]
for x in range(
0,
imgwidth * imgheight * 4,
imgwidth * 4,
)
]
for _ in range(expected):
imgy, result = results.get()
lines[imgy] = result
for proc in procs:
work.put(None)
for proc in procs:
proc.join()
signal.signal(signal.SIGINT, original_handler)
if interrupted:
raise KeyboardInterrupt()
img = Image.frombytes('RGBA', (imgwidth, imgheight), b''.join(lines))
return img
def pixel_renderer(
work: multiprocessing.Queue,
results: multiprocessing.Queue,
minx: int,
maxx: int,
imgwidth: int,
texwidth: int,
texheight: int,
inverse: Matrix,
origin: Point,
add_color: Tuple[int, int, int, int],
mult_color: Color,
blendfunc: int,
imgbytes: bytes,
texbytes: bytes,
) -> None:
while True:
imgy = work.get()
if imgy is None:
return
result: List[Sequence[int]] = []
for imgx in range(imgwidth):
# Determine offset
imgoff = imgx + (imgy * imgwidth)
if imgx < minx or imgx >= maxx:
result.append(imgbytes[(imgoff * 4):((imgoff + 1) * 4)])
continue
# Calculate what texture pixel data goes here.
texloc = inverse.multiply_point(Point(float(imgx), float(imgy))).add(origin)
@ -210,155 +286,50 @@ def affine_composite(
# If we're out of bounds, don't update.
if texx < 0 or texy < 0 or texx >= texwidth or texy >= texheight:
result.append(imgbytes[(imgoff * 4):((imgoff + 1) * 4)])
continue
# Blend it.
texoff = texx + (texy * texwidth)
imgmap[imgoff] = affine_blend_impl(add_color, mult_color, texmap[texoff], imgmap[imgoff], blendfunc)
result.append(blend_point(add_color, mult_color, texbytes[(texoff * 4):((texoff + 1) * 4)], imgbytes[(imgoff * 4):((imgoff + 1) * 4)], blendfunc))
img.putdata(imgmap)
else:
imgbytes = img.tobytes('raw', 'RGBA')
texbytes = texture.tobytes('raw', 'RGBA')
linebytes = bytes([channel for pixel in result for channel in pixel])
results.put((imgy, linebytes))
# Let's spread the load across multiple processors.
procs: List[multiprocessing.Process] = []
work: multiprocessing.Queue = multiprocessing.Queue()
results: multiprocessing.Queue = multiprocessing.Queue()
expected: int = 0
interrupted: bool = False
def blend_point(
add_color: Tuple[int, int, int, int],
mult_color: Color,
# This should be a sequence of exactly 4 values, either bytes or a tuple.
src_color: Sequence[int],
# This should be a sequence of exactly 4 values, either bytes or a tuple.
dest_color: Sequence[int],
blendfunc: int,
) -> Sequence[int]:
# Calculate multiplicative and additive colors against the source.
src_color = (
clamp((src_color[0] * mult_color.r) + add_color[0]),
clamp((src_color[1] * mult_color.g) + add_color[1]),
clamp((src_color[2] * mult_color.b) + add_color[2]),
clamp((src_color[3] * mult_color.a) + add_color[3]),
)
def ctrlc(sig: Any, frame: Any) -> None:
nonlocal interrupted
interrupted = True
original_handler = signal.getsignal(signal.SIGINT)
signal.signal(signal.SIGINT, ctrlc)
for _ in range(cores):
proc = multiprocessing.Process(
target=pixel_renderer,
args=(
work,
results,
minx,
maxx,
imgwidth,
texwidth,
texheight,
inverse,
origin,
add_color,
mult_color,
blendfunc,
imgbytes,
texbytes,
),
)
procs.append(proc)
proc.start()
for imgy in range(miny, maxy):
work.put(imgy)
expected += 1
lines: List[bytes] = [
imgbytes[x:(x + (imgwidth * 4))]
for x in range(
0,
imgwidth * imgheight * 4,
imgwidth * 4,
)
]
for _ in range(expected):
imgy, result = results.get()
lines[imgy] = result
for proc in procs:
work.put(None)
for proc in procs:
proc.join()
signal.signal(signal.SIGINT, original_handler)
if interrupted:
raise KeyboardInterrupt()
img = Image.frombytes('RGBA', (imgwidth, imgheight), b''.join(lines))
return img
def pixel_renderer(
    work: multiprocessing.Queue,
    results: multiprocessing.Queue,
    minx: int,
    maxx: int,
    imgwidth: int,
    texwidth: int,
    texheight: int,
    inverse: Matrix,
    origin: Point,
    add_color: Tuple[int, int, int, int],
    mult_color: Color,
    blendfunc: int,
    imgbytes: bytes,
    texbytes: bytes,
) -> None:
    """Worker-process loop that renders whole scanlines.

    Pulls y coordinates off *work* (None is the shutdown sentinel), renders
    each full scanline of the destination image, and pushes the rendered
    line to *results* as an (imgy, linebytes) tuple.
    """
    while True:
        imgy = work.get()
        if imgy is None:
            # Shutdown sentinel from the parent process.
            return

        result: List[Sequence[int]] = []
        for imgx in range(imgwidth):
            # Determine offset
            imgoff = imgx + (imgy * imgwidth)
            if imgx < minx or imgx >= maxx:
                # Outside the horizontal span the texture can possibly touch;
                # pass the original destination pixel through unchanged.
                result.append(imgbytes[(imgoff * 4):((imgoff + 1) * 4)])
                continue

            # Calculate what texture pixel data goes here.
            texloc = inverse.multiply_point(Point(float(imgx), float(imgy))).add(origin)
            texx, texy = texloc.as_tuple()

            # If we're out of bounds, don't update.
            if texx < 0 or texy < 0 or texx >= texwidth or texy >= texheight:
                result.append(imgbytes[(imgoff * 4):((imgoff + 1) * 4)])
                continue

            # Blend it.
            texoff = texx + (texy * texwidth)
            result.append(affine_blend_impl(add_color, mult_color, texbytes[(texoff * 4):((texoff + 1) * 4)], imgbytes[(imgoff * 4):((imgoff + 1) * 4)], blendfunc))

        # Flatten the per-pixel sequences into one raw RGBA byte string.
        linebytes = bytes([channel for pixel in result for channel in pixel])
        results.put((imgy, linebytes))
def affine_blend_impl(
    add_color: Tuple[int, int, int, int],
    mult_color: Color,
    # This should be a sequence of exactly 4 values, either bytes or a tuple.
    src_color: Sequence[int],
    # This should be a sequence of exactly 4 values, either bytes or a tuple.
    dest_color: Sequence[int],
    blendfunc: int,
) -> Sequence[int]:
    """Dispatch to the blend implementation matching *blendfunc*."""
    # TODO: blend mode 4, which is "screen" blending according to SWF references. I've only seen this
    # in Jubeat and it implements it using OpenGL equation Src * (1 - Dst) + Dst * 1.
    # TODO: blend mode 5, which is "lighten" blending according to SWF references. Jubeat does not
    # premultiply by alpha, but the GL/DX equation is max(Src * As, Dst * 1).
    # TODO: blend mode 6, which is "darken" blending according to SWF references. Jubeat does not
    # premultiply by alpha, but the GL/DX equation is min(Src * As, Dst * 1).
    # TODO: blend mode 10, which is "invert" according to SWF references. The only game I could find
    # that implemented this had equation Src * (1 - Dst) + Dst * (1 - As).
    # TODO: blend mode 13, which is "overlay" according to SWF references. The equation seems to be
    # Src * Dst + Dst * Src but Jubeat thinks it should be Src * Dst + Dst * (1 - As).
    # TODO: blend mode 75, which is not in the SWF spec and appears to have the equation
    # Src * (1 - Dst) + Dst * (1 - Src).
    if blendfunc == 3:
        return blend_multiply(dest_color, src_color, mult_color, add_color)
    if blendfunc == 8:
        return blend_addition(dest_color, src_color, mult_color, add_color)
    if blendfunc in (9, 70):
        return blend_subtraction(dest_color, src_color, mult_color, add_color)
    # Everything else falls back to plain alpha blending.
    return blend_normal(dest_color, src_color, mult_color, add_color)
if blendfunc == 3:
return blend_multiply(dest_color, src_color)
# TODO: blend mode 4, which is "screen" blending according to SWF references. I've only seen this
# in Jubeat and it implements it using OpenGL equation Src * (1 - Dst) + Dst * 1.
# TODO: blend mode 5, which is "lighten" blending according to SWF references. Jubeat does not
# premultiply by alpha, but the GL/DX equation is max(Src * As, Dst * 1).
# TODO: blend mode 6, which is "darken" blending according to SWF references. Jubeat does not
# premultiply by alpha, but the GL/DX equation is min(Src * As, Dst * 1).
# TODO: blend mode 10, which is "invert" according to SWF references. The only game I could find
# that implemented this had equation Src * (1 - Dst) + Dst * (1 - As).
# TODO: blend mode 13, which is "overlay" according to SWF references. The equation seems to be
# Src * Dst + Dst * Src but Jubeat thinks it should be Src * Dst + Dst * (1 - As).
elif blendfunc == 8:
return blend_addition(dest_color, src_color)
elif blendfunc == 9 or blendfunc == 70:
return blend_subtraction(dest_color, src_color)
# TODO: blend mode 75, which is not in the SWF spec and appears to have the equation
# Src * (1 - Dst) + Dst * (1 - Src).
else:
return blend_normal(dest_color, src_color)

View File

@ -0,0 +1,16 @@
from PIL import Image # type: ignore
from typing import Tuple
from .types.generic import Color, Matrix, Point
def affine_composite(
    img: Image.Image,
    add_color: Tuple[int, int, int, int],
    mult_color: Color,
    transform: Matrix,
    origin: Point,
    blendfunc: int,
    texture: Image.Image,
    single_threaded: bool = False,
) -> Image.Image:
    """Composite *texture* onto *img* under the affine *transform*.

    Type stub for the compiled Cython implementation in blendalt.pyx.
    """
    ...

View File

@ -0,0 +1,129 @@
from PIL import Image # type: ignore
from typing import Tuple
from .types.generic import Color, Matrix, Point
# RGBA pixel with 8-bit channels. Must stay in sync with the intcolor struct
# declared in blendaltimpl.cpp.
cdef extern struct intcolor_t:
    unsigned char r;
    unsigned char g;
    unsigned char b;
    unsigned char a;

# Color with float channels, used for the pre-scaled multiplicative color.
cdef extern struct floatcolor_t:
    float r;
    float g;
    float b;
    float a;

# Affine transform coefficients plus translation, mirroring the Python Matrix type.
cdef extern struct matrix_t:
    float a;
    float b;
    float c;
    float d;
    float tx;
    float ty;

# 2D point, mirroring the Python Point type.
cdef extern struct point_t:
    float x;
    float y;

# C++ entry point that performs the actual in-place blit; returns nonzero on error.
cdef extern int affine_composite_fast(
    unsigned char *imgdata,
    unsigned int imgwidth,
    unsigned int imgheight,
    unsigned int minx,
    unsigned int maxx,
    unsigned int miny,
    unsigned int maxy,
    intcolor_t add_color,
    floatcolor_t mult_color,
    matrix_t inverse,
    point_t origin,
    int blendfunc,
    unsigned char *texdata,
    unsigned int texwidth,
    unsigned int texheight,
    int single_threaded
)
def affine_composite(
    img: Image.Image,
    add_color: Tuple[int, int, int, int],
    mult_color: Color,
    transform: Matrix,
    origin: Point,
    blendfunc: int,
    texture: Image.Image,
    single_threaded: bool = False,
) -> Image.Image:
    """Composite *texture* onto *img* using the C++ affine renderer.

    Mirrors the signature of the pure-Python fallback; the per-pixel work is
    delegated to affine_composite_fast in blendaltimpl.cpp.
    """
    # Calculate the inverse so we can map canvas space back to texture space.
    try:
        inverse = transform.inverse()
    except ZeroDivisionError:
        # If this happens, that means one of the scaling factors was zero, making
        # this object invisible. We can ignore this since the object should not
        # be drawn.
        print(f"WARNING: Transform Matrix {transform} has zero scaling factor, making it non-invertible!")
        return img

    # Only blend modes implemented on the C++ side are supported.
    if blendfunc not in {0, 2, 3, 8, 9, 70}:
        print(f"WARNING: Unsupported blend {blendfunc}")
        return img

    # These are calculated properties and caching them outside of the loop
    # speeds things up a bit.
    imgwidth = img.width
    imgheight = img.height
    texwidth = texture.width
    texheight = texture.height

    # Calculate the maximum range of update this texture can possibly reside in.
    pix1 = transform.multiply_point(Point.identity().subtract(origin))
    pix2 = transform.multiply_point(Point.identity().subtract(origin).add(Point(texwidth, 0)))
    pix3 = transform.multiply_point(Point.identity().subtract(origin).add(Point(0, texheight)))
    pix4 = transform.multiply_point(Point.identity().subtract(origin).add(Point(texwidth, texheight)))

    # Map this to the rectangle we need to sweep in the rendering image.
    minx = max(int(min(pix1.x, pix2.x, pix3.x, pix4.x)), 0)
    maxx = min(int(max(pix1.x, pix2.x, pix3.x, pix4.x)) + 1, imgwidth)
    miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
    maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)

    if maxx <= 0 or maxy <= 0:
        # This image is entirely off the screen!
        return img

    # Grab the raw image data.
    imgbytes = img.tobytes('raw', 'RGBA')
    texbytes = texture.tobytes('raw', 'RGBA')

    # Convert classes to C structs.
    cdef intcolor_t c_addcolor = intcolor_t(r=add_color[0], g=add_color[1], b=add_color[2], a=add_color[3])
    cdef floatcolor_t c_multcolor = floatcolor_t(r=mult_color.r, g=mult_color.g, b=mult_color.b, a=mult_color.a)
    cdef matrix_t c_inverse = matrix_t(a=inverse.a, b=inverse.b, c=inverse.c, d=inverse.d, tx=inverse.tx, ty=inverse.ty)
    cdef point_t c_origin = point_t(x=origin.x, y=origin.y)

    # Call the C++ function.
    errors = affine_composite_fast(
        imgbytes,
        imgwidth,
        imgheight,
        minx,
        maxx,
        miny,
        maxy,
        c_addcolor,
        c_multcolor,
        c_inverse,
        c_origin,
        blendfunc,
        texbytes,
        texwidth,
        texheight,
        single_threaded,
    )
    if errors != 0:
        raise Exception("Error raised in C++!")

    # We blitted in-place, return that.
    return Image.frombytes('RGBA', (imgwidth, imgheight), imgbytes)

View File

@ -0,0 +1,232 @@
#include <stdio.h>
#include <math.h>
extern "C"
{
// RGBA pixel with 8-bit channels. Layout must match the intcolor_t struct
// declared in blendalt.pyx, since raw pixel buffers are reinterpreted as
// arrays of this type.
typedef struct intcolor {
    unsigned char r;
    unsigned char g;
    unsigned char b;
    unsigned char a;
} intcolor_t;

// Color with float channels; carries the pre-scaled multiplicative color.
typedef struct floatcolor {
    float r;
    float g;
    float b;
    float a;
} floatcolor_t;

// 2D point with componentwise addition.
typedef struct point {
    float x;
    float y;

    // Return the componentwise sum of this point and other.
    struct point add(struct point other) {
        return (struct point){
            x + other.x,
            y + other.y,
        };
    };
} point_t;

// 2x2 affine transform coefficients plus translation (tx, ty).
typedef struct matrix {
    float a;
    float b;
    float c;
    float d;
    float tx;
    float ty;

    // Apply the affine transform to a point.
    point_t multiply_point(point_t point) {
        return (point_t){
            (a * point.x) + (c * point.y) + tx,
            (b * point.x) + (d * point.y) + ty,
        };
    }
} matrix_t;
inline unsigned char clamp(float color) {
    // Round to the nearest integer, then clamp into the valid channel range [0, 255].
    float rounded = roundf(color);
    return fmin(fmax(0.0, rounded), 255.0);
}
intcolor_t blend_normal(
    intcolor_t dest,
    intcolor_t src
) {
    // "Normal" blend mode: standard alpha blending. Various games use the DX
    // equation Src * As + Dst * (1 - As). We premultiply Dst by Ad as well, since
    // we are blitting onto a destination that could itself be transparent. At the
    // end we divide the premultiplied Ad back out so the pixels keep their full
    // blended values (we do not force the destination alpha to 1.0), which lets
    // partially transparent backgrounds work properly.

    // Fully transparent source leaves the destination untouched.
    if (src.a == 0) {
        return dest;
    }
    // Fully opaque source completely replaces the destination.
    if (src.a == 255) {
        return src;
    }

    // Calculate alpha blending in normalized [0, 1] space.
    float src_alpha = src.a / 255.0;
    float dest_alpha = dest.a / 255.0;
    float src_remainder = 1.0 - src_alpha;
    float out_alpha = (src_alpha + dest_alpha * src_remainder);
    return (intcolor_t){
        clamp(((dest.r * dest_alpha * src_remainder) + (src.r * src_alpha)) / out_alpha),
        clamp(((dest.g * dest_alpha * src_remainder) + (src.g * src_alpha)) / out_alpha),
        clamp(((dest.b * dest_alpha * src_remainder) + (src.b * src_alpha)) / out_alpha),
        clamp(255 * out_alpha)
    };
}
intcolor_t blend_addition(
    intcolor_t dest,
    intcolor_t src
) {
    // "Addition" blend mode, used for fog/clouds/etc. Various games use the DX
    // equation Src * As + Dst * 1. Note that jubeat does not appear to premultiply
    // the source by its alpha component.

    // A fully transparent source contributes nothing; bail out early.
    if (src.a == 0) {
        return dest;
    }

    // Weight the source by its alpha, then add onto the destination channels,
    // keeping the destination alpha as-is.
    float src_alpha = src.a / 255.0;
    unsigned char red = clamp(dest.r + (src.r * src_alpha));
    unsigned char green = clamp(dest.g + (src.g * src_alpha));
    unsigned char blue = clamp(dest.b + (src.b * src_alpha));
    return (intcolor_t){red, green, blue, dest.a};
}
intcolor_t blend_subtraction(
    intcolor_t dest,
    intcolor_t src
) {
    // "Subtraction" blend mode, used for darkening an image. Various games use the
    // DX equation Dst * 1 - Src * As. Note that jubeat does not appear to
    // premultiply the source by its alpha component, much like the "additive"
    // blend above.

    // A fully transparent source contributes nothing; bail out early.
    if (src.a == 0) {
        return dest;
    }

    // Weight the source by its alpha, then subtract from the destination
    // channels, keeping the destination alpha as-is.
    float src_alpha = src.a / 255.0;
    unsigned char red = clamp(dest.r - (src.r * src_alpha));
    unsigned char green = clamp(dest.g - (src.g * src_alpha));
    unsigned char blue = clamp(dest.b - (src.b * src_alpha));
    return (intcolor_t){red, green, blue, dest.a};
}
intcolor_t blend_multiply(
    intcolor_t dest,
    intcolor_t src
) {
    // "Multiply" blend mode, used for darkening an image. Various games use the DX
    // equation Src * 0 + Dst * Src. It appears jubeat uses the alternative formula
    // Src * Dst + Dst * (1 - As), which reduces to the first equation as long as
    // the source alpha is always 255.

    // Multiply each RGB channel pair in normalized space; alpha passes through.
    unsigned char red = clamp(255 * ((dest.r / 255.0) * (src.r / 255.0)));
    unsigned char green = clamp(255 * ((dest.g / 255.0) * (src.g / 255.0)));
    unsigned char blue = clamp(255 * ((dest.b / 255.0) * (src.b / 255.0)));
    return (intcolor_t){red, green, blue, dest.a};
}
intcolor_t blend_point(
    intcolor_t add_color,
    floatcolor_t mult_color,
    intcolor_t src_color,
    intcolor_t dest_color,
    int blendfunc
) {
    // Apply the multiplicative and additive color transforms to the source pixel
    // before handing it to the blend-mode implementation.
    intcolor_t scaled_src = (intcolor_t){
        clamp((src_color.r * mult_color.r) + add_color.r),
        clamp((src_color.g * mult_color.g) + add_color.g),
        clamp((src_color.b * mult_color.b) + add_color.b),
        clamp((src_color.a * mult_color.a) + add_color.a),
    };

    // TODO: blend mode 4, which is "screen" blending according to SWF references. I've only seen this
    // in Jubeat and it implements it using OpenGL equation Src * (1 - Dst) + Dst * 1.
    // TODO: blend mode 5, which is "lighten" blending according to SWF references. Jubeat does not
    // premultiply by alpha, but the GL/DX equation is max(Src * As, Dst * 1).
    // TODO: blend mode 6, which is "darken" blending according to SWF references. Jubeat does not
    // premultiply by alpha, but the GL/DX equation is min(Src * As, Dst * 1).
    // TODO: blend mode 10, which is "invert" according to SWF references. The only game I could find
    // that implemented this had equation Src * (1 - Dst) + Dst * (1 - As).
    // TODO: blend mode 13, which is "overlay" according to SWF references. The equation seems to be
    // Src * Dst + Dst * Src but Jubeat thinks it should be Src * Dst + Dst * (1 - As).
    // TODO: blend mode 75, which is not in the SWF spec and appears to have the equation
    // Src * (1 - Dst) + Dst * (1 - Src).
    switch (blendfunc) {
        case 3:
            return blend_multiply(dest_color, scaled_src);
        case 8:
            return blend_addition(dest_color, scaled_src);
        case 9:
        case 70:
            return blend_subtraction(dest_color, scaled_src);
        default:
            // Everything else falls back to plain alpha blending.
            return blend_normal(dest_color, scaled_src);
    }
}
// Blit the texture onto the image buffer in place. Each destination pixel in
// the (minx,miny)-(maxx,maxy) rectangle is mapped back into texture space via
// the inverse transform, and the sampled texel is blended on top of it with
// blend_point(). Returns 0 on success.
// NOTE(review): single_threaded is accepted but not used here yet — presumably
// a threaded path is planned; confirm before relying on it.
int affine_composite_fast(
    unsigned char *imgbytes,
    unsigned int imgwidth,
    unsigned int imgheight,
    unsigned int minx,
    unsigned int maxx,
    unsigned int miny,
    unsigned int maxy,
    intcolor_t add_color,
    floatcolor_t mult_color,
    matrix_t inverse,
    point_t origin,
    int blendfunc,
    unsigned char *texbytes,
    unsigned int texwidth,
    unsigned int texheight,
    int single_threaded
) {
    // Cast to a usable type.
    intcolor_t *imgdata = (intcolor_t *)imgbytes;
    intcolor_t *texdata = (intcolor_t *)texbytes;

    for (unsigned int imgy = miny; imgy < maxy; imgy++) {
        for (unsigned int imgx = minx; imgx < maxx; imgx++) {
            // Determine offset.
            unsigned int imgoff = imgx + (imgy * imgwidth);

            // Calculate what texture pixel data goes here.
            point_t texloc = inverse.multiply_point((point_t){(float)imgx, (float)imgy}).add(origin);
            int texx = roundf(texloc.x);
            int texy = roundf(texloc.y);

            // If we're out of bounds, don't update.
            if (texx < 0 or texy < 0 or texx >= (int)texwidth or texy >= (int)texheight) {
                continue;
            }

            // Blend it.
            unsigned int texoff = texx + (texy * texwidth);
            imgdata[imgoff] = blend_point(add_color, mult_color, texdata[texoff], imgdata[imgoff], blendfunc);
        }
    }
    return 0;
}
}

View File

@ -374,16 +374,6 @@ class AFPRenderer(VerboseOutput):
# Compute the affine transformation matrix for this object.
transform = parent_transform.multiply(renderable.transform)
# Calculate the inverse so we can map canvas space back to texture space.
try:
inverse = transform.inverse()
except ZeroDivisionError:
# If this happens, that means one of the scaling factors was zero, making
# this object invisible. We can ignore this since the object should not
# be drawn.
print(f"WARNING: Transform Matrix {transform} has zero scaling factor, making it non-invertible!")
return img
# Render individual shapes if this is a sprite.
if isinstance(renderable, PlacedClip):
# This is a sprite placement reference.
@ -458,7 +448,7 @@ class AFPRenderer(VerboseOutput):
img.alpha_composite(texture, cutin.as_tuple(), cutoff.as_tuple())
else:
# We can't, so do the slow render that's correct.
img = affine_composite(img, add_color, mult_color, transform, inverse, origin, blend, texture, single_threaded=self.__single_threaded)
img = affine_composite(img, add_color, mult_color, transform, origin, blend, texture, single_threaded=self.__single_threaded)
else:
raise Exception(f"Unknown placed object type to render {renderable}!")

View File

@ -123,6 +123,13 @@ setup(
"bemani/format/afp/blend.py",
]
),
Extension(
"bemani.format.afp.blendalt",
[
"bemani/format/afp/blendalt.pyx",
"bemani/format/afp/blendaltimpl.cpp",
]
),
Extension(
"bemani.format.afp.types.generic",
[