From 74838e698d213d6d8afcb8366988993d34740d6d Mon Sep 17 00:00:00 2001
From: Jennifer Taylor <dragonminded@dragonminded.com>
Date: Thu, 5 Aug 2021 17:32:00 +0000
Subject: [PATCH] Switch perspective engine to using existing blitter with
 correct perspective-based texture mapping.

---
 bemani/format/afp/blend/blend.py         | 530 +++++++++++++++--------
 bemani/format/afp/blend/blendcpp.pyx     |  66 ++-
 bemani/format/afp/blend/blendcppimpl.cxx | 164 ++++---
 bemani/format/afp/blend/perspective.py   |  95 ++++
 4 files changed, 557 insertions(+), 298 deletions(-)
 create mode 100644 bemani/format/afp/blend/perspective.py

diff --git a/bemani/format/afp/blend/blend.py b/bemani/format/afp/blend/blend.py
index 2425eb9..67b8b8a 100644
--- a/bemani/format/afp/blend/blend.py
+++ b/bemani/format/afp/blend/blend.py
@@ -1,9 +1,10 @@
 import multiprocessing
 import signal
 from PIL import Image  # type: ignore
-from typing import Any, List, Optional, Sequence, Union
+from typing import Any, Callable, List, Optional, Sequence, Union
 
 from ..types import Color, Matrix, Point
+from .perspective import perspective_calculate
 
 
 def clamp(color: float) -> int:
@@ -192,6 +193,218 @@ def blend_point(
         return blend_normal(dest_color, src_color)
 
 
+def pixel_renderer(
+    imgx: int,
+    imgy: int,
+    imgwidth: int,
+    texwidth: int,
+    texheight: int,
+    xscale: float,
+    yscale: float,
+    callback: Callable[[Point], Optional[Point]],
+    add_color: Color,
+    mult_color: Color,
+    blendfunc: int,
+    imgbytes: Union[bytes, bytearray],
+    texbytes: Union[bytes, bytearray],
+    maskbytes: Optional[Union[bytes, bytearray]],
+    enable_aa: bool,
+) -> Sequence[int]:
+    # Determine offset
+    maskoff = imgx + (imgy * imgwidth)
+    imgoff = maskoff * 4
+
+    if maskbytes is not None and maskbytes[maskoff] == 0:
+        # This pixel is masked off!
+        return imgbytes[imgoff:(imgoff + 4)]
+
+    if enable_aa:
+        r = 0
+        g = 0
+        b = 0
+        a = 0
+        count = 0
+        denom = 0
+
+        # Essentially what we're doing here is taking the caller-provided scale, clamping it at 1.0 as
+        # the minimum and then setting the AA sample swing accordingly. This has the effect of anti-aliasing
+        # scaled up images a bit softer than would otherwise be achieved.
+        xswing = 0.5 * max(1.0, xscale)
+        yswing = 0.5 * max(1.0, yscale)
+
+        xpoints = [0.5 - xswing, 0.5 - (xswing / 2.0), 0.5, 0.5 + (xswing / 2.0), 0.5 + xswing]
+        ypoints = [0.5 - yswing, 0.5 - (yswing / 2.0), 0.5, 0.5 + (yswing / 2.0), 0.5 + yswing]
+
+        # First, figure out if we can use bilinear resampling.
+        bilinear = False
+        if xscale >= 1.0 and yscale >= 1.0:
+            aaloc = callback(Point(imgx + 0.5, imgy + 0.5))
+            if aaloc is not None:
+                aax, aay, _ = aaloc.as_tuple()
+                if not (aax <= 0 or aay <= 0 or aax >= (texwidth - 1) or aay >= (texheight - 1)):
+                    bilinear = True
+
+        # Now perform the desired AA operation.
+        if bilinear:
+            # Calculate the pixel we're after, and what percentage into the pixel we are.
+            texloc = callback(Point(imgx + 0.5, imgy + 0.5))
+            if texloc is None:
+                raise Exception("Logic error!")
+            aax, aay, _ = texloc.as_tuple()
+            aaxrem = texloc.x - aax
+            aayrem = texloc.y - aay
+
+            # Find the four pixels that we can interpolate from. The first number is the x, and second is y.
+            tex00 = (aax + (aay * texwidth)) * 4
+            tex10 = tex00 + 4
+            tex01 = (aax + ((aay + 1) * texwidth)) * 4
+            tex11 = tex01 + 4
+
+            # Calculate various scaling factors based on alpha and percentage.
+            tex00percent = texbytes[tex00 + 3] / 255.0
+            tex10percent = texbytes[tex10 + 3] / 255.0
+            tex01percent = texbytes[tex01 + 3] / 255.0
+            tex11percent = texbytes[tex11 + 3] / 255.0
+
+            y0percent = (tex00percent * (1.0 - aaxrem)) + (tex10percent * aaxrem)
+            y1percent = (tex01percent * (1.0 - aaxrem)) + (tex11percent * aaxrem)
+            finalpercent = (y0percent * (1.0 - aayrem)) + (y1percent * aayrem)
+
+            if finalpercent <= 0.0:
+                # This pixel would be blank, so we avoid dividing by zero.
+                average = [255, 255, 255, 0]
+            else:
+                # Interpolate in the X direction along both texture rows (Y and Y + 1).
+                y0r = ((texbytes[tex00] * tex00percent * (1.0 - aaxrem)) + (texbytes[tex10] * tex10percent * aaxrem))
+                y0g = ((texbytes[tex00 + 1] * tex00percent * (1.0 - aaxrem)) + (texbytes[tex10 + 1] * tex10percent * aaxrem))
+                y0b = ((texbytes[tex00 + 2] * tex00percent * (1.0 - aaxrem)) + (texbytes[tex10 + 2] * tex10percent * aaxrem))
+
+                y1r = ((texbytes[tex01] * tex01percent * (1.0 - aaxrem)) + (texbytes[tex11] * tex11percent * aaxrem))
+                y1g = ((texbytes[tex01 + 1] * tex01percent * (1.0 - aaxrem)) + (texbytes[tex11 + 1] * tex11percent * aaxrem))
+                y1b = ((texbytes[tex01 + 2] * tex01percent * (1.0 - aaxrem)) + (texbytes[tex11 + 2] * tex11percent * aaxrem))
+
+                # Now interpolate the Y direction to get the final pixel value.
+                average = [
+                    int(((y0r * (1.0 - aayrem)) + (y1r * aayrem)) / finalpercent),
+                    int(((y0g * (1.0 - aayrem)) + (y1g * aayrem)) / finalpercent),
+                    int(((y0b * (1.0 - aayrem)) + (y1b * aayrem)) / finalpercent),
+                    int(finalpercent * 255),
+                ]
+        else:
+            for addy in ypoints:
+                for addx in xpoints:
+                    texloc = callback(Point(imgx + addx, imgy + addy))
+                    denom += 1
+
+                    if texloc is None:
+                        continue
+
+                    aax, aay, _ = texloc.as_tuple()
+
+                    # If we're out of bounds, don't update. The sample was already counted in denom above,
+                    # however, so we still get partial transparency to the pixel that is already there.
+                    if aax < 0 or aay < 0 or aax >= texwidth or aay >= texheight:
+                        continue
+
+                    # Grab the values to average, for SSAA. Make sure to factor in alpha as a poor-man's
+                    # blend to ensure that partial transparency pixel values don't unnecessarily factor
+                    # into average calculations.
+                    texoff = (aax + (aay * texwidth)) * 4
+
+                    # If this is a fully transparent pixel, the below formulas work out to adding nothing
+                    # so we should skip this altogether.
+                    if texbytes[texoff + 3] == 0:
+                        continue
+
+                    apercent = texbytes[texoff + 3] / 255.0
+                    r += int(texbytes[texoff] * apercent)
+                    g += int(texbytes[texoff + 1] * apercent)
+                    b += int(texbytes[texoff + 2] * apercent)
+                    a += texbytes[texoff + 3]
+                    count += 1
+
+            if count == 0:
+                # None of the samples existed in-bounds.
+                return imgbytes[imgoff:(imgoff + 4)]
+
+            # Average the pixels. Make sure to divide out the alpha in preparation for blending.
+            alpha = a // denom
+
+            if alpha == 0:
+                average = [255, 255, 255, alpha]
+            else:
+                apercent = alpha / 255.0
+                average = [int((r / denom) / apercent), int((g / denom) / apercent), int((b / denom) / apercent), alpha]
+
+        # Finally, blend it with the destination.
+        return blend_point(add_color, mult_color, average, imgbytes[imgoff:(imgoff + 4)], blendfunc)
+    else:
+        # Calculate what texture pixel data goes here.
+        texloc = callback(Point(imgx + 0.5, imgy + 0.5))
+        if texloc is None:
+            return imgbytes[imgoff:(imgoff + 4)]
+
+        texx, texy, _ = texloc.as_tuple()
+
+        # If we're out of bounds, don't update.
+        if texx < 0 or texy < 0 or texx >= texwidth or texy >= texheight:
+            return imgbytes[imgoff:(imgoff + 4)]
+
+        # Blend it.
+        texoff = (texx + (texy * texwidth)) * 4
+        return blend_point(add_color, mult_color, texbytes[texoff:(texoff + 4)], imgbytes[imgoff:(imgoff + 4)], blendfunc)
+
+
+def affine_line_renderer(
+    work: multiprocessing.Queue,
+    results: multiprocessing.Queue,
+    minx: int,
+    maxx: int,
+    imgwidth: int,
+    texwidth: int,
+    texheight: int,
+    inverse: Matrix,
+    add_color: Color,
+    mult_color: Color,
+    blendfunc: int,
+    imgbytes: Union[bytes, bytearray],
+    texbytes: Union[bytes, bytearray],
+    maskbytes: Optional[Union[bytes, bytearray]],
+    enable_aa: bool,
+) -> None:
+    while True:
+        imgy = work.get()
+        if imgy is None:
+            return
+
+        rowbytes = bytearray(imgbytes[(imgy * imgwidth * 4):((imgy + 1) * imgwidth * 4)])
+        for imgx in range(imgwidth):
+            if imgx < minx or imgx >= maxx:
+                # No need to even consider this pixel.
+                continue
+            else:
+                # Blit new pixel into the correct range.
+                rowbytes[(imgx * 4):((imgx + 1) * 4)] = pixel_renderer(
+                    imgx,
+                    imgy,
+                    imgwidth,
+                    texwidth,
+                    texheight,
+                    1.0 / inverse.xscale,
+                    1.0 / inverse.yscale,
+                    lambda point: inverse.multiply_point(point),
+                    add_color,
+                    mult_color,
+                    blendfunc,
+                    imgbytes,
+                    texbytes,
+                    maskbytes,
+                    enable_aa,
+                )
+
+        results.put((imgy, bytes(rowbytes)))
+
+
 def affine_composite(
     img: Image.Image,
     add_color: Color,
@@ -262,7 +475,9 @@ def affine_composite(
                     imgwidth,
                     texwidth,
                     texheight,
-                    inverse,
+                    1.0 / inverse.xscale,
+                    1.0 / inverse.yscale,
+                    lambda point: inverse.multiply_point(point),
                     add_color,
                     mult_color,
                     blendfunc,
@@ -298,7 +513,7 @@ def affine_composite(
 
         for _ in range(cores):
             proc = multiprocessing.Process(
-                target=line_renderer,
+                target=affine_line_renderer,
                 args=(
                     work,
                     results,
@@ -349,7 +564,7 @@ def affine_composite(
     return img
 
 
-def line_renderer(
+def perspective_line_renderer(
     work: multiprocessing.Queue,
     results: multiprocessing.Queue,
     minx: int,
@@ -357,6 +572,8 @@ def line_renderer(
     imgwidth: int,
     texwidth: int,
     texheight: int,
+    xscale: float,
+    yscale: float,
     inverse: Matrix,
     add_color: Color,
     mult_color: Color,
@@ -366,6 +583,14 @@ def line_renderer(
     maskbytes: Optional[Union[bytes, bytearray]],
     enable_aa: bool,
 ) -> None:
+    def perspective_inverse(imgpoint: Point) -> Optional[Point]:
+        # Calculate the texture coordinate with our perspective interpolation.
+        texdiv = inverse.multiply_point(imgpoint)
+        if texdiv.z <= 0.0:
+            return None
+
+        return Point(texdiv.x / texdiv.z, texdiv.y / texdiv.z)
+
     while True:
         imgy = work.get()
         if imgy is None:
@@ -384,7 +609,9 @@ def line_renderer(
                     imgwidth,
                     texwidth,
                     texheight,
-                    inverse,
+                    xscale,
+                    yscale,
+                    perspective_inverse,
                     add_color,
                     mult_color,
                     blendfunc,
@@ -397,160 +624,6 @@ def line_renderer(
         results.put((imgy, bytes(rowbytes)))
 
 
-def pixel_renderer(
-    imgx: int,
-    imgy: int,
-    imgwidth: int,
-    texwidth: int,
-    texheight: int,
-    inverse: Matrix,
-    add_color: Color,
-    mult_color: Color,
-    blendfunc: int,
-    imgbytes: Union[bytes, bytearray],
-    texbytes: Union[bytes, bytearray],
-    maskbytes: Optional[Union[bytes, bytearray]],
-    enable_aa: bool,
-) -> Sequence[int]:
-    # Determine offset
-    maskoff = imgx + (imgy * imgwidth)
-    imgoff = maskoff * 4
-
-    if maskbytes is not None and maskbytes[maskoff] == 0:
-        # This pixel is masked off!
-        return imgbytes[imgoff:(imgoff + 4)]
-
-    if enable_aa:
-        r = 0
-        g = 0
-        b = 0
-        a = 0
-        count = 0
-        denom = 0
-
-        # Essentially what we're doing here is calculating the scale, clamping it at 1.0 as the
-        # minimum and then setting the AA sample swing accordingly. This has the effect of anti-aliasing
-        # scaled up images a bit softer than would otherwise be achieved.
-        xscale = 1.0 / inverse.xscale
-        yscale = 1.0 / inverse.yscale
-
-        # These are used for picking the various sample points for SSAA method below.
-        xswing = 0.5 * max(1.0, xscale)
-        yswing = 0.5 * max(1.0, yscale)
-
-        xpoints = [0.5 - xswing, 0.5 - (xswing / 2.0), 0.5, 0.5 + (xswing / 2.0), 0.5 + xswing]
-        ypoints = [0.5 - yswing, 0.5 - (yswing / 2.0), 0.5, 0.5 + (yswing / 2.0), 0.5 + yswing]
-
-        # First, figure out if we can use bilinear resampling.
-        bilinear = False
-        if xscale >= 1.0 and yscale >= 1.0:
-            aaloc = inverse.multiply_point(Point(imgx + 0.5, imgy + 0.5))
-            aax, aay, _ = aaloc.as_tuple()
-            if not (aax <= 0 or aay <= 0 or aax >= (texwidth - 1) or aay >= (texheight - 1)):
-                bilinear = True
-
-        # Now perform the desired AA operation.
-        if bilinear:
-            # Calculate the pixel we're after, and what percentage into the pixel we are.
-            texloc = inverse.multiply_point(Point(imgx + 0.5, imgy + 0.5))
-            aax, aay, _ = texloc.as_tuple()
-            aaxrem = texloc.x - aax
-            aayrem = texloc.y - aay
-
-            # Find the four pixels that we can interpolate from. The first number is the x, and second is y.
-            tex00 = (aax + (aay * texwidth)) * 4
-            tex10 = tex00 + 4
-            tex01 = (aax + ((aay + 1) * texwidth)) * 4
-            tex11 = tex01 + 4
-
-            # Calculate various scaling factors based on alpha and percentage.
-            tex00percent = texbytes[tex00 + 3] / 255.0
-            tex10percent = texbytes[tex10 + 3] / 255.0
-            tex01percent = texbytes[tex01 + 3] / 255.0
-            tex11percent = texbytes[tex11 + 3] / 255.0
-
-            y0percent = (tex00percent * (1.0 - aaxrem)) + (tex10percent * aaxrem)
-            y1percent = (tex01percent * (1.0 - aaxrem)) + (tex11percent * aaxrem)
-            finalpercent = (y0percent * (1.0 - aayrem)) + (y1percent * aayrem)
-
-            if finalpercent <= 0.0:
-                # This pixel would be blank, so we avoid dividing by zero.
-                average = [255, 255, 255, 0]
-            else:
-                # Interpolate in the X direction on both Y axis.
-                y0r = ((texbytes[tex00] * tex00percent * (1.0 - aaxrem)) + (texbytes[tex10] * tex10percent * aaxrem))
-                y0g = ((texbytes[tex00 + 1] * tex00percent * (1.0 - aaxrem)) + (texbytes[tex10 + 1] * tex10percent * aaxrem))
-                y0b = ((texbytes[tex00 + 2] * tex00percent * (1.0 - aaxrem)) + (texbytes[tex10 + 2] * tex10percent * aaxrem))
-
-                y1r = ((texbytes[tex01] * tex01percent * (1.0 - aaxrem)) + (texbytes[tex11] * tex11percent * aaxrem))
-                y1g = ((texbytes[tex01 + 1] * tex01percent * (1.0 - aaxrem)) + (texbytes[tex11 + 1] * tex11percent * aaxrem))
-                y1b = ((texbytes[tex01 + 2] * tex01percent * (1.0 - aaxrem)) + (texbytes[tex11 + 2] * tex11percent * aaxrem))
-
-                # Now interpolate the Y direction to get the final pixel value.
-                average = [
-                    int(((y0r * (1.0 - aayrem)) + (y1r * aayrem)) / finalpercent),
-                    int(((y0g * (1.0 - aayrem)) + (y1g * aayrem)) / finalpercent),
-                    int(((y0b * (1.0 - aayrem)) + (y1b * aayrem)) / finalpercent),
-                    int(finalpercent * 255),
-                ]
-        else:
-            for addy in ypoints:
-                for addx in xpoints:
-                    texloc = inverse.multiply_point(Point(imgx + addx, imgy + addy))
-                    aax, aay, _ = texloc.as_tuple()
-
-                    # If we're out of bounds, don't update. Factor this in, however, so we can get partial
-                    # transparency to the pixel that is already there.
-                    denom += 1
-                    if aax < 0 or aay < 0 or aax >= texwidth or aay >= texheight:
-                        continue
-
-                    # Grab the values to average, for SSAA. Make sure to factor in alpha as a poor-man's
-                    # blend to ensure that partial transparency pixel values don't unnecessarily factor
-                    # into average calculations.
-                    texoff = (aax + (aay * texwidth)) * 4
-
-                    # If this is a fully transparent pixel, the below formulas work out to adding nothing
-                    # so we should skip this altogether.
-                    if texbytes[texoff + 3] == 0:
-                        continue
-
-                    apercent = texbytes[texoff + 3] / 255.0
-                    r += int(texbytes[texoff] * apercent)
-                    g += int(texbytes[texoff + 1] * apercent)
-                    b += int(texbytes[texoff + 2] * apercent)
-                    a += texbytes[texoff + 3]
-                    count += 1
-
-            if count == 0:
-                # None of the samples existed in-bounds.
-                return imgbytes[imgoff:(imgoff + 4)]
-
-            # Average the pixels. Make sure to divide out the alpha in preparation for blending.
-            alpha = a // denom
-
-            if alpha == 0:
-                average = [255, 255, 255, alpha]
-            else:
-                apercent = alpha / 255.0
-                average = [int((r / denom) / apercent), int((g / denom) / apercent), int((b / denom) / apercent), alpha]
-
-        # Finally, blend it with the destination.
-        return blend_point(add_color, mult_color, average, imgbytes[imgoff:(imgoff + 4)], blendfunc)
-    else:
-        # Calculate what texture pixel data goes here.
-        texloc = inverse.multiply_point(Point(imgx + 0.5, imgy + 0.5))
-        texx, texy, _ = texloc.as_tuple()
-
-        # If we're out of bounds, don't update.
-        if texx < 0 or texy < 0 or texx >= texwidth or texy >= texheight:
-            return imgbytes[imgoff:(imgoff + 4)]
-
-        # Blend it.
-        texoff = (texx + (texy * texwidth)) * 4
-        return blend_point(add_color, mult_color, texbytes[texoff:(texoff + 4)], imgbytes[imgoff:(imgoff + 4)], blendfunc)
-
-
 def perspective_composite(
     img: Image.Image,
     add_color: Color,
@@ -576,6 +649,12 @@ def perspective_composite(
     texwidth = texture.width
     texheight = texture.height
 
+    # Get the perspective-correct inverse matrix for looking up texture coordinates.
+    inverse_matrix, minx, miny, maxx, maxy = perspective_calculate(imgwidth, imgheight, texwidth, texheight, transform, camera, focal_length)
+    if inverse_matrix is None:
+        # This texture is entirely off of the screen.
+        return img
+
     # Get the data in an easier to manipulate and faster to update fashion.
     imgbytes = bytearray(img.tobytes('raw', 'RGBA'))
     texbytes = texture.tobytes('raw', 'RGBA')
@@ -585,29 +664,122 @@ def perspective_composite(
     else:
         maskbytes = None
 
-    for texy in range(texheight):
-        for texx in range(texwidth):
-            # Calculate perspective projection.
-            imgloc = transform.multiply_point(Point(texx, texy))
-            perspective = focal_length / (imgloc.z - camera.z)
-            imgx = int(((imgloc.x - camera.x) * perspective) + camera.x)
-            imgy = int(((imgloc.y - camera.y) * perspective) + camera.y)
+    def perspective_inverse(imgpoint: Point) -> Optional[Point]:
+        # Calculate the texture coordinate with our perspective interpolation.
+        texdiv = inverse_matrix.multiply_point(imgpoint)
+        if texdiv.z <= 0.0:
+            return None
 
-            # Check clipping.
-            if imgx < 0 or imgx >= imgwidth:
-                continue
-            if imgy < 0 or imgy >= imgheight:
-                continue
+        return Point(texdiv.x / texdiv.z, texdiv.y / texdiv.z)
 
-            # Check mask rectangle.
-            maskoff = imgx + (imgy * imgwidth)
-            imgoff = maskoff * 4
-            if maskbytes is not None and maskbytes[maskoff] == 0:
-                continue
+    cores = multiprocessing.cpu_count()
+    if single_threaded or cores < 2:
+        # Get the data in an easier to manipulate and faster to update fashion.
+        imgbytes = bytearray(img.tobytes('raw', 'RGBA'))
+        texbytes = texture.tobytes('raw', 'RGBA')
+        if mask:
+            alpha = mask.split()[-1]
+            maskbytes = alpha.tobytes('raw', 'L')
+        else:
+            maskbytes = None
 
-            # Blend it.
-            texoff = (texx + (texy * texwidth)) * 4
-            imgbytes[imgoff:(imgoff + 4)] = blend_point(add_color, mult_color, texbytes[texoff:(texoff + 4)], imgbytes[imgoff:(imgoff + 4)], blendfunc)
+        # We don't have enough CPU cores to bother multiprocessing.
+        for imgy in range(miny, maxy):
+            for imgx in range(minx, maxx):
+                # Determine offset
+                imgoff = (imgx + (imgy * imgwidth)) * 4
+                imgbytes[imgoff:(imgoff + 4)] = pixel_renderer(
+                    imgx,
+                    imgy,
+                    imgwidth,
+                    texwidth,
+                    texheight,
+                    transform.xscale,
+                    transform.yscale,
+                    perspective_inverse,
+                    add_color,
+                    mult_color,
+                    blendfunc,
+                    imgbytes,
+                    texbytes,
+                    maskbytes,
+                    enable_aa,
+                )
 
         img = Image.frombytes('RGBA', (imgwidth, imgheight), bytes(imgbytes))
+    else:
+        imgbytes = img.tobytes('raw', 'RGBA')
+        texbytes = texture.tobytes('raw', 'RGBA')
+        if mask:
+            alpha = mask.split()[-1]
+            maskbytes = alpha.tobytes('raw', 'L')
+        else:
+            maskbytes = None
+
+        # Let's spread the load across multiple processors.
+        procs: List[multiprocessing.Process] = []
+        work: multiprocessing.Queue = multiprocessing.Queue()
+        results: multiprocessing.Queue = multiprocessing.Queue()
+        expected: int = 0
+        interrupted: bool = False
+
+        def ctrlc(sig: Any, frame: Any) -> None:
+            nonlocal interrupted
+            interrupted = True
+
+        previous_handler = signal.getsignal(signal.SIGINT)
+        signal.signal(signal.SIGINT, ctrlc)
+
+        for _ in range(cores):
+            proc = multiprocessing.Process(
+                target=perspective_line_renderer,
+                args=(
+                    work,
+                    results,
+                    minx,
+                    maxx,
+                    imgwidth,
+                    texwidth,
+                    texheight,
+                    transform.xscale,
+                    transform.yscale,
+                    inverse_matrix,
+                    add_color,
+                    mult_color,
+                    blendfunc,
+                    imgbytes,
+                    texbytes,
+                    maskbytes,
+                    enable_aa,
+                ),
+            )
+            procs.append(proc)
+            proc.start()
+
+        for imgy in range(miny, maxy):
+            work.put(imgy)
+            expected += 1
+
+        lines: List[bytes] = [
+            imgbytes[x:(x + (imgwidth * 4))]
+            for x in range(
+                0,
+                imgwidth * imgheight * 4,
+                imgwidth * 4,
+            )
+        ]
+        for _ in range(expected):
+            imgy, result = results.get()
+            lines[imgy] = result
+
+        for _proc in procs:
+            work.put(None)
+        for proc in procs:
+            proc.join()
+
+        signal.signal(signal.SIGINT, previous_handler)
+        if interrupted:
+            raise KeyboardInterrupt()
+
+        img = Image.frombytes('RGBA', (imgwidth, imgheight), b''.join(lines))
     return img
diff --git a/bemani/format/afp/blend/blendcpp.pyx b/bemani/format/afp/blend/blendcpp.pyx
index f186554..71eb7b1 100644
--- a/bemani/format/afp/blend/blendcpp.pyx
+++ b/bemani/format/afp/blend/blendcpp.pyx
@@ -3,6 +3,7 @@ from PIL import Image  # type: ignore
 from typing import Optional, Tuple
 
 from ..types import Color, Matrix, Point
+from .perspective import perspective_calculate
 
 cdef extern struct floatcolor_t:
     float r;
@@ -24,9 +25,9 @@ cdef extern struct matrix_t:
     float a42;
     float a43;
 
-cdef extern int affine_composite_fast(
-    unsigned char *imgdata,
-    unsigned char *maskdata,
+cdef extern int composite_fast(
+    unsigned char *imgbytes,
+    unsigned char *maskbytes,
     unsigned int imgwidth,
     unsigned int imgheight,
     unsigned int minx,
@@ -35,27 +36,10 @@ cdef extern int affine_composite_fast(
     unsigned int maxy,
     floatcolor_t add_color,
     floatcolor_t mult_color,
+    float xscale,
+    float yscale,
     matrix_t inverse,
-    int blendfunc,
-    unsigned char *texdata,
-    unsigned int texwidth,
-    unsigned int texheight,
-    unsigned int threads,
-    unsigned int enable_aa,
-)
-
-cdef extern int perspective_composite_fast(
-    unsigned char *imgbytes,
-    unsigned char *maskbytes,
-    unsigned int imgwidth,
-    unsigned int imgheight,
-    float camera_x,
-    float camera_y,
-    float camera_z,
-    float focal_length,
-    floatcolor_t add_color,
-    floatcolor_t mult_color,
-    matrix_t transform,
+    int use_perspective,
     int blendfunc,
     unsigned char *texbytes,
     unsigned int texwidth,
@@ -137,7 +121,7 @@ def affine_composite(
     cdef unsigned int threads = 1 if single_threaded else multiprocessing.cpu_count()
 
     # Call the C++ function.
-    errors = affine_composite_fast(
+    errors = composite_fast(
         imgbytes,
         maskbytes,
         imgwidth,
@@ -148,7 +132,10 @@ def affine_composite(
         maxy,
         c_addcolor,
         c_multcolor,
+        transform.xscale,
+        transform.yscale,
         c_inverse,
+        0,
         blendfunc,
         texbytes,
         texwidth,
@@ -190,6 +177,12 @@ def perspective_composite(
     texwidth = texture.width
     texheight = texture.height
 
+    # Get the perspective-correct inverse matrix for looking up texture coordinates.
+    inverse_matrix, minx, miny, maxx, maxy = perspective_calculate(imgwidth, imgheight, texwidth, texheight, transform, camera, focal_length)
+    if inverse_matrix is None:
+        # This texture is entirely off of the screen.
+        return img
+
     # Grab the raw image data.
     imgbytes = img.tobytes('raw', 'RGBA')
     texbytes = texture.tobytes('raw', 'RGBA')
@@ -207,27 +200,30 @@ def perspective_composite(
     # Convert classes to C structs.
     cdef floatcolor_t c_addcolor = floatcolor_t(r=add_color.r, g=add_color.g, b=add_color.b, a=add_color.a)
     cdef floatcolor_t c_multcolor = floatcolor_t(r=mult_color.r, g=mult_color.g, b=mult_color.b, a=mult_color.a)
-    cdef matrix_t c_transform = matrix_t(
-        a11=transform.a11, a12=transform.a12, a13=transform.a13,
-        a21=transform.a21, a22=transform.a22, a23=transform.a23,
-        a31=transform.a31, a32=transform.a32, a33=transform.a33,
-        a41=transform.a41, a42=transform.a42, a43=transform.a43,
+    cdef matrix_t c_inverse = matrix_t(
+        a11=inverse_matrix.a11, a12=inverse_matrix.a12, a13=inverse_matrix.a13,
+        a21=inverse_matrix.a21, a22=inverse_matrix.a22, a23=inverse_matrix.a23,
+        a31=inverse_matrix.a31, a32=inverse_matrix.a32, a33=inverse_matrix.a33,
+        a41=inverse_matrix.a41, a42=inverse_matrix.a42, a43=inverse_matrix.a43,
     )
     cdef unsigned int threads = 1 if single_threaded else multiprocessing.cpu_count()
 
     # Call the C++ function.
-    errors = perspective_composite_fast(
+    errors = composite_fast(
         imgbytes,
         maskbytes,
         imgwidth,
         imgheight,
-        camera.x,
-        camera.y,
-        camera.z,
-        focal_length,
+        minx,
+        maxx,
+        miny,
+        maxy,
         c_addcolor,
         c_multcolor,
-        c_transform,
+        transform.xscale,
+        transform.yscale,
+        c_inverse,
+        1,
         blendfunc,
         texbytes,
         texwidth,
diff --git a/bemani/format/afp/blend/blendcppimpl.cxx b/bemani/format/afp/blend/blendcppimpl.cxx
index 6b454fc..1247329 100644
--- a/bemani/format/afp/blend/blendcppimpl.cxx
+++ b/bemani/format/afp/blend/blendcppimpl.cxx
@@ -56,14 +56,6 @@ extern "C"
                 (a13 * point.x) + (a23 * point.y) + (a33 * point.z) + a43,
             };
         }
-
-        float xscale() {
-            return sqrt((a11 * a11) + (a12 * a12) + (a13 * a13));
-        }
-
-        float yscale() {
-            return sqrt((a21 * a21) + (a22 * a22) + (a23 * a23));
-        }
     } matrix_t;
 
     typedef struct work {
@@ -77,7 +69,10 @@ extern "C"
         intcolor_t *texdata;
         unsigned int texwidth;
         unsigned int texheight;
+        float xscale;
+        float yscale;
         matrix_t inverse;
+        int use_perspective;
         floatcolor_t add_color;
         floatcolor_t mult_color;
         int blendfunc;
@@ -270,12 +265,8 @@ extern "C"
         // costs us almost nothing. Essentially what we're doing here is calculating the scale, clamping it at 1.0 as the
         // minimum and then setting the AA sample swing accordingly. This has the effect of anti-aliasing scaled up images
         // a bit softer than would otherwise be achieved.
-        float xscale = 1.0 / work->inverse.xscale();
-        float yscale = 1.0 / work->inverse.yscale();
-
-        // These are used for picking the various sample points for SSAA method below.
-        float xswing = 0.5 * fmax(1.0, xscale);
-        float yswing = 0.5 * fmax(1.0, yscale);
+        float xswing = 0.5 * fmax(1.0, work->xscale);
+        float yswing = 0.5 * fmax(1.0, work->yscale);
 
         for (unsigned int imgy = work->miny; imgy < work->maxy; imgy++) {
             for (unsigned int imgx = work->minx; imgx < work->maxx; imgx++) {
@@ -300,10 +291,21 @@ extern "C"
 
                     // First, figure out if we can use bilinear resampling.
                     int bilinear = 0;
-                    if (xscale >= 1.0 && yscale >= 1.0) {
-                        point_t aaloc = work->inverse.multiply_point((point_t){(float)(imgx + 0.5), (float)(imgy + 0.5)});
-                        int aax = aaloc.x;
-                        int aay = aaloc.y;
+                    if (work->xscale >= 1.0 && work->yscale >= 1.0) {
+                        int aax = -1;
+                        int aay = -1;
+
+                        if (work->use_perspective) {
+                            point_t aaloc = work->inverse.multiply_point((point_t){(float)(imgx + 0.5), (float)(imgy + 0.5)});
+                            if (aaloc.z > 0.0) {
+                                aax = aaloc.x / aaloc.z;
+                                aay = aaloc.y / aaloc.z;
+                            }
+                        } else {
+                            point_t aaloc = work->inverse.multiply_point((point_t){(float)(imgx + 0.5), (float)(imgy + 0.5)});
+                            aax = aaloc.x;
+                            aay = aaloc.y;
+                        }
 
                         if (!(aax <= 0 || aay <= 0 || aax >= ((int)work->texwidth - 1) || aay >= ((int)work->texheight - 1))) {
                             bilinear = 1;
@@ -314,11 +316,28 @@ extern "C"
                     intcolor_t average;
                     if (bilinear) {
                         // Calculate the pixel we're after, and what percentage into the pixel we are.
-                        point_t texloc = work->inverse.multiply_point((point_t){(float)(imgx + 0.5), (float)(imgy + 0.5)});
-                        int aax = texloc.x;
-                        int aay = texloc.y;
-                        float aaxrem = texloc.x - (float)aax;
-                        float aayrem = texloc.y - (float)aay;
+                        int aax;
+                        int aay;
+                        float aaxrem;
+                        float aayrem;
+
+                        if (work->use_perspective) {
+                            // We don't check for a non-positive z here, because we already checked it above and
+                            // wouldn't have enabled bilinear interpolation.
+                            point_t texloc = work->inverse.multiply_point((point_t){(float)(imgx + 0.5), (float)(imgy + 0.5)});
+                            float fx = texloc.x / texloc.z;
+                            float fy = texloc.y / texloc.z;
+                            aax = fx;
+                            aay = fy;
+                            aaxrem = fx - (float)aax;
+                            aayrem = fy - (float)aay;
+                        } else {
+                            point_t texloc = work->inverse.multiply_point((point_t){(float)(imgx + 0.5), (float)(imgy + 0.5)});
+                            aax = texloc.x;
+                            aay = texloc.y;
+                            aaxrem = texloc.x - (float)aax;
+                            aayrem = texloc.y - (float)aay;
+                        }
 
                         // Find the four pixels that we can interpolate from. The first number is the x, and second is y.
                         unsigned int tex00 = aax + (aay * work->texwidth);
@@ -366,9 +385,20 @@ extern "C"
                     } else {
                         for (float addy = 0.5 - yswing; addy <= 0.5 + yswing; addy += yswing / 2.0) {
                             for (float addx = 0.5 - xswing; addx <= 0.5 + xswing; addx += xswing / 2.0) {
-                                point_t texloc = work->inverse.multiply_point((point_t){(float)imgx + addx, (float)imgy + addy});
-                                int aax = texloc.x;
-                                int aay = texloc.y;
+                                int aax = -1;
+                                int aay = -1;
+
+                                if (work->use_perspective) {
+                                    point_t texloc = work->inverse.multiply_point((point_t){(float)imgx + addx, (float)imgy + addy});
+                                    if (texloc.z > 0.0) {
+                                        aax = texloc.x / texloc.z;
+                                        aay = texloc.y / texloc.z;
+                                    }
+                                } else {
+                                    point_t texloc = work->inverse.multiply_point((point_t){(float)imgx + addx, (float)imgy + addy});
+                                    aax = texloc.x;
+                                    aay = texloc.y;
+                                }
 
                                 // If we're out of bounds, don't update. Factor this in, however, so we can get partial
                                 // transparency to the pixel that is already there.
@@ -429,9 +459,20 @@ extern "C"
                     work->imgdata[imgoff] = blend_point(work->add_color, work->mult_color, average, work->imgdata[imgoff], work->blendfunc);
                 } else {
                     // Grab the center of the pixel to get the color.
-                    point_t texloc = work->inverse.multiply_point((point_t){(float)imgx + (float)0.5, (float)imgy + (float)0.5});
-                    int texx = texloc.x;
-                    int texy = texloc.y;
+                    int texx = -1;
+                    int texy = -1;
+
+                    if (work->use_perspective) {
+                        point_t texloc = work->inverse.multiply_point((point_t){(float)imgx + (float)0.5, (float)imgy + (float)0.5});
+                        if (texloc.z > 0.0) {
+                            texx = texloc.x / texloc.z;
+                            texy = texloc.y / texloc.z;
+                        }
+                    } else {
+                        point_t texloc = work->inverse.multiply_point((point_t){(float)imgx + (float)0.5, (float)imgy + (float)0.5});
+                        texx = texloc.x;
+                        texy = texloc.y;
+                    }
 
                     // If we're out of bounds, don't update.
                     if (texx < 0 || texy < 0 || texx >= (int)work->texwidth || texy >= (int)work->texheight) {
@@ -452,7 +493,7 @@ extern "C"
         return NULL;
     }
 
-    int affine_composite_fast(
+    int composite_fast(
         unsigned char *imgbytes,
         unsigned char *maskbytes,
         unsigned int imgwidth,
@@ -463,7 +504,10 @@ extern "C"
         unsigned int maxy,
         floatcolor_t add_color,
         floatcolor_t mult_color,
+        float xscale,
+        float yscale,
         matrix_t inverse,
+        int use_perspective,
         int blendfunc,
         unsigned char *texbytes,
         unsigned int texwidth,
@@ -488,11 +532,14 @@ extern "C"
             work.texdata = texdata;
             work.texwidth = texwidth;
             work.texheight = texheight;
+            work.xscale = xscale;
+            work.yscale = yscale;
             work.inverse = inverse;
             work.add_color = add_color;
             work.mult_color = mult_color;
             work.blendfunc = blendfunc;
             work.enable_aa = enable_aa;
+            work.use_perspective = use_perspective;
 
             chunk_composite_fast(&work);
         } else {
@@ -531,12 +578,15 @@ extern "C"
                 work->texdata = texdata;
                 work->texwidth = texwidth;
                 work->texheight = texheight;
+                work->xscale = xscale;
+                work->yscale = yscale;
                 work->inverse = inverse;
                 work->add_color = add_color;
                 work->mult_color = mult_color;
                 work->blendfunc = blendfunc;
                 work->thread = thread;
                 work->enable_aa = enable_aa;
+                work->use_perspective = use_perspective;
 
                 if (me)
                 {
@@ -584,58 +634,4 @@ extern "C"
 
         return 0;
     }
-
-    int perspective_composite_fast(
-        unsigned char *imgbytes,
-        unsigned char *maskbytes,
-        unsigned int imgwidth,
-        unsigned int imgheight,
-        float camera_x,
-        float camera_y,
-        float camera_z,
-        float focal_length,
-        floatcolor_t add_color,
-        floatcolor_t mult_color,
-        matrix_t transform,
-        int blendfunc,
-        unsigned char *texbytes,
-        unsigned int texwidth,
-        unsigned int texheight,
-        unsigned int threads,
-        unsigned int enable_aa
-    ) {
-        // Cast to a usable type.
-        intcolor_t *imgdata = (intcolor_t *)imgbytes;
-        intcolor_t *texdata = (intcolor_t *)texbytes;
-
-        for (unsigned int texy = 0; texy < texheight; texy++) {
-            for (unsigned int texx = 0; texx < texwidth; texx++) {
-                // Calculate perspective projection.
-                point_t imgloc = transform.multiply_point((point_t){(float)texx, (float)texy});
-                float perspective = focal_length / (imgloc.z - camera_z);
-                int imgx = ((imgloc.x - camera_x) * perspective) + camera_x;
-                int imgy = ((imgloc.y - camera_y) * perspective) + camera_y;
-
-                // Check clipping.
-                if (imgx < 0 || imgx >= (int)imgwidth) {
-                    continue;
-                }
-                if (imgy < 0 || imgy >= (int)imgheight) {
-                    continue;
-                }
-
-                // Check mask rectangle.
-                unsigned int imgoff = imgx + (imgy * imgwidth);
-                if (maskbytes != NULL && maskbytes[imgoff] == 0) {
-                    continue;
-                }
-
-                // Blend it.
-                unsigned int texoff = (texx + (texy * texwidth));
-                imgdata[imgoff] = blend_point(add_color, mult_color, texdata[texoff], imgdata[imgoff], blendfunc);
-            }
-        }
-
-        return 0;
-    }
 }
diff --git a/bemani/format/afp/blend/perspective.py b/bemani/format/afp/blend/perspective.py
new file mode 100644
index 0000000..65dd1a1
--- /dev/null
+++ b/bemani/format/afp/blend/perspective.py
@@ -0,0 +1,122 @@
+from typing import List, Optional, Tuple
+
+from ..types import Matrix, Point
+
+
+def perspective_calculate(
+    imgwidth: int,
+    imgheight: int,
+    texwidth: int,
+    texheight: int,
+    transform: Matrix,
+    camera: Point,
+    focal_length: float,
+) -> Tuple[Optional[Matrix], int, int, int, int]:
+    """
+    Calculate the matrix and screen bounds needed to draw a texture in perspective.
+
+    Each corner of the texture is run through transform and then projected onto
+    the screen by scaling its x/y offset from the camera by focal_length divided
+    by its z distance from the camera.
+
+    Returns a tuple of (matrix, minx, miny, maxx, maxy). The bounds are the
+    rectangle containing the four projected corners, clamped to the image size.
+    The matrix converts a screen coordinate into scaled u/z, v/z and 1/z values,
+    so dividing the resulting x and y by the resulting z gives a texture
+    coordinate. The matrix is None when there is nothing to draw: the clamped
+    bounds are empty, the projected corners can't be inverted, or a transformed
+    corner has the same z as the camera and so has no finite projection.
+    """
+
+    # Arbitrarily choose three points on the texture to create a pair of vectors
+    # so that we can interpolate backwards. This isn't as simple as inverting the
+    # view matrix like in affine compositing because dividing by Z makes the
+    # perspective transform non-linear. So instead we interpolate 1/Z, u/Z and
+    # v/Z since those ARE linear, and work backwards from there.
+    #
+    # The projected corners and their u/z, v/z and 1/z values are kept in two
+    # parallel lists sharing an index. They are deliberately not looked up by
+    # projected point, because two corners can land on the same integer pixel.
+    xy: List[Point] = []
+    uvz: List[Point] = []
+    for (texx, texy) in [
+        (0, 0),
+        (texwidth, 0),
+        (0, texheight),
+        # Include this just to get a good upper bounds for where the texture
+        # will be drawn.
+        (texwidth, texheight),
+    ]:
+        imgloc = transform.multiply_point(Point(texx, texy))
+        distance = imgloc.z - camera.z
+        if distance == 0:
+            # This corner has no finite projection, so treat the texture as not
+            # displayable instead of dividing by zero.
+            return (None, 0, 0, 0, 0)
+
+        perspective = focal_length / distance
+        imgx = int(((imgloc.x - camera.x) * perspective) + camera.x)
+        imgy = int(((imgloc.y - camera.y) * perspective) + camera.y)
+
+        xy.append(Point(imgx, imgy))
+        uvz.append(
+            Point(
+                focal_length * texx / distance,
+                focal_length * texy / distance,
+                perspective,
+            )
+        )
+
+    # Calculate the maximum range of update this texture can possibly reside in.
+    minx = max(int(min(p.x for p in xy)), 0)
+    maxx = min(int(max(p.x for p in xy)) + 1, imgwidth)
+    miny = max(int(min(p.y for p in xy)), 0)
+    maxy = min(int(max(p.y for p in xy)) + 1, imgheight)
+
+    if maxx <= minx or maxy <= miny:
+        # This image is entirely off the screen!
+        return (None, minx, miny, maxx, maxy)
+
+    # Now that we have three points, construct a matrix that allows us to calculate
+    # what amount of each u/z, v/z and 1/z vector we need to interpolate values. The
+    # below matrix gives us an affine transform that will convert a point that's in
+    # the range 0, 0 to 1, 1 to a point inside the parallelogram that is made by
+    # projecting the two vectors we got from calculating the three texture points above.
+    xy_matrix = Matrix.affine(
+        a=xy[1].x - xy[0].x,
+        b=xy[1].y - xy[0].y,
+        c=xy[2].x - xy[0].x,
+        d=xy[2].y - xy[0].y,
+        tx=xy[0].x,
+        ty=xy[0].y,
+    )
+
+    # We invert that above, which gives us a matrix that can take screen space (imgx,
+    # imgy) and gives us instead those ratios, which allows us to then interpolate the
+    # u/z, v/z and 1/z values.
+    try:
+        xy_matrix = xy_matrix.inverse()
+    except ZeroDivisionError:
+        # This can't be inverted, so this shouldn't be displayed.
+        return (None, minx, miny, maxx, maxy)
+
+    # We construct a second matrix, which interpolates coordinates in the range of
+    # 0, 0 to 1, 1 and gives us back the u/z, v/z and 1/z values.
+    uvz_matrix = Matrix(
+        a11=uvz[1].x - uvz[0].x,
+        a12=uvz[1].y - uvz[0].y,
+        a13=uvz[1].z - uvz[0].z,
+        a21=uvz[2].x - uvz[0].x,
+        a22=uvz[2].y - uvz[0].y,
+        a23=uvz[2].z - uvz[0].z,
+        a31=0.0,
+        a32=0.0,
+        a33=0.0,
+        a41=uvz[0].x,
+        a42=uvz[0].y,
+        a43=uvz[0].z,
+    )
+
+    # Finally, we can combine the two matrices to do the interpolation all at once.
+    inverse_matrix = xy_matrix.multiply(uvz_matrix)
+    return (inverse_matrix, minx, miny, maxx, maxy)