diff --git a/bemani/format/afp/blend/blend.py b/bemani/format/afp/blend/blend.py
index 39d59a6..089d314 100644
--- a/bemani/format/afp/blend/blend.py
+++ b/bemani/format/afp/blend/blend.py
@@ -1,3 +1,4 @@
+import math
 import multiprocessing
 import signal
 from PIL import Image  # type: ignore
@@ -427,8 +428,11 @@ def pixel_renderer(
         a = 0
         count = 0
 
-        xswing = abs(0.5 / inverse.a)
-        yswing = abs(0.5 / inverse.d)
+        # Essentially what we're doing here is calculating the scale, clamping it at 1.0 as the
+        # minimum and then setting the AA sample swing accordingly. This has the effect of anti-aliasing
+        # scaled-up images a bit more softly than would otherwise be achieved.
+        xswing = 0.5 * max(1.0, 1.0 / math.sqrt(inverse.a * inverse.a + inverse.b * inverse.b))
+        yswing = 0.5 * max(1.0, 1.0 / math.sqrt(inverse.c * inverse.c + inverse.d * inverse.d))
 
         xpoints = [0.5 - xswing, 0.5 - (xswing / 2.0), 0.5, 0.5 + (xswing / 2.0), 0.5 + xswing]
         ypoints = [0.5 - yswing, 0.5 - (yswing / 2.0), 0.5, 0.5 + (yswing / 2.0), 0.5 + yswing]
diff --git a/bemani/format/afp/blend/blendcppimpl.cxx b/bemani/format/afp/blend/blendcppimpl.cxx
index 2613319..a6c3dd3 100644
--- a/bemani/format/afp/blend/blendcppimpl.cxx
+++ b/bemani/format/afp/blend/blendcppimpl.cxx
@@ -249,6 +249,13 @@ extern "C"
     }
 
     void chunk_composite_fast(work_t *work) {
+        // Regardless of AA work, calculate the transform matrix for determining the stride for AA pixel lookups, since it
+        // costs us almost nothing. Essentially what we're doing here is calculating the scale, clamping it at 1.0 as the
+        // minimum and then setting the AA sample swing accordingly. This has the effect of anti-aliasing scaled-up images
+        // a bit more softly than would otherwise be achieved.
+        float xswing = 0.5 * fmax(1.0, 1.0 / sqrt(work->inverse.a * work->inverse.a + work->inverse.b * work->inverse.b));
+        float yswing = 0.5 * fmax(1.0, 1.0 / sqrt(work->inverse.c * work->inverse.c + work->inverse.d * work->inverse.d));
+
         for (unsigned int imgy = work->miny; imgy < work->maxy; imgy++) {
             for (unsigned int imgx = work->minx; imgx < work->maxx; imgx++) {
                 // Determine offset.
@@ -269,9 +276,6 @@ extern "C"
                     int a = 0;
                     int count = 0;
 
-                    float xswing = fabs(0.5 / work->inverse.a);
-                    float yswing = fabs(0.5 / work->inverse.d);
-
                     for (float addy = 0.5 - yswing; addy <= 0.5 + yswing; addy += yswing / 2.0) {
                         for (float addx = 0.5 - xswing; addx <= 0.5 + xswing; addx += xswing / 2.0) {
                             point_t texloc = work->inverse.multiply_point((point_t){(float)imgx + addx, (float)imgy + addy});