Shave off another 33% of rendering time by better pcklable data in multithreaded case.

2025-01-31 12:13:49 +01:00 · 2021-05-16 15:15:06 +00:00 · 2021-05-16 15:15:06 +00:00 · 3db9cc9a58
commit 3db9cc9a58
parent fdf7e70bfd
3 changed files with 73 additions and 47 deletions
--- a/bemani/format/afp/blend.py
+++ b/bemani/format/afp/blend.py
@ -1,6 +1,6 @@
 import multiprocessing
 from PIL import Image  # type: ignore
-from typing import List, Tuple
+from typing import List, Sequence, Tuple

 from .types.generic import Color, Matrix, Point

@ -11,14 +11,14 @@ def clamp(color: float) -> int:

 def blend_normal(
    # RGBA color tuple representing what's already at the dest.
-    dest: Tuple[int, int, int, int],
+    dest: Sequence[int],
    # RGBA color tuple representing the source we want to blend to the dest.
-    src: Tuple[int, int, int, int],
+    src: Sequence[int],
    # A pre-scaled color where all values are 0.0-1.0, used to calculate the final color.
    mult_color: Color,
    # A RGBA color tuple where all values are 0-255, used to calculate the final color.
    add_color: Tuple[int, int, int, int],
-) -> Tuple[int, int, int, int]:
+) -> Sequence[int]:
    # "Normal" blend mode, which is just alpha blending. Various games use the DX
    # equation Src * As + Dst * (1 - As). We premultiply Dst by Ad as well, since
    # we are blitting onto a destination that could have transparency.
@ -51,14 +51,14 @@ def blend_normal(

 def blend_addition(
    # RGBA color tuple representing what's already at the dest.
-    dest: Tuple[int, int, int, int],
+    dest: Sequence[int],
    # RGBA color tuple representing the source we want to blend to the dest.
-    src: Tuple[int, int, int, int],
+    src: Sequence[int],
    # A pre-scaled color where all values are 0.0-1.0, used to calculate the final color.
    mult_color: Color,
    # A RGBA color tuple where all values are 0-255, used to calculate the final color.
    add_color: Tuple[int, int, int, int],
-) -> Tuple[int, int, int, int]:
+) -> Sequence[int]:
    # "Addition" blend mode, which is used for fog/clouds/etc. Various games use the DX
    # equation Src * As + Dst * 1. It appears jubeat does not premultiply the source
    # by its alpha component.
@ -87,14 +87,14 @@ def blend_addition(

 def blend_subtraction(
    # RGBA color tuple representing what's already at the dest.
-    dest: Tuple[int, int, int, int],
+    dest: Sequence[int],
    # RGBA color tuple representing the source we want to blend to the dest.
-    src: Tuple[int, int, int, int],
+    src: Sequence[int],
    # A pre-scaled color where all values are 0.0-1.0, used to calculate the final color.
    mult_color: Color,
    # A RGBA color tuple where all values are 0-255, used to calculate the final color.
    add_color: Tuple[int, int, int, int],
-) -> Tuple[int, int, int, int]:
+) -> Sequence[int]:
    # "Subtraction" blend mode, used for darkening an image. Various games use the DX
    # equation Dst * 1 - Src * As. It appears jubeat does not premultiply the source
    # by its alpha component much like the "additive" blend above..
@ -123,14 +123,14 @@ def blend_subtraction(

 def blend_multiply(
    # RGBA color tuple representing what's already at the dest.
-    dest: Tuple[int, int, int, int],
+    dest: Sequence[int],
    # RGBA color tuple representing the source we want to blend to the dest.
-    src: Tuple[int, int, int, int],
+    src: Sequence[int],
    # A pre-scaled color where all values are 0.0-1.0, used to calculate the final color.
    mult_color: Color,
    # A RGBA color tuple where all values are 0-255, used to calculate the final color.
    add_color: Tuple[int, int, int, int],
-) -> Tuple[int, int, int, int]:
+) -> Sequence[int]:
    # "Multiply" blend mode, used for darkening an image. Various games use the DX
    # equation Src * 0 + Dst * Src. It appears jubeat uses the alternative formula
    # Src * Dst + Dst * (1 - As) which reduces to the first equation as long as the
@ -166,12 +166,8 @@ def affine_composite(
    origin: Point,
    blendfunc: int,
    texture: Image.Image,
-) -> List[Tuple[int, int, int, int]]:
-    # Get the data in an easier to manipulate and faster to update fashion.
-    imgmap = list(img.getdata())
-    texmap = list(texture.getdata())
-    cores = multiprocessing.cpu_count()
-
+    single_threaded: bool = False,
+) -> Image.Image:
    # Warn if we have an unsupported blend.
    if blendfunc not in {0, 2, 3, 8, 9, 70}:
        print(f"WARNING: Unsupported blend {blendfunc}")
@ -195,7 +191,12 @@ def affine_composite(
    miny = max(int(min(pix1.y, pix2.y, pix3.y, pix4.y)), 0)
    maxy = min(int(max(pix1.y, pix2.y, pix3.y, pix4.y)) + 1, imgheight)

-    if cores < 2:
+    cores = multiprocessing.cpu_count()
+    if single_threaded or cores < 2:
+        # Get the data in an easier to manipulate and faster to update fashion.
+        imgmap = list(img.getdata())
+        texmap = list(texture.getdata())
+
        # We don't have enough CPU cores to bother multiprocessing.
        for imgy in range(miny, maxy):
            for imgx in range(minx, maxx):
@ -213,7 +214,12 @@ def affine_composite(
                # Blend it.
                texoff = texx + (texy * texwidth)
                imgmap[imgoff] = affine_blend_impl(add_color, mult_color, texmap[texoff], imgmap[imgoff], blendfunc)
+
+        img.putdata(imgmap)
    else:
+        imgbytes = img.tobytes('raw', 'RGBA')
+        texbytes = texture.tobytes('raw', 'RGBA')
+
        # Let's spread the load across multiple processors.
        procs: List[multiprocessing.Process] = []
        work: multiprocessing.Queue = multiprocessing.Queue()
@ -236,8 +242,8 @@ def affine_composite(
                    add_color,
                    mult_color,
                    blendfunc,
-                    imgmap,
-                    texmap,
+                    imgbytes,
+                    texbytes,
                ),
            )
            procs.append(proc)
@ -247,25 +253,25 @@ def affine_composite(
            work.put(imgy)
            expected += 1

-        lines: List[List[Tuple[int, int, int, int]]] = [
-            imgmap[x:(x + imgwidth)]
+        lines: List[bytes] = [
+            imgbytes[x:(x + (imgwidth * 4))]
            for x in range(
                0,
-                imgwidth * imgheight,
-                imgwidth,
+                imgwidth * imgheight * 4,
+                imgwidth * 4,
            )
        ]
        for _ in range(expected):
            imgy, result = results.get()
            lines[imgy] = result
-        imgmap = [pixel for line in lines for pixel in line]

        for proc in procs:
            work.put(None)
        for proc in procs:
            proc.join()

-    return imgmap
+        img = Image.frombytes('RGBA', (imgwidth, imgheight), b''.join(lines))
+    return img


 def pixel_renderer(
@ -281,20 +287,20 @@ def pixel_renderer(
    add_color: Tuple[int, int, int, int],
    mult_color: Color,
    blendfunc: int,
-    imgmap: List[Tuple[int, int, int, int]],
-    texmap: List[Tuple[int, int, int, int]],
+    imgbytes: bytes,
+    texbytes: bytes,
 ) -> None:
    while True:
        imgy = work.get()
        if imgy is None:
            return

-        result: List[Tuple[int, int, int, int]] = []
+        result: List[Sequence[int]] = []
        for imgx in range(imgwidth):
            # Determine offset
            imgoff = imgx + (imgy * imgwidth)
            if imgx < minx or imgx >= maxx:
-                result.append(imgmap[imgoff])
+                result.append(imgbytes[(imgoff * 4):((imgoff + 1) * 4)])
                continue

            # Calculate what texture pixel data goes here.
@ -303,23 +309,26 @@ def pixel_renderer(

            # If we're out of bounds, don't update.
            if texx < 0 or texy < 0 or texx >= texwidth or texy >= texheight:
-                result.append(imgmap[imgoff])
+                result.append(imgbytes[(imgoff * 4):((imgoff + 1) * 4)])
                continue

            # Blend it.
            texoff = texx + (texy * texwidth)
-            result.append(affine_blend_impl(add_color, mult_color, texmap[texoff], imgmap[imgoff], blendfunc))
+            result.append(affine_blend_impl(add_color, mult_color, texbytes[(texoff * 4):((texoff + 1) * 4)], imgbytes[(imgoff * 4):((imgoff + 1) * 4)], blendfunc))

-        results.put((imgy, result))
+        linebytes = bytes([channel for pixel in result for channel in pixel])
+        results.put((imgy, linebytes))


 def affine_blend_impl(
    add_color: Tuple[int, int, int, int],
    mult_color: Color,
-    src_color: Tuple[int, int, int, int],
-    dest_color: Tuple[int, int, int, int],
+    # This should be a sequence of exactly 4 values, either bytes or a tuple.
+    src_color: Sequence[int],
+    # This should be a sequence of exactly 4 values, either bytes or a tuple.
+    dest_color: Sequence[int],
    blendfunc: int,
-) -> Tuple[int, int, int, int]:
+) -> Sequence[int]:
    if blendfunc == 3:
        return blend_multiply(dest_color, src_color, mult_color, add_color)
    # TODO: blend mode 4, which is "screen" blending according to SWF references. I've only seen this
--- a/bemani/format/afp/render.py
+++ b/bemani/format/afp/render.py
@ -110,9 +110,12 @@ class PlacedClip(PlacedObject):


 class AFPRenderer(VerboseOutput):
-    def __init__(self, shapes: Dict[str, Shape] = {}, textures: Dict[str, Image.Image] = {}, swfs: Dict[str, SWF] = {}) -> None:
+    def __init__(self, shapes: Dict[str, Shape] = {}, textures: Dict[str, Image.Image] = {}, swfs: Dict[str, SWF] = {}, single_threaded: bool = False) -> None:
        super().__init__()

+        # Options for rendering
+        self.__single_threaded = single_threaded
+
        self.shapes: Dict[str, Shape] = shapes
        self.textures: Dict[str, Image.Image] = textures
        self.swfs: Dict[str, SWF] = swfs
@ -363,7 +366,7 @@ class AFPRenderer(VerboseOutput):
        else:
            raise Exception(f"Failed to process tag: {tag}")

-    def __render_object(self, img: Image.Image, renderable: PlacedObject, parent_transform: Matrix, parent_origin: Point) -> None:
+    def __render_object(self, img: Image.Image, renderable: PlacedObject, parent_transform: Matrix, parent_origin: Point) -> Image.Image:
        # Compute the affine transformation matrix for this object.
        transform = parent_transform.multiply(renderable.transform)

@ -375,7 +378,7 @@ class AFPRenderer(VerboseOutput):
            # this object invisible. We can ignore this since the object should not
            # be drawn.
            print(f"WARNING: Transform Matrix {transform} has zero scaling factor, making it non-invertible!")
-            return
+            return img

        # Render individual shapes if this is a sprite.
        if isinstance(renderable, PlacedClip):
@ -386,7 +389,7 @@ class AFPRenderer(VerboseOutput):
            )
            for obj in objs:
                self.vprint(f"    Rendering placed object ID {obj.object_id} from sprite {obj.source.tag_id} onto Depth {obj.depth}")
-                self.__render_object(img, obj, transform, parent_origin.add(renderable.rotation_offset))
+                img = self.__render_object(img, obj, transform, parent_origin.add(renderable.rotation_offset))
        elif isinstance(renderable, PlacedShape):
            # This is a shape draw reference.
            shape = renderable.source
@ -400,7 +403,7 @@ class AFPRenderer(VerboseOutput):
            for params in shape.draw_params:
                if not (params.flags & 0x1):
                    # Not instantiable, don't render.
-                    return
+                    return img

                if params.flags & 0x8:
                    # TODO: Need to support blending and UV coordinate colors here.
@ -449,10 +452,12 @@ class AFPRenderer(VerboseOutput):
                        img.alpha_composite(texture, cutin.as_tuple(), cutoff.as_tuple())
                    else:
                        # We can't, so do the slow render that's correct.
-                        img.putdata(affine_composite(img, add_color, mult_color, transform, inverse, origin, blend, texture))
+                        img = affine_composite(img, add_color, mult_color, transform, inverse, origin, blend, texture, single_threaded=self.__single_threaded)
        else:
            raise Exception(f"Unknown placed object type to render {renderable}!")

+        return img
+
    def __process_tags(self, clip: PlacedClip, prefix: str = "  ") -> bool:
        self.vprint(f"{prefix}Handling placed clip {clip.object_id} at depth {clip.depth}")

@ -557,7 +562,7 @@ class AFPRenderer(VerboseOutput):
                    if clip:
                        for obj in sorted(clip.placed_objects, key=lambda obj: obj.depth):
                            self.vprint(f"  Rendering placed object ID {obj.object_id} from sprite {obj.source.tag_id} onto Depth {obj.depth}")
-                            self.__render_object(curimage, obj, root_clip.transform, root_clip.rotation_offset)
+                            curimage = self.__render_object(curimage, obj, root_clip.transform, root_clip.rotation_offset)
                else:
                    # Nothing changed, make a copy of the previous render.
                    self.vprint("  Using previous frame render")
--- a/bemani/utils/afputils.py
+++ b/bemani/utils/afputils.py
@ -265,7 +265,12 @@ def main() -> int:
        "--background-color",
        type=str,
        default=None,
-        help="Set the background color of the animation, overriding a default if present in the SWF.",
+        help="Set the background color of the animation as a comma-separated RGB or RGBA color, overriding a default if present in the SWF.",
+    )
+    render_parser.add_argument(
+        "--disable-threads",
+        action="store_true",
+        help="Disable multi-threaded rendering.",
    )

    list_parser = subparsers.add_parser('list', help='List out the possible paths to render from a series of SWFs')
@ -282,6 +287,12 @@ def main() -> int:
        action="store_true",
        help="Display verbuse debugging output",
    )
+    list_parser.add_argument(
+        "--disable-threads",
+        action="store_true",
+        help="Disable multi-threaded rendering.",
+    )
+

    args = parser.parse_args()

@ -560,7 +571,7 @@ def main() -> int:
    if args.action in ["render", "list"]:
        # This is a complicated one, as we need to be able to specify multiple
        # directories of files as well as support IFS files and TXP2 files.
-        renderer = AFPRenderer()
+        renderer = AFPRenderer(single_threaded=args.disable_threads)

        # TODO: Allow specifying individual folders and such.
        for container in args.container:
@ -700,6 +711,7 @@ def main() -> int:

            # Render the gif/webp frames.
            duration, images = renderer.render_path(args.path, verbose=args.verbose, background_color=color)
+
            if len(images) == 0:
                raise Exception("Did not render any frames!")