From 073178601f481b16497f2310b99deada400b1bb2 Mon Sep 17 00:00:00 2001
From: Viv <ms.vivaria@gmail.com>
Date: Sat, 3 Feb 2024 22:32:29 -0500
Subject: [PATCH] Add proper Don/Ka types to clusters of notes (#64)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This should match most fumens. Some exceptional cases I've seen:

- [x] ~In official fumens, even-numbered clusters of 4 notes will
sometimes be ドドドド or カカカカ. Right now, my converter will always use ドドドドン
or カカカカッ, since I don't really understand the rules behind it.~ Fixed:
Now, all 16th notes and above in groups of 4 will no longer use the
don/kat ending character.
- [X] ~In official fumens, sometimes isolated notes will be grouped
together, but sometimes they will be treated as single clusters of 1.
Right now, my converter will always try to cluster isolated notes.~
Fixed: Only 8th/16th/etc. notes will be clustered. Whole/half/quarter
notes will not be clustered.
- [x] ~Right now, my logic always treats big notes as their own cluster.
But, sometimes you get a cluster like `ddddD`, and right now my
converter treats this as (ドドドドン) + big DON instead of do-ko-do-ko-DON.~
Fixed: Now big notes are included in clusters.
- [x] ~For high-level Oni songs with complex groups of dense notes (e.g.
(12th/16th/24th) notes mixed with (16th/24th/32nd) notes), official
fumens seem to group them together even though they technically have
different timings. Right now, my converter will group the 32nd notes,
but treat the 16th notes as their own separate group.~ Fixed: Now
anything above an 8th note will be clustered together.
- [X] Songs with BPM gimmicks. e.g. the TJA has low base BPM, but then
everything is doubled/tripled except for one section. I can't remember
which songs have this? Something like RNG Cinderella?
- Maybe solved by
https://github.com/vivaria/tja2fumen/pull/64/commits/1f640c1aa1fb436fe6edf421e077fdb2a41aaf00?

Fixes #41.
---
 src/tja2fumen/__init__.py   |  52 +++++++++----
 src/tja2fumen/classes.py    |   2 +
 src/tja2fumen/converters.py | 150 ++++++++++++++++++++++++++++++++++++
 src/tja2fumen/parsers.py    |   6 ++
 testing/test_conversion.py  |  33 ++++++++
 5 files changed, 229 insertions(+), 14 deletions(-)

diff --git a/src/tja2fumen/__init__.py b/src/tja2fumen/__init__.py
index a98703e..e60ebb2 100644
--- a/src/tja2fumen/__init__.py
+++ b/src/tja2fumen/__init__.py
@@ -4,11 +4,12 @@ Entry points for tja2fumen.
 
 import argparse
 import os
+import shutil
 import sys
 from typing import Sequence
 
-from tja2fumen.parsers import parse_tja
-from tja2fumen.converters import convert_tja_to_fumen
+from tja2fumen.parsers import parse_tja, parse_fumen
+from tja2fumen.converters import convert_tja_to_fumen, fix_dk_note_types_course
 from tja2fumen.writers import write_fumen
 from tja2fumen.constants import COURSE_IDS
 from tja2fumen.classes import TJACourse
@@ -18,10 +19,14 @@ def main(argv: Sequence[str] = ()) -> None:
     """
     Main entry point for tja2fumen's command line interface.
 
-    Three steps are performed:
-       1. Parse TJA into multiple TJACourse objects. Then, for each course:
+    tja2fumen can be used in 2 ways:
+
+    - If a .tja file is provided, then three steps are performed:
+          1. Parse TJA into multiple TJACourse objects. Then, for each course:
           2. Convert TJACourse objects into FumenCourse objects.
           3. Write each FumenCourse to its own .bin file.
+    - If a .bin file is provided, then the existing .bin is repaired:
+          1. Update don/kat senote types to do-ko-don and ka-kat.
     """
     if not argv:
         argv = sys.argv[1:]
@@ -30,20 +35,27 @@ def main(argv: Sequence[str] = ()) -> None:
         description="tja2fumen"
     )
     parser.add_argument(
-        "file.tja",
-        help="Path to a Taiko no Tatsujin TJA file.",
+        "file",
+        help="Path to a Taiko no Tatsujin chart file.",
     )
     args = parser.parse_args(argv)
-    fname_tja = getattr(args, "file.tja")
-    base_name = os.path.splitext(fname_tja)[0]
+    fname = getattr(args, "file")
+    base_name = os.path.splitext(fname)[0]
 
-    # Parse lines in TJA file
-    parsed_tja = parse_tja(fname_tja)
+    if fname.endswith(".tja"):
+        print("Converitng TJA to fumen files...")
+        # Parse lines in TJA file
+        parsed_tja = parse_tja(fname)
 
-    # Convert parsed TJA courses and write each course to `.bin` files
-    for course_name, course in parsed_tja.courses.items():
-        convert_and_write(course, course_name, base_name,
-                          single_course=len(parsed_tja.courses) == 1)
+        # Convert parsed TJA courses and write each course to `.bin` files
+        for course_name, course in parsed_tja.courses.items():
+            convert_and_write(course, course_name, base_name,
+                              single_course=len(parsed_tja.courses) == 1)
+    elif fname.endswith(".bin"):
+        print("Repairing existing fumen file...")
+        repair_bin(fname)
+    else:
+        raise ValueError(f"Unexpected file extension: {fname}")
 
 
 def convert_and_write(tja_data: TJACourse,
@@ -52,6 +64,8 @@ def convert_and_write(tja_data: TJACourse,
                       single_course: bool = False) -> None:
     """Process the parsed data for a single TJA course."""
     fumen_data = convert_tja_to_fumen(tja_data)
+    # fix don/ka types
+    fix_dk_note_types_course(fumen_data)
     # Add course ID (e.g. '_x', '_x_1', '_x_2') to the output file's base name
     output_name = base_name
     if single_course:
@@ -64,6 +78,16 @@ def convert_and_write(tja_data: TJACourse,
     write_fumen(f"{output_name}.bin", fumen_data)
 
 
+def repair_bin(fname_bin: str) -> None:
+    """Fix an existing .bin's don/ka types in place, keeping a .bak copy."""
+    fumen_data = parse_fumen(fname_bin)
+    # fix don/ka types (mutates `fumen_data`)
+    fix_dk_note_types_course(fumen_data)
+    # move the original aside as "<name>.bak", then write the repaired fumen
+    shutil.move(fname_bin, fname_bin+".bak")
+    write_fumen(fname_bin, fumen_data)
+
+
 # NB: This entry point is necessary for the Pyinstaller executable
 if __name__ == "__main__":
     main()
diff --git a/src/tja2fumen/classes.py b/src/tja2fumen/classes.py
index 5ee348e..8133fdf 100644
--- a/src/tja2fumen/classes.py
+++ b/src/tja2fumen/classes.py
@@ -82,6 +82,8 @@ class FumenNote:
     """Contains all the byte values for a single Fumen note."""
     note_type: str = ''
     pos: float = 0.0
+    pos_abs: float = 0.0
+    diff: int = 0
     score_init: int = 0
     score_diff: int = 0
     padding: float = 0.0
diff --git a/src/tja2fumen/converters.py b/src/tja2fumen/converters.py
index 168c200..1155c79 100644
--- a/src/tja2fumen/converters.py
+++ b/src/tja2fumen/converters.py
@@ -8,6 +8,7 @@ from typing import List, Dict, Tuple, Union
 from tja2fumen.classes import (TJACourse, TJAMeasure, TJAMeasureProcessed,
                                FumenCourse, FumenHeader, FumenMeasure,
                                FumenNote)
+from tja2fumen.constants import BRANCH_NAMES
 
 
 def process_commands(tja_branches: Dict[str, List[TJAMeasure]], bpm: float) \
@@ -410,3 +411,152 @@ def convert_tja_to_fumen(tja: TJACourse) -> FumenCourse:
             int(65536 * (total_notes['normal'] / total_notes['master']))
 
     return fumen
+
+
+def fix_dk_note_types_course(fumen: FumenCourse) -> None:
+    """
+    Fix Don/Ka note types in every branch of a FumenCourse (in-place).
+    """
+    # Song BPM = most common measure BPM; a course with no measures is a no-op
+    measure_bpms = [m.bpm for m in fumen.measures]
+    if not measure_bpms:
+        return
+    song_bpm = max(set(measure_bpms), key=measure_bpms.count)
+
+    # collect the d/k notes for each branch, then fix their types
+    for branch_name in BRANCH_NAMES:
+        dk_notes = []
+        for measure in fumen.measures:
+            for note in measure.branches[branch_name].notes:
+                if note.note_type.lower().startswith(('don', 'ka')):
+                    note.pos_abs = (measure.offset_start + note.pos +
+                                    (4 * 60_000 / measure.bpm))
+                    dk_notes.append(note)
+        if dk_notes:
+            fix_dk_note_types(dk_notes, song_bpm)
+
+
+def fix_dk_note_types(dk_notes: List[FumenNote], song_bpm: float) -> None:
+    """
+    Cluster Don/Ka notes based on their relative positions, then replace
+    Don/Ka notes with alternate versions (Don2, Don3, Ka2).
+
+    NB: Modifies FumenNote objects in-place
+    """
+    # Sort the notes by their absolute positions to account for BPMCHANGE
+    dk_notes = sorted(dk_notes, key=lambda note: note.pos_abs)
+
+    # Store on each note the gap (truncated to int) to the note after it
+    for (note_1, note_2) in zip(dk_notes, dk_notes[1:]):
+        note_1.diff = int(note_2.pos_abs - note_1.pos_abs)
+
+    # Isolate the unique gaps and sort them. The last note has no successor,
+    # so its leftover `diff` must not be treated as a gap to cluster on.
+    diffs_unique = sorted({note.diff for note in dk_notes[:-1]})
+
+    # Only cluster gaps shorter than a quarter note (8th, 16th, etc.)
+    measure_duration = (4 * 60_000) / song_bpm
+    quarter_note_duration = int(measure_duration / 4)
+    diffs_under_quarter: List[int] = [diff for diff in diffs_unique
+                                      if diff < quarter_note_duration]
+
+    # Anything above an 8th note (12th, 16th, 24th, 36th, etc...) should be
+    # clustered together as a single stream
+    diffs_to_cluster: List[List[int]] = []
+    diffs_under_8th: List[int] = []
+    eighth_note_duration = int(measure_duration / 8)
+    for diff in diffs_under_quarter:
+        if diff < eighth_note_duration:
+            diffs_under_8th.append(diff)
+        else:
+            diffs_to_cluster.append([diff])
+    # Make sure to cluster the close-together notes first
+    if diffs_under_8th:
+        diffs_to_cluster.insert(0, diffs_under_8th)
+
+    # Cluster the notes from the smallest difference to the largest
+    semi_clustered: List[Union[FumenNote, List[FumenNote]]] = list(dk_notes)
+    for diff_vals in diffs_to_cluster:
+        semi_clustered = cluster_notes(semi_clustered, diff_vals)
+
+    # Turn any remaining isolated notes into clusters (i.e. long diffs)
+    clustered_notes = [cluster if isinstance(cluster, list) else [cluster]
+                       for cluster in semi_clustered]
+
+    # In each cluster, replace dons/kas with their alternate versions
+    replace_alternate_don_kas(clustered_notes, eighth_note_duration)
+
+
+def replace_alternate_don_kas(note_clusters: List[List[FumenNote]],
+                              eighth_note_duration: int) -> None:
+    """
+    Replace Don/Ka notes with alternate versions (Don2, Don3, Ka2) based on
+    positions within a cluster of notes. Big notes are never renamed.
+
+    NB: Modifies FumenNote objects in-place
+    """
+    big_notes = ['DON', 'DON2', 'KA', 'KA2']
+    for cluster in note_clusters:
+        # Reset small notes to "Don2"/"Ka2" (an existing digit is overwritten)
+        for note in cluster:
+            if note.note_type not in big_notes:
+                if note.note_type[-1].isdigit():
+                    note.note_type = note.note_type[:-1] + "2"
+                else:
+                    note.note_type += "2"
+
+        # The "ko" type (Don3) only occurs on every other note of odd-length
+        # runs of small dons only (DDD: Do-ko-don, DDDDD: Do-ko-do-ko-don)
+        all_dons = all(note.note_type.startswith("Don") for note in cluster)
+        for i, note in enumerate(cluster):
+            if (all_dons and (len(cluster) % 2 == 1) and (i % 2 == 1)
+                    and note.note_type not in big_notes):
+                note.note_type = "Don3"
+
+        # Replace the last note in a cluster with the ending Don/Kat
+        # In other words, remove the '2' from the last note.
+        # Exception: clusters of exactly 4 whose gaps are all under an 8th
+        is_fast_cluster_of_4 = (len(cluster) == 4 and
+                                all(note.diff < eighth_note_duration
+                                    for note in cluster[:-1]))
+        if is_fast_cluster_of_4:
+            # Leave last note as Don2/Ka2
+            pass
+        else:
+            # Strip the trailing '2' from the last note unless it is a big note
+            if cluster[-1].note_type not in big_notes:
+                cluster[-1].note_type = cluster[-1].note_type[:-1]
+
+
+def cluster_notes(item_list: List[Union[FumenNote, List[FumenNote]]],
+                  cluster_diffs: List[int]) \
+        -> List[Union[FumenNote, List[FumenNote]]]:
+    """Group runs of notes whose `diff` is one of `cluster_diffs`."""
+    clustered_notes: List[Union[FumenNote, List[FumenNote]]] = []
+    current_cluster: List[FumenNote] = []
+    for item in item_list:
+        # An already-clustered group (a list) is passed through unchanged,
+        # and it closes whatever cluster was being built
+        if isinstance(item, list):
+            if current_cluster:
+                clustered_notes.append(current_cluster)
+                current_cluster = []
+            clustered_notes.append(item)
+        # Handle values that haven't been clustered yet
+        else:
+            assert isinstance(item, FumenNote)
+            # A note whose `diff` matches starts or continues the cluster
+            if any(item.diff == diff for diff in cluster_diffs):
+                current_cluster.append(item)
+            else:
+                # Finish the existing cluster; this note is its final member
+                if current_cluster:
+                    current_cluster.append(item)
+                    clustered_notes.append(current_cluster)
+                    current_cluster = []
+                # Or, if there is no cluster, keep the note unclustered
+                else:
+                    clustered_notes.append(item)
+    if current_cluster:
+        clustered_notes.append(current_cluster)
+    return clustered_notes
diff --git a/src/tja2fumen/parsers.py b/src/tja2fumen/parsers.py
index ac80311..6270481 100644
--- a/src/tja2fumen/parsers.py
+++ b/src/tja2fumen/parsers.py
@@ -156,6 +156,12 @@ def split_tja_lines_into_courses(lines: List[str]) -> TJASong:
                         if not v.data]:
         del parsed_tja.courses[course_name]
 
+    # Recreate dict with consistent insertion order
+    parsed_tja.courses = {
+        key: parsed_tja.courses[key] for key
+        in sorted(parsed_tja.courses.keys())
+    }
+
     return parsed_tja
 
 
diff --git a/testing/test_conversion.py b/testing/test_conversion.py
index 3cf15e8..1ec93fb 100644
--- a/testing/test_conversion.py
+++ b/testing/test_conversion.py
@@ -10,6 +10,39 @@ from conftest import convert
 from tja2fumen.parsers import parse_fumen
 
 
+@pytest.mark.skipif("CI" in os.environ,
+                    reason="Test is only for local debugging")
+def test_converted_tja_no_comparison(tmp_path, entry_point):
+    """
+    A test purely to aid with debugging. It lets me drop a .tja into a
+    pre-determined folder and run the conversion, allowing me to set
+    breakpoints and debug internal state without any tedious setup.
+    """
+    # Locate the drop folder: <this file's dir>/data/unpaired_tjs
+    path_test = os.path.dirname(os.path.realpath(__file__))
+    path_test = os.path.join(path_test, "data", "unpaired_tjs")
+    for fname in os.listdir(path_test):
+        # Copy input TJA to working directory
+        path_tja = os.path.join(path_test, fname)
+        path_tja_tmp = os.path.join(tmp_path, fname)
+        shutil.copy(path_tja, path_tja_tmp)
+
+        # Convert TJA file to fumen files
+        convert(path_test, path_tja_tmp, entry_point)
+
+        # Fetch output fumen paths (every .bin currently in tmp_path)
+        paths_out = glob.glob(os.path.join(tmp_path, "*.bin"))
+        assert paths_out, f"No bin files generated in {tmp_path}"
+        order = "xmhne"  # Ura Oni -> Oni -> Hard -> Normal -> Easy
+        paths_out = sorted(paths_out,
+                           key=lambda s: [order.index(c) if c in order
+                                          else len(order) for c in s])
+        for path_out in paths_out:
+            difficulty = os.path.basename(path_out).split(".")[0].split("_")[1]
+            song = parse_fumen(path_out, exclude_empty_measures=False)
+            print(f"{difficulty}: {len(song.measures)}")
+
+
 @pytest.mark.parametrize('id_song', [
     pytest.param('butou5'),
     pytest.param('shoto9',