diff --git a/CHANGELOG.md b/CHANGELOG.md index d3af7ed..b32de65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +# v1.1.3 +## Fixed +- [jubeat-analyser] All files are read and written in `surrogateescape` error + mode to mimic the way jubeat analyser handles files at the byte level, without + caring about whether the whole file can be properly decoded as shift-jis or not + (Thanks Nomlas and Mintice for noticing this!) + # v1.1.2 ## Fixed - [jubeat-analyser] diff --git a/jubeatools/formats/guess.py b/jubeatools/formats/guess.py index 7c04dc1..c4577f7 100644 --- a/jubeatools/formats/guess.py +++ b/jubeatools/formats/guess.py @@ -72,7 +72,7 @@ def _dirty_jba_line_strip(line: str) -> str: def recognize_jubeat_analyser_format(path: Path) -> Format: - with path.open(encoding="shift-jis-2004") as f: + with path.open(encoding="shift-jis-2004", errors="surrogateescape") as f: lines = f.readlines() saw_jubeat_analyser_commands = False diff --git a/jubeatools/formats/jubeat_analyser/dump_tools.py b/jubeatools/formats/jubeat_analyser/dump_tools.py index e9a45f9..07aad20 100644 --- a/jubeatools/formats/jubeat_analyser/dump_tools.py +++ b/jubeatools/formats/jubeat_analyser/dump_tools.py @@ -282,7 +282,9 @@ def make_full_dumper_from_jubeat_analyser_chart_dumper( timing, circle_free, ) - file_bytes = chart_file.getvalue().encode("shift-jis-2004") + file_bytes = chart_file.getvalue().encode( + "shift-jis-2004", errors="surrogateescape" + ) files.append(ChartFile(file_bytes, song, difficulty, chart)) return files diff --git a/jubeatools/formats/jubeat_analyser/load_tools.py b/jubeatools/formats/jubeat_analyser/load_tools.py index 2456450..24ab9bb 100644 --- a/jubeatools/formats/jubeat_analyser/load_tools.py +++ b/jubeatools/formats/jubeat_analyser/load_tools.py @@ -103,7 +103,9 @@ class DoubleColumnChartLine: def raise_if_position_unfit(self, bytes_per_panel: int) -> None: expected_length = 4 * bytes_per_panel - actual_length = 
len(self.position.encode("shift-jis-2004")) + actual_length = len( + self.position.encode("shift-jis-2004", errors="surrogateescape") + ) if expected_length != actual_length: raise SyntaxError( f"Invalid position part. Since #bpp={bytes_per_panel}, the " @@ -115,7 +117,7 @@ class DoubleColumnChartLine: if self.timing is None: return - length = len(self.timing.encode("shift-jis-2004")) + length = len(self.timing.encode("shift-jis-2004", errors="surrogateescape")) if length % bytes_per_panel != 0: raise SyntaxError( f"Invalid timing part. Since #bpp={bytes_per_panel}, the timing " @@ -173,7 +175,7 @@ def split_double_byte_line(line: str) -> List[str]: >>> split_chart_line("口⑪①25") ... ["口","⑪","①","25"] """ - encoded_line = line.encode("shift-jis-2004") + encoded_line = line.encode("shift-jis-2004", errors="surrogateescape") if len(encoded_line) % 2 != 0: raise ValueError( "Line of odd length encountered while trying to split a double-byte " @@ -181,7 +183,9 @@ def split_double_byte_line(line: str) -> List[str]: ) symbols = [] for i in range(0, len(encoded_line), 2): - symbols.append(encoded_line[i : i + 2].decode("shift-jis-2004")) + symbols.append( + encoded_line[i : i + 2].decode("shift-jis-2004", errors="surrogateescape") + ) return symbols @@ -415,7 +419,9 @@ class JubeatAnalyserParser: def define_symbol(self, symbol: str, timing: Decimal) -> None: bpp = self.bytes_per_panel - length_as_shift_jis = len(symbol.encode("shift-jis-2004")) + length_as_shift_jis = len( + symbol.encode("shift-jis-2004", errors="surrogateescape") + ) if length_as_shift_jis != bpp: raise ValueError( f"Invalid symbol definition. 
Since #bpp={bpp}, timing symbols " @@ -430,7 +436,10 @@ class JubeatAnalyserParser: self.symbols[symbol] = round_beats(timing) def is_short_line(self, line: str) -> bool: - return len(line.encode("shift-jis-2004")) < self.bytes_per_panel * 4 + return ( + len(line.encode("shift-jis-2004", errors="surrogateescape")) + < self.bytes_per_panel * 4 + ) def _split_chart_line(self, line: str) -> List[str]: if self.bytes_per_panel == 2: @@ -466,15 +475,13 @@ class DoubleColumnFrame: def read_jubeat_analyser_file(path: Path) -> Optional[List[str]]: - try: - # The vast majority of memo files you will encounter will be propely - # decoded using shift-jis-2004. Get ready for endless fun with the small - # portion of files that won't - lines = path.read_text(encoding="shift-jis-2004").split("\n") - except UnicodeDecodeError: - return None - else: - return lines + """The vast majority of memo files you will encounter will be properly + decoded using shift-jis-2004. Some won't but jubeat analyser works at the + byte level so it doesn't care, here we use surrogateescape to handle + potential decoding errors""" + return path.read_text(encoding="shift-jis-2004", errors="surrogateescape").split( + "\n" + ) load_folder = make_folder_loader( diff --git a/jubeatools/formats/jubeat_analyser/memo2/load.py b/jubeatools/formats/jubeat_analyser/memo2/load.py index f8bd0f0..1c0d25c 100644 --- a/jubeatools/formats/jubeat_analyser/memo2/load.py +++ b/jubeatools/formats/jubeat_analyser/memo2/load.py @@ -213,14 +213,18 @@ class Memo2Parser(JubeatAnalyserParser): self._do_bpp(value) def append_chart_line(self, raw_line: RawMemo2ChartLine) -> None: - if len(raw_line.position.encode("shift-jis-2004")) != 4 * self.bytes_per_panel: + if ( + len(raw_line.position.encode("shift-jis-2004", errors="surrogateescape")) + != 4 * self.bytes_per_panel + ): raise SyntaxError( f"Invalid chart line for #bpp={self.bytes_per_panel} : {raw_line}" ) if raw_line.timing is not None and self.bytes_per_panel == 2: if any( 
- len(e.string.encode("shift-jis-2004")) % 2 != 0 + len(e.string.encode("shift-jis-2004", errors="surrogateescape")) % 2 + != 0 for e in raw_line.timing if isinstance(e, NoteCluster) ): diff --git a/jubeatools/formats/jubeat_analyser/mono_column/load.py b/jubeatools/formats/jubeat_analyser/mono_column/load.py index 631cc38..fa42a6f 100644 --- a/jubeatools/formats/jubeat_analyser/mono_column/load.py +++ b/jubeatools/formats/jubeat_analyser/mono_column/load.py @@ -135,12 +135,15 @@ class MonoColumnParser(JubeatAnalyserParser): def append_chart_line(self, line: str) -> None: expected_length = 4 * self.bytes_per_panel - actual_length = len(line.encode("shift-jis-2004")) + actual_length = len(line.encode("shift-jis-2004", errors="surrogateescape")) if actual_length != expected_length: raise SyntaxError(f"Invalid chart line. Since for ") if self.bytes_per_panel == 1 and len(line) != 4: raise SyntaxError(f"Invalid chart line for #bpp=1 : {line}") - elif self.bytes_per_panel == 2 and len(line.encode("shift-jis-2004")) != 8: + elif ( + self.bytes_per_panel == 2 + and len(line.encode("shift-jis-2004", errors="surrogateescape")) != 8 + ): raise SyntaxError(f"Invalid chart line for #bpp=2 : {line}") self.current_chart_lines.append(line) diff --git a/jubeatools/formats/jubeat_analyser/tests/data/MTC_Mimi_EXT.txt b/jubeatools/formats/jubeat_analyser/tests/data/MTC_Mimi_EXT.txt new file mode 100644 index 0000000..dcff683 --- /dev/null +++ b/jubeatools/formats/jubeat_analyser/tests/data/MTC_Mimi_EXT.txt @@ -0,0 +1,513 @@ +//̹ - Ȳ +//by Mintice +//<>uցȡ +//20190503 + + +t=130 +m="[MTC] Mimi Cut.mp3" +o=225 + +#title="Mimi" +#dif=3 +#lev=9 +#artist="Pal Hwang Dan" + +#holdbyarrow=1 + +*01:0 +*02:0.25 +*03:0.5 +*04:0.75 +*05:1 +*06:1.25 +*07:1.5 +*08:1.75 +*09:2 +*10:2.25 +*11:2.5 +*12:2.75 +*13:3 +*14:3.25 +*15:3.5 +*16:3.75 + + + + + +---- +15 +13 +0511 +01090307 +---- +0107 +0903 +051311 +15 +---- +15 +051311 +0903 +0107 +---- +0115 +1103 +051307 +09 +---- +13031309 +0115 +0709 
+01051105 +---- //Ȧ? +0511 +010315 +090107 +01130713 +---- +011507 +0113 +13090703 +0511 +---- +05150707 +010903 +11 +130113 +---- +1503 +05151107 +01010711 +13091309 +---- +010711 +011307 +090313 +0515 +---- +011311 +090313 +0107 +051507 +---- +050707 +13011115 +010913 +03 +---- // +01 + +0305 +0907 + +1111 +1111 +111515 + +---- // + +05 +05 +0105 + +0909 + + +1313 +---- //ĵ +0307 +03010705 +0105 + +---- // ֱ +0101 +0505 +0909 +151311 +---- // +11 +1515 +11 + +---- //鵵 +1309 + +13050913 +01090105 +---- //ŭ̳ +03 +01070707 +111111 +01 +---- //Ǿ +0515 +0513 +0111 +0109 +---- + +1507 +051309 +030111 +---- +011307 +010703 +050509 +11111503 +---- +01051103 +010707 +05150311 +0913 +---- +0513 +01050307 +01150911 +110307 +---- +0705 +0701 +0305 +030901 + +1515 +131211 +11 +13 +---- +151305 +071111 +03070105 +030901 +---- +03 +03110901 +070105 +07111305 + + +15 + + +---- +011103 +051311 +01050703 +150907 +---- // +01050705 +0309 +01151111 +13151309 +---- // +050509 +05010101 +131309 +0913 +---- // Բ +0701 +07050303 +0501 +111315 +---- // ٷ +051513 +15010513 +09050111 +091101 +---- // +111505 +1105 +090715 +0313 +---- // з +051309 +01130105 +050109 +0913 +---- // +1511 +151107 +11030703 +010701 +---- //ʰ Ǿ +01151305 +15110113 +11010509 +0509 +---- +01 +150307 +051109 +15130307 +---- //Ŭ +15010111 +03 +07111505 +0913 +---- +1511 +070103 +1115 +090513 +---- +150511 +0307 +011115 +0913 +---- +150311 +1511 +05111507 +090113 +---- +0101 + +0101 + + + +150711 +110515 +030913 +---- +0703 +150111 +110515 +0913 +---- +01 +150311 +110515 +091307 +---- +010113 +090315 +110713 +090505 +---- // +01 + + + + +13 + + + +---- // + +01 + + + + +13 + + +---- //ؾ +15 +13 +0111 + + + +15 +0913 + +---- +05 + + +13090913 +---- // +01 +131103 +051307 +08 + +13 + + + +---- // +0305 +131501 +0710 +130811 + + +13 + + +---- //ؿ +15 +13 +0111 + + + +15 +0913 + + + +05 +1310 +071103 + + +15 + + +---- +010305 +150109 +050705 +15131109 +---- //Ǻδ +0103 +0307 +0711 +1101 + + + +15 +1513 +---- //ãƿ ̻ +0301 
+030701 +071115 +111513 +---- //Ͼ +010315 +13070315 +01110711 +1311 +---- //ʴµ +03 +09030715 +01110715 +0513 +---- //ƹ +01151101 +1107 +070315 +0313 +---- // ʴ +11 +110701 +070315 +03150113 +---- //ڽŸ 𸣰 +070315 +130703 +0101 +1311 + + +15 +11 + +---- //ִµ +010509 +13030711 +15010509 +13 +---- //Ǻδ 2 +0103 +0307 +071115 +111501 + +13 + + +13 +---- //ãƿ ̻ +0301 +030701 +071115 +111513 + + +13 + + +---- //Ͼ +0115 +13 +0111 + + +03 +070315 +1107 +1311 + + +15 +0913 + +---- //ʴµ +0305 +09030715 +01110715 +0513 +---- //ƹ +151101 +1107 +070315 +0313 + +01 + + + + +13 + + + +---- // ʴ +11 +110701 +070315 +03150113 + + +13 + + +---- //ڽŸ 𸣰 +0703 +0703 +0101 +1311 + +15 +1315 +0911 + + + +15 +13 + +---- //ִµ +05 + + + + +010509 +13030711 +15010509 +13 +---- + + +0101 +0101 \ No newline at end of file diff --git a/jubeatools/formats/jubeat_analyser/tests/data/MTC_Nageki_no_Ki_EXT.txt b/jubeatools/formats/jubeat_analyser/tests/data/MTC_Nageki_no_Ki_EXT.txt new file mode 100644 index 0000000..d87a7d2 --- /dev/null +++ b/jubeatools/formats/jubeat_analyser/tests/data/MTC_Nageki_no_Ki_EXT.txt @@ -0,0 +1,610 @@ +// - +//by Mintice +//20190409 + +t=160 +m="[MTC] Nageki no Ki.mp3" +o=1320 + +#title="Nageki no Ki" +#dif=3 +#lev=10 +#artist="Golden Lion" + +*01:0 +*02:0.25 +*03:0.5 +*04:0.75 +*05:1 +*06:1.25 +*07:1.5 +*08:1.75 +*09:2 +*10:2.25 +*11:2.5 +*12:2.75 +*13:3 +*14:3.25 +*15:3.5 +*16:3.75 + +*a1:0 +*a2:0.166667 +*a3:0.333333 +*a4:0.5 +*a5:0.666667 +*a6:0.833333 +*b1:1 +*b2:1.166667 +*b3:1.333333 +*b4:1.5 +*b5:1.666667 +*b6:1.833333 +*c1:2 +*c2:2.166667 +*c3:2.333333 +*c4:2.5 +*c5:2.666667 +*c6:2.833333 +*d1:3 +*d2:3.166667 +*d3:3.333333 +*d4:3.5 +*d5:3.666667 +*d6:3.833333 + + + + + +---- + + + + +---- + + + + +---- + + + + +---- + + + + +---- +0102 +15030416 +05 +c1b6b5b4 +---- +12030215 +1411 +10010413 +b3b4b5b6 + + + + +16 +---- +110414 +02131603 +15090510 +12 +---- +15051210 +03131402 +11090406 +07010816 +---- //巳 +15070614 +01091608 +05130204 +11031012 +---- +09030812 
+15071606 +01111002 +13050414 +---- +05110208 +13031204 +15070614 +09011016 +---- +15131610 +09010802 +11050412 +03070614 +---- +15111208 +03050406 +09011014 +07130216 +---- +11150816 +01071202 +13050610 +03090414 +---- +07110410 +03091406 +13010208 +051512 + + + +16 + +---- +0101 +09 +0909 +09 +---- +150301 +13 +091305 +010509 +---- +0907 +03 +091313 +010515 +---- + +0311 +09130109 +01051507 +---- + +0311 +09130109 +01051507 +---- +03 +05 +09130711 +010509 +---- +071513 +0301 +091305 +0105 +---- + +0301 +09131511 +01051309 +---- // +01020304 +05060708 +12111009 +16151413 +---- +09130307 +010505 +1509 +1113 +---- +091305 +010503 +111507 +130109 +---- +09131505 +01050313 +09 +071101 +---- +091303 +01050715 +150111 +091305 +---- +091301 +010515 +030509 +130711 +---- +09130311 +01051305 +0915 +0701 +---- +091305 +01051103 +1507 +090113 + + +15 + + +---- + +141008 +03110715 +01090513 +---- //ŵ +0805 +031307 +020613 +01111009 + + + + +15 +---- +09101101 +130502 +061303 +0807 + + +15 + + +---- +08 +011305 +02061307 +03111009 + + + + +15 +---- +09101103 +07130602 +051301 +08 + + +15 + + +---- +0505 + +1107 +01020309 + +1313 + +15 + +---- +111009 +010203 +0708 +060504 + + +121314 +1615 + +---- +050607 + +01 +020304 + + +131211 +14100908 +1516 +---- +0101 + + + + +c6 +d1 +d2 +d3 +---- // +01150907 +1103 +05011309 +0901 +---- +0915 +010501 +09070911 +031301 +---- +0103 +11011309 +050909 +150701 +---- +091501 +130309 +07090105 +0111 +---- +091115 +11130103 +050913 +01150701 +---- +03151109 +130111 +07150905 +011301 +---- +1305 +02110112 +1305 +09041003 +---- +060109 +08110511 +04100703 +0211 +---- +030109 +130911 +070501 +010915 +---- +030911 +010509 +130115 +070901 +---- +010907 +050109 +091501 +130311 +---- +091501 +010703 +091301 +050911 +---- +1115 +09011301 +13090315 +01051107 +---- +031509 +13010901 +15110511 +010713 +---- +04130512 +0203 +13110510 +0901 +---- +07111610 +03130206 +15090414 +05011208 +---- //αα αα αα +051311 +01100704 +13080214 +03010907 + + + +16 +15 
+---- +031009 +01130702 +080511 +01130704 + +15 +14 +16 + +---- +101104 +050802 +010109 +030707 + +13 +16 +1314 +15 +---- +051002 +010807 +011107 +030409 + +16 +1514 +13 +13 +---- //޸ +0302 +0704 +010806 +05 + +1510 +131612 +1109 +14 +---- +05110806 +031214 +09070402 +13011610 + + +15 + + +---- +07090412 +01131016 +11050206 +031408 + + +15 + + +---- +15050416 +13081114 +10030211 +01010707 +---- //帣 +c1c2c3c4 +b3b4b5b6 +a5a6b1b2 +a1a2a3a4 + + + +d3d4d5d6 +c5c6d1d2 +---- +a5a6b1b2 +a1a2a3a4 +c1c2c3c4 +b3b4b5b6 + +d3d4d5d6 +c5c6d1d2 + + +---- + +c1b6b5b4 +c1 +01030204 +---- +b2b3b4b5 +03130212 +15011004 +09111416 +---- +07050612 +03150414 +11010802 +13091610 +---- // +0113 +030709 +05010703 +111109 + + +15 +15 + +---- +030313 +0107 +070909 +01111105 + + +15 +15 + +---- +010303 +011509 +07110907 +05131115 +---- +13030109 +01031109 +071105 +071515 +---- +110707 +09031115 +05011503 +091301 +---- // +011409 +04021012 +07051306 +081103 + + + + +1615 +---- +090307 +020108 +05041006 +1211 + +16 +15 +14 +13 +---- +01 +01 + + + +c6 +d1 +d2 +d3 +---- +080707 +10050911 +0304 +010102 + + + +1316 +131514 +---- +05 + +01 +05 +---- + + + + +---- + + + + +---- + + + + +---- + + + + +---- + + + + +---- \ No newline at end of file diff --git a/jubeatools/formats/jubeat_analyser/tests/memo/test_memo.py b/jubeatools/formats/jubeat_analyser/tests/memo/test_memo.py index 16591c8..28a7654 100644 --- a/jubeatools/formats/jubeat_analyser/tests/memo/test_memo.py +++ b/jubeatools/formats/jubeat_analyser/tests/memo/test_memo.py @@ -49,6 +49,6 @@ def test_that_full_chart_roundtrips(song: song.Song, circle_free: bool) -> None: Format.MEMO, song, temp_path=temp_file_named_txt(), - bytes_decoder=lambda b: b.decode("shift-jis-2004"), + bytes_decoder=lambda b: b.decode("shift-jis-2004", errors="surrogateescape"), dump_options={"circle_free": circle_free}, ) diff --git a/jubeatools/formats/jubeat_analyser/tests/memo1/test_memo1.py b/jubeatools/formats/jubeat_analyser/tests/memo1/test_memo1.py index 
53b2781..e2c880d 100644 --- a/jubeatools/formats/jubeat_analyser/tests/memo1/test_memo1.py +++ b/jubeatools/formats/jubeat_analyser/tests/memo1/test_memo1.py @@ -45,6 +45,6 @@ def test_that_full_chart_roundtrips(song: song.Song, circle_free: bool) -> None: Format.MEMO_1, song, temp_path=temp_file_named_txt(), - bytes_decoder=lambda b: b.decode("shift-jis-2004"), + bytes_decoder=lambda b: b.decode("shift-jis-2004", errors="surrogateescape"), dump_options={"circle_free": circle_free}, ) diff --git a/jubeatools/formats/jubeat_analyser/tests/memo2/test_memo2.py b/jubeatools/formats/jubeat_analyser/tests/memo2/test_memo2.py index 43a9873..9c21815 100644 --- a/jubeatools/formats/jubeat_analyser/tests/memo2/test_memo2.py +++ b/jubeatools/formats/jubeat_analyser/tests/memo2/test_memo2.py @@ -56,6 +56,6 @@ def test_that_full_chart_roundtrips(song: Song, circle_free: bool) -> None: Format.MEMO_2, song, temp_path=temp_file_named_txt(), - bytes_decoder=lambda b: b.decode("shift-jis-2004"), + bytes_decoder=lambda b: b.decode("shift-jis-2004", errors="surrogateescape"), dump_options={"circle_free": circle_free}, ) diff --git a/jubeatools/formats/jubeat_analyser/tests/mono_column/test_mono_column_hypothesis.py b/jubeatools/formats/jubeat_analyser/tests/mono_column/test_mono_column_hypothesis.py index 48cc2d2..381e9a6 100644 --- a/jubeatools/formats/jubeat_analyser/tests/mono_column/test_mono_column_hypothesis.py +++ b/jubeatools/formats/jubeat_analyser/tests/mono_column/test_mono_column_hypothesis.py @@ -89,6 +89,6 @@ def test_that_full_chart_roundtrips(song: Song, circle_free: bool) -> None: Format.MONO_COLUMN, song, temp_path=temp_file_named_txt(), - bytes_decoder=lambda b: b.decode("shift-jis-2004"), + bytes_decoder=lambda b: b.decode("shift-jis-2004", errors="surrogateescape"), dump_options={"circle_free": circle_free}, ) diff --git a/jubeatools/formats/jubeat_analyser/tests/test_examples.py b/jubeatools/formats/jubeat_analyser/tests/test_examples.py index ea6f36d..32a7a91 
100644 --- a/jubeatools/formats/jubeat_analyser/tests/test_examples.py +++ b/jubeatools/formats/jubeat_analyser/tests/test_examples.py @@ -8,15 +8,19 @@ from jubeatools.formats.guess import guess_format from . import data +def try_to_load(example_file: str) -> None: + with resources.path(data, example_file) as p: + format_ = guess_format(p) + loader = LOADERS[format_] + _ = loader(p) + + def test_RorataJins_example() -> None: """This file has a #memo tag but actually uses mono-column formatting, Here I just check that a friendlier error message is sent because there is not much else I can to do here, the file is plain old wrong""" with pytest.raises(SyntaxError, match="separator line"): - with resources.path(data, "RorataJin's example.txt") as p: - format_ = guess_format(p) - loader = LOADERS[format_] - _ = loader(p) + try_to_load("RorataJin's example.txt") def test_Booths_of_Fighters_memo() -> None: @@ -24,7 +28,15 @@ def test_Booths_of_Fighters_memo() -> None: - while it's in #memo2 format, it actually uses b= and t= commands - the position and timing parts are separated by some less common whitespace character""" - with resources.path(data, "Booths_of_Fighters_memo.txt") as p: - format_ = guess_format(p) - loader = LOADERS[format_] - _ = loader(p) + try_to_load("Booths_of_Fighters_memo.txt") + + +def test_MTC_Nageki_no_Ki_EXT() -> None: + """This file is proper euc-kr text that's specially crafted to also be + compatible with jubeat analyser, handcrafted mojibake and all""" + try_to_load("MTC_Nageki_no_Ki_EXT.txt") + + +def test_MTC_Mimi_EXT() -> None: + """Also an euc-kr file but also has long notes""" + try_to_load("MTC_Mimi_EXT.txt")