1
0
mirror of synced 2024-12-12 06:51:05 +01:00

Merge pull request #12 from Stepland/force-encoding

[jubeat-analyser] use `surrogateescape` everywhere
This commit is contained in:
Stepland 2021-05-23 11:26:41 +02:00 committed by GitHub
commit 0af9d36f8c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 1191 additions and 33 deletions

View File

@ -1,3 +1,10 @@
# v1.1.3
## Fixed
- [jubeat-analyser] All files are read and written in `surrogateescape` error
mode to mimic the way jubeat analyser handles files at the byte level, without
caring about whether the whole file can be properly decoded as shift-jis or not
(Thanks Nomlas and Mintice for noticing this!)
# v1.1.2
## Fixed
- [jubeat-analyser]

View File

@ -72,7 +72,7 @@ def _dirty_jba_line_strip(line: str) -> str:
def recognize_jubeat_analyser_format(path: Path) -> Format:
with path.open(encoding="shift-jis-2004") as f:
with path.open(encoding="shift-jis-2004", errors="surrogateescape") as f:
lines = f.readlines()
saw_jubeat_analyser_commands = False

View File

@ -282,7 +282,9 @@ def make_full_dumper_from_jubeat_analyser_chart_dumper(
timing,
circle_free,
)
file_bytes = chart_file.getvalue().encode("shift-jis-2004")
file_bytes = chart_file.getvalue().encode(
"shift-jis-2004", errors="surrogateescape"
)
files.append(ChartFile(file_bytes, song, difficulty, chart))
return files

View File

@ -103,7 +103,9 @@ class DoubleColumnChartLine:
def raise_if_position_unfit(self, bytes_per_panel: int) -> None:
expected_length = 4 * bytes_per_panel
actual_length = len(self.position.encode("shift-jis-2004"))
actual_length = len(
self.position.encode("shift-jis-2004", errors="surrogateescape")
)
if expected_length != actual_length:
raise SyntaxError(
f"Invalid position part. Since #bpp={bytes_per_panel}, the "
@ -115,7 +117,7 @@ class DoubleColumnChartLine:
if self.timing is None:
return
length = len(self.timing.encode("shift-jis-2004"))
length = len(self.timing.encode("shift-jis-2004", errors="surrogateescape"))
if length % bytes_per_panel != 0:
raise SyntaxError(
f"Invalid timing part. Since #bpp={bytes_per_panel}, the timing "
@ -173,7 +175,7 @@ def split_double_byte_line(line: str) -> List[str]:
>>> split_chart_line("口⑪①25")
... ["","","","25"]
"""
encoded_line = line.encode("shift-jis-2004")
encoded_line = line.encode("shift-jis-2004", errors="surrogateescape")
if len(encoded_line) % 2 != 0:
raise ValueError(
"Line of odd length encountered while trying to split a double-byte "
@ -181,7 +183,9 @@ def split_double_byte_line(line: str) -> List[str]:
)
symbols = []
for i in range(0, len(encoded_line), 2):
symbols.append(encoded_line[i : i + 2].decode("shift-jis-2004"))
symbols.append(
encoded_line[i : i + 2].decode("shift-jis-2004", errors="surrogateescape")
)
return symbols
@ -415,7 +419,9 @@ class JubeatAnalyserParser:
def define_symbol(self, symbol: str, timing: Decimal) -> None:
bpp = self.bytes_per_panel
length_as_shift_jis = len(symbol.encode("shift-jis-2004"))
length_as_shift_jis = len(
symbol.encode("shift-jis-2004", errors="surrogateescape")
)
if length_as_shift_jis != bpp:
raise ValueError(
f"Invalid symbol definition. Since #bpp={bpp}, timing symbols "
@ -430,7 +436,10 @@ class JubeatAnalyserParser:
self.symbols[symbol] = round_beats(timing)
def is_short_line(self, line: str) -> bool:
return len(line.encode("shift-jis-2004")) < self.bytes_per_panel * 4
return (
len(line.encode("shift-jis-2004", errors="surrogateescape"))
< self.bytes_per_panel * 4
)
def _split_chart_line(self, line: str) -> List[str]:
if self.bytes_per_panel == 2:
@ -466,15 +475,13 @@ class DoubleColumnFrame:
def read_jubeat_analyser_file(path: Path) -> Optional[List[str]]:
try:
# The vast majority of memo files you will encounter will be properly
# decoded using shift-jis-2004. Get ready for endless fun with the small
# portion of files that won't
lines = path.read_text(encoding="shift-jis-2004").split("\n")
except UnicodeDecodeError:
return None
else:
return lines
"""The vast majority of memo files you will encounter will be properly
decoded using shift-jis-2004. Some won't, but jubeat analyser works at the
byte level so it doesn't care; here we use surrogateescape to handle
potential decoding errors"""
return path.read_text(encoding="shift-jis-2004", errors="surrogateescape").split(
"\n"
)
load_folder = make_folder_loader(

View File

@ -213,14 +213,18 @@ class Memo2Parser(JubeatAnalyserParser):
self._do_bpp(value)
def append_chart_line(self, raw_line: RawMemo2ChartLine) -> None:
if len(raw_line.position.encode("shift-jis-2004")) != 4 * self.bytes_per_panel:
if (
len(raw_line.position.encode("shift-jis-2004", errors="surrogateescape"))
!= 4 * self.bytes_per_panel
):
raise SyntaxError(
f"Invalid chart line for #bpp={self.bytes_per_panel} : {raw_line}"
)
if raw_line.timing is not None and self.bytes_per_panel == 2:
if any(
len(e.string.encode("shift-jis-2004")) % 2 != 0
len(e.string.encode("shift-jis-2004", errors="surrogateescape")) % 2
!= 0
for e in raw_line.timing
if isinstance(e, NoteCluster)
):

View File

@ -135,12 +135,15 @@ class MonoColumnParser(JubeatAnalyserParser):
def append_chart_line(self, line: str) -> None:
expected_length = 4 * self.bytes_per_panel
actual_length = len(line.encode("shift-jis-2004"))
actual_length = len(line.encode("shift-jis-2004", errors="surrogateescape"))
if actual_length != expected_length:
raise SyntaxError(f"Invalid chart line. Since for ")
if self.bytes_per_panel == 1 and len(line) != 4:
raise SyntaxError(f"Invalid chart line for #bpp=1 : {line}")
elif self.bytes_per_panel == 2 and len(line.encode("shift-jis-2004")) != 8:
elif (
self.bytes_per_panel == 2
and len(line.encode("shift-jis-2004", errors="surrogateescape")) != 8
):
raise SyntaxError(f"Invalid chart line for #bpp=2 : {line}")
self.current_chart_lines.append(line)

View File

@ -0,0 +1,513 @@
//耕耕 - 独伐舘
//by Mintice
//<2F><><<3C><>><3E>u<75>函釗<E587BD><E98797>
//20190503
t=130
m="[MTC] Mimi Cut.mp3"
o=225
#title="Mimi"
#dif=3
#lev=9
#artist="Pal Hwang Dan"
#holdbyarrow=1
*01:0
*02:0.25
*03:0.5
*04:0.75
*05:1
*06:1.25
*07:1.5
*08:1.75
*09:2
*10:2.25
*11:2.5
*12:2.75
*13:3
*14:3.25
*15:3.5
*16:3.75
けけけけ
けけけけ
けけけけ
けけけけ
----
けけ15け
け13けけ
け0511け
01090307
----
01けけ07
け0903け
0513け11
けけけ15
----
けけ15け
051311け
け0903け
01け07け
----
け0115け
けけ1103
051307け
け09けけ
----
13031309
01けけ15
07けけ09
01051105
---- //筈球?
け0511け
01け0315
090107け
01130713
----
0115け07
け01け13
13090703
05け11け
----
05150707
010903け
けけけ11
130113け
----
15け03け
05151107
01010711
13091309
----
け010711
0113け07
09け0313
け0515け
----
011311け
09け0313
け0107け
0515け07
----
05け0707
13011115
010913け
けけ03け
---- //杷 崎
け01けけ
けけけけ
0305けけ
0907けけ
1111けけ
1111けけ
1115け15
けけけけ
---- //戚 獣 姥砧
けけけけ
けけけ05
けけ05け
けけ0105
09け09け
けけけけ
けけけけ
13け13け
---- //舌縦級戚
け03け07
03010705
01け05け
けけけけ
---- //恭嬢閃 赤浦推
け01け01
05け05け
け09け09
151311け
---- //舌縦
けけ11け
け15け15
けけ11け
けけけけ
---- //級亀 爽昔
け13け09
けけけけ
13050913
01090105
---- //幻鏑戚蟹
03けけけ
01070707
け111111
01けけけ
---- //神掘鞠醸生艦
け05け15
05けけ13
01け11け
け0109け
----
けけけけ
け1507け
け051309
030111け
----
け011307
01け0703
0505け09
11111503
----
01051103
け010707
05150311
0913けけ
----
05け13け
01050307
01150911
11け0307
----
07けけ05
07け01け
03け05け
け030901
け1515け
け131211
け11けけ
13けけけ
----
け151305
071111け
03070105
け030901
----
03けけけ
03110901
け070105
07111305
けけけけ
けけけ15
けけけけ
けけけけ
----
け011103
051311け
01050703
1509け07
---- //遭伸
01050705
03けけ09
01151111
13151309
---- //企税 胡走
け050509
05010101
131309け
09け13け
---- //級引 敗臆
け0701け
07050303
けけ0501
111315け
---- //神掘 奄陥携倉
051513け
15010513
09050111
け091101
---- //及畷
11け1505
け1105け
09け0715
03け13け
---- //生稽 腔形
け051309
01130105
050109け
け09け13
---- //左戚走亀
け1511け
1511け07
11030703
010701け
---- //省惟 鞠醸走幻
01151305
15110113
11010509
け0509け
----
けけけ01
15け0307
け051109
15130307
---- //適窪
15010111
け03けけ
07111505
09けけ13
----
15けけ11
07け0103
け1115け
0905け13
----
15け0511
け03け07
011115け
09けけ13
----
1503け11
け1511け
05111507
09け0113
----
け0101け
けけけけ
01けけ01
けけけけ
けけけけ
15け0711
1105け15
030913け
----
07け03け
1501け11
11け0515
け0913け
----
けけけ01
15け0311
1105け15
け091307
----
01け0113
090315け
け110713
0905け05
---- //乞
01<EFBFBD><EFBFBD>けけ
けけけけ
けけけけ
けけけけ
13けけけ
けけけけ
けけけけ
けけけけ
---- //砧
けけけけ
けけ<EFBFBD><EFBFBD>01
けけけけ
けけけけ
けけけけ
けけけ13
けけけけ
けけけけ
---- //蹟醸嬢推
けけ<EFBFBD><EFBFBD>15
13<EFBFBD><EFBFBD>けけ
01<EFBFBD><EFBFBD><EFBFBD><EFBFBD>11
けけけけ
けけけけ
15けけけ
09けけ13
けけけけ
----
けけけ05
けけけけ
けけけけ
13090913
---- //戚
01<EFBFBD><EFBFBD>けけ
け131103
け051307
けけけ08
13けけけ
けけけけ
けけけけ
けけけけ
---- //賎
け0305け
1315<EFBFBD><EFBFBD>01
け0710け
130811け
けけけけ
けけけ13
けけけけ
けけけけ
---- //匂奄背推
けけ<EFBFBD><EFBFBD>15
13<EFBFBD><EFBFBD>けけ
01<EFBFBD><EFBFBD><EFBFBD><EFBFBD>11
けけけけ
けけけけ
15けけけ
09けけ13
けけけけ
けけけけ
けけ05け
け1310け
け071103
けけけけ
けけ15け
けけけけ
けけけけ
----
0103け05
1501け09
け050705
15131109
---- //杷採澗 痕事戚
01け03け
け0307け
け0711け
け11け01
けけけけ
けけけけ
けけけ15
けけ1513
---- //達焼人 希 戚雌
け0301け
0307け01
071115け
111513け
---- //馬冗惟 左戚走
0103け15
13070315
01110711
け1311け
---- //省澗汽
け03けけ
09030715
01110715
け0513け
---- //焼巷亀 紫形壱
01151101
け1107け
け070315
け03け13
---- //馬走研 省澗 杏
け11けけ
1107け01
070315け
03150113
---- //切重幻 乞牽壱
け070315
130703け
01けけ01
け13け11
けけけけ
けけ15け
けけけ11
けけけけ
---- //赤澗汽
010509け
13030711
15010509
け13けけ
---- //杷採澗 痕事戚 2
01<EFBFBD><EFBFBD>03け
け0307け
け071115
け111501
13けけけ
けけけけ
けけけけ
けけけ13
---- //達焼人 希 戚雌
け0301け
0307<EFBFBD><EFBFBD>01
071115け
111513け
けけけけ
けけけ13
けけけけ
けけけけ
---- //馬冗惟 左戚走
01け<EFBFBD><EFBFBD>15
13<EFBFBD><EFBFBD>けけ
01<EFBFBD><EFBFBD><EFBFBD><EFBFBD>11
けけけけ
け03けけ
け070315
け1107け
け1311け
けけけけ
15けけけ
09けけ13
けけけけ
---- //省澗汽
け03け05
09030715
01110715
け0513け
---- //焼巷亀 紫形壱
け151101
け1107け
け070315
け03け13
01<EFBFBD><EFBFBD>けけ
けけけけ
けけけけ
けけけけ
13けけけ
けけけけ
けけけけ
けけけけ
---- //馬走研 省澗 杏
け11けけ
1107<EFBFBD><EFBFBD>01
070315け
03150113
けけけけ
けけけ13
けけけけ
けけけけ
---- //切重幻 乞牽壱
け0703け
け0703け
01<EFBFBD><EFBFBD>け01
け13け11
けけ<EFBFBD><EFBFBD>15
13<EFBFBD><EFBFBD>15け
09け<EFBFBD><EFBFBD>11
けけけけ
けけけけ
15けけけ
けけけ13
けけけけ
---- //赤澗汽
けけけ05
けけけけ
けけけけ
けけけけ
010509け
13030711
15010509
け13けけ
----
けけけけ
けけけけ
け0101け
01けけ01

View File

@ -0,0 +1,610 @@
//<2F><><EFBFBD>皈 - 冪游蹲
//by Mintice
//20190409
t=160
m="[MTC] Nageki no Ki.mp3"
o=1320
#title="Nageki no Ki"
#dif=3
#lev=10
#artist="Golden Lion"
*01:0
*02:0.25
*03:0.5
*04:0.75
*05:1
*06:1.25
*07:1.5
*08:1.75
*09:2
*10:2.25
*11:2.5
*12:2.75
*13:3
*14:3.25
*15:3.5
*16:3.75
*a1:0
*a2:0.166667
*a3:0.333333
*a4:0.5
*a5:0.666667
*a6:0.833333
*b1:1
*b2:1.166667
*b3:1.333333
*b4:1.5
*b5:1.666667
*b6:1.833333
*c1:2
*c2:2.166667
*c3:2.333333
*c4:2.5
*c5:2.666667
*c6:2.833333
*d1:3
*d2:3.166667
*d3:3.333333
*d4:3.5
*d5:3.666667
*d6:3.833333
けけけけ
けけけけ
けけけけ
けけけけ
----
けけけけ
けけけけ
けけけけ
けけけけ
----
けけけけ
けけけけ
けけけけ
けけけけ
----
けけけけ
けけけけ
けけけけ
けけけけ
----
けけけけ
けけけけ
けけけけ
けけけけ
----
け0102け
15030416
け05けけ
c1b6b5b4
----
12030215
け1411け
10010413
b3b4b5b6
けけけけ
けけけけ
けけけけ
け16けけ
----
1104け14
02131603
15090510
けけ12け
----
15051210
03131402
11090406
07010816
---- //球軍 獣拙
15070614
01091608
05130204
11031012
----
09030812
15071606
01111002
13050414
----
05110208
13031204
15070614
09011016
----
15131610
09010802
11050412
03070614
----
15111208
03050406
09011014
07130216
----
11150816
01071202
13050610
03090414
----
07110410
03091406
13010208
051512け
けけけけ
けけけけ
けけ16け
けけけけ
----
01け01け
け09けけ
09け09け
けけけ09
----
け150301
けけけ13
091305け
0105け09
----
けけ0907
けけ03け
0913け13
010515け
----
けけけけ
けけ0311
09130109
01051507
----
けけけけ
けけ0311
09130109
01051507
----
けけ03け
けけけ05
09130711
0105け09
----
け071513
03け01け
091305け
0105けけ
----
けけけけ
けけ0301
09131511
01051309
---- //域舘
01020304
05060708
12111009
16151413
----
09130307
010505け
けけ1509
けけ1113
----
091305け
0105け03
け111507
130109け
----
09131505
01050313
09けけけ
け071101
----
0913け03
01050715
15け0111
け091305
----
091301け
010515け
け030509
130711け
----
09130311
01051305
09けけ15
けけ0701
----
091305け
01051103
け1507け
け090113
けけけけ
けけけ15
けけけけ
けけけけ
----
けけけけ
け141008
03110715
01090513
---- //重巨
け0805け
031307け
020613け
01111009
けけけけ
けけけけ
けけけけ
15けけけ
----
09101101
け130502
け061303
けけ0807
けけけけ
けけ15け
けけけけ
けけけけ
----
け08けけ
0113け05
02061307
03111009
けけけけ
けけけけ
けけけけ
15けけけ
----
09101103
07130602
け051301
けけ08け
けけけけ
けけ15け
けけけけ
けけけけ
----
け0505け
けけけけ
け11け07
01020309
13けけ13
けけけけ
けけ15け
けけけけ
----
け111009
010203け
0708けけ
け060504
けけけけ
121314け
けけ1615
けけけけ
----
050607け
けけけけ
け01けけ
020304け
けけけけ
131211け
14100908
1516けけ
----
け0101け
けけけけ
けけけけ
けけけけ
けけけc6
けけけd1
けけけd2
けけけd3
---- //畷照
01150907
11け03け
05011309
け0901け
----
け09け15
け010501
09070911
031301け
----
01け03け
11011309
0509け09
15け0701
----
け091501
1303け09
07090105
け0111け
----
091115け
11130103
050913け
01150701
----
03151109
け130111
07150905
01け1301
----
13けけ05
02110112
け1305け
09041003
----
け060109
08110511
04100703
け0211け
----
03け0109
け130911
07け0501
010915け
----
け030911
01け0509
130115け
0709け01
----
010907け
050109け
け091501
け130311
----
け091501
01け0703
0913け01
05け0911
----
け11け15
09011301
13090315
01051107
----
0315け09
13010901
15110511
010713け
----
04130512
け02け03
13110510
09けけ01
----
07111610
03130206
15090414
05011208
---- //砧姥砧姥逆 砧姥砧姥逆 砧姥砧姥
0513け11
01100704
13080214
03010907
けけけけ
けけけけ
けけ16け
け15けけ
----
0310け09
01130702
0805け11
01130704
15けけけ
けけけ14
けけ16け
けけけけ
----
10け1104
050802け
0101け09
03け0707
け13けけ
けけけ16
13け14け
け15けけ
----
0510け02
010807け
け011107
03け0409
けけ16け
15けけ14
け13けけ
け13けけ
---- //含軒奄
03けけ02
け0704け
01け0806
05けけけ
け1510け
13け1612
1109けけ
けけ14け
----
05110806
け031214
09070402
13011610
けけけけ
け15けけ
けけけけ
けけけけ
----
07090412
01131016
11050206
け031408
けけけけ
15けけけ
けけけけ
けけけけ
----
15050416
13081114
10030211
01010707
---- //球牽牽牽牽犬
c1c2c3c4
b3b4b5b6
a5a6b1b2
a1a2a3a4
けけけけ
けけけけ
d3d4d5d6
c5c6d1d2
----
a5a6b1b2
a1a2a3a4
c1c2c3c4
b3b4b5b6
d3d4d5d6
c5c6d1d2
けけけけ
けけけけ
----
けけけけ
c1b6b5b4
c1けけけ
01030204
----
b2b3b4b5
03130212
15011004
09111416
----
07050612
03150414
11010802
13091610
---- //虹軒
01けけ13
03け0709
05010703
け111109
けけけけ
け15けけ
け15けけ
けけけけ
----
030313け
01け07け
07け0909
01111105
けけけけ
15けけけ
け15けけ
けけけけ
----
0103け03
け011509
07110907
05131115
----
13030109
01031109
07け1105
071515け
----
け110707
09031115
05011503
0913け01
---- //是蝿
0114け09
04021012
07051306
081103け
けけけけ
けけけけ
けけけけ
けけ1615
----
09け0307
020108け
05041006
け12け11
け16けけ
けけけ15
14けけけ
けけ13け
----
けけ01け
け01けけ
けけけけ
けけけけ
けけけc6
けけけd1
けけけd2
けけけd3
----
け080707
10050911
03け04け
010102け
けけけけ
けけけけ
け13け16
131514け
----
けけけ05
けけけけ
01けけけ
けけけ05
----
けけけけ
けけけけ
けけけけ
けけけけ
----
けけけけ
けけけけ
けけけけ
けけけけ
----
けけけけ
けけけけ
けけけけ
けけけけ
----
けけけけ
けけけけ
けけけけ
けけけけ
----
けけけけ
けけけけ
けけけけ
けけけけ
----

View File

@ -49,6 +49,6 @@ def test_that_full_chart_roundtrips(song: song.Song, circle_free: bool) -> None:
Format.MEMO,
song,
temp_path=temp_file_named_txt(),
bytes_decoder=lambda b: b.decode("shift-jis-2004"),
bytes_decoder=lambda b: b.decode("shift-jis-2004", errors="surrogateescape"),
dump_options={"circle_free": circle_free},
)

View File

@ -45,6 +45,6 @@ def test_that_full_chart_roundtrips(song: song.Song, circle_free: bool) -> None:
Format.MEMO_1,
song,
temp_path=temp_file_named_txt(),
bytes_decoder=lambda b: b.decode("shift-jis-2004"),
bytes_decoder=lambda b: b.decode("shift-jis-2004", errors="surrogateescape"),
dump_options={"circle_free": circle_free},
)

View File

@ -56,6 +56,6 @@ def test_that_full_chart_roundtrips(song: Song, circle_free: bool) -> None:
Format.MEMO_2,
song,
temp_path=temp_file_named_txt(),
bytes_decoder=lambda b: b.decode("shift-jis-2004"),
bytes_decoder=lambda b: b.decode("shift-jis-2004", errors="surrogateescape"),
dump_options={"circle_free": circle_free},
)

View File

@ -89,6 +89,6 @@ def test_that_full_chart_roundtrips(song: Song, circle_free: bool) -> None:
Format.MONO_COLUMN,
song,
temp_path=temp_file_named_txt(),
bytes_decoder=lambda b: b.decode("shift-jis-2004"),
bytes_decoder=lambda b: b.decode("shift-jis-2004", errors="surrogateescape"),
dump_options={"circle_free": circle_free},
)

View File

@ -8,15 +8,19 @@ from jubeatools.formats.guess import guess_format
from . import data
def try_to_load(example_file: str) -> None:
    """Load an example file with the loader matching its guessed format.

    The file is looked up by name in the ``data`` package, its format is
    guessed from its path, and the corresponding entry of ``LOADERS`` is
    called on that path. The loaded result is discarded; any exception
    raised along the way propagates to the caller.
    """
    with resources.path(data, example_file) as example_path:
        load = LOADERS[guess_format(example_path)]
        load(example_path)
def test_RorataJins_example() -> None:
"""This file has a #memo tag but actually uses mono-column formatting.
Here I just check that a friendlier error message is sent because there
is not much else I can do here, the file is plain old wrong"""
with pytest.raises(SyntaxError, match="separator line"):
with resources.path(data, "RorataJin's example.txt") as p:
format_ = guess_format(p)
loader = LOADERS[format_]
_ = loader(p)
try_to_load("RorataJin's example.txt")
def test_Booths_of_Fighters_memo() -> None:
@ -24,7 +28,15 @@ def test_Booths_of_Fighters_memo() -> None:
- while it's in #memo2 format, it actually uses b= and t= commands
- the position and timing parts are separated by some less common
whitespace character"""
with resources.path(data, "Booths_of_Fighters_memo.txt") as p:
format_ = guess_format(p)
loader = LOADERS[format_]
_ = loader(p)
try_to_load("Booths_of_Fighters_memo.txt")
def test_MTC_Nageki_no_Ki_EXT() -> None:
    """Loading must not fail on this file, which is proper euc-kr text
    specially crafted (handcrafted mojibake included) to also be compatible
    with jubeat analyser"""
    try_to_load("MTC_Nageki_no_Ki_EXT.txt")
def test_MTC_Mimi_EXT() -> None:
    """Another euc-kr file; this one additionally contains long notes"""
    try_to_load("MTC_Mimi_EXT.txt")