1
0
mirror of synced 2024-11-12 01:20:47 +01:00

Switch to bytes for file format guessing

This commit is contained in:
Stepland 2024-07-27 11:05:48 +02:00
parent ce51b56dd7
commit 65c86de19c
2 changed files with 76 additions and 51 deletions

View File

@ -1,6 +1,7 @@
import json
import re
from functools import wraps
from io import StringIO
from pathlib import Path
from typing import Any, Callable, Type
@ -12,38 +13,43 @@ def guess_format(path: Path) -> Format:
format is unknown"""
if path.is_dir():
raise ValueError("Can't guess chart format for a folder")
contents = path.read_bytes()
return guess_file_format(contents)
# Guess the chart format from the raw bytes of a file. Recognizers are tried
# in a fixed order and the first match wins; when none matches,
# ValueError("Unrecognized file format") is raised.
# NOTE(review): in this view the superseded path-based lines sit right next
# to their bytes-based replacements (each call appears once with `path` and
# once with `contents`), and indentation is missing — confirm against the
# real file before relying on the exact statement layout.
def guess_file_format(contents: bytes) -> Format:
# JSON-based formats first: a parse/decode failure or a ValueError from the
# recognizer only means "not this format", so it is swallowed.
try:
return recognize_json_formats(path)
except (json.JSONDecodeError, UnicodeDecodeError, ValueError):
return recognize_json_formats(contents)
except (json.JSONDecodeError, UnicodeError, ValueError):
pass
# Then the jubeat analyser recognizer, with the same "failure means no
# match" handling.
try:
return recognize_jubeat_analyser_format(path)
except (UnicodeDecodeError, ValueError):
return recognize_jubeat_analyser_format(contents)
except (UnicodeError, ValueError):
pass
# The remaining formats are checked through boolean predicates, in this
# order: EVE, JBSQ, then yubiosi 1.0, 1.5 and 2.0.
if looks_like_eve(path):
if looks_like_eve(contents):
return Format.EVE
if looks_like_jbsq(path):
if looks_like_jbsq(contents):
return Format.JBSQ
if looks_like_yubiosi_1_0(path):
if looks_like_yubiosi_1_0(contents):
return Format.YUBIOSI_1_0
if looks_like_yubiosi_1_5(path):
if looks_like_yubiosi_1_5(contents):
return Format.YUBIOSI_1_5
if looks_like_yubiosi_2_0(path):
if looks_like_yubiosi_2_0(contents):
return Format.YUBIOSI_2_0
# Nothing matched.
raise ValueError("Unrecognized file format")
def recognize_json_formats(path: Path) -> Format:
with path.open(encoding="utf8") as f:
obj = json.load(f)
def recognize_json_formats(contents: bytes) -> Format:
obj = json.loads(contents)
if not isinstance(obj, dict):
raise ValueError("Top level value is not an object")
@ -95,9 +101,9 @@ def _dirty_jba_line_strip(line: str) -> str:
return COMMENT.sub("", line).strip()
def recognize_jubeat_analyser_format(path: Path) -> Format:
with path.open(encoding="shift-jis-2004", errors="surrogateescape") as f:
lines = f.readlines()
def recognize_jubeat_analyser_format(contents: bytes) -> Format:
text = contents.decode(encoding="shift-jis-2004", errors="surrogateescape")
lines = text.splitlines()
saw_jubeat_analyser_commands = False
for raw_line in lines:
@ -138,10 +144,10 @@ def false_if_raises(
return decorator
@false_if_raises(UnicodeDecodeError, StopIteration)
def looks_like_eve(path: Path) -> bool:
with path.open(encoding="ascii") as f:
return looks_like_eve_line(f.readline())
@false_if_raises(UnicodeError, StopIteration)
def looks_like_eve(contents: bytes) -> bool:
    """Tell whether the first line of *contents* passes ``looks_like_eve_line``.

    The bytes are decoded as ASCII. UnicodeError (which a failed decode
    raises) and StopIteration are the exception types handed to the
    ``false_if_raises`` decorator — presumably turned into a ``False``
    result; confirm against the decorator's definition.
    """
    text = contents.decode("ascii")
    # Keep everything up to and including the first line feed, or the whole
    # text when there is none — the same string a readline() on the decoded
    # text yields.
    head, line_feed, _ = text.partition("\n")
    return looks_like_eve_line(head + line_feed)
EVE_COMMANDS = {
@ -176,30 +182,28 @@ def looks_like_eve_line(line: str) -> bool:
return True
def looks_like_jbsq(path: Path) -> bool:
with path.open(mode="rb") as f:
magic = f.read(4)
return magic in (b"IJBQ", b"IJSQ", b"JBSQ")
def looks_like_jbsq(contents: bytes) -> bool:
    """Return True when *contents* begins with one of the 4-byte magic
    values ``IJBQ``, ``IJSQ`` or ``JBSQ``."""
    return contents.startswith((b"IJBQ", b"IJSQ", b"JBSQ"))
@false_if_raises(UnicodeDecodeError, ValueError)
def looks_like_yubiosi_1_0(path: Path) -> bool:
with path.open(encoding="shift-jis-2004") as f:
lines = f.read().split("\n")
(
_, # title
_, # save_data_name
raw_bpm,
chart_duration_ms,
raw_offset,
raw_note_count,
*raw_times_and_positions,
) = lines
float(raw_bpm)
int(chart_duration_ms)
int(raw_offset)
note_count = int(raw_note_count)
return len(raw_times_and_positions) == 2 * note_count
@false_if_raises(UnicodeError, ValueError)
def looks_like_yubiosi_1_0(contents: bytes) -> bool:
    """Check *contents* against the line layout this detector expects.

    After decoding as shift-jis-2004 the text must hold at least six lines:
    two ignored ones (title, save data name), then a float BPM, an integer
    chart duration in ms, an integer offset and an integer note count.
    Every remaining line is counted, and the result is True when there are
    exactly two of them per note.

    Decoding, unpacking and number-parsing failures surface as UnicodeError
    or ValueError, the two exception types handed to ``false_if_raises``.
    """
    rows = contents.decode(encoding="shift-jis-2004").splitlines()
    header, times_and_positions = rows[:6], rows[6:]
    # Fewer than six header rows makes this unpacking raise ValueError.
    _title, _save_data_name, bpm, duration_ms, offset, note_count = header
    # These conversions only validate; their results are discarded.
    float(bpm)
    int(duration_ms)
    int(offset)
    return 2 * int(note_count) == len(times_and_positions)
YUBIOSI_1_5_TAGS = {
@ -215,12 +219,11 @@ YUBIOSI_1_5_TAGS = {
}
@false_if_raises(UnicodeDecodeError, ValueError)
def looks_like_yubiosi_1_5(path: Path) -> bool:
with path.open(encoding="shift-jis-2004") as f:
lines = f.read().split("\n")
note_index = lines.index("[Notes]")
return any(line_has_yubiosi_tag(line) for line in lines[:note_index])
@false_if_raises(UnicodeError, ValueError)
def looks_like_yubiosi_1_5(contents: bytes) -> bool:
    """Report whether any line ahead of the ``[Notes]`` line carries a
    yubiosi tag, as judged by ``line_has_yubiosi_tag``.

    The bytes are decoded as shift-jis-2004. A failed decode raises
    UnicodeError and a missing ``[Notes]`` line makes ``list.index`` raise
    ValueError; both types are handed to ``false_if_raises``.
    """
    rows = contents.decode(encoding="shift-jis-2004").splitlines()
    notes_marker_at = rows.index("[Notes]")
    for row in rows[:notes_marker_at]:
        if line_has_yubiosi_tag(row):
            return True
    return False
def line_has_yubiosi_tag(line: str) -> bool:
@ -231,7 +234,8 @@ def line_has_yubiosi_tag(line: str) -> bool:
return False
@false_if_raises(UnicodeDecodeError)
def looks_like_yubiosi_2_0(path: Path) -> bool:
with path.open(encoding="utf-16") as f:
return f.readline() == "//Yubiosi 2.0\n"
@false_if_raises(UnicodeError)
def looks_like_yubiosi_2_0(contents: bytes) -> bool:
    """Return True when the first line of *contents*, decoded as UTF-16,
    is exactly ``//Yubiosi 2.0``.

    A failed decode raises UnicodeError, the exception type handed to
    ``false_if_raises``.
    """
    lines = contents.decode("utf-16").splitlines()
    # bool() keeps the declared return type: a bare ``lines and ...`` hands
    # back the empty list itself when the decoded text has no lines (empty
    # input, or nothing but a byte order mark).
    return bool(lines) and lines[0] == "//Yubiosi 2.0"

View File

@ -0,0 +1,21 @@
from pathlib import Path
import hypothesis.strategies as st
from hypothesis import given, settings
from .. import guess
@given(st.binary())
def test_that_guess_format_only_raises_the_specific_value_error(contents: bytes):
    """Property: for arbitrary bytes, the only ValueError allowed to escape
    ``guess_file_format`` is the one whose args are exactly
    ``("Unrecognized file format",)``."""
    expected_args = ("Unrecognized file format",)
    try:
        guess.guess_file_format(contents)
    except ValueError as error:
        if error.args == expected_args:
            return
        # Any other ValueError is a leak from one of the recognizers.
        raise
def test_that_yubiosi_2_0_detection_does_not_raise_exception_for_non_utf16_files():
    """Feed plain ASCII bytes to the yubiosi 2.0 detector; the test passes
    as long as the call returns instead of raising."""
    ascii_payload = "blablabla".encode("ascii")
    guess.looks_like_yubiosi_2_0(ascii_payload)