From e2c302836ff7c44ebc5a8b76bdad84a32041d77d Mon Sep 17 00:00:00 2001
From: WerWolv
Date: Thu, 9 Jan 2025 18:33:45 +0100
Subject: [PATCH] feat: Added support for scanning binaries for UTF-8 strings

---
Notes (this section is not part of the commit message):

 * Summary: adds a `UTF8` entry to `FindOccurrence::DecodeType` and to
   `SearchSettings::StringType`, teaches the string scanner to walk UTF-8
   byte sequences, decodes UTF-8 results for display (with '\n' and '\r'
   removed) and adds the matching entry to the encoding selector.
 * Both enums gain `UTF8` right after `ASCII`, so every later enumerator moves
   up by one (e.g. `UTF16LE` 1 -> 2). NOTE(review): verify that no persisted
   setting or other caller depends on the old numeric values.
 * The scanner is a per-byte state machine: `codePointWidth` is the sequence
   length announced by the last lead byte, `remainingCharacters` the number of
   continuation bytes still expected. A continuation byte that arrives while
   none is expected ends the current run.
 * `std::max<i64>` is spelled out and `codePointWidth` is cast to `i64`
   because the operands are otherwise `int` and `u64`: template deduction
   fails for the mixed pair and the subtraction would be done unsigned.
 * NOTE(review): in UTF-8 mode every byte below 0x80 sets `validChar = true`,
   replacing the value computed from the character-class settings at the top
   of the loop. This appears to include 0x00, which the `nullTermination`
   check relies on seeing as the terminating byte - please confirm.
 * NOTE(review): a lead byte followed by something other than a continuation
   byte is not rejected; the state is simply overwritten by the next byte.
 * Indentation and blank context lines were reconstructed from the hunk line
   counts; if the context does not match byte-for-byte, apply with
   `git apply --ignore-whitespace`.

 .../include/hex/api/content_registry.hpp      |  2 +-
 .../include/content/views/view_find.hpp       |  2 +-
 .../source/content/views/view_find.cpp        | 54 +++++++++++++++++--
 3 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/lib/libimhex/include/hex/api/content_registry.hpp b/lib/libimhex/include/hex/api/content_registry.hpp
index 6bbe866c4..7c9e3cc9b 100644
--- a/lib/libimhex/include/hex/api/content_registry.hpp
+++ b/lib/libimhex/include/hex/api/content_registry.hpp
@@ -1024,7 +1024,7 @@ namespace hex {
 
                 struct FindOccurrence {
                     Region region;
-                    enum class DecodeType { ASCII, Binary, UTF16, Unsigned, Signed, Float, Double } decodeType;
+                    enum class DecodeType { ASCII, UTF8, Binary, UTF16, Unsigned, Signed, Float, Double } decodeType;
                     std::endian endian = std::endian::native;
                     bool selected;
                 };
diff --git a/plugins/builtin/include/content/views/view_find.hpp b/plugins/builtin/include/content/views/view_find.hpp
index ab7ad6135..a276e7739 100644
--- a/plugins/builtin/include/content/views/view_find.hpp
+++ b/plugins/builtin/include/content/views/view_find.hpp
@@ -42,7 +42,7 @@ namespace hex::plugin::builtin {
                 Value
             } mode = Mode::Strings;
 
-            enum class StringType : int { ASCII = 0, UTF16LE = 1, UTF16BE = 2, ASCII_UTF16LE = 3, ASCII_UTF16BE = 4 };
+            enum class StringType : int { ASCII = 0, UTF8 = 1, UTF16LE = 2, UTF16BE = 3, ASCII_UTF16LE = 4, ASCII_UTF16BE = 5 };
 
             struct Strings {
                 int minLength = 5;
diff --git a/plugins/builtin/source/content/views/view_find.cpp b/plugins/builtin/source/content/views/view_find.cpp
index c7fd6d350..07eba16fa 100644
--- a/plugins/builtin/source/content/views/view_find.cpp
+++ b/plugins/builtin/source/content/views/view_find.cpp
@@ -202,6 +202,8 @@ namespace hex::plugin::builtin {
         const auto [decodeType, endian] = [&] -> std::pair<Occurrence::DecodeType, std::endian> {
             if (settings.type == ASCII)
                 return { Occurrence::DecodeType::ASCII, std::endian::native };
+            if (settings.type == UTF8)
+                return { Occurrence::DecodeType::UTF8, std::endian::native };
             else if (settings.type == SearchSettings::StringType::UTF16BE)
                 return { Occurrence::DecodeType::UTF16, std::endian::big };
             else if (settings.type == SearchSettings::StringType::UTF16LE)
@@ -210,11 +212,13 @@ namespace hex::plugin::builtin {
                 return { Occurrence::DecodeType::Binary, std::endian::native };
         }();
 
-        size_t countedCharacters = 0;
+        i64 countedCharacters = 0;
         u64 startAddress = reader.begin().getAddress();
         u64 endAddress = reader.end().getAddress();
 
         u64 progress = 0;
+        u64 codePointWidth = 0;
+        i8 remainingCharacters = 0;
 
         for (u8 byte : reader) {
             bool validChar = (settings.lowerCaseLetters && std::islower(byte)) ||
@@ -233,6 +237,42 @@ namespace hex::plugin::builtin {
                 // Check if first byte of UTF-16 encoded string is 0x00
                 if (countedCharacters % 2 == 0)
                     validChar = byte == 0x00;
+            } else if (settings.type == UTF8) {
+                if ((byte & 0b1000'0000) == 0b0000'0000) {
+                    // ASCII range
+                    codePointWidth = 1;
+                    remainingCharacters = 0;
+                    validChar = true;
+                } else if ((byte & 0b1100'0000) == 0b1000'0000) {
+                    // Continuation mark
+
+                    if (remainingCharacters > 0) {
+                        remainingCharacters -= 1;
+                        validChar = true;
+                    } else {
+                        countedCharacters -= std::max<i64>(0, i64(codePointWidth) - (remainingCharacters + 1));
+                        codePointWidth = 0;
+                        remainingCharacters = 0;
+                        validChar = false;
+                    }
+                } else if ((byte & 0b1110'0000) == 0b1100'0000) {
+                    // Two bytes
+                    codePointWidth = 2;
+                    remainingCharacters = codePointWidth - 1;
+                    validChar = true;
+                } else if ((byte & 0b1111'0000) == 0b1110'0000) {
+                    // Three bytes
+                    codePointWidth = 3;
+                    remainingCharacters = codePointWidth - 1;
+                    validChar = true;
+                } else if ((byte & 0b1111'1000) == 0b1111'0000) {
+                    // Four bytes
+                    codePointWidth = 4;
+                    remainingCharacters = codePointWidth - 1;
+                    validChar = true;
+                } else {
+                    validChar = false;
+                }
             }
 
             task.update(progress);
@@ -240,9 +280,9 @@ namespace hex::plugin::builtin {
             if (validChar)
                 countedCharacters++;
             if (!validChar || startAddress + countedCharacters == endAddress) {
-                if (countedCharacters >= size_t(settings.minLength)) {
+                if (countedCharacters >= settings.minLength) {
                     if (!settings.nullTermination || byte == 0x00) {
-                        results.push_back(Occurrence { Region { startAddress, countedCharacters }, decodeType, endian, false });
+                        results.push_back(Occurrence { Region { startAddress, size_t(countedCharacters) }, decodeType, endian, false });
                     }
                 }
 
@@ -563,6 +603,11 @@ namespace hex::plugin::builtin {
             case ASCII:
                 result = hex::encodeByteString(bytes);
                 break;
+            case UTF8:
+                result = std::string(bytes.begin(), bytes.end());
+                result = wolv::util::replaceStrings(result, "\n", "");
+                result = wolv::util::replaceStrings(result, "\r", "");
+                break;
             case UTF16:
                 for (size_t i = occurrence.endian == std::endian::little ? 0 : 1; i < bytes.size(); i += 2)
                     result += hex::encodeByteString({ bytes[i] });
@@ -667,8 +712,9 @@ namespace hex::plugin::builtin {
             ImGui::NewLine();
 
             if (ImGui::BeginTabBar("SearchMethods")) {
-                const std::array<std::string, 5> StringTypes = {
+                const std::array<std::string, 6> StringTypes = {
                         "hex.ui.common.encoding.ascii"_lang,
+                        "hex.ui.common.encoding.utf8"_lang,
                         "hex.ui.common.encoding.utf16le"_lang,
                         "hex.ui.common.encoding.utf16be"_lang,
                         hex::format("{} + {}", "hex.ui.common.encoding.ascii"_lang, "hex.ui.common.encoding.utf16le"_lang),