feat: Added support for scanning binaries for UTF-8 strings

2025-01-11 05:42:15 +01:00 · 2025-01-09 18:33:45 +01:00 · 2025-01-09 18:33:45 +01:00 · e2c302836f
commit e2c302836f
parent f1d9642cf6
3 changed files with 52 additions and 6 deletions
--- a/lib/libimhex/include/hex/api/content_registry.hpp
+++ b/lib/libimhex/include/hex/api/content_registry.hpp
@ -1024,7 +1024,7 @@ namespace hex {
                struct FindOccurrence {
                    Region region;
-                    enum class DecodeType { ASCII, Binary, UTF16, Unsigned, Signed, Float, Double } decodeType;
+                    enum class DecodeType { ASCII, UTF8, Binary, UTF16, Unsigned, Signed, Float, Double } decodeType;
                    std::endian endian = std::endian::native;
                    bool selected;
                };
--- a/plugins/builtin/include/content/views/view_find.hpp
+++ b/plugins/builtin/include/content/views/view_find.hpp
@ -42,7 +42,7 @@ namespace hex::plugin::builtin {
                Value
            } mode = Mode::Strings;
-            enum class StringType : int { ASCII = 0, UTF16LE = 1, UTF16BE = 2, ASCII_UTF16LE = 3, ASCII_UTF16BE = 4 };
+            enum class StringType : int { ASCII = 0, UTF8 = 1, UTF16LE = 2, UTF16BE = 3, ASCII_UTF16LE = 4, ASCII_UTF16BE = 5 };
            struct Strings {
                int minLength = 5;
--- a/plugins/builtin/source/content/views/view_find.cpp
+++ b/plugins/builtin/source/content/views/view_find.cpp
@ -202,6 +202,8 @@ namespace hex::plugin::builtin {
        const auto [decodeType, endian] = [&] -> std::pair<Occurrence::DecodeType, std::endian> {
            if (settings.type == ASCII)
                return { Occurrence::DecodeType::ASCII, std::endian::native };
            if (settings.type == UTF8)
                return { Occurrence::DecodeType::UTF8, std::endian::native };
            else if (settings.type == SearchSettings::StringType::UTF16BE)
                return { Occurrence::DecodeType::UTF16, std::endian::big };
            else if (settings.type == SearchSettings::StringType::UTF16LE)
@ -210,11 +212,13 @@ namespace hex::plugin::builtin {
                return { Occurrence::DecodeType::Binary, std::endian::native };
        }();
-        size_t countedCharacters = 0;
+        i64 countedCharacters = 0;
        u64 startAddress = reader.begin().getAddress();
        u64 endAddress = reader.end().getAddress();
        u64 progress = 0;
        u64 codePointWidth = 0;
        i8 remainingCharacters = 0;
        for (u8 byte : reader) {
            bool validChar =
                (settings.lowerCaseLetters    && std::islower(byte))  ||
@ -233,6 +237,42 @@ namespace hex::plugin::builtin {
                // Check if first byte of UTF-16 encoded string is 0x00
                if (countedCharacters % 2 == 0)
                    validChar = byte == 0x00;
            } else if (settings.type == UTF8) {
                if ((byte & 0b1000'0000) == 0b0000'0000) {
                    // ASCII range
                    codePointWidth = 1;
                    remainingCharacters = 0;
                    validChar = true;
                } else if ((byte & 0b1100'0000) == 0b1000'0000) {
                    // Continuation byte (10xxxxxx)
                    if (remainingCharacters > 0) {
                        remainingCharacters -= 1;
                        validChar = true;
                    } else {
                        countedCharacters -= std::max<i64>(0, codePointWidth - (remainingCharacters + 1));
                        codePointWidth = 0;
                        remainingCharacters = 0;
                        validChar = false;
                    }
                } else if ((byte & 0b1110'0000) == 0b1100'0000) {
                    // Two bytes
                    codePointWidth = 2;
                    remainingCharacters = codePointWidth - 1;
                    validChar = true;
                } else if ((byte & 0b1111'0000) == 0b1110'0000) {
                    // Three bytes
                    codePointWidth = 3;
                    remainingCharacters = codePointWidth - 1;
                    validChar = true;
                } else if ((byte & 0b1111'1000) == 0b1111'0000) {
                    // Four bytes
                    codePointWidth = 4;
                    remainingCharacters = codePointWidth - 1;
                    validChar = true;
                } else {
                    validChar = false;
                }
            }
            task.update(progress);
@ -240,9 +280,9 @@ namespace hex::plugin::builtin {
            if (validChar)
                countedCharacters++;
            if (!validChar || startAddress + countedCharacters == endAddress) {
-                if (countedCharacters >= size_t(settings.minLength)) {
+                if (countedCharacters >= settings.minLength) {
                    if (!settings.nullTermination || byte == 0x00) {
-                        results.push_back(Occurrence { Region { startAddress, countedCharacters }, decodeType, endian, false });
+                        results.push_back(Occurrence { Region { startAddress, size_t(countedCharacters) }, decodeType, endian, false });
                    }
                }
@ -563,6 +603,11 @@ namespace hex::plugin::builtin {
                    case ASCII:
                        result = hex::encodeByteString(bytes);
                        break;
                    case UTF8:
                        result = std::string(bytes.begin(), bytes.end());
                        result = wolv::util::replaceStrings(result, "\n", "");
                        result = wolv::util::replaceStrings(result, "\r", "");
                        break;
                    case UTF16:
                        for (size_t i = occurrence.endian == std::endian::little ? 0 : 1; i < bytes.size(); i += 2)
                            result += hex::encodeByteString({ bytes[i] });
@ -667,8 +712,9 @@ namespace hex::plugin::builtin {
            ImGui::NewLine();
            if (ImGui::BeginTabBar("SearchMethods")) {
-                const std::array<std::string, 5> StringTypes = {
+                const std::array<std::string, 6> StringTypes = {
                        "hex.ui.common.encoding.ascii"_lang,
                        "hex.ui.common.encoding.utf8"_lang,
                        "hex.ui.common.encoding.utf16le"_lang,
                        "hex.ui.common.encoding.utf16be"_lang,
                        hex::format("{} + {}", "hex.ui.common.encoding.ascii"_lang, "hex.ui.common.encoding.utf16le"_lang),