feat: Add ignore case and UTF16 search options to sequence searching

2024-11-27 17:10:51 +01:00 · 2023-12-19 14:34:35 +01:00 · 2023-12-19 14:34:35 +01:00 · 96db2074c6
commit 96db2074c6
parent c7ab4a4569
6 changed files with 150 additions and 9 deletions
--- a/.gdbinit
+++ b/.gdbinit
@ -6,4 +6,11 @@ skip -rfu ^__gnu_debug::
 skip -rfu ^ImGui::

 # Trigger breakpoint when execution reaches triggerSafeShutdown()
-break triggerSafeShutdown
+break triggerSafeShutdown
+
+# Print backtrace after execution jumped to an invalid address
+#
+# Defines the user command `fixbt`. When invoked it:
+#   1. loads the pointer-sized value stored at the address in $rsp into $pc,
+#   2. advances $rsp by 8 bytes,
+#   3. prints a backtrace with `bt`.
+# Both $pc and $rsp of the debugged program are overwritten by this command.
+# NOTE(review): presumably the value at the top of the stack is the return
+# address pushed by the call that jumped to the bad address, so this emulates
+# a return to the caller. The use of $rsp and the 8-byte step assume x86-64;
+# confirm before using on other targets.
+define fixbt
+    set $pc = *(void **)$rsp
+    set $rsp = $rsp + 8
+    bt
+end
--- a/lib/libimhex/include/hex/helpers/utils.hpp
+++ b/lib/libimhex/include/hex/helpers/utils.hpp
@ -78,6 +78,8 @@ namespace hex {
    [[nodiscard]] std::string encodeByteString(const std::vector<u8> &bytes);
    [[nodiscard]] std::vector<u8> decodeByteString(const std::string &string);

+    std::wstring utf8ToUtf16(const std::string& utf8);
+
    [[nodiscard]] constexpr u64 extract(u8 from, u8 to, const std::unsigned_integral auto &value) {
        if (from < to) std::swap(from, to);

--- a/lib/libimhex/source/helpers/utils.cpp
+++ b/lib/libimhex/source/helpers/utils.cpp
@ -485,6 +485,70 @@ namespace hex {
        return result;
    }

+    /**
+     * @brief Converts a UTF-8 encoded string into a sequence of UTF-16 code units
+     *
+     * Code points up to U+FFFF are emitted as a single code unit, everything above
+     * as a high/low surrogate pair. Every code unit occupies exactly one wchar_t
+     * element of the result, independent of how wide wchar_t is on the platform,
+     * so callers must not assume that an element is two bytes in memory.
+     *
+     * @param utf8 UTF-8 encoded input
+     * @return The UTF-16 code units. An empty string is returned if the input is empty
+     *         or not well-formed UTF-8: stray continuation byte, invalid lead byte,
+     *         truncated sequence, overlong encoding, encoded surrogate (U+D800..U+DFFF)
+     *         or a code point above U+10FFFF
+     */
+    std::wstring utf8ToUtf16(const std::string& utf8) {
+        std::wstring utf16;
+
+        // A UTF-8 string never produces more UTF-16 code units than it has bytes
+        utf16.reserve(utf8.size());
+
+        for (size_t byteIndex = 0; byteIndex < utf8.size();) {
+            const u8 lead = utf8[byteIndex];
+            byteIndex += 1;
+
+            u32 unicode = 0;
+            u32 minimum = 0;                // Smallest code point that legitimately needs this sequence length
+            size_t continuationCount = 0;   // Number of 10xxxxxx bytes following the lead byte
+
+            if (lead <= 0x7F) {
+                unicode = lead;
+            } else if (lead <= 0xBF) {
+                // Continuation byte without a preceding lead byte
+                return { };
+            } else if (lead <= 0xDF) {
+                unicode = lead & 0x1F;
+                continuationCount = 1;
+                minimum = 0x80;
+            } else if (lead <= 0xEF) {
+                unicode = lead & 0x0F;
+                continuationCount = 2;
+                minimum = 0x800;
+            } else if (lead <= 0xF7) {
+                unicode = lead & 0x07;
+                continuationCount = 3;
+                minimum = 0x10000;
+            } else {
+                return { };
+            }
+
+            for (size_t i = 0; i < continuationCount; i += 1) {
+                // Sequence is cut off by the end of the input
+                if (byteIndex == utf8.size())
+                    return { };
+
+                const u8 byte = utf8[byteIndex];
+                if (byte < 0x80 || byte > 0xBF)
+                    return { };
+
+                unicode = (unicode << 6) | (byte & 0x3F);
+                byteIndex += 1;
+            }
+
+            // Overlong encoding: the code point would have fit into a shorter sequence (e.g. C0 80 for U+0000)
+            if (unicode < minimum)
+                return { };
+            // Surrogates are reserved for UTF-16 and must not appear as encoded code points
+            if (unicode >= 0xD800 && unicode <= 0xDFFF)
+                return { };
+            if (unicode > 0x10FFFF)
+                return { };
+
+            if (unicode <= 0xFFFF) {
+                utf16 += static_cast<wchar_t>(unicode);
+            } else {
+                // Split the 20 bits above the BMP into a high and a low surrogate
+                const u32 offset = unicode - 0x10000;
+                utf16 += static_cast<wchar_t>((offset >> 10) + 0xD800);
+                utf16 += static_cast<wchar_t>((offset & 0x3FF) + 0xDC00);
+            }
+        }
+
+        return utf16;
+    }
+
    float float16ToFloat32(u16 float16) {
        u32 sign     = float16 >> 15;
        u32 exponent = (float16 >> 10) & 0x1F;
--- a/plugins/builtin/include/content/views/view_find.hpp
+++ b/plugins/builtin/include/content/views/view_find.hpp
@ -63,6 +63,9 @@ namespace hex::plugin::builtin {

            struct Sequence {
                std::string sequence;
+
+                // Encoding the entered sequence is converted to before it is searched for.
+                // The sequence search treats ASCII as "use the decoded bytes as-is" and
+                // converts to UTF-16 code units for UTF16LE / UTF16BE.
+                StringType type = StringType::ASCII;
+                // When set, the sequence search lower-cases upper-case bytes on both
+                // sides before comparing them; otherwise bytes must match exactly.
+                bool ignoreCase = false;
            } bytes;

            struct Regex {
--- a/plugins/builtin/romfs/lang/en_US.json
+++ b/plugins/builtin/romfs/lang/en_US.json
@ -895,6 +895,7 @@
        "hex.builtin.view.find.search.reset": "Reset",
        "hex.builtin.view.find.searching": "Searching...",
        "hex.builtin.view.find.sequences": "Sequences",
+        "hex.builtin.view.find.sequences.ignore_case": "Ignore case",
        "hex.builtin.view.find.shortcut.select_all": "Select All Occurrences",
        "hex.builtin.view.find.strings": "Strings",
        "hex.builtin.view.find.strings.chars": "Characters",
--- a/plugins/builtin/source/content/views/view_find.cpp
+++ b/plugins/builtin/source/content/views/view_find.cpp
@ -6,6 +6,7 @@
 #include <hex/providers/buffered_reader.hpp>

 #include <array>
+#include <ranges>
 #include <regex>
 #include <string>
 #include <utility>
@ -23,7 +24,7 @@ namespace hex::plugin::builtin {
            if (m_searchTask.isRunning())
                return { };

-            if (!m_occurrenceTree->overlapping({ address, address + size }).empty())
+            if (!m_occurrenceTree->overlapping({ address, address }).empty())
                return HighlightColor();
            else
                return std::nullopt;
@ -258,23 +259,74 @@ namespace hex::plugin::builtin {
        reader.seek(searchRegion.getStartAddress());
        reader.setEndAddress(searchRegion.getEndAddress());

-        auto bytes = hex::decodeByteString(settings.sequence);
-
-        if (bytes.empty())
+        auto input = hex::decodeByteString(settings.sequence);
+        if (input.empty())
            return { };

+        std::vector<u8> bytes;
+        Occurrence::DecodeType decodeType = Occurrence::DecodeType::Binary;
+        std::endian endian;
+        switch (settings.type) {
+            default:
+            case SearchSettings::StringType::ASCII:
+                bytes = input;
+                decodeType = Occurrence::DecodeType::ASCII;
+                endian = std::endian::native;
+                break;
+            case SearchSettings::StringType::UTF16LE: {
+                auto wString = hex::utf8ToUtf16({ input.begin(), input.end() });
+
+                bytes.resize(wString.size() * 2);
+                std::memcpy(bytes.data(), wString.data(), bytes.size());
+                decodeType = Occurrence::DecodeType::UTF16;
+                endian = std::endian::little;
+
+                break;
+            }
+            case SearchSettings::StringType::UTF16BE: {
+                auto wString = hex::utf8ToUtf16({ input.begin(), input.end() });
+
+                bytes.resize(wString.size() * 2);
+                std::memcpy(bytes.data(), wString.data(), bytes.size());
+                decodeType = Occurrence::DecodeType::UTF16;
+                endian = std::endian::big;
+
+                for (size_t i = 0; i < bytes.size(); i += 2)
+                    std::swap(bytes[i], bytes[i + 1]);
+                break;
+            }
+        }
+
        auto occurrence = reader.begin();
        u64 progress = 0;
+
+        auto searchPredicate = [&] -> bool(*)(u8, u8) {
+            if (!settings.ignoreCase)
+                return [](u8 left, u8 right) -> bool {
+                    return left == right;
+                };
+            else
+                return [](u8 left, u8 right) -> bool {
+                    if (std::isupper(left))
+                        left = std::tolower(left);
+                    if (std::isupper(right))
+                        right = std::tolower(right);
+
+                    return left == right;
+                };
+        }();
+
+
        while (true) {
            task.update(progress);

-            occurrence = std::search(reader.begin(), reader.end(), std::boyer_moore_horspool_searcher(bytes.begin(), bytes.end()));
+            occurrence = std::search(reader.begin(), reader.end(), std::default_searcher(bytes.begin(), bytes.end(), searchPredicate));
            if (occurrence == reader.end())
                break;

            auto address = occurrence.getAddress();
            reader.seek(address + 1);
-            results.push_back(Occurrence{ Region { address, bytes.size() }, Occurrence::DecodeType::Binary, std::endian::native, false });
+            results.push_back(Occurrence{ Region { address, bytes.size() }, decodeType, endian, false });
            progress = address - searchRegion.getStartAddress();
        }

@ -497,6 +549,8 @@ namespace hex::plugin::builtin {

            case Value:
            case Strings:
+            case Sequence:
+            case Regex:
            {
                switch (occurrence.decodeType) {
                    using enum Occurrence::DecodeType;
@ -523,8 +577,6 @@ namespace hex::plugin::builtin {
                }
            }
                break;
-            case Sequence:
-            case Regex:
            case BinaryPattern:
                result = hex::encodeByteString(bytes);
                break;
@ -661,6 +713,18 @@ namespace hex::plugin::builtin {

                    ImGuiExt::InputTextIcon("hex.builtin.common.value"_lang, ICON_VS_SYMBOL_KEY, settings.sequence);

+                    if (ImGui::BeginCombo("hex.builtin.common.type"_lang, StringTypes[std::to_underlying(settings.type)].c_str())) {
+                        for (size_t i = 0; i < StringTypes.size() - 2; i++) {
+                            auto type = static_cast<SearchSettings::StringType>(i);
+
+                            if (ImGui::Selectable(StringTypes[i].c_str(), type == settings.type))
+                                settings.type = type;
+                        }
+                        ImGui::EndCombo();
+                    }
+
+                    ImGui::Checkbox("hex.builtin.view.find.sequences.ignore_case"_lang, &settings.ignoreCase);
+
                    m_settingsValid = !settings.sequence.empty() && !hex::decodeByteString(settings.sequence).empty();

                    ImGui::EndTabItem();