From 96db2074c60a0abddffc6bc8df4a848166a3a2c2 Mon Sep 17 00:00:00 2001 From: WerWolv Date: Tue, 19 Dec 2023 14:34:35 +0100 Subject: [PATCH] feat: Add ignore case and UTF16 search options to sequence searching --- .gdbinit | 9 ++- lib/libimhex/include/hex/helpers/utils.hpp | 2 + lib/libimhex/source/helpers/utils.cpp | 64 +++++++++++++++ .../include/content/views/view_find.hpp | 3 + plugins/builtin/romfs/lang/en_US.json | 1 + .../source/content/views/view_find.cpp | 80 +++++++++++++++++-- 6 files changed, 150 insertions(+), 9 deletions(-) diff --git a/.gdbinit b/.gdbinit index 36214964d..3d20cea4a 100644 --- a/.gdbinit +++ b/.gdbinit @@ -6,4 +6,11 @@ skip -rfu ^__gnu_debug:: skip -rfu ^ImGui:: # Trigger breakpoint when execution reaches triggerSafeShutdown() -break triggerSafeShutdown \ No newline at end of file +break triggerSafeShutdown + +# Print backtrace after execution jumped to an invalid address +define fixbt + set $pc = *(void **)$rsp + set $rsp = $rsp + 8 + bt +end \ No newline at end of file diff --git a/lib/libimhex/include/hex/helpers/utils.hpp b/lib/libimhex/include/hex/helpers/utils.hpp index 99ccd7e3f..ec3e47cc9 100644 --- a/lib/libimhex/include/hex/helpers/utils.hpp +++ b/lib/libimhex/include/hex/helpers/utils.hpp @@ -78,6 +78,8 @@ namespace hex { [[nodiscard]] std::string encodeByteString(const std::vector &bytes); [[nodiscard]] std::vector decodeByteString(const std::string &string); + std::wstring utf8ToUtf16(const std::string& utf8); + [[nodiscard]] constexpr u64 extract(u8 from, u8 to, const std::unsigned_integral auto &value) { if (from < to) std::swap(from, to); diff --git a/lib/libimhex/source/helpers/utils.cpp b/lib/libimhex/source/helpers/utils.cpp index 8edf42f17..9cc22613e 100644 --- a/lib/libimhex/source/helpers/utils.cpp +++ b/lib/libimhex/source/helpers/utils.cpp @@ -485,6 +485,70 @@ namespace hex { return result; } + std::wstring utf8ToUtf16(const std::string& utf8) { + std::vector unicodes; + + for (size_t byteIndex = 0; byteIndex < utf8.size();) { + u32 unicode = 0; + size_t unicodeSize = 0; + + u8 ch = utf8[byteIndex]; + byteIndex += 1; + + if (ch <= 0x7F) { + unicode = ch; + unicodeSize = 0; + } else if (ch <= 0xBF) { + return { }; + } else if (ch <= 0xDF) { + unicode = ch&0x1F; + unicodeSize = 1; + } else if (ch <= 0xEF) { + unicode = ch&0x0F; + unicodeSize = 2; + } else if (ch <= 0xF7) { + unicode = ch&0x07; + unicodeSize = 3; + } else { + return { }; + } + + for (size_t unicodeByteIndex = 0; unicodeByteIndex < unicodeSize; unicodeByteIndex += 1) { + if (byteIndex == utf8.size()) + return { }; + + u8 byte = utf8[byteIndex]; + if (byte < 0x80 || byte > 0xBF) + return { }; + + unicode <<= 6; + unicode += byte & 0x3F; + + byteIndex += 1; + } + + if (unicode >= 0xD800 && unicode <= 0xDFFF) + return { }; + if (unicode > 0x10FFFF) + return { }; + + unicodes.push_back(unicode); + } + + std::wstring utf16; + + for (auto unicode : unicodes) { + if (unicode <= 0xFFFF) + utf16 += static_cast(unicode); + else { + unicode -= 0x10000; + utf16 += static_cast(((unicode >> 10) + 0xD800)); + utf16 += static_cast(((unicode & 0x3FF) + 0xDC00)); + } + } + return utf16; + } + float float16ToFloat32(u16 float16) { u32 sign = float16 >> 15; u32 exponent = (float16 >> 10) & 0x1F; diff --git a/plugins/builtin/include/content/views/view_find.hpp b/plugins/builtin/include/content/views/view_find.hpp index 26299639f..b3088b08e 100644 --- a/plugins/builtin/include/content/views/view_find.hpp +++ b/plugins/builtin/include/content/views/view_find.hpp @@ -63,6 +63,9 @@ namespace hex::plugin::builtin { struct Sequence { std::string sequence; + + StringType type = StringType::ASCII; + bool ignoreCase = false; } bytes; struct Regex { diff --git a/plugins/builtin/romfs/lang/en_US.json b/plugins/builtin/romfs/lang/en_US.json index 4328ffd77..6feec69b0 100644 --- a/plugins/builtin/romfs/lang/en_US.json +++ b/plugins/builtin/romfs/lang/en_US.json @@ -895,6 +895,7 @@ "hex.builtin.view.find.search.reset": "Reset", "hex.builtin.view.find.searching": "Searching...", "hex.builtin.view.find.sequences": "Sequences", + "hex.builtin.view.find.sequences.ignore_case": "Ignore case", "hex.builtin.view.find.shortcut.select_all": "Select All Occurrences", "hex.builtin.view.find.strings": "Strings", "hex.builtin.view.find.strings.chars": "Characters", diff --git a/plugins/builtin/source/content/views/view_find.cpp b/plugins/builtin/source/content/views/view_find.cpp index 3e1150d1c..78fe18f44 100644 --- a/plugins/builtin/source/content/views/view_find.cpp +++ b/plugins/builtin/source/content/views/view_find.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -23,7 +24,7 @@ namespace hex::plugin::builtin { if (m_searchTask.isRunning()) return { }; - if (!m_occurrenceTree->overlapping({ address, address + size }).empty()) + if (!m_occurrenceTree->overlapping({ address, address }).empty()) return HighlightColor(); else return std::nullopt; @@ -258,23 +259,74 @@ namespace hex::plugin::builtin { reader.seek(searchRegion.getStartAddress()); reader.setEndAddress(searchRegion.getEndAddress()); - auto bytes = hex::decodeByteString(settings.sequence); - - if (bytes.empty()) + auto input = hex::decodeByteString(settings.sequence); + if (input.empty()) return { }; + std::vector bytes; + Occurrence::DecodeType decodeType = Occurrence::DecodeType::Binary; + std::endian endian; + switch (settings.type) { + default: + case SearchSettings::StringType::ASCII: + bytes = input; + decodeType = Occurrence::DecodeType::ASCII; + endian = std::endian::native; + break; + case SearchSettings::StringType::UTF16LE: { + auto wString = hex::utf8ToUtf16({ input.begin(), input.end() }); + + bytes.resize(wString.size() * 2); + std::memcpy(bytes.data(), wString.data(), bytes.size()); + decodeType = Occurrence::DecodeType::UTF16; + endian = std::endian::little; + + break; + } + case SearchSettings::StringType::UTF16BE: { + auto wString = hex::utf8ToUtf16({ input.begin(), input.end() }); + + bytes.resize(wString.size() * 2); + std::memcpy(bytes.data(), wString.data(), bytes.size()); + decodeType = Occurrence::DecodeType::UTF16; + endian = std::endian::big; + + for (size_t i = 0; i < bytes.size(); i += 2) + std::swap(bytes[i], bytes[i + 1]); + break; + } + } + auto occurrence = reader.begin(); u64 progress = 0; + + auto searchPredicate = [&] -> bool(*)(u8, u8) { + if (!settings.ignoreCase) + return [](u8 left, u8 right) -> bool { + return left == right; + }; + else + return [](u8 left, u8 right) -> bool { + if (std::isupper(left)) + left = std::tolower(left); + if (std::isupper(right)) + right = std::tolower(right); + + return left == right; + }; + }(); + + while (true) { task.update(progress); - occurrence = std::search(reader.begin(), reader.end(), std::boyer_moore_horspool_searcher(bytes.begin(), bytes.end())); + occurrence = std::search(reader.begin(), reader.end(), std::default_searcher(bytes.begin(), bytes.end(), searchPredicate)); if (occurrence == reader.end()) break; auto address = occurrence.getAddress(); reader.seek(address + 1); - results.push_back(Occurrence{ Region { address, bytes.size() }, Occurrence::DecodeType::Binary, std::endian::native, false }); + results.push_back(Occurrence{ Region { address, bytes.size() }, decodeType, endian, false }); progress = address - searchRegion.getStartAddress(); } @@ -497,6 +549,8 @@ namespace hex::plugin::builtin { case Value: case Strings: + case Sequence: + case Regex: { switch (occurrence.decodeType) { using enum Occurrence::DecodeType; @@ -523,8 +577,6 @@ namespace hex::plugin::builtin { } } break; - case Sequence: - case Regex: case BinaryPattern: result = hex::encodeByteString(bytes); break; @@ -661,6 +713,18 @@ namespace hex::plugin::builtin { ImGuiExt::InputTextIcon("hex.builtin.common.value"_lang, ICON_VS_SYMBOL_KEY, settings.sequence); + if (ImGui::BeginCombo("hex.builtin.common.type"_lang, StringTypes[std::to_underlying(settings.type)].c_str())) { + for (size_t i = 0; i < StringTypes.size() - 2; i++) { + auto type = static_cast(i); + + if (ImGui::Selectable(StringTypes[i].c_str(), type == settings.type)) + settings.type = type; + } + ImGui::EndCombo(); + } + + ImGui::Checkbox("hex.builtin.view.find.sequences.ignore_case"_lang, &settings.ignoreCase); + m_settingsValid = !settings.sequence.empty() && !hex::decodeByteString(settings.sequence).empty(); ImGui::EndTabItem();