utils: add chunked text reader

2025-01-31 04:13:47 +01:00 · 2022-04-16 22:14:02 +02:00 · 2022-04-16 22:14:02 +02:00 · ca04c668a5
commit ca04c668a5
parent 676a8b6b5e
4 changed files with 238 additions and 0 deletions
--- a/src/libvgmstream.vcxproj
+++ b/src/libvgmstream.vcxproj
@ -175,6 +175,7 @@
    <ClInclude Include="util\endianness.h" />
    <ClInclude Include="util\log.h" />
    <ClInclude Include="util\m2_psb.h" />
+    <ClInclude Include="util\text_reader.h" />
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="coding\at3plus_decoder.c" />
@ -727,6 +728,7 @@
    <ClCompile Include="util\chunks.c" />
    <ClCompile Include="util\log.c" />
    <ClCompile Include="util\m2_psb.c" />
+    <ClCompile Include="util\text_reader.c" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
--- a/src/libvgmstream.vcxproj.filters
+++ b/src/libvgmstream.vcxproj.filters
@ -323,6 +323,9 @@
    <ClInclude Include="util\m2_psb.h">
      <Filter>Header Files</Filter>
    </ClInclude>
+    <ClInclude Include="util\text_reader.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="formats.c">
@ -1969,6 +1972,9 @@
    <ClCompile Include="util\m2_psb.c">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="util\text_reader.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
    <ClCompile Include="meta\wbk.c">
      <Filter>meta\Source Files</Filter>
    </ClCompile>
--- a/src/util/text_reader.c
+++ b/src/util/text_reader.c
@ -0,0 +1,187 @@
+#include <string.h>
+#include "text_reader.h"
+#include "log.h"
+
+
+/* Initializes a text reader over 'sf', using the caller-provided 'buf' as work area.
+ * - buf/buf_size: buffer where chunks are read (must be non-NULL and buf_size > 1)
+ * - sf: file to read from (must be non-NULL)
+ * - offset: absolute sf position where the text starts
+ * - max: absolute sf position where reading stops (it's compared against the running
+ *   sf offset when loading data), or 0 to read until the end of sf
+ * Returns 1 on success, 0 on bad params (tr is zeroed either way). */
+int text_reader_init(text_reader_t* tr, uint8_t* buf, int buf_size, STREAMFILE* sf, uint32_t offset, uint32_t max) {
+    memset(tr, 0, sizeof(text_reader_t));
+
+    if (buf_size <= 1 || !buf || !sf)
+        return 0;
+
+    tr->buf = buf;
+    tr->buf_size = buf_size;
+    tr->sf = sf;
+    tr->offset = offset;
+
+    /* max_offset is an absolute position (not a size), so the default must be the file's end;
+     * using 'size - offset' would cut reads short (or read nothing at all) when offset > 0 */
+    if (!max)
+        max = get_streamfile_size(sf);
+    tr->max_offset = max;
+
+    return 1;
+}
+
+
+/* Advances to the next line's position and, while sf still has data, compacts buf
+ * (pending bytes go to the start) and tops it up with a new chunk. */
+static void prepare_buf(text_reader_t* tr) {
+    int want;
+    int got = 0;
+
+    /* a single buf usually holds several lines, so each call starts where the last line ended */
+    tr->pos = tr->next_pos;
+
+    /* nothing left to load (lines already in buf can still be parsed, so not an error) */
+    if (tr->offset >= tr->max_offset)
+        return;
+
+    if (tr->pos >= tr->filled) {
+        /* all buffered bytes were consumed: start over with an empty buf */
+        tr->filled = 0;
+        tr->pos = 0;
+    }
+    else if (tr->pos > 0) {
+        /* some bytes are pending: shift them to the beginning (regions may overlap) */
+        int pending = tr->filled - tr->pos;
+
+        memmove(tr->buf, tr->buf + tr->pos, pending);
+        tr->filled = pending;
+        tr->pos = 0;
+    }
+
+    /* no room for more data */
+    if (tr->filled >= tr->buf_size)
+        return;
+
+    /* try to fill the free part of buf, without going past max_offset */
+    want = tr->buf_size - tr->filled;
+    if (want + tr->offset > tr->max_offset)
+        want = tr->max_offset - tr->offset;
+
+    if (want > 0) {
+        /* keep the last byte of buf free for the c-string null */
+        if (tr->filled + want >= tr->buf_size)
+            want--;
+
+        got = read_streamfile(tr->buf + tr->filled, tr->offset, want, tr->sf);
+        tr->offset += got;
+        tr->filled += got;
+    }
+
+    /* nothing was read (maybe some internal issue): force EOF */
+    if (got == 0)
+        tr->offset = tr->max_offset;
+
+    /* terminate valid data so stale bytes are never parsed (simplifies some checks during parse) */
+    tr->buf[tr->filled] = '\0';
+}
+
+/* Finds the next line in buf starting at tr->pos, null-terminates it in place and sets
+ * tr->line / line_len / line_ok, plus tr->next_pos (where the following line begins).
+ * With nothing left in buf (EOF) sets line to NULL with line_ok=1. A line is "not ok" when
+ * a \0 is found in it, or when buf ends before its line end while sf still has more data. */
+static void parse_buf(text_reader_t* tr) {
+    int i;
+    /* once sf is exhausted buf holds all remaining text, so reaching the end of buf means
+     * a real end of line rather than a truncated one (even if buf happens to be full) */
+    int no_more_data = (tr->offset >= tr->max_offset);
+
+    tr->line = (char*)&tr->buf[tr->pos];
+    tr->line_len = 0;
+    tr->line_ok = 0;
+
+    /* detect EOF (this should only happen if no more data was loaded) */
+    if (tr->pos == tr->filled) {
+        tr->line = NULL;
+        tr->line_ok = 1;
+        tr->line_len = 0;
+        return;
+    }
+
+    /* assumes filled doesn't reach buf_size (to allow trailing \0 after filled) */
+    for (i = tr->pos; i < tr->filled; i++) {
+        char c = (char)tr->buf[i];
+
+        if (c == '\0') {
+            i++;
+            break; /* not a valid file? (line_ok=0) */
+        }
+
+        if (c == '\r' && tr->buf[i+1] == '\n') { /* CRLF (0x0d0a) */
+            /* i+1 may point at buf[filled], but that byte is pre-set to \0 (never \n),
+             * so on a match both bytes are valid data and i+2 can't go past filled */
+            i += 2;
+            tr->line_ok = 1;
+            break;
+        }
+        else if (c == '\n') { /* LF (0x0a) */
+            i++;
+            tr->line_ok = 1;
+            break;
+        }
+        else if (c == '\r') { /* CR (0x0d) */
+            i++;
+            /* if a full buf ends with a CR, next buf may start with a LF (single CRLF), so line is
+             * not ok near buf end, unless there is nothing else to load (old Macs use single \r as
+             * lines, but using only that and reaching buf end should happen rarely) */
+            tr->line_ok = (i < tr->buf_size - 1) || no_more_data;
+            break;
+        }
+
+        tr->line_len++;
+    }
+
+    /* reached end of data without a line end: fine when that is also the end of the text
+     * (buf not full, or sf has nothing more), otherwise the line is truncated */
+    if (!tr->line_ok && i == tr->filled)
+        tr->line_ok = (tr->filled < tr->buf_size - 1) || no_more_data;
+
+    /* added after proper line (a \n) or after buf end, so we aren't changing valid data */
+    tr->buf[tr->pos + tr->line_len] = '\0';
+    tr->next_pos = i;
+}
+
+/* Loads data if needed and outputs the next line of the text.
+ * - p_line: set to the line start inside buf (null terminated), or NULL on EOF; may be NULL
+ * Returns the line length (0 for empty lines or EOF), -(length + 1) when the line is not ok
+ * (truncated or with a \0; text read so far is still output), or 0 if tr has no buf set. */
+int text_reader_get_line(text_reader_t* tr, char** p_line) {
+
+    if (!tr->buf) /* no init */
+        return 0;
+
+    /* how it works:
+     * - fills buffer up to max or buf_size, from pos 0
+     * - counts from pos to next line end (\n, \r\n or \r) or EOF
+     *   - nulls the line end or the byte after EOF to make a proper c-string
+     * - returns string from pos to len
+     * - on next call rather than re-reading continues from pos N (after line end)
+     *   - a buf will likely contain multiple lines
+     * - if read chars reach buf end (no proper line found):
+     *   - pos = 0: buf isn't big enough, error
+     *   - pos > 0: move data to pos=0, fill rest of buf
+     *
+     * ex.
+     * - parse buf: read chunk full [aaaaa\nbbbb] (pos = 0)
+     * - get line: returns "aaaaa\0" (next_pos points to first 'b')
+     * - get line: from 'b', but reaches buf end before \n or EOF: must readjust
+     * - parse buf: move chunk part [bbbb*******] ('b' to beginning, * is garbage)
+     * - parse buf: read chunk part [bbbbbb\ncc_] (reaches EOF)
+     * - get line: returns "bbbbbb\0" (pos points to first c)
+     * - get line: returns "cc\0"
+     * - get line: returns NULL (reached EOF, no more bytes)
+     * - (there is an implicit \0 reserved in buf)
+     *
+     * ex.
+     * - start: read chunk [aaaaaaaaaaa]
+     * - get line: reaches buf end, but didn't reach EOF nor \n: error, can't store line
+    */
+
+    prepare_buf(tr); /* may not do anything */
+    parse_buf(tr); /* next line */
+
+    /* a partial line can only be completed while sf has more data to load. prepare_buf always
+     * continues from next_pos, so rewind it to the line start first: otherwise the unfinished
+     * line would be skipped (at EOF that meant losing the last line or hiding a \0 error) */
+    if (!tr->line_ok && tr->pos > 0 && tr->offset < tr->max_offset) {
+        tr->next_pos = tr->pos;
+        prepare_buf(tr);
+        parse_buf(tr); /* could continue from prev parse but makes logic more complex for little gain */
+    }
+
+    /* always output line even if truncated */
+    if (p_line) *p_line = tr->line;
+    return !tr->line_ok ?
+        -(tr->line_len + 1) : /* -0 also is possible, force -1 */
+        tr->line_len;
+}
--- a/src/util/text_reader.h
+++ b/src/util/text_reader.h
@ -0,0 +1,43 @@
+#ifndef _TEXT_READER_H_
+#define _TEXT_READER_H_
+
+
+/* Reader tuned for whole text files, reading chunks to minimize I/O with a single buffer.
+ * For short lines read_line may be more appropriate (reads up to line end, while this reads bigger chunks),
+ * which also allows \0 (this reader returns an error instead).
+ * NOTE: modifies passed buffer (lines are forced to end with \0 rather than \n).
+ * 
+ * Usage: set text_reader_t and defaults with text_reader_init, call text_reader_get_line(...) to get lines.
+ * buf may be size+1 to allow 2^N chunk reads + trailing \0 (better performance?).
+ */
+
+#include "../streamfile.h"
+
+typedef struct {
+    /* init */
+    uint8_t* buf;           /* where data will be read (lines are null-terminated in place) */
+    int buf_size;           /* size of buf in bytes (also limits max line size) */
+    STREAMFILE* sf;         /* used to read data */
+    uint32_t offset;        /* sf pos of the next chunk to load (advances after each read) */
+    uint32_t max_offset;    /* sf pos where loading stops */
+  
+    /* internal */
+    int filled;             /* current buf bytes */
+    int pos;                /* current buf pos (start of last line) */
+    int next_pos;           /* buf pos on next call, after line end */
+    int line_ok;            /* current line was fully read (0 = truncated or has a \0) */
+
+    char* line;             /* current line inside buf (NULL on EOF) */
+    int line_len;           /* current line length, not counting line end chars */
+} text_reader_t;
+
+
+/* Convenience function to init the above struct.
+ * buf/buf_size: caller-provided work buffer (buf_size must be > 1), sf: file to read from,
+ * offset: sf position where the text starts, max: sf position where reading stops
+ * (0 = limit is derived from sf's size).
+ * Returns 1 on success, 0 if buf, buf_size or sf are not valid. */
+int text_reader_init(text_reader_t* tr, uint8_t* buf, int buf_size, STREAMFILE* sf, uint32_t offset, uint32_t max);
+
+/* Reads and sets next line, or NULL if no lines are found (EOF).
+ * Returns line length (0 for empty lines and for EOF), or <0 if the line isn't ok: too long
+ * to store in buf or with a \0 inside. The negative value is -(stored length + 1), and the
+ * text read so far is still set in p_line.
+ * Non-NULL lines are valid (null terminated) strings that point inside buf, so buf's
+ * contents are modified and reused by later calls.
+ * Also returns 0, without touching p_line, if tr has no buf (not initialized).
+ * p_line may be NULL when only the length is needed. */
+int text_reader_get_line(text_reader_t* tr, char** p_line);
+
+#endif