From b7c051fbc1643d4283fac7233053ff699398fcf5 Mon Sep 17 00:00:00 2001
From: Mikhail Khachayants <mkhachaiants@gmail.com>
Date: Fri, 18 Oct 2024 23:47:27 +0300
Subject: [PATCH] Fix buffer overflow in u8_strlen

---
 CMakeLists.txt                        |  1 +
 include/valijson/utils/utf8_utils.hpp | 51 +++++++++++----------------
 tests/test_utf8_utils.cpp             | 51 +++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 31 deletions(-)
 create mode 100644 tests/test_utf8_utils.cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb0ffa3..f166159 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,6 +126,7 @@ if(valijson_BUILD_TESTS)
         tests/test_validator.cpp
         tests/test_validator_with_custom_regular_expression_engine.cpp
         tests/test_yaml_cpp_adapter.cpp
+        tests/test_utf8_utils.cpp
     )
 
     set(TEST_LIBS gtest gtest_main jsoncpp json11 yamlcpp)
diff --git a/include/valijson/utils/utf8_utils.hpp b/include/valijson/utils/utf8_utils.hpp
index f1e01a5..74c49d0 100644
--- a/include/valijson/utils/utf8_utils.hpp
+++ b/include/valijson/utils/utf8_utils.hpp
@@ -14,50 +14,39 @@
 namespace valijson {
 namespace utils {
 
-static const uint32_t offsetsFromUTF8[6] = {
-    0x00000000UL, 0x00003080UL, 0x000E2080UL,
-    0x03C82080UL, 0xFA082080UL, 0x82082080UL
-};
-
 /* is c the start of a utf8 sequence? */
-inline bool isutf(char c) {
-    return ((c & 0xC0) != 0x80);
-}
-
-/* reads the next utf-8 sequence out of a string, updating an index */
-inline uint64_t u8_nextchar(const char *s, uint64_t *i)
+inline bool isutf(char c)
 {
-    uint64_t ch = 0;
-    int sz = 0;
-
-    do {
-        ch <<= 6;
-        ch += static_cast<unsigned char>(s[(*i)++]);
-        sz++;
-    } while (s[*i] && !isutf(s[*i]));
-    ch -= offsetsFromUTF8[sz-1];
-
-    return ch;
+    return ((c & 0xC0) != 0x80);
 }
 
 /* number of characters */
 inline uint64_t u8_strlen(const char *s)
 {
-    constexpr auto maxLength = std::numeric_limits<uint64_t>::max();
     uint64_t count = 0;
-    uint64_t i = 0;
 
-    while (s[i] != 0 && u8_nextchar(s, &i) != 0) {
-        if (i == maxLength) {
-            throwRuntimeError(
-                    "String exceeded maximum size of " +
-                    std::to_string(maxLength) + " bytes.");
+    while (*s) {
+        unsigned char p = static_cast<unsigned char>(*s);
+
+        size_t seqLen = p < 0x80   ? 1  // 0xxxxxxx: 1-byte (ASCII)
+                        : p < 0xE0 ? 2  // 110xxxxx: 2-byte sequence
+                        : p < 0xF0 ? 3  // 1110xxxx: 3-byte sequence
+                        : p < 0xF8 ? 4  // 11110xxx: 4-byte sequence
+                                   : 1; // treat as a single character
+
+        for (size_t i = 1; i < seqLen; ++i) {
+            if (s[i] == 0 || isutf(s[i])) {
+                seqLen = i;
+                break;
+            }
         }
+
+        s += seqLen;
         count++;
     }
 
     return count;
 }
 
-}  // namespace utils
-}  // namespace valijson
+} // namespace utils
+} // namespace valijson
diff --git a/tests/test_utf8_utils.cpp b/tests/test_utf8_utils.cpp
new file mode 100644
index 0000000..5001243
--- /dev/null
+++ b/tests/test_utf8_utils.cpp
@@ -0,0 +1,51 @@
+#include <gtest/gtest.h>
+#include <valijson/utils/utf8_utils.hpp>
+
+class TestUtf8Utils : public testing::Test
+{
+};
+
+TEST_F(TestUtf8Utils, Utf8StringLength)
+{
+    using valijson::utils::u8_strlen;
+
+    EXPECT_EQ(u8_strlen(""), 0);
+    EXPECT_EQ(u8_strlen("a"), 1);
+    EXPECT_EQ(u8_strlen("abc"), 3);
+
+    // U+0416
+    EXPECT_EQ(u8_strlen("\xD0\x96"), 1);
+
+    // U+0915
+    EXPECT_EQ(u8_strlen("\xE0\xA4\x95"), 1);
+
+    // U+10348
+    EXPECT_EQ(u8_strlen("\xF0\x90\x8D\x88"), 1);
+
+    // U+0915 + U+0416
+    EXPECT_EQ(u8_strlen("\xE0\xA4\x95\xD0\x96"), 2);
+
+    // incomplete U+0416 at the end
+    EXPECT_EQ(u8_strlen("\xD0"), 1);
+
+    // incomplete U+0416 in the middle
+    EXPECT_EQ(u8_strlen("\320abc"), 4);
+
+    // incomplete U+0915 at the end
+    EXPECT_EQ(u8_strlen("\xE0\xA4"), 1);
+
+    // incomplete U+0915 in the middle
+    EXPECT_EQ(u8_strlen("\xE0\244abc"), 4);
+
+    // U+DFFF
+    EXPECT_EQ(u8_strlen("\xED\xBF\xBF"), 1);
+
+    // Overlong encoding for U+0000
+    EXPECT_EQ(u8_strlen("\xC0\x80"), 1);
+
+    // U+110000 (out of Unicode range)
+    EXPECT_EQ(u8_strlen("\xF5\x80\x80\x80"), 1);
+
+    // 0xE0 + 0xA4 repeating 9 times
+    EXPECT_EQ(u8_strlen("\340\244\244\244\244\244\244\244\244\244"), 5);
+}