Skip to content

Commit

Permalink
add function to validate if string is a valid utf-8 string
Browse files Browse the repository at this point in the history
  • Loading branch information
CarlosEduR committed May 27, 2024
1 parent b6dd46f commit 315ea60
Showing 1 changed file with 89 additions and 3 deletions.
92 changes: 89 additions & 3 deletions fuzz/parse.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,90 @@
#include "ada.cpp"
#include "ada.h"

/**
 * Reports whether the `len` bytes starting at `buf` are well-formed UTF-8.
 *
 * Rejected inputs: stray continuation bytes or otherwise invalid lead bytes,
 * sequences cut short by the end of the buffer, malformed continuation
 * bytes, overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and code
 * points above U+10FFFF. An empty buffer is considered valid.
 *
 * @param buf pointer to the bytes to inspect; exactly `len` bytes are
 *            examined, so NUL termination is not required
 * @param len number of bytes to inspect
 * @return true if the whole range is valid UTF-8, false otherwise
 */
bool is_valid_utf8_string(const char *buf, size_t len) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  uint64_t cursor = 0;

  while (cursor < len) {
    // Fast path: when at least 16 bytes remain, load them as two 64-bit
    // words and test every high bit at once. No high bit set means the
    // whole chunk is ASCII and can be skipped.
    if (cursor + 16 <= len) {
      uint64_t first_word;
      uint64_t second_word;
      std::memcpy(&first_word, bytes + cursor, sizeof(first_word));
      std::memcpy(&second_word, bytes + cursor + sizeof(first_word),
                  sizeof(second_word));
      const bool chunk_is_ascii =
          ((first_word | second_word) & 0x8080808080808080) == 0;
      if (chunk_is_ascii) {
        cursor += 16;
        continue;
      }
    }

    // Step over ASCII bytes one at a time until a non-ASCII byte shows up
    // or the input is exhausted.
    uint8_t lead = bytes[cursor];
    while (lead < 0x80) {
      ++cursor;
      if (cursor == len) {
        return true;
      }
      lead = bytes[cursor];
    }

    // Classify the lead byte: total sequence width, the payload bits it
    // contributes, and the code point range that width may encode (the
    // lower bound is what rules out overlong encodings).
    uint64_t width = 0;
    uint32_t code_point = 0;
    uint32_t smallest = 0;
    uint32_t largest = 0;
    if ((lead & 0xE0) == 0xC0) {  // 110xxxxx
      width = 2;
      code_point = lead & 0x1F;
      smallest = 0x80;
      largest = 0x7FF;
    } else if ((lead & 0xF0) == 0xE0) {  // 1110xxxx
      width = 3;
      code_point = lead & 0x0F;
      smallest = 0x800;
      largest = 0xFFFF;
    } else if ((lead & 0xF8) == 0xF0) {  // 11110xxx
      width = 4;
      code_point = lead & 0x07;
      smallest = 0x10000;
      largest = 0x10FFFF;
    } else {
      // Stray continuation byte (10xxxxxx) or a lead byte of 0xF8 and above.
      return false;
    }

    // The whole sequence must fit in the buffer before any trailing byte
    // is read.
    if (cursor + width > len) {
      return false;
    }

    // Every trailing byte must look like 10xxxxxx; fold its six payload
    // bits into the code point.
    for (uint64_t offset = 1; offset < width; ++offset) {
      const uint8_t trail = bytes[cursor + offset];
      if ((trail & 0xC0) != 0x80) {
        return false;
      }
      code_point = (code_point << 6) | (trail & 0x3F);
    }

    if (code_point < smallest || code_point > largest) {
      return false;
    }
    // Surrogates are not valid scalar values. Only three-byte sequences can
    // produce a value in this range, so the check is a no-op for the others.
    if (code_point >= 0xD800 && code_point <= 0xDFFF) {
      return false;
    }

    cursor += width;
  }
  return true;
}

extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
FuzzedDataProvider fdp(data, size);
std::string source = fdp.ConsumeRandomLengthString(256);
Expand All @@ -18,9 +102,11 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
auto parse_url = ada::parse<ada::url>(source);
auto parse_url_aggregator = ada::parse<ada::url_aggregator>(source);

if (parse_url.has_value() ^ parse_url_aggregator.has_value()) {
printf("Source used to parse: %s", source.c_str());
abort();
if (is_valid_utf8_string(source.data(), source.length())) {
if (parse_url.has_value() ^ parse_url_aggregator.has_value()) {
printf("Source used to parse: %s", source.c_str());
abort();
}
}

if (parse_url) {
Expand Down

0 comments on commit 315ea60

Please sign in to comment.