celina/core/utf8_utils

UTF-8 Processing Utilities

This module contains pure business logic for UTF-8 character processing, shared between synchronous and asynchronous implementations.

No I/O operations are performed here - only data validation and parsing.

Types

Utf8ValidationResult = object
  isValid*: bool
  expectedBytes*: int
  errorMessage*: string
Result of UTF-8 validation

Procs

proc buildUtf8String(firstByte: byte; continuationBytes: openArray[byte]): string {.
    ...raises: [], tags: [], forbids: [].}

Build a UTF-8 string from first byte and continuation bytes

This is a pure function that doesn't perform I/O - it just constructs the string from the provided bytes.

Example:

let s = buildUtf8String(0xC3.byte, [0xA9.byte])  # é
assert s == "é"

proc isUtf8ContinuationByte(b: byte): bool {.inline, ...raises: [], tags: [],
    forbids: [].}

Check if a byte is a valid UTF-8 continuation byte (10xxxxxx)

Example:

assert isUtf8ContinuationByte(0x80)  # true
assert isUtf8ContinuationByte(0xBF)  # true
assert not isUtf8ContinuationByte(0xC0)  # false

proc truncateUtf8(s: string; maxBytes: int): string {....raises: [], tags: [],
    forbids: [].}

Truncate a UTF-8 string to at most maxBytes, ensuring valid UTF-8

This will not split multi-byte characters - it truncates at character boundaries.

Example:

assert truncateUtf8("hello", 3) == "hel"
assert truncateUtf8("こんにちは", 7) == "こん"  # 6 bytes (2 chars)

proc utf8ByteLength(firstByte: byte): int {....raises: [], tags: [], forbids: [].}

Determine the number of bytes in a UTF-8 character from its first byte. Returns 1 for ASCII, 2-4 for multi-byte characters, and 0 for an invalid byte.

Example:

assert utf8ByteLength(0x41) == 1  # 'A' (ASCII)
assert utf8ByteLength(0xC3) == 2  # Start of 2-byte UTF-8
assert utf8ByteLength(0xE0) == 3  # Start of 3-byte UTF-8
assert utf8ByteLength(0xF0) == 4  # Start of 4-byte UTF-8
assert utf8ByteLength(0xFF) == 0  # Invalid

proc utf8CharLength(s: string): int {....raises: [], tags: [], forbids: [].}

Get the number of UTF-8 characters in a string (not bytes)

Example:

assert utf8CharLength("hello") == 5
assert utf8CharLength("こんにちは") == 5  # 5 characters, 15 bytes

proc validateUtf8Sequence(bytes: openArray[byte]): Utf8ValidationResult {.
    ...raises: [], tags: [], forbids: [].}

Validate a UTF-8 byte sequence

Returns validation result with detailed error information