Offsets Alignment Examples

This page is a practical tutorial for applying the normalization and offsets contract. For the normative specification, see Normalization and Offsets Contract.

Mental Model And Coordinate System

Offsets are always interpreted in the coordinate system of tokenization_text. tokenization_text may differ from clean_text when tokenizer intrinsic normalization is active.

Safe pattern with KeemenaPreprocessing output:

  1. tokenization_text = tokenization_view(tokenizer, clean_text)
  2. encode_result(tokenizer, tokenization_text; assume_normalized=true, return_offsets=true, ...)

Offset contract reminders:

  • Offsets are 1-based UTF-8 codeunit half-open spans (start, stop).
  • stop is exclusive.
  • Sentinel (0, 0) means "no span" and should be treated as non-aligning.

Example 1: Inspect encode_result Output

using KeemenaSubwords

tokenizer = load_tokenizer(:core_sentencepiece_unigram_en)
clean_text = "Hello, world! This is an offsets demo."
tokenization_text = tokenization_view(tokenizer, clean_text)

result = encode_result(
    tokenizer,
    tokenization_text;
    assume_normalized = true,
    add_special_tokens = true,
    return_offsets = true,
    return_masks = true,
)

@assert result.offsets !== nothing
@assert result.special_tokens_mask !== nothing

token_offsets = result.offsets
special_tokens_mask = result.special_tokens_mask

rows = [
    (
        token_index = i,
        token_id = result.ids[i],
        token_string = result.tokens[i],
        offset = token_offsets[i],
        substring_or_nothing = try_span_substring(tokenization_text, token_offsets[i]),
        is_special = special_tokens_mask[i] == 1,
        has_span = has_nonempty_span(token_offsets[i]),
    )
    for i in eachindex(result.ids)
]

rows[1:min(end, 30)]
9-element Vector{@NamedTuple{token_index::Int64, token_id::Int64, token_string::String, offset::Tuple{Int64, Int64}, substring_or_nothing::String, is_special::Bool, has_span::Bool}}:
 (token_index = 1, token_id = 2, token_string = "<s>", offset = (0, 0), substring_or_nothing = "", is_special = 1, has_span = 0)
 (token_index = 2, token_id = 1, token_string = "<unk>", offset = (1, 7), substring_or_nothing = "Hello,", is_special = 1, has_span = 1)
 (token_index = 3, token_id = 1, token_string = "<unk>", offset = (8, 14), substring_or_nothing = "world!", is_special = 1, has_span = 1)
 (token_index = 4, token_id = 1, token_string = "<unk>", offset = (15, 19), substring_or_nothing = "This", is_special = 1, has_span = 1)
 (token_index = 5, token_id = 1, token_string = "<unk>", offset = (20, 22), substring_or_nothing = "is", is_special = 1, has_span = 1)
 (token_index = 6, token_id = 1, token_string = "<unk>", offset = (23, 25), substring_or_nothing = "an", is_special = 1, has_span = 1)
 (token_index = 7, token_id = 1, token_string = "<unk>", offset = (26, 33), substring_or_nothing = "offsets", is_special = 1, has_span = 1)
 (token_index = 8, token_id = 1, token_string = "<unk>", offset = (34, 39), substring_or_nothing = "demo.", is_special = 1, has_span = 1)
 (token_index = 9, token_id = 3, token_string = "</s>", offset = (0, 0), substring_or_nothing = "", is_special = 1, has_span = 0)

How to interpret these rows:

  • Tokens with offset (0, 0) are no-span tokens. They are usually inserted specials.
  • is_special and has_span are related but not identical concepts. Align by span, not by mask alone.
  • substring_or_nothing helps verify offsets quickly against tokenization_text.
  • Use tokenization_text for offset-based slicing. Do not assume clean_text uses the same coordinates.

Example 2: Word Offsets And Subword-To-Word Mapping

function whitespace_word_offsets(text)::Vector{Tuple{Int,Int}}
    offsets = Tuple{Int,Int}[]
    stop_exclusive = ncodeunits(text) + 1
    i = firstindex(text)

    while i < stop_exclusive
        while i < stop_exclusive && isspace(text[i])
            i = nextind(text, i)
        end
        i < stop_exclusive || break

        word_start = i
        while i < stop_exclusive && !isspace(text[i])
            i = nextind(text, i)
        end
        word_stop = i
        push!(offsets, (word_start, word_stop))
    end

    return offsets
end

overlap_len(a_start, a_stop, b_start, b_stop)::Int =
    max(0, min(a_stop, b_stop) - max(a_start, b_start))

function subword_to_word_index(
    word_offsets::Vector{Tuple{Int,Int}},
    subword_offset::Tuple{Int,Int},
)::Union{Nothing,Int}
    has_nonempty_span(subword_offset) || return nothing
    sub_start, sub_stop = subword_offset

    for (word_index, (word_start, word_stop)) in pairs(word_offsets)
        if sub_start >= word_start && sub_stop <= word_stop
            return word_index
        end
    end

    best_index = nothing
    best_overlap = 0
    for (word_index, (word_start, word_stop)) in pairs(word_offsets)
        current_overlap = overlap_len(sub_start, sub_stop, word_start, word_stop)
        # Strict > means equal-overlap ties keep the earliest word index.
        if current_overlap > best_overlap
            best_overlap = current_overlap
            best_index = word_index
        end
    end

    return best_overlap > 0 ? best_index : nothing
end

word_offsets = whitespace_word_offsets(tokenization_text)
token_to_word = map(token_offsets) do off
    subword_to_word_index(word_offsets, off)
end

word_rows = [
    (
        word_index = i,
        offset = word_offsets[i],
        word_substring = try_span_substring(tokenization_text, word_offsets[i]),
    )
    for i in eachindex(word_offsets)
]

token_rows = [
    (
        token_index = i,
        token_string = result.tokens[i],
        offset = token_offsets[i],
        word_index = token_to_word[i],
        word_substring = token_to_word[i] === nothing ? nothing :
            try_span_substring(tokenization_text, word_offsets[token_to_word[i]]),
    )
    for i in eachindex(result.tokens)
]

(
    words = word_rows,
    first_tokens = token_rows[1:min(end, 30)],
)
(words = [(word_index = 1, offset = (1, 7), word_substring = "Hello,"), (word_index = 2, offset = (8, 14), word_substring = "world!"), (word_index = 3, offset = (15, 19), word_substring = "This"), (word_index = 4, offset = (20, 22), word_substring = "is"), (word_index = 5, offset = (23, 25), word_substring = "an"), (word_index = 6, offset = (26, 33), word_substring = "offsets"), (word_index = 7, offset = (34, 39), word_substring = "demo.")], first_tokens = NamedTuple{(:token_index, :token_string, :offset, :word_index, :word_substring)}[(token_index = 1, token_string = "<s>", offset = (0, 0), word_index = nothing, word_substring = nothing), (token_index = 2, token_string = "<unk>", offset = (1, 7), word_index = 1, word_substring = "Hello,"), (token_index = 3, token_string = "<unk>", offset = (8, 14), word_index = 2, word_substring = "world!"), (token_index = 4, token_string = "<unk>", offset = (15, 19), word_index = 3, word_substring = "This"), (token_index = 5, token_string = "<unk>", offset = (20, 22), word_index = 4, word_substring = "is"), (token_index = 6, token_string = "<unk>", offset = (23, 25), word_index = 5, word_substring = "an"), (token_index = 7, token_string = "<unk>", offset = (26, 33), word_index = 6, word_substring = "offsets"), (token_index = 8, token_string = "<unk>", offset = (34, 39), word_index = 7, word_substring = "demo."), (token_index = 9, token_string = "</s>", offset = (0, 0), word_index = nothing, word_substring = nothing)])

Limitations of this tutorial mapping:

  • Subword spans can overlap multiple words in some normalization and punctuation situations.
  • This example returns one best word index (full containment first, else maximum overlap).
  • Equal-overlap ties are resolved to the earliest word index.
  • If you need multi-word mapping, return all overlapping word indices instead of one index.

Example 3: Special Tokens Policy For Alignment

participates_in_alignment(offset, is_special)::Bool = has_nonempty_span(offset)

alignment_rows = [
    (
        token_index = i,
        token_string = result.tokens[i],
        offset = token_offsets[i],
        is_special = special_tokens_mask[i] == 1,
        participates = participates_in_alignment(token_offsets[i], special_tokens_mask[i] == 1),
    )
    for i in eachindex(result.tokens)
]

(
    skipped = [row for row in alignment_rows if !row.participates][1:min(end, 10)],
    participating = [row for row in alignment_rows if row.participates][1:min(end, 10)],
)
(skipped = @NamedTuple{token_index::Int64, token_string::String, offset::Tuple{Int64, Int64}, is_special::Bool, participates::Bool}[(token_index = 1, token_string = "<s>", offset = (0, 0), is_special = 1, participates = 0), (token_index = 9, token_string = "</s>", offset = (0, 0), is_special = 1, participates = 0)], participating = @NamedTuple{token_index::Int64, token_string::String, offset::Tuple{Int64, Int64}, is_special::Bool, participates::Bool}[(token_index = 2, token_string = "<unk>", offset = (1, 7), is_special = 1, participates = 1), (token_index = 3, token_string = "<unk>", offset = (8, 14), is_special = 1, participates = 1), (token_index = 4, token_string = "<unk>", offset = (15, 19), is_special = 1, participates = 1), (token_index = 5, token_string = "<unk>", offset = (20, 22), is_special = 1, participates = 1), (token_index = 6, token_string = "<unk>", offset = (23, 25), is_special = 1, participates = 1), (token_index = 7, token_string = "<unk>", offset = (26, 33), is_special = 1, participates = 1), (token_index = 8, token_string = "<unk>", offset = (34, 39), is_special = 1, participates = 1)])

Policy summary:

  • Pragmatic default: participate in alignment iff has_nonempty_span(offset).
  • Inserted special tokens usually have (0, 0) and are skipped automatically.

Example 4: Byte-Level Caveat And Safe Extraction

Byte-level tokenizers can produce offsets that are valid codeunit spans but are not always safe Julia string slicing boundaries on multibyte text.

When you consume offsets, use this safe pattern:

# non-executable byte-level pattern
substring = try_span_substring(tokenization_text, offset)

if substring === nothing && has_nonempty_span(offset)
    bytes = span_codeunits(tokenization_text, offset)
    # Use bytes in a byte-aware path when boundaries are not string-safe.
end

This fallback keeps alignment pipelines robust across both string-safe and byte-level offset cases.

Example 5: Map A Labeled Span To Token Indices

function token_indices_overlapping_span(
    offsets::Vector{Tuple{Int,Int}},
    span::Tuple{Int,Int},
)::Vector{Int}
    span_start, span_stop = span
    span_stop > span_start || return Int[]

    overlaps = Int[]
    for (token_index, offset) in pairs(offsets)
        has_nonempty_span(offset) || continue
        token_start, token_stop = offset
        if min(token_stop, span_stop) > max(token_start, span_start)
            push!(overlaps, token_index)
        end
    end
    return overlaps
end

labeled_range = findfirst("offsets", tokenization_text)
@assert labeled_range !== nothing

labeled_span = (
    first(labeled_range),
    nextind(tokenization_text, last(labeled_range)),
)
overlapping_token_indices = token_indices_overlapping_span(token_offsets, labeled_span)

overlap_rows = [
    (
        token_index = i,
        token_string = result.tokens[i],
        token_offset = token_offsets[i],
        token_substring = try_span_substring(tokenization_text, token_offsets[i]),
    )
    for i in overlapping_token_indices
]

(
    labeled_span = labeled_span,
    labeled_substring = try_span_substring(tokenization_text, labeled_span),
    overlapping_token_indices = overlapping_token_indices,
    overlapping_tokens = overlap_rows,
)
(labeled_span = (26, 33), labeled_substring = "offsets", overlapping_token_indices = [7], overlapping_tokens = [(token_index = 7, token_string = "<unk>", token_offset = (26, 33), token_substring = "offsets")])

This pattern is useful for projecting character/codeunit span labels onto token indices for training targets.