Offsets Alignment Examples
This page is a practical tutorial for applying the normalization and offsets contract. For the normative specification, see Normalization and Offsets Contract.
Mental Model And Coordinate System
Offsets are always interpreted in the coordinate system of tokenization_text. tokenization_text may differ from clean_text when tokenizer intrinsic normalization is active.
Safe pattern with KeemenaPreprocessing output:
tokenization_text = tokenization_view(tokenizer, clean_text)
encode_result(tokenizer, tokenization_text; assume_normalized=true, return_offsets=true, ...)
Offset contract reminders:
- Offsets are 1-based UTF-8 codeunit half-open spans `(start, stop)`; `stop` is exclusive.
- Sentinel `(0, 0)` means "no span" and should be treated as non-aligning.
Example 1: Inspect encode_result Output
# Load a bundled SentencePiece Unigram tokenizer and build the text view whose
# coordinates the returned offsets refer to.
using KeemenaSubwords
tokenizer = load_tokenizer(:core_sentencepiece_unigram_en)
clean_text = "Hello, world! This is an offsets demo."
# tokenization_view applies the tokenizer's intrinsic normalization; the
# offsets below index into THIS string, not necessarily into clean_text.
tokenization_text = tokenization_view(tokenizer, clean_text)
result = encode_result(
tokenizer,
tokenization_text;
# Text was already normalized by tokenization_view, so skip re-normalizing.
assume_normalized = true,
add_special_tokens = true,
# Request per-token offsets and the special-tokens mask alongside the ids.
return_offsets = true,
return_masks = true,
)
@assert result.offsets !== nothing
@assert result.special_tokens_mask !== nothing
token_offsets = result.offsets
special_tokens_mask = result.special_tokens_mask
# One inspection row per token: id, string, span, the substring that span
# covers (or nothing when slicing is unsafe), plus the two per-token flags.
rows = [
(
token_index = i,
token_id = result.ids[i],
token_string = result.tokens[i],
offset = token_offsets[i],
# try_span_substring returns nothing for the (0, 0) sentinel and for spans
# that are not string-safe boundaries, so it doubles as a sanity check.
substring_or_nothing = try_span_substring(tokenization_text, token_offsets[i]),
is_special = special_tokens_mask[i] == 1,
has_span = has_nonempty_span(token_offsets[i]),
)
for i in eachindex(result.ids)
]
rows[1:min(end, 30)]

9-element Vector{@NamedTuple{token_index::Int64, token_id::Int64, token_string::String, offset::Tuple{Int64, Int64}, substring_or_nothing::String, is_special::Bool, has_span::Bool}}:
(token_index = 1, token_id = 2, token_string = "<s>", offset = (0, 0), substring_or_nothing = "", is_special = 1, has_span = 0)
(token_index = 2, token_id = 1, token_string = "<unk>", offset = (1, 7), substring_or_nothing = "Hello,", is_special = 1, has_span = 1)
(token_index = 3, token_id = 1, token_string = "<unk>", offset = (8, 14), substring_or_nothing = "world!", is_special = 1, has_span = 1)
(token_index = 4, token_id = 1, token_string = "<unk>", offset = (15, 19), substring_or_nothing = "This", is_special = 1, has_span = 1)
(token_index = 5, token_id = 1, token_string = "<unk>", offset = (20, 22), substring_or_nothing = "is", is_special = 1, has_span = 1)
(token_index = 6, token_id = 1, token_string = "<unk>", offset = (23, 25), substring_or_nothing = "an", is_special = 1, has_span = 1)
(token_index = 7, token_id = 1, token_string = "<unk>", offset = (26, 33), substring_or_nothing = "offsets", is_special = 1, has_span = 1)
(token_index = 8, token_id = 1, token_string = "<unk>", offset = (34, 39), substring_or_nothing = "demo.", is_special = 1, has_span = 1)
(token_index = 9, token_id = 3, token_string = "</s>", offset = (0, 0), substring_or_nothing = "", is_special = 1, has_span = 0)

How to interpret these rows:
- Tokens with offset `(0, 0)` are no-span tokens. They are usually inserted specials.
- `is_special` and `has_span` are related but not identical concepts. Align by span, not by mask alone.
- `substring_or_nothing` helps verify offsets quickly against `tokenization_text`.
- Use `tokenization_text` for offset-based slicing. Do not assume `clean_text` uses the same coordinates.
Example 2: Word Offsets And Subword-To-Word Mapping
"""
    whitespace_word_offsets(text) -> Vector{Tuple{Int,Int}}

Split `text` on whitespace and return one 1-based, half-open codeunit span
`(start, stop_exclusive)` per word, matching the offsets contract used by
the tokenizer output.
"""
function whitespace_word_offsets(text)::Vector{Tuple{Int,Int}}
    spans = Tuple{Int,Int}[]
    limit = ncodeunits(text) + 1  # exclusive upper bound in codeunits
    cursor = firstindex(text)
    while cursor < limit
        # Skip the run of whitespace preceding the next word, if any.
        while cursor < limit && isspace(text[cursor])
            cursor = nextind(text, cursor)
        end
        cursor < limit || return spans
        word_begin = cursor
        # Consume the word itself (nextind keeps multibyte text safe).
        while cursor < limit && !isspace(text[cursor])
            cursor = nextind(text, cursor)
        end
        push!(spans, (word_begin, cursor))
    end
    return spans
end
# Length of the intersection of two half-open spans; 0 when they are disjoint
# or merely touching.
function overlap_len(a_start, a_stop, b_start, b_stop)::Int
    lo = max(a_start, b_start)
    hi = min(a_stop, b_stop)
    return hi > lo ? hi - lo : 0
end
"""
    subword_to_word_index(word_offsets, subword_offset) -> Union{Nothing,Int}

Map one subword span onto a word index: the first word that fully contains
the span wins outright; otherwise the word with the largest overlap is
chosen (earliest word index on ties). Returns `nothing` for no-span tokens
and for spans that overlap no word at all.
"""
function subword_to_word_index(
    word_offsets::Vector{Tuple{Int,Int}},
    subword_offset::Tuple{Int,Int},
)::Union{Nothing,Int}
    has_nonempty_span(subword_offset) || return nothing
    s_start, s_stop = subword_offset
    # Pass 1: full containment takes priority.
    for (idx, (w_start, w_stop)) in pairs(word_offsets)
        (w_start <= s_start && s_stop <= w_stop) && return idx
    end
    # Pass 2: fall back to maximal overlap. Strict > keeps the earliest
    # word index when overlaps are equal.
    winner = nothing
    winner_overlap = 0
    for (idx, (w_start, w_stop)) in pairs(word_offsets)
        candidate = overlap_len(s_start, s_stop, w_start, w_stop)
        if candidate > winner_overlap
            winner_overlap = candidate
            winner = idx
        end
    end
    return winner_overlap > 0 ? winner : nothing
end
# Whitespace words of the SAME text the token offsets index into.
word_offsets = whitespace_word_offsets(tokenization_text)
# For each token span: the best word index, or nothing for no-span tokens.
token_to_word = map(token_offsets) do off
subword_to_word_index(word_offsets, off)
end
word_rows = [
(
word_index = i,
offset = word_offsets[i],
word_substring = try_span_substring(tokenization_text, word_offsets[i]),
)
for i in eachindex(word_offsets)
]
token_rows = [
(
token_index = i,
token_string = result.tokens[i],
offset = token_offsets[i],
word_index = token_to_word[i],
# Resolve the mapped word back to its substring for eyeball verification;
# nothing for tokens (e.g. inserted specials) that map to no word.
word_substring = token_to_word[i] === nothing ? nothing :
try_span_substring(tokenization_text, word_offsets[token_to_word[i]]),
)
for i in eachindex(result.tokens)
]
(
words = word_rows,
first_tokens = token_rows[1:min(end, 30)],
)

(words = [(word_index = 1, offset = (1, 7), word_substring = "Hello,"), (word_index = 2, offset = (8, 14), word_substring = "world!"), (word_index = 3, offset = (15, 19), word_substring = "This"), (word_index = 4, offset = (20, 22), word_substring = "is"), (word_index = 5, offset = (23, 25), word_substring = "an"), (word_index = 6, offset = (26, 33), word_substring = "offsets"), (word_index = 7, offset = (34, 39), word_substring = "demo.")], first_tokens = NamedTuple{(:token_index, :token_string, :offset, :word_index, :word_substring)}[(token_index = 1, token_string = "<s>", offset = (0, 0), word_index = nothing, word_substring = nothing), (token_index = 2, token_string = "<unk>", offset = (1, 7), word_index = 1, word_substring = "Hello,"), (token_index = 3, token_string = "<unk>", offset = (8, 14), word_index = 2, word_substring = "world!"), (token_index = 4, token_string = "<unk>", offset = (15, 19), word_index = 3, word_substring = "This"), (token_index = 5, token_string = "<unk>", offset = (20, 22), word_index = 4, word_substring = "is"), (token_index = 6, token_string = "<unk>", offset = (23, 25), word_index = 5, word_substring = "an"), (token_index = 7, token_string = "<unk>", offset = (26, 33), word_index = 6, word_substring = "offsets"), (token_index = 8, token_string = "<unk>", offset = (34, 39), word_index = 7, word_substring = "demo."), (token_index = 9, token_string = "</s>", offset = (0, 0), word_index = nothing, word_substring = nothing)])

Limitations of this tutorial mapping:
- Subword spans can overlap multiple words in some normalization and punctuation situations.
- This example returns one best word index (full containment first, else maximum overlap).
- Equal-overlap ties are resolved to the earliest word index.
- If you need multi-word mapping, return all overlapping word indices instead of one index.
Example 3: Special Tokens Policy For Alignment
# Pragmatic policy: a token participates in alignment iff it has a real span.
# The is_special argument is accepted but deliberately ignored — align by
# span, not by mask (see the policy summary below this example).
participates_in_alignment(offset, is_special)::Bool = has_nonempty_span(offset)
alignment_rows = [
(
token_index = i,
token_string = result.tokens[i],
offset = token_offsets[i],
is_special = special_tokens_mask[i] == 1,
participates = participates_in_alignment(token_offsets[i], special_tokens_mask[i] == 1),
)
for i in eachindex(result.tokens)
]
# Show up to 10 tokens per group: skipped (no span) vs participating.
(
skipped = [row for row in alignment_rows if !row.participates][1:min(end, 10)],
participating = [row for row in alignment_rows if row.participates][1:min(end, 10)],
)

(skipped = @NamedTuple{token_index::Int64, token_string::String, offset::Tuple{Int64, Int64}, is_special::Bool, participates::Bool}[(token_index = 1, token_string = "<s>", offset = (0, 0), is_special = 1, participates = 0), (token_index = 9, token_string = "</s>", offset = (0, 0), is_special = 1, participates = 0)], participating = @NamedTuple{token_index::Int64, token_string::String, offset::Tuple{Int64, Int64}, is_special::Bool, participates::Bool}[(token_index = 2, token_string = "<unk>", offset = (1, 7), is_special = 1, participates = 1), (token_index = 3, token_string = "<unk>", offset = (8, 14), is_special = 1, participates = 1), (token_index = 4, token_string = "<unk>", offset = (15, 19), is_special = 1, participates = 1), (token_index = 5, token_string = "<unk>", offset = (20, 22), is_special = 1, participates = 1), (token_index = 6, token_string = "<unk>", offset = (23, 25), is_special = 1, participates = 1), (token_index = 7, token_string = "<unk>", offset = (26, 33), is_special = 1, participates = 1), (token_index = 8, token_string = "<unk>", offset = (34, 39), is_special = 1, participates = 1)])

Policy summary:
- Pragmatic default: participate in alignment iff `has_nonempty_span(offset)`.
- Inserted special tokens usually have offset `(0, 0)` and are skipped automatically.
Example 4: Byte-Level Caveat And Safe Extraction
Byte-level tokenizers can produce offsets that are valid codeunit spans but are not always safe Julia string slicing boundaries on multibyte text.
When you consume offsets, use this safe pattern:
# non-executable byte-level pattern
substring = try_span_substring(tokenization_text, offset)
if substring === nothing && has_nonempty_span(offset)
bytes = span_codeunits(tokenization_text, offset)
# Use bytes in a byte-aware path when boundaries are not string-safe.
end

This fallback keeps alignment pipelines robust across both string-safe and byte-level offset cases.
Example 5: Map A Labeled Span To Token Indices
"""
    token_indices_overlapping_span(offsets, span) -> Vector{Int}

Return the indices of all tokens whose half-open span overlaps `span`.
No-span `(0, 0)` tokens never match; an empty or inverted `span` yields
an empty result.
"""
function token_indices_overlapping_span(
    offsets::Vector{Tuple{Int,Int}},
    span::Tuple{Int,Int},
)::Vector{Int}
    lo, hi = span
    hi > lo || return Int[]
    # Half-open overlap test: intersection is nonempty iff its upper bound
    # strictly exceeds its lower bound.
    return [
        idx for (idx, off) in pairs(offsets) if
        has_nonempty_span(off) && min(off[2], hi) > max(off[1], lo)
    ]
end
# Locate a labeled surface form in the tokenization text, convert the
# character range to a half-open codeunit span, and project it onto tokens.
labeled_range = findfirst("offsets", tokenization_text)
@assert labeled_range !== nothing
labeled_span = (
first(labeled_range),
# findfirst ranges are inclusive; nextind makes the stop bound exclusive
# and multibyte-safe, matching the offsets contract.
nextind(tokenization_text, last(labeled_range)),
)
overlapping_token_indices = token_indices_overlapping_span(token_offsets, labeled_span)
overlap_rows = [
(
token_index = i,
token_string = result.tokens[i],
token_offset = token_offsets[i],
token_substring = try_span_substring(tokenization_text, token_offsets[i]),
)
for i in overlapping_token_indices
]
(
labeled_span = labeled_span,
labeled_substring = try_span_substring(tokenization_text, labeled_span),
overlapping_token_indices = overlapping_token_indices,
overlapping_tokens = overlap_rows,
)

(labeled_span = (26, 33), labeled_substring = "offsets", overlapping_token_indices = [7], overlapping_tokens = [(token_index = 7, token_string = "<unk>", token_offset = (26, 33), token_substring = "offsets")])

This pattern is useful for projecting character/codeunit span labels onto token indices for training targets.