Source code for kerb.tokenizer.tokenizer

"""Core token counting utilities for LLM applications.

This module provides flexible token counting for various LLM models and tokenizers,
supporting OpenAI models (via tiktoken), HuggingFace models (via transformers),
and approximation methods for quick estimates.
"""

import warnings
from enum import Enum
from functools import lru_cache
from typing import List, Optional, Union


class Tokenizer(Enum):
    """Closed set of tokenizers understood by the token-counting helpers.

    Selecting a tokenizer explicitly (rather than passing a model name)
    keeps counts reproducible. Each member's value is either a tiktoken
    encoding name or an ``approximate_*`` marker for a heuristic estimate.

    Tiktoken encodings (the value is the encoding name given to tiktoken):
        CL100K_BASE: ``cl100k_base`` (GPT-4 / GPT-3.5-turbo,
            text-embedding-ada-002)
        P50K_BASE: ``p50k_base`` (Codex, text-davinci-002, text-davinci-003)
        R50K_BASE: ``r50k_base`` (GPT-3: davinci, curie, babbage, ada)
        P50K_EDIT: ``p50k_edit`` (edit models)

    Approximation methods (no external library required):
        CHAR_4: estimate at 4 characters per token (GPT-like models)
        CHAR_5: estimate at 5 characters per token (BERT-like models)
        WORD: estimate at roughly 1.3 tokens per whitespace-separated word
    """

    # NOTE(review): earlier docs listed GPT-4o under CL100K_BASE; tiktoken's
    # model table maps GPT-4o to o200k_base, which has no member here - confirm.

    # Tiktoken encodings
    CL100K_BASE = "cl100k_base"
    P50K_BASE = "p50k_base"
    R50K_BASE = "r50k_base"
    P50K_EDIT = "p50k_edit"

    # Approximation methods
    CHAR_4 = "approximate_char_4"
    CHAR_5 = "approximate_char_5"
    WORD = "approximate_word"

    @property
    def method(self) -> str:
        """Return ``"approximate"`` for heuristic members, else ``"tiktoken"``."""
        is_heuristic = self.value.startswith("approximate")
        return "approximate" if is_heuristic else "tiktoken"
def count_tokens(
    text: str, tokenizer: Union[Tokenizer, str] = Tokenizer.CL100K_BASE
) -> int:
    """Return the number of tokens in ``text`` for the chosen tokenizer.

    Args:
        text (str): Text to measure. Empty (falsy) text counts as 0 tokens
            without consulting any tokenizer.
        tokenizer (Union[Tokenizer, str]): Either a ``Tokenizer`` member or a
            plain string, which is treated as a HuggingFace model name
            (e.g., "bert-base-uncased", "meta-llama/Llama-2-7b-hf").
            Defaults to Tokenizer.CL100K_BASE.

    Returns:
        int: Token count

    Raises:
        ValueError: If ``tokenizer`` is neither a ``Tokenizer`` member nor a
            string.

    Examples:
        >>> count_tokens("Hello world!", tokenizer=Tokenizer.CL100K_BASE)
        3
        >>> count_tokens("Hello world!", tokenizer=Tokenizer.P50K_BASE)
        3
        >>> count_tokens("Hello world!", tokenizer=Tokenizer.CHAR_4)
        3
    """
    if not text:
        return 0

    if isinstance(tokenizer, Tokenizer):
        # Enum members route on their method family.
        strategy = tokenizer.method
        if strategy == "tiktoken":
            return _count_tokens_tiktoken(text, tokenizer.value)
        if strategy == "approximate":
            return _count_tokens_approximate(text, tokenizer)
    elif isinstance(tokenizer, str):
        # Any bare string is handed to transformers as a model name.
        return _count_tokens_transformers(text, tokenizer)

    raise ValueError(f"Invalid tokenizer: {tokenizer}")
def batch_count_tokens(
    texts: List[str], tokenizer: Union[Tokenizer, str] = Tokenizer.CL100K_BASE
) -> List[int]:
    """Apply ``count_tokens`` to every text and collect the results in order.

    Args:
        texts (List[str]): Texts to measure.
        tokenizer (Union[Tokenizer, str]): Tokenizer forwarded unchanged to
            ``count_tokens`` for each text. Defaults to Tokenizer.CL100K_BASE.

    Returns:
        List[int]: One token count per input text, in the same order.

    Examples:
        >>> texts = ["Hello world!", "How are you?", "Good morning!"]
        >>> batch_count_tokens(texts, tokenizer=Tokenizer.CL100K_BASE)
        [3, 4, 3]
    """
    counts = [count_tokens(entry, tokenizer) for entry in texts]
    return counts
def count_tokens_for_messages(
    messages: List[dict], tokenizer: Union[Tokenizer, str] = Tokenizer.CL100K_BASE
) -> int:
    """Count tokens for a list of chat messages, including format overhead.

    Chat models wrap every message in a few formatting tokens; this adds a
    fixed overhead per message plus a fixed amount for priming the reply.

    When tiktoken is importable, every value of every message dict is
    encoded (``None`` values contribute no tokens). The encoding is chosen
    as follows:

    * a tiktoken ``Tokenizer`` member uses its own encoding;
    * a string is looked up with ``tiktoken.encoding_for_model`` and falls
      back to ``cl100k_base`` when tiktoken does not know the name;
    * anything else, including the approximation members, uses
      ``cl100k_base``.

    When tiktoken is missing, a warning is issued and only each message's
    ``content`` is counted via ``count_tokens``, plus 4 tokens per message
    and 3 for reply priming.

    Args:
        messages (List[dict]): List of message dicts with 'role' and
            'content' keys. Example: [{"role": "user", "content": "Hello!"}]
        tokenizer (Union[Tokenizer, str]): Tokenizer to use.
            Defaults to Tokenizer.CL100K_BASE.

    Returns:
        int: Total token count including message formatting overhead;
        0 for an empty message list.

    Examples:
        >>> messages = [
        ...     {"role": "system", "content": "You are a helpful assistant."},
        ...     {"role": "user", "content": "Hello!"}
        ... ]
        >>> total = count_tokens_for_messages(messages, tokenizer=Tokenizer.CL100K_BASE)
    """
    if not messages:
        return 0

    # Try using tiktoken for accurate counting
    try:
        import tiktoken

        if isinstance(tokenizer, Tokenizer) and tokenizer.method == "tiktoken":
            encoding_obj = tiktoken.get_encoding(tokenizer.value)
        elif isinstance(tokenizer, str):
            # A string is tried as a model name known to tiktoken; unknown
            # names (e.g. HuggingFace models) fall back to cl100k_base.
            try:
                encoding_obj = tiktoken.encoding_for_model(tokenizer)
            except KeyError:
                encoding_obj = tiktoken.get_encoding("cl100k_base")
        else:
            # Approximation members and unsupported types use cl100k_base.
            encoding_obj = tiktoken.get_encoding("cl100k_base")

        # Token overhead is consistent for cl100k_base encoding
        tokens_per_message = 3
        tokens_per_name = 1

        num_tokens = 0
        for message in messages:
            num_tokens += tokens_per_message
            for key, value in message.items():
                # A null field (e.g. content on a function-call message) must
                # not be counted as the literal text "None".
                if value is not None:
                    num_tokens += len(encoding_obj.encode(str(value)))
                if key == "name":
                    num_tokens += tokens_per_name

        num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
        return num_tokens

    except ImportError:
        warnings.warn(
            "tiktoken not installed. Using approximation for message token counting. "
            "Install with: pip install tiktoken"
        )

        # Approximate counting
        total = 0
        for message in messages:
            content = message.get("content")
            # Coerce like the tiktoken path does so non-string content
            # (e.g. a list of parts) is measured as text; None counts as empty.
            content_text = "" if content is None else str(content)
            total += count_tokens(content_text, tokenizer)
            total += 4  # Overhead per message
        total += 3  # Reply priming tokens
        return total
def truncate_to_token_limit(
    text: str,
    max_tokens: int,
    tokenizer: Union[Tokenizer, str] = Tokenizer.CL100K_BASE,
    preserve_end: bool = False,
    ellipsis: str = "...",
) -> str:
    """Truncate text to fit within a token limit.

    Text that already fits (per ``count_tokens``) is returned unchanged.
    Otherwise:

    * For tiktoken ``Tokenizer`` members with tiktoken installed, the text
      is cut on token boundaries, reserving room for the ellipsis tokens.
    * In every other case the cut is character-based: the token limit is
      converted to a character budget (``tokens_to_chars`` for ``Tokenizer``
      members, 4 characters per token for string tokenizers) and the
      ellipsis length is taken out of that budget.

    In both paths, if the ellipsis alone uses up the whole budget, only the
    ellipsis is returned.

    Args:
        text (str): Text to truncate
        max_tokens (int): Maximum number of tokens
        tokenizer (Union[Tokenizer, str]): Tokenizer to use.
            Defaults to Tokenizer.CL100K_BASE.
        preserve_end (bool): If True, keep the end of text instead of the
            beginning (the ellipsis is then prepended). Defaults to False.
        ellipsis (str): String to indicate truncation. Defaults to "...".

    Returns:
        str: Truncated text; "" when ``text`` is empty.

    Examples:
        >>> text = "This is a long text that needs to be truncated."
        >>> truncate_to_token_limit(text, max_tokens=5, tokenizer=Tokenizer.CL100K_BASE)
        'This is a long...'
    """
    if not text:
        return ""

    current_tokens = count_tokens(text, tokenizer)
    if current_tokens <= max_tokens:
        return text

    # Try using tiktoken for accurate truncation
    if isinstance(tokenizer, Tokenizer) and tokenizer.method == "tiktoken":
        try:
            import tiktoken

            encoding_obj = tiktoken.get_encoding(tokenizer.value)
            tokens = encoding_obj.encode(text)

            # Account for ellipsis tokens
            ellipsis_tokens = len(encoding_obj.encode(ellipsis))
            available_tokens = max_tokens - ellipsis_tokens

            if available_tokens <= 0:
                return ellipsis

            if preserve_end:
                return ellipsis + encoding_obj.decode(tokens[-available_tokens:])
            return encoding_obj.decode(tokens[:available_tokens]) + ellipsis
        except ImportError:
            pass  # Fall through to character-based truncation

    # Fallback to character-based truncation
    if isinstance(tokenizer, Tokenizer):
        # Imported lazily: only this branch needs the character conversion.
        from .utils import tokens_to_chars

        char_limit = tokens_to_chars(max_tokens, tokenizer)
    else:
        char_limit = max_tokens * 4  # Default approximation

    if len(text) <= char_limit:
        return text

    keep_chars = char_limit - len(ellipsis)
    if keep_chars <= 0:
        # Without this guard, text[-0:] would return the whole text and a
        # negative bound would slice from the wrong end, blowing the limit.
        return ellipsis

    if preserve_end:
        return ellipsis + text[-keep_chars:]
    return text[:keep_chars] + ellipsis
# Private helper functions


def _count_tokens_tiktoken(text: str, encoding: str) -> int:
    """Count tokens with the named tiktoken encoding.

    If tiktoken cannot be imported, a warning is issued and the count falls
    back to the 4-characters-per-token approximation.

    Args:
        text (str): Text to encode.
        encoding (str): tiktoken encoding name (e.g. "cl100k_base").

    Returns:
        int: Number of tokens, or the CHAR_4 estimate without tiktoken.
    """
    try:
        import tiktoken

        enc = tiktoken.get_encoding(encoding)
        return len(enc.encode(text))
    except ImportError:
        warnings.warn(
            "tiktoken not installed. Using approximation. "
            "Install with: pip install tiktoken"
        )
        return _count_tokens_approximate(text, Tokenizer.CHAR_4)


@lru_cache(maxsize=8)
def _load_transformers_tokenizer(model: str):
    """Load a HuggingFace tokenizer for ``model``, memoised per model name.

    Only successful loads are cached; exceptions propagate to the caller and
    the load is retried on the next call.
    """
    from transformers import AutoTokenizer

    return AutoTokenizer.from_pretrained(model)


def _count_tokens_transformers(text: str, model: str) -> int:
    """Count tokens with a HuggingFace ``transformers`` tokenizer.

    The tokenizer is loaded once per model name and reused, so repeated
    calls (e.g. from ``batch_count_tokens``) do not reload it each time.
    Any failure to import transformers or to load or run the tokenizer
    results in a warning and the 5-characters-per-token approximation.

    Args:
        text (str): Text to encode.
        model (str): HuggingFace model name passed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        int: Length of the encoded token list, or the CHAR_5 estimate on
        failure.
    """
    try:
        hf_tokenizer = _load_transformers_tokenizer(model)
        return len(hf_tokenizer.encode(text))
    except ImportError:
        warnings.warn(
            f"transformers not installed. Using approximation for {model}. "
            "Install with: pip install transformers"
        )
        return _count_tokens_approximate(text, Tokenizer.CHAR_5)
    except Exception as e:
        # Deliberately broad: counting is best-effort, so any load or encode
        # failure degrades to an estimate instead of raising.
        warnings.warn(
            f"Could not load tokenizer for {model}: {e}. Using approximation."
        )
        return _count_tokens_approximate(text, Tokenizer.CHAR_5)


def _count_tokens_approximate(text: str, tokenizer: Tokenizer) -> int:
    """Estimate a token count from character or word counts.

    Needs no external library; accuracy varies by language and text type.

    Args:
        text (str): Text to estimate. Empty text yields 0.
        tokenizer (Tokenizer): CHAR_4 (length // 4), CHAR_5 (length // 5) or
            WORD (whitespace-separated words * 1.3, truncated to int).
            Any other value is treated like CHAR_4.

    Returns:
        int: Estimated token count.
    """
    if not text:
        return 0

    if tokenizer == Tokenizer.WORD:
        # Assume ~1.3 tokens per word on average
        return int(len(text.split()) * 1.3)

    # CHAR_5 divides by 5; CHAR_4 and anything unrecognised divide by 4.
    chars_per_token = 5 if tokenizer == Tokenizer.CHAR_5 else 4
    return len(text) // chars_per_token