Source code for kerb.tokenizer.utils

"""Tokenizer utility functions for token estimation and cost calculation.

This module provides utility functions for:
- Converting between tokens and characters
- Estimating API costs based on token usage
- Optimizing token usage in applications
"""

from typing import Optional

from .tokenizer import Tokenizer


def tokens_to_chars(
    token_count: int, tokenizer: Tokenizer = Tokenizer.CL100K_BASE
) -> int:
    """Estimate how many characters a given number of tokens represents.

    The estimate is ``token_count`` multiplied by a fixed characters-per-token
    ratio chosen from the tokenizer:

    - ``CL100K_BASE``, ``P50K_BASE``, ``R50K_BASE``, ``CHAR_4``: 4
    - ``CHAR_5``: 5
    - ``WORD``: 6 (average word length plus a space)
    - anything else: 4

    Args:
        token_count (int): Number of tokens.
        tokenizer (Tokenizer): Tokenizer used to pick the ratio.
            Defaults to Tokenizer.CL100K_BASE.

    Returns:
        int: Estimated character count.

    Examples:
        >>> tokens_to_chars(100, tokenizer=Tokenizer.CL100K_BASE)
        400
    """
    four_char_tokenizers = (
        Tokenizer.CL100K_BASE,
        Tokenizer.P50K_BASE,
        Tokenizer.R50K_BASE,
        Tokenizer.CHAR_4,
    )

    # Start from the fallback ratio; the branches below override it.
    ratio = 4
    if tokenizer in four_char_tokenizers:
        ratio = 4
    elif tokenizer in (Tokenizer.CHAR_5,):
        ratio = 5
    elif tokenizer == Tokenizer.WORD:
        # Average word length plus the separating space.
        ratio = 6

    return token_count * ratio
def chars_to_tokens(
    char_count: int, tokenizer: Tokenizer = Tokenizer.CL100K_BASE
) -> int:
    """Estimate how many tokens a given number of characters represents.

    The estimate is ``char_count`` floor-divided by a fixed
    characters-per-token ratio chosen from the tokenizer:

    - ``CL100K_BASE``, ``P50K_BASE``, ``R50K_BASE``, ``CHAR_4``: 4
    - ``CHAR_5``: 5
    - ``WORD``: 6 (average word length plus a space)
    - anything else: 4

    Args:
        char_count (int): Number of characters.
        tokenizer (Tokenizer): Tokenizer used to pick the ratio.
            Defaults to Tokenizer.CL100K_BASE.

    Returns:
        int: Estimated token count, rounded down.

    Examples:
        >>> chars_to_tokens(400, tokenizer=Tokenizer.CL100K_BASE)
        100
    """
    four_char_tokenizers = (
        Tokenizer.CL100K_BASE,
        Tokenizer.P50K_BASE,
        Tokenizer.R50K_BASE,
        Tokenizer.CHAR_4,
    )

    # Start from the fallback ratio; the branches below override it.
    ratio = 4
    if tokenizer in four_char_tokenizers:
        ratio = 4
    elif tokenizer in (Tokenizer.CHAR_5,):
        ratio = 5
    elif tokenizer == Tokenizer.WORD:
        # Average word length plus the separating space.
        ratio = 6

    # Floor division: a partial trailing token is not counted.
    return char_count // ratio
def estimate_cost(
    token_count: int, model: str = "gpt-4o", is_input: bool = True
) -> float:
    """Estimate the API cost in USD for a number of tokens.

    Rates are looked up per 1K tokens from a built-in table. A model name
    that is not in the table is priced as "gpt-4o".

    Args:
        token_count (int): Number of tokens.
        model (str): Model name used for the rate lookup.
            Defaults to "gpt-4o".
        is_input (bool): True to use the input rate, False to use the
            output rate. Defaults to True.

    Returns:
        float: Estimated cost in USD.

    Examples:
        >>> estimate_cost(1000, model="gpt-4o", is_input=True)
        0.005
        >>> estimate_cost(1000, model="gpt-4o-mini", is_input=False)
        0.0006

    Note:
        Pricing is approximate and may change. Check official pricing
        for accuracy.
    """
    # Approximate USD rates per 1K tokens (as of 2026); keep these in sync
    # with current provider pricing.
    rates_per_1k = {
        "gpt-4o": {"input": 0.005, "output": 0.015},
        "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
        "text-embedding-ada-002": {"input": 0.0001, "output": 0.0001},
    }

    # Unknown models fall back to the gpt-4o rates.
    model_rates = rates_per_1k.get(model, rates_per_1k["gpt-4o"])

    if is_input:
        rate = model_rates["input"]
    else:
        rate = model_rates["output"]

    return (token_count / 1000) * rate
def optimize_token_usage(
    text: str,
    max_tokens: Optional[int] = None,
    tokenizer: Tokenizer = Tokenizer.CL100K_BASE,
) -> dict:
    """Report token statistics for a text and flag an exceeded limit.

    Args:
        text (str): Text to analyze.
        max_tokens (Optional[int]): Maximum token limit. When given, the
            token count is compared against it.
        tokenizer (Tokenizer): Tokenizer passed to ``count_tokens``.
            Defaults to Tokenizer.CL100K_BASE.

    Returns:
        dict: Analysis results with the keys:
            - token_count: Value returned by ``count_tokens(text, tokenizer)``
            - char_count: ``len(text)``
            - tokens_per_char: token_count / char_count rounded to 4
              decimal places, or 0 for empty text
            - exceeds_limit: Whether token_count is greater than max_tokens
              (False when max_tokens is None)
            - suggested_action: Human-readable recommendation

    Examples:
        >>> result = optimize_token_usage("Hello world!", max_tokens=10)
        >>> result["token_count"]
        3
        >>> result["exceeds_limit"]
        False
    """
    # Imported at call time rather than at module import.
    from .tokenizer import count_tokens

    n_tokens = count_tokens(text, tokenizer)
    n_chars = len(text)

    # Guard the division so empty text yields a ratio of 0.
    if n_chars > 0:
        density = n_tokens / n_chars
    else:
        density = 0
    rounded_density = round(density, 4)

    over_limit = False
    advice = "Token usage is optimal"
    if max_tokens is not None:
        over_limit = n_tokens > max_tokens
        if over_limit:
            overflow = n_tokens - max_tokens
            advice = (
                f"Text exceeds limit by {overflow} tokens. "
                "Consider truncating or summarizing."
            )

    return {
        "token_count": n_tokens,
        "char_count": n_chars,
        "tokens_per_char": rounded_density,
        "exceeds_limit": over_limit,
        "suggested_action": advice,
    }