Multimodal Module

Multi-modal processing utilities for LLM applications.

This module provides comprehensive multi-modal capabilities for working with images, audio, video, and vision models in LLM applications.

Usage Examples:

# Common imports - most frequently used
from kerb.multimodal import (
    # Enums
    MediaType, ImageFormat, AudioFormat, VideoFormat,
    VisionModel, TranscriptionModel,
    # Data classes
    ImageInfo, AudioInfo, VideoInfo, TranscriptionResult, VisionAnalysis,
    # Common utilities
    detect_media_type, validate_media_file,
)

# Vision - image processing and analysis
from kerb.multimodal.vision import (
    load_image, get_image_info, image_to_base64,
    analyze_image_with_vision_model, embed_multimodal,
)

# Audio - transcription and processing
from kerb.multimodal.audio import (
    transcribe_audio, get_audio_info, convert_audio_format,
)

# Video - frame extraction and processing
from kerb.multimodal.video import (
    extract_video_frames, get_video_info, create_video_thumbnail,
)

# Prompts - multi-modal prompt construction
from kerb.multimodal.prompts import (
    build_multimodal_prompt,
    build_anthropic_multimodal_content,
    build_google_multimodal_content,
)

For image editing (resize, crop, rotate, grid), use PIL/Pillow directly:

from PIL import Image
img = Image.open("photo.jpg")
img = img.resize((800, 600))
img = img.rotate(90)
img.save("edited.jpg")
class kerb.multimodal.MediaType(*values)[source]

Bases: Enum

Supported media types.

IMAGE = 'image'
AUDIO = 'audio'
VIDEO = 'video'
UNKNOWN = 'unknown'
class kerb.multimodal.ImageFormat(*values)[source]

Bases: Enum

Supported image formats.

JPEG = 'jpeg'
PNG = 'png'
WEBP = 'webp'
GIF = 'gif'
BMP = 'bmp'
TIFF = 'tiff'
SVG = 'svg'
class kerb.multimodal.AudioFormat(*values)[source]

Bases: Enum

Supported audio formats.

MP3 = 'mp3'
WAV = 'wav'
M4A = 'm4a'
FLAC = 'flac'
OGG = 'ogg'
OPUS = 'opus'
AAC = 'aac'
class kerb.multimodal.VideoFormat(*values)[source]

Bases: Enum

Supported video formats.

MP4 = 'mp4'
AVI = 'avi'
MOV = 'mov'
MKV = 'mkv'
WEBM = 'webm'
FLV = 'flv'
class kerb.multimodal.VisionModel(*values)[source]

Bases: Enum

Supported vision models.

GPT4_VISION = 'gpt-4-vision-preview'
GPT4O = 'gpt-4o'
GPT4O_MINI = 'gpt-4o-mini'
CLAUDE_3_5_SONNET = 'claude-3-5-sonnet-20241022'
CLAUDE_OPUS_4 = 'claude-opus-4'
CLAUDE_SONNET_4 = 'claude-sonnet-4'
CLAUDE_35_HAIKU = 'claude-3-5-haiku-20241022'
GEMINI_PRO_VISION = 'gemini-pro-vision'
GEMINI_2_5_PRO = 'gemini-2.5-pro'
GEMINI_2_5_FLASH = 'gemini-2.5-flash'
class kerb.multimodal.TranscriptionModel(*values)[source]

Bases: Enum

Supported transcription models.

WHISPER_TINY = 'whisper-tiny'
WHISPER_BASE = 'whisper-base'
WHISPER_SMALL = 'whisper-small'
WHISPER_MEDIUM = 'whisper-medium'
WHISPER_LARGE = 'whisper-large'
WHISPER_LARGE_V3 = 'whisper-large-v3'
OPENAI_WHISPER_1 = 'whisper-1'
class kerb.multimodal.EmbeddingModelMultimodal(*values)[source]

Bases: Enum

Supported multi-modal embedding models.

CLIP_VIT_B_32 = 'clip-vit-b-32'
CLIP_VIT_L_14 = 'clip-vit-l-14'
OPENAI_CLIP = 'openai/clip-vit-base-patch32'
IMAGEBIND = 'imagebind'
class kerb.multimodal.ImageInfo(width, height, format, mode, size_bytes, aspect_ratio, metadata=<factory>)[source]

Bases: object

Information about an image.

width: int
height: int
format: ImageFormat
mode: str
size_bytes: int
aspect_ratio: float
metadata: Dict[str, Any]
__init__(width, height, format, mode, size_bytes, aspect_ratio, metadata=<factory>)
class kerb.multimodal.AudioInfo(duration_seconds, sample_rate, channels, format, size_bytes, bitrate=None, metadata=<factory>)[source]

Bases: object

Information about an audio file.

duration_seconds: float
sample_rate: int
channels: int
format: AudioFormat
size_bytes: int
bitrate: int | None = None
metadata: Dict[str, Any]
__init__(duration_seconds, sample_rate, channels, format, size_bytes, bitrate=None, metadata=<factory>)
class kerb.multimodal.VideoInfo(width, height, duration_seconds, fps, frame_count, format, size_bytes, codec=None, has_audio=False, metadata=<factory>)[source]

Bases: object

Information about a video file.

width: int
height: int
duration_seconds: float
fps: float
frame_count: int
format: VideoFormat
size_bytes: int
codec: str | None = None
has_audio: bool = False
metadata: Dict[str, Any]
__init__(width, height, duration_seconds, fps, frame_count, format, size_bytes, codec=None, has_audio=False, metadata=<factory>)
class kerb.multimodal.TranscriptionResult(text, language=None, segments=None, confidence=None, duration=None, word_timestamps=None, metadata=<factory>)[source]

Bases: object

Result of audio transcription.

text: str
language: str | None = None
segments: List[Dict[str, Any]] | None = None
confidence: float | None = None
duration: float | None = None
word_timestamps: List[Dict[str, Any]] | None = None
metadata: Dict[str, Any]
__init__(text, language=None, segments=None, confidence=None, duration=None, word_timestamps=None, metadata=<factory>)
class kerb.multimodal.VisionAnalysis(description, objects=None, text_content=None, emotions=None, colors=None, confidence=None, metadata=<factory>)[source]

Bases: object

Result of vision model analysis.

description: str
objects: List[Dict[str, Any]] | None = None
text_content: str | None = None
emotions: List[str] | None = None
colors: List[str] | None = None
confidence: float | None = None
metadata: Dict[str, Any]
__init__(description, objects=None, text_content=None, emotions=None, colors=None, confidence=None, metadata=<factory>)
class kerb.multimodal.MultiModalContent(type, content, metadata=<factory>)[source]

Bases: object

Represents multi-modal content for prompts.

type: str
content: str | bytes | Dict[str, Any]
metadata: Dict[str, Any]
__init__(type, content, metadata=<factory>)
kerb.multimodal.detect_media_type(file_path)[source]

Detect media type from file extension.

Parameters:

file_path (str) – Path to the media file

Returns:

Detected media type

Return type:

MediaType

Examples

>>> detect_media_type("photo.jpg")
MediaType.IMAGE
>>> detect_media_type("audio.mp3")
MediaType.AUDIO
kerb.multimodal.get_mime_type(file_path)[source]

Get MIME type for a file.

Parameters:

file_path (str) – Path to the file

Returns:

MIME type (e.g., “image/jpeg”)

Return type:

str

Examples

>>> get_mime_type("photo.jpg")
'image/jpeg'
kerb.multimodal.validate_media_file(file_path, expected_type=None)[source]

Validate that a media file exists and is of expected type.

Parameters:
  • file_path (str) – Path to the media file

  • expected_type (Optional[MediaType]) – Expected media type (None to skip type check)

Returns:

True if valid, False otherwise

Return type:

bool

Examples

>>> validate_media_file("photo.jpg", MediaType.IMAGE)
True
kerb.multimodal.calculate_file_checksum(file_path, algorithm='md5')[source]

Calculate checksum of a media file.

Parameters:
  • file_path (str) – Path to the file

  • algorithm (str) – Hash algorithm (“md5”, “sha1”, “sha256”)

Returns:

Hexadecimal checksum string

Return type:

str

Examples

>>> checksum = calculate_file_checksum("video.mp4")
>>> len(checksum)
32
kerb.multimodal.load_image(file_path)[source]

Load an image from file.

Parameters:

file_path (str) – Path to the image file

Returns:

Loaded image object

Return type:

Any

Raises:

Examples

>>> img = load_image("photo.jpg")
>>> img.size
(1920, 1080)
kerb.multimodal.get_image_info(file_path)[source]

Get detailed information about an image.

Parameters:

file_path (str) – Path to the image file

Returns:

Image information object

Return type:

ImageInfo

Examples

>>> info = get_image_info("photo.jpg")
>>> print(f"{info.width}x{info.height}")
1920x1080
kerb.multimodal.convert_image_format(file_path, target_format, output_path=None, quality=85)[source]

Convert image to a different format.

Parameters:
  • file_path (str) – Path to the input image

  • target_format (Union[str, ImageFormat]) – Target format (e.g., “PNG”, “JPEG”)

  • output_path (Optional[str]) – Output path (auto-generated if None)

  • quality (int) – Quality for lossy formats

Returns:

Path to the converted image

Return type:

str

Examples

>>> convert_image_format("photo.png", "JPEG")
'photo.jpg'
kerb.multimodal.image_to_base64(file_path, include_prefix=True)[source]

Convert image to base64 string.

Parameters:
  • file_path (str) – Path to the image file

  • include_prefix (bool) – Whether to include data URI prefix

Returns:

Base64-encoded image string

Return type:

str

Examples

>>> b64 = image_to_base64("photo.jpg")
>>> b64[:30]
'data:image/jpeg;base64,/9j/4A'
kerb.multimodal.base64_to_image(b64_string, output_path)[source]

Convert base64 string to image file.

Parameters:
  • b64_string (str) – Base64-encoded image (with or without prefix)

  • output_path (str) – Path to save the image

Returns:

Path to the saved image

Return type:

str

Examples

>>> base64_to_image(b64_data, "output.jpg")
'output.jpg'
kerb.multimodal.extract_dominant_colors(file_path, num_colors=5)[source]

Extract dominant colors from an image.

Parameters:
  • file_path (str) – Path to the image file

  • num_colors (int) – Number of dominant colors to extract

Return type:

List[Tuple[int, int, int]]

Returns:

List of RGB tuples representing dominant colors

Examples

>>> colors = extract_dominant_colors("photo.jpg", 3)
>>> colors
[(45, 67, 89), (120, 130, 140), (200, 210, 220)]
kerb.multimodal.calculate_image_hash(file_path, hash_size=8)[source]

Calculate perceptual hash of an image for similarity comparison.

Parameters:
  • file_path (str) – Path to the image file

  • hash_size (int) – Size of hash (default 8 gives 64-bit hash)

Returns:

Hexadecimal hash string

Return type:

str

Examples

>>> hash1 = calculate_image_hash("photo1.jpg")
>>> hash2 = calculate_image_hash("photo2.jpg")
>>> hash1 == hash2  # Near-identical images usually produce the same hash
True
kerb.multimodal.analyze_image_with_vision_model(image_path, prompt, model=VisionModel.GPT4O, api_key=None, max_tokens=300)[source]

Analyze an image using a vision model.

Parameters:
  • image_path (str) – Path to the image file

  • prompt (str) – Text prompt/question about the image

  • model (Union[str, VisionModel]) – Vision model to use

  • api_key (Optional[str]) – API key for the model provider

  • max_tokens (int) – Maximum tokens in response

Returns:

Analysis result with description and metadata

Return type:

VisionAnalysis

Examples

>>> analysis = analyze_image_with_vision_model(
...     "photo.jpg",
...     "What objects are in this image?"
... )
>>> print(analysis.description)
The image contains a cat, a book, and a coffee mug on a table.
kerb.multimodal.embed_multimodal(content, content_type, model=EmbeddingModelMultimodal.CLIP_VIT_B_32, device='cpu')[source]

Generate multi-modal embeddings for images, audio, or text.

Parameters:
  • content (Union[str, bytes]) – Content to embed (file path for images/audio, text string for text)

  • content_type (str) – Type of content (“image”, “audio”, “text”)

  • model (Union[str, EmbeddingModelMultimodal]) – Embedding model to use

  • device (Union[Device, str]) – Device to run model on (Device enum or string: “cpu”, “cuda”, “cuda:0”, “cuda:1”, “mps”)

Return type:

List[float]

Returns:

List of embedding values

Examples

>>> from kerb.core.enums import Device
>>> embedding = embed_multimodal("photo.jpg", "image", device=Device.CUDA)
>>> len(embedding)
512
kerb.multimodal.compute_multimodal_similarity(embedding1, embedding2)[source]

Compute cosine similarity between two multi-modal embeddings.

Parameters:
  • embedding1 (List[float]) – First embedding vector

  • embedding2 (List[float]) – Second embedding vector

Returns:

Cosine similarity score (-1 to 1)

Return type:

float

Examples

>>> emb1 = embed_multimodal("photo1.jpg", "image")
>>> emb2 = embed_multimodal("photo2.jpg", "image")
>>> similarity = compute_multimodal_similarity(emb1, emb2)
>>> print(f"Similarity: {similarity:.3f}")
Similarity: 0.892
kerb.multimodal.get_audio_info(file_path)[source]

Get detailed information about an audio file.

Parameters:

file_path (str) – Path to the audio file

Returns:

Audio information object

Return type:

AudioInfo

Raises:

ImportError – If required audio library is not installed

Examples

>>> info = get_audio_info("audio.mp3")
>>> print(f"Duration: {info.duration_seconds}s")
Duration: 123.5s
kerb.multimodal.convert_audio_format(file_path, target_format, output_path=None, bitrate='192k')[source]

Convert audio to a different format.

Parameters:
  • file_path (str) – Path to the input audio

  • target_format (Union[str, AudioFormat]) – Target format (e.g., “mp3”, “wav”)

  • output_path (Optional[str]) – Output path (auto-generated if None)

  • bitrate (str) – Bitrate for lossy formats (e.g., “192k”)

Returns:

Path to the converted audio

Return type:

str

Raises:

ImportError – If pydub is not installed

Examples

>>> convert_audio_format("audio.wav", "mp3")
'audio.mp3'
kerb.multimodal.transcribe_audio(file_path, model=TranscriptionModel.OPENAI_WHISPER_1, language=None, api_key=None, return_timestamps=False, max_size_mb=25, max_duration_minutes=None)[source]

Transcribe audio to text using various models.

Parameters:
  • file_path (str) – Path to the audio file

  • model (Union[str, TranscriptionModel]) – Transcription model to use

  • language (Optional[str]) – Language code (None for auto-detect)

  • api_key (Optional[str]) – API key for cloud models (OpenAI)

  • return_timestamps (bool) – Whether to return word-level timestamps

  • max_size_mb (float) – Maximum file size in MB. Defaults to 25 (OpenAI limit).

  • max_duration_minutes (Optional[float]) – Maximum audio duration in minutes. None for no limit.

Returns:

Transcription result with text and metadata

Return type:

TranscriptionResult

Raises:

Examples

>>> result = transcribe_audio("audio.mp3")
>>> print(result.text)
Hello, this is a test transcription.
>>> # With size and duration limits
>>> result = transcribe_audio("audio.mp3", max_size_mb=50,
...                           max_duration_minutes=10)
async kerb.multimodal.transcribe_audio_async(file_path, model=TranscriptionModel.OPENAI_WHISPER_1, language=None, api_key=None, return_timestamps=False, max_size_mb=25, max_duration_minutes=None)[source]

Transcribe audio to text asynchronously using API models.

Parameters:
  • file_path (str) – Path to the audio file

  • model (Union[str, TranscriptionModel]) – Transcription model to use

  • language (Optional[str]) – Language code (None for auto-detect)

  • api_key (Optional[str]) – API key for cloud models (OpenAI)

  • return_timestamps (bool) – Whether to return word-level timestamps

  • max_size_mb (float) – Maximum file size in MB. Defaults to 25 (OpenAI limit).

  • max_duration_minutes (Optional[float]) – Maximum audio duration in minutes. None for no limit.

Returns:

Transcription result with text and metadata

Return type:

TranscriptionResult

Note

Asynchronous execution is currently supported only for the OpenAI Whisper API. Local Whisper models are run synchronously in a thread pool.

Examples

>>> import asyncio
>>> result = asyncio.run(transcribe_audio_async("audio.mp3"))
>>> print(result.text)
kerb.multimodal.extract_audio_from_video(video_path, output_path=None, audio_format='mp3')[source]

Extract audio track from video file.

Parameters:
  • video_path (str) – Path to the video file

  • output_path (Optional[str]) – Output path for audio (auto-generated if None)

  • audio_format (str) – Output audio format

Returns:

Path to the extracted audio file

Return type:

str

Raises:

ImportError – If moviepy is not installed

Examples

>>> audio_path = extract_audio_from_video("video.mp4")
>>> print(audio_path)
video.mp3
kerb.multimodal.get_video_info(file_path)[source]

Get detailed information about a video file.

Parameters:

file_path (str) – Path to the video file

Returns:

Video information object

Return type:

VideoInfo

Raises:

ImportError – If moviepy is not installed

Examples

>>> info = get_video_info("video.mp4")
>>> print(f"{info.width}x{info.height} @ {info.fps} FPS")
1920x1080 @ 30.0 FPS
kerb.multimodal.extract_video_frames(video_path, output_dir, fps=None, max_frames=None, start_time=0.0, end_time=None)[source]

Extract frames from a video.

Parameters:
  • video_path (str) – Path to the video file

  • output_dir (str) – Directory to save frames

  • fps (Optional[float]) – Frames per second to extract (None for all frames)

  • max_frames (Optional[int]) – Maximum number of frames to extract

  • start_time (float) – Start time in seconds

  • end_time (Optional[float]) – End time in seconds (None for end of video)

Return type:

List[str]

Returns:

List of paths to extracted frame images

Examples

>>> frames = extract_video_frames("video.mp4", "frames/", fps=1)
>>> len(frames)
30
kerb.multimodal.create_video_thumbnail(video_path, output_path=None, time=1.0)[source]

Create a thumbnail image from a video.

Parameters:
  • video_path (str) – Path to the video file

  • output_path (Optional[str]) – Output path for thumbnail (auto-generated if None)

  • time (float) – Time in seconds to extract frame

Returns:

Path to the thumbnail image

Return type:

str

Examples

>>> thumb = create_video_thumbnail("video.mp4")
>>> print(thumb)
video_thumb.jpg
kerb.multimodal.build_multimodal_prompt(text, images=None, audio=None, encode_media=True)[source]

Build a multi-modal prompt for LLM APIs.

Parameters:
  • text (str) – Text prompt

  • images (Optional[List[str]]) – List of image file paths

  • audio (Optional[List[str]]) – List of audio file paths (will be transcribed)

  • encode_media (bool) – Whether to encode media as base64

Return type:

List[Dict[str, Any]]

Returns:

List of content parts for multi-modal API calls

Examples

>>> prompt = build_multimodal_prompt(
...     "What's in these images?",
...     images=["photo1.jpg", "photo2.jpg"]
... )
>>> len(prompt)
3
kerb.multimodal.build_anthropic_multimodal_content(text, images=None)[source]

Build Anthropic-specific multi-modal content format.

Parameters:
  • text (str) – Text prompt

  • images (Optional[List[str]]) – List of image file paths

Return type:

List[Dict[str, Any]]

Returns:

List of content blocks in Anthropic format

Examples

>>> content = build_anthropic_multimodal_content(
...     "Describe this image",
...     images=["photo.jpg"]
... )
kerb.multimodal.build_google_multimodal_content(text, images=None)[source]

Build Google Gemini-specific multi-modal content format.

Parameters:
  • text (str) – Text prompt

  • images (Optional[List[str]]) – List of image file paths

Return type:

List[Any]

Returns:

List of content parts for Gemini API

Examples

>>> content = build_google_multimodal_content(
...     "What's in this image?",
...     images=["photo.jpg"]
... )

Image, audio, and video processing for multimodal models.