Multimodal Module

Multi-modal processing utilities for LLM applications.

This module provides comprehensive multi-modal capabilities for working with images, audio, video, and vision models in LLM applications.

Usage Examples:

# Common imports - most frequently used
from kerb.multimodal import (
    # Enums
    MediaType, ImageFormat, AudioFormat, VideoFormat,
    VisionModel, TranscriptionModel,
    # Data classes
    ImageInfo, AudioInfo, VideoInfo, TranscriptionResult, VisionAnalysis,
    # Common utilities
    detect_media_type, validate_media_file,
)

# Vision - image processing and analysis
from kerb.multimodal.vision import (
    load_image, get_image_info, image_to_base64,
    analyze_image_with_vision_model, embed_multimodal,
)

# Audio - transcription and processing
from kerb.multimodal.audio import (
    transcribe_audio, get_audio_info, convert_audio_format,
)

# Video - frame extraction and processing
from kerb.multimodal.video import (
    extract_video_frames, get_video_info, create_video_thumbnail,
)

# Prompts - multi-modal prompt construction
from kerb.multimodal.prompts import (
    build_multimodal_prompt,
    build_anthropic_multimodal_content,
    build_google_multimodal_content,
)

For image editing (resize, crop, rotate, grid), use PIL/Pillow directly:

from PIL import Image
img = Image.open("photo.jpg")
img = img.resize((800, 600))
img = img.rotate(90)
img.save("edited.jpg")

class kerb.multimodal.MediaType(*values)[source]

Bases: Enum

Supported media types.

IMAGE = 'image'

AUDIO = 'audio'

VIDEO = 'video'

UNKNOWN = 'unknown'

class kerb.multimodal.ImageFormat(*values)[source]

Bases: Enum

Supported image formats.

JPEG = 'jpeg'

PNG = 'png'

WEBP = 'webp'

GIF = 'gif'

BMP = 'bmp'

TIFF = 'tiff'

SVG = 'svg'

class kerb.multimodal.AudioFormat(*values)[source]

Bases: Enum

Supported audio formats.

MP3 = 'mp3'

WAV = 'wav'

M4A = 'm4a'

FLAC = 'flac'

OGG = 'ogg'

OPUS = 'opus'

AAC = 'aac'

class kerb.multimodal.VideoFormat(*values)[source]

Bases: Enum

Supported video formats.

MP4 = 'mp4'

AVI = 'avi'

MOV = 'mov'

MKV = 'mkv'

WEBM = 'webm'

FLV = 'flv'

class kerb.multimodal.VisionModel(*values)[source]

Bases: Enum

Supported vision models.

GPT4_VISION = 'gpt-4-vision-preview'

GPT4O = 'gpt-4o'

GPT4O_MINI = 'gpt-4o-mini'

CLAUDE_3_5_SONNET = 'claude-3-5-sonnet-20241022'

CLAUDE_OPUS_4 = 'claude-opus-4'

CLAUDE_SONNET_4 = 'claude-sonnet-4'

CLAUDE_35_HAIKU = 'claude-3-5-haiku-20241022'

GEMINI_PRO_VISION = 'gemini-pro-vision'

GEMINI_2_5_PRO = 'gemini-2.5-pro'

GEMINI_2_5_FLASH = 'gemini-2.5-flash'

class kerb.multimodal.TranscriptionModel(*values)[source]

Bases: Enum

Supported transcription models.

WHISPER_TINY = 'whisper-tiny'

WHISPER_BASE = 'whisper-base'

WHISPER_SMALL = 'whisper-small'

WHISPER_MEDIUM = 'whisper-medium'

WHISPER_LARGE = 'whisper-large'

WHISPER_LARGE_V3 = 'whisper-large-v3'

OPENAI_WHISPER_1 = 'whisper-1'

class kerb.multimodal.EmbeddingModelMultimodal(*values)[source]

Bases: Enum

Supported multi-modal embedding models.

CLIP_VIT_B_32 = 'clip-vit-b-32'

CLIP_VIT_L_14 = 'clip-vit-l-14'

OPENAI_CLIP = 'openai/clip-vit-base-patch32'

IMAGEBIND = 'imagebind'

class kerb.multimodal.ImageInfo(width, height, format, mode, size_bytes, aspect_ratio, metadata=<factory>)[source]

Bases: object

Information about an image.

width: int

height: int

format: ImageFormat

mode: str

size_bytes: int

aspect_ratio: float

metadata: Dict[str, Any]

__init__(width, height, format, mode, size_bytes, aspect_ratio, metadata=<factory>)

class kerb.multimodal.AudioInfo(duration_seconds, sample_rate, channels, format, size_bytes, bitrate=None, metadata=<factory>)[source]

Bases: object

Information about an audio file.

duration_seconds: float

sample_rate: int

channels: int

format: AudioFormat

size_bytes: int

bitrate: int | None = None

metadata: Dict[str, Any]

__init__(duration_seconds, sample_rate, channels, format, size_bytes, bitrate=None, metadata=<factory>)

class kerb.multimodal.VideoInfo(width, height, duration_seconds, fps, frame_count, format, size_bytes, codec=None, has_audio=False, metadata=<factory>)[source]

Bases: object

Information about a video file.

width: int

height: int

duration_seconds: float

fps: float

frame_count: int

format: VideoFormat

size_bytes: int

codec: str | None = None

has_audio: bool = False

metadata: Dict[str, Any]

__init__(width, height, duration_seconds, fps, frame_count, format, size_bytes, codec=None, has_audio=False, metadata=<factory>)

class kerb.multimodal.TranscriptionResult(text, language=None, segments=None, confidence=None, duration=None, word_timestamps=None, metadata=<factory>)[source]

Bases: object

Result of audio transcription.

text: str

language: str | None = None

segments: List[Dict[str, Any]] | None = None

confidence: float | None = None

duration: float | None = None

word_timestamps: List[Dict[str, Any]] | None = None

metadata: Dict[str, Any]

__init__(text, language=None, segments=None, confidence=None, duration=None, word_timestamps=None, metadata=<factory>)

class kerb.multimodal.VisionAnalysis(description, objects=None, text_content=None, emotions=None, colors=None, confidence=None, metadata=<factory>)[source]

Bases: object

Result of vision model analysis.

description: str

objects: List[Dict[str, Any]] | None = None

text_content: str | None = None

emotions: List[str] | None = None

colors: List[str] | None = None

confidence: float | None = None

metadata: Dict[str, Any]

__init__(description, objects=None, text_content=None, emotions=None, colors=None, confidence=None, metadata=<factory>)

class kerb.multimodal.MultiModalContent(type, content, metadata=<factory>)[source]

Bases: object

Represents multi-modal content for prompts.

type: str

content: str | bytes | Dict[str, Any]

metadata: Dict[str, Any]

__init__(type, content, metadata=<factory>)

kerb.multimodal.detect_media_type(file_path)[source]

Detect media type from file extension.

Parameters: file_path (str) – Path to the media file
Returns: Detected media type
Return type: MediaType

Examples

>>> detect_media_type("photo.jpg")
MediaType.IMAGE

>>> detect_media_type("audio.mp3")
MediaType.AUDIO

kerb.multimodal.get_mime_type(file_path)[source]

Get MIME type for a file.

Parameters: file_path (str) – Path to the file
Returns: MIME type (e.g., “image/jpeg”)
Return type: str

Examples

>>> get_mime_type("photo.jpg")
'image/jpeg'

kerb.multimodal.validate_media_file(file_path, expected_type=None)[source]

Validate that a media file exists and is of expected type.

Parameters:

file_path (str) – Path to the media file
expected_type (Optional[MediaType]) – Expected media type (None to skip type check)

Returns:

True if valid, False otherwise

Return type:

bool

Examples

>>> validate_media_file("photo.jpg", MediaType.IMAGE)
True

kerb.multimodal.calculate_file_checksum(file_path, algorithm='md5')[source]

Calculate checksum of a media file.

Parameters:

file_path (str) – Path to the file
algorithm (str) – Hash algorithm (“md5”, “sha1”, “sha256”)

Returns:

Hexadecimal checksum string

Return type:

str

Examples

>>> checksum = calculate_file_checksum("video.mp4")
>>> len(checksum)
32

kerb.multimodal.load_image(file_path)[source]

Load an image from file.

Parameters:

file_path (str) – Path to the image file

Returns:

Loaded image object

Return type:

Any

Raises:

ImportError – If PIL is not installed
FileNotFoundError – If file doesn’t exist

Examples

>>> img = load_image("photo.jpg")
>>> img.size
(1920, 1080)

kerb.multimodal.get_image_info(file_path)[source]

Get detailed information about an image.

Parameters: file_path (str) – Path to the image file
Returns: Image information object
Return type: ImageInfo

Examples

>>> info = get_image_info("photo.jpg")
>>> print(f"{info.width}x{info.height}")
1920x1080

kerb.multimodal.convert_image_format(file_path, target_format, output_path=None, quality=85)[source]

Convert image to a different format.

Parameters:

file_path (str) – Path to the input image
target_format (Union[str, ImageFormat]) – Target format (e.g., “PNG”, “JPEG”)
output_path (Optional[str]) – Output path (auto-generated if None)
quality (int) – Quality for lossy formats

Returns:

Path to the converted image

Return type:

str

Examples

>>> convert_image_format("photo.png", "JPEG")
'photo.jpg'

kerb.multimodal.image_to_base64(file_path, include_prefix=True)[source]

Convert image to base64 string.

Parameters:

file_path (str) – Path to the image file
include_prefix (bool) – Whether to include data URI prefix

Returns:

Base64-encoded image string

Return type:

str

Examples

>>> b64 = image_to_base64("photo.jpg")
>>> b64[:30]
'data:image/jpeg;base64,/9j/4A'

kerb.multimodal.base64_to_image(b64_string, output_path)[source]

Convert base64 string to image file.

Parameters:

b64_string (str) – Base64-encoded image (with or without prefix)
output_path (str) – Path to save the image

Returns:

Path to the saved image

Return type:

str

Examples

>>> base64_to_image(b64_data, "output.jpg")
'output.jpg'

kerb.multimodal.extract_dominant_colors(file_path, num_colors=5)[source]

Extract dominant colors from an image.

Parameters:

file_path (str) – Path to the image file
num_colors (int) – Number of dominant colors to extract

Return type:

List[Tuple[int, int, int]]

Returns:

List of RGB tuples representing dominant colors

Examples

>>> colors = extract_dominant_colors("photo.jpg", 3)
>>> colors
[(45, 67, 89), (120, 130, 140), (200, 210, 220)]

kerb.multimodal.calculate_image_hash(file_path, hash_size=8)[source]

Calculate perceptual hash of an image for similarity comparison.

Parameters:

file_path (str) – Path to the image file
hash_size (int) – Size of hash (default 8 gives 64-bit hash)

Returns:

Hexadecimal hash string

Return type:

str

Examples

>>> hash1 = calculate_image_hash("photo1.jpg")
>>> hash2 = calculate_image_hash("photo2.jpg")
>>> hash1 == hash2  # Perceptually similar images yield the same hash
True

kerb.multimodal.analyze_image_with_vision_model(image_path, prompt, model=VisionModel.GPT4O, api_key=None, max_tokens=300)[source]

Analyze an image using a vision model.

Parameters:

image_path (str) – Path to the image file
prompt (str) – Text prompt/question about the image
model (Union[str, VisionModel]) – Vision model to use
api_key (Optional[str]) – API key for the model provider
max_tokens (int) – Maximum tokens in response

Returns:

Analysis result with description and metadata

Return type:

VisionAnalysis

Examples

>>> analysis = analyze_image_with_vision_model(
...     "photo.jpg",
...     "What objects are in this image?"
... )
>>> print(analysis.description)
The image contains a cat, a book, and a coffee mug on a table.

kerb.multimodal.embed_multimodal(content, content_type, model=EmbeddingModelMultimodal.CLIP_VIT_B_32, device='cpu')[source]

Generate multi-modal embeddings for images, audio, or text.

Parameters:

content (Union[str, bytes]) – Content to embed (file path for images/audio, text string for text)
content_type (str) – Type of content (“image”, “audio”, “text”)
model (Union[str, EmbeddingModelMultimodal]) – Embedding model to use
device (Union[Device, str]) – Device to run model on (Device enum or string: “cpu”, “cuda”, “cuda:0”, “cuda:1”, “mps”)

Return type:

List[float]

Returns:

List of embedding values

Examples

>>> from kerb.core.enums import Device
>>> embedding = embed_multimodal("photo.jpg", "image", device=Device.CUDA)
>>> len(embedding)
512

kerb.multimodal.compute_multimodal_similarity(embedding1, embedding2)[source]

Compute cosine similarity between two multi-modal embeddings.

Parameters:

embedding1 (List[float]) – First embedding vector
embedding2 (List[float]) – Second embedding vector

Returns:

Cosine similarity score (-1 to 1)

Return type:

float

Examples

>>> emb1 = embed_multimodal("photo1.jpg", "image")
>>> emb2 = embed_multimodal("photo2.jpg", "image")
>>> similarity = compute_multimodal_similarity(emb1, emb2)
>>> print(f"Similarity: {similarity:.3f}")
Similarity: 0.892

kerb.multimodal.get_audio_info(file_path)[source]

Get detailed information about an audio file.

Parameters: file_path (str) – Path to the audio file
Returns: Audio information object
Return type: AudioInfo
Raises: ImportError – If required audio library is not installed

Examples

>>> info = get_audio_info("audio.mp3")
>>> print(f"Duration: {info.duration_seconds}s")
Duration: 123.5s

kerb.multimodal.convert_audio_format(file_path, target_format, output_path=None, bitrate='192k')[source]

Convert audio to a different format.

Parameters:

file_path (str) – Path to the input audio
target_format (Union[str, AudioFormat]) – Target format (e.g., “mp3”, “wav”)
output_path (Optional[str]) – Output path (auto-generated if None)
bitrate (str) – Bitrate for lossy formats (e.g., “192k”)

Returns:

Path to the converted audio

Return type:

str

Raises:

ImportError – If pydub is not installed

Examples

>>> convert_audio_format("audio.wav", "mp3")
'audio.mp3'

kerb.multimodal.transcribe_audio(file_path, model=TranscriptionModel.OPENAI_WHISPER_1, language=None, api_key=None, return_timestamps=False, max_size_mb=25, max_duration_minutes=None)[source]

Transcribe audio to text using various models.

Parameters:

file_path (str) – Path to the audio file
model (Union[str, TranscriptionModel]) – Transcription model to use
language (Optional[str]) – Language code (None for auto-detect)
api_key (Optional[str]) – API key for cloud models (OpenAI)
return_timestamps (bool) – Whether to return word-level timestamps
max_size_mb (float) – Maximum file size in MB. Defaults to 25 (OpenAI limit).
max_duration_minutes (Optional[float]) – Maximum audio duration in minutes. None for no limit.

Returns:

Transcription result with text and metadata

Return type:

TranscriptionResult

Raises:

ValueError – If file exceeds size or duration limits
FileNotFoundError – If file doesn’t exist

Examples

>>> result = transcribe_audio("audio.mp3")
>>> print(result.text)
Hello, this is a test transcription.

>>> # With size and duration limits
>>> result = transcribe_audio("audio.mp3", max_size_mb=50,
...                           max_duration_minutes=10)

async kerb.multimodal.transcribe_audio_async(file_path, model=TranscriptionModel.OPENAI_WHISPER_1, language=None, api_key=None, return_timestamps=False, max_size_mb=25, max_duration_minutes=None)[source]

Transcribe audio to text asynchronously using API models.

Parameters:

file_path (str) – Path to the audio file
model (Union[str, TranscriptionModel]) – Transcription model to use
language (Optional[str]) – Language code (None for auto-detect)
api_key (Optional[str]) – API key for cloud models (OpenAI)
return_timestamps (bool) – Whether to return word-level timestamps
max_size_mb (float) – Maximum file size in MB. Defaults to 25 (OpenAI limit).
max_duration_minutes (Optional[float]) – Maximum audio duration in minutes. None for no limit.

Returns:

Transcription result with text and metadata

Return type:

TranscriptionResult

Note

Native asynchronous execution is currently supported only for the OpenAI Whisper API. Local Whisper models run synchronously in a thread pool.

Examples

>>> import asyncio
>>> result = asyncio.run(transcribe_audio_async("audio.mp3"))
>>> print(result.text)

kerb.multimodal.extract_audio_from_video(video_path, output_path=None, audio_format='mp3')[source]

Extract audio track from video file.

Parameters:

video_path (str) – Path to the video file
output_path (Optional[str]) – Output path for audio (auto-generated if None)
audio_format (str) – Output audio format

Returns:

Path to the extracted audio file

Return type:

str

Raises:

ImportError – If moviepy is not installed

Examples

>>> audio_path = extract_audio_from_video("video.mp4")
>>> print(audio_path)
video.mp3

kerb.multimodal.get_video_info(file_path)[source]

Get detailed information about a video file.

Parameters: file_path (str) – Path to the video file
Returns: Video information object
Return type: VideoInfo
Raises: ImportError – If moviepy is not installed

Examples

>>> info = get_video_info("video.mp4")
>>> print(f"{info.width}x{info.height} @ {info.fps} FPS")
1920x1080 @ 30.0 FPS

kerb.multimodal.extract_video_frames(video_path, output_dir, fps=None, max_frames=None, start_time=0.0, end_time=None)[source]

Extract frames from a video.

Parameters:

video_path (str) – Path to the video file
output_dir (str) – Directory to save frames
fps (Optional[float]) – Frames per second to extract (None for all frames)
max_frames (Optional[int]) – Maximum number of frames to extract
start_time (float) – Start time in seconds
end_time (Optional[float]) – End time in seconds (None for end of video)

Return type:

List[str]

Returns:

List of paths to extracted frame images

Examples

>>> frames = extract_video_frames("video.mp4", "frames/", fps=1)
>>> len(frames)
30

kerb.multimodal.create_video_thumbnail(video_path, output_path=None, time=1.0)[source]

Create a thumbnail image from a video.

Parameters:

video_path (str) – Path to the video file
output_path (Optional[str]) – Output path for thumbnail (auto-generated if None)
time (float) – Time in seconds to extract frame

Returns:

Path to the thumbnail image

Return type:

str

Examples

>>> thumb = create_video_thumbnail("video.mp4")
>>> print(thumb)
video_thumb.jpg

kerb.multimodal.build_multimodal_prompt(text, images=None, audio=None, encode_media=True)[source]

Build a multi-modal prompt for LLM APIs.

Parameters:

text (str) – Text prompt
images (Optional[List[str]]) – List of image file paths
audio (Optional[List[str]]) – List of audio file paths (will be transcribed)
encode_media (bool) – Whether to encode media as base64

Return type:

List[Dict[str, Any]]

Returns:

List of content parts for multi-modal API calls

Examples

>>> prompt = build_multimodal_prompt(
...     "What's in these images?",
...     images=["photo1.jpg", "photo2.jpg"]
... )
>>> len(prompt)
3

kerb.multimodal.build_anthropic_multimodal_content(text, images=None)[source]

Build Anthropic-specific multi-modal content format.

Parameters:

text (str) – Text prompt
images (Optional[List[str]]) – List of image file paths

Return type:

List[Dict[str, Any]]

Returns:

List of content blocks in Anthropic format

Examples

>>> content = build_anthropic_multimodal_content(
...     "Describe this image",
...     images=["photo.jpg"]
... )

kerb.multimodal.build_google_multimodal_content(text, images=None)[source]

Build Google Gemini-specific multi-modal content format.

Parameters:

text (str) – Text prompt
images (Optional[List[str]]) – List of image file paths

Return type:

List[Any]

Returns:

List of content parts for Gemini API

Examples

>>> content = build_google_multimodal_content(
...     "What's in this image?",
...     images=["photo.jpg"]
... )

Image, audio, and video processing for multimodal models.