JSONL Quick Reference
Common Operations
Basic I/O
from kerb.fine_tuning import jsonl
# Write
jsonl.write_jsonl(data, "output.jsonl")
# Read
data = jsonl.read_jsonl("input.jsonl")
# Append
jsonl.append_jsonl(more_data, "output.jsonl")
Compression (Recommended for Large Files)
# Write compressed (70-90% smaller)
jsonl.write_jsonl(data, "data.jsonl", compress=True)
# Read compressed (auto-detects format)
data = jsonl.read_jsonl("data.jsonl.gz")
# Convert existing file
jsonl.compress_jsonl("data.jsonl") # → data.jsonl.gz
Performance (Large Files)
# Parallel reading (2-4x faster)
data = jsonl.parallel_read_jsonl("large.jsonl", num_workers=4)
# Streaming (unlimited file size)
for batch in jsonl.stream_jsonl("huge.jsonl", batch_size=1000):
    process(batch)
Filtering
# Filter by condition
jsonl.filter_jsonl(
"input.jsonl",
"output.jsonl",
filter_fn=lambda x: x["score"] > 0.8
)
# Stream with filter
for batch in jsonl.stream_jsonl(
"data.jsonl",
filter_fn=lambda x: x["lang"] == "en"
):
    process(batch)
Transformation
# Transform data
def normalize(item):
    return {
        "text": item["text"].lower(),
        "label": item["label"]
    }
jsonl.transform_jsonl("in.jsonl", "out.jsonl", normalize)
# Parallel transformation (faster)
jsonl.transform_jsonl(
"in.jsonl", "out.jsonl",
normalize,
parallel=True,
num_workers=8
)
Data Cleaning
# Remove duplicates
removed = jsonl.deduplicate_jsonl(
"data.jsonl",
"clean.jsonl",
key_fn=lambda x: x["prompt"] # dedupe by field
)
# Random sampling
jsonl.sample_jsonl(
"full.jsonl",
"sample.jsonl",
fraction=0.1, # 10% of data
random_state=42
)
File Operations
# Merge files
jsonl.merge_jsonl(
["file1.jsonl", "file2.jsonl", "file3.jsonl"],
"merged.jsonl"
)
# Split large file
chunks = jsonl.split_jsonl(
"large.jsonl",
"chunk",
split_size=10000 # lines per file
)
# Count lines
count = jsonl.count_jsonl_lines("data.jsonl")
# Get statistics
stats = jsonl.get_jsonl_stats("data.jsonl")
print(f"Lines: {stats['total_lines']}, Size: {stats['total_bytes']}")
Validation
# Validate file
result = jsonl.validate_jsonl("data.jsonl", parallel=True)
if result.is_valid:
    print(f"✓ Valid: {result.valid_examples} examples")
else:
    print(f"✗ Errors: {result.errors}")
When to Use What
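| Scenario | Use | Benefit |
|---|---|---|
| Small file (<10MB) | `read_jsonl()` | Simple, fast |
| Large file (>100MB) | `parallel_read_jsonl()` | 2-4x faster |
| Huge file (>1GB) | `stream_jsonl()` | Constant memory |
| Storage cost matters | `compress=True` | 70-90% smaller |
| Need to filter | `stream_jsonl(filter_fn=...)` | Memory efficient |
| Complex transform | `transform_jsonl(parallel=True)` | Multi-core speed |
| Check quality | `validate_jsonl()` | Find issues |
| Remove dupes | `deduplicate_jsonl()` | Clean data |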
Performance Tips
✅ Do
Compress large files: `compress=True`
Stream huge files: `stream_jsonl()`
Use parallel for >100MB files
Filter while streaming
Deduplicate before training
❌ Don’t
Load 1GB+ files into memory
Parse JSON repeatedly
Skip validation on new data
Store uncompressed training data
Process entire file if you need subset
Example Workflow
from kerb.fine_tuning import jsonl
# 1. Merge source files
jsonl.merge_jsonl(
["raw1.jsonl", "raw2.jsonl"],
"merged.jsonl"
)
# 2. Validate
result = jsonl.validate_jsonl("merged.jsonl")
assert result.is_valid
# 3. Filter quality
jsonl.filter_jsonl(
"merged.jsonl",
"filtered.jsonl",
filter_fn=lambda x: len(x.get("text", "")) > 50
)
# 4. Deduplicate
jsonl.deduplicate_jsonl(
"filtered.jsonl",
"clean.jsonl",
key_fn=lambda x: x["prompt"]
)
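# 5. Sample 10% as a validation set
# NOTE: this presumably copies rows rather than removing them from clean.jsonl;
# if so, exclude the sampled rows from the training file to avoid train/val overlap.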
jsonl.sample_jsonl("clean.jsonl", "val.jsonl", fraction=0.1)
# 6. Compress for storage
jsonl.compress_jsonl("clean.jsonl", "train.jsonl.gz")
jsonl.compress_jsonl("val.jsonl", "val.jsonl.gz")
Compression Formats
| Format | Extension | Speed | Ratio | Use Case |
|---|---|---|---|---|
| gzip | `.gz` | Fast | 8:1 | Default choice |
| bzip2 | `.bz2` | Medium | 10:1 | Better compression |
| xz | `.xz` | Slow | 12:1 | Best compression |
All Functions
# Core I/O
write_jsonl(data, filepath, compress=False)
read_jsonl(filepath, max_lines=None, skip_invalid=False)
append_jsonl(data, filepath)
# Streaming
stream_jsonl(filepath, batch_size=1000, filter_fn=None, transform_fn=None)
parallel_read_jsonl(filepath, num_workers=None, chunk_size=10000)
# Manipulation
filter_jsonl(input_file, output_file, filter_fn)
transform_jsonl(input_file, output_file, transform_fn, parallel=False)
deduplicate_jsonl(input_file, output_file, key_fn=None, keep='first')
sample_jsonl(input_file, output_file, n=None, fraction=None)
# File ops
merge_jsonl(input_files, output_file, parallel=False)
split_jsonl(input_file, output_prefix, split_size=10000, by_line=True)
compress_jsonl(input_file, output_file=None, compression_type='gz')
decompress_jsonl(input_file, output_file=None)
# Analysis
validate_jsonl(filepath, schema=None, parallel=False)
count_jsonl_lines(filepath, fast=True)
get_jsonl_stats(filepath, sample_size=1000)