Usage Guide¶
MiniLLMLib is designed for real-world applications. Here’s how to use it effectively:
Basic Setup¶
Provider Options¶
import minillmlib as mll
import os
# OpenAI (direct)
gi_openai = mll.GeneratorInfo(
model="gpt-4o",
_format="openai",
api_key=os.getenv("OPENAI_API_KEY")
)
# Anthropic Claude (direct)
gi_anthropic = mll.GeneratorInfo(
model="claude-3-5-sonnet-20241022",
_format="anthropic",
api_key=os.getenv("ANTHROPIC_API_KEY")
)
# OpenRouter (100+ models)
gi_openrouter = mll.GeneratorInfo(
model="anthropic/claude-3.5-sonnet",
_format="url",
api_url="https://openrouter.ai/api/v1/chat/completions",
api_key=os.getenv("OPENROUTER_API_KEY")
)
# Use any provider the same way
chat = mll.ChatNode(content="What is machine learning?", role="user")
response = chat.complete_one(gi_openai) # or any other gi
print(response.content)
Working with JSON Prompts¶
# Load a prompt template from JSON file
prompt = mll.ChatNode.from_thread("my_prompt.json")
# Fill in template variables
prompt.update_format_kwargs(
user_request="Explain quantum computing",
context="for a general audience"
)
# Complete with any provider
result = prompt.complete_one(gi_openai)
print(result.content)
Error Handling and Retries¶
# Robust completion with error handling
try:
result = chat.complete_one(
mll.NodeCompletionParameters(
gi=gi_openai,
retry=3,
exp_back_off=True,
crash_on_empty_response=True
)
)
print(result.content)
except Exception as e:
print(f"Completion failed: {e}")
Async Operations¶
# Async completion for better performance
async def get_response():
result = await chat.complete_async(gi_openai)
return result.content
# Run async function
import asyncio
response = asyncio.run(get_response())
Multimodal Usage¶
Image Analysis (URL Format Only)¶
import minillmlib as mll
import os
def analyze_document_image(image_url: str):
"""Analyze a document image and extract key information."""
# Create image analysis node (only URLs supported currently)
node = mll.ChatNode(
content="Analyze this document image. Extract key information and summarize the main points. Be concise but thorough.",
image_data=mll.ImageData(images=[image_url])
)
# Use vision-capable model (URL format providers only)
gi = mll.GeneratorInfo(
model="anthropic/claude-3.5-sonnet",
_format="url",
api_url="https://openrouter.ai/api/v1/chat/completions",
api_key=os.getenv("OPENROUTER_API_KEY"),
completion_parameters=mll.GeneratorCompletionParameters(
temperature=0.3, # Lower temperature for factual analysis
max_tokens=1024
)
)
# Complete with error handling
try:
result = node.complete_one(mll.NodeCompletionParameters(
gi=gi,
retry=2,
crash_on_empty_response=True
))
return result.content if result else None
except Exception as e:
print(f"Analysis failed: {e}")
return None
# Usage with image URLs only
analysis = analyze_document_image("https://example.com/chart.png")
print(analysis)
Audio Input (OpenRouter Compatible)¶
MiniLLMLib supports sending audio files as input to models that support audio processing (e.g., Whisper for transcription, or multimodal models). This is compatible with OpenRouter’s audio input format.
Supported Audio Formats:
WAV (.wav, .wave)
MP3 (.mp3)
Basic Audio Transcription¶
import minillmlib as mll
import os
def transcribe_audio(audio_path: str):
"""Transcribe an audio file using OpenRouter."""
# Create AudioData with your audio file
audio_data = mll.AudioData(audio_paths=[audio_path])
# Create a ChatNode with audio input
node = mll.ChatNode(
content="Please transcribe this audio file.",
role="user",
audio_data=audio_data
)
# Use OpenRouter with an audio-capable model (Gemini in this example)
gi = mll.GeneratorInfo(
model="google/gemini-2.0-flash-lite-001",
_format="url",
api_url="https://openrouter.ai/api/v1/chat/completions",
api_key=os.getenv("OPENROUTER_API_KEY")
)
result = node.complete_one(gi)
return result.content
# Usage
transcription = transcribe_audio("meeting.wav")
print(transcription)
Audio-Only Input (No Text Prompt)¶
# Send audio without text prompt
audio_data = mll.AudioData(audio_paths=["audio.wav"])
node = mll.ChatNode(role="user", audio_data=audio_data)
gi = mll.GeneratorInfo(
model="google/gemini-2.0-flash-lite-001",
_format="url",
api_url="https://openrouter.ai/api/v1/chat/completions",
api_key=os.getenv("OPENROUTER_API_KEY")
)
result = node.complete_one(gi)
print(result.content)
Multiple Audio Files¶
# Send multiple audio files in one message
audio_data = mll.AudioData(audio_paths=[
"part1.wav",
"part2.mp3",
"part3.wav"
])
node = mll.ChatNode(
content="Transcribe all these audio files:",
role="user",
audio_data=audio_data
)
result = node.complete_one(gi)
print(result.content)
Audio + Text in Same Message¶
# Combine audio with specific instructions
audio_data = mll.AudioData(audio_paths=["interview.wav"])
node = mll.ChatNode(
content="Transcribe this interview and extract the key insights. Focus on technical details.",
role="user",
audio_data=audio_data
)
result = node.complete_one(gi)
print(result.content)
Audio + Images (Multimodal)¶
# Combine audio and images in the same message
audio_data = mll.AudioData(audio_paths=["presentation.wav"])
image_data = mll.ImageData(images=[
"https://example.com/slide1.png",
"https://example.com/slide2.png"
])
node = mll.ChatNode(
content="Analyze this presentation. The audio contains the speaker's narration and the images show the slides.",
role="user",
audio_data=audio_data,
image_data=image_data
)
gi = mll.GeneratorInfo(
model="google/gemini-2.0-flash-lite-001", # Use a multimodal model
_format="url",
api_url="https://openrouter.ai/api/v1/chat/completions",
api_key=os.getenv("OPENROUTER_API_KEY")
)
result = node.complete_one(gi)
print(result.content)
Async Audio Processing¶
import asyncio
async def transcribe_multiple_files(audio_files: list[str]):
"""Transcribe multiple audio files concurrently."""
gi = mll.GeneratorInfo(
model="google/gemini-2.0-flash-lite-001",
_format="url",
api_url="https://openrouter.ai/api/v1/chat/completions",
api_key=os.getenv("OPENROUTER_API_KEY")
)
# Create tasks for each audio file
tasks = []
for audio_file in audio_files:
audio_data = mll.AudioData(audio_paths=[audio_file])
node = mll.ChatNode(
content="Transcribe this audio:",
role="user",
audio_data=audio_data
)
tasks.append(node.complete_one_async(gi))
# Run all transcriptions concurrently
results = await asyncio.gather(*tasks)
return [r.content for r in results]
# Usage
audio_files = ["file1.wav", "file2.mp3", "file3.wav"]
transcriptions = asyncio.run(transcribe_multiple_files(audio_files))
for i, transcription in enumerate(transcriptions):
print(f"File {i+1}: {transcription}")
Important Notes:
Audio input is supported for _format="url" (OpenRouter) and _format="openai" (OpenAI)
Audio files are automatically base64-encoded and format-detected
For user/system roles, audio can be combined with text and images
For assistant roles, audio output uses a different format (see the "Audio Completions (Beta)" section)
Check OpenRouter’s model list for audio-capable models: https://openrouter.ai/models?input_modalities=audio
Installation¶
# (Recommended) Install from PyPI
pip install minillmlib
# Or install from source
git clone https://github.com/qfeuilla/MiniLLMLib.git
cd MiniLLMLib
pip install -e .
# For HuggingFace/local models:
pip install minillmlib[huggingface]
Core Concepts¶
1. ChatNode: Building Conversations¶
A ChatNode represents a single message in a conversation, with a role (like "user", "assistant", or "system") and a content (the text or payload of the message).
Creating a Simple Message:
chat = mll.ChatNode(content="Hello, how are you?", role="user")
Building a Conversation Tree:
root = mll.ChatNode(content="What's the weather like today?", role="user")
assistant = mll.ChatNode(content="It's sunny and 25°C.", role="assistant")
root.add_child(assistant)
followup = mll.ChatNode(content="Will it rain tomorrow?", role="user")
assistant.add_child(followup)
Visualizing the Structure:
root (user): What's the weather like today?
└── assistant: It's sunny and 25°C.
└── user: Will it rain tomorrow?
Saving and Loading Threads:
followup.save_thread("weather_thread.json")
thread = mll.ChatNode.from_thread(path="weather_thread.json")
print(thread.content) # Will it rain tomorrow?
Saving a thread always saves the path from the current node up to the root. Loom saving is supported, but loom loading is not yet implemented.
2. GeneratorInfo: Model and Provider Configuration¶
GeneratorInfo holds all configuration needed to interact with a model/provider:
Model name (e.g. “gpt-4”, “claude-3-opus-20240229”)
Provider format (e.g. “openai”, “anthropic”, “hf”, “url”)
API keys and URLs
Default generation parameters
Example:
gi = mll.GeneratorInfo(
model="gpt-4",
_format="openai",
api_key="sk-..." # Or use environment variable OPENAI_API_KEY
)
Passing Generation Parameters:
You can pass generation parameters in two ways:
Via GeneratorInfo: Set default generation parameters by passing a completion_parameters argument when creating a GeneratorInfo object. These will be used for all completions unless overridden.
gi = mll.GeneratorInfo(
model="gpt-4",
_format="openai",
api_key="sk-...",
completion_parameters=mll.GeneratorCompletionParameters(
temperature=0.2,
max_tokens=256
)
)
Per-completion: Override default parameters by passing a GeneratorCompletionParameters object (through the generation_parameters field of NodeCompletionParameters) to complete_one or complete_one_async.
custom_params = mll.GeneratorCompletionParameters(
temperature=0.1,
max_tokens=128
)
params = mll.NodeCompletionParameters(gi=gi, generation_parameters=custom_params)
response = chat.complete_one(params)
Conflict Resolution: If a parameter is set in both GeneratorInfo and GeneratorCompletionParameters, the per-completion value takes precedence.
3. Branching/Loom Conversations¶
A loom conversation is a tree-like structure where each node can have multiple children. This allows for branching conversations and more complex dialogue flows.
ASCII Diagram:
root (user): What's the weather like today?
├── assistant: It's sunny and 25°C.
│ ├── user: Will it rain tomorrow?
│ └── user: What's the forecast for the weekend?
└── assistant: It's cloudy and 20°C.
└── user: Will it rain tomorrow?
Completion Semantics: When completing a node, the context is set to the node’s parents. The completion grows the branch by adding a new child node.
4. Merging Contiguous Messages¶
merge_contiguous is a compatibility feature that merges consecutive messages with the same role. This is useful when working with certain models or providers that expect a specific input format.
Options:
'all': Merge all consecutive messages with the same role, regardless of role.
'user': Only merge consecutive user messages.
'assistant': Only merge consecutive assistant messages.
'system': Only merge consecutive system messages.
'base': Only merge base messages (rare, for advanced workflows).
None: No merging; every message is passed as-is.
Example:
params = mll.NodeCompletionParameters(
gi=gi,
merge_contiguous="user"
)
Basic Usage¶
Completion Methods Overview¶
MiniLLMLib provides several ways to generate completions from a ChatNode:
Method |
Sync/Async |
Returns |
Description |
|---|---|---|---|
|
Sync |
Single ChatNode |
One response (most common) |
|
Sync |
List[ChatNode] or ChatNode (if |
All possible completions (if provider supports) |
|
Async |
Single ChatNode |
Async single response |
|
Async |
List[ChatNode] or ChatNode (if |
Async all completions |
Collapsing (Truncating or Summarizing) Chat Threads¶
MiniLLMLib provides an advanced method for managing long chat threads: ChatNode.collapse_thread. This asynchronous method lets you keep only the most relevant parts of a conversation, either by truncating (removing the middle) or summarizing it with an LLM.
Method Signature¶
collapsed = await chat.collapse_thread(keep_last_n, keep_n, gi=None)
keep_last_n (int): Number of most recent nodes to keep (from the current node backward).
keep_n (int): Total number of nodes to keep in the thread (including the start and end).
gi (GeneratorInfo | None): If provided, summarizes the truncated section using the model; if None, inserts a prominent truncation marker node.
Modes¶
Truncation (gi=None):
Middle nodes are replaced by a visually prominent marker node (role="assistant") indicating that a section was truncated.
Example:
collapsed = await chat.collapse_thread(keep_last_n=3, keep_n=6, gi=None)
Summarization (gi=GeneratorInfo):
Middle nodes are summarized using the provided generator. A summary node (role="assistant") is inserted, prefixed with a clear message (e.g., "Conversation truncated. Summary of the truncated conversation:").
Example:
collapsed = await chat.collapse_thread(keep_last_n=3, keep_n=6, gi=gi)
Behavior and Edge Cases¶
If the thread is shorter than keep_n or keep_last_n, no truncation occurs.
If keep_n < 2, only the last node is kept.
If keep_last_n = 0, only the start nodes are kept (up to keep_n).
The method always returns the last node of the collapsed thread.
The summary/truncation node never inherits metadata or formatting from the truncated section.
Best Practices¶
Use truncation for efficiency or when summarization is not needed.
Use summarization to preserve context for the model when removing large sections.
Always use await, as this method is asynchronous (especially when summarizing).
See tests for more advanced usage and edge case handling.
For simple use, you can pass a GeneratorInfo (gi) directly. Use NodeCompletionParameters only for advanced options.
If you want to override defaults or use advanced features, wrap with NodeCompletionParameters:
response = chat.complete_one(mll.NodeCompletionParameters(gi=gi, ...))
Synchronous Example¶
response = chat.complete_one(gi)
Asynchronous Example¶
response = await chat.complete_one_async(gi)
Note: Async is supported for all providers except HuggingFace/local models.
Advanced Completion Options¶
You can control completion behavior using NodeCompletionParameters. Here are the most important options:
Parameter |
Type |
Default |
Description |
|---|---|---|---|
|
GeneratorInfo |
— |
The model/provider configuration |
|
GeneratorCompletionParameters |
— |
Per-call generation settings (overrides gi defaults) |
|
str/None |
“all” |
Merge consecutive messages with same role (see above) |
|
bool |
False |
Parse/repair model output as JSON |
|
bool |
False |
Raise error & retry if model doesn’t return JSON (when |
|
bool |
False |
Raise error & retry if model returns empty output |
|
int |
4 |
Number of retry attempts on error |
|
bool |
False |
Use exponential backoff between retries |
|
int/float |
1 |
Initial wait time (seconds) for backoff |
|
int/float |
15 |
Max wait time (seconds) for backoff |
|
str/None |
None |
Force some text to be prepended to the completion before the assistant answer. For example if you want the LLM to start their answer with “Score: “ and then continue, you can set |
|
bool |
False |
Whether to attach the completion as a child node to the current node |
|
int |
1 |
Number of completions to generate. If > 1, returns a list of completions with |
Special Features:
Strict JSON Output: Use parse_json=True and crash_on_refusal=True to enforce JSON output and trigger retries/backoff if the model refuses.
Crash on Empty: Use crash_on_empty_response=True to ensure you always get a non-empty response.
Example:
params = mll.NodeCompletionParameters(
gi=gi,
parse_json=True,
crash_on_refusal=True,
retry=5,
exp_back_off=True,
back_off_time=2,
max_back_off=30
)
response = chat.complete_one(params)
Branching and Loom Semantics¶
Each completion grows a branch: calling .complete_one() adds a new assistant node as a child.
Only the path from the current node to the root is used as context for completions (siblings/other branches are ignored).
Looms enable branching conversations—each node can have multiple children, representing different possible continuations.
Advanced GeneratorInfo Parameters¶
The GeneratorInfo class configures all model/provider and runtime options for completions. Most users only need to set model, _format, and api_key, but advanced users can fine-tune many behaviors.
Parameter |
Type |
Default |
Description |
|---|---|---|---|
|
str/None |
None |
Model name (e.g. ‘gpt-4’, ‘claude-3-opus-20240229’) |
|
bool |
True |
Whether the model uses chat format |
|
GeneratorCompletionParameters |
— |
Default generation parameters (temperature, max_tokens, etc) |
|
str/None |
None |
Custom API endpoint (for custom providers) |
|
str/None |
None |
API key for the provider |
|
str |
“url” |
Provider format: ‘openai’, ‘openai-audio’, ‘anthropic’, ‘url’, ‘mistralai’, ‘hf’, ‘prettify’ |
|
bool |
False |
Force merging of contiguous messages |
|
bool |
False |
Ensure prompt is JSON-compatible (rarely needed) |
|
bool |
False |
Treat all ‘system’ messages as ‘user’ |
|
bool |
False |
Ignore temperature in completions |
|
bool |
False |
Ignore max_tokens in completions |
|
tuple |
(0.0, 0.0) |
Cost per input/output token (for tracking) |
|
bool |
False |
Mark model as uncensored (for filtering/auditing) |
|
dict |
|
Custom role translations (see ‘mll.pretty_messages’ configuration) |
|
str/None |
None |
Usage tracking backend (currently only “openrouter” supported with openrouter url backend) |
|
Collection/None |
None |
MongoDB collection for usage tracking |
|
str/None |
None |
Key for identifying usage records |
|
str/None |
None |
Value for identifying usage records |
|
str/None |
None |
The key to use to update the price in the usage record (this should already exist and be set to 0) |
|
dict |
|
Extra kwargs for HuggingFace model loading |
|
dict |
|
Extra kwargs for HuggingFace processor |
|
object |
None |
Loaded HuggingFace model (internal) |
|
object |
None |
Loaded HuggingFace processor (internal) |
|
object |
None |
Loaded HuggingFace tokenizer (internal) |
|
str |
“cuda:0” |
Device for HuggingFace models |
|
str |
“< |
end |
Note: If your provider does not support temperature and/or max_tokens, and throws an error when these are sent (for example, some OpenAI reasoning models), you can set deactivate_max_tokens=True and/or deactivate_temperature=True in your GeneratorInfo. This will prevent these parameters from being sent to the provider. See the following code pattern:
GeneratorInfo(
    model="o1-preview",
    _format="openai",
    deactivate_max_tokens=True,
    # ...
)
This is used in the model zoo (see model_zoo.py, e.g. deactivate_max_tokens="o1" in model_name).
Usage Tracking:
To enable usage tracking, set usage_tracking_type="openrouter" and provide a valid usage_db (a MongoDB collection). Other backends are not yet supported.
Use the usage_id_key, usage_id_value, and usage_key fields to customize how usage is tracked.
See the following example:
assistant_builder = mll.GeneratorInfo(
model="deepseek/deepseek-chat-v3-0324",
_format="url",
api_url=f"https://openrouter.ai/api/v1/chat/completions",
api_key=OPENROUTER_API_KEY,
completion_parameters=mll.GeneratorCompletionParameters(
provider={
"data_collection": "deny",
"sort": "throughput",
},
usage={
"include": True
}
),
usage_tracking_type="openrouter", # Only openrouter is supported for now
usage_id_key="_id", # This will filter object on _id
usage_id_value=database_objects["_id"], # The value of the _id to find the object to update in the db
usage_key="cost", # The key to use to update the price in the object record. It should be set to 0 in your db for each object, and mll will update this value
usage_db=database # database must be a MongoDB collection
)
Advanced Usage¶
Parametric Prompts with update_format_kwargs¶
MiniLLMLib supports dynamic, programmatic prompting by allowing you to update variables (placeholders) in prompt templates using update_format_kwargs on a ChatNode.
This is especially powerful when loading prompt trees from files and injecting runtime variables for each completion.
Example:
Suppose you have a prompt file with placeholders like {instructions} and {to_transform}:
{
"role": "user",
"content": "Transform this paragraph: {to_transform}\nInstructions: {instructions}"
}
You can load this as a node and update the variables dynamically:
node = mll.ChatNode.from_thread(["prompts/pre_prompt.json"])
node.update_format_kwargs(propagate=True, instructions="Rewrite in passive voice", to_transform="The cat chased the mouse.")
response = node.complete_one(gi)
print(response.content) # The prompt will have the variables filled in
The propagate=True option updates all parent nodes up to the root, ensuring all relevant placeholders are filled.
This enables powerful, reusable prompt trees for advanced programmatic workflows.
Advanced/Beta Features¶
Audio Completions (Beta)¶
Beta: Audio completion (audio output) support is experimental and only available for OpenAI's audio model. It is not intended for general LLM completions. Example:
to_complete = mll.ChatNode(
role="system",
content="You are a helpful assistant that always talk to the user using voice."
)
user = to_complete.add_child(
mll.ChatNode(
role="user",
content="Hello! Please explain why the sky is blue like if I was a child."
)
)
answer = user.complete_one(
mll.NodeCompletionParameters(
gi=mll.openai_audio["gpt-4o-audio-preview"],
)
)
print(answer.audio_data.audio_paths[0])
print(answer.audio_data.audio_ids) # If the id is available, it will use the voice to keep completing the text, if it is not, it will fall back to using the transcript
HuggingFace/Local Model Usage (Beta)¶
Beta: Local HuggingFace model support is experimental. Only synchronous completions are supported.
import minillmlib as mll
gi = mll.GeneratorInfo(model="TheBloke/Llama-2-7B-Chat-GPTQ", _format="hf")
chat = mll.ChatNode(content="Hi!", role="user")
response = chat.complete_one(mll.NodeCompletionParameters(gi=gi))
Utilities¶
Prompt Formatting¶
from minillmlib.utils.message_utils import format_prompt
prompt = format_prompt("Hello, {name}!", name="Alice")
Logging & Debugging¶
from minillmlib.utils.logging_utils import get_logger
logger = get_logger()
logger.info("Debug message")
GeneratorCompletionParameters Reference¶
The GeneratorCompletionParameters dataclass lets you specify default or per-call generation settings for completions. Any unknown kwargs will be passed directly to the provider API.
Parameter |
Type |
Default |
Description |
|---|---|---|---|
|
float |
0.8 |
Sampling temperature for generation (higher = more random) |
|
int |
512 |
Maximum number of tokens to generate |
|
str |
“alloy” |
Audio voice (OpenAI audio models only; options: alloy, ash, ballad, coral, echo, sage, shimmer) |
|
str/None |
None |
Where to save audio output files (audio completions) |
|
dict |
|
Any extra provider/model-specific parameters (e.g., |
Custom kwargs:
Any additional keyword arguments not matching a known field will be stored in kwargs and passed to the underlying API. This allows you to use new or provider-specific parameters without waiting for library updates.
Example:
params = mll.GeneratorCompletionParameters(
temperature=0.7,
max_tokens=256,
top_p=0.95, # Passed via kwargs
frequency_penalty=0.1 # Passed via kwargs
)