Python API for Agent Tool-Calling¶
The DeepFabric Python API provides programmatic access to agent tool-calling dataset generation with full control over configuration, tool definitions, and generation parameters.
Basic Usage¶
Single-Turn Agent Dataset¶
import asyncio
from deepfabric import DataSetGenerator
from deepfabric.dataset import Dataset
from deepfabric.tree import Tree
async def generate_agent_dataset():
    # Create topic tree
    tree = Tree(
        topic_prompt="Real-world scenarios requiring tool usage",
        provider="openai",
        model_name="gpt-4o-mini",
        degree=3,
        depth=2,
        temperature=0.7
    )
    topics = await tree.generate()

    # Create agent dataset generator
    generator = DataSetGenerator(
        generation_system_prompt="You excel at systematic tool reasoning.",
        provider="openai",
        model_name="gpt-4o-mini",
        conversation_type="agent_cot_tools",
        available_tools=["get_weather", "search_web", "calculator"],
        max_tools_per_query=2,
        topics=topics
    )

    # Generate samples
    samples = await generator.generate()

    # Create and save dataset
    dataset = Dataset.from_list(samples)
    dataset.save("agent_dataset.jsonl")
    return dataset

# Run the generation
dataset = asyncio.run(generate_agent_dataset())
print(f"Generated {len(dataset)} agent samples")
Multi-Turn Agent Dataset¶
async def generate_multi_turn_dataset():
    generator = DataSetGenerator(
        generation_system_prompt="Create multi-turn conversations with progressive tool usage.",
        provider="openai",
        model_name="gpt-4o",
        conversation_type="agent_cot_multi_turn",
        available_tools=["get_weather", "book_restaurant", "calculator"],
        max_tools_per_query=3,
        temperature=0.8
    )

    samples = await generator.generate()

    dataset = Dataset.from_list(samples)
    dataset.save("multi_turn_agent.jsonl")
    return dataset
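As with the single-turn example, the coroutine is driven with asyncio.run when called from synchronous code:

dataset = asyncio.run(generate_multi_turn_dataset())
print(f"Generated {len(dataset)} multi-turn samples")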
Advanced Configuration¶
Custom Tool Integration¶
from deepfabric.schemas import ToolDefinition, ToolParameter
from deepfabric.tools.defaults import get_default_tools
# Define custom tools using Pydantic models
custom_booking_tool = ToolDefinition(
    name="book_restaurant",
    description="Book a restaurant reservation",
    parameters=[
        ToolParameter(
            name="restaurant",
            type="str",
            description="Restaurant name",
            required=True
        ),
        ToolParameter(
            name="party_size",
            type="int",
            description="Number of people",
            required=True
        ),
        ToolParameter(
            name="date",
            type="str",
            description="Reservation date",
            required=True
        ),
        ToolParameter(
            name="time",
            type="str",
            description="Preferred time",
            required=True
        )
    ],
    returns="Reservation confirmation with details",
    category="booking"
)

# Create generator with custom tools
generator = DataSetGenerator(
    generation_system_prompt="You are an AI agent with restaurant booking capabilities.",
    provider="openai",
    model_name="gpt-4o",
    conversation_type="agent_cot_tools",
    available_tools=["get_weather", "book_restaurant"],  # Mix default and custom tools
    custom_tools=[custom_booking_tool.model_dump()],     # Custom tools passed as dicts
    max_tools_per_query=2
)
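Because ToolDefinition is a Pydantic model, it is easy to serialize and inspect exactly what the generator receives; the JSON dump below is purely for eyeballing the structure:

import json

# Inspect the dict form that custom_tools= expects
print(json.dumps(custom_booking_tool.model_dump(), indent=2))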
Loading Tools from Files¶
from deepfabric.tools.loader import load_tools_from_file, get_available_tools
# Load tools from YAML file
custom_tools = load_tools_from_file("custom_tools.yaml")
# Get available tools (defaults + custom tools)
all_tools = get_available_tools(
    available_tools=["get_weather", "book_restaurant"],
    custom_tool_registry=custom_tools
)

generator = DataSetGenerator(
    generation_system_prompt="You have access to specialized tools.",
    provider="openai",
    model_name="gpt-4o",
    conversation_type="agent_cot_tools",
    tool_registry_path="custom_tools.yaml",
    available_tools=["get_weather", "book_restaurant", "analyze_stock"]
)
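If load_tools_from_file returns a registry object whose tools attribute holds ToolDefinition instances (as the production workflow at the end of this page assumes), you can list what was loaded before wiring it into a generator:

# Hypothetical inspection step; relies on the registry exposing a .tools list
for tool in custom_tools.tools:
    print(f"{tool.name}: {tool.description}")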
Configuration Classes¶
DataSetGeneratorConfig¶
from deepfabric.generator import DataSetGeneratorConfig
# Create configuration object
config = DataSetGeneratorConfig(
    generation_system_prompt="You excel at tool reasoning.",
    provider="openai",
    model_name="gpt-4o",
    conversation_type="agent_cot_tools",
    reasoning_style="general",
    available_tools=["get_weather", "calculator"],
    max_tools_per_query=2,
    temperature=0.8,
    max_retries=3
)

# Use configuration with generator
generator = DataSetGenerator(**config.model_dump())
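Since the config exposes model_dump, it behaves like a Pydantic model; assuming Pydantic v2, model_copy lets you derive per-run variants without rebuilding the whole object:

# Derive a cheaper variant of the same configuration (illustrative tweak)
cheap_config = config.model_copy(update={"model_name": "gpt-4o-mini", "temperature": 0.6})
cheap_generator = DataSetGenerator(**cheap_config.model_dump())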
From YAML Configuration¶
from deepfabric.config import DeepFabricConfig
# Load from YAML file
config = DeepFabricConfig.from_yaml("agent_config.yaml")
# Extract generator parameters
engine_params = config.get_engine_params()
# Create generator
generator = DataSetGenerator(**engine_params)
Provider Configuration¶
OpenAI Configuration¶
generator = DataSetGenerator(
    provider="openai",
    model_name="gpt-4o",  # or gpt-4o-mini, gpt-4-turbo
    temperature=0.8,
    max_retries=3,
    # OpenAI-specific parameters
    timeout=60,
    # API key from environment: OPENAI_API_KEY
)
Anthropic Configuration¶
generator = DataSetGenerator(
    provider="anthropic",
    model_name="claude-3-5-sonnet-20241022",
    temperature=0.7,
    max_retries=2,
    # API key from environment: ANTHROPIC_API_KEY
)
Local/Ollama Configuration¶
generator = DataSetGenerator(
    provider="ollama",
    model_name="llama3.1:8b",
    temperature=0.6,
    max_retries=5,
    # No API key required for local models
)
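The three setups above differ only in a few keyword arguments, so a small preset dictionary keeps them in one place. This is a plain Python convenience pattern, not part of the DeepFabric API:

PROVIDER_PRESETS = {
    "openai": {"provider": "openai", "model_name": "gpt-4o", "temperature": 0.8, "max_retries": 3},
    "anthropic": {"provider": "anthropic", "model_name": "claude-3-5-sonnet-20241022", "temperature": 0.7, "max_retries": 2},
    "ollama": {"provider": "ollama", "model_name": "llama3.1:8b", "temperature": 0.6, "max_retries": 5},
}

def make_generator(preset: str, **overrides) -> DataSetGenerator:
    """Build a generator from a named preset, letting callers override any field."""
    params = {**PROVIDER_PRESETS[preset], **overrides}
    return DataSetGenerator(**params)

# Example: local generation with the shared agent settings
local_generator = make_generator(
    "ollama",
    generation_system_prompt="You excel at tool reasoning.",
    conversation_type="agent_cot_tools",
    available_tools=["get_weather", "calculator"],
)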
Batch Generation¶
Large Dataset Generation¶
async def generate_large_dataset():
    generator = DataSetGenerator(
        generation_system_prompt="Create diverse tool usage scenarios.",
        provider="openai",
        model_name="gpt-4o-mini",  # Cost-effective for large datasets
        conversation_type="agent_cot_tools",
        available_tools=["get_weather", "search_web", "calculator"],
        max_tools_per_query=2
    )

    # Generate in batches
    all_samples = []
    total_batches = 5

    for i in range(total_batches):
        print(f"Generating batch {i + 1}/{total_batches}")
        batch_samples = await generator.generate()
        all_samples.extend(batch_samples)

        # Optional: save intermediate results every other batch
        if i % 2 == 0:
            temp_dataset = Dataset.from_list(all_samples)
            temp_dataset.save(f"agent_dataset_batch_{i}.jsonl")

    # Final dataset
    dataset = Dataset.from_list(all_samples)
    dataset.save("agent_dataset_complete.jsonl")
    return dataset
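Repeated batches from the same generator can overlap; a simple de-duplication pass keyed on the question field (assuming the agent_cot_tools sample layout shown under Quality Control below) is a cheap safeguard before the final save:

def deduplicate_samples(samples: list[dict]) -> list[dict]:
    """Keep only the first sample seen for each distinct question string."""
    seen = set()
    unique = []
    for sample in samples:
        key = sample.get("question", "")
        if key not in seen:
            seen.add(key)
            unique.append(sample)
    return unique

# e.g. all_samples = deduplicate_samples(all_samples) before building the final Dataset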
Parallel Generation¶
import asyncio

# weather_topics, calculation_topics and booking_topics are topic lists
# produced by earlier Tree runs, one per scenario family

async def parallel_generation():
    # Create multiple generators for different scenarios
    generators = [
        DataSetGenerator(
            generation_system_prompt="Focus on weather-related tool usage.",
            conversation_type="agent_cot_tools",
            available_tools=["get_weather", "search_web"],
            topics=weather_topics
        ),
        DataSetGenerator(
            generation_system_prompt="Focus on calculation and analysis.",
            conversation_type="agent_cot_tools",
            available_tools=["calculator", "analyze_stock"],
            topics=calculation_topics
        ),
        DataSetGenerator(
            generation_system_prompt="Focus on booking and reservations.",
            conversation_type="agent_cot_tools",
            available_tools=["book_restaurant", "search_web"],
            topics=booking_topics
        )
    ]

    # Generate in parallel
    tasks = [gen.generate() for gen in generators]
    results = await asyncio.gather(*tasks)

    # Combine results
    all_samples = []
    for samples in results:
        all_samples.extend(samples)

    return Dataset.from_list(all_samples)
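The combined result is an ordinary Dataset, so running the parallel job and saving it follows the same pattern as the earlier examples:

combined = asyncio.run(parallel_generation())
combined.save("parallel_agent_dataset.jsonl")
print(f"Generated {len(combined)} samples across all scenario families")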
Quality Control¶
Validation and Filtering¶
def validate_agent_sample(sample):
    """Validate agent CoT sample quality."""
    required_fields = ["question", "tool_planning", "tool_executions", "final_answer"]

    # Check required fields
    if not all(field in sample for field in required_fields):
        return False

    # Check tool usage
    if len(sample["tool_executions"]) == 0:
        return False

    # Check reasoning quality
    if len(sample["tool_planning"]) == 0:
        return False

    return True

async def generate_validated_dataset():
    generator = DataSetGenerator(
        generation_system_prompt="Create high-quality agent reasoning examples.",
        conversation_type="agent_cot_tools",
        available_tools=["get_weather", "calculator", "search_web"],
        max_tools_per_query=3
    )

    valid_samples = []
    attempts = 0
    max_attempts = 100

    while len(valid_samples) < 50 and attempts < max_attempts:
        samples = await generator.generate()

        for sample in samples:
            if validate_agent_sample(sample):
                valid_samples.append(sample)

        attempts += 1
        print(f"Valid samples: {len(valid_samples)}, Attempts: {attempts}")

    return Dataset.from_list(valid_samples)
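A quick distribution over the validated samples confirms that multi-tool scenarios are actually being produced; it relies only on tool_executions being the list checked in the validator above:

from collections import Counter

def tool_call_distribution(samples: list[dict]) -> Counter:
    """Count how many samples use 1, 2, 3, ... tool calls."""
    return Counter(len(sample["tool_executions"]) for sample in samples)

# e.g. print(tool_call_distribution(valid_samples))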
Output Formatting¶
Apply Formatters Programmatically¶
# Generate the raw dataset
dataset = asyncio.run(generate_agent_dataset())

# Apply formatters
formatter_configs = [
    {
        "name": "tool_calling",
        "template": "builtin://tool_calling",
        "output": "agent_tool_calling.jsonl",
        "config": {
            "system_prompt": "You are a function calling AI model.",
            "include_tools_in_system": True,
            "thinking_format": "<think>{reasoning}</think>",
            "tool_call_format": "<tool_call>\n{tool_call}\n</tool_call>",
            "tool_response_format": "<tool_response>\n{tool_output}\n</tool_response>"
        }
    }
]

formatted_datasets = dataset.apply_formatters(formatter_configs)
tool_calling_dataset = formatted_datasets["tool_calling"]

print(f"Original samples: {len(dataset)}")
print(f"Formatted samples: {len(tool_calling_dataset)}")
Error Handling¶
Robust Generation with Retry Logic¶
async def robust_generation():
    max_retries = 3
    retry_count = 0

    while retry_count < max_retries:
        try:
            generator = DataSetGenerator(
                generation_system_prompt="Create agent tool usage examples.",
                provider="openai",
                model_name="gpt-4o-mini",
                conversation_type="agent_cot_tools",
                available_tools=["get_weather", "calculator"],
                max_retries=2  # Per-sample retries handled inside the generator
            )
            samples = await generator.generate()

            if samples:
                return Dataset.from_list(samples)

            # An empty batch without an exception still counts as a failed attempt
            retry_count += 1
        except Exception as e:
            retry_count += 1
            print(f"Generation failed (attempt {retry_count}): {e}")

            if retry_count < max_retries:
                print("Retrying in 5 seconds...")
                await asyncio.sleep(5)
            else:
                print("Max retries exceeded")
                raise

    return None
Integration Examples¶
Complete Production Workflow¶
async def production_agent_dataset():
    """Complete production workflow for agent dataset generation."""
    # 1. Generate topic tree
    tree = Tree(
        topic_prompt="Professional scenarios requiring intelligent tool usage",
        provider="openai",
        model_name="gpt-4o-mini",
        degree=4,
        depth=3,
        temperature=0.7
    )
    topics = await tree.generate()
    print(f"Generated {len(topics)} topics")

    # 2. Load custom tools
    custom_tools = load_tools_from_file("production_tools.yaml")

    # 3. Generate agent dataset
    generator = DataSetGenerator(
        generation_system_prompt="Create realistic professional agent scenarios.",
        provider="openai",
        model_name="gpt-4o",
        conversation_type="agent_cot_tools",
        available_tools=["get_weather", "search_web", "book_restaurant", "analyze_stock"],
        custom_tools=[tool.model_dump() for tool in custom_tools.tools],
        max_tools_per_query=3,
        temperature=0.8,
        topics=topics
    )
    samples = await generator.generate()

    # 4. Create and validate dataset
    dataset = Dataset.from_list(samples)

    # 5. Apply multiple formatters
    formatters = [
        {
            "name": "tool_calling",
            "template": "builtin://tool_calling",
            "output": "production_tool_calling.jsonl"
        },
        {
            "name": "conversation",
            "template": "builtin://conversation",
            "output": "production_conversation.jsonl"
        }
    ]
    formatted_datasets = dataset.apply_formatters(formatters)

    # 6. Save results
    dataset.save("production_agent_raw.jsonl")

    return {
        "raw_dataset": dataset,
        "formatted_datasets": formatted_datasets,
        "topics": topics
    }

# Run the production workflow
results = asyncio.run(production_agent_dataset())
print("Production dataset generation complete!")
This API provides full programmatic control over agent tool-calling dataset generation, enabling sophisticated workflows and integration with existing ML pipelines.