Benchmarks
The benchmarks namespace provides A/B testing and research benchmarking capabilities for comparing different query processing modes and evaluating system performance.
Basic Operations
Run a Benchmark
```python
from functor_sdk import FunctorClient

client = FunctorClient(api_key="your-api-key")

# Run a simple benchmark
result = client.benchmarks.run_benchmark(
    query="What is the relationship between AI and machine learning?",
    modes=["kg_rag", "vector"],
    evaluators=["accuracy", "latency"]
)

print(f"Benchmark ID: {result['benchmark_id']}")
print(f"Winner: {result['winner']}")

# View results for each mode
for mode, metrics in result['results'].items():
    print(f"\n{mode}:")
    print(f"  Accuracy: {metrics['accuracy']:.2f}")
    print(f"  Latency: {metrics['latency_ms']:.1f}ms")
```
Get Benchmark Results
```python
# Retrieve results from a previous benchmark
results = client.benchmarks.get_results("benchmark_abc123")

print(f"Status: {results['status']}")
print(f"Winner: {results['winner']}")
print(f"Query: {results['query']}")

# Analyze detailed results
for mode, metrics in results['results'].items():
    print(f"\n{mode} Results:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")
```
Complete Parameter Reference
run_benchmark() Parameters
```python
result = client.benchmarks.run_benchmark(
    query="Your query here",              # Required: Query to benchmark
    user_id="user_123",                   # Optional: User ID for tracking
    evaluators=["accuracy", "latency"],   # Optional: List of evaluators
    modes=["kg_rag", "vector", "sql"],    # Optional: Modes to compare
    iterations=1                          # Optional: Number of iterations (default: 1)
)
```
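If you want to catch typos in mode or evaluator names before submitting a run, a thin wrapper like the hypothetical sketch below can check them against the lists documented in the next two sections. `run_validated_benchmark`, `VALID_MODES`, and `VALID_EVALUATORS` are illustrative names, not part of the SDK:

```python
# Hypothetical helper: validate mode and evaluator names against the
# documented lists before calling run_benchmark().
VALID_MODES = {"kg_rag", "vector", "sql", "hybrid"}
VALID_EVALUATORS = {"accuracy", "latency", "cost", "quality", "relevance"}

def run_validated_benchmark(client, query, modes, evaluators, **kwargs):
    unknown_modes = set(modes) - VALID_MODES
    unknown_evaluators = set(evaluators) - VALID_EVALUATORS
    if unknown_modes or unknown_evaluators:
        raise ValueError(
            f"Unknown modes {unknown_modes or '{}'} or evaluators {unknown_evaluators or '{}'}"
        )
    return client.benchmarks.run_benchmark(
        query=query, modes=modes, evaluators=evaluators, **kwargs
    )
```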
get_results() Parameters
```python
results = client.benchmarks.get_results(
    benchmark_id="benchmark_abc123"  # Required: Benchmark ID
)
```
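Benchmarks can take a while to finish, so the `status` field returned by `get_results()` (shown earlier) can be polled until the run is done. The helper below is a minimal sketch: the `wait_for_benchmark` name and the specific in-progress status strings are assumptions, so adjust them to the values your deployment actually returns.

```python
import time

def wait_for_benchmark(client, benchmark_id, poll_seconds=5, timeout_seconds=300):
    """Poll get_results() until the benchmark leaves an in-progress state.

    The status values checked here ("pending", "running") are assumptions,
    not documented constants.
    """
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        results = client.benchmarks.get_results(benchmark_id)
        if results['status'] not in ("pending", "running"):
            return results
        time.sleep(poll_seconds)
    raise TimeoutError(f"Benchmark {benchmark_id} did not finish within {timeout_seconds}s")
```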
Available Modes
The following query processing modes are available for benchmarking:
- kg_rag - Knowledge Graph Retrieval Augmented Generation
- vector - Vector similarity search
- sql - SQL-based retrieval
- hybrid - Combination of multiple approaches
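For example, to see whether the combined approach pays off for a given query, you can benchmark hybrid directly against kg_rag. This is a minimal sketch, assuming the `client` created in the earlier examples:

```python
# Head-to-head comparison of the hybrid mode against kg_rag
result = client.benchmarks.run_benchmark(
    query="Summarize the key differences between supervised and unsupervised learning",
    modes=["kg_rag", "hybrid"],
    evaluators=["accuracy", "latency"]
)
print(f"Winner: {result['winner']}")
```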
Available Evaluators
These evaluators can be used to assess benchmark results:
- accuracy - Response accuracy and relevance
- latency - Response time in milliseconds
- cost - Estimated cost per query
- quality - Overall quality score
- relevance - Relevance of retrieved information
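A quick way to see what each evaluator reports is to request all of them and iterate over whatever metric keys come back. Beyond `accuracy`, `latency_ms`, and `cost_usd` (used elsewhere on this page), the exact result keys are not documented here, so the sketch below keeps the loop generic:

```python
# Request every documented evaluator and print whatever metrics are returned
result = client.benchmarks.run_benchmark(
    query="What is retrieval augmented generation?",
    modes=["kg_rag", "vector"],
    evaluators=["accuracy", "latency", "cost", "quality", "relevance"]
)

for mode, metrics in result['results'].items():
    print(f"{mode}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")
```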
Advanced Usage Patterns
Multi-Mode Comparison
```python
def compare_all_modes(query):
    client = FunctorClient()

    # Run benchmark across all modes
    result = client.benchmarks.run_benchmark(
        query=query,
        modes=["kg_rag", "vector", "sql", "hybrid"],
        evaluators=["accuracy", "latency", "cost"],
        iterations=3  # Run multiple times for consistency
    )

    # Analyze results
    comparison = {
        "query": query,
        "winner": result['winner'],
        "modes": {}
    }

    for mode, metrics in result['results'].items():
        comparison['modes'][mode] = {
            "accuracy": metrics['accuracy'],
            "latency": metrics['latency_ms'],
            "cost": metrics['cost_usd'],
            "score": (
                metrics['accuracy'] * 0.5 +
                (1 - metrics['latency_ms'] / 10000) * 0.3 +
                (1 - metrics['cost_usd'] / 0.1) * 0.2
            )
        }

    return comparison

# Usage
comparison = compare_all_modes("Explain quantum computing")
print(f"Best mode: {comparison['winner']}")
for mode, scores in comparison['modes'].items():
    print(f"\n{mode}: score={scores['score']:.2f}")
    print(f"  Accuracy: {scores['accuracy']:.2f}")
    print(f"  Latency: {scores['latency']:.1f}ms")
    print(f"  Cost: ${scores['cost']:.4f}")
```
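The composite score above is only one possible weighting: accuracy counts for 50%, latency for 30% (normalized against an arbitrary 10-second ceiling), and cost for 20% (normalized against a $0.10 ceiling). Tune the weights and normalization caps to match your own latency and budget targets.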
Benchmark Suite
```python
def run_benchmark_suite(queries):
    """Run benchmarks for a suite of queries."""
    client = FunctorClient()
    results = []

    for query in queries:
        print(f"Benchmarking: {query}")
        result = client.benchmarks.run_benchmark(
            query=query,
            modes=["kg_rag", "vector"],
            evaluators=["accuracy", "latency"]
        )
        results.append({
            "query": query,
            "benchmark_id": result['benchmark_id'],
            "winner": result['winner'],
            "results": result['results']
        })

    # Aggregate results
    mode_wins = {}
    for result in results:
        winner = result['winner']
        mode_wins[winner] = mode_wins.get(winner, 0) + 1

    return {
        "total_queries": len(queries),
        "results": results,
        "mode_wins": mode_wins,
        "overall_winner": max(mode_wins, key=mode_wins.get)
    }

# Usage
queries = [
    "What is machine learning?",
    "Explain neural networks",
    "How does gradient descent work?"
]

suite_results = run_benchmark_suite(queries)
print(f"Overall winner: {suite_results['overall_winner']}")
print(f"Mode wins: {suite_results['mode_wins']}")
```
Performance Tracking
```python
import json
from datetime import datetime

class BenchmarkTracker:
    def __init__(self, client):
        self.client = client
        self.history = []

    def run_tracked_benchmark(self, query, modes, evaluators):
        """Run benchmark and track results."""
        result = self.client.benchmarks.run_benchmark(
            query=query,
            modes=modes,
            evaluators=evaluators
        )

        # Track benchmark
        self.history.append({
            "timestamp": datetime.now().isoformat(),
            "benchmark_id": result['benchmark_id'],
            "query": query,
            "winner": result['winner'],
            "results": result['results']
        })

        return result

    def get_mode_statistics(self):
        """Get win counts and average metrics for each winning mode."""
        mode_stats = {}

        for benchmark in self.history:
            winner = benchmark['winner']
            metrics = benchmark['results'].get(winner, {})
            stats = mode_stats.setdefault(winner, {
                "wins": 0,
                "avg_accuracy": 0.0,
                "avg_latency": 0.0
            })

            # Update running averages from the winning mode's metrics
            # (metrics that were not evaluated default to 0)
            n = stats["wins"]
            stats["avg_accuracy"] = (stats["avg_accuracy"] * n + metrics.get("accuracy", 0.0)) / (n + 1)
            stats["avg_latency"] = (stats["avg_latency"] * n + metrics.get("latency_ms", 0.0)) / (n + 1)
            stats["wins"] = n + 1

        return mode_stats

    def export_history(self, filename):
        """Export benchmark history."""
        with open(filename, 'w') as f:
            json.dump(self.history, f, indent=2)
        return filename

# Usage
client = FunctorClient()
tracker = BenchmarkTracker(client)

# Run benchmarks
tracker.run_tracked_benchmark(
    "What is AI?",
    modes=["kg_rag", "vector"],
    evaluators=["accuracy"]
)

# Get statistics
stats = tracker.get_mode_statistics()
print(f"Mode statistics: {stats}")

# Export
tracker.export_history("benchmark_history.json")
```
Async Benchmarking
```python
import asyncio
from functor_sdk import FunctorClient

async def run_parallel_benchmarks(queries):
    """Run multiple benchmarks in parallel."""
    async with FunctorClient() as client:
        # Create benchmark tasks
        tasks = [
            client.benchmarks.run_benchmark_async(
                query=query,
                modes=["kg_rag", "vector"],
                evaluators=["accuracy", "latency"]
            )
            for query in queries
        ]

        # Run all benchmarks concurrently
        results = await asyncio.gather(*tasks)

        # Analyze results
        analysis = {
            "total_benchmarks": len(results),
            "winners": {}
        }

        for result in results:
            winner = result['winner']
            analysis['winners'][winner] = analysis['winners'].get(winner, 0) + 1

        return analysis

# Usage
queries = [
    "What is deep learning?",
    "Explain transformers",
    "How does attention work?"
]

analysis = asyncio.run(run_parallel_benchmarks(queries))
print(f"Benchmarks completed: {analysis['total_benchmarks']}")
print(f"Winner distribution: {analysis['winners']}")
```
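If you have many queries, firing them all at once may run into API rate limits. The sketch below builds on the async example above and caps concurrency with `asyncio.Semaphore`; the limit of 3 and the `run_limited_benchmarks` name are illustrative assumptions, not SDK features.

```python
import asyncio
from functor_sdk import FunctorClient

async def run_limited_benchmarks(queries, max_concurrency=3):
    """Sketch: cap the number of in-flight benchmark requests.

    max_concurrency is an assumption; pick a limit that matches your
    account's rate limits.
    """
    semaphore = asyncio.Semaphore(max_concurrency)

    async with FunctorClient() as client:
        async def run_one(query):
            async with semaphore:  # at most max_concurrency requests at a time
                return await client.benchmarks.run_benchmark_async(
                    query=query,
                    modes=["kg_rag", "vector"],
                    evaluators=["accuracy", "latency"]
                )

        return await asyncio.gather(*(run_one(q) for q in queries))
```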
Next Steps
- Analytics Namespace - Track usage and analyze costs
- Queries Namespace - Execute queries across different modes
- Usage & Analytics API - Direct API reference for benchmarking