
Benchmarks

The benchmarks namespace provides A/B testing and research benchmarking for comparing query processing modes and evaluating system performance.

Basic Operations

Run a Benchmark

from functor_sdk import FunctorClient

client = FunctorClient(api_key="your-api-key")

# Run a simple benchmark
result = client.benchmarks.run_benchmark(
    query="What is the relationship between AI and machine learning?",
    modes=["kg_rag", "vector"],
    evaluators=["accuracy", "latency"]
)

print(f"Benchmark ID: {result['benchmark_id']}")
print(f"Winner: {result['winner']}")

# View results for each mode
for mode, metrics in result['results'].items():
    print(f"\n{mode}:")
    print(f"  Accuracy: {metrics['accuracy']:.2f}")
    print(f"  Latency: {metrics['latency_ms']:.1f}ms")

Get Benchmark Results

# Retrieve results from a previous benchmark
results = client.benchmarks.get_results("benchmark_abc123")

print(f"Status: {results['status']}")
print(f"Winner: {results['winner']}")
print(f"Query: {results['query']}")

# Analyze detailed results
for mode, metrics in results['results'].items():
    print(f"\n{mode} Results:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")

Complete Parameter Reference

run_benchmark() Parameters

result = client.benchmarks.run_benchmark(
    query="Your query here",              # Required: Query to benchmark
    user_id="user_123",                   # Optional: User ID for tracking
    evaluators=["accuracy", "latency"],   # Optional: List of evaluators
    modes=["kg_rag", "vector", "sql"],    # Optional: Modes to compare
    iterations=1                          # Optional: Number of iterations (default: 1)
)

get_results() Parameters

results = client.benchmarks.get_results(
    benchmark_id="benchmark_abc123"  # Required: Benchmark ID
)

Available Modes

The following query processing modes are available for benchmarking:

  • kg_rag - Knowledge Graph Retrieval Augmented Generation
  • vector - Vector similarity search
  • sql - SQL-based retrieval
  • hybrid - Combination of multiple approaches
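
Modes can be mixed freely in a single run. The snippet below is a small sketch that pits hybrid against the approaches it combines; it assumes the result shape shown earlier, with accuracy and latency_ms reported per mode.

# Compare the hybrid mode against the approaches it combines
result = client.benchmarks.run_benchmark(
    query="What is the relationship between AI and machine learning?",
    modes=["hybrid", "kg_rag", "vector"],
    evaluators=["accuracy", "latency"]
)

baseline = result['results']['hybrid']
for mode in ("kg_rag", "vector"):
    metrics = result['results'][mode]
    accuracy_delta = metrics['accuracy'] - baseline['accuracy']
    print(f"{mode}: accuracy {accuracy_delta:+.2f} vs hybrid, "
          f"{metrics['latency_ms']:.1f}ms vs {baseline['latency_ms']:.1f}ms")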

Available Evaluators

These evaluators can be used to assess benchmark results:

  • accuracy - Response accuracy and relevance
  • latency - Response time in milliseconds
  • cost - Estimated cost per query
  • quality - Overall quality score
  • relevance - Relevance of retrieved information
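
Evaluators can be combined in a single run. The sketch below requests all five and prints whatever metrics each mode reports; only the accuracy, latency_ms, and cost_usd keys appear in the examples in this document, so the loop treats the remaining metric names generically rather than assuming them.

# Request several evaluators at once and inspect the reported metrics
result = client.benchmarks.run_benchmark(
    query="Explain retrieval augmented generation",
    modes=["kg_rag", "vector"],
    evaluators=["accuracy", "latency", "cost", "quality", "relevance"]
)

for mode, metrics in result['results'].items():
    print(f"\n{mode}:")
    for metric, value in metrics.items():
        # Metric keys vary by evaluator (e.g. accuracy, latency_ms, cost_usd)
        print(f"  {metric}: {value}")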

Advanced Usage Patterns

Multi-Mode Comparison

def compare_all_modes(query):
    client = FunctorClient()

    # Run benchmark across all modes
    result = client.benchmarks.run_benchmark(
        query=query,
        modes=["kg_rag", "vector", "sql", "hybrid"],
        evaluators=["accuracy", "latency", "cost"],
        iterations=3  # Run multiple times for consistency
    )

    # Analyze results
    comparison = {
        "query": query,
        "winner": result['winner'],
        "modes": {}
    }
    for mode, metrics in result['results'].items():
        comparison['modes'][mode] = {
            "accuracy": metrics['accuracy'],
            "latency": metrics['latency_ms'],
            "cost": metrics['cost_usd'],
            "score": (metrics['accuracy'] * 0.5 +
                      (1 - metrics['latency_ms'] / 10000) * 0.3 +
                      (1 - metrics['cost_usd'] / 0.1) * 0.2)
        }
    return comparison

# Usage
comparison = compare_all_modes("Explain quantum computing")
print(f"Best mode: {comparison['winner']}")

for mode, scores in comparison['modes'].items():
    print(f"\n{mode}: score={scores['score']:.2f}")
    print(f"  Accuracy: {scores['accuracy']:.2f}")
    print(f"  Latency: {scores['latency']:.1f}ms")
    print(f"  Cost: ${scores['cost']:.4f}")

Benchmark Suite

def run_benchmark_suite(queries):
    """Run benchmarks for a suite of queries."""
    client = FunctorClient()
    results = []

    for query in queries:
        print(f"Benchmarking: {query}")
        result = client.benchmarks.run_benchmark(
            query=query,
            modes=["kg_rag", "vector"],
            evaluators=["accuracy", "latency"]
        )
        results.append({
            "query": query,
            "benchmark_id": result['benchmark_id'],
            "winner": result['winner'],
            "results": result['results']
        })

    # Aggregate results
    mode_wins = {}
    for result in results:
        winner = result['winner']
        mode_wins[winner] = mode_wins.get(winner, 0) + 1

    return {
        "total_queries": len(queries),
        "results": results,
        "mode_wins": mode_wins,
        "overall_winner": max(mode_wins, key=mode_wins.get)
    }

# Usage
queries = [
    "What is machine learning?",
    "Explain neural networks",
    "How does gradient descent work?"
]
suite_results = run_benchmark_suite(queries)
print(f"Overall winner: {suite_results['overall_winner']}")
print(f"Mode wins: {suite_results['mode_wins']}")

Performance Tracking

import json
from datetime import datetime

class BenchmarkTracker:
    def __init__(self, client):
        self.client = client
        self.history = []

    def run_tracked_benchmark(self, query, modes, evaluators):
        """Run benchmark and track results."""
        result = self.client.benchmarks.run_benchmark(
            query=query,
            modes=modes,
            evaluators=evaluators
        )

        # Track benchmark
        self.history.append({
            "timestamp": datetime.now().isoformat(),
            "benchmark_id": result['benchmark_id'],
            "query": query,
            "winner": result['winner'],
            "results": result['results']
        })
        return result
    def get_mode_statistics(self):
        """Get win counts and average metrics for each winning mode."""
        mode_stats = {}
        for benchmark in self.history:
            winner = benchmark['winner']
            metrics = benchmark['results'].get(winner, {})
            if winner not in mode_stats:
                mode_stats[winner] = {
                    "wins": 0,
                    "avg_accuracy": 0,
                    "avg_latency": 0
                }
            stats = mode_stats[winner]
            stats["wins"] += 1
            # Update running averages from the winning mode's recorded metrics
            stats["avg_accuracy"] += (metrics.get('accuracy', 0) - stats["avg_accuracy"]) / stats["wins"]
            stats["avg_latency"] += (metrics.get('latency_ms', 0) - stats["avg_latency"]) / stats["wins"]
        return mode_stats

    def export_history(self, filename):
        """Export benchmark history to a JSON file."""
        with open(filename, 'w') as f:
            json.dump(self.history, f, indent=2)
        return filename
# Usage
client = FunctorClient()
tracker = BenchmarkTracker(client)

# Run benchmarks
tracker.run_tracked_benchmark(
    "What is AI?",
    modes=["kg_rag", "vector"],
    evaluators=["accuracy"]
)

# Get statistics
stats = tracker.get_mode_statistics()
print(f"Mode statistics: {stats}")

# Export
tracker.export_history("benchmark_history.json")

Async Benchmarking

import asyncio
from functor_sdk import FunctorClient

async def run_parallel_benchmarks(queries):
    """Run multiple benchmarks in parallel."""
    async with FunctorClient() as client:
        # Create benchmark tasks
        tasks = [
            client.benchmarks.run_benchmark_async(
                query=query,
                modes=["kg_rag", "vector"],
                evaluators=["accuracy", "latency"]
            )
            for query in queries
        ]

        # Run all benchmarks concurrently
        results = await asyncio.gather(*tasks)

        # Analyze results
        analysis = {
            "total_benchmarks": len(results),
            "winners": {}
        }
        for result in results:
            winner = result['winner']
            analysis['winners'][winner] = analysis['winners'].get(winner, 0) + 1
        return analysis

# Usage
queries = [
    "What is deep learning?",
    "Explain transformers",
    "How does attention work?"
]
analysis = asyncio.run(run_parallel_benchmarks(queries))
print(f"Benchmarks completed: {analysis['total_benchmarks']}")
print(f"Winner distribution: {analysis['winners']}")

Next Steps