
Benchmarks

The benchmarks namespace provides A/B testing and research benchmarking for comparing query processing modes and evaluating system performance.

Basic Operations

Run a Benchmark

from functor_sdk import FunctorClient

client = FunctorClient(api_key="your-api-key")

# Run a simple benchmark
result = client.benchmarks.run_benchmark(
    query="What is the relationship between AI and machine learning?",
    modes=["kg_rag", "vector"],
    evaluators=["accuracy", "latency"]
)

print(f"Benchmark ID: {result['benchmark_id']}")
print(f"Winner: {result['winner']}")

# View results for each mode
for mode, metrics in result['results'].items():
    print(f"\n{mode}:")
    print(f"  Accuracy: {metrics['accuracy']:.2f}")
    print(f"  Latency: {metrics['latency_ms']:.1f}ms")

Get Benchmark Results

# Retrieve results from a previous benchmark
results = client.benchmarks.get_results("benchmark_abc123")

print(f"Status: {results['status']}")
print(f"Winner: {results['winner']}")
print(f"Query: {results['query']}")

# Analyze detailed results
for mode, metrics in results['results'].items():
    print(f"\n{mode} Results:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")

Complete Parameter Reference

run_benchmark() Parameters

result = client.benchmarks.run_benchmark(
    query="Your query here",              # Required: Query to benchmark
    user_id="user_123",                   # Optional: User ID for tracking
    evaluators=["accuracy", "latency"],   # Optional: List of evaluators
    modes=["kg_rag", "vector", "sql"],    # Optional: Modes to compare
    iterations=1                          # Optional: Number of iterations (default: 1)
)

get_results() Parameters

results = client.benchmarks.get_results(
    benchmark_id="benchmark_abc123"  # Required: Benchmark ID
)

Available Modes

The following query processing modes are available for benchmarking:

  • kg_rag - Knowledge Graph Retrieval Augmented Generation
  • vector - Vector similarity search
  • sql - SQL-based retrieval
  • hybrid - Combination of multiple approaches
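
Modes can be mixed freely in a single run. The snippet below is a small sketch that pits hybrid against the approaches it combines; it assumes the result shape shown earlier, with accuracy and latency_ms reported per mode.

# Compare the hybrid mode against the approaches it combines
result = client.benchmarks.run_benchmark(
    query="What is the relationship between AI and machine learning?",
    modes=["hybrid", "kg_rag", "vector"],
    evaluators=["accuracy", "latency"]
)

baseline = result['results']['hybrid']
for mode in ("kg_rag", "vector"):
    metrics = result['results'][mode]
    accuracy_delta = metrics['accuracy'] - baseline['accuracy']
    print(f"{mode}: accuracy {accuracy_delta:+.2f} vs hybrid, "
          f"{metrics['latency_ms']:.1f}ms vs {baseline['latency_ms']:.1f}ms")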

Available Evaluators

These evaluators can be used to assess benchmark results:

  • accuracy - Response accuracy and relevance
  • latency - Response time in milliseconds
  • cost - Estimated cost per query
  • quality - Overall quality score
  • relevance - Relevance of retrieved information
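
Evaluators can be combined in a single run. The sketch below requests all five and prints whatever metrics each mode reports; only the accuracy, latency_ms, and cost_usd keys appear in the examples in this document, so the loop treats the remaining metric names generically rather than assuming them.

# Request several evaluators at once and inspect the reported metrics
result = client.benchmarks.run_benchmark(
    query="Explain retrieval augmented generation",
    modes=["kg_rag", "vector"],
    evaluators=["accuracy", "latency", "cost", "quality", "relevance"]
)

for mode, metrics in result['results'].items():
    print(f"\n{mode}:")
    for metric, value in metrics.items():
        # Metric keys vary by evaluator (e.g. accuracy, latency_ms, cost_usd)
        print(f"  {metric}: {value}")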

Advanced Usage Patterns

Multi-Mode Comparison

def compare_all_modes(query):
    client = FunctorClient()

    # Run benchmark across all modes
    result = client.benchmarks.run_benchmark(
        query=query,
        modes=["kg_rag", "vector", "sql", "hybrid"],
        evaluators=["accuracy", "latency", "cost"],
        iterations=3  # Run multiple times for consistency
    )

    # Analyze results
    comparison = {
        "query": query,
        "winner": result['winner'],
        "modes": {}
    }
    for mode, metrics in result['results'].items():
        comparison['modes'][mode] = {
            "accuracy": metrics['accuracy'],
            "latency": metrics['latency_ms'],
            "cost": metrics['cost_usd'],
            "score": (metrics['accuracy'] * 0.5 +
                      (1 - metrics['latency_ms'] / 10000) * 0.3 +
                      (1 - metrics['cost_usd'] / 0.1) * 0.2)
        }
    return comparison

# Usage
comparison = compare_all_modes("Explain quantum computing")
print(f"Best mode: {comparison['winner']}")

for mode, scores in comparison['modes'].items():
    print(f"\n{mode}: score={scores['score']:.2f}")
    print(f"  Accuracy: {scores['accuracy']:.2f}")
    print(f"  Latency: {scores['latency']:.1f}ms")
    print(f"  Cost: ${scores['cost']:.4f}")

Benchmark Suite

def run_benchmark_suite(queries):
    """Run benchmarks for a suite of queries."""
    client = FunctorClient()
    results = []

    for query in queries:
        print(f"Benchmarking: {query}")
        result = client.benchmarks.run_benchmark(
            query=query,
            modes=["kg_rag", "vector"],
            evaluators=["accuracy", "latency"]
        )
        results.append({
            "query": query,
            "benchmark_id": result['benchmark_id'],
            "winner": result['winner'],
            "results": result['results']
        })

    # Aggregate results
    mode_wins = {}
    for result in results:
        winner = result['winner']
        mode_wins[winner] = mode_wins.get(winner, 0) + 1

    return {
        "total_queries": len(queries),
        "results": results,
        "mode_wins": mode_wins,
        "overall_winner": max(mode_wins, key=mode_wins.get)
    }

# Usage
queries = [
    "What is machine learning?",
    "Explain neural networks",
    "How does gradient descent work?"
]
suite_results = run_benchmark_suite(queries)
print(f"Overall winner: {suite_results['overall_winner']}")
print(f"Mode wins: {suite_results['mode_wins']}")

Performance Tracking

import json
from datetime import datetime

class BenchmarkTracker:
    def __init__(self, client):
        self.client = client
        self.history = []

    def run_tracked_benchmark(self, query, modes, evaluators):
        """Run benchmark and track results."""
        result = self.client.benchmarks.run_benchmark(
            query=query,
            modes=modes,
            evaluators=evaluators
        )

        # Track benchmark
        self.history.append({
            "timestamp": datetime.now().isoformat(),
            "benchmark_id": result['benchmark_id'],
            "query": query,
            "winner": result['winner'],
            "results": result['results']
        })
        return result
    def get_mode_statistics(self):
        """Get win counts and average metrics for each winning mode."""
        mode_stats = {}
        for benchmark in self.history:
            winner = benchmark['winner']
            metrics = benchmark['results'].get(winner, {})
            if winner not in mode_stats:
                mode_stats[winner] = {
                    "wins": 0,
                    "avg_accuracy": 0,
                    "avg_latency": 0
                }
            stats = mode_stats[winner]
            stats["wins"] += 1
            # Update running averages from the winning mode's recorded metrics
            stats["avg_accuracy"] += (metrics.get('accuracy', 0) - stats["avg_accuracy"]) / stats["wins"]
            stats["avg_latency"] += (metrics.get('latency_ms', 0) - stats["avg_latency"]) / stats["wins"]
        return mode_stats

    def export_history(self, filename):
        """Export benchmark history to a JSON file."""
        with open(filename, 'w') as f:
            json.dump(self.history, f, indent=2)
        return filename
# Usage
client = FunctorClient()
tracker = BenchmarkTracker(client)

# Run benchmarks
tracker.run_tracked_benchmark(
    "What is AI?",
    modes=["kg_rag", "vector"],
    evaluators=["accuracy"]
)

# Get statistics
stats = tracker.get_mode_statistics()
print(f"Mode statistics: {stats}")

# Export
tracker.export_history("benchmark_history.json")

Async Benchmarking

import asyncio
from functor_sdk import FunctorClient

async def run_parallel_benchmarks(queries):
    """Run multiple benchmarks in parallel."""
    async with FunctorClient() as client:
        # Create benchmark tasks
        tasks = [
            client.benchmarks.run_benchmark_async(
                query=query,
                modes=["kg_rag", "vector"],
                evaluators=["accuracy", "latency"]
            )
            for query in queries
        ]

        # Run all benchmarks concurrently
        results = await asyncio.gather(*tasks)

        # Analyze results
        analysis = {
            "total_benchmarks": len(results),
            "winners": {}
        }
        for result in results:
            winner = result['winner']
            analysis['winners'][winner] = analysis['winners'].get(winner, 0) + 1
        return analysis

# Usage
queries = [
    "What is deep learning?",
    "Explain transformers",
    "How does attention work?"
]
analysis = asyncio.run(run_parallel_benchmarks(queries))
print(f"Benchmarks completed: {analysis['total_benchmarks']}")
print(f"Winner distribution: {analysis['winners']}")

Next Steps