Python SDK
Use Korad.AI with the OpenAI-compatible Python SDK.
Installation
pip install openai
Basic Setup
import os
from openai import OpenAI

# Configure client for Korad.AI
client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key=os.getenv("KORAD_VIRTUAL_KEY")  # Your Virtual Key
)

# Basic request
response = client.chat.completions.create(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[
        {"role": "user", "content": "Hello, world!"}
    ],
    max_tokens=100
)

print(response.choices[0].message.content)
With Savings Slider
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

# Enable extreme cost savings.
# Use with_raw_response so the Korad.AI savings headers are accessible.
raw_response = client.chat.completions.with_raw_response.create(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[
        {"role": "system", "content": "You are helpful."},
        {"role": "user", "content": large_document}  # 100k+ tokens
    ],
    max_tokens=500,
    extra_headers={
        "X-Savings-Level": "extreme"  # Cap at 16k tokens
    }
)
response = raw_response.parse()  # Regular ChatCompletion object

# Check your savings
print(f"Original: {raw_response.headers.get('X-Korad-Original-Tokens')}")
print(f"Optimized: {raw_response.headers.get('X-Korad-Optimized-Tokens')}")
print(f"Savings: {raw_response.headers.get('X-Korad-Savings-USD')}")
print(f"Strategy: {raw_response.headers.get('X-Korad-Strategy')}")
With Vanishing Context
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

# Document QA with Vanishing Context
response = client.chat.completions.create(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[
        {"role": "user", "content": f"""
Based on the following legal contract, explain the liability clause:

{large_contract_text}

Question: What are the penalties for late payment?
"""}
    ],
    max_tokens=1000,
    extra_headers={
        "X-Vanishing-Context": "true"  # Optimizes for document QA
    }
)
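If you run document QA repeatedly, it can be convenient to wrap the pattern above in a small helper. The sketch below is only illustrative: ask_document is not part of the SDK or of Korad.AI, and it uses nothing beyond the X-Vanishing-Context header shown above.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

def ask_document(document: str, question: str, max_tokens: int = 1000) -> str:
    """Hypothetical convenience wrapper around the Vanishing Context pattern."""
    response = client.chat.completions.create(
        model="anthropic/claude-sonnet-4-5-20250929",
        messages=[
            {"role": "user", "content": f"{document}\n\nQuestion: {question}"}
        ],
        max_tokens=max_tokens,
        extra_headers={"X-Vanishing-Context": "true"},  # Document QA optimization
    )
    return response.choices[0].message.content

# answer = ask_document(large_contract_text, "What are the penalties for late payment?")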
Streaming
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

stream = client.chat.completions.create(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[{"role": "user", "content": "Count to 10"}],
    stream=True,
    extra_headers={
        "X-Savings-Level": "med"
    }
)

for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")
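If you also need the complete reply once streaming finishes, collect the chunks while printing them. This is a minimal, self-contained sketch that uses only the streaming interface shown above.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

stream = client.chat.completions.create(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[{"role": "user", "content": "Count to 10"}],
    stream=True,
    extra_headers={"X-Savings-Level": "med"},
)

# Print each chunk as it arrives and keep it for the final string
parts = []
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta is not None:
        print(delta, end="")
        parts.append(delta)

full_reply = "".join(parts)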
Error Handling
import time

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

def make_request_with_retry(messages, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="anthropic/claude-sonnet-4-5-20250929",
                messages=messages,
                max_tokens=500,
                extra_headers={"X-Savings-Level": "med"}
            )
            return response
        except Exception:
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                raise

# Usage
response = make_request_with_retry([
    {"role": "user", "content": "Hello!"}
])
Monitoring Savings
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

class SavingsMonitor:
    def __init__(self, client):
        self.client = client
        self.total_savings = 0.0
        self.total_requests = 0

    def create_completion(self, **kwargs):
        # Use with_raw_response so the Korad.AI savings headers are accessible
        raw = self.client.chat.completions.with_raw_response.create(**kwargs)
        self.total_requests += 1

        # Extract savings from headers
        savings_header = raw.headers.get('X-Korad-Savings-USD', '0')
        self.total_savings += float(savings_header.replace('$', ''))

        return raw.parse()  # Regular ChatCompletion object

    def get_stats(self):
        return {
            "total_requests": self.total_requests,
            "total_savings": self.total_savings,
            "avg_savings": self.total_savings / self.total_requests if self.total_requests > 0 else 0
        }

# Usage
monitor = SavingsMonitor(client)
response = monitor.create_completion(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=100,
    extra_headers={"X-Savings-Level": "extreme"}
)
print(monitor.get_stats())
Async Client
import asyncio

from openai import AsyncOpenAI

client = AsyncOpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

async def create_completion(message):
    response = await client.chat.completions.create(
        model="anthropic/claude-sonnet-4-5-20250929",
        messages=[{"role": "user", "content": message}],
        max_tokens=100
    )
    return response.choices[0].message.content

async def main():
    tasks = [
        create_completion("Hello!"),
        create_completion("How are you?"),
        create_completion("Goodbye!")
    ]
    results = await asyncio.gather(*tasks)
    print(results)

asyncio.run(main())
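For larger batches, an unbounded asyncio.gather fires every request at once. One common way to cap concurrency is an asyncio.Semaphore; the limit of 5 below is an arbitrary example, not a Korad.AI requirement.
import asyncio

from openai import AsyncOpenAI

client = AsyncOpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

semaphore = asyncio.Semaphore(5)  # At most 5 requests in flight (example value)

async def create_completion(message):
    async with semaphore:
        response = await client.chat.completions.create(
            model="anthropic/claude-sonnet-4-5-20250929",
            messages=[{"role": "user", "content": message}],
            max_tokens=100
        )
        return response.choices[0].message.content

async def main():
    prompts = [f"Summarize item {i}" for i in range(20)]
    results = await asyncio.gather(*(create_completion(p) for p in prompts))
    print(len(results), "completions")

asyncio.run(main())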
Best Practices
1. Always Check Savings Headers
# Request via with_raw_response so the response headers are available
raw = client.chat.completions.with_raw_response.create(...)
response = raw.parse()

# Log your savings
original = int(raw.headers.get('X-Korad-Original-Tokens', 0))
optimized = int(raw.headers.get('X-Korad-Optimized-Tokens', 0))
savings_usd = float(raw.headers.get('X-Korad-Savings-USD', '0').replace('$', ''))

print(f"Saved ${savings_usd:.6f} ({original} -> {optimized} tokens)")
2. Choose the Right Optimization Tier
# Document QA
headers = {"X-Vanishing-Context": "true"}
# Cost-sensitive app
headers = {"X-Savings-Level": "extreme"}
# Production app
headers = {"X-Savings-Level": "med"}
# Quality-critical
headers = {"X-Savings-Level": "min"}
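If you switch between these tiers in one codebase, a small lookup keeps the header values in one place. This is just a sketch; the profile names are illustrative labels for the headers documented above.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

# Illustrative mapping from use case to the Korad.AI headers documented above
SAVINGS_PROFILES = {
    "document_qa": {"X-Vanishing-Context": "true"},
    "cost_sensitive": {"X-Savings-Level": "extreme"},
    "production": {"X-Savings-Level": "med"},
    "quality_critical": {"X-Savings-Level": "min"},
}

def savings_headers(profile: str) -> dict:
    return SAVINGS_PROFILES[profile]

response = client.chat.completions.create(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=100,
    extra_headers=savings_headers("production"),
)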
3. Handle Rate Limits Gracefully
import time

from openai import RateLimitError

def make_request_with_backoff(messages, max_retries=5):
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(...)
        except RateLimitError:
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt
                print(f"Rate limited. Waiting {wait_time}s...")
                time.sleep(wait_time)
            else:
                raise
Use Korad.AI with Python for automatic cost savings.