
Python SDK

Use Korad.AI with the OpenAI-compatible Python SDK.

Installation​

pip install openai
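
The examples on this page use the 1.x client interface (OpenAI / AsyncOpenAI). You can check which version is installed with:

import openai

print(openai.__version__)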

Basic Setup​

import os
from openai import OpenAI

# Configure client for Korad.AI
client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key=os.getenv("KORAD_VIRTUAL_KEY")  # Your Virtual Key
)

# Basic request
response = client.chat.completions.create(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[
        {"role": "user", "content": "Hello, world!"}
    ],
    max_tokens=100
)

print(response.choices[0].message.content)

With Savings Slider​

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

# Enable extreme cost savings
# with_raw_response exposes the HTTP response headers alongside the parsed body
raw = client.chat.completions.with_raw_response.create(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[
        {"role": "system", "content": "You are helpful."},
        {"role": "user", "content": large_document}  # 100k+ tokens
    ],
    max_tokens=500,
    extra_headers={
        "X-Savings-Level": "extreme"  # Cap at 16k tokens
    }
)
response = raw.parse()

# Check your savings
print(f"Original: {raw.headers.get('X-Korad-Original-Tokens')}")
print(f"Optimized: {raw.headers.get('X-Korad-Optimized-Tokens')}")
print(f"Savings: {raw.headers.get('X-Korad-Savings-USD')}")
print(f"Strategy: {raw.headers.get('X-Korad-Strategy')}")

With Vanishing Context​

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

# Document QA with Vanishing Context
response = client.chat.completions.create(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[
        {"role": "user", "content": f"""
Based on the following legal contract, explain the liability clause:

{large_contract_text}

Question: What are the penalties for late payment?
"""}
    ],
    max_tokens=1000,
    extra_headers={
        "X-Vanishing-Context": "true"  # Optimizes for document QA
    }
)

Streaming​

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

stream = client.chat.completions.create(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[{"role": "user", "content": "Count to 10"}],
    stream=True,
    extra_headers={
        "X-Savings-Level": "med"
    }
)

for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")
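
If you also want the complete response text once streaming finishes, one option is to collect the chunks as they arrive; a minimal sketch, reusing the same client setup as above:

chunks = []
stream = client.chat.completions.create(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[{"role": "user", "content": "Count to 10"}],
    stream=True,
    extra_headers={"X-Savings-Level": "med"}
)

for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta is not None:
        chunks.append(delta)   # keep a copy of each piece
        print(delta, end="")

full_text = "".join(chunks)    # complete response after the stream ends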

Error Handling​

import time
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

def make_request_with_retry(messages, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="anthropic/claude-sonnet-4-5-20250929",
                messages=messages,
                max_tokens=500,
                extra_headers={"X-Savings-Level": "med"}
            )
            return response
        except Exception:
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                raise

# Use
response = make_request_with_retry([
    {"role": "user", "content": "Hello!"}
])

Monitoring Savings​

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

class SavingsMonitor:
    def __init__(self, client):
        self.client = client
        self.total_savings = 0.0
        self.total_requests = 0

    def create_completion(self, **kwargs):
        # Request the raw response so the Korad savings headers are accessible
        raw = self.client.chat.completions.with_raw_response.create(**kwargs)
        self.total_requests += 1

        # Extract savings from headers
        savings = float(raw.headers
                        .get('X-Korad-Savings-USD', '0')
                        .replace('$', ''))
        self.total_savings += savings

        return raw.parse()

    def get_stats(self):
        return {
            "total_requests": self.total_requests,
            "total_savings": self.total_savings,
            "avg_savings": self.total_savings / self.total_requests if self.total_requests > 0 else 0
        }

# Use
monitor = SavingsMonitor(client)

response = monitor.create_completion(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=100,
    extra_headers={"X-Savings-Level": "extreme"}
)

print(monitor.get_stats())

Async Client​

import asyncio
from openai import AsyncOpenAI

client = AsyncOpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

async def create_completion(message):
    response = await client.chat.completions.create(
        model="anthropic/claude-sonnet-4-5-20250929",
        messages=[{"role": "user", "content": message}],
        max_tokens=100
    )
    return response.choices[0].message.content

async def main():
    tasks = [
        create_completion("Hello!"),
        create_completion("How are you?"),
        create_completion("Goodbye!")
    ]
    results = await asyncio.gather(*tasks)
    print(results)

asyncio.run(main())
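
When fanning out many requests at once, you may want to cap concurrency rather than firing everything simultaneously. A minimal sketch using asyncio.Semaphore; the limit of 5 and the prompts are arbitrary examples:

import asyncio
from openai import AsyncOpenAI

client = AsyncOpenAI(
    base_url="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY"
)

# Arbitrary example limit; tune to your own rate limits
semaphore = asyncio.Semaphore(5)

async def create_completion(message):
    # At most 5 requests are in flight against the gateway at any time
    async with semaphore:
        response = await client.chat.completions.create(
            model="anthropic/claude-sonnet-4-5-20250929",
            messages=[{"role": "user", "content": message}],
            max_tokens=100
        )
        return response.choices[0].message.content

async def main():
    prompts = [f"Summarize item {i}" for i in range(20)]
    results = await asyncio.gather(*(create_completion(p) for p in prompts))
    print(len(results), "completions received")

asyncio.run(main())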

Best Practices​

1. Always Check Savings Headers​

raw = client.chat.completions.with_raw_response.create(...)
response = raw.parse()

# Log your savings
original = int(raw.headers.get('X-Korad-Original-Tokens', 0))
optimized = int(raw.headers.get('X-Korad-Optimized-Tokens', 0))
savings_usd = float(raw.headers.get('X-Korad-Savings-USD', '0').replace('$', ''))

print(f"Saved ${savings_usd:.6f} ({original} -> {optimized} tokens)")

2. Choose the Right Optimization Tier​

# Document QA
headers = {"X-Vanishing-Context": "true"}

# Cost-sensitive app
headers = {"X-Savings-Level": "extreme"}

# Production app
headers = {"X-Savings-Level": "med"}

# Quality-critical
headers = {"X-Savings-Level": "min"}

3. Handle Rate Limits Gracefully​

import time
from openai import RateLimitError

def make_request_with_backoff(messages, max_retries=5):
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(...)
        except RateLimitError:
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt
                print(f"Rate limited. Waiting {wait_time}s...")
                time.sleep(wait_time)
            else:
                raise

Use Korad.AI with Python for automatic cost savings.