LlamaIndex Integration

Use Korad.AI with LlamaIndex to build RAG applications with automatic cost optimization.

Overview

LlamaIndex works seamlessly with Korad.AI for building RAG (Retrieval-Augmented Generation) applications with automatic cost savings.

Installation

pip install llama-index
pip install llama-index-llms-openai

Basic Setup

from llama_index.llms.openai import OpenAI

# Configure for Korad.AI
# Configure for Korad.AI (LlamaIndex's OpenAI wrapper takes api_base, not base_url)
llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
)

# Use LLM
response = llm.complete("Hello, world!")
print(response.text)
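The same client handles chat-style calls through LlamaIndex's ChatMessage type; a minimal sketch:

from llama_index.core.llms import ChatMessage

messages = [
    ChatMessage(role="system", content="You are a concise assistant."),
    ChatMessage(role="user", content="Summarize RAG in one sentence."),
]
response = llm.chat(messages)
print(response.message.content)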

With Optimization Headers

from llama_index.llms.openai import OpenAI

llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
    default_headers={
        "X-Vanishing-Context": "true"  # Document optimization
    },
)
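Headers set through default_headers travel with every request the client makes, so any LlamaIndex component built on top of it inherits the optimization:

# Every call through this client carries X-Vanishing-Context
response = llm.complete("Summarize the attached report in three bullet points.")
print(response.text)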

RAG Pipeline

Basic RAG

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI

# Configure LLM
llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
)

# Load documents
documents = SimpleDirectoryReader('data').load_data()

# Create index
index = VectorStoreIndex.from_documents(documents)

# Create query engine
query_engine = index.as_query_engine(llm=llm)

# Query
response = query_engine.query("What is the main topic?")
print(response)
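Re-embedding documents on every run is wasteful; the index can be persisted and reloaded with LlamaIndex's storage utilities (the ./storage path here is just an example):

from llama_index.core import StorageContext, load_index_from_storage

# Save the built index to disk
index.storage_context.persist(persist_dir="./storage")

# Later: reload it without re-embedding the documents
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine(llm=llm)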

With Optimization

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI

# Configure with Vanishing Context for large documents
llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
    default_headers={
        "X-Vanishing-Context": "true",  # Best for document QA
        "X-Savings-Level": "med",       # Fallback cap
    },
)

# Large document processing
documents = SimpleDirectoryReader('large_documents').load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine(llm=llm)
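The engine is queried like any other; source nodes on the response show which chunks were retrieved:

response = query_engine.query("Summarize the key findings.")
print(response)

# Inspect the retrieved chunks and their relevance scores
for node in response.source_nodes:
    print(f"score={node.score:.3f}  {node.node.get_content()[:80]}")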

Chat Engine

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI

llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
)

documents = SimpleDirectoryReader('data').load_data()
index = VectorStoreIndex.from_documents(documents)

# Create chat engine (condense_question rewrites each follow-up into a standalone query)
chat_engine = index.as_chat_engine(
    chat_mode="condense_question",
    llm=llm,
)

# Chat: the engine keeps conversation history across turns
response = chat_engine.chat("What is the main topic of these documents?")
print(response)

response = chat_engine.chat("Can you expand on that?")
print(response)
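Chat engines can also stream their replies via stream_chat:

streaming_response = chat_engine.stream_chat("Summarize the documents in two sentences.")
for token in streaming_response.response_gen:
    print(token, end="", flush=True)
print()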

Streaming Responses

from llama_index.llms.openai import OpenAI

llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
)

# Stream the completion token by token
for chunk in llm.stream_complete("Tell me a story"):
    print(chunk.delta, end="", flush=True)
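For async applications, the same client exposes an async variant, astream_complete; a minimal sketch:

import asyncio

async def main():
    # astream_complete returns an async generator of chunks
    gen = await llm.astream_complete("Tell me a story")
    async for chunk in gen:
        print(chunk.delta, end="", flush=True)

asyncio.run(main())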

Advanced Features

Custom Embeddings

from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Route the LLM through Korad.AI; embeddings can use the same gateway
llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
)

embed_model = OpenAIEmbedding(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
)

# Settings is the modern replacement for the deprecated ServiceContext
Settings.llm = llm
Settings.embed_model = embed_model
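With the global defaults set, index construction and querying pick them up automatically:

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)  # uses Settings.embed_model
query_engine = index.as_query_engine()              # uses Settings.llm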
Hybrid Retrieval

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.llms.openai import OpenAI
from llama_index.retrievers.bm25 import BM25Retriever  # pip install llama-index-retrievers-bm25

llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
)

documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)

# Hybrid vector + keyword search
vector_retriever = index.as_retriever()
bm25_retriever = BM25Retriever.from_defaults(index=index)

# Fuse candidates from both retrievers before synthesis
hybrid_retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    llm=llm,
    similarity_top_k=4,
    num_queries=1,  # disable extra LLM-generated queries
)

query_engine = RetrieverQueryEngine.from_args(
    retriever=hybrid_retriever,
    llm=llm,
)
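Querying works like any other engine; both retrievers contribute candidate chunks before the LLM writes the answer:

response = query_engine.query("Which section covers authentication?")
print(response)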

Multi-Document Agents

from llama_index.agent.openai import OpenAIAgent
from llama_index.core.tools import QueryEngineTool
from llama_index.llms.openai import OpenAI

llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
    default_headers={
        "X-Korad-RLM": "true"  # For complex multi-document reasoning
    },
)

# Create tools for different document collections
# (index1 and index2 are VectorStoreIndex instances built as shown above)
tool1 = QueryEngineTool.from_defaults(
    query_engine=index1.as_query_engine(llm=llm),
    name="docs1",
    description="Documentation 1",
)

tool2 = QueryEngineTool.from_defaults(
    query_engine=index2.as_query_engine(llm=llm),
    name="docs2",
    description="Documentation 2",
)

# Create agent
agent = OpenAIAgent.from_tools(
    [tool1, tool2],
    llm=llm,
)

response = agent.chat("Compare the approaches in both document sets")
print(response)
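The response also records which tools the agent invoked, which helps when debugging routing between the two collections:

# Each entry in response.sources is a ToolOutput from one tool call
for source in response.sources:
    print(source.tool_name, "->", str(source.content)[:80])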

Best Practices

1. Use Vanishing Context for Large Documents

llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
    default_headers={"X-Vanishing-Context": "true"},
)

2. Use RLM for Complex Reasoning

llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
    default_headers={"X-Korad-RLM": "true"},
)

3. Set Savings Cap for Budget Control

llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
    default_headers={"X-Savings-Level": "med"},
)
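These headers compose: the optimization example earlier in this page already pairs Vanishing Context with a savings cap, and the same combination works anywhere a client is configured:

llm = OpenAI(
    api_base="http://localhost:8084/v1",
    api_key="sk-bf-YOUR_VIRTUAL_KEY",
    model="anthropic/claude-sonnet-4-5-20250929",
    default_headers={
        "X-Vanishing-Context": "true",  # document-heavy RAG
        "X-Savings-Level": "med",       # budget fallback cap
    },
)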

LlamaIndex + Korad.AI = Enterprise RAG with automatic cost savings.