Table of Contents

- Not Just Benchmarks, Try it for your data!
- About ChatBees
  1. State-of-the-art RAG pipeline out of the box
  2. Simple APIs to build your own LLM application in just a few lines of code
  3. No DevOps or tuning required to make this work in production
- Benchmarking RAG quality with tonic.ai
1. Get or Create the test collection
```python
import chatbees as cb
import shortuuid


def get_shared_collection():
    """
    Use the shared collection that already has all essays.
    """
    cb.init(account_id="QZCLEHTW")
    return cb.Collection(name="tonic_rag_test")


def create_collection():
    """
    Create a new collection and upload all essays.
    """
    cb.init(api_key="my_api_key", account_id="my_account_id")
    col = cb.create_collection(cb.Collection(name='tonic_test' + shortuuid.uuid()))
    col.upload_document('./all_essays_in_single_file.txt')
    return col


collection = get_shared_collection()
# If you prefer to create a new collection, call create_collection() instead:
# collection = create_collection()
```
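
Before running the full benchmark, it helps to sanity-check the collection with a single question. The question below is only a placeholder; as the next step shows, `ask` returns the answer together with the retrieved references.

```python
# Quick sanity check on the collection. ask() returns (answer, references).
answer, _refs = collection.ask("What did the author work on before college?")
print(answer)
```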
2. Run retrieval challenges
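The loop below relies on a `benchmark` of question/answer pairs built with tonic_validate earlier in this post. If you are following along standalone, here is a minimal sketch of constructing one; the question and reference answer are placeholders, not the actual benchmark data.

```python
from tonic_validate import Benchmark

# Placeholder Q&A pair; substitute the question/answer set generated from the
# essay dataset used in this walkthrough.
benchmark = Benchmark(
    questions=["What did the author work on before college?"],
    answers=["Writing and programming."],
)
```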
```python
from tonic_validate import (
    ValidateScorer,
    Benchmark,
    BenchmarkItem,
    LLMResponse,
    Run,
)
from tqdm import tqdm


def get_cb_rag_response(benchmark_item: BenchmarkItem, collection):
    # Ask ChatBees the benchmark question; ask() returns (answer, references).
    prompt = benchmark_item.question
    response, _ = collection.ask(prompt)
    return response


raw_cb_responses = []
for item in tqdm(benchmark.items):
    raw_cb_responses.append(get_cb_rag_response(item, collection))
```
3. Evaluate ChatBees responses
```python
from tonic_validate.metrics import AnswerSimilarityMetric

cb_responses = [
    LLMResponse(llm_answer=r, llm_context_list=[], benchmark_item=bi)
    for r, bi in zip(raw_cb_responses, benchmark.items)
]

scorer = ValidateScorer([AnswerSimilarityMetric()])
cb_run = scorer.score_run(cb_responses, parallelism=5)
```
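
Once scoring completes, you can read the results off the returned run object. A minimal sketch, assuming the `Run` exposes `overall_scores` and `run_data` as in recent tonic_validate releases:

```python
# Aggregate score per metric for the whole run.
print(cb_run.overall_scores)

# Per-question details, handy for spotting which questions the pipeline missed.
for data in cb_run.run_data:
    print(data.reference_question, data.scores)
```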