print(“\nPART 5 ── Datasets & experiments ————————————–“)
DATASET = “capital-cities-tutorial”
langfuse.create_dataset(name=DATASET, description=”Capital-city QA benchmark”)
_items = [
(“What is the capital of France?”, “Paris”),
(“What is the capital of Germany?”, “Berlin”),
(“What is the capital of Japan?”, “Tokyo”),
(“What is the capital of Italy?”, “Rome”),
]
for i, (q, a) in enumerate(_items):
langfuse.create_dataset_item(dataset_name=DATASET, id=f”cap-{i}”,
input={“question”: q}, expected_output=a)
def capital_task(*, item, **kwargs):
question = item.input[“question”] if isinstance(item.input, dict) else item.input
return llm_chat([{“role”: “user”, “content”: question}], name=”experiment-answer”)
def accuracy(*, input, output, expected_output, metadata=None, **kwargs):
hit = bool(expected_output) and expected_output.lower() in (output or “”).lower()
return Evaluation(name=”accuracy”, value=1.0 if hit else 0.0,
comment=”exact-match contains check”)
def conciseness(*, input, output, **kwargs):
return Evaluation(name=”char_length”, value=float(len(output or “”)))
def mean_accuracy(*, item_results, **kwargs):
vals = [e.value for r in item_results for e in r.evaluations if e.name == “accuracy”]
avg = sum(vals) / len(vals) if vals else 0.0
return Evaluation(name=”mean_accuracy”, value=avg, comment=f”{avg:.0%} correct”)
dataset = langfuse.get_dataset(DATASET)
result = dataset.run_experiment(
name=”capitals-baseline”,
description=”Baseline run from the Colab tutorial”,
task=capital_task,
evaluators=[accuracy, conciseness],
run_evaluators=[mean_accuracy],
max_concurrency=4,
)
print(result.format())
Trending
- Samsung might bring Privacy Display to every Galaxy S27 model
- The end for the Phone 1: Nothing’s final update hits the phone that started it all
- Galaxy Z Fold 8 looks pricier in these rumors, which isn’t shocking in the least
- T-Mobile is finally letting go of 2G in August, so anyone with it will need to transition
- This exclusive T-Mobile deal gets you a powerful Samsung tablet for only $99 — but you’re running out of time
- Google Home Speaker has a problem: users report incredibly slow response times
- Amazfit Balance 3 update adds better maps and altitude fixes
- Xreal ROG R1 is crazy expensive, but it’s easily the best wearable monitor I’ve ever used

