"""evaluation_qasper.py — SciQ-based evaluation of the ScientificChatbot agent (HF Space)."""
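# Pipeline: load SciQ -> write support passages to disk as simulated uploads ->
# build a vectorstore -> query the chatbot agent -> score answers with
# BERTScore and exact-match metrics.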
from datasets import load_dataset
import pandas as pd
import io
import os
from bert_score import score
from sklearn.metrics import accuracy_score, f1_score
from modtran import (
handle_user_query,
initialize_chatbot_agent,
get_uploaded_text,
get_text_chunks,
get_vectorstore,
set_global_vectorstore,
)
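# RAG helpers exported by the Space's app module (named `modtran` in this repo).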
# Load the first 100 examples of the SciQ validation split
ds = load_dataset("sciq", split="validation[:100]")
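# Each SciQ item provides a "question", a "correct_answer", and a "support" passage.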
# Extract each item's support passage (the context), question, and gold answer
contexts = [item["support"] for item in ds]
questions = [item["question"] for item in ds]
answers = [item["correct_answer"] for item in ds]
# Create dataframe
df = pd.DataFrame({"context": contexts, "question": questions, "answer": answers})
# Save contexts to disk to simulate file uploads
os.makedirs("sciq_contexts", exist_ok=True)
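# Deduplicate contexts so repeated support passages are written (and embedded) only once.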
for i, context in enumerate(df["context"].unique()):
    with open(f"sciq_contexts/context_{i}.txt", "w", encoding="utf-8") as f:
        f.write(context)
# Simulate file uploads
uploaded_files = []
for filename in os.listdir("sciq_contexts"):
    if filename.endswith(".txt"):
        with open(os.path.join("sciq_contexts", filename), "rb") as f:
            file_obj = io.BytesIO(f.read())
            file_obj.name = filename
            uploaded_files.append(file_obj)
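# Each BytesIO carries a .name attribute to mimic an uploaded-file object
# (e.g. Streamlit's UploadedFile), which get_uploaded_text presumably expects.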
print("Total uploaded files:", len(uploaded_files))
# Vectorstore pipeline
raw_text = get_uploaded_text(uploaded_files)
text_chunks = get_text_chunks(raw_text)
vectorstore = get_vectorstore(text_chunks)
set_global_vectorstore(vectorstore)
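# The agent's retrieval tool is assumed to read this module-level store,
# so it must be set before the agent answers any query.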
# Initialize chatbot agent
agent = initialize_chatbot_agent()
# Predict answers
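# One agent call per question: for 100 examples this can be slow and may incur LLM cost.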
df["chatbot_answer"] = df["question"].apply(lambda q: handle_user_query(q, agent))
# BERTScore Evaluation
P, R, F1 = score(df["chatbot_answer"].tolist(), df["answer"].tolist(), lang="en")
df["BERTScore_F1"] = F1.numpy()
print(f"Mean BERTScore F1: {F1.mean().item():.3f}")
# Reuse the answers generated above rather than re-querying the agent
# (the original loop issued a second, redundant round of 100 agent calls).
predictions = df["chatbot_answer"].tolist()
# Compute exact-match accuracy (strict: free-form answers rarely match the gold string verbatim)
acc = accuracy_score(answers, predictions)
print(f"Exact-match accuracy: {acc:.3f}")
# Compute macro F1; note that sklearn treats each unique answer string as a class,
# so for free-form generation this is still effectively an exact-match metric,
# not token-level overlap.
f1 = f1_score(answers, predictions, average="macro")
print(f"F1 Score (macro): {f1:.3f}")
# Save results
df.to_csv("sciq_evaluation_results.csv", index=False)
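print("Saved per-example results to sciq_evaluation_results.csv")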