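"""Evaluate the modtran chatbot on the SciQ validation set.

SciQ support passages are written to disk, pushed through the app's
upload -> chunk -> vector-store pipeline, and each question is then answered
by the chatbot agent and scored with BERTScore plus exact-match accuracy/F1.
"""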
from datasets import load_dataset
import pandas as pd
import os
import io
from bert_score import score
from sklearn.metrics import accuracy_score, f1_score
from modtran import (
    handle_user_query,
    initialize_chatbot_agent,
    get_uploaded_text,
    get_text_chunks,
    get_vectorstore,
    set_global_vectorstore,
)
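# modtran is the project's own chatbot module; it supplies the document
# ingestion, chunking, vector-store, and agent helpers used below.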
# Load SciQ
ds = load_dataset("sciq", split="validation[:100]")
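# Each SciQ example is a crowd-sourced science exam question with a supporting
# passage ("support") and a gold answer ("correct_answer").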
# Extract supports as context, question, and correct answer
contexts = [item["support"] for item in ds]
questions = [item["question"] for item in ds]
answers = [item["correct_answer"] for item in ds]
# Create dataframe of contexts, questions, and gold answers
df = pd.DataFrame({"context": contexts, "question": questions, "answer": answers})
# Save contexts to disk to simulate file uploads
os.makedirs("sciq_contexts", exist_ok=True)
for i, context in enumerate(df["context"].unique()):
    with open(f"sciq_contexts/context_{i}.txt", "w", encoding="utf-8") as f:
        f.write(context)
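# De-duplicating via df["context"].unique() means each distinct support
# passage is written to disk only once.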
# Simulate file uploads
uploaded_files = []
for filename in os.listdir("sciq_contexts"):
    if filename.endswith(".txt"):
        with open(os.path.join("sciq_contexts", filename), "rb") as f:
            file_obj = io.BytesIO(f.read())
            file_obj.name = filename
            uploaded_files.append(file_obj)
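# Giving each BytesIO a .name attribute makes it behave like the file objects
# an upload widget (e.g. Streamlit's st.file_uploader) hands to
# get_uploaded_text; the exact upload framework is an assumption here.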
| print("Total uploaded files:", len(uploaded_files)) | |
# Vectorstore pipeline
raw_text = get_uploaded_text(uploaded_files)
text_chunks = get_text_chunks(raw_text)
vectorstore = get_vectorstore(text_chunks)
set_global_vectorstore(vectorstore)
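# Standard RAG ingestion: concatenate the uploaded text, split it into chunks,
# build a vector store over the chunks, and register it globally so the
# agent's retrieval step can query it.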
# Initialize chatbot agent
agent = initialize_chatbot_agent()

# Predict answers
df["chatbot_answer"] = df["question"].apply(lambda q: handle_user_query(q, agent))
# BERTScore Evaluation
P, R, F1 = score(df["chatbot_answer"].tolist(), df["answer"].tolist(), lang="en")
df["BERTScore_F1"] = F1.numpy()
print(f"Mean BERTScore F1: {F1.mean().item():.3f}")
# Exact-match metrics on the same predictions
predictions = df["chatbot_answer"].tolist()

# Compute Accuracy (exact string match against the gold answer)
acc = accuracy_score(answers, predictions)
print(f"Accuracy: {acc:.3f}")

# Compute F1 (macro average over exact-match answer strings)
f1 = f1_score(answers, predictions, average="macro")
print(f"F1 Score (macro): {f1:.3f}")
# Save results
df.to_csv("sciq_evaluation_results.csv", index=False)