Spaces:

thechaiexperiment
/

TeaRAG

Running

App Files Files Community

TeaRAG / general_rag.py

thechaiexperiment

Update general_rag.py

8f22720 verified 1 day ago

raw

history blame contribute delete

17.7 kB

	import transformers
	import pickle
	import os
	import re
	import numpy as np
	import torchvision
	import nltk
	import torch
	import pandas as pd
	import requests
	import zipfile
	import tempfile
	from openai import OpenAI
	from PyPDF2 import PdfReader
	from fastapi import FastAPI, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel
	from transformers import (
	AutoTokenizer,
	AutoModelForSeq2SeqLM,
	AutoModelForTokenClassification,
	AutoModelForCausalLM,
	pipeline,
	Qwen2Tokenizer,
	BartForConditionalGeneration
	)
	from sentence_transformers import SentenceTransformer, CrossEncoder, util
	from sklearn.metrics.pairwise import cosine_similarity
	from bs4 import BeautifulSoup
	from huggingface_hub import hf_hub_download
	from safetensors.torch import load_file
	from typing import List, Dict, Optional
	from safetensors.numpy import load_file
	from safetensors.torch import safe_open
	# Fetch the NLTK 'punkt_tab' resource when this module is imported.
	nltk.download('punkt_tab')


	# FastAPI application instance; the route decorators in this file register on it.
	app = FastAPI()
	# CORS is fully open: every origin, method and header is accepted.
	# NOTE(review): a wildcard origin together with allow_credentials=True is very
	# permissive -- confirm this is intended for the deployed service.
	app.add_middleware(
	CORSMiddleware,
	allow_origins=["*"],
	allow_credentials=True,
	allow_methods=["*"],
	allow_headers=["*"],
	)
	# Module-level registry filled by load_models(): name -> model / tokenizer / pipeline.
	models = {}
	# Module-level store for loaded data; load_documents_data() writes data['df'].
	data = {}

	class QueryRequest(BaseModel):
	    """Request body carrying one query string and an integer language selector."""

	    # Free-text query.
	    query: str
	    # Integer language selector; 1 is used when the client omits it.
	    language_code: int = 1

	class ChatQuery(BaseModel):
	    """Request body for a chat query: the text plus an integer language selector."""

	    # Free-text chat query.
	    query: str
	    # Integer language selector; 1 is used when the client omits it.
	    language_code: int = 1
	    # Disabled field, kept for reference:
	    # conversation_id: str

	class ChatMessage(BaseModel):
	    """A single chat message: who sent it, what it says, and when."""

	    # Sender role, as a plain string.
	    role: str
	    # Message text.
	    content: str
	    # Timestamp carried as a string (format is not enforced here).
	    timestamp: str

	def init_nltk():
	    """Quietly download the NLTK 'punkt' resource.

	    Returns:
	        True when the download call completes without raising, False when it
	        raises (the error is printed).
	    """
	    succeeded = True
	    try:
	        nltk.download('punkt', quiet=True)
	    except Exception as err:
	        print(f"Error initializing NLTK: {err}")
	        succeeded = False
	    return succeeded


	def get_completion(prompt: str, model: str = "deepseek/deepseek-prover-v2:free") -> str:
	    """Send ``prompt`` as a single user message to OpenRouter and return the reply.

	    Args:
	        prompt: Text sent as the only ``user`` message.
	        model: OpenRouter model identifier.

	    Returns:
	        The ``content`` of the first choice's message.

	    Raises:
	        HTTPException: 500 when ``OPENROUTER_API_KEY`` is not set, when the API
	            call fails, or when the response lacks a usable first choice;
	            400 when ``prompt`` is blank.
	    """
	    api_key = os.environ.get('OPENROUTER_API_KEY')
	    if not api_key:
	        raise HTTPException(status_code=500, detail="OPENROUTER_API_KEY not found in environment variables")

	    # Validate the cheap precondition before building a client.
	    if not prompt.strip():
	        raise HTTPException(status_code=400, detail="Please enter a question")

	    client = OpenAI(
	        base_url="https://openrouter.ai/api/v1",
	        api_key=api_key,
	    )

	    try:
	        completion = client.chat.completions.create(
	            extra_headers={
	                "HTTP-Referer": "https://huggingface.co/spaces/thechaiexperiment/phitrial",
	                "X-Title": "My Hugging Face Space",
	            },
	            model=model,
	            messages=[{"role": "user", "content": prompt}],
	        )

	        choices = getattr(completion, 'choices', None) if completion else None
	        message = getattr(choices[0], 'message', None) if choices else None
	        if message is None or not hasattr(message, 'content'):
	            raise HTTPException(status_code=500, detail="Received invalid response from API")
	        return message.content
	    except HTTPException:
	        # Our own "invalid response" error must reach the caller untouched;
	        # previously the blanket handler below re-wrapped it.
	        raise
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e)) from e

	def load_models():
	    """Load every model, tokenizer and pipeline into the module-level ``models`` dict.

	    Returns:
	        True when everything loaded; False when any step raised (the error is
	        printed and entries loaded before the failure stay in ``models``).
	    """
	    minilm_repo = 'sentence-transformers/all-MiniLM-L6-v2'
	    ar_en_repo = "Helsinki-NLP/opus-mt-ar-en"
	    en_ar_repo = "Helsinki-NLP/opus-mt-en-ar"
	    bart_repo = "facebook/bart-base"
	    ner_repo = "blaze999/Medical-NER"
	    gen_repo = "HuggingFaceTB/SmolLM-1.7B-Instruct"

	    try:
	        print("Loading models...")
	        # The device is only reported here; nothing below is moved onto it.
	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        print(f"Device set to use {device}")

	        # Retrieval: bi-encoder, cross-encoder reranker, second sentence encoder.
	        models['embedding_model'] = SentenceTransformer(minilm_repo)
	        models['cross_encoder'] = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
	        models['semantic_model'] = SentenceTransformer('all-MiniLM-L6-v2')

	        # Arabic <-> English translation.
	        models['ar_to_en_tokenizer'] = AutoTokenizer.from_pretrained(ar_en_repo)
	        models['ar_to_en_model'] = AutoModelForSeq2SeqLM.from_pretrained(ar_en_repo)
	        models['en_to_ar_tokenizer'] = AutoTokenizer.from_pretrained(en_ar_repo)
	        models['en_to_ar_model'] = AutoModelForSeq2SeqLM.from_pretrained(en_ar_repo)

	        # BART seq2seq model.
	        models['att_tokenizer'] = AutoTokenizer.from_pretrained(bart_repo)
	        models['att_model'] = BartForConditionalGeneration.from_pretrained(bart_repo)

	        # Token-classification model wrapped in an NER pipeline.
	        models['bio_tokenizer'] = AutoTokenizer.from_pretrained(ner_repo)
	        models['bio_model'] = AutoModelForTokenClassification.from_pretrained(ner_repo)
	        models['ner_pipeline'] = pipeline(
	            "ner",
	            model=models['bio_model'],
	            tokenizer=models['bio_tokenizer'],
	        )

	        # Causal language models.
	        model_name = "M4-ai/Orca-2.0-Tau-1.8B"
	        models['llm_tokenizer'] = AutoTokenizer.from_pretrained(model_name)
	        models['llm_model'] = AutoModelForCausalLM.from_pretrained(model_name)
	        models['gen_tokenizer'] = AutoTokenizer.from_pretrained(gen_repo)
	        models['gen_model'] = AutoModelForCausalLM.from_pretrained(gen_repo)
	    except Exception as err:
	        print(f"Error loading models: {err}")
	        return False

	    print("Models loaded successfully")
	    return True

	def load_embeddings() -> Optional[Dict[str, np.ndarray]]:
	    """Read ``embeddings.safetensors`` into a dict of NumPy arrays.

	    The local file is used when present; otherwise the file is downloaded from
	    the Hugging Face Space named by ``HF_SPACE_ID`` (default
	    ``thechaiexperiment/TeaRAG``).

	    Returns:
	        A dict mapping each tensor key to its array (possibly empty when no key
	        could be read), or None when opening/downloading the file failed.
	    """
	    try:
	        path = 'embeddings.safetensors'
	        if not os.path.exists(path):
	            print("File not found locally. Attempting to download from Hugging Face Hub...")
	            space_id = os.environ.get('HF_SPACE_ID', 'thechaiexperiment/TeaRAG')
	            path = hf_hub_download(
	                repo_id=space_id,
	                filename="embeddings.safetensors",
	                repo_type="space",
	            )

	        loaded = {}
	        with safe_open(path, framework="pt") as handle:
	            for name in handle.keys():
	                # One unreadable key is reported and skipped, not fatal.
	                try:
	                    value = handle.get_tensor(name)
	                    if not isinstance(value, torch.Tensor):
	                        raise TypeError(f"Value for key {name} is not a valid PyTorch tensor.")
	                    loaded[name] = value.numpy()
	                except Exception as key_error:
	                    print(f"Failed to process key {name}: {key_error}")

	        if not loaded:
	            print("No embeddings could be loaded. Please check the file format and content.")
	        else:
	            print("Embeddings successfully loaded.")
	        return loaded
	    except Exception as err:
	        print(f"Error loading embeddings: {err}")
	        return None

	def normalize_key(key: str) -> str:
	    """Return the digits after the first ``file_`` in ``key``, else ``key`` itself."""
	    found = re.search(r'file_(\d+)', key)
	    return found.group(1) if found else key

	def load_recipes_embeddings() -> Optional[np.ndarray]:
	    """Load the recipe embedding matrix from ``recipes_embeddings.safetensors``.

	    The local file is used when present; otherwise the same file name is
	    downloaded from the Hugging Face Space named by ``HF_SPACE_ID`` (default
	    ``thechaiexperiment/TeaRAG``). The file must contain an ``embeddings`` key.

	    Returns:
	        The array stored under ``embeddings``, or None on any failure (the
	        error is printed).
	    """
	    filename = 'recipes_embeddings.safetensors'
	    try:
	        embeddings_path = filename
	        if not os.path.exists(embeddings_path):
	            print("File not found locally. Attempting to download from Hugging Face Hub...")
	            embeddings_path = hf_hub_download(
	                repo_id=os.environ.get('HF_SPACE_ID', 'thechaiexperiment/TeaRAG'),
	                # Fetch the recipes file; the fallback used to request
	                # "embeddings.safetensors", a different file.
	                filename=filename,
	                repo_type="space",
	            )
	        # NOTE(review): `load_file` is imported from both safetensors.torch and
	        # safetensors.numpy at file level; the later (numpy) import is in effect.
	        embeddings = load_file(embeddings_path)
	        if "embeddings" not in embeddings:
	            raise ValueError("Key 'embeddings' not found in the safetensors file.")
	        tensor = embeddings["embeddings"]
	        print("Successfully loaded embeddings.")
	        print(f"Shape of embeddings: {tensor.shape}")
	        print(f"Dtype of embeddings: {tensor.dtype}")
	        # The preview is diagnostic only; it must not fail an otherwise good load.
	        if tensor.ndim >= 2 and tensor.shape[0] > 0:
	            print(f"First few values of the first embedding: {tensor[0][:5]}")
	        return tensor
	    except Exception as e:
	        print(f"Error loading embeddings: {e}")
	        return None

	def load_documents_data(folder_path='downloaded_articles/downloaded_articles'):
	    """Parse every ``.html`` file in ``folder_path`` into ``data['df']``.

	    Each file becomes one row with ``file_name`` and ``content`` (the page text
	    with newline separators). Files that cannot be read are reported and
	    skipped.

	    Args:
	        folder_path: Directory containing the downloaded HTML articles.

	    Returns:
	        True when at least one document was loaded, otherwise False. Every
	        failure path returns False, including unexpected exceptions.
	    """
	    try:
	        print("Loading documents data...")
	        if not os.path.isdir(folder_path):
	            print(f"Error: Folder '{folder_path}' not found")
	            return False
	        html_files = [f for f in os.listdir(folder_path) if f.endswith('.html')]
	        if not html_files:
	            print(f"No HTML files found in folder '{folder_path}'")
	            return False

	        documents = []
	        for file_name in html_files:
	            file_path = os.path.join(folder_path, file_name)
	            try:
	                with open(file_path, 'r', encoding='utf-8') as file:
	                    soup = BeautifulSoup(file, 'html.parser')
	                    text = soup.get_text(separator='\n').strip()
	                documents.append({"file_name": file_name, "content": text})
	            except Exception as e:
	                print(f"Error reading file {file_name}: {e}")

	        data['df'] = pd.DataFrame(documents)
	        if data['df'].empty:
	            print("No valid documents loaded.")
	            return False
	        print(f"Successfully loaded {len(data['df'])} document records.")
	        return True
	    except Exception as e:
	        print(f"Error loading docs: {e}")
	        # Was `return None`; every other failure path reports False.
	        return False

	def load_data():
	    """Load embeddings and documents, warning (not failing) when either is missing.

	    Loaded embeddings are stored in ``data['embeddings']``, which is the key
	    the ``/health`` endpoint reads; previously the loaded dict was discarded.

	    Returns:
	        Always True: missing data only degrades functionality.
	    """
	    embeddings = load_embeddings()
	    documents_success = load_documents_data()
	    if embeddings:
	        data['embeddings'] = embeddings
	    else:
	        print("Warning: Failed to load embeddings, falling back to basic functionality")
	    if not documents_success:
	        print("Warning: Failed to load documents data, falling back to basic functionality")
	    return True

	# Import-time bootstrap: load the models, then the data. Because `and`
	# short-circuits, load_data() is skipped when load_models() returns False.
	print("Initializing application...")
	init_success = load_models() and load_data()


	def translate_text(text, source_to_target='ar_to_en'):
	    """Translate ``text`` with the models cached in ``models``.

	    Args:
	        text: Text to translate.
	        source_to_target: ``'ar_to_en'`` selects the Arabic-to-English pair;
	            any other value selects the English-to-Arabic pair.

	    Returns:
	        The decoded translation, or ``text`` unchanged when anything fails
	        (the error is printed).
	    """
	    try:
	        if source_to_target != 'ar_to_en':
	            tok, seq2seq = models['en_to_ar_tokenizer'], models['en_to_ar_model']
	        else:
	            tok, seq2seq = models['ar_to_en_tokenizer'], models['ar_to_en_model']
	        encoded = tok(text, return_tensors="pt", truncation=True)
	        generated = seq2seq.generate(**encoded)
	        return tok.decode(generated[0], skip_special_tokens=True)
	    except Exception as err:
	        print(f"Translation error: {err}")
	    # Best effort: fall back to the untranslated input.
	    return text

	def embed_query_text(query_text):
	    """Encode ``query_text`` with ``models['embedding_model']``.

	    The text is passed as a one-element list, and the encoder's result for
	    that batch is returned as-is.
	    """
	    return models['embedding_model'].encode([query_text])

	def query_embeddings(query_embedding, embeddings_data, n_results):
	    """Return the ``n_results`` documents most similar to ``query_embedding``.

	    Args:
	        query_embedding: Query vector(s) accepted by ``cosine_similarity``.
	        embeddings_data: Mapping of document id to embedding. When it is None
	            or empty, the embeddings are loaded with ``load_embeddings()``.
	            The argument used to be ignored and the file re-read on every call.
	        n_results: Maximum number of hits; values <= 0 yield no hits.

	    Returns:
	        A list of ``(doc_id, similarity)`` tuples ordered from most to least
	        similar; an empty list when nothing is available or on error.
	    """
	    try:
	        # A non-positive count would otherwise mis-slice below
	        # (`[-0:]` selects every element).
	        if n_results <= 0:
	            return []
	        if embeddings_data is None or len(embeddings_data) == 0:
	            embeddings_data = load_embeddings()
	        if not embeddings_data:
	            print("No embeddings data available.")
	            return []

	        doc_ids = list(embeddings_data.keys())
	        doc_embeddings = np.array(list(embeddings_data.values()))
	        similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
	        top_indices = similarities.argsort()[-n_results:][::-1]
	        return [(doc_ids[i], similarities[i]) for i in top_indices]
	    except Exception as e:
	        print(f"Error in query_embeddings: {e}")
	        return []

	def query_recipes_embeddings(query_embedding, embeddings_data, n_results):
	    """Return the ``n_results`` recipe rows most similar to ``query_embedding``.

	    Args:
	        query_embedding: Query vector; a 1-D vector is reshaped to one row.
	        embeddings_data: Matrix of recipe embeddings. When it is None, the
	            matrix is loaded with ``load_recipes_embeddings()``. The argument
	            used to be ignored and the file re-read on every call.
	        n_results: Maximum number of hits; values <= 0 yield no hits.

	    Returns:
	        A list of ``(row_index, similarity)`` tuples ordered from most to least
	        similar; an empty list when nothing is available or on error.
	    """
	    try:
	        # A non-positive count would otherwise mis-slice below
	        # (`[-0:]` selects every element).
	        if n_results <= 0:
	            return []
	        if embeddings_data is None:
	            embeddings_data = load_recipes_embeddings()
	        if embeddings_data is None:
	            print("No embeddings data available.")
	            return []

	        if query_embedding.ndim == 1:
	            query_embedding = query_embedding.reshape(1, -1)
	        similarities = cosine_similarity(query_embedding, embeddings_data).flatten()
	        top_indices = similarities.argsort()[-n_results:][::-1]
	        return [(index, similarities[index]) for index in top_indices]
	    except Exception as e:
	        print(f"Error in query_recipes_embeddings: {e}")
	        return []

	def get_page_title(url, timeout=10):
	    """Fetch ``url`` and return the text of its ``<title>`` element.

	    Args:
	        url: Page address to fetch.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout ``requests.get`` can block indefinitely.

	    Returns:
	        The title text, ``"No title found"`` when the page has no ``<title>``,
	        or None when the request fails or the status code is not 200.
	    """
	    try:
	        response = requests.get(url, timeout=timeout)
	    except requests.exceptions.RequestException:
	        return None
	    if response.status_code != 200:
	        return None
	    soup = BeautifulSoup(response.text, 'html.parser')
	    title = soup.find('title')
	    return title.get_text() if title else "No title found"

	def retrieve_document_texts(doc_ids, folder_path='downloaded_articles/downloaded_articles'):
	    """Return the visible text of each document in ``doc_ids``, in the same order.

	    Each id is joined onto ``folder_path`` and parsed as HTML. A missing or
	    unreadable file contributes an empty string, so the result always has one
	    entry per id.
	    """
	    collected = []
	    for doc_id in doc_ids:
	        file_path = os.path.join(folder_path, doc_id)
	        content = ""
	        try:
	            if os.path.exists(file_path):
	                with open(file_path, 'r', encoding='utf-8') as handle:
	                    parsed = BeautifulSoup(handle, 'html.parser')
	                    content = parsed.get_text(separator=' ', strip=True)
	            else:
	                print(f"Warning: Document file not found: {file_path}")
	        except Exception as err:
	            print(f"Error retrieving document {doc_id}: {err}")
	            content = ""
	        collected.append(content)
	    return collected

	def retrieve_rec_texts(
	    document_indices,
	    folder_path='downloaded_articles/downloaded_articles',
	    metadata_path='recipes_metadata.xlsx'
	):
	    """Return the raw file contents for the recipes at ``document_indices``.

	    The metadata spreadsheet is sorted by ``id`` and re-indexed; each index is
	    then used as a row position to look up ``original_file_name``.

	    Args:
	        document_indices: Row positions into the id-sorted metadata.
	        folder_path: Directory holding the recipe files.
	        metadata_path: Excel file with ``id`` and ``original_file_name`` columns.

	    Returns:
	        A list with the text of every file that was found. Rows with a blank
	        file name or a missing file are skipped with a warning. An empty list
	        is returned for empty input or on any error (the error is printed).
	    """
	    try:
	        # Materialize once so iterators survive both max() and the loop.
	        indices = list(document_indices)
	        if not indices:
	            # Nothing requested: skip reading the spreadsheet.
	            return []

	        metadata_df = pd.read_excel(metadata_path)
	        if "id" not in metadata_df.columns or "original_file_name" not in metadata_df.columns:
	            raise ValueError("Metadata file must contain 'id' and 'original_file_name' columns.")
	        metadata_df = metadata_df.sort_values(by="id").reset_index(drop=True)
	        if metadata_df.index.max() < max(indices):
	            raise ValueError("Some document indices exceed the range of metadata.")

	        document_texts = []
	        for idx in indices:
	            if idx >= len(metadata_df):
	                print(f"Warning: Index {idx} is out of range for metadata.")
	                continue
	            original_file_name = metadata_df.iloc[idx]["original_file_name"]
	            # Blank spreadsheet cells arrive as NaN, which is truthy; without
	            # the isna check one blank row aborted the whole batch.
	            if pd.isna(original_file_name) or not original_file_name:
	                print(f"Warning: No file name found for index {idx}")
	                continue
	            file_path = os.path.join(folder_path, str(original_file_name))
	            if os.path.exists(file_path):
	                with open(file_path, "r", encoding="utf-8") as f:
	                    document_texts.append(f.read())
	            else:
	                print(f"Warning: File not found at {file_path}")
	        return document_texts
	    except Exception as e:
	        print(f"Error in retrieve_rec_texts: {e}")
	        return []

	def retrieve_metadata(document_indices: List[int], metadata_path: str = 'recipes_metadata.xlsx') -> Dict[int, Dict[str, str]]:
	    """Look up file name and URL for the metadata rows whose ``id`` is requested.

	    Args:
	        document_indices: Values matched against the ``id`` column.
	        metadata_path: Excel file with ``id``, ``original_file_name`` and
	            ``url`` columns.

	    Returns:
	        A dict mapping each matching id to its ``original_file_name`` and
	        ``url``; an empty dict on any error (the error is printed).
	    """
	    try:
	        table = pd.read_excel(metadata_path)
	        required_columns = {'id', 'original_file_name', 'url'}
	        if not required_columns.issubset(table.columns):
	            raise ValueError(f"Metadata file must contain columns: {required_columns}")

	        table['id'] = table['id'].astype(int)
	        wanted = table[table['id'].isin(document_indices)]

	        result = {}
	        for _, record in wanted.iterrows():
	            result[int(record['id'])] = {
	                "original_file_name": record['original_file_name'],
	                "url": record['url'],
	            }
	        return result
	    except Exception as err:
	        print(f"Error retrieving metadata: {err}")
	        return {}

	def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
	    """Score each document against ``query`` and return them best-first.

	    Args:
	        query: Query text paired with every document.
	        document_ids: Identifiers aligned with ``document_texts``.
	        document_texts: Document bodies to score.
	        cross_encoder_model: Object whose ``predict`` takes the list of
	            ``(query, text)`` pairs and returns one score per pair.

	    Returns:
	        A list of ``(score, doc_id, text)`` tuples sorted by descending score
	        (the ranking is also printed); an empty list on any error.
	    """
	    try:
	        relevance = cross_encoder_model.predict([(query, body) for body in document_texts])
	        ranked = sorted(
	            zip(relevance, document_ids, document_texts),
	            key=lambda entry: entry[0],
	            reverse=True,
	        )
	        print("Reranked results:")
	        for rank, (score, doc_id, _body) in enumerate(ranked, start=1):
	            print(f"Rank {rank} (Score: {score:.4f}, Document ID: {doc_id})")
	        return ranked
	    except Exception as err:
	        print(f"Error reranking documents: {err}")
	        return []

	def translate_ar_to_en(text):
	    """Translate Arabic ``text`` to English with beam search.

	    The tokenizer and model cached in ``models`` are reused; they are loaded
	    (and cached) only when missing. They used to be re-created with
	    ``from_pretrained`` on every call.

	    Args:
	        text: Arabic text to translate.

	    Returns:
	        The translated text, or None when translation fails (the error is
	        printed).
	    """
	    try:
	        ar_to_en_tokenizer = models.get('ar_to_en_tokenizer')
	        if ar_to_en_tokenizer is None:
	            ar_to_en_tokenizer = models['ar_to_en_tokenizer'] = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
	        ar_to_en_model = models.get('ar_to_en_model')
	        if ar_to_en_model is None:
	            ar_to_en_model = models['ar_to_en_model'] = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ar-en")

	        inputs = ar_to_en_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
	        translated_ids = ar_to_en_model.generate(
	            inputs.input_ids,
	            # Pass the mask so any padded positions are ignored.
	            attention_mask=inputs.attention_mask,
	            max_length=512,
	            num_beams=4,
	            early_stopping=True
	        )
	        return ar_to_en_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
	    except Exception as e:
	        print(f"Error during Arabic to English translation: {e}")
	        return None

	def translate_en_to_ar(text):
	    """Translate English ``text`` to Arabic with beam search.

	    The tokenizer and model cached in ``models`` are reused; they are loaded
	    (and cached) only when missing. They used to be re-created with
	    ``from_pretrained`` on every call.

	    Args:
	        text: English text to translate.

	    Returns:
	        The translated text, or None when translation fails (the error is
	        printed).
	    """
	    try:
	        en_to_ar_tokenizer = models.get('en_to_ar_tokenizer')
	        if en_to_ar_tokenizer is None:
	            en_to_ar_tokenizer = models['en_to_ar_tokenizer'] = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
	        en_to_ar_model = models.get('en_to_ar_model')
	        if en_to_ar_model is None:
	            en_to_ar_model = models['en_to_ar_model'] = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ar")

	        inputs = en_to_ar_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
	        translated_ids = en_to_ar_model.generate(
	            inputs.input_ids,
	            # Pass the mask so any padded positions are ignored.
	            attention_mask=inputs.attention_mask,
	            max_length=512,
	            num_beams=4,
	            early_stopping=True
	        )
	        return en_to_ar_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
	    except Exception as e:
	        print(f"Error during English to Arabic translation: {e}")
	        return None


	@app.get("/")
	async def root():
	    """Landing endpoint: return a static welcome message."""
	    welcome = {"message": "Welcome to TeaRAG! Your Medical Assistant Powered by RAG"}
	    return welcome

	@app.get("/health")
	async def health_check():
	    """Health check endpoint.

	    Reports whether ``models`` is non-empty, whether ``data['embeddings']``
	    is set and truthy, and whether ``data['df']`` holds any rows. The
	    ``status`` field is always ``'healthy'``.
	    """
	    # A missing 'df' entry is treated as an empty frame.
	    documents = data.get('df', pd.DataFrame())
	    return {
	        'status': 'healthy',
	        'models_loaded': bool(models),
	        'embeddings_loaded': bool(data.get('embeddings')),
	        'documents_loaded': not documents.empty,
	    }

	if __name__ == "__main__":
	    # uvicorn is only needed when the module is run directly as a script.
	    import uvicorn

	    # Serve on all interfaces, port 7860.
	    uvicorn.run(app, host="0.0.0.0", port=7860)