from flask import Flask, request, jsonify
from flask_cors import CORS
import os
import json
import re
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch
from typing import List, Dict
import random
import datetime
from fuzzywuzzy import fuzz

app = Flask(__name__)
CORS(app)

class EnhancedMultilingualEidQABot:
    def __init__(self, data_file='dataSet.json'):
        print("🔄 Initializing multilingual Eid QA bot...")
        # Models and embeddings are created lazily on the first question,
        # which keeps server startup fast.
        self.bi_encoder = None
        self.cross_encoder = None
        print("📖 Processing dataset...")
        self.data = self._load_dataset(data_file)
        self.knowledge_chunks = self._create_chunks()
        self.chunk_embeddings = None
        self.question_patterns = self._initialize_question_patterns()
        print("✅ Bot ready!\n")

    def _ensure_embeddings(self):
        if self.chunk_embeddings is None:
            self._load_models()
            print("🧠 Creating embeddings...")
            self.chunk_embeddings = self.bi_encoder.encode(
                [chunk['text'] for chunk in self.knowledge_chunks],
                convert_to_tensor=True,
                show_progress_bar=True
            )

    def _load_dataset(self, data_file):
        try:
            with open(data_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading dataset: {e}")
            return []
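
    # Expected dataSet.json shape, inferred from how the entries are read in
    # _create_chunks below; the sample rows are illustrative, not taken from
    # the real file:
    #
    # [
    #   {"text": "Eid prayer is offered shortly after sunrise.", "tag": "prayer_timing"},
    #   {"text": "Qurbani may be performed on the 10th, 11th, or 12th of Dhul-Hijjah.", "tag": "qurbani_days"}
    # ]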

    def _create_chunks(self):
        chunks = []
        for item in self.data:
            text = item.get('text', '')
            if not text:
                continue  # skip malformed entries rather than crashing
            tag = item.get('tag', 'General')
            chunks.append({
                'text': text,
                'tag': tag,
                'type': 'original',
                'score_boost': 1.0
            })
            # Add prefixed duplicates with a score boost (or penalty) so
            # retrieval favours topically important entries.
            if 'eid' in text.lower() or 'عید' in text:
                chunks.append({'text': f"Eid info: {text}", 'tag': tag, 'type': 'enhanced', 'score_boost': 1.1})
            if 'prayer' in text.lower() or 'نماز' in text:
                chunks.append({'text': f"Prayer info: {text}", 'tag': tag, 'type': 'enhanced', 'score_boost': 1.2})
            if 'qurbani' in text.lower() or 'قربانی' in text or 'sacrifice' in text.lower():
                chunks.append({'text': f"Qurbani info: {text}", 'tag': tag, 'type': 'enhanced', 'score_boost': 1.2})
            if 'funny' in tag.lower() or 'shair' in tag.lower():
                chunks.append({'text': f"Fun: {text}", 'tag': tag, 'type': 'enhanced', 'score_boost': 0.9})
            if 'gaza' in text.lower() or 'غزہ' in text:
                chunks.append({'text': f"Gaza info: {text}", 'tag': tag, 'type': 'enhanced', 'score_boost': 1.3})
        return chunks
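
    # For example, an entry {"text": "Eid prayer is offered after sunrise.",
    # "tag": "prayer_timing"} yields the original chunk plus two boosted
    # variants: "Eid info: ..." (boost 1.1) and "Prayer info: ..." (boost 1.2).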

    def _load_models(self):
        if self.bi_encoder is None:
            print("🔄 Loading bi-encoder...")
            self.bi_encoder = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        if self.cross_encoder is None:
            print("🔄 Loading cross-encoder...")
            self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

    def _initialize_question_patterns(self):
        tag_keywords = {}
        for item in self.data:
            tag = item.get("tag", "").lower()
            if tag not in tag_keywords:
                tag_keywords[tag] = set()
            # Seed each tag with its own words, then add curated
            # English/Roman-Urdu/Urdu synonyms for the main topics.
            tag_keywords[tag].update(tag.replace('_', ' ').split())

            if "greeting" in tag:
                tag_keywords[tag].update(["hi", "hello", "salaam", "eid mubarak", "السلام"])
            elif "prayer" in tag:
                tag_keywords[tag].update(["prayer", "namaz", "salah", "نماز"])
            elif "qurbani" in tag or "sacrifice" in tag:
                tag_keywords[tag].update(["qurbani", "sacrifice", "janwar", "bakra", "قربانی"])
            elif "gaza" in tag:
                tag_keywords[tag].update(["gaza", "غزہ", "palestine", "فلسطین"])

        return {k: list(v) for k, v in tag_keywords.items()}
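
    # A dataset tag like "prayer_timing" therefore maps to keywords such as
    # ["prayer", "timing", "namaz", "salah", "نماز"] (order is arbitrary
    # because the lists are built from sets).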

    def _clean_input(self, text: str) -> str:
        text = re.sub(r'\s+', ' ', text.strip().lower())
        # \w matches Unicode word characters in Python 3, so Urdu/Arabic
        # script survives; only punctuation other than ؟ and ! is dropped.
        text = re.sub(r'[^\w\s؟!]', '', text)
        return text

    def _fuzzy_match(self, word: str, keywords: List[str]) -> bool:
        return any(fuzz.ratio(word, keyword) > 80 for keyword in keywords)
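
    # The 80 threshold tolerates common misspellings: fuzz.ratio("namaaz",
    # "namaz") is about 91, while unrelated words score far lower.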

    def _detect_question_type(self, question: str) -> str:
        cleaned_question = self._clean_input(question)
        words = cleaned_question.split()
        for category, keywords in self.question_patterns.items():
            if any(self._fuzzy_match(word, keywords) for word in words):
                return category
        return 'general'

    def _get_contextual_boost(self, chunk: Dict, question_type: str) -> float:
        boost = chunk.get('score_boost', 1.0)
        if question_type in chunk['tag'].lower():
            boost *= 1.3
        return boost

    def _is_time_sensitive(self, question: str) -> bool:
        time_keywords = ['time', 'waqt', 'kab', 'when', 'کب', 'وقت']
        return any(self._fuzzy_match(word, time_keywords) for word in question.lower().split())

    def answer_question(self, question: str) -> str:
        self._ensure_embeddings()  # also loads the models on first call

        cleaned_question = self._clean_input(question)
        if not cleaned_question:
            return self._get_default_response('empty')

        question_type = self._detect_question_type(cleaned_question)
        question_embedding = self.bi_encoder.encode(cleaned_question, convert_to_tensor=True)
        cos_scores = util.cos_sim(question_embedding, self.chunk_embeddings)[0]

        boosted_scores = [
            float(score) * self._get_contextual_boost(self.knowledge_chunks[i], question_type)
            for i, score in enumerate(cos_scores)
        ]

        # Stage 1: bi-encoder retrieval of the top candidates.
        top_k = min(15, len(self.knowledge_chunks))
        top_results = torch.topk(torch.tensor(boosted_scores), k=top_k)
        top_chunks = [self.knowledge_chunks[i]['text'] for i in top_results.indices.tolist()]
        top_scores = top_results.values.tolist()

        # Stage 2: cross-encoder reranking, blended 70/30 with the retrieval
        # score (note the two scores live on different scales).
        rerank_pairs = [(cleaned_question, chunk) for chunk in top_chunks]
        rerank_scores = self.cross_encoder.predict(rerank_pairs)

        combined_scores = [(rerank_scores[i] * 0.7 + top_scores[i] * 0.3) for i in range(len(rerank_scores))]
        best_idx = max(range(len(combined_scores)), key=lambda i: combined_scores[i])
        best_chunk = top_chunks[best_idx]
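
        # Illustrative numbers: a chunk with cosine similarity 0.62 and boost
        # 1.2 enters retrieval at 0.744; a cross-encoder score of 4.1 then
        # gives a combined score of 0.7 * 4.1 + 0.3 * 0.744 ≈ 3.09.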

        # Strip the retrieval prefixes added in _create_chunks before replying.
        for prefix in ["Eid info: ", "Prayer info: ", "Qurbani info: ", "Fun: ", "Gaza info: "]:
            if best_chunk.startswith(prefix):
                best_chunk = best_chunk[len(prefix):]
                break

        if self._is_time_sensitive(cleaned_question):
            date = datetime.datetime.now().strftime('%B %d, %Y')
            # Urdu: "Today is {date}. Eid al-Adha usually falls on 10th Dhul-Hijjah."
            best_chunk += f"\n\n🕒 آج {date} ہے۔ عید الاضحیٰ عام طور پر 10th Dhul-Hijjah کو ہوتی ہے۔"

        return best_chunk + "\n\nThis is a demo. Your feedback matters."

    def _get_default_response(self, question_type: str) -> str:
        return {
            'empty': "❓ Ask something about Eid!",
            'general': "🌟 I'm your Eid Assistant. Ask me anything about Eid!"
        }.get(question_type, "🌟 I'm your Eid Assistant. Ask me anything about Eid!")

    def get_random_by_tag(self, tag_keyword: str) -> str:
        matches = [c['text'] for c in self.knowledge_chunks if tag_keyword in c['tag'].lower()]
        return random.choice(matches) if matches else "No info found."


bot = EnhancedMultilingualEidQABot('dataSet.json')


@app.route('/ask', methods=['POST'])
def ask():
    # Guard against missing or non-JSON bodies instead of raising a 500.
    payload = request.get_json(silent=True) or {}
    question = payload.get('question', '')
    return jsonify({'answer': bot.answer_question(question)})
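
# Example request (server running locally on the default port; the question
# text is illustrative):
#   curl -X POST http://localhost:5000/ask \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Eid ki namaz kab hoti hai?"}'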


@app.route('/tags', methods=['GET'])
def tags():
    unique_tags = sorted({chunk['tag'] for chunk in bot.knowledge_chunks})
    return jsonify({'tags': unique_tags})


@app.route('/tag/<tag>', methods=['GET'])
def get_by_tag(tag):
    results = [chunk['text'] for chunk in bot.knowledge_chunks if tag.lower() in chunk['tag'].lower()]
    return jsonify({'results': results})


@app.route('/')
def home():
    return "✅ Eid Assistant API is running."


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 5000)))
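
# Quick smoke test without the HTTP layer (assumes this file is saved as
# app.py; the module name is hypothetical):
#   python -c "from app import bot; print(bot.answer_question('Eid Mubarak!'))"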