File size: 8,485 Bytes
c653454
f9dbb85
 
 
 
 
 
 
 
 
 
 
 
060f3e9
f9dbb85
 
 
060f3e9
d2a7141
f9dbb85
 
d2a7141
f9dbb85
 
 
 
d2a7141
 
f9dbb85
 
 
d2a7141
 
 
 
 
 
 
f9dbb85
 
 
 
 
 
 
d2a7141
f9dbb85
 
 
 
 
 
 
 
 
 
 
 
d2a7141
f9dbb85
d2a7141
f9dbb85
d2a7141
f9dbb85
d2a7141
f9dbb85
d2a7141
f9dbb85
d2a7141
f9dbb85
 
d2a7141
f9dbb85
 
d2a7141
f9dbb85
 
 
d2a7141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9dbb85
 
d2a7141
f9dbb85
d2a7141
f9dbb85
 
d2a7141
f9dbb85
 
 
 
 
 
 
d2a7141
f9dbb85
 
d2a7141
f9dbb85
 
d2a7141
f9dbb85
d2a7141
f9dbb85
d2a7141
f9dbb85
 
 
 
 
 
 
d2a7141
f9dbb85
 
 
d2a7141
 
 
 
f9dbb85
d2a7141
 
f9dbb85
d2a7141
f9dbb85
 
d2a7141
 
f9dbb85
 
d2a7141
 
 
 
f9dbb85
d2a7141
f9dbb85
d2a7141
 
 
060f3e9
d2a7141
f9dbb85
d2a7141
 
 
 
 
 
 
 
f9dbb85
 
 
 
 
d2a7141
 
 
 
 
 
 
 
 
 
 
 
 
 
6cfb658
 
d2a7141
f9dbb85
060f3e9
d2a7141
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# Eid code
from flask import Flask, request, jsonify
from flask_cors import CORS
import os
import json
import re
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch
from typing import List, Dict
import random
import datetime
from fuzzywuzzy import fuzz

app = Flask(__name__)  # Flask app keyed to this module's import name.
CORS(app)  # Enable cross-origin requests so a separate frontend can call the API.

class EnhancedMultilingualEidQABot:
    """Multilingual (English/Urdu) retrieval QA bot for Eid-related questions.

    Retrieval is two-stage: a bi-encoder shortlists candidate knowledge
    chunks by cosine similarity (with tag-based score boosts), then a
    cross-encoder re-ranks the shortlist.  Models and chunk embeddings are
    loaded/computed lazily on the first answered question to keep startup
    cheap.
    """

    def __init__(self, data_file: str = 'dataSet.json'):
        """Load the dataset and prepare chunks; defer model loading.

        :param data_file: Path to a JSON list of ``{'text': ..., 'tag': ...}``
            items.
        """
        print("\U0001F504 Loading multilingual models...")
        # Models are loaded lazily in _load_models, not here.
        self.bi_encoder = None
        self.cross_encoder = None
        print("\U0001F4D6 Processing dataset...")
        self.data = self._load_dataset(data_file)
        self.knowledge_chunks = self._create_chunks()
        # Embeddings are computed on first use (see _ensure_embeddings).
        self.chunk_embeddings = None
        self.question_patterns = self._initialize_question_patterns()
        print("\u2705 Bot ready!\n")

    def _ensure_embeddings(self):
        """Lazily load both models and embed every knowledge chunk once."""
        # Always make sure the models exist; _load_models is idempotent.
        self._load_models()
        if self.chunk_embeddings is None:
            print("\U0001F9E0 Creating embeddings...")
            self.chunk_embeddings = self.bi_encoder.encode(
                [chunk['text'] for chunk in self.knowledge_chunks],
                convert_to_tensor=True,
                show_progress_bar=True
            )

    def _load_dataset(self, data_file):
        """Read the JSON dataset; return [] on any failure (best-effort)."""
        try:
            with open(data_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            # Deliberate best-effort: a missing/corrupt dataset must not
            # prevent the app from starting.
            print(f"Error loading dataset: {e}")
            return []

    def _create_chunks(self):
        """Build retrieval chunks from the raw dataset.

        Each valid item yields its original text plus topic-specific
        "enhanced" copies whose ``score_boost`` biases retrieval toward
        (boost > 1) or away from (boost < 1) them.

        :return: List of chunk dicts with keys text/tag/type/score_boost.
        """
        chunks = []
        for item in self.data:
            # Skip malformed items instead of raising KeyError on 'text'.
            text = item.get('text', '')
            if not text:
                continue
            tag = item.get('tag', 'General')
            chunks.append({
                'text': text,
                'tag': tag,
                'type': 'original',
                'score_boost': 1.0
            })
            if 'eid' in text.lower() or 'عید' in text:
                chunks.append({'text': f"Eid info: {text}", 'tag': tag, 'type': 'enhanced', 'score_boost': 1.1})
            if 'prayer' in text.lower() or 'نماز' in text:
                chunks.append({'text': f"Prayer info: {text}", 'tag': tag, 'type': 'enhanced', 'score_boost': 1.2})
            if 'qurbani' in text.lower() or 'قربانی' in text or 'sacrifice' in text.lower():
                chunks.append({'text': f"Qurbani info: {text}", 'tag': tag, 'type': 'enhanced', 'score_boost': 1.2})
            if 'funny' in tag.lower() or 'shair' in tag.lower():
                chunks.append({'text': f"Fun: {text}", 'tag': tag, 'type': 'enhanced', 'score_boost': 0.9})
            if 'gaza' in text.lower() or 'غزہ' in text:
                chunks.append({'text': f"Gaza info: {text}", 'tag': tag, 'type': 'enhanced', 'score_boost': 1.3})
        return chunks

    def _load_models(self):
        """Load the bi-encoder and cross-encoder exactly once (idempotent)."""
        if self.bi_encoder is None:
            print("\U0001F504 Loading bi-encoder...")
            self.bi_encoder = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        if self.cross_encoder is None:
            print("\U0001F504 Loading cross-encoder...")
            self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

    def _initialize_question_patterns(self):
        """Map each dataset tag to a list of trigger keywords.

        Keywords come from the tag's own words plus hand-picked heuristics
        for common categories (greetings, prayer, qurbani, gaza).
        """
        tag_keywords = {}
        for item in self.data:
            tag = item.get("tag", "").lower()
            if tag not in tag_keywords:
                tag_keywords[tag] = set()
            tag_keywords[tag].update(tag.replace('_', ' ').split())

            # Heuristics
            if "greeting" in tag:
                tag_keywords[tag].update(["hi", "hello", "salaam", "eid mubarak", "السلام"])
            elif "prayer" in tag:
                tag_keywords[tag].update(["prayer", "namaz", "salah", "نماز"])
            elif "qurbani" in tag or "sacrifice" in tag:
                tag_keywords[tag].update(["qurbani", "sacrifice", "janwar", "bakra", "قربانی"])
            elif "gaza" in tag:
                tag_keywords[tag].update(["gaza", "غزہ", "palestine", "فلسطین"])

        return {k: list(v) for k, v in tag_keywords.items()}

    def _clean_input(self, text: str) -> str:
        """Lowercase, collapse whitespace, and strip punctuation (keeps ؟ and !)."""
        text = re.sub(r'\s+', ' ', text.strip().lower())
        text = re.sub(r'[^\w\s؟!]', '', text)
        return text

    def _fuzzy_match(self, word: str, keywords: List[str]) -> bool:
        """True if *word* is a fuzzy (ratio > 80) match for any keyword."""
        return any(fuzz.ratio(word, keyword) > 80 for keyword in keywords)

    def _detect_question_type(self, question: str) -> str:
        """Classify the question into a tag category via fuzzy keyword match."""
        cleaned_question = self._clean_input(question)
        words = cleaned_question.split()
        for category, keywords in self.question_patterns.items():
            if any(self._fuzzy_match(word, keywords) for word in words):
                return category
        return 'general'

    def _get_contextual_boost(self, chunk: Dict, question_type: str) -> float:
        """Combine a chunk's intrinsic boost with a 1.3x bonus on tag match."""
        boost = chunk.get('score_boost', 1.0)
        if question_type in chunk['tag'].lower():
            boost *= 1.3
        return boost

    def _is_time_sensitive(self, question: str) -> bool:
        """True if the question asks about timing (English or Urdu keywords)."""
        time_keywords = ['time', 'waqt', 'kab', 'when', 'کب', 'وقت']
        return any(self._fuzzy_match(word, time_keywords) for word in question.lower().split())

    def answer_question(self, question: str) -> str:
        """Answer *question* via bi-encoder retrieval + cross-encoder re-ranking.

        :param question: Raw user question (English/Urdu/Roman Urdu).
        :return: Best-matching chunk text with demo footer (and a date note
            for time-sensitive questions).
        """
        # Validate input BEFORE the expensive lazy model/embedding load.
        cleaned_question = self._clean_input(question)
        if not cleaned_question:
            return self._get_default_response('empty')
        if not self.knowledge_chunks:
            # Nothing to retrieve from (e.g. the dataset failed to load).
            return self._get_default_response('general')

        self._ensure_embeddings()  # also loads both models

        question_type = self._detect_question_type(cleaned_question)
        question_embedding = self.bi_encoder.encode(cleaned_question, convert_to_tensor=True)
        cos_scores = util.cos_sim(question_embedding, self.chunk_embeddings)[0]

        # Vectorized boost — avoids building a Python list of 0-d tensors
        # and re-wrapping it with torch.tensor (deprecated pattern).
        boost_factors = torch.tensor(
            [self._get_contextual_boost(chunk, question_type) for chunk in self.knowledge_chunks],
            device=cos_scores.device,
        )
        boosted_scores = cos_scores * boost_factors

        top_k = min(15, len(self.knowledge_chunks))
        top_results = torch.topk(boosted_scores, k=top_k)
        top_chunks = [self.knowledge_chunks[i]['text'] for i in top_results.indices.tolist()]
        top_scores = top_results.values.tolist()

        rerank_pairs = [(cleaned_question, chunk) for chunk in top_chunks]
        rerank_scores = self.cross_encoder.predict(rerank_pairs)

        # Weighted blend: cross-encoder score dominates, retrieval score
        # acts as a tiebreaker.
        combined_scores = [(rerank_scores[i] * 0.7 + top_scores[i] * 0.3) for i in range(len(rerank_scores))]
        best_idx = max(range(len(combined_scores)), key=lambda i: combined_scores[i])
        best_chunk = top_chunks[best_idx]

        # Strip the synthetic prefix added by _create_chunks, if present.
        for prefix in ["Eid info: ", "Prayer info: ", "Qurbani info: ", "Fun: ", "Gaza info: "]:
            if best_chunk.startswith(prefix):
                best_chunk = best_chunk[len(prefix):]
                break

        if self._is_time_sensitive(cleaned_question):
            date = datetime.datetime.now().strftime('%B %d, %Y')
            best_chunk += f"\n\n🕒 آج {date} ہے۔ عید الاضحیٰ عام طور پر 10th Dhul-Hijjah کو ہوتی ہے۔"

        return best_chunk + "\n\n This is a demo. Your feedback matters."

    def _get_default_response(self, question_type: str) -> str:
        """Canned fallback replies keyed by question type."""
        return {
            'empty': "❓ Ask something about Eid!",
            'general': "🌟 I'm your Eid Assistant. Ask me anything about Eid!"
        }.get(question_type, "🌟 I'm your Eid Assistant. Ask me anything about Eid!")

    def get_random_by_tag(self, tag_keyword: str) -> str:
        """Return a random chunk text whose tag contains *tag_keyword*."""
        matches = [c['text'] for c in self.knowledge_chunks if tag_keyword in c['tag'].lower()]
        return random.choice(matches) if matches else "No info found."

# Instantiate the bot: a module-level singleton shared by all routes below.
# NOTE(review): this runs at import time and reads 'dataSet.json' from the
# current working directory — confirm the file is present wherever the app
# is launched (e.g. under gunicorn).
bot = EnhancedMultilingualEidQABot('dataSet.json')

@app.route('/ask', methods=['POST'])
def ask():
    """Answer the question in the POST body's JSON ``question`` field."""
    # silent=True returns None (instead of raising) on a missing or invalid
    # JSON body; the `or {}` fallback keeps .get() safe in that case.
    payload = request.get_json(silent=True) or {}
    question = payload.get('question', '')
    return jsonify({'answer': bot.answer_question(question)})

@app.route('/tags', methods=['GET'])
def tags():
    """Return the sorted list of distinct knowledge-chunk tags."""
    seen = set()
    for chunk in bot.knowledge_chunks:
        seen.add(chunk['tag'])
    return jsonify({'tags': sorted(seen)})

@app.route('/tag/<tag>', methods=['GET'])
def get_by_tag(tag):
    """Return all chunk texts whose tag contains *tag* (case-insensitive)."""
    needle = tag.lower()
    matching = []
    for chunk in bot.knowledge_chunks:
        if needle in chunk['tag'].lower():
            matching.append(chunk['text'])
    return jsonify({'results': matching})

@app.route('/')
def home():
    """Health-check endpoint confirming the API is up."""
    status_message = "✅ Eid Assistant API is running."
    return status_message

if __name__ == '__main__':  # Run the dev server only when executed directly.
    # PORT env var allows platform-assigned ports (e.g. Heroku/Render).
    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 5000)))