import os
import logging
import numpy as np
import firebase_admin
from firebase_admin import credentials, firestore
from sentence_transformers import SentenceTransformer
from flask import Flask, request, jsonify
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
import faiss  # For efficient similarity search

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Firebase Admin SDK
firebase_cred_path = os.getenv("FIREBASE_CRED_PATH", "path/to/your/firebase/credentials.json")
cred = credentials.Certificate(firebase_cred_path)
firebase_admin.initialize_app(cred)
db = firestore.client()

# Initialize the sentence-embedding model
model = SentenceTransformer('all-mpnet-base-v2')  # Strong general-purpose model for semantic search

# Initialize FAISS index for efficient similarity search.
# IndexFlatIP scores by inner product, which equals cosine similarity only for
# L2-normalized vectors, so every embedding below is normalized before indexing.
embedding_size = 768  # Dimensionality of 'all-mpnet-base-v2' embeddings
faiss_index = faiss.IndexFlatIP(embedding_size)
document_ids = []  # Maps FAISS row positions to Firestore document IDs
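
# The FAISS index and document_ids mapping live only in memory, so they do not
# survive a restart even though the documents persist in Firestore. One possible
# remedy, sketched here, is to rebuild both from the stored embeddings at startup.
def rebuild_index_from_firestore():
    """Repopulate the FAISS index and ID mapping from Firestore."""
    for snapshot in db.collection('documents').stream():
        embedding = snapshot.to_dict().get('embedding')
        if embedding is None:
            continue  # Skip documents stored without an embedding
        vector = np.asarray(embedding, dtype=np.float32).reshape(1, -1)
        faiss.normalize_L2(vector)  # No-op for vectors that are already normalized
        faiss_index.add(vector)
        document_ids.append(snapshot.id)

rebuild_index_from_firestore()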

# Initialize Flask app
app = Flask(__name__)

# Rate limiting
limiter = Limiter(
    app=app,
    key_func=get_remote_address,
    default_limits=["200 per day", "50 per hour"]
)

# Store documents in Firestore and mirror their embeddings in the FAISS index
def store_documents(documents):
    try:
        batch = db.batch()  # Note: a single Firestore batch allows at most 500 writes
        # Encode all documents at once; L2-normalization keeps the inner-product
        # index equivalent to cosine similarity
        embeddings = model.encode(documents, normalize_embeddings=True)

        doc_refs = []
        for doc, embedding in zip(documents, embeddings):
            # Auto-generated IDs avoid collisions when this function is called repeatedly
            doc_ref = db.collection('documents').document()
            batch.set(doc_ref, {
                'text': doc,
                'embedding': embedding.tolist()
            })
            doc_refs.append(doc_ref)

        batch.commit()

        # Update the in-memory state only after the commit succeeds, so a failed
        # write cannot desynchronize the FAISS index from Firestore
        document_ids.extend(ref.id for ref in doc_refs)
        faiss_index.add(np.asarray(embeddings, dtype=np.float32))
        logger.info(f"Stored {len(documents)} documents in Firebase and updated the FAISS index.")
        return True
    except Exception as e:
        logger.error(f"Error storing documents: {e}")
        return False

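# An example ingestion endpoint so store_documents can be exercised over HTTP;
# the route name and payload shape here are illustrative choices, not a fixed API
@app.route('/documents', methods=['POST'])
@limiter.limit("10 per minute")
def add_documents():
    payload = request.get_json(silent=True) or {}
    documents = payload.get('documents')
    if not isinstance(documents, list) or not documents:
        return jsonify({"error": "Provide a JSON body with a non-empty 'documents' list."}), 400
    if store_documents(documents):
        return jsonify({"stored": len(documents)}), 201
    return jsonify({"error": "Failed to store documents."}), 500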
# Search for the closest stored document using FAISS
def search_documents(query):
    try:
        # Encode and L2-normalize the query so it matches how documents were indexed
        query_embedding = model.encode(query, normalize_embeddings=True)
        query_embedding = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
        scores, indices = faiss_index.search(query_embedding, k=1)  # Find top-1 match

        if indices[0][0] == -1:
            return None  # Index is empty; no match found

        # Retrieve the best-matching document from Firestore
        best_doc_id = document_ids[indices[0][0]]
        best_doc = db.collection('documents').document(best_doc_id).get()
        return best_doc.to_dict()
    except Exception as e:
        logger.error(f"Error searching documents: {e}")
        return None

@app.route('/search', methods=['GET'])
@limiter.limit("10 per minute")  # Rate limit for search endpoint
def search():
    query = request.args.get('query')
    if not query:
        return jsonify({"error": "No query provided."}), 400

    result = search_documents(query)
    if result:
        return jsonify(result)
    else:
        return jsonify({"error": "No relevant documents found."}), 404

if __name__ == '__main__':
    # Run the Flask development server (debug=True is for local use only;
    # serve with a production WSGI server such as gunicorn when deploying)
    app.run(debug=True)
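
# Example usage once the server is running (Flask's default port is 5000):
#   curl -X POST http://127.0.0.1:5000/documents \
#        -H "Content-Type: application/json" \
#        -d '{"documents": ["FAISS enables fast vector search."]}'
#   curl "http://127.0.0.1:5000/search?query=fast%20vector%20search"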