Add document listing via RAG for MCP implementation
Browse files- app.py +98 -1
- requirements.txt +7 -1
app.py
CHANGED
@@ -2,6 +2,7 @@ from io import StringIO
|
|
2 |
import bm25s
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
|
|
5 |
import requests
|
6 |
from bs4 import BeautifulSoup
|
7 |
import json
|
@@ -10,6 +11,8 @@ import traceback
|
|
10 |
import uuid
|
11 |
import zipfile
|
12 |
import io
|
|
|
|
|
13 |
import subprocess
|
14 |
import os
|
15 |
import re
|
@@ -25,6 +28,11 @@ from fastapi.responses import FileResponse
|
|
25 |
from fastapi.staticfiles import StaticFiles
|
26 |
from pydantic import BaseModel
|
27 |
from typing import Any, Dict, List, Literal, Optional
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
from sklearn.preprocessing import MinMaxScaler
|
30 |
nltk.download("wordnet")
|
@@ -440,6 +448,7 @@ class SpecDocFinder:
|
|
440 |
finder_tsg = TsgDocFinder()
|
441 |
finder_spec = SpecDocFinder()
|
442 |
lemmatizer = WordNetLemmatizer()
|
|
|
443 |
|
444 |
if os.path.exists("bm25s.zip"):
|
445 |
with zipfile.ZipFile("bm25s.zip", 'r') as zip_ref:
|
@@ -646,4 +655,92 @@ def find_documents_batch(request: BatchDocRequest):
|
|
646 |
results=results,
|
647 |
missing=missing,
|
648 |
search_time=time.time() - start_time
|
649 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import bm25s
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
5 |
+
import faiss
|
6 |
import requests
|
7 |
from bs4 import BeautifulSoup
|
8 |
import json
|
|
|
11 |
import uuid
|
12 |
import zipfile
|
13 |
import io
|
14 |
+
import openai
|
15 |
+
import httpx
|
16 |
import subprocess
|
17 |
import os
|
18 |
import re
|
|
|
28 |
from fastapi.staticfiles import StaticFiles
|
29 |
from pydantic import BaseModel
|
30 |
from typing import Any, Dict, List, Literal, Optional
|
31 |
+
# NOTE(review): blanking CURL_CA_BUNDLE before the sentence_transformers import
# below is presumably a workaround for certificate errors during model download;
# it is commonly used to make `requests` skip TLS verification process-wide.
# TODO confirm this is still required and acceptable for this deployment.
os.environ['CURL_CA_BUNDLE'] = ''
|
32 |
+
from sentence_transformers import SentenceTransformer
|
33 |
+
import warnings
|
34 |
+
|
35 |
+
# Suppress every Python warning for the whole process (all categories, all modules).
warnings.filterwarnings("ignore")
|
36 |
|
37 |
from sklearn.preprocessing import MinMaxScaler
|
38 |
nltk.download("wordnet")
|
|
|
448 |
finder_tsg = TsgDocFinder()
|
449 |
finder_spec = SpecDocFinder()
|
450 |
lemmatizer = WordNetLemmatizer()
|
451 |
+
# Sentence-embedding model, loaded once at import time with the ONNX backend.
# get_docs_for_rag uses it to embed both the chapter texts and the question.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", backend="onnx")
|
452 |
|
453 |
if os.path.exists("bm25s.zip"):
|
454 |
with zipfile.ZipFile("bm25s.zip", 'r') as zip_ref:
|
|
|
655 |
results=results,
|
656 |
missing=missing,
|
657 |
search_time=time.time() - start_time
|
658 |
+
)
|
659 |
+
|
660 |
+
def generate_keywords_from_rag_query(question: str):
    """Turn a natural-language question into a short keyword string via the Groq LLM API.

    The question is sent to the ``llama-3.3-70b-versatile`` chat model through
    Groq's OpenAI-compatible endpoint, together with a system prompt asking for
    concise, search-friendly keywords.

    TLS certificate verification is enabled by default, because the request
    carries ``GROQ_API_KEY``. Set the environment variable
    ``GROQ_SKIP_TLS_VERIFY`` to ``1``/``true``/``yes`` to restore the previous
    unverified behaviour (e.g. behind a TLS-intercepting proxy).

    Args:
        question: The user's question.

    Returns:
        The keyword string produced by the model. If the model returns no
        content, ``question`` itself is returned so callers still have a
        usable search query.
    """
    system_prompt = """
    You are a keyword extraction assistant specialized in technical documentation and knowledge retrieval.
    Your task is to convert a natural language question into a concise set of search-friendly keywords that combine technical terms, abbreviations, and general descriptors.
    Focus on terminology used in standards, technical specifications, or protocol documentation. Avoid full sentences, keep it short and focused.

    Return the result as a single string, suitable for use in vector search or RAG pipelines.

    Input (example):
    "Explain the procedure for network slice selection"

    Output:
    "NSSF network slice selection"
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Now process the following input: {question}"},
    ]

    # Verify certificates unless the operator explicitly opts out.
    skip_verify = os.environ.get("GROQ_SKIP_TLS_VERIFY", "").strip().lower()
    verify_tls = skip_verify not in ("1", "true", "yes")

    # The context manager closes the connection pool after the call; the
    # original created a new, never-closed client on every request.
    with httpx.Client(verify=verify_tls) as http_client:
        llm = openai.OpenAI(
            api_key=os.environ.get("GROQ_API_KEY"),
            base_url="https://api.groq.com/openai/v1",
            http_client=http_client,
        )
        response = llm.chat.completions.create(
            messages=messages,
            model="llama-3.3-70b-versatile",
        )

    content = response.choices[0].message.content
    # message.content may be None or blank; fall back to the raw question
    # rather than handing None to the keyword search.
    if not content or not content.strip():
        return question
    return content
|
690 |
+
|
691 |
+
class RAGRequest(BaseModel):
    # Request body of POST /list-rag-docs.

    # Natural-language question; used for keyword generation and as the
    # embedding query in get_docs_for_rag.
    question: str
    # Forwarded unchanged as `threshold` to the BM25 keyword search request.
    threshold: int
    # Optional filters, forwarded unchanged to the BM25 keyword search request.
    release: Optional[str] = None
    working_group: Optional[str] = None
    spec_type: Optional[Literal["TS", "TR"]] = None
|
697 |
+
|
698 |
+
|
699 |
+
def _collect_rag_chapters(search_results, doc_data):
    """Flatten the chapters of each distinct spec hit into metadata tuples.

    Args:
        search_results: Iterable of hits, each subscriptable with ``'id'`` and
            ``'title'``.
        doc_data: Container mapping a spec id to its chapters
            (chapter title -> chapter text).

    Returns:
        A list of ``(spec_id, title, chapter, text)`` tuples in hit order,
        skipping duplicate specs, specs whose stored content is a plain
        string, chapters whose title contains "reference" or "void", and
        chapters whose text contains "annex".
    """
    seen_ids = set()
    chapters = []
    for result in search_results:
        spec_id = result['id']
        if spec_id in seen_ids:
            continue
        seen_ids.add(spec_id)

        raw_content = doc_data[spec_id]
        # This guard has to run before dict(): dict() on a non-empty str
        # raises ValueError, so the original post-conversion check was dead.
        if isinstance(raw_content, str):
            continue

        # str() keeps the tuple rendering identical to the previous
        # f-string join / split round trip, without relying on a separator
        # that could also occur inside a title.
        spec_label = str(spec_id)
        title = str(result['title'])
        for chapter, text in dict(raw_content).items():
            if any(kw in chapter.lower() for kw in ("reference", "void")):
                continue
            if "annex" in text.lower():
                continue
            chapters.append((spec_label, title, chapter, text))
    return chapters


@app.post("/list-rag-docs")
def get_docs_for_rag(req: RAGRequest):
    """Return the spec chapters most similar to a question.

    Steps:
      1. Ask the LLM for search keywords derived from ``req.question``.
      2. Run the BM25 spec search with those keywords and the request filters.
      3. Collect the chapters of every distinct hit (see
         ``_collect_rag_chapters`` for the exclusions).
      4. Embed the chapters and the question, then rank the chapters by inner
         product of the normalized embeddings.

    Args:
        req: Question, BM25 threshold and optional release / working group /
            spec type filters.

    Returns:
        ``{"output": str}`` where the string holds up to 15 lines, each the
        ``str()`` of a ``(spec_id, title, chapter, text)`` tuple, best match
        first. The string is empty when no chapter is left to rank.
    """
    keywords = generate_keywords_from_rag_query(req.question)
    print(keywords)

    results = search_spec_bm25(
        KeywordRequest2(
            keywords=keywords,
            threshold=req.threshold,
            release=req.release,
            working_group=req.working_group,
            spec_type=req.spec_type,
        )
    )

    meta = _collect_rag_chapters(results.results, finder_spec.indexer_documents)
    if not meta:
        # Nothing to embed or search; same payload the ranking would yield.
        return {"output": ""}

    contents = [entry[3] for entry in meta]
    print("Done contents")

    embedding = model.encode(
        contents,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True,
    ).astype('float32')
    embedding = embedding.reshape(len(contents), -1)
    print(embedding.shape)

    # Size the index from the embeddings instead of hard-coding the model's
    # output dimension.
    faiss_index = faiss.IndexFlatIP(embedding.shape[1])
    faiss_index.add(embedding)

    embedding_query = model.encode(
        req.question,
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype('float32').reshape(1, -1)

    # Never request more neighbours than there are vectors; FAISS pads the
    # result with -1 indices otherwise.
    top_k = min(15, len(meta))
    _, indices = faiss_index.search(embedding_query, top_k)

    outputs = [f"{meta[idx]}" for idx in indices[0] if 0 <= idx < len(meta)]
    return {"output": "\n".join(outputs)}
|
requirements.txt
CHANGED
@@ -11,4 +11,10 @@ python-dotenv
|
|
11 |
lxml
|
12 |
nltk
|
13 |
bm25s[full]
|
14 |
-
scikit-learn
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
lxml
|
12 |
nltk
|
13 |
bm25s[full]
|
14 |
+
scikit-learn
|
15 |
+
faiss-cpu
|
16 |
+
sentence-transformers[onnx]
|
17 |
+
transformers
|
18 |
+
accelerate
|
19 |
+
peft
|
20 |
+
huggingface_hub
|