om4r932 committed
Commit b76b6bd · 1 Parent(s): e84b950

Add document listing via RAG for MCP implementation

Files changed (2):
  1. app.py            +98 -1
  2. requirements.txt   +7 -1
app.py CHANGED
@@ -2,6 +2,7 @@ from io import StringIO
 import bm25s
 import numpy as np
 import pandas as pd
+import faiss
 import requests
 from bs4 import BeautifulSoup
 import json
@@ -10,6 +11,8 @@ import traceback
 import uuid
 import zipfile
 import io
+import openai
+import httpx
 import subprocess
 import os
 import re
@@ -25,6 +28,11 @@ from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 from typing import Any, Dict, List, Literal, Optional
+os.environ['CURL_CA_BUNDLE'] = ''
+from sentence_transformers import SentenceTransformer
+import warnings
+
+warnings.filterwarnings("ignore")
 
 from sklearn.preprocessing import MinMaxScaler
 nltk.download("wordnet")
@@ -440,6 +448,7 @@ class SpecDocFinder:
 finder_tsg = TsgDocFinder()
 finder_spec = SpecDocFinder()
 lemmatizer = WordNetLemmatizer()
+model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", backend="onnx")
 
 if os.path.exists("bm25s.zip"):
     with zipfile.ZipFile("bm25s.zip", 'r') as zip_ref:
@@ -646,4 +655,92 @@ def find_documents_batch(request: BatchDocRequest):
         results=results,
         missing=missing,
         search_time=time.time() - start_time
-    )
+    )
+
+def generate_keywords_from_rag_query(question: str):
+    llm = openai.OpenAI(
+        api_key=os.environ.get("GROQ_API_KEY"),
+        base_url="https://api.groq.com/openai/v1",
+        http_client=httpx.Client(verify=False)
+    )
+    system_prompt = """
+    You are a keyword extraction assistant specialized in technical documentation and knowledge retrieval.
+    Your task is to convert a natural language question into a concise set of search-friendly keywords that combine technical terms, abbreviations, and general descriptors.
+    Focus on terminology used in standards, technical specifications, or protocol documentation. Avoid full sentences; keep it short and focused.
+
+    Return the result as a single string, suitable for use in vector search or RAG pipelines.
+
+    Input (example):
+    "Explain the procedure for network slice selection"
+
+    Output:
+    "NSSF network slice selection"
+    """
+
+    messages = [{
+        "role": "system",
+        "content": system_prompt
+    }, {
+        "role": "user",
+        "content": f"Now process the following input: {question}"
+    }]
+
+    response = llm.chat.completions.create(messages=messages, model="llama-3.3-70b-versatile")
+    return response.choices[0].message.content
+
+class RAGRequest(BaseModel):
+    question: str
+    threshold: int
+    release: Optional[str] = None
+    working_group: Optional[str] = None
+    spec_type: Optional[Literal["TS", "TR"]] = None
+
+
+@app.post("/list-rag-docs")
+def get_docs_for_rag(req: RAGRequest):
+    keywords = generate_keywords_from_rag_query(req.question)
+    print(keywords)
+    doc_data = finder_spec.indexer_documents
+    unique_specs = []
+    documents = {}
+    results = search_spec_bm25(KeywordRequest2(keywords=keywords, threshold=req.threshold, release=req.release, working_group=req.working_group, spec_type=req.spec_type))
+
+    for result in results.results:
+        if result['id'] in unique_specs:
+            continue
+        unique_specs.append(result['id'])
+        content = doc_data[result['id']]
+        if isinstance(content, str): continue
+        content, content_bak = dict(content), dict(content)
+        for chapter in content_bak.keys():  # drop reference/void chapters and annex content
+            if any(kw in chapter.lower() for kw in ["reference", "void"]) or any(kw in content_bak[chapter].lower() for kw in ["annex"]):
+                content.pop(chapter)
+        documents[f"{result['id']}*-*{result['title']}"] = content
+
+    faiss_index = faiss.IndexFlatIP(384)
+    meta = {}
+    contents = []
+    index_counter = 0
+    for spec in documents.keys():
+        for chapter, content in documents[spec].items():
+            contents.append(content)
+            meta[index_counter] = (spec.split("*-*")[0], spec.split("*-*")[1], chapter, content)
+            index_counter += 1
+
+    print("Done contents")
+
+    embedding = model.encode(contents, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True).astype('float32')
+    embedding = embedding.reshape(-1, 384)  # shape (n_chapters, 384)
+    print(embedding.shape)
+    faiss_index.add(embedding)
+
+    embedding_query = model.encode(req.question, convert_to_numpy=True, normalize_embeddings=True).astype('float32')
+    embedding_query = embedding_query.reshape(1, -1)
+    distances, indices = faiss_index.search(embedding_query, 15)
+
+    outputs = []
+    for i, idx in enumerate(indices[0]):
+        if idx in meta:
+            outputs.append(f"{meta[idx]}")
+
+    return {"output": "\n".join(outputs)}
requirements.txt CHANGED
@@ -11,4 +11,10 @@ python-dotenv
 lxml
 nltk
 bm25s[full]
-scikit-learn
+scikit-learn
+faiss-cpu
+sentence-transformers[onnx]
+transformers
+accelerate
+peft
+huggingface_hub
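
With the new dependencies installed and GROQ_API_KEY set in the environment, the route can be smoke-tested with a plain HTTP POST. A hypothetical client call follows; the base URL is an assumption (adjust to wherever the app is served), and the threshold value is just an example cutoff forwarded to search_spec_bm25.

# Hypothetical smoke test for the new /list-rag-docs route.
import requests

resp = requests.post(
    "http://localhost:7860/list-rag-docs",   # assumed host/port
    json={
        "question": "Explain the procedure for network slice selection",
        "threshold": 60,                     # example BM25 cutoff
        "spec_type": "TS",                   # optional: "TS" or "TR"
    },
)
resp.raise_for_status()
print(resp.json()["output"])  # newline-joined (spec_id, title, chapter, text) tuples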