Spaces:

OrganizedProgrammers
/

3GPPDocFinder

Running

App Files Files Community

om4r932 committed 8 days ago

Commit

b76b6bd

1 Parent(s): e84b950

Add document listing via RAG for MCP implementation

Browse files

Files changed (2) hide show

app.py +98 -1
requirements.txt +7 -1

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ from io import StringIO
 import bm25s
 import numpy as np
 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 import json
@@ -10,6 +11,8 @@ import traceback
 import uuid
 import zipfile
 import io
 import subprocess
 import os
 import re
@@ -25,6 +28,11 @@ from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 from typing import Any, Dict, List, Literal, Optional
 from sklearn.preprocessing import MinMaxScaler
 nltk.download("wordnet")
@@ -440,6 +448,7 @@ class SpecDocFinder:
 finder_tsg = TsgDocFinder()
 finder_spec = SpecDocFinder()
 lemmatizer = WordNetLemmatizer()
 if os.path.exists("bm25s.zip"):
     with zipfile.ZipFile("bm25s.zip", 'r') as zip_ref:
@@ -646,4 +655,92 @@ def find_documents_batch(request: BatchDocRequest):
         results=results,
         missing=missing,
         search_time=time.time() - start_time
-    )

 import bm25s
 import numpy as np
 import pandas as pd
+import faiss
 import requests
 from bs4 import BeautifulSoup
 import json
 import uuid
 import zipfile
 import io
+import openai
+import httpx
 import subprocess
 import os
 import re
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 from typing import Any, Dict, List, Literal, Optional
+os.environ['CURL_CA_BUNDLE'] = ''
+from sentence_transformers import SentenceTransformer
+import warnings
+warnings.filterwarnings("ignore")
 from sklearn.preprocessing import MinMaxScaler
 nltk.download("wordnet")
 finder_tsg = TsgDocFinder()
 finder_spec = SpecDocFinder()
 lemmatizer = WordNetLemmatizer()
+model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", backend="onnx")
 if os.path.exists("bm25s.zip"):
     with zipfile.ZipFile("bm25s.zip", 'r') as zip_ref:
         results=results,
         missing=missing,
         search_time=time.time() - start_time
+    )
+def generate_keywords_from_rag_query(question: str):
+    llm = openai.OpenAI(
+        api_key=os.environ.get("GROQ_API_KEY"),
+        base_url="https://api.groq.com/openai/v1",
+        http_client=httpx.Client(verify=False)
+    )
+    system_prompt = """
+        You are a keyword extraction assistant specialized in technical documentation and knowledge retrieval.
+        Your task is to convert a natural language question into a concise set of search-friendly keywords that combine technical terms, abbreviations, and general descriptors.
+        Focus on terminology used in standards, technical specifications, or protocol documentation. Avoid full sentences, keep it short and focused.
+        Return the result as a single string, suitable for use in vector search or RAG pipelines.
+        Input (example):
+        "Explain the procedure for network slice selection"
+        Output:
+        "NSSF network slice selection"
+    """
+    messages = [{
+        "role": "system",
+        "content": system_prompt
+    }, {
+        "role": "user",
+        "content": f"Now process the following input: {question}"
+    }]
+    response = llm.chat.completions.create(messages=messages, model="llama-3.3-70b-versatile")
+    return response.choices[0].message.content
+class RAGRequest(BaseModel):  # Request body for POST /list-rag-docs
+    question: str  # natural-language question; used for keyword generation and as the embedding query
+    threshold: int  # forwarded unchanged as KeywordRequest2.threshold for the BM25 search
+    release: Optional[str] = None  # forwarded to the BM25 search request (presumably a release filter; confirm against KeywordRequest2)
+    working_group: Optional[str] = None  # forwarded to the BM25 search request (presumably a working-group filter; confirm)
+    spec_type: Optional[Literal["TS", "TR"]] = None  # forwarded to the BM25 search request; only "TS" or "TR" are accepted
+@app.post("/list-rag-docs")
+def get_docs_for_rag(req: RAGRequest):
+    keywords = generate_keywords_from_rag_query(req.question)
+    print(keywords)
+    doc_data = finder_spec.indexer_documents
+    unique_specs = []
+    documents = {}
+    results = search_spec_bm25(KeywordRequest2(keywords=keywords, threshold=req.threshold, release=req.release, working_group=req.working_group, spec_type=req.spec_type))
+    for result in results.results:
+        if result['id'] in unique_specs: continue
+        if result['id'] not in unique_specs:
+            unique_specs.append(result['id'])
+        content = dict(doc_data[result['id']])
+        content_bak = dict(doc_data[result['id']])
+        if isinstance(content, str): continue
+        for chapter in content_bak.keys():
+            if any(kw in chapter.lower() for kw in ["reference", "void"]) or any(kw in content_bak[chapter].lower() for kw in ["annex"]):
+                content.pop(chapter)
+        documents[f"{result['id']}*-*{result['title']}"] = content
+    faiss_index = faiss.IndexFlatIP(384)
+    meta = {}
+    contents = []
+    index_counter = 0
+    for spec in documents.keys():
+        for chapter, content in documents[spec].items():
+            contents.append(content)
+            meta[index_counter] = (spec.split("*-*")[0], spec.split("*-*")[1], chapter, content)
+            index_counter += 1
+    print("Done contents")
+    embedding = model.encode(contents, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True).astype('float32')
+    embedding = embedding.reshape(-1, 384)  # Forme (1, 384)
+    print(embedding.shape)
+    faiss_index.add(embedding)
+    embedding_query = model.encode(req.question, convert_to_numpy=True, normalize_embeddings=True).astype('float32')
+    embedding_query = embedding_query.reshape(1, -1)
+    distances, indices = faiss_index.search(embedding_query, 15)
+    outputs = []
+    for i, idx in enumerate(indices[0]):
+        if idx in meta:
+            outputs.append(f"{meta[idx]}")
+    return {"output": "\n".join(outputs)}

requirements.txt CHANGED Viewed

@@ -11,4 +11,10 @@ python-dotenv
 lxml
 nltk
 bm25s[full]
-scikit-learn

 lxml
 nltk
 bm25s[full]
+scikit-learn
+faiss-cpu
+sentence-transformers[onnx]
+transformers
+accelerate
+peft
+huggingface_hub