Spaces: Running on Zero

zamalali committed · Commit 8d67bd2
Parent(s): 7c0e46f

Added changes to enhance the metrics

Files changed:
- __pycache__/main.cpython-313.pyc  +0 -0
- app.py   +36 -21
- main.py  +127 -36

__pycache__/main.cpython-313.pyc
ADDED
Binary file (17.6 kB).
app.py
CHANGED
@@ -2,9 +2,8 @@ import gradio as gr
 import time
 import threading
 import logging
-import spaces
 from main import run_repository_ranking  # Your repository ranking function
-
+import spaces
 # ---------------------------
 # Global Logging Buffer Setup
 # ---------------------------
@@ -38,10 +37,17 @@ def filter_logs(logs):
     return filtered
 
 def parse_result_to_html(raw_result: str) -> str:
-
+    """
+    Parses the raw string output from run_repository_ranking to an HTML table.
+    Only the top 10 results are displayed.
+    """
     entries = raw_result.strip().split("Final Rank:")
+    # Only use the first 10 entries (if available)
+    entries = entries[1:11]
+    if not entries:
+        return "<p>No repositories found for your query.</p>"
     html = """
-    <table border="1" style="width:
+    <table border="1" style="width:80%; margin: auto; border-collapse: collapse;">
     <thead>
       <tr>
         <th>Rank</th>
@@ -52,10 +58,10 @@ def parse_result_to_html(raw_result: str) -> str:
     </thead>
     <tbody>
     """
-    for entry in entries
+    for entry in entries:
         lines = entry.strip().split("\n")
         data = {}
-        data["Final Rank"] = lines[0].strip()
+        data["Final Rank"] = lines[0].strip() if lines else ""
         for line in lines[1:]:
             if ": " in line:
                 key, val = line.split(": ", 1)
@@ -115,9 +121,18 @@ def lite_runner(topic):
     yield status, details
 
 # ---------------------------
-# App UI Setup Using Gradio Soft Theme
+# App UI Setup Using Gradio Soft Theme with Centered Layout
 # ---------------------------
-with gr.Blocks(theme="gstaff/sketch", title="DeepGit Lite", fill_width=True) as demo:
+with gr.Blocks(
+    theme="gstaff/sketch",
+    title="DeepGit Lite",
+    css="""
+    /* Center header and footer */
+    #header { text-align: center; margin-bottom: 20px; }
+    #main-container { max-width: 800px; margin: auto; }
+    #footer { text-align: center; margin-top: 20px; }
+    """
+) as demo:
     gr.HTML(
         """
         <head>
@@ -131,20 +146,20 @@ with gr.Blocks(theme="gstaff/sketch", title="DeepGit Lite", fill_width=True) as demo:
         # DeepGit Lite
         Explore GitHub repositories with deep semantic search.
         Check out our [GitHub](https://github.com/zamalali/DeepGit) for more details.
-        """
+        """,
+        elem_id="header"
     )
 
-
-
-
-
-
-
-
-
-
-
-    detail_display = gr.HTML(label="Results")
+    # Centered main container for inputs and outputs.
+    with gr.Column(elem_id="main-container"):
+        research_input = gr.Textbox(
+            label="Research Query",
+            placeholder="Enter your research topic here, e.g., 'data augmentation pipelines for LLM fine-tuning'",
+            lines=3
+        )
+        run_button = gr.Button("Run DeepGit Lite", variant="primary")
+        status_display = gr.Markdown(label="Status")
+        detail_display = gr.HTML(label="Results")
 
     run_button.click(
         fn=lite_runner,
@@ -164,7 +179,7 @@ with gr.Blocks(theme="gstaff/sketch", title="DeepGit Lite", fill_width=True) as demo:
 
     gr.HTML(
         """
-        <div>
+        <div id="footer">
             Made with ❤️ by <b>Zamal</b>
         </div>
         """
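Note: the following is a minimal standalone sketch, not part of the commit, showing how the new entry slicing in parse_result_to_html behaves. It assumes the "Final Rank:" / "Title:" / "Link:" line format that run_repository_ranking in main.py emits; the repository names and URLs below are made up.

# Sketch: replicate the new parsing logic on a toy ranking string.
raw_result = (
    "\n=== Ranked Repositories ===\n"
    "Final Rank: 1\nTitle: repo-a\nLink: https://github.com/org/repo-a\n"
    "Combined Score: 0.9123\n" + "-" * 80 + "\n"
    "Final Rank: 2\nTitle: repo-b\nLink: https://github.com/org/repo-b\n"
    "Combined Score: 0.8456\n" + "-" * 80 + "\n"
)

entries = raw_result.strip().split("Final Rank:")
entries = entries[1:11]  # entries[0] is the header chunk; keep at most 10 results

for entry in entries:
    lines = entry.strip().split("\n")
    data = {"Final Rank": lines[0].strip() if lines else ""}
    for line in lines[1:]:
        if ": " in line:  # "key: value" rows become table cells
            key, val = line.split(": ", 1)
            data[key.strip()] = val.strip()
    print(data)  # e.g. {'Final Rank': '1', 'Title': 'repo-a', ...}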
main.py
CHANGED
@@ -4,9 +4,10 @@ import requests
 import numpy as np
 import faiss
 import re
-from sentence_transformers import SentenceTransformer
-from dotenv import load_dotenv
+import logging
 from pathlib import Path
+from dotenv import load_dotenv
+from sentence_transformers import SentenceTransformer, CrossEncoder
 from langchain_groq import ChatGroq
 from langchain_core.prompts import ChatPromptTemplate
 
@@ -20,8 +21,10 @@ except ImportError:
 # Environment Setup
 # ---------------------------
 load_dotenv()
+# Set the cross-encoder model from environment or use a default SOTA model.
+CROSS_ENCODER_MODEL = os.getenv("CROSS_ENCODER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
 
-# Setup a persistent session for GitHub API requests
+# Setup a persistent session for GitHub API requests.
 session = requests.Session()
 session.headers.update({
     "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
@@ -29,7 +32,7 @@ session.headers.update({
 })
 
 # ---------------------------
-# Langchain Groq Setup
+# Langchain Groq Setup for Search Tag Conversion
 # ---------------------------
 llm = ChatGroq(
     model="deepseek-r1-distill-llama-70b",
@@ -62,30 +65,44 @@ Rules:
 - If your output does not strictly match the required format, correct it after your internal reasoning.
 - Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.
 
+Excellent Examples:
+
+Input: "No code tool to augment image and annotation"
+Output: image-augmentation:albumentations
+
+Input: "Repos around chain of thought prompting mainly for finetuned models"
+Output: chain-of-thought:finetuned-llm
+
+Input: "Find repositories implementing data augmentation pipelines in JavaScript"
+Output: data-augmentation:target-javascript
+
 Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
 """),
     ("human", "{query}")
 ])
 chain = prompt | llm
 
-def
+def valid_tags(tags: str) -> bool:
     """
-
+    Validates that the output is one to six colon-separated tokens composed
+    of lowercase letters, numbers, and hyphens.
     """
-
-
-        end_index = response_str.index("</think>") + len("</think>")
-        tags = response_str[end_index:].strip()
-        return tags
-    else:
-        return response_str.strip()
+    pattern = r'^[a-z0-9-]+(?::[a-z0-9-]+){1,5}$'
+    return re.match(pattern, tags) is not None
 
-def
+def parse_search_tags(response: str) -> str:
     """
-
+    Extracts a valid colon-separated tag string from the LLM response.
+    This function removes any chain-of-thought commentary.
     """
-
-
+    # Remove any text inside <think>...</think> blocks.
+    cleaned = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
+    # Use regex to find a valid tag pattern.
+    pattern = r'([a-z0-9-]+(?::[a-z0-9-]+){1,5})'
+    match = re.search(pattern, cleaned)
+    if match:
+        return match.group(1).strip()
+    return cleaned.strip()
 
 def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
     print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
@@ -110,6 +127,7 @@ def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str
 # GitHub API Helper Functions
 # ---------------------------
 def fetch_readme_content(repo_full_name):
+    """Fetch the README content (if available) using the GitHub API."""
     readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
     response = session.get(readme_url)
     if response.status_code == 200:
@@ -120,6 +138,30 @@ def fetch_readme_content(repo_full_name):
         return ""
     return ""
 
+def fetch_markdown_contents(repo_full_name):
+    """
+    Fetch all markdown files (except the README already fetched) from the root of the repository.
+    """
+    url = f"https://api.github.com/repos/{repo_full_name}/contents"
+    response = session.get(url)
+    contents = ""
+    if response.status_code == 200:
+        items = response.json()
+        for item in items:
+            if item.get("type") == "file" and item.get("name", "").lower().endswith(".md"):
+                file_url = item.get("download_url")
+                if file_url:
+                    file_resp = requests.get(file_url)
+                    if file_resp.status_code == 200:
+                        contents += "\n" + file_resp.text
+    return contents
+
+def fetch_all_markdown(repo_full_name):
+    """Combine README with all markdown contents from the repository root."""
+    readme = fetch_readme_content(repo_full_name)
+    other_md = fetch_markdown_contents(repo_full_name)
+    return readme + "\n" + other_md
+
 def fetch_github_repositories(query, max_results=10):
     """
     Searches GitHub repositories using the provided query and retrieves key information.
@@ -137,9 +179,8 @@ def fetch_github_repositories(query, max_results=10):
     for repo in response.json().get('items', []):
         repo_link = repo.get('html_url')
         description = repo.get('description') or ""
-
-
-        combined_text = (description + "\n" + readme_content).strip()
+        combined_markdown = fetch_all_markdown(repo.get('full_name'))
+        combined_text = (description + "\n" + combined_markdown).strip()
         repo_list.append({
             "title": repo.get('name', 'No title available'),
             "link": repo_link,
@@ -148,9 +189,9 @@ def fetch_github_repositories(query, max_results=10):
     return repo_list
 
 # ---------------------------
-# Initialize SentenceTransformer Model
+# Initialize SentenceTransformer Model for Dense Retrieval
 # ---------------------------
-model = SentenceTransformer('all-
+model = SentenceTransformer('all-mpnet-base-v2')
 
 def robust_min_max_norm(scores):
     """
@@ -163,19 +204,65 @@ def robust_min_max_norm(scores):
     return (scores - min_val) / (max_val - min_val)
 
 # ---------------------------
-#
+# Cross-Encoder Re-Ranking Function
+# ---------------------------
+def cross_encoder_rerank_candidates(candidates, query, model_name, top_n=10):
+    """
+    Re-ranks candidate repositories using a cross-encoder model.
+    For long documents, the text is split into chunks and scores are aggregated.
+    """
+    cross_encoder = CrossEncoder(model_name)
+    CHUNK_SIZE = 2000      # characters per chunk
+    MAX_DOC_LENGTH = 5000  # cap for long docs
+    MIN_DOC_LENGTH = 200   # threshold for short docs
+
+    def split_text(text, chunk_size=CHUNK_SIZE):
+        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
+
+    for candidate in candidates:
+        doc = candidate.get("combined_text", "")
+        if len(doc) > MAX_DOC_LENGTH:
+            doc = doc[:MAX_DOC_LENGTH]
+        try:
+            if len(doc) < MIN_DOC_LENGTH:
+                score = cross_encoder.predict([[query, doc]])
+                candidate["cross_encoder_score"] = float(score[0])
+            else:
+                chunks = split_text(doc)
+                pairs = [[query, chunk] for chunk in chunks]
+                scores = cross_encoder.predict(pairs)
+                max_score = np.max(scores) if len(scores) > 0 else 0.0
+                avg_score = np.mean(scores) if len(scores) > 0 else 0.0
+                candidate["cross_encoder_score"] = float(0.5 * max_score + 0.5 * avg_score)
+        except Exception as e:
+            logging.error(f"Error scoring candidate {candidate.get('link', 'unknown')}: {e}")
+            candidate["cross_encoder_score"] = 0.0
+
+    all_scores = [candidate["cross_encoder_score"] for candidate in candidates]
+    if all_scores:
+        min_score = min(all_scores)
+        if min_score < 0:
+            for candidate in candidates:
+                candidate["cross_encoder_score"] += -min_score
+
+    reranked = sorted(candidates, key=lambda x: x["cross_encoder_score"], reverse=True)
+    return reranked[:top_n]
+
+# ---------------------------
+# Main Function: Repository Ranking with Hybrid Retrieval and Cross-Encoder Re-Ranking
 # ---------------------------
 def run_repository_ranking(query: str) -> str:
     """
     Converts the user query into search tags, runs multiple GitHub queries (individual and combined),
-    deduplicates results, and applies hybrid
+    deduplicates results, and applies a hybrid ranking strategy:
+      - Dense embeddings (via SentenceTransformer) combined with BM25 scoring.
+      - Re-ranks top candidates using a cross-encoder for improved contextual alignment.
     """
     # Step 1: Generate search tags from the query.
     search_tags = iterative_convert_to_search_tags(query)
     tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]
 
     # Step 2: Handle target language extraction.
-    target_lang = None
     if any(tag.startswith("target-") for tag in tag_list):
         target_tag = next(tag for tag in tag_list if tag.startswith("target-"))
         target_lang = target_tag.replace("target-", "")
@@ -195,7 +282,7 @@ def run_repository_ranking(query: str) -> str:
         repos = fetch_github_repositories(github_query, max_results=15)
         all_repositories.extend(repos)
 
-    #
+    # Combined query using OR logic.
     combined_query = " OR ".join(tag_list)
     combined_query = f"({combined_query}) {advanced_qualifier} {lang_query}"
     print("Combined GitHub Query:", combined_query)
@@ -208,7 +295,6 @@ def run_repository_ranking(query: str) -> str:
         if repo["link"] not in unique_repositories:
             unique_repositories[repo["link"]] = repo
         else:
-            # Merge content if the repository appears in multiple queries.
            existing_text = unique_repositories[repo["link"]]["combined_text"]
            unique_repositories[repo["link"]]["combined_text"] = existing_text + "\n" + repo["combined_text"]
     repositories = list(unique_repositories.values())
@@ -216,10 +302,10 @@ def run_repository_ranking(query: str) -> str:
     if not repositories:
         return "No repositories found for your query."
 
-    # Step 4: Prepare documents
+    # Step 4: Prepare documents.
     docs = [repo.get("combined_text", "") for repo in repositories]
 
-    # Step 5:
+    # Step 5: Dense retrieval.
     doc_embeddings = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
     if doc_embeddings.ndim == 1:
         doc_embeddings = doc_embeddings.reshape(1, -1)
@@ -239,7 +325,7 @@ def run_repository_ranking(query: str) -> str:
     dense_scores = D.squeeze()
     norm_dense_scores = robust_min_max_norm(dense_scores)
 
-    # Step 6:
+    # Step 6: BM25 scoring.
     if BM25Okapi is not None:
         tokenized_docs = [re.findall(r'\w+', doc.lower()) for doc in docs]
         bm25 = BM25Okapi(tokenized_docs)
@@ -249,22 +335,27 @@ def run_repository_ranking(query: str) -> str:
     else:
         norm_bm25_scores = np.zeros_like(norm_dense_scores)
 
-    # Step 7: Combine scores
-    alpha = 0.8
+    # Step 7: Combine scores.
+    alpha = 0.8
     combined_scores = alpha * norm_dense_scores + (1 - alpha) * norm_bm25_scores
-
     for idx, repo in enumerate(repositories):
         repo["combined_score"] = float(combined_scores[idx])
 
-    # Step 8:
+    # Step 8: Initial ranking.
     ranked_repositories = sorted(repositories, key=lambda x: x.get("combined_score", 0), reverse=True)
 
+    # Step 9: Cross-Encoder Re-Ranking.
+    top_candidates = ranked_repositories[:100] if len(ranked_repositories) > 100 else ranked_repositories
+    final_ranked = cross_encoder_rerank_candidates(top_candidates, query, model_name=CROSS_ENCODER_MODEL, top_n=10)
+
+    # Step 10: Format output.
     output = "\n=== Ranked Repositories ===\n"
-    for rank, repo in enumerate(
+    for rank, repo in enumerate(final_ranked, 1):
         output += f"Final Rank: {rank}\n"
         output += f"Title: {repo['title']}\n"
         output += f"Link: {repo['link']}\n"
         output += f"Combined Score: {repo.get('combined_score', 0):.4f}\n"
+        output += f"Cross-Encoder Score: {repo.get('cross_encoder_score', 0):.4f}\n"
         snippet = repo['combined_text'][:300].replace('\n', ' ')
         output += f"Snippet: {snippet}...\n"
         output += '-' * 80 + "\n"
@@ -275,6 +366,6 @@ def run_repository_ranking(query: str) -> str:
 # Main Entry Point for Testing
 # ---------------------------
 if __name__ == "__main__":
-    test_query = "
+    test_query = "Chain of thought prompting for reasoning models"
     result = run_repository_ranking(test_query)
     print(result)
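Note: a minimal sketch, not part of the commit, of the score arithmetic introduced above, with made-up numbers in place of real model outputs: the alpha-weighted blend of normalized dense and BM25 scores, and the 0.5 * max + 0.5 * mean aggregation the new cross-encoder re-ranker applies to per-chunk scores.

import numpy as np

# Hybrid retrieval score: 80% dense similarity, 20% BM25 (alpha = 0.8 as in main.py).
alpha = 0.8
norm_dense_scores = np.array([0.95, 0.40, 0.10])   # made-up, already min-max normalized
norm_bm25_scores = np.array([0.20, 0.90, 0.30])    # made-up, already min-max normalized
combined_scores = alpha * norm_dense_scores + (1 - alpha) * norm_bm25_scores
print(combined_scores)                              # [0.8  0.5  0.14]

# Cross-encoder aggregation for a long document: per-chunk scores are folded into
# one value as 0.5 * max + 0.5 * mean, matching cross_encoder_rerank_candidates.
chunk_scores = np.array([0.2, 0.7, 0.4])            # made-up per-chunk scores
doc_score = float(0.5 * np.max(chunk_scores) + 0.5 * np.mean(chunk_scores))
print(doc_score)                                    # 0.5666...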