Spaces:

Agents-MCP-Hackathon
/

HF_RepoSense

Sleeping

App Files Files Community

naman1102 committed on Jun 7

Commit

f3ed537

1 Parent(s): 785101b

Update app.py

Browse files

Files changed (1) hide show

app.py +182 -118

app.py CHANGED Viewed

@@ -7,6 +7,9 @@ import logging
 from datetime import datetime
 import os
 from huggingface_hub import HfApi, SpaceCard
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -14,14 +17,25 @@ logger = logging.getLogger(__name__)
 # Constants
 CSV_FILE = "repo_ids.csv"
-CHATBOT_SYSTEM_PROMPT = """You are a helpful AI assistant that analyzes Hugging Face repositories.
-Your task is to help users understand repositories, extract key information, and provide insights.
-Be concise, clear, and focus on the most important aspects of each repository."""
 def read_csv_as_text(filename: str) -> pd.DataFrame:
     """Read CSV file and return as DataFrame."""
     try:
-        return pd.read_csv(filename)
     except Exception as e:
         logger.error(f"Error reading CSV: {e}")
         return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
@@ -29,7 +43,7 @@ def read_csv_as_text(filename: str) -> pd.DataFrame:
 def write_repos_to_csv(repo_ids: List[str]) -> None:
     """Write repository IDs to CSV file."""
     try:
-        with open(CSV_FILE, 'w', newline='') as f:
             writer = csv.writer(f)
             writer.writerow(["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
             for repo_id in repo_ids:
@@ -37,74 +51,148 @@ def write_repos_to_csv(repo_ids: List[str]) -> None:
     except Exception as e:
         logger.error(f"Error writing to CSV: {e}")
-def search_top_spaces(keyword: str, limit: int = 5) -> List[str]:
-    """Search for repositories by keyword."""
-    try:
-        api = HfApi()
-        spaces = api.list_spaces(search=keyword, limit=limit)
-        return [space.id for space in spaces]
-    except Exception as e:
-        logger.error(f"Error searching spaces: {e}")
-        return []
-def analyze_repo(repo_id: str) -> Tuple[str, str]:
     """Analyze a single repository."""
     try:
-        api = HfApi()
-        space = api.get_space(repo_id)
-        card = SpaceCard.load(repo_id)
-        content = f"""
-        Repository: {repo_id}
-        Title: {card.title}
-        Description: {card.description}
-        Tags: {', '.join(card.tags)}
-        """
-        summary = f"Analysis of {repo_id}:\n"
-        summary += f"- Title: {card.title}\n"
-        summary += f"- Main focus: {card.description[:200]}...\n"
-        summary += f"- Key tags: {', '.join(card.tags[:5])}\n"
-        return content, summary
     except Exception as e:
         logger.error(f"Error analyzing repo {repo_id}: {e}")
-        return f"Error analyzing {repo_id}", f"Error: {str(e)}"
-def chat_with_user(message: str, history: List[Dict[str, str]]) -> str:
-    """Simple chat response."""
     try:
-        return f"I understand you're asking about: {message}. How can I help you analyze this repository?"
     except Exception as e:
-        logger.error(f"Error in chat: {e}")
-        return "I apologize, but I encountered an error. Please try again."
 def create_ui() -> gr.Blocks:
-    """Create a simplified Gradio interface."""
     with gr.Blocks(title="Hugging Face Repo Analyzer", theme=gr.themes.Soft()) as app:
         gr.Markdown("# Hugging Face Repository Analyzer")
         with gr.Row():
             with gr.Column():
-                # Repository ID Input Section
                 gr.Markdown("### Enter Repository IDs")
                 repo_id_input = gr.Textbox(
-                    label="Enter repository IDs (comma or newline separated)",
-                    lines=3,
                     placeholder="repo1, repo2\nrepo3"
                 )
-                submit_repo_btn = gr.Button("Submit Repository IDs", variant="primary")
-                # Keyword Search Section
                 gr.Markdown("### Or Search by Keywords")
                 keyword_input = gr.Textbox(
                     label="Enter keywords to search",
-                    lines=2,
                     placeholder="Enter keywords separated by commas"
                 )
                 search_btn = gr.Button("Search by Keywords", variant="primary")
-                # Status
                 status = gr.Textbox(label="Status", visible=True)
                 # Results Section
@@ -117,6 +205,10 @@ def create_ui() -> gr.Blocks:
                 content_output = gr.Textbox(label="Repository Content", lines=10)
                 summary_output = gr.Textbox(label="Analysis Summary", lines=5)
                 # Chat Section
                 chatbot = gr.Chatbot(
                     label="Chat with Assistant",
@@ -126,97 +218,69 @@ def create_ui() -> gr.Blocks:
                 msg = gr.Textbox(label="Message", placeholder="Ask about the repository...")
                 with gr.Row():
                     send_btn = gr.Button("Send", variant="primary")
-                    clear_btn = gr.Button("Clear Chat", variant="secondary")
-        def process_repo_ids(text: str) -> Tuple[pd.DataFrame, str, str, str]:
-            """Process repository IDs and return results."""
-            try:
-                repo_ids = [rid.strip() for rid in re.split(r'[\n,]+', text) if rid.strip()]
-                if not repo_ids:
-                    return pd.DataFrame(), "No repository IDs provided", "", ""
-                # Remove duplicates
-                repo_ids = list(dict.fromkeys(repo_ids))
-                # Update CSV
-                write_repos_to_csv(repo_ids)
-                # Get first repo analysis
-                content, summary = analyze_repo(repo_ids[0])
-                return read_csv_as_text(CSV_FILE), f"Found {len(repo_ids)} repositories", content, summary
-            except Exception as e:
-                logger.error(f"Error processing repository IDs: {e}")
-                return pd.DataFrame(), f"Error: {str(e)}", "", ""
-        def process_keywords(text: str) -> Tuple[pd.DataFrame, str, str, str]:
-            """Process keywords and return search results."""
-            try:
-                keywords = [k.strip() for k in re.split(r'[\n,]+', text) if k.strip()]
-                if not keywords:
-                    return pd.DataFrame(), "No keywords provided", "", ""
-                repo_ids = []
-                for kw in keywords:
-                    repo_ids.extend(search_top_spaces(kw, limit=5))
-                # Remove duplicates
-                repo_ids = list(dict.fromkeys(repo_ids))
-                if not repo_ids:
-                    return pd.DataFrame(), "No repositories found for the given keywords", "", ""
-                # Update CSV
-                write_repos_to_csv(repo_ids)
-                # Get first repo analysis
-                content, summary = analyze_repo(repo_ids[0])
-                return read_csv_as_text(CSV_FILE), f"Found {len(repo_ids)} repositories", content, summary
-            except Exception as e:
-                logger.error(f"Error processing keywords: {e}")
-                return pd.DataFrame(), f"Error: {str(e)}", "", ""
-        def send_message(message: str, history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str]:
-            """Send message to chat."""
             if not message:
                 return history, ""
             history.append({"role": "user", "content": message})
-            response = chat_with_user(message, history)
             history.append({"role": "assistant", "content": response})
             return history, ""
-        def clear_chat() -> Tuple[List[Dict[str, str]], str]:
-            """Clear chat history."""
-            return [], ""
         # Event handlers
-        submit_repo_btn.click(
-            fn=process_repo_ids,
-            inputs=[repo_id_input],
-            outputs=[df_output, status, content_output, summary_output]
         )
         search_btn.click(
-            fn=process_keywords,
-            inputs=[keyword_input],
-            outputs=[df_output, status, content_output, summary_output]
         )
         send_btn.click(
-            fn=send_message,
-            inputs=[msg, chatbot],
             outputs=[chatbot, msg]
         )
-        clear_btn.click(
-            fn=clear_chat,
-            inputs=[],
-            outputs=[chatbot, msg]
         )
     return app

 from datetime import datetime
 import os
 from huggingface_hub import HfApi, SpaceCard
+from analyzer import combine_repo_files_for_llm, analyze_combined_file, parse_llm_json_response
+from hf_utils import download_space_repo, search_top_spaces
+from chatbot_page import chat_with_user, extract_keywords_from_conversation
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 # Constants
 CSV_FILE = "repo_ids.csv"
+CHATBOT_SYSTEM_PROMPT = (
+    "You are a helpful assistant. Your goal is to help the user describe their ideal open-source repo. "
+    "Ask questions to clarify what they want, their use case, preferred language, features, etc. "
+    "When the user clicks 'End Chat', analyze the conversation and return about 5 keywords for repo search. "
+    "Return only the keywords as a comma-separated list."
+)
+class AppState:
+    """State management for the application."""
+    def __init__(self):
+        self.repo_ids: List[str] = []
+        self.current_repo_idx: int = 0
+        self.generated_keywords: List[str] = []
+        self.chat_history: List[Dict[str, str]] = []
 def read_csv_as_text(filename: str) -> pd.DataFrame:
     """Read CSV file and return as DataFrame."""
     try:
+        return pd.read_csv(filename, dtype=str)
     except Exception as e:
         logger.error(f"Error reading CSV: {e}")
         return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
 def write_repos_to_csv(repo_ids: List[str]) -> None:
     """Write repository IDs to CSV file."""
     try:
+        with open(CSV_FILE, 'w', newline='', encoding="utf-8") as f:
             writer = csv.writer(f)
             writer.writerow(["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
             for repo_id in repo_ids:
     except Exception as e:
         logger.error(f"Error writing to CSV: {e}")
+def process_repo_input(text: str, state: AppState) -> pd.DataFrame:
+    """Process repository IDs input."""
+    if not text:
+        state.repo_ids = []
+        state.current_repo_idx = 0
+        return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
+    repo_ids = [repo.strip() for repo in re.split(r'[\n,]+', text) if repo.strip()]
+    state.repo_ids = repo_ids
+    state.current_repo_idx = 0
+    write_repos_to_csv(repo_ids)
+    return read_csv_as_text(CSV_FILE)
+def keyword_search_and_update(keyword: str, state: AppState) -> pd.DataFrame:
+    """Search for repositories by keywords."""
+    if not keyword:
+        return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
+    keyword_list = [k.strip() for k in re.split(r'[\n,]+', keyword) if k.strip()]
+    repo_ids = []
+    for kw in keyword_list:
+        repo_ids.extend(search_top_spaces(kw, limit=5))
+    # Remove duplicates while preserving order
+    seen = set()
+    unique_repo_ids = []
+    for rid in repo_ids:
+        if rid not in seen:
+            unique_repo_ids.append(rid)
+            seen.add(rid)
+    state.repo_ids = unique_repo_ids
+    state.current_repo_idx = 0
+    write_repos_to_csv(unique_repo_ids)
+    return read_csv_as_text(CSV_FILE)
+def analyze_single_repo(repo_id: str) -> Tuple[str, str, Dict]:
     """Analyze a single repository."""
     try:
+        download_space_repo(repo_id, local_dir="repo_files")
+        txt_path = combine_repo_files_for_llm()
+        with open(txt_path, "r", encoding="utf-8") as f:
+            combined_content = f.read()
+        llm_output = analyze_combined_file(txt_path)
+        last_start = llm_output.rfind('{')
+        last_end = llm_output.rfind('}')
+        final_json_str = llm_output[last_start:last_end+1] if last_start != -1 and last_end != -1 and last_end > last_start else llm_output
+        llm_json = parse_llm_json_response(final_json_str)
+        if isinstance(llm_json, dict) and "error" not in llm_json:
+            strengths = llm_json.get("strength", "")
+            weaknesses = llm_json.get("weaknesses", "")
+            summary = f"JSON extraction: SUCCESS\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}"
+        else:
+            summary = f"JSON extraction: FAILED\nRaw: {llm_json.get('raw', '') if isinstance(llm_json, dict) else llm_json}"
+        return combined_content, summary, llm_json
     except Exception as e:
         logger.error(f"Error analyzing repo {repo_id}: {e}")
+        return f"Error analyzing {repo_id}", f"Error: {str(e)}", {"error": str(e)}
+def update_csv_with_analysis(repo_id: str, analysis_results: Dict) -> pd.DataFrame:
+    """Update CSV file with analysis results."""
     try:
+        df = read_csv_as_text(CSV_FILE)
+        updated = False
+        for idx, row in df.iterrows():
+            if row["repo id"] == repo_id:
+                if isinstance(analysis_results, dict) and "error" not in analysis_results:
+                    df.at[idx, "strength"] = analysis_results.get("strength", "")
+                    df.at[idx, "weaknesses"] = analysis_results.get("weaknesses", "")
+                    df.at[idx, "speciality"] = analysis_results.get("speciality", "")
+                    df.at[idx, "relevance rating"] = analysis_results.get("relevance rating", "")
+                updated = True
+                break
+        if not updated and isinstance(analysis_results, dict) and "error" not in analysis_results:
+            new_row = {
+                "repo id": repo_id,
+                "strength": analysis_results.get("strength", ""),
+                "weaknesses": analysis_results.get("weaknesses", ""),
+                "speciality": analysis_results.get("speciality", ""),
+                "relevance rating": analysis_results.get("relevance rating", "")
+            }
+            df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
+        df.to_csv(CSV_FILE, index=False)
+        return df
     except Exception as e:
+        logger.error(f"Error updating CSV: {e}")
+        return read_csv_as_text(CSV_FILE)
+def show_combined_repo_and_llm(state: AppState) -> Tuple[str, str, pd.DataFrame]:
+    """Show combined repo content and LLM analysis."""
+    if not state.repo_ids:
+        return "No repo ID available. Please submit repo IDs first.", "", pd.DataFrame()
+    if state.current_repo_idx >= len(state.repo_ids):
+        return "All repo IDs have been processed.", "", read_csv_as_text(CSV_FILE)
+    repo_id = state.repo_ids[state.current_repo_idx]
+    combined_content, summary, analysis_results = analyze_single_repo(repo_id)
+    df = update_csv_with_analysis(repo_id, analysis_results)
+    state.current_repo_idx += 1
+    return combined_content, summary, df
 def create_ui() -> gr.Blocks:
+    """Create the Gradio interface."""
+    state = gr.State(AppState())
     with gr.Blocks(title="Hugging Face Repo Analyzer", theme=gr.themes.Soft()) as app:
         gr.Markdown("# Hugging Face Repository Analyzer")
         with gr.Row():
             with gr.Column():
+                # Input Section
                 gr.Markdown("### Enter Repository IDs")
                 repo_id_input = gr.Textbox(
+                    label="Enter repo IDs (comma or newline separated)",
+                    lines=5,
                     placeholder="repo1, repo2\nrepo3"
                 )
+                submit_btn = gr.Button("Submit Repository IDs", variant="primary")
                 gr.Markdown("### Or Search by Keywords")
                 keyword_input = gr.Textbox(
                     label="Enter keywords to search",
+                    lines=3,
                     placeholder="Enter keywords separated by commas"
                 )
                 search_btn = gr.Button("Search by Keywords", variant="primary")
                 status = gr.Textbox(label="Status", visible=True)
                 # Results Section
                 content_output = gr.Textbox(label="Repository Content", lines=10)
                 summary_output = gr.Textbox(label="Analysis Summary", lines=5)
+                with gr.Row():
+                    analyze_btn = gr.Button("Analyze Next Repository", variant="primary")
+                    finish_btn = gr.Button("Finish Analysis", variant="secondary")
                 # Chat Section
                 chatbot = gr.Chatbot(
                     label="Chat with Assistant",
                 msg = gr.Textbox(label="Message", placeholder="Ask about the repository...")
                 with gr.Row():
                     send_btn = gr.Button("Send", variant="primary")
+                    end_chat_btn = gr.Button("End Chat", variant="secondary")
+        def process_repo_input_with_status(text: str, state: AppState) -> Tuple[pd.DataFrame, str]:
+            """Process repo input with status update."""
+            df = process_repo_input(text, state)
+            return df, f"Found {len(state.repo_ids)} repositories"
+        def keyword_search_with_status(keyword: str, state: AppState) -> Tuple[pd.DataFrame, str]:
+            """Search keywords with status update."""
+            df = keyword_search_and_update(keyword, state)
+            return df, f"Found {len(state.repo_ids)} repositories"
+        def analyze_with_status(state: AppState) -> Tuple[str, str, pd.DataFrame, str]:
+            """Analyze with status update."""
+            content, summary, df = show_combined_repo_and_llm(state)
+            return content, summary, df, f"Analyzing repository {state.current_repo_idx} of {len(state.repo_ids)}"
+        def send_message_with_status(message: str, history: List[Dict[str, str]], state: AppState) -> Tuple[List[Dict[str, str]], str]:
+            """Send message with status update."""
             if not message:
                 return history, ""
             history.append({"role": "user", "content": message})
+            response = chat_with_user(message, history, CHATBOT_SYSTEM_PROMPT)
             history.append({"role": "assistant", "content": response})
             return history, ""
+        def end_chat_with_status(history: List[Dict[str, str]], state: AppState) -> Tuple[List[str], str]:
+            """End chat and extract keywords."""
+            if not history:
+                return [], "No chat history to analyze"
+            keywords = extract_keywords_from_conversation(history)
+            state.generated_keywords = keywords
+            return keywords, "Keywords extracted from conversation"
         # Event handlers
+        submit_btn.click(
+            fn=process_repo_input_with_status,
+            inputs=[repo_id_input, state],
+            outputs=[df_output, status]
         )
         search_btn.click(
+            fn=keyword_search_with_status,
+            inputs=[keyword_input, state],
+            outputs=[df_output, status]
+        )
+        analyze_btn.click(
+            fn=analyze_with_status,
+            inputs=[state],
+            outputs=[content_output, summary_output, df_output, status]
         )
         send_btn.click(
+            fn=send_message_with_status,
+            inputs=[msg, chatbot, state],
             outputs=[chatbot, msg]
         )
+        end_chat_btn.click(
+            fn=end_chat_with_status,
+            inputs=[chatbot, state],
+            outputs=[gr.Textbox(label="Extracted Keywords"), status]
         )
     return app