naman1102 committed on
Commit
f3ed537
·
1 Parent(s): 785101b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -118
app.py CHANGED
@@ -7,6 +7,9 @@ import logging
7
  from datetime import datetime
8
  import os
9
  from huggingface_hub import HfApi, SpaceCard
 
 
 
10
 
11
  # Configure logging
12
  logging.basicConfig(level=logging.INFO)
@@ -14,14 +17,25 @@ logger = logging.getLogger(__name__)
14
 
15
  # Constants
16
  CSV_FILE = "repo_ids.csv"
17
- CHATBOT_SYSTEM_PROMPT = """You are a helpful AI assistant that analyzes Hugging Face repositories.
18
- Your task is to help users understand repositories, extract key information, and provide insights.
19
- Be concise, clear, and focus on the most important aspects of each repository."""
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  def read_csv_as_text(filename: str) -> pd.DataFrame:
22
  """Read CSV file and return as DataFrame."""
23
  try:
24
- return pd.read_csv(filename)
25
  except Exception as e:
26
  logger.error(f"Error reading CSV: {e}")
27
  return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
@@ -29,7 +43,7 @@ def read_csv_as_text(filename: str) -> pd.DataFrame:
29
  def write_repos_to_csv(repo_ids: List[str]) -> None:
30
  """Write repository IDs to CSV file."""
31
  try:
32
- with open(CSV_FILE, 'w', newline='') as f:
33
  writer = csv.writer(f)
34
  writer.writerow(["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
35
  for repo_id in repo_ids:
@@ -37,74 +51,148 @@ def write_repos_to_csv(repo_ids: List[str]) -> None:
37
  except Exception as e:
38
  logger.error(f"Error writing to CSV: {e}")
39
 
40
- def search_top_spaces(keyword: str, limit: int = 5) -> List[str]:
41
- """Search for repositories by keyword."""
42
- try:
43
- api = HfApi()
44
- spaces = api.list_spaces(search=keyword, limit=limit)
45
- return [space.id for space in spaces]
46
- except Exception as e:
47
- logger.error(f"Error searching spaces: {e}")
48
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- def analyze_repo(repo_id: str) -> Tuple[str, str]:
51
  """Analyze a single repository."""
52
  try:
53
- api = HfApi()
54
- space = api.get_space(repo_id)
55
- card = SpaceCard.load(repo_id)
56
-
57
- content = f"""
58
- Repository: {repo_id}
59
- Title: {card.title}
60
- Description: {card.description}
61
- Tags: {', '.join(card.tags)}
62
- """
63
-
64
- summary = f"Analysis of {repo_id}:\n"
65
- summary += f"- Title: {card.title}\n"
66
- summary += f"- Main focus: {card.description[:200]}...\n"
67
- summary += f"- Key tags: {', '.join(card.tags[:5])}\n"
68
-
69
- return content, summary
 
 
 
 
 
70
  except Exception as e:
71
  logger.error(f"Error analyzing repo {repo_id}: {e}")
72
- return f"Error analyzing {repo_id}", f"Error: {str(e)}"
73
 
74
- def chat_with_user(message: str, history: List[Dict[str, str]]) -> str:
75
- """Simple chat response."""
76
  try:
77
- return f"I understand you're asking about: {message}. How can I help you analyze this repository?"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  except Exception as e:
79
- logger.error(f"Error in chat: {e}")
80
- return "I apologize, but I encountered an error. Please try again."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  def create_ui() -> gr.Blocks:
83
- """Create a simplified Gradio interface."""
 
 
84
  with gr.Blocks(title="Hugging Face Repo Analyzer", theme=gr.themes.Soft()) as app:
85
  gr.Markdown("# Hugging Face Repository Analyzer")
86
 
87
  with gr.Row():
88
  with gr.Column():
89
- # Repository ID Input Section
90
  gr.Markdown("### Enter Repository IDs")
91
  repo_id_input = gr.Textbox(
92
- label="Enter repository IDs (comma or newline separated)",
93
- lines=3,
94
  placeholder="repo1, repo2\nrepo3"
95
  )
96
- submit_repo_btn = gr.Button("Submit Repository IDs", variant="primary")
97
 
98
- # Keyword Search Section
99
  gr.Markdown("### Or Search by Keywords")
100
  keyword_input = gr.Textbox(
101
  label="Enter keywords to search",
102
- lines=2,
103
  placeholder="Enter keywords separated by commas"
104
  )
105
  search_btn = gr.Button("Search by Keywords", variant="primary")
106
 
107
- # Status
108
  status = gr.Textbox(label="Status", visible=True)
109
 
110
  # Results Section
@@ -117,6 +205,10 @@ def create_ui() -> gr.Blocks:
117
  content_output = gr.Textbox(label="Repository Content", lines=10)
118
  summary_output = gr.Textbox(label="Analysis Summary", lines=5)
119
 
 
 
 
 
120
  # Chat Section
121
  chatbot = gr.Chatbot(
122
  label="Chat with Assistant",
@@ -126,97 +218,69 @@ def create_ui() -> gr.Blocks:
126
  msg = gr.Textbox(label="Message", placeholder="Ask about the repository...")
127
  with gr.Row():
128
  send_btn = gr.Button("Send", variant="primary")
129
- clear_btn = gr.Button("Clear Chat", variant="secondary")
130
 
131
- def process_repo_ids(text: str) -> Tuple[pd.DataFrame, str, str, str]:
132
- """Process repository IDs and return results."""
133
- try:
134
- repo_ids = [rid.strip() for rid in re.split(r'[\n,]+', text) if rid.strip()]
135
-
136
- if not repo_ids:
137
- return pd.DataFrame(), "No repository IDs provided", "", ""
138
-
139
- # Remove duplicates
140
- repo_ids = list(dict.fromkeys(repo_ids))
141
-
142
- # Update CSV
143
- write_repos_to_csv(repo_ids)
144
-
145
- # Get first repo analysis
146
- content, summary = analyze_repo(repo_ids[0])
147
-
148
- return read_csv_as_text(CSV_FILE), f"Found {len(repo_ids)} repositories", content, summary
149
-
150
- except Exception as e:
151
- logger.error(f"Error processing repository IDs: {e}")
152
- return pd.DataFrame(), f"Error: {str(e)}", "", ""
153
-
154
- def process_keywords(text: str) -> Tuple[pd.DataFrame, str, str, str]:
155
- """Process keywords and return search results."""
156
- try:
157
- keywords = [k.strip() for k in re.split(r'[\n,]+', text) if k.strip()]
158
-
159
- if not keywords:
160
- return pd.DataFrame(), "No keywords provided", "", ""
161
-
162
- repo_ids = []
163
- for kw in keywords:
164
- repo_ids.extend(search_top_spaces(kw, limit=5))
165
-
166
- # Remove duplicates
167
- repo_ids = list(dict.fromkeys(repo_ids))
168
-
169
- if not repo_ids:
170
- return pd.DataFrame(), "No repositories found for the given keywords", "", ""
171
-
172
- # Update CSV
173
- write_repos_to_csv(repo_ids)
174
-
175
- # Get first repo analysis
176
- content, summary = analyze_repo(repo_ids[0])
177
-
178
- return read_csv_as_text(CSV_FILE), f"Found {len(repo_ids)} repositories", content, summary
179
-
180
- except Exception as e:
181
- logger.error(f"Error processing keywords: {e}")
182
- return pd.DataFrame(), f"Error: {str(e)}", "", ""
183
 
184
- def send_message(message: str, history: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], str]:
185
- """Send message to chat."""
186
  if not message:
187
  return history, ""
188
  history.append({"role": "user", "content": message})
189
- response = chat_with_user(message, history)
190
  history.append({"role": "assistant", "content": response})
191
  return history, ""
192
 
193
- def clear_chat() -> Tuple[List[Dict[str, str]], str]:
194
- """Clear chat history."""
195
- return [], ""
 
 
 
 
196
 
197
  # Event handlers
198
- submit_repo_btn.click(
199
- fn=process_repo_ids,
200
- inputs=[repo_id_input],
201
- outputs=[df_output, status, content_output, summary_output]
202
  )
203
 
204
  search_btn.click(
205
- fn=process_keywords,
206
- inputs=[keyword_input],
207
- outputs=[df_output, status, content_output, summary_output]
 
 
 
 
 
 
208
  )
209
 
210
  send_btn.click(
211
- fn=send_message,
212
- inputs=[msg, chatbot],
213
  outputs=[chatbot, msg]
214
  )
215
 
216
- clear_btn.click(
217
- fn=clear_chat,
218
- inputs=[],
219
- outputs=[chatbot, msg]
220
  )
221
 
222
  return app
 
7
  from datetime import datetime
8
  import os
9
  from huggingface_hub import HfApi, SpaceCard
10
+ from analyzer import combine_repo_files_for_llm, analyze_combined_file, parse_llm_json_response
11
+ from hf_utils import download_space_repo, search_top_spaces
12
+ from chatbot_page import chat_with_user, extract_keywords_from_conversation
13
 
14
  # Configure logging
15
  logging.basicConfig(level=logging.INFO)
 
17
 
18
  # Constants
19
  CSV_FILE = "repo_ids.csv"
20
+ CHATBOT_SYSTEM_PROMPT = (
21
+ "You are a helpful assistant. Your goal is to help the user describe their ideal open-source repo. "
22
+ "Ask questions to clarify what they want, their use case, preferred language, features, etc. "
23
+ "When the user clicks 'End Chat', analyze the conversation and return about 5 keywords for repo search. "
24
+ "Return only the keywords as a comma-separated list."
25
+ )
26
+
27
+ class AppState:
28
+ """State management for the application."""
29
+ def __init__(self):
30
+ self.repo_ids: List[str] = []
31
+ self.current_repo_idx: int = 0
32
+ self.generated_keywords: List[str] = []
33
+ self.chat_history: List[Dict[str, str]] = []
34
 
35
  def read_csv_as_text(filename: str) -> pd.DataFrame:
36
  """Read CSV file and return as DataFrame."""
37
  try:
38
+ return pd.read_csv(filename, dtype=str)
39
  except Exception as e:
40
  logger.error(f"Error reading CSV: {e}")
41
  return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
 
43
  def write_repos_to_csv(repo_ids: List[str]) -> None:
44
  """Write repository IDs to CSV file."""
45
  try:
46
+ with open(CSV_FILE, 'w', newline='', encoding="utf-8") as f:
47
  writer = csv.writer(f)
48
  writer.writerow(["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
49
  for repo_id in repo_ids:
 
51
  except Exception as e:
52
  logger.error(f"Error writing to CSV: {e}")
53
 
54
+ def process_repo_input(text: str, state: AppState) -> pd.DataFrame:
55
+ """Process repository IDs input."""
56
+ if not text:
57
+ state.repo_ids = []
58
+ state.current_repo_idx = 0
59
+ return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
60
+
61
+ repo_ids = [repo.strip() for repo in re.split(r'[\n,]+', text) if repo.strip()]
62
+ state.repo_ids = repo_ids
63
+ state.current_repo_idx = 0
64
+
65
+ write_repos_to_csv(repo_ids)
66
+ return read_csv_as_text(CSV_FILE)
67
+
68
+ def keyword_search_and_update(keyword: str, state: AppState) -> pd.DataFrame:
69
+ """Search for repositories by keywords."""
70
+ if not keyword:
71
+ return pd.DataFrame(columns=["repo id", "strength", "weaknesses", "speciality", "relevance rating"])
72
+
73
+ keyword_list = [k.strip() for k in re.split(r'[\n,]+', keyword) if k.strip()]
74
+ repo_ids = []
75
+
76
+ for kw in keyword_list:
77
+ repo_ids.extend(search_top_spaces(kw, limit=5))
78
+
79
+ # Remove duplicates while preserving order
80
+ seen = set()
81
+ unique_repo_ids = []
82
+ for rid in repo_ids:
83
+ if rid not in seen:
84
+ unique_repo_ids.append(rid)
85
+ seen.add(rid)
86
+
87
+ state.repo_ids = unique_repo_ids
88
+ state.current_repo_idx = 0
89
+
90
+ write_repos_to_csv(unique_repo_ids)
91
+ return read_csv_as_text(CSV_FILE)
92
 
93
+ def analyze_single_repo(repo_id: str) -> Tuple[str, str, Dict]:
94
  """Analyze a single repository."""
95
  try:
96
+ download_space_repo(repo_id, local_dir="repo_files")
97
+ txt_path = combine_repo_files_for_llm()
98
+
99
+ with open(txt_path, "r", encoding="utf-8") as f:
100
+ combined_content = f.read()
101
+
102
+ llm_output = analyze_combined_file(txt_path)
103
+ last_start = llm_output.rfind('{')
104
+ last_end = llm_output.rfind('}')
105
+
106
+ final_json_str = llm_output[last_start:last_end+1] if last_start != -1 and last_end != -1 and last_end > last_start else llm_output
107
+ llm_json = parse_llm_json_response(final_json_str)
108
+
109
+ if isinstance(llm_json, dict) and "error" not in llm_json:
110
+ strengths = llm_json.get("strength", "")
111
+ weaknesses = llm_json.get("weaknesses", "")
112
+ summary = f"JSON extraction: SUCCESS\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}"
113
+ else:
114
+ summary = f"JSON extraction: FAILED\nRaw: {llm_json.get('raw', '') if isinstance(llm_json, dict) else llm_json}"
115
+
116
+ return combined_content, summary, llm_json
117
+
118
  except Exception as e:
119
  logger.error(f"Error analyzing repo {repo_id}: {e}")
120
+ return f"Error analyzing {repo_id}", f"Error: {str(e)}", {"error": str(e)}
121
 
122
+ def update_csv_with_analysis(repo_id: str, analysis_results: Dict) -> pd.DataFrame:
123
+ """Update CSV file with analysis results."""
124
  try:
125
+ df = read_csv_as_text(CSV_FILE)
126
+ updated = False
127
+
128
+ for idx, row in df.iterrows():
129
+ if row["repo id"] == repo_id:
130
+ if isinstance(analysis_results, dict) and "error" not in analysis_results:
131
+ df.at[idx, "strength"] = analysis_results.get("strength", "")
132
+ df.at[idx, "weaknesses"] = analysis_results.get("weaknesses", "")
133
+ df.at[idx, "speciality"] = analysis_results.get("speciality", "")
134
+ df.at[idx, "relevance rating"] = analysis_results.get("relevance rating", "")
135
+ updated = True
136
+ break
137
+
138
+ if not updated and isinstance(analysis_results, dict) and "error" not in analysis_results:
139
+ new_row = {
140
+ "repo id": repo_id,
141
+ "strength": analysis_results.get("strength", ""),
142
+ "weaknesses": analysis_results.get("weaknesses", ""),
143
+ "speciality": analysis_results.get("speciality", ""),
144
+ "relevance rating": analysis_results.get("relevance rating", "")
145
+ }
146
+ df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
147
+
148
+ df.to_csv(CSV_FILE, index=False)
149
+ return df
150
+
151
  except Exception as e:
152
+ logger.error(f"Error updating CSV: {e}")
153
+ return read_csv_as_text(CSV_FILE)
154
+
155
+ def show_combined_repo_and_llm(state: AppState) -> Tuple[str, str, pd.DataFrame]:
156
+ """Show combined repo content and LLM analysis."""
157
+ if not state.repo_ids:
158
+ return "No repo ID available. Please submit repo IDs first.", "", pd.DataFrame()
159
+
160
+ if state.current_repo_idx >= len(state.repo_ids):
161
+ return "All repo IDs have been processed.", "", read_csv_as_text(CSV_FILE)
162
+
163
+ repo_id = state.repo_ids[state.current_repo_idx]
164
+ combined_content, summary, analysis_results = analyze_single_repo(repo_id)
165
+ df = update_csv_with_analysis(repo_id, analysis_results)
166
+
167
+ state.current_repo_idx += 1
168
+ return combined_content, summary, df
169
 
170
  def create_ui() -> gr.Blocks:
171
+ """Create the Gradio interface."""
172
+ state = gr.State(AppState())
173
+
174
  with gr.Blocks(title="Hugging Face Repo Analyzer", theme=gr.themes.Soft()) as app:
175
  gr.Markdown("# Hugging Face Repository Analyzer")
176
 
177
  with gr.Row():
178
  with gr.Column():
179
+ # Input Section
180
  gr.Markdown("### Enter Repository IDs")
181
  repo_id_input = gr.Textbox(
182
+ label="Enter repo IDs (comma or newline separated)",
183
+ lines=5,
184
  placeholder="repo1, repo2\nrepo3"
185
  )
186
+ submit_btn = gr.Button("Submit Repository IDs", variant="primary")
187
 
 
188
  gr.Markdown("### Or Search by Keywords")
189
  keyword_input = gr.Textbox(
190
  label="Enter keywords to search",
191
+ lines=3,
192
  placeholder="Enter keywords separated by commas"
193
  )
194
  search_btn = gr.Button("Search by Keywords", variant="primary")
195
 
 
196
  status = gr.Textbox(label="Status", visible=True)
197
 
198
  # Results Section
 
205
  content_output = gr.Textbox(label="Repository Content", lines=10)
206
  summary_output = gr.Textbox(label="Analysis Summary", lines=5)
207
 
208
+ with gr.Row():
209
+ analyze_btn = gr.Button("Analyze Next Repository", variant="primary")
210
+ finish_btn = gr.Button("Finish Analysis", variant="secondary")
211
+
212
  # Chat Section
213
  chatbot = gr.Chatbot(
214
  label="Chat with Assistant",
 
218
  msg = gr.Textbox(label="Message", placeholder="Ask about the repository...")
219
  with gr.Row():
220
  send_btn = gr.Button("Send", variant="primary")
221
+ end_chat_btn = gr.Button("End Chat", variant="secondary")
222
 
223
+ def process_repo_input_with_status(text: str, state: AppState) -> Tuple[pd.DataFrame, str]:
224
+ """Process repo input with status update."""
225
+ df = process_repo_input(text, state)
226
+ return df, f"Found {len(state.repo_ids)} repositories"
227
+
228
+ def keyword_search_with_status(keyword: str, state: AppState) -> Tuple[pd.DataFrame, str]:
229
+ """Search keywords with status update."""
230
+ df = keyword_search_and_update(keyword, state)
231
+ return df, f"Found {len(state.repo_ids)} repositories"
232
+
233
+ def analyze_with_status(state: AppState) -> Tuple[str, str, pd.DataFrame, str]:
234
+ """Analyze with status update."""
235
+ content, summary, df = show_combined_repo_and_llm(state)
236
+ return content, summary, df, f"Analyzing repository {state.current_repo_idx} of {len(state.repo_ids)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
+ def send_message_with_status(message: str, history: List[Dict[str, str]], state: AppState) -> Tuple[List[Dict[str, str]], str]:
239
+ """Send message with status update."""
240
  if not message:
241
  return history, ""
242
  history.append({"role": "user", "content": message})
243
+ response = chat_with_user(message, history, CHATBOT_SYSTEM_PROMPT)
244
  history.append({"role": "assistant", "content": response})
245
  return history, ""
246
 
247
+ def end_chat_with_status(history: List[Dict[str, str]], state: AppState) -> Tuple[List[str], str]:
248
+ """End chat and extract keywords."""
249
+ if not history:
250
+ return [], "No chat history to analyze"
251
+ keywords = extract_keywords_from_conversation(history)
252
+ state.generated_keywords = keywords
253
+ return keywords, "Keywords extracted from conversation"
254
 
255
  # Event handlers
256
+ submit_btn.click(
257
+ fn=process_repo_input_with_status,
258
+ inputs=[repo_id_input, state],
259
+ outputs=[df_output, status]
260
  )
261
 
262
  search_btn.click(
263
+ fn=keyword_search_with_status,
264
+ inputs=[keyword_input, state],
265
+ outputs=[df_output, status]
266
+ )
267
+
268
+ analyze_btn.click(
269
+ fn=analyze_with_status,
270
+ inputs=[state],
271
+ outputs=[content_output, summary_output, df_output, status]
272
  )
273
 
274
  send_btn.click(
275
+ fn=send_message_with_status,
276
+ inputs=[msg, chatbot, state],
277
  outputs=[chatbot, msg]
278
  )
279
 
280
+ end_chat_btn.click(
281
+ fn=end_chat_with_status,
282
+ inputs=[chatbot, state],
283
+ outputs=[gr.Textbox(label="Extracted Keywords"), status]
284
  )
285
 
286
  return app