Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,587 +1,193 @@
|
|
1 |
-
import gradio as gr
|
2 |
import os
|
3 |
import json
|
4 |
import requests
|
5 |
from bs4 import BeautifulSoup
|
6 |
-
import networkx as nx
|
7 |
-
import matplotlib
|
8 |
-
matplotlib.use('Agg') # Use non-interactive backend
|
9 |
-
import matplotlib.pyplot as plt
|
10 |
-
import numpy as np
|
11 |
-
import io
|
12 |
-
import base64
|
13 |
-
from huggingface_hub import InferenceClient
|
14 |
-
import re
|
15 |
from urllib.parse import urlparse
|
16 |
-
import
|
17 |
-
|
18 |
-
# Configure matplotlib for better font handling
|
19 |
-
plt.rcParams['font.family'] = ['DejaVu Sans']
|
20 |
-
plt.rcParams['font.size'] = 10
|
21 |
-
plt.rcParams['font.weight'] = 'normal'
|
22 |
-
plt.rcParams['figure.max_open_warning'] = 0 # Disable figure warnings
|
23 |
-
warnings.filterwarnings('ignore', category=UserWarning)
|
24 |
-
warnings.filterwarnings('ignore', message='.*Font family.*not found.*')
|
25 |
-
warnings.filterwarnings('ignore', message='.*Matplotlib.*')
|
26 |
-
|
27 |
-
def clean_text_for_display(text):
    """Clean text to remove characters that might cause font issues."""
    if not isinstance(text, str):
        return str(text)

    # Strip everything outside printable ASCII, then collapse runs of
    # whitespace, and finally cap the length so node labels stay short.
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', text)
    normalized = re.sub(r'\s+', ' ', ascii_only).strip()
    return normalized if len(normalized) <= 50 else normalized[:50]
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
Returns:
|
44 |
-
Extracted text content
|
45 |
-
"""
|
46 |
-
try:
|
47 |
-
# Check if input looks like a URL
|
48 |
-
parsed = urlparse(url_or_text)
|
49 |
-
if parsed.scheme in ['http', 'https']:
|
50 |
-
try:
|
51 |
-
headers = {
|
52 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
53 |
-
}
|
54 |
-
response = requests.get(url_or_text, headers=headers, timeout=10)
|
55 |
-
response.raise_for_status()
|
56 |
-
|
57 |
-
# Parse HTML and extract text
|
58 |
-
soup = BeautifulSoup(response.content, 'html.parser')
|
59 |
-
|
60 |
-
# Remove script and style elements
|
61 |
-
for script in soup(["script", "style"]):
|
62 |
-
script.decompose()
|
63 |
-
|
64 |
-
# Get text and clean it up
|
65 |
-
text = soup.get_text()
|
66 |
-
lines = (line.strip() for line in text.splitlines())
|
67 |
-
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
68 |
-
text = ' '.join(chunk for chunk in chunks if chunk)
|
69 |
-
|
70 |
-
return text[:5000] # Limit to first 5000 characters
|
71 |
-
except Exception as e:
|
72 |
-
return f"Error fetching URL: {str(e)}"
|
73 |
-
else:
|
74 |
-
# It's direct text input
|
75 |
-
return url_or_text
|
76 |
-
except Exception as e:
|
77 |
-
return f"Error processing input: {str(e)}"
|
78 |
|
79 |
-
def
|
80 |
-
"""
|
81 |
try:
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
# Simple heuristic: words that are capitalized and longer than 2 characters
|
86 |
-
seen = set()
|
87 |
-
for word in words[:30]: # Limit to first 30 words
|
88 |
-
clean_word = re.sub(r'[^\w]', '', word)
|
89 |
-
if (clean_word.istitle() and len(clean_word) > 2 and
|
90 |
-
clean_word.lower() not in seen and
|
91 |
-
clean_word not in ['The', 'This', 'That', 'When', 'Where', 'How']):
|
92 |
-
entities.append({
|
93 |
-
"name": clean_text_for_display(clean_word),
|
94 |
-
"type": "CONCEPT",
|
95 |
-
"description": "Auto-detected entity"
|
96 |
-
})
|
97 |
-
seen.add(clean_word.lower())
|
98 |
-
|
99 |
-
# Create some basic relationships
|
100 |
-
relationships = []
|
101 |
-
if len(entities) > 1:
|
102 |
-
for i in range(min(len(entities) - 1, 5)): # Max 5 relationships
|
103 |
-
relationships.append({
|
104 |
-
"source": entities[i]["name"],
|
105 |
-
"target": entities[i + 1]["name"],
|
106 |
-
"relation": "related_to",
|
107 |
-
"description": "Sequential relationship"
|
108 |
-
})
|
109 |
-
|
110 |
-
return {"entities": entities[:10], "relationships": relationships}
|
111 |
-
except Exception as e:
|
112 |
-
return {
|
113 |
-
"entities": [{"name": "Error", "type": "ERROR", "description": str(e)}],
|
114 |
-
"relationships": []
|
115 |
}
|
116 |
-
|
117 |
-
|
118 |
-
"""Extract entities and relationships using Mistral AI with fallback.
|
119 |
-
|
120 |
-
Args:
|
121 |
-
text: Input text to analyze
|
122 |
|
123 |
-
|
124 |
-
Dictionary containing entities and relationships
|
125 |
-
"""
|
126 |
-
try:
|
127 |
-
# Check if HF_TOKEN is available
|
128 |
-
hf_token = os.environ.get("HF_TOKEN")
|
129 |
-
if not hf_token:
|
130 |
-
print("No HF_TOKEN found, using simple extraction")
|
131 |
-
return simple_entity_extraction(text)
|
132 |
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
)
|
137 |
|
138 |
-
|
139 |
-
|
140 |
-
1. Named entities (people, organizations, locations, concepts)
|
141 |
-
2. Relationships between these entities
|
142 |
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
],
|
148 |
-
"relationships": [
|
149 |
-
{{"source": "entity1", "target": "entity2", "relation": "relationship_type", "description": "brief description"}}
|
150 |
-
]
|
151 |
-
}}
|
152 |
|
153 |
-
|
154 |
-
"""
|
155 |
-
|
156 |
-
completion = client.chat.completions.create(
|
157 |
-
model="mistralai/Mistral-Small-24B-Instruct-2501",
|
158 |
-
messages=[{"role": "user", "content": prompt}],
|
159 |
-
max_tokens=1000,
|
160 |
-
temperature=0.1,
|
161 |
-
)
|
162 |
-
|
163 |
-
response_text = completion.choices[0].message.content
|
164 |
-
|
165 |
-
# Clean and extract JSON
|
166 |
-
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
|
167 |
-
if json_match:
|
168 |
-
json_str = json_match.group()
|
169 |
-
# Clean the JSON string
|
170 |
-
json_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', json_str) # Remove control characters
|
171 |
-
|
172 |
-
parsed_data = json.loads(json_str)
|
173 |
-
|
174 |
-
# Clean entity names for display
|
175 |
-
if "entities" in parsed_data:
|
176 |
-
for entity in parsed_data["entities"]:
|
177 |
-
if "name" in entity:
|
178 |
-
entity["name"] = clean_text_for_display(entity["name"])
|
179 |
-
|
180 |
-
return parsed_data
|
181 |
-
else:
|
182 |
-
print("No valid JSON found in AI response, using fallback")
|
183 |
-
return simple_entity_extraction(text)
|
184 |
-
|
185 |
except Exception as e:
|
186 |
-
|
187 |
-
return simple_entity_extraction(text)
|
188 |
|
189 |
-
def
|
190 |
-
"""
|
191 |
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
entities = entities_data.get("entities", [])
|
204 |
-
for entity in entities[:15]: # Limit to 15 entities for better visualization
|
205 |
-
clean_name = clean_text_for_display(entity.get("name", "Unknown"))
|
206 |
-
if clean_name and len(clean_name.strip()) > 0:
|
207 |
-
G.add_node(clean_name,
|
208 |
-
type=entity.get("type", "UNKNOWN"),
|
209 |
-
description=entity.get("description", ""))
|
210 |
-
|
211 |
-
# Add edges (relationships)
|
212 |
-
relationships = entities_data.get("relationships", [])
|
213 |
-
for rel in relationships:
|
214 |
-
source = clean_text_for_display(rel.get("source", ""))
|
215 |
-
target = clean_text_for_display(rel.get("target", ""))
|
216 |
-
if source in G.nodes and target in G.nodes:
|
217 |
-
G.add_edge(source, target,
|
218 |
-
relation=rel.get("relation", "related"),
|
219 |
-
description=rel.get("description", ""))
|
220 |
-
|
221 |
-
# If no relationships found, create some connections between entities
|
222 |
-
if len(relationships) == 0 and len(list(G.nodes())) > 1:
|
223 |
-
node_list = list(G.nodes())
|
224 |
-
for i in range(min(len(node_list) - 1, 5)):
|
225 |
-
G.add_edge(node_list[i], node_list[i + 1], relation="related")
|
226 |
-
|
227 |
-
# Create visualization
|
228 |
-
fig, ax = plt.subplots(figsize=(10, 8))
|
229 |
-
|
230 |
-
# Skip if no nodes
|
231 |
-
if len(G.nodes()) == 0:
|
232 |
-
ax.text(0.5, 0.5, "No entities found to visualize",
|
233 |
-
ha='center', va='center', fontsize=14, transform=ax.transAxes)
|
234 |
-
ax.set_title("Knowledge Graph")
|
235 |
-
ax.axis('off')
|
236 |
-
else:
|
237 |
-
# Position nodes using spring layout
|
238 |
-
pos = nx.spring_layout(G, k=1, iterations=50)
|
239 |
-
|
240 |
-
# Color nodes by type
|
241 |
-
node_colors = []
|
242 |
-
type_colors = {
|
243 |
-
"PERSON": "#FF6B6B",
|
244 |
-
"ORG": "#4ECDC4",
|
245 |
-
"LOCATION": "#45B7D1",
|
246 |
-
"CONCEPT": "#96CEB4",
|
247 |
-
"ERROR": "#FF0000",
|
248 |
-
"UNKNOWN": "#DDA0DD"
|
249 |
-
}
|
250 |
-
|
251 |
-
for node in G.nodes():
|
252 |
-
node_type = G.nodes[node].get('type', 'UNKNOWN')
|
253 |
-
node_colors.append(type_colors.get(node_type, "#DDA0DD"))
|
254 |
-
|
255 |
-
# Draw the graph
|
256 |
-
nx.draw(G, pos,
|
257 |
-
node_color=node_colors,
|
258 |
-
node_size=800,
|
259 |
-
font_size=8,
|
260 |
-
font_weight='bold',
|
261 |
-
with_labels=True,
|
262 |
-
edge_color='gray',
|
263 |
-
width=1.5,
|
264 |
-
alpha=0.8,
|
265 |
-
ax=ax)
|
266 |
-
|
267 |
-
# Add title
|
268 |
-
ax.set_title("Knowledge Graph", size=14, weight='bold')
|
269 |
-
|
270 |
-
# Convert to PIL Image
|
271 |
-
fig.canvas.draw()
|
272 |
-
|
273 |
-
# Handle different matplotlib versions
|
274 |
-
try:
|
275 |
-
# Try newer method first
|
276 |
-
img_array = np.frombuffer(fig.canvas.buffer_rgba(), dtype=np.uint8)
|
277 |
-
img_array = img_array.reshape(fig.canvas.get_width_height()[::-1] + (4,))
|
278 |
-
# Convert RGBA to RGB
|
279 |
-
img_array = img_array[:, :, :3]
|
280 |
-
except AttributeError:
|
281 |
-
try:
|
282 |
-
# Fallback to older method
|
283 |
-
img_array = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
|
284 |
-
img_array = img_array.reshape(fig.canvas.get_width_height()[::-1] + (3,))
|
285 |
-
except AttributeError:
|
286 |
-
# Final fallback - save to buffer
|
287 |
-
buf = io.BytesIO()
|
288 |
-
fig.savefig(buf, format='png', bbox_inches='tight')
|
289 |
-
buf.seek(0)
|
290 |
-
from PIL import Image
|
291 |
-
pil_image = Image.open(buf).convert('RGB')
|
292 |
-
plt.close(fig)
|
293 |
-
return pil_image
|
294 |
-
|
295 |
-
from PIL import Image
|
296 |
-
pil_image = Image.fromarray(img_array)
|
297 |
-
plt.close(fig)
|
298 |
-
|
299 |
-
return pil_image
|
300 |
-
|
301 |
-
except Exception as e:
|
302 |
-
# Create simple error image
|
303 |
-
fig, ax = plt.subplots(figsize=(8, 6))
|
304 |
-
ax.text(0.5, 0.5, f"Error creating graph",
|
305 |
-
ha='center', va='center', fontsize=12, transform=ax.transAxes)
|
306 |
-
ax.set_title("Knowledge Graph Error")
|
307 |
-
ax.axis('off')
|
308 |
-
|
309 |
-
# Handle different matplotlib versions for error image
|
310 |
-
try:
|
311 |
-
# Try newer method first
|
312 |
-
fig.canvas.draw()
|
313 |
-
img_array = np.frombuffer(fig.canvas.buffer_rgba(), dtype=np.uint8)
|
314 |
-
img_array = img_array.reshape(fig.canvas.get_width_height()[::-1] + (4,))
|
315 |
-
img_array = img_array[:, :, :3] # Convert RGBA to RGB
|
316 |
-
except AttributeError:
|
317 |
-
try:
|
318 |
-
# Fallback to older method
|
319 |
-
fig.canvas.draw()
|
320 |
-
img_array = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
|
321 |
-
img_array = img_array.reshape(fig.canvas.get_width_height()[::-1] + (3,))
|
322 |
-
except AttributeError:
|
323 |
-
# Final fallback - save to buffer
|
324 |
-
buf = io.BytesIO()
|
325 |
-
fig.savefig(buf, format='png', bbox_inches='tight')
|
326 |
-
buf.seek(0)
|
327 |
-
from PIL import Image
|
328 |
-
pil_image = Image.open(buf).convert('RGB')
|
329 |
-
plt.close(fig)
|
330 |
-
return pil_image
|
331 |
-
|
332 |
-
from PIL import Image
|
333 |
-
pil_image = Image.fromarray(img_array)
|
334 |
-
plt.close(fig)
|
335 |
-
|
336 |
-
return pil_image
|
337 |
-
|
338 |
-
def build_ascii_diagram(entities, relationships):
|
339 |
-
"""Create simple ASCII diagram of knowledge graph"""
|
340 |
-
if not entities:
|
341 |
-
return "No entities to visualize"
|
342 |
-
|
343 |
-
diagram = "KNOWLEDGE GRAPH DIAGRAM:\n"
|
344 |
-
diagram += "=" * 30 + "\n\n" # Reduced line length
|
345 |
-
|
346 |
-
# Show entities by type
|
347 |
-
entity_types = {}
|
348 |
-
for entity in entities: # Already limited by caller
|
349 |
-
etype = entity.get("type", "UNKNOWN")
|
350 |
-
if etype not in entity_types:
|
351 |
-
entity_types[etype] = []
|
352 |
-
entity_types[etype].append(entity.get("name", "Unknown"))
|
353 |
|
354 |
-
|
355 |
-
|
356 |
-
for name in names:
|
357 |
-
diagram += f" - {name}\n"
|
358 |
-
diagram += "\n"
|
359 |
|
360 |
-
|
361 |
-
|
362 |
-
diagram += "RELATIONSHIPS:\n" # Removed emoji for MCP compatibility
|
363 |
-
for rel in relationships: # Already limited by caller
|
364 |
-
source = rel.get("source", "?")
|
365 |
-
target = rel.get("target", "?")
|
366 |
-
relation = rel.get("relation", "related")
|
367 |
-
diagram += f" {source} -> {target} ({relation})\n"
|
368 |
|
369 |
-
return diagram
|
370 |
-
|
371 |
-
def validate_mcp_response(response_data):
|
372 |
-
"""Validate and sanitize response for MCP compatibility"""
|
373 |
try:
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
|
386 |
-
|
387 |
|
388 |
-
#
|
389 |
-
|
|
|
|
|
|
|
|
|
390 |
|
391 |
-
|
392 |
-
|
393 |
-
# Drastically reduce content
|
394 |
-
sanitized["entities"] = sanitized.get("entities", [])[:5]
|
395 |
-
sanitized["relationships"] = sanitized.get("relationships", [])[:3]
|
396 |
-
sanitized["diagram"] = "Knowledge graph generated (content reduced for MCP)"
|
397 |
-
|
398 |
-
return sanitized
|
399 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
400 |
except Exception as e:
|
401 |
return {
|
402 |
-
"success": False,
|
403 |
-
"error": f"Response validation failed: {str(e)}",
|
404 |
"entities": [],
|
405 |
"relationships": [],
|
406 |
-
"
|
407 |
-
"summary": "Analysis failed during response validation"
|
408 |
}
|
409 |
|
410 |
-
def
|
411 |
-
"""Main function to build knowledge graph from
|
412 |
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
#
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
#
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
source = str(rel.get("source", ""))[:15]
|
460 |
-
target = str(rel.get("target", ""))[:15]
|
461 |
-
diagram_parts.append(f" {source} -> {target}")
|
462 |
-
|
463 |
-
diagram = "\n".join(diagram_parts) if diagram_parts else "No entities found"
|
464 |
-
|
465 |
-
# Ultra-minimal response
|
466 |
-
response = {
|
467 |
-
"success": True,
|
468 |
-
"entity_count": len(entities),
|
469 |
-
"relationship_count": len(relationships),
|
470 |
-
"entities": [{"name": e.get("name", "")[:20], "type": e.get("type", "UNKNOWN")} for e in entities],
|
471 |
-
"relationships": [{"source": r.get("source", "")[:15], "target": r.get("target", "")[:15]} for r in relationships],
|
472 |
-
"diagram": diagram[:500] # Strict limit
|
473 |
}
|
474 |
-
|
475 |
-
# Return ultra-compact JSON
|
476 |
-
return json.dumps(response, separators=(',', ':'))[:2000] # Hard size limit
|
477 |
-
|
478 |
-
except Exception as e:
|
479 |
-
# Ultra-simple error response
|
480 |
-
error_msg = str(e)[:100] # Truncate error message
|
481 |
-
return f'{{"success":false,"error":"{error_msg}"}}'
|
482 |
-
|
483 |
-
# Wrapper function with timeout protection for MCP
def mcp_safe_build_kg(url_or_text):
    """MCP-safe wrapper with timeout protection.

    Runs ``build_kg`` under a 10-second SIGALRM watchdog so that a hung
    request cannot stall the MCP server. On platforms without SIGALRM
    (e.g. Windows) the outer except falls back to an unguarded call.

    Args:
        url_or_text: Raw text or URL forwarded unchanged to ``build_kg``.

    Returns:
        The JSON string produced by ``build_kg``, or a compact JSON error
        payload (``{"success":false,...}``) on timeout or failure.
    """
    try:
        import signal

        def timeout_handler(signum, frame):
            raise TimeoutError("Function timed out")

        # Arm a 10-second alarm; SIGALRM interrupts build_kg if it hangs.
        # On platforms lacking SIGALRM this raises and we hit the outer except.
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(10)

        try:
            result = build_kg(url_or_text)
            signal.alarm(0)  # Cancel timeout
            return result
        except TimeoutError:
            return '{"success":false,"error":"Request timed out"}'
        except Exception as e:
            signal.alarm(0)  # Cancel timeout
            return f'{{"success":false,"error":"Function error: {str(e)[:50]}"}}'

    except Exception:
        # Fallback if signal not available (Windows, etc.)
        try:
            return build_kg(url_or_text)
        except Exception as e:
            return f'{{"success":false,"error":"Fallback error: {str(e)[:50]}"}}'
|
513 |
-
|
514 |
-
# Create Gradio interface with error handling
|
515 |
-
try:
|
516 |
-
demo = gr.Interface(
|
517 |
-
fn=mcp_safe_build_kg, # Use the timeout-protected version
|
518 |
-
inputs=gr.Textbox(
|
519 |
-
label="Input Text or URL",
|
520 |
-
placeholder="Enter text to analyze or paste a URL...",
|
521 |
-
max_lines=5
|
522 |
-
),
|
523 |
-
outputs=gr.Textbox(
|
524 |
-
label="Knowledge Graph JSON",
|
525 |
-
show_copy_button=True
|
526 |
-
),
|
527 |
-
title="KG Builder - MCP Edition",
|
528 |
-
description="Lightweight knowledge graph builder optimized for MCP servers.",
|
529 |
-
allow_flagging="never",
|
530 |
-
cache_examples=False
|
531 |
-
)
|
532 |
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
-
|
540 |
-
fn=error_demo,
|
541 |
-
inputs="text",
|
542 |
-
outputs="text",
|
543 |
-
title="KG Builder - Error Mode",
|
544 |
-
allow_flagging="never"
|
545 |
-
)
|
546 |
|
547 |
-
#
|
548 |
-
|
549 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
550 |
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
-
mcp_server=False,
|
569 |
-
share=False,
|
570 |
-
quiet=True,
|
571 |
-
show_error=False
|
572 |
-
)
|
573 |
-
except Exception as e2:
|
574 |
-
print(f"All launch attempts failed: {e2}")
|
575 |
-
print("Creating emergency fallback...")
|
576 |
-
|
577 |
-
# Create absolute minimal demo
|
578 |
-
def emergency_demo(text):
|
579 |
-
return '{"error":"Server in emergency mode"}'
|
580 |
-
|
581 |
-
emergency = gr.Interface(
|
582 |
-
fn=emergency_demo,
|
583 |
-
inputs="text",
|
584 |
-
outputs="text",
|
585 |
-
title="KG Builder Emergency Mode"
|
586 |
-
)
|
587 |
-
emergency.launch(quiet=True, share=False)
|
|
|
|
|
1 |
import os
|
2 |
import json
|
3 |
import requests
|
4 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
from urllib.parse import urlparse
|
6 |
+
import gradio as gr
|
7 |
+
from huggingface_hub import InferenceClient
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
+
# Initialize Mistral client
# NOTE(review): HF_TOKEN is read once at import time; when unset, an empty
# string is passed and requests will fail at call time rather than here.
client = InferenceClient(
    provider="together",
    api_key=os.environ.get("HF_TOKEN", ""),
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
+
def extract_text_from_url(url):
    """Extract text content from a web URL."""
    try:
        ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        response = requests.get(url, headers={'User-Agent': ua}, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop non-visible markup before pulling out the text.
        for tag in soup(["script", "style"]):
            tag.decompose()

        raw = soup.get_text()

        # Normalize whitespace: strip each line, split into phrases,
        # and rejoin only the non-empty fragments with single spaces.
        fragments = []
        for line in raw.splitlines():
            for phrase in line.strip().split(" "):
                piece = phrase.strip()
                if piece:
                    fragments.append(piece)
        cleaned = ' '.join(fragments)

        return cleaned[:5000]  # Limit to first 5000 characters
    except Exception as e:
        return f"Error fetching URL: {str(e)}"
|
|
|
41 |
|
42 |
+
def extract_entities_and_relationships(text):
    """Use Mistral to extract entities and relationships from text.

    Args:
        text: Free-form text to analyze; only the first 3000 characters
            are included in the prompt sent to the model.

    Returns:
        dict parsed from the model's JSON reply, expected to contain
        "entities" and "relationships" lists. On failure, a dict with
        empty lists plus an "error" message (and "raw_response" when the
        model replied but its output was not valid JSON).
    """
    # Double braces below render as literal braces inside this f-string.
    entity_prompt = f"""
    Analyze the following text and extract key entities and their relationships.
    Return the result as a JSON object with this exact structure:
    {{
        "entities": [
            {{"name": "entity_name", "type": "PERSON|ORGANIZATION|LOCATION|CONCEPT|EVENT|OTHER", "description": "brief description"}}
        ],
        "relationships": [
            {{"source": "entity1", "target": "entity2", "relationship": "relationship_type", "description": "brief description"}}
        ]
    }}

    Text to analyze:
    {text[:3000]}

    Please provide only the JSON response without any additional text or formatting.
    """

    try:
        completion = client.chat.completions.create(
            model="mistralai/Mistral-Small-24B-Instruct-2501",
            messages=[
                {
                    "role": "user",
                    "content": entity_prompt
                }
            ],
            max_tokens=2000,
            temperature=0.3
        )

        response_text = completion.choices[0].message.content.strip()

        # Try to parse JSON from the response
        # Sometimes the model might return JSON wrapped in markdown code blocks
        if response_text.startswith('```'):
            response_text = response_text.split('```')[1]
            if response_text.startswith('json'):
                response_text = response_text[4:]

        result = json.loads(response_text)
        return result

    except json.JSONDecodeError as e:
        # If JSON parsing fails, return a structured error
        # (response_text is always bound here: JSONDecodeError can only
        # be raised by json.loads after the assignment above).
        return {
            "entities": [],
            "relationships": [],
            "error": f"Failed to parse LLM response as JSON: {str(e)}",
            "raw_response": response_text
        }
    except Exception as e:
        return {
            "entities": [],
            "relationships": [],
            "error": f"Error calling Mistral API: {str(e)}"
        }
|
102 |
|
103 |
+
def build_knowledge_graph(input_text):
    """Main function to build knowledge graph from text or URL."""
    stripped = input_text.strip()
    if not stripped:
        return json.dumps({
            "error": "Please provide text or a valid URL",
            "knowledge_graph": None
        }, indent=2)

    # Treat the input as a URL only when it has an http(s) scheme and a host.
    parts = urlparse(stripped)
    if parts.scheme in ('http', 'https') and parts.netloc:
        extracted = extract_text_from_url(stripped)
        if extracted.startswith("Error fetching URL"):
            return json.dumps({
                "error": extracted,
                "knowledge_graph": None
            }, indent=2)
        source_type, source, content = "url", stripped, extracted
    else:
        source_type, source, content = "text", "direct_input", stripped

    # Extract entities and relationships using Mistral
    kg_data = extract_entities_and_relationships(content)

    entities = kg_data.get("entities", [])
    relationships = kg_data.get("relationships", [])
    preview = content[:200] + "..." if len(content) > 200 else content

    # Build the final knowledge graph structure
    knowledge_graph = {
        "source": {
            "type": source_type,
            "value": source,
            "content_preview": preview
        },
        "knowledge_graph": {
            "entities": entities,
            "relationships": relationships,
            "entity_count": len(entities),
            "relationship_count": len(relationships)
        },
        "metadata": {
            "model": "mistralai/Mistral-Small-24B-Instruct-2501",
            "content_length": len(content)
        }
    }

    # Surface extraction failures alongside the (possibly empty) graph.
    if "error" in kg_data:
        knowledge_graph["extraction_error"] = kg_data["error"]
    if "raw_response" in kg_data:
        knowledge_graph["raw_llm_response"] = kg_data["raw_response"]

    return json.dumps(knowledge_graph, indent=2, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
|
163 |
+
# Create Gradio interface
demo = gr.Interface(
    fn=build_knowledge_graph,
    inputs=gr.Textbox(
        label="Text or URL Input",
        placeholder="Enter text to analyze or a web URL (e.g., https://example.com)",
        lines=5,
        max_lines=10
    ),
    outputs=gr.JSON(label="Knowledge Graph"),
    title="🧠 Knowledge Graph Builder",
    description="""
    **Build Knowledge Graphs with AI**

    This tool uses Mistral AI to extract entities and relationships from text or web content:

    • **Text Input**: Paste any text to analyze
    • **URL Input**: Provide a web URL to extract and analyze content
    • **Output**: Structured JSON knowledge graph for LLM agents

    The output includes entities (people, organizations, locations, concepts) and their relationships, formatted for easy consumption by AI agents.
    """,
    examples=[
        ["Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976. The company is headquartered in Cupertino, California."],
        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
    ],
    cache_examples=False,  # examples call the live Mistral API; don't pre-compute them
    theme=gr.themes.Soft()
)

# Expose the interface as an MCP server so agent frameworks can call it as a tool.
demo.launch(mcp_server=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|