Spaces:
Running
Running
import os | |
import json | |
import requests | |
from bs4 import BeautifulSoup | |
from urllib.parse import urlparse | |
import gradio as gr | |
from huggingface_hub import InferenceClient | |
# Initialize Mistral client | |
client = InferenceClient( | |
provider="together", | |
api_key=os.environ.get("HF_TOKEN"), | |
) | |
def extract_text_from_url(url): | |
"""Extract text content from a web URL.""" | |
try: | |
headers = { | |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' | |
} | |
response = requests.get(url, headers=headers) | |
response.raise_for_status() | |
soup = BeautifulSoup(response.content, 'html.parser') | |
# Remove script and style elements | |
for script in soup(["script", "style"]): | |
script.decompose() | |
# Get text content | |
text = soup.get_text() | |
# Clean up text | |
lines = (line.strip() for line in text.splitlines()) | |
chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) | |
text = ' '.join(chunk for chunk in chunks if chunk) | |
return text[:5000] # Limit to first 5000 characters | |
except Exception as e: | |
return f"Error fetching URL: {str(e)}" | |
def extract_entities_and_relationships(text): | |
"""Use Mistral to extract entities and relationships from text.""" | |
if not os.environ.get("HF_TOKEN"): | |
return { | |
"entities": [], | |
"relationships": [], | |
"error": "HF_TOKEN environment variable not set" | |
} | |
entity_prompt = f""" | |
Analyze the following text and extract key entities and their relationships. | |
Return the result as a JSON object with this exact structure: | |
{{ | |
"entities": [ | |
{{"name": "entity_name", "type": "PERSON|ORGANIZATION|LOCATION|CONCEPT|EVENT|OTHER", "description": "brief description"}} | |
], | |
"relationships": [ | |
{{"source": "entity1", "target": "entity2", "relationship": "relationship_type", "description": "brief description"}} | |
] | |
}} | |
Text to analyze: | |
{text[:3000]} | |
Please provide only the JSON response without any additional text or formatting. | |
""" | |
try: | |
completion = client.chat.completions.create( | |
model="mistralai/Mistral-Small-24B-Instruct-2501", | |
messages=[ | |
{ | |
"role": "user", | |
"content": entity_prompt | |
} | |
], | |
max_tokens=4000, | |
temperature=0.2 | |
) | |
if not completion.choices or not completion.choices[0].message: | |
return { | |
"entities": [], | |
"relationships": [], | |
"error": "Empty response from Mistral API" | |
} | |
response_text = completion.choices[0].message.content.strip() | |
# Try to parse JSON from the response | |
# Sometimes the model might return JSON wrapped in markdown code blocks | |
if response_text.startswith('```'): | |
lines = response_text.split('\n') | |
start_idx = 1 | |
if lines[0].strip() == '```json': | |
start_idx = 1 | |
end_idx = len(lines) - 1 | |
for i in range(len(lines)-1, 0, -1): | |
if lines[i].strip() == '```': | |
end_idx = i | |
break | |
response_text = '\n'.join(lines[start_idx:end_idx]) | |
result = json.loads(response_text) | |
# Validate the structure | |
if not isinstance(result, dict): | |
raise ValueError("Response is not a JSON object") | |
if "entities" not in result: | |
result["entities"] = [] | |
if "relationships" not in result: | |
result["relationships"] = [] | |
return result | |
except json.JSONDecodeError as e: | |
# If JSON parsing fails, return a structured error | |
return { | |
"entities": [], | |
"relationships": [], | |
"error": f"Failed to parse LLM response as JSON: {str(e)}", | |
"raw_response": response_text if 'response_text' in locals() else "No response" | |
} | |
except Exception as e: | |
return { | |
"entities": [], | |
"relationships": [], | |
"error": f"Error calling Mistral API: {str(e)}" | |
} | |
def build_knowledge_graph(input_text): | |
"""Main function to build knowledge graph from text or URL.""" | |
try: | |
if not input_text or not input_text.strip(): | |
return { | |
"error": "Please provide text or a valid URL", | |
"knowledge_graph": None | |
} | |
# Check if input is a URL | |
parsed = urlparse(input_text.strip()) | |
is_url = parsed.scheme in ('http', 'https') and parsed.netloc | |
if is_url: | |
# Extract text from URL | |
extracted_text = extract_text_from_url(input_text.strip()) | |
if extracted_text.startswith("Error fetching URL"): | |
return { | |
"error": extracted_text, | |
"knowledge_graph": None | |
} | |
source_type = "url" | |
source = input_text.strip() | |
content = extracted_text | |
else: | |
# Use provided text directly | |
source_type = "text" | |
source = "direct_input" | |
content = input_text.strip() | |
# Extract entities and relationships using Mistral | |
kg_data = extract_entities_and_relationships(content) | |
# Build the final knowledge graph structure | |
knowledge_graph = { | |
"source": { | |
"type": source_type, | |
"value": source, | |
"content_preview": content[:200] + "..." if len(content) > 200 else content | |
}, | |
"knowledge_graph": { | |
"entities": kg_data.get("entities", []), | |
"relationships": kg_data.get("relationships", []), | |
"entity_count": len(kg_data.get("entities", [])), | |
"relationship_count": len(kg_data.get("relationships", [])) | |
}, | |
"metadata": { | |
"model": "mistralai/Mistral-Small-24B-Instruct-2501", | |
"content_length": len(content) | |
} | |
} | |
# Add any errors from the extraction process | |
if "error" in kg_data: | |
knowledge_graph["extraction_error"] = kg_data["error"] | |
if "raw_response" in kg_data: | |
knowledge_graph["raw_llm_response"] = kg_data["raw_response"] | |
return knowledge_graph | |
except Exception as e: | |
return { | |
"error": f"Unexpected error: {str(e)}", | |
"knowledge_graph": None | |
} | |
# Create Gradio interface | |
demo = gr.Interface( | |
fn=build_knowledge_graph, | |
inputs=gr.Textbox( | |
label="Text or URL Input", | |
placeholder="Enter text to analyze or a web URL (e.g., https://example.com)", | |
lines=5, | |
max_lines=10 | |
), | |
outputs=gr.JSON(label="Knowledge Graph"), | |
title="🧠 Knowledge Graph Builder", | |
description=""" | |
**Build Knowledge Graphs with AI** | |
This tool uses Mistral AI to extract entities and relationships from text or web content: | |
• **Text Input**: Paste any text to analyze | |
• **URL Input**: Provide a web URL to extract and analyze content | |
• **Output**: Structured JSON knowledge graph for LLM agents | |
The output includes entities (people, organizations, locations, concepts) and their relationships, formatted for easy consumption by AI agents. | |
""", | |
examples=[ | |
["Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976. The company is headquartered in Cupertino, California."], | |
["https://en.wikipedia.org/wiki/Artificial_intelligence"], | |
], | |
cache_examples=False, | |
theme=gr.themes.Soft() | |
) | |
demo.launch(mcp_server=True) | |