# KGB-mcp / app.py
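"""Knowledge Graph Builder (MCP).

Extracts entities and relationships from raw text or a web URL using a
Mistral model served through the Hugging Face Inference API, and exposes the
result as structured JSON via a Gradio app that can also run as an MCP server.
"""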
import os
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import gradio as gr
from huggingface_hub import InferenceClient
# Initialize the Hugging Face InferenceClient (Together provider) used to call Mistral
client = InferenceClient(
provider="together",
api_key=os.environ.get("HF_TOKEN"),
)
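# Note: if HF_TOKEN is missing, the client is created with api_key=None;
# extract_entities_and_relationships() catches that case below and returns a
# structured error instead of calling the API.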
def extract_text_from_url(url):
"""Extract text content from a web URL."""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
        response = requests.get(url, headers=headers, timeout=15)  # timeout so slow hosts don't hang the app
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style"]):
script.decompose()
# Get text content
text = soup.get_text()
# Clean up text
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = ' '.join(chunk for chunk in chunks if chunk)
return text[:5000] # Limit to first 5000 characters
except Exception as e:
return f"Error fetching URL: {str(e)}"
def extract_entities_and_relationships(text):
"""Use Mistral to extract entities and relationships from text."""
if not os.environ.get("HF_TOKEN"):
return {
"entities": [],
"relationships": [],
"error": "HF_TOKEN environment variable not set"
}
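    # Ask the model for strict JSON. The doubled braces below are f-string
    # escapes that render as literal { and } in the prompt.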
entity_prompt = f"""
Analyze the following text and extract key entities and their relationships.
Return the result as a JSON object with this exact structure:
{{
"entities": [
{{"name": "entity_name", "type": "PERSON|ORGANIZATION|LOCATION|CONCEPT|EVENT|OTHER", "description": "brief description"}}
],
"relationships": [
{{"source": "entity1", "target": "entity2", "relationship": "relationship_type", "description": "brief description"}}
]
}}
Text to analyze:
{text[:3000]}
Please provide only the JSON response without any additional text or formatting.
"""
try:
completion = client.chat.completions.create(
model="mistralai/Mistral-Small-24B-Instruct-2501",
messages=[
{
"role": "user",
"content": entity_prompt
}
],
max_tokens=4000,
temperature=0.2
)
if not completion.choices or not completion.choices[0].message:
return {
"entities": [],
"relationships": [],
"error": "Empty response from Mistral API"
}
response_text = completion.choices[0].message.content.strip()
# Try to parse JSON from the response
# Sometimes the model might return JSON wrapped in markdown code blocks
if response_text.startswith('```'):
lines = response_text.split('\n')
            # Drop the opening fence (``` or ```json) and the closing fence if present
            start_idx = 1
            end_idx = len(lines)
            for i in range(len(lines) - 1, 0, -1):
                if lines[i].strip() == '```':
                    end_idx = i
                    break
            response_text = '\n'.join(lines[start_idx:end_idx])
result = json.loads(response_text)
# Validate the structure
if not isinstance(result, dict):
raise ValueError("Response is not a JSON object")
if "entities" not in result:
result["entities"] = []
if "relationships" not in result:
result["relationships"] = []
return result
except json.JSONDecodeError as e:
# If JSON parsing fails, return a structured error
return {
"entities": [],
"relationships": [],
"error": f"Failed to parse LLM response as JSON: {str(e)}",
"raw_response": response_text if 'response_text' in locals() else "No response"
}
except Exception as e:
return {
"entities": [],
"relationships": [],
"error": f"Error calling Mistral API: {str(e)}"
}
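# Illustrative shape of a successful result (values are placeholders):
# {"entities": [{"name": "Apple Inc.", "type": "ORGANIZATION", "description": "..."}],
#  "relationships": [{"source": "Apple Inc.", "target": "Steve Jobs",
#                     "relationship": "founded_by", "description": "..."}]}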
def build_knowledge_graph(input_text):
"""Main function to build knowledge graph from text or URL."""
try:
if not input_text or not input_text.strip():
return {
"error": "Please provide text or a valid URL",
"knowledge_graph": None
}
# Check if input is a URL
parsed = urlparse(input_text.strip())
is_url = parsed.scheme in ('http', 'https') and parsed.netloc
if is_url:
# Extract text from URL
extracted_text = extract_text_from_url(input_text.strip())
if extracted_text.startswith("Error fetching URL"):
return {
"error": extracted_text,
"knowledge_graph": None
}
source_type = "url"
source = input_text.strip()
content = extracted_text
else:
# Use provided text directly
source_type = "text"
source = "direct_input"
content = input_text.strip()
# Extract entities and relationships using Mistral
kg_data = extract_entities_and_relationships(content)
# Build the final knowledge graph structure
knowledge_graph = {
"source": {
"type": source_type,
"value": source,
"content_preview": content[:200] + "..." if len(content) > 200 else content
},
"knowledge_graph": {
"entities": kg_data.get("entities", []),
"relationships": kg_data.get("relationships", []),
"entity_count": len(kg_data.get("entities", [])),
"relationship_count": len(kg_data.get("relationships", []))
},
"metadata": {
"model": "mistralai/Mistral-Small-24B-Instruct-2501",
"content_length": len(content)
}
}
# Add any errors from the extraction process
if "error" in kg_data:
knowledge_graph["extraction_error"] = kg_data["error"]
if "raw_response" in kg_data:
knowledge_graph["raw_llm_response"] = kg_data["raw_response"]
return knowledge_graph
except Exception as e:
return {
"error": f"Unexpected error: {str(e)}",
"knowledge_graph": None
}
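# Illustrative call (not executed here):
#   build_knowledge_graph("Apple Inc. was founded by Steve Jobs in 1976.")
# returns a dict with "source", "knowledge_graph", and "metadata" keys,
# plus "extraction_error" if the model call or JSON parsing failed.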
# Create Gradio interface
demo = gr.Interface(
fn=build_knowledge_graph,
inputs=gr.Textbox(
label="Text or URL Input",
placeholder="Enter text to analyze or a web URL (e.g., https://example.com)",
lines=5,
max_lines=10
),
outputs=gr.JSON(label="Knowledge Graph"),
title="🧠 Knowledge Graph Builder",
description="""
**Build Knowledge Graphs with AI**
This tool uses Mistral AI to extract entities and relationships from text or web content:
• **Text Input**: Paste any text to analyze
• **URL Input**: Provide a web URL to extract and analyze content
• **Output**: Structured JSON knowledge graph for LLM agents
The output includes entities (people, organizations, locations, concepts) and their relationships, formatted for easy consumption by AI agents.
""",
examples=[
["Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976. The company is headquartered in Cupertino, California."],
["https://en.wikipedia.org/wiki/Artificial_intelligence"],
],
cache_examples=False,
theme=gr.themes.Soft()
)
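# mcp_server=True also exposes build_knowledge_graph as an MCP tool alongside
# the web UI; this requires the gradio[mcp] extra to be installed.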
demo.launch(mcp_server=True)