Spaces:

Agents-MCP-Hackathon
/

KGB-mcp

Running

App Files Files Community

KGB-mcp / app.py

VirtualOasis

Create app.py

da82fbb verified 2 days ago

raw

history blame contribute delete

8.1 kB

	import os
	import json
	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urlparse
	import gradio as gr
	from huggingface_hub import InferenceClient

	# Shared inference client used for the Mistral chat completions below.
	# Requests go through the "together" provider and authenticate with the
	# HF_TOKEN environment variable (api_key is None when it is unset).
	client = InferenceClient(provider="together", api_key=os.getenv("HF_TOKEN"))

	def extract_text_from_url(url, timeout=10, max_chars=5000):
	    """Download a web page and return its text with whitespace normalized.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server before giving up. Without
	            a timeout, requests waits indefinitely on an unresponsive host.
	        max_chars: Maximum number of characters of cleaned text to return.

	    Returns:
	        The cleaned page text, truncated to ``max_chars`` characters. If
	        anything fails, a string starting with ``"Error fetching URL: "``
	        is returned instead; errors are reported in-band, never raised.
	    """
	    # NOTE(review): url is caller-supplied and is fetched as-is, so this can
	    # reach any address visible from the host running the app.
	    headers = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	    }
	    try:
	        response = requests.get(url, headers=headers, timeout=timeout)
	        response.raise_for_status()

	        soup = BeautifulSoup(response.content, 'html.parser')

	        # Script and style bodies are code, not readable page content.
	        for tag in soup(["script", "style"]):
	            tag.decompose()

	        text = soup.get_text()

	        # Trim every line, split it on spaces, and rejoin the non-empty
	        # pieces with single spaces.
	        lines = (line.strip() for line in text.splitlines())
	        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
	        text = ' '.join(chunk for chunk in chunks if chunk)

	        return text[:max_chars]
	    except Exception as e:
	        # Deliberately broad: callers detect failure by the message prefix.
	        return f"Error fetching URL: {str(e)}"

	def _extraction_failure(message, raw_response=None):
	    """Build the failure result: an empty graph plus an ``error`` message."""
	    failure = {"entities": [], "relationships": [], "error": message}
	    if raw_response is not None:
	        failure["raw_response"] = raw_response
	    return failure


	def _strip_code_fence(response_text):
	    """Return the model output without a surrounding Markdown code fence."""
	    if not response_text.startswith('```'):
	        return response_text

	    # The first line is the opening fence (``` or ```json); drop it.
	    body = response_text.split('\n')[1:]

	    # Cut at the last closing fence. If there is none (e.g. the output was
	    # truncated), keep every remaining line instead of discarding the last
	    # one, which would be real content.
	    for index in range(len(body) - 1, -1, -1):
	        if body[index].strip() == '```':
	            return '\n'.join(body[:index])
	    return '\n'.join(body)


	def extract_entities_and_relationships(text):
	    """Use Mistral to extract entities and relationships from text.

	    Only the first 3000 characters of ``text`` are included in the prompt.
	    API and parsing failures are reported through the returned dict rather
	    than raised.

	    Args:
	        text: The text to analyze.

	    Returns:
	        A dict with ``"entities"`` and ``"relationships"`` lists. On
	        failure both lists are empty and ``"error"`` describes the problem;
	        when the model output could not be parsed as JSON,
	        ``"raw_response"`` carries that output.
	    """
	    if not os.environ.get("HF_TOKEN"):
	        return _extraction_failure("HF_TOKEN environment variable not set")

	    # Doubled braces are literal braces inside the f-string.
	    entity_prompt = f"""
	Analyze the following text and extract key entities and their relationships.
	Return the result as a JSON object with this exact structure:
	{{
	"entities": [
	{{"name": "entity_name", "type": "PERSON|ORGANIZATION|LOCATION|CONCEPT|EVENT|OTHER", "description": "brief description"}}
	],
	"relationships": [
	{{"source": "entity1", "target": "entity2", "relationship": "relationship_type", "description": "brief description"}}
	]
	}}

	Text to analyze:
	{text[:3000]}

	Please provide only the JSON response without any additional text or formatting.
	"""

	    # Stays None until the model has answered, so the JSON error handler can
	    # tell "unparseable model output" from "decode error raised earlier".
	    response_text = None
	    try:
	        completion = client.chat.completions.create(
	            model="mistralai/Mistral-Small-24B-Instruct-2501",
	            messages=[{"role": "user", "content": entity_prompt}],
	            max_tokens=4000,
	            temperature=0.2,
	        )

	        if not completion.choices or not completion.choices[0].message:
	            return _extraction_failure("Empty response from Mistral API")

	        content = completion.choices[0].message.content
	        if content is None:
	            # A message without text is an empty response, not an API error.
	            return _extraction_failure("Empty response from Mistral API")

	        # The model sometimes wraps its JSON in a Markdown code block.
	        response_text = _strip_code_fence(content.strip())
	        result = json.loads(response_text)

	        if not isinstance(result, dict):
	            raise ValueError("Response is not a JSON object")

	        # Guarantee both keys hold lists, even when the model omitted them
	        # or returned another type such as null.
	        for key in ("entities", "relationships"):
	            if not isinstance(result.get(key), list):
	                result[key] = []

	        return result

	    except json.JSONDecodeError as e:
	        return _extraction_failure(
	            f"Failed to parse LLM response as JSON: {str(e)}",
	            raw_response=response_text if response_text is not None else "No response",
	        )
	    except Exception as e:
	        return _extraction_failure(f"Error calling Mistral API: {str(e)}")

	def build_knowledge_graph(input_text: str) -> dict:
	    """Build a knowledge graph of entities and relationships from text or a URL.

	    Args:
	        input_text: Either free text to analyze, or an http(s) URL whose
	            page content is fetched and analyzed instead.

	    Returns:
	        On success, a dict with "source", "knowledge_graph" and "metadata"
	        sections, plus "extraction_error" / "raw_llm_response" when the
	        extraction step reported a problem. On empty input, a failed URL
	        fetch or an unexpected exception, a dict of the form
	        {"error": <message>, "knowledge_graph": None}.
	    """
	    try:
	        # Strip once; None and empty strings both count as missing input.
	        cleaned = input_text.strip() if input_text else ""
	        if not cleaned:
	            return {
	                "error": "Please provide text or a valid URL",
	                "knowledge_graph": None
	            }

	        # Only a full http(s) address is treated as a URL; anything else is
	        # analyzed as plain text.
	        parsed = urlparse(cleaned)
	        is_url = parsed.scheme in ('http', 'https') and bool(parsed.netloc)

	        if is_url:
	            extracted_text = extract_text_from_url(cleaned)
	            # The fetch helper reports failures in-band via this prefix.
	            if extracted_text.startswith("Error fetching URL"):
	                return {
	                    "error": extracted_text,
	                    "knowledge_graph": None
	                }
	            source_type = "url"
	            source = cleaned
	            content = extracted_text
	        else:
	            source_type = "text"
	            source = "direct_input"
	            content = cleaned

	        # Extract entities and relationships using Mistral
	        kg_data = extract_entities_and_relationships(content)

	        # Fall back to empty lists when a value is missing or is not a list,
	        # so the counts below cannot raise.
	        entities = kg_data.get("entities")
	        if not isinstance(entities, list):
	            entities = []
	        relationships = kg_data.get("relationships")
	        if not isinstance(relationships, list):
	            relationships = []

	        knowledge_graph = {
	            "source": {
	                "type": source_type,
	                "value": source,
	                "content_preview": (content[:200] + "...") if len(content) > 200 else content
	            },
	            "knowledge_graph": {
	                "entities": entities,
	                "relationships": relationships,
	                "entity_count": len(entities),
	                "relationship_count": len(relationships)
	            },
	            "metadata": {
	                "model": "mistralai/Mistral-Small-24B-Instruct-2501",
	                "content_length": len(content)
	            }
	        }

	        # Surface any problem reported by the extraction step.
	        if "error" in kg_data:
	            knowledge_graph["extraction_error"] = kg_data["error"]
	        if "raw_response" in kg_data:
	            knowledge_graph["raw_llm_response"] = kg_data["raw_response"]

	        return knowledge_graph

	    except Exception as e:
	        # Top-level boundary for the UI: always answer with a JSON-able dict.
	        return {
	            "error": f"Unexpected error: {str(e)}",
	            "knowledge_graph": None
	        }

	# Gradio interface: one text box in (free text or a URL), the knowledge
	# graph rendered as JSON out.
	demo = gr.Interface(
	    fn=build_knowledge_graph,
	    inputs=gr.Textbox(
	        label="Text or URL Input",
	        placeholder="Enter text to analyze or a web URL (e.g., https://example.com)",
	        lines=5,
	        max_lines=10
	    ),
	    outputs=gr.JSON(label="Knowledge Graph"),
	    title="🧠 Knowledge Graph Builder",
	    description="""
	Build Knowledge Graphs with AI

	This tool uses Mistral AI to extract entities and relationships from text or web content:

	• Text Input: Paste any text to analyze
	• URL Input: Provide a web URL to extract and analyze content
	• Output: Structured JSON knowledge graph for LLM agents

	The output includes entities (people, organizations, locations, concepts) and their relationships, formatted for easy consumption by AI agents.
	""",
	    examples=[
	        ["Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976. The company is headquartered in Cupertino, California."],
	        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
	    ],
	    cache_examples=False,
	    theme=gr.themes.Soft()
	)

	# Start the server only when this file is run as a script, so importing the
	# module does not launch the app as a side effect. The server is started
	# with Gradio's MCP server option enabled.
	if __name__ == "__main__":
	    demo.launch(mcp_server=True)