# NOTE(review): removed non-code scrape residue that preceded this file
# (Hugging Face Spaces file-viewer header: status lines, commit hashes,
# and a line-number gutter) — it was never part of the Python source.
import os
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import gradio as gr
from huggingface_hub import InferenceClient
# Initialize the inference client: a Hugging Face InferenceClient routed
# through the "together" provider, used below to call a Mistral model.
# api_key may be None here — extract_entities_and_relationships checks
# HF_TOKEN before any API call is actually made.
client = InferenceClient(
    provider="together",
    api_key=os.environ.get("HF_TOKEN"),
)
def extract_text_from_url(url):
    """Extract readable text content from a web URL.

    Fetches the page, drops <script>/<style> elements, collapses all
    whitespace to single spaces, and returns at most the first 5000
    characters. On any failure returns a string beginning with
    "Error fetching URL:" — callers rely on that exact prefix to
    detect errors, so do not change it.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # timeout prevents the whole app from hanging forever on a slow
        # or unresponsive server (the original request had no timeout).
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Remove script and style elements so only visible text remains
        for script in soup(["script", "style"]):
            script.decompose()
        # Get text content
        text = soup.get_text()
        # Clean up: strip each line, split on spaces, and re-join the
        # non-empty pieces — this collapses runs of whitespace/newlines.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        return text[:5000]  # Limit to first 5000 characters
    except Exception as e:
        return f"Error fetching URL: {str(e)}"
def _strip_code_fences(response_text):
    """Strip a surrounding markdown code fence (``` or ```json) from text.

    Returns the text unchanged when it does not start with a fence.
    If no closing ``` line is found, everything after the opening fence
    is kept (the original version unconditionally dropped the last line
    in that case, truncating unfenced JSON).
    """
    if not response_text.startswith('```'):
        return response_text
    lines = response_text.split('\n')
    end_idx = len(lines)  # default: keep all lines after the opening fence
    # Search backwards for the closing fence line.
    for i in range(len(lines) - 1, 0, -1):
        if lines[i].strip() == '```':
            end_idx = i
            break
    return '\n'.join(lines[1:end_idx])


def extract_entities_and_relationships(text):
    """Use Mistral to extract entities and relationships from text.

    Sends the first 3000 characters of *text* to the model and parses the
    reply as JSON. Always returns a dict with "entities" and
    "relationships" lists; on failure both lists are empty, an "error"
    key describes the problem, and "raw_response" carries the unparseable
    model output when JSON decoding failed.
    """
    if not os.environ.get("HF_TOKEN"):
        return {
            "entities": [],
            "relationships": [],
            "error": "HF_TOKEN environment variable not set"
        }
    entity_prompt = f"""
Analyze the following text and extract key entities and their relationships.
Return the result as a JSON object with this exact structure:
{{
"entities": [
{{"name": "entity_name", "type": "PERSON|ORGANIZATION|LOCATION|CONCEPT|EVENT|OTHER", "description": "brief description"}}
],
"relationships": [
{{"source": "entity1", "target": "entity2", "relationship": "relationship_type", "description": "brief description"}}
]
}}
Text to analyze:
{text[:3000]}
Please provide only the JSON response without any additional text or formatting.
"""
    try:
        completion = client.chat.completions.create(
            model="mistralai/Mistral-Small-24B-Instruct-2501",
            messages=[
                {
                    "role": "user",
                    "content": entity_prompt
                }
            ],
            max_tokens=4000,
            temperature=0.2  # low temperature: we want deterministic JSON, not creativity
        )
        if not completion.choices or not completion.choices[0].message:
            return {
                "entities": [],
                "relationships": [],
                "error": "Empty response from Mistral API"
            }
        response_text = completion.choices[0].message.content.strip()
        # The model sometimes wraps its JSON in a markdown code block.
        response_text = _strip_code_fences(response_text)
        result = json.loads(response_text)
        # Validate the structure and guarantee both keys exist.
        if not isinstance(result, dict):
            raise ValueError("Response is not a JSON object")
        result.setdefault("entities", [])
        result.setdefault("relationships", [])
        return result
    except json.JSONDecodeError as e:
        # If JSON parsing fails, return a structured error with the raw
        # output so the caller can surface it for debugging.
        return {
            "entities": [],
            "relationships": [],
            "error": f"Failed to parse LLM response as JSON: {str(e)}",
            "raw_response": response_text if 'response_text' in locals() else "No response"
        }
    except Exception as e:
        return {
            "entities": [],
            "relationships": [],
            "error": f"Error calling Mistral API: {str(e)}"
        }
def build_knowledge_graph(input_text):
    """Build a knowledge-graph JSON structure from raw text or a web URL.

    If *input_text* parses as an http(s) URL, the page content is fetched
    and analyzed; otherwise the text itself is analyzed. Returns a dict
    with "source", "knowledge_graph", and "metadata" sections, or a dict
    with an "error" key and knowledge_graph=None on failure.
    """
    try:
        stripped = (input_text or "").strip()
        if not stripped:
            return {
                "error": "Please provide text or a valid URL",
                "knowledge_graph": None
            }

        # Treat the input as a URL only when it has an http(s) scheme
        # and a network location.
        parsed_url = urlparse(stripped)
        if parsed_url.scheme in ('http', 'https') and parsed_url.netloc:
            page_text = extract_text_from_url(stripped)
            if page_text.startswith("Error fetching URL"):
                return {"error": page_text, "knowledge_graph": None}
            source_type, source, content = "url", stripped, page_text
        else:
            source_type, source, content = "text", "direct_input", stripped

        # Extract entities and relationships using the Mistral model.
        kg_data = extract_entities_and_relationships(content)
        entities = kg_data.get("entities", [])
        relationships = kg_data.get("relationships", [])
        preview = content if len(content) <= 200 else content[:200] + "..."

        knowledge_graph = {
            "source": {
                "type": source_type,
                "value": source,
                "content_preview": preview
            },
            "knowledge_graph": {
                "entities": entities,
                "relationships": relationships,
                "entity_count": len(entities),
                "relationship_count": len(relationships)
            },
            "metadata": {
                "model": "mistralai/Mistral-Small-24B-Instruct-2501",
                "content_length": len(content)
            }
        }

        # Surface any extraction-stage problems alongside the graph.
        if "error" in kg_data:
            knowledge_graph["extraction_error"] = kg_data["error"]
        if "raw_response" in kg_data:
            knowledge_graph["raw_llm_response"] = kg_data["raw_response"]
        return knowledge_graph
    except Exception as e:
        return {
            "error": f"Unexpected error: {str(e)}",
            "knowledge_graph": None
        }
# Create the Gradio interface: a single textbox feeding
# build_knowledge_graph, with the resulting dict rendered as JSON.
# cache_examples=False because each example run calls the remote
# inference API (and one example fetches a live URL).
demo = gr.Interface(
    fn=build_knowledge_graph,
    inputs=gr.Textbox(
        label="Text or URL Input",
        placeholder="Enter text to analyze or a web URL (e.g., https://example.com)",
        lines=5,
        max_lines=10
    ),
    outputs=gr.JSON(label="Knowledge Graph"),
    title="🧠 Knowledge Graph Builder",
    description="""
**Build Knowledge Graphs with AI**
This tool uses Mistral AI to extract entities and relationships from text or web content:
• **Text Input**: Paste any text to analyze
• **URL Input**: Provide a web URL to extract and analyze content
• **Output**: Structured JSON knowledge graph for LLM agents
The output includes entities (people, organizations, locations, concepts) and their relationships, formatted for easy consumption by AI agents.
""",
    examples=[
        ["Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976. The company is headquartered in Cupertino, California."],
        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
    ],
    cache_examples=False,
    theme=gr.themes.Soft()
)

# mcp_server=True exposes the app as an MCP (Model Context Protocol)
# server so LLM agents can call it as a tool in addition to the web UI.
demo.launch(mcp_server=True)