File size: 8,095 Bytes
d44d865
 
 
 
 
804b6dc
 
b99df01
804b6dc
 
 
28bf4ed
804b6dc
cfa2282
804b6dc
 
cfa2282
804b6dc
 
cfa2282
347e5b8
804b6dc
d44d865
804b6dc
cfa2282
804b6dc
 
 
d44d865
804b6dc
 
d44d865
804b6dc
 
 
 
d44d865
804b6dc
d44d865
804b6dc
d44d865
804b6dc
 
d44d865
583d917
7c56cb5
 
 
 
 
 
804b6dc
 
 
 
 
 
 
 
 
 
 
1810a55
804b6dc
 
1810a55
804b6dc
 
1810a55
179e56a
804b6dc
 
 
 
 
 
 
 
9070a9e
 
804b6dc
179e56a
7c56cb5
 
 
 
 
 
 
804b6dc
179e56a
804b6dc
 
 
7c56cb5
 
 
 
 
 
 
 
 
 
179e56a
804b6dc
7c56cb5
 
 
 
 
 
 
 
 
 
804b6dc
179e56a
804b6dc
 
 
 
 
 
7c56cb5
804b6dc
179e56a
 
 
 
804b6dc
179e56a
 
804b6dc
 
d44d865
7c56cb5
 
 
 
804b6dc
7c56cb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2b3df9
7c56cb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
804b6dc
 
78b1724
804b6dc
 
 
 
 
 
 
 
 
 
b914d47
804b6dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
import os
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import gradio as gr
from huggingface_hub import InferenceClient

# Initialize Mistral client
# Requests are routed through the "together" provider via Hugging Face's
# InferenceClient. Auth token comes from the HF_TOKEN env var; it may be
# unset here (os.environ.get returns None) — downstream callers check for
# the token before making API calls.
client = InferenceClient(
    provider="together",
    api_key=os.environ.get("HF_TOKEN"),
)

def extract_text_from_url(url, timeout=15):
    """Fetch a web page and return its visible text content.

    Args:
        url: HTTP(S) URL to fetch.
        timeout: Seconds to wait for the HTTP response. Without this,
            requests.get can block indefinitely on a stalled server.

    Returns:
        Up to the first 5000 characters of the page's cleaned text, or a
        string beginning with "Error fetching URL" on any failure
        (callers match on that prefix rather than catching exceptions).
    """
    try:
        # Some sites reject requests without a browser-like User-Agent.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove non-visible markup before extracting text.
        for script in soup(["script", "style"]):
            script.decompose()

        # Get text content
        text = soup.get_text()

        # Collapse whitespace: strip each line, split runs of double
        # spaces into phrases, and rejoin non-empty pieces.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        return text[:5000]  # Limit to first 5000 characters
    except Exception as e:
        # Best-effort: return a sentinel string instead of raising so the
        # Gradio handler can surface the error in its JSON output.
        return f"Error fetching URL: {str(e)}"

def _strip_code_fences(text):
    """Strip a wrapping markdown code fence (``` or ```json) from *text*.

    LLMs often wrap JSON answers in a fenced code block. Returns the inner
    content if *text* starts with a fence, otherwise *text* unchanged. If
    no closing fence is found, everything after the opening fence is kept
    (rather than silently dropping the last line).
    """
    if not text.startswith('```'):
        return text
    lines = text.split('\n')
    # Default keeps all lines after the opening fence; a real closing
    # fence (scanned from the end) shortens the range.
    end_idx = len(lines)
    for i in range(len(lines) - 1, 0, -1):
        if lines[i].strip() == '```':
            end_idx = i
            break
    return '\n'.join(lines[1:end_idx])

def extract_entities_and_relationships(text):
    """Use Mistral to extract entities and relationships from text.

    Args:
        text: Source text; only the first 3000 characters are sent to
            the model to bound prompt size.

    Returns:
        dict with "entities" and "relationships" lists. On failure both
        lists are empty and an "error" key describes the problem; when
        JSON parsing fails, "raw_response" carries the model output.
    """

    if not os.environ.get("HF_TOKEN"):
        return {
            "entities": [],
            "relationships": [],
            "error": "HF_TOKEN environment variable not set"
        }

    entity_prompt = f"""
    Analyze the following text and extract key entities and their relationships. 
    Return the result as a JSON object with this exact structure:
    {{
        "entities": [
            {{"name": "entity_name", "type": "PERSON|ORGANIZATION|LOCATION|CONCEPT|EVENT|OTHER", "description": "brief description"}}
        ],
        "relationships": [
            {{"source": "entity1", "target": "entity2", "relationship": "relationship_type", "description": "brief description"}}
        ]
    }}
    
    Text to analyze:
    {text[:3000]}
    
    Please provide only the JSON response without any additional text or formatting.
    """

    response_text = None  # kept outside try so the JSON error handler can report it
    try:
        completion = client.chat.completions.create(
            model="mistralai/Mistral-Small-24B-Instruct-2501",
            messages=[
                {
                    "role": "user",
                    "content": entity_prompt
                }
            ],
            max_tokens=4000,
            temperature=0.2  # low temperature for more deterministic JSON output
        )

        if not completion.choices or not completion.choices[0].message:
            return {
                "entities": [],
                "relationships": [],
                "error": "Empty response from Mistral API"
            }

        response_text = completion.choices[0].message.content.strip()

        # The model sometimes wraps the JSON in a markdown code block.
        response_text = _strip_code_fences(response_text)

        result = json.loads(response_text)

        # Validate the structure
        if not isinstance(result, dict):
            raise ValueError("Response is not a JSON object")

        # Guarantee both keys exist so downstream .get() calls are uniform.
        result.setdefault("entities", [])
        result.setdefault("relationships", [])

        return result

    except json.JSONDecodeError as e:
        # If JSON parsing fails, return a structured error with the raw output
        return {
            "entities": [],
            "relationships": [],
            "error": f"Failed to parse LLM response as JSON: {str(e)}",
            "raw_response": response_text if response_text is not None else "No response"
        }
    except Exception as e:
        return {
            "entities": [],
            "relationships": [],
            "error": f"Error calling Mistral API: {str(e)}"
        }

def build_knowledge_graph(input_text):
    """Build a knowledge graph from raw text or from a web URL.

    Args:
        input_text: Either free-form text to analyze directly, or an
            http(s) URL whose page content will be fetched first.

    Returns:
        dict describing the source, the extracted knowledge graph
        (entities/relationships plus counts), and metadata; or a dict
        with an "error" key and "knowledge_graph": None on failure.
    """
    try:
        # Guard: reject empty / whitespace-only / None input up front.
        if not input_text or not input_text.strip():
            return {
                "error": "Please provide text or a valid URL",
                "knowledge_graph": None
            }

        cleaned = input_text.strip()

        # Treat the input as a URL only if it has an http(s) scheme AND a host.
        parsed = urlparse(cleaned)
        if parsed.scheme in ('http', 'https') and parsed.netloc:
            fetched = extract_text_from_url(cleaned)
            # The fetcher signals failure via a sentinel prefix, not an exception.
            if fetched.startswith("Error fetching URL"):
                return {
                    "error": fetched,
                    "knowledge_graph": None
                }
            source_type, source, content = "url", cleaned, fetched
        else:
            source_type, source, content = "text", "direct_input", cleaned

        # Run the LLM extraction on whichever content we ended up with.
        kg_data = extract_entities_and_relationships(content)

        entities = kg_data.get("entities", [])
        relationships = kg_data.get("relationships", [])
        preview = content if len(content) <= 200 else content[:200] + "..."

        knowledge_graph = {
            "source": {
                "type": source_type,
                "value": source,
                "content_preview": preview
            },
            "knowledge_graph": {
                "entities": entities,
                "relationships": relationships,
                "entity_count": len(entities),
                "relationship_count": len(relationships)
            },
            "metadata": {
                "model": "mistralai/Mistral-Small-24B-Instruct-2501",
                "content_length": len(content)
            }
        }

        # Surface extraction problems without discarding partial results.
        if "error" in kg_data:
            knowledge_graph["extraction_error"] = kg_data["error"]
            if "raw_response" in kg_data:
                knowledge_graph["raw_llm_response"] = kg_data["raw_response"]

        return knowledge_graph

    except Exception as e:
        # Catch-all boundary: this is the Gradio entry point, so always
        # return a JSON-serializable error rather than raising.
        return {
            "error": f"Unexpected error: {str(e)}",
            "knowledge_graph": None
        }

# Create Gradio interface
# Single-textbox in, JSON out; one handler (build_knowledge_graph) decides
# at runtime whether the input is raw text or a URL.
demo = gr.Interface(
    fn=build_knowledge_graph,
    inputs=gr.Textbox(
        label="Text or URL Input",
        placeholder="Enter text to analyze or a web URL (e.g., https://example.com)",
        lines=5,
        max_lines=10
    ),
    outputs=gr.JSON(label="Knowledge Graph"),
    title="🧠 Knowledge Graph Builder",
    description="""
    **Build Knowledge Graphs with AI**
    
    This tool uses Mistral AI to extract entities and relationships from text or web content:
    
    • **Text Input**: Paste any text to analyze
    • **URL Input**: Provide a web URL to extract and analyze content
    • **Output**: Structured JSON knowledge graph for LLM agents
    
    The output includes entities (people, organizations, locations, concepts) and their relationships, formatted for easy consumption by AI agents.
    """,
    examples=[
        ["Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976. The company is headquartered in Cupertino, California."],
        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
    ],
    # Example caching disabled: the URL example would hit the network (and
    # the LLM) at build time.
    cache_examples=False,
    theme=gr.themes.Soft()
)

# mcp_server=True additionally exposes the handler as an MCP tool endpoint.
demo.launch(mcp_server=True)