import os
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import gradio as gr
from huggingface_hub import InferenceClient

# Hugging Face inference client routed through the "together" provider;
# the API key is read from the HF_TOKEN environment variable (None if unset).
client = InferenceClient(api_key=os.getenv("HF_TOKEN"), provider="together")

def extract_text_from_url(url, timeout=15, max_chars=5000):
    """Fetch a web page and return its visible text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.
        max_chars: Maximum number of characters of cleaned text to return.

    Returns:
        The page text with ``<script>``/``<style>`` content removed and
        whitespace collapsed, truncated to ``max_chars`` characters. This
        function never raises: on any failure it returns a string that
        starts with ``"Error fetching URL: "``.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Without an explicit timeout requests waits indefinitely on an
        # unresponsive host, which would hang the calling request handler.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Script and style bodies are not human-readable content.
        for tag in soup(["script", "style"]):
            tag.decompose()

        text = soup.get_text()

        # Trim every line, break lines on double spaces, and drop the empty
        # fragments so the result is a single space-separated string.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        return text[:max_chars]
    except Exception as e:
        # Callers detect failure by the "Error fetching URL" prefix, so every
        # error is reported through the return value rather than raised.
        return f"Error fetching URL: {e}"

def _strip_code_fence(response_text):
    """Return *response_text* without a surrounding Markdown code fence."""
    if not response_text.startswith('```'):
        return response_text
    # Drop the opening fence line (``` or ```json).
    body_lines = response_text.split('\n')[1:]
    # Cut at the last closing fence. When the fence was never closed, keep
    # every remaining line instead of discarding the final one.
    closing = [idx for idx, line in enumerate(body_lines) if line.strip() == '```']
    if closing:
        body_lines = body_lines[:closing[-1]]
    return '\n'.join(body_lines)


def _parse_json_payload(response_text):
    """Parse *response_text* as JSON, tolerating prose around a JSON object."""
    try:
        return json.loads(response_text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span in case the model added
        # explanatory text before or after the object.
        start = response_text.find('{')
        end = response_text.rfind('}')
        if start == -1 or end <= start:
            raise
        return json.loads(response_text[start:end + 1])


def extract_entities_and_relationships(text):
    """Use Mistral to extract entities and relationships from text.

    Only the first 3000 characters of ``text`` are sent to the model.

    Args:
        text: The text to analyze.

    Returns:
        A dict that always contains ``"entities"`` and ``"relationships"``
        lists. On failure both lists are empty and an ``"error"`` message is
        included; when the model replied but its output could not be used as
        a JSON object, ``"raw_response"`` holds the text that failed to
        parse. This function reports problems through the returned dict
        instead of raising.
    """

    if not os.environ.get("HF_TOKEN"):
        return {
            "entities": [],
            "relationships": [],
            "error": "HF_TOKEN environment variable not set"
        }

    entity_prompt = f"""
    Analyze the following text and extract key entities and their relationships. 
    Return the result as a JSON object with this exact structure:
    {{
        "entities": [
            {{"name": "entity_name", "type": "PERSON|ORGANIZATION|LOCATION|CONCEPT|EVENT|OTHER", "description": "brief description"}}
        ],
        "relationships": [
            {{"source": "entity1", "target": "entity2", "relationship": "relationship_type", "description": "brief description"}}
        ]
    }}
    
    Text to analyze:
    {text[:3000]}
    
    Please provide only the JSON response without any additional text or formatting.
    """

    # Only the API call and reading its result are reported as API errors;
    # parsing problems are reported separately below.
    try:
        completion = client.chat.completions.create(
            model="mistralai/Mistral-Small-24B-Instruct-2501",
            messages=[
                {
                    "role": "user",
                    "content": entity_prompt
                }
            ],
            max_tokens=4000,
            temperature=0.2
        )
        choices = completion.choices
        message = choices[0].message if choices else None
        content = message.content if message else None
    except Exception as e:
        return {
            "entities": [],
            "relationships": [],
            "error": f"Error calling Mistral API: {str(e)}"
        }

    # The message content can be missing or blank even when a choice exists.
    if not content or not content.strip():
        return {
            "entities": [],
            "relationships": [],
            "error": "Empty response from Mistral API"
        }

    # Sometimes the model returns JSON wrapped in a Markdown code block.
    response_text = _strip_code_fence(content.strip())

    try:
        result = _parse_json_payload(response_text)
    except (ValueError, RecursionError) as e:
        return {
            "entities": [],
            "relationships": [],
            "error": f"Failed to parse LLM response as JSON: {str(e)}",
            "raw_response": response_text
        }

    if not isinstance(result, dict):
        return {
            "entities": [],
            "relationships": [],
            "error": "Failed to parse LLM response as JSON: response is not a JSON object",
            "raw_response": response_text
        }

    # Guarantee list values so callers can take len() and iterate safely,
    # even if the model omitted a key or returned null / a non-list.
    for key in ("entities", "relationships"):
        if not isinstance(result.get(key), list):
            result[key] = []

    return result

def build_knowledge_graph(input_text: str) -> dict:
    """Build a knowledge graph from raw text or from the text of a web page.

    Args:
        input_text: Either the text to analyze, or a single http(s) URL
            whose page content should be fetched and analyzed.

    Returns:
        A JSON-serializable dict. On success it contains "source",
        "knowledge_graph" (entities, relationships and their counts) and
        "metadata"; "extraction_error" and possibly "raw_llm_response" are
        added when entity extraction reported a problem. When the input is
        empty, the URL cannot be fetched or yields no text, or an unexpected
        error occurs, the dict contains an "error" message and
        "knowledge_graph" set to None.
    """

    try:
        if not input_text or not input_text.strip():
            return {
                "error": "Please provide text or a valid URL",
                "knowledge_graph": None
            }

        stripped = input_text.strip()

        # Treat the input as a URL only when it is one whitespace-free token.
        # Text that merely starts with a link ("https://example.com is ...")
        # would otherwise be parsed with the rest of the text in the netloc.
        parsed = urlparse(stripped)
        is_url = (
            parsed.scheme in ('http', 'https')
            and bool(parsed.netloc)
            and len(stripped.split()) == 1
        )

        if is_url:
            extracted_text = extract_text_from_url(stripped)
            # The fetch helper signals failure through this message prefix.
            if extracted_text.startswith("Error fetching URL"):
                return {
                    "error": extracted_text,
                    "knowledge_graph": None
                }
            # Nothing to analyze; avoid a pointless model call.
            if not extracted_text:
                return {
                    "error": "No text content could be extracted from the URL",
                    "knowledge_graph": None
                }
            source_type = "url"
            source = stripped
            content = extracted_text
        else:
            source_type = "text"
            source = "direct_input"
            content = stripped

        # Extract entities and relationships using Mistral
        kg_data = extract_entities_and_relationships(content)

        # Fall back to empty lists when a key is missing or not a list, so
        # the counts below cannot fail on null / malformed model output.
        entities = kg_data.get("entities")
        if not isinstance(entities, list):
            entities = []
        relationships = kg_data.get("relationships")
        if not isinstance(relationships, list):
            relationships = []

        knowledge_graph = {
            "source": {
                "type": source_type,
                "value": source,
                "content_preview": content[:200] + "..." if len(content) > 200 else content
            },
            "knowledge_graph": {
                "entities": entities,
                "relationships": relationships,
                "entity_count": len(entities),
                "relationship_count": len(relationships)
            },
            "metadata": {
                "model": "mistralai/Mistral-Small-24B-Instruct-2501",
                "content_length": len(content)
            }
        }

        # Surface any problem reported by the extraction step.
        if "error" in kg_data:
            knowledge_graph["extraction_error"] = kg_data["error"]
            if "raw_response" in kg_data:
                knowledge_graph["raw_llm_response"] = kg_data["raw_response"]

        return knowledge_graph

    except Exception as e:
        # Top-level boundary for the UI: report the failure as data.
        return {
            "error": f"Unexpected error: {str(e)}",
            "knowledge_graph": None
        }

# Gradio front end: one text box in, one JSON view out, backed by
# build_knowledge_graph.
demo = gr.Interface(
    title="🧠 Knowledge Graph Builder",
    description="""
    **Build Knowledge Graphs with AI**
    
    This tool uses Mistral AI to extract entities and relationships from text or web content:
    
    • **Text Input**: Paste any text to analyze
    • **URL Input**: Provide a web URL to extract and analyze content
    • **Output**: Structured JSON knowledge graph for LLM agents
    
    The output includes entities (people, organizations, locations, concepts) and their relationships, formatted for easy consumption by AI agents.
    """,
    fn=build_knowledge_graph,
    inputs=gr.Textbox(
        lines=5,
        max_lines=10,
        label="Text or URL Input",
        placeholder="Enter text to analyze or a web URL (e.g., https://example.com)",
    ),
    outputs=gr.JSON(label="Knowledge Graph"),
    # One plain-text example and one URL example; example caching is off.
    examples=[
        ["Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976. The company is headquartered in Cupertino, California."],
        ["https://en.wikipedia.org/wiki/Artificial_intelligence"],
    ],
    cache_examples=False,
    theme=gr.themes.Soft(),
)

# Start the app with the MCP server option enabled.
demo.launch(mcp_server=True)