# Nexus_NLP_model / final.py
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DebertaV2Tokenizer
import networkx as nx
import spacy
import pickle
import google.generativeai as genai
import json
import os
import dotenv
# Load environment variables
dotenv.load_dotenv()
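# Gemini access is configured via a GEMINI_API key read from the environment
# (for example from a local .env file loaded above); see setup_gemini() below.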
def load_models():
    """Load all required ML models"""
    nlp = spacy.load("en_core_web_sm")
    model_path = "./results/checkpoint-753"
    tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-small')
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()
    return nlp, tokenizer, model
def load_knowledge_graph():
    """Load and initialize knowledge graph"""
    graph_path = "./models/knowledge_graph.pkl"
    with open(graph_path, 'rb') as f:
        graph_data = pickle.load(f)
    knowledge_graph = nx.DiGraph()
    knowledge_graph.add_nodes_from(graph_data['nodes'].items())
    for u, edges in graph_data['edges'].items():
        for v, data in edges.items():
            knowledge_graph.add_edge(u, v, **data)
    return knowledge_graph
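# The pickle at graph_path is assumed (inferred from the loading code above) to hold
# a plain dict of the shape:
# {
#     'nodes': {node_id: {attribute: value, ...}, ...},
#     'edges': {u: {v: {attribute: value, ...}, ...}, ...}
# }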
def setup_gemini():
    """Initialize Gemini model"""
    genai.configure(api_key=os.getenv("GEMINI_API"))
    model = genai.GenerativeModel('gemini-pro')
    return model
def predict_with_model(text, tokenizer, model):
    """Make predictions using the ML model"""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_label = torch.argmax(probabilities, dim=-1).item()
    confidence = probabilities[0][predicted_label].item() * 100
    return "FAKE" if predicted_label == 1 else "REAL", confidence
def extract_entities(text, nlp):
    """Extract named entities from text"""
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities
def update_knowledge_graph(text, is_real, knowledge_graph, nlp):
    """Update knowledge graph with new information"""
    entities = extract_entities(text, nlp)
    # Update per-entity real/fake counts
    for entity, entity_type in entities:
        if not knowledge_graph.has_node(entity):
            knowledge_graph.add_node(
                entity,
                type=entity_type,
                real_count=1 if is_real else 0,
                fake_count=0 if is_real else 1
            )
        else:
            if is_real:
                knowledge_graph.nodes[entity]['real_count'] += 1
            else:
                knowledge_graph.nodes[entity]['fake_count'] += 1
    # Link entities that co-occur in the same article
    for i, (entity1, _) in enumerate(entities):
        for entity2, _ in entities[i+1:]:
            if not knowledge_graph.has_edge(entity1, entity2):
                knowledge_graph.add_edge(
                    entity1,
                    entity2,
                    weight=1,
                    is_real=is_real
                )
            else:
                knowledge_graph[entity1][entity2]['weight'] += 1
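# After a few updates a node accumulates per-class counts, e.g. (hypothetical entity):
#     knowledge_graph.nodes["NASA"]
#     -> {'type': 'ORG', 'real_count': 2, 'fake_count': 1}
# and co-occurring entities are linked by edges whose 'weight' counts shared articles.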
def predict_with_knowledge_graph(text, knowledge_graph, nlp):
    """Make predictions using the knowledge graph"""
    entities = extract_entities(text, nlp)
    real_score = 0
    fake_score = 0
    # Each known entity votes in proportion to how often it has appeared in real vs. fake articles
    for entity, _ in entities:
        if knowledge_graph.has_node(entity):
            real_count = knowledge_graph.nodes[entity].get('real_count', 0)
            fake_count = knowledge_graph.nodes[entity].get('fake_count', 0)
            total = real_count + fake_count
            if total > 0:
                real_score += real_count / total
                fake_score += fake_count / total
    total_score = real_score + fake_score
    if total_score == 0:
        return "UNCERTAIN", 50.0
    if real_score > fake_score:
        confidence = (real_score / total_score) * 100
        return "REAL", confidence
    else:
        confidence = (fake_score / total_score) * 100
        return "FAKE", confidence
def analyze_content_gemini(model, text):
    """Analyze content using Gemini model"""
    prompt = f"""Analyze this news text and return a JSON object with the following structure:
    {{
        "gemini_analysis": {{
            "predicted_classification": "Real or Fake",
            "confidence_score": "0-100",
            "reasoning": ["point1", "point2"]
        }},
        "text_classification": {{
            "category": "",
            "writing_style": "Formal/Informal/Clickbait",
            "target_audience": "",
            "content_type": "news/opinion/editorial"
        }},
        "sentiment_analysis": {{
            "primary_emotion": "",
            "emotional_intensity": "1-10",
            "sensationalism_level": "High/Medium/Low",
            "bias_indicators": ["bias1", "bias2"],
            "tone": {{"formality": "formal/informal", "style": "Professional/Emotional/Neutral"}},
            "emotional_triggers": ["trigger1", "trigger2"]
        }},
        "entity_recognition": {{
            "source_credibility": "High/Medium/Low",
            "people": ["person1", "person2"],
            "organizations": ["org1", "org2"],
            "locations": ["location1", "location2"],
            "dates": ["date1", "date2"],
            "statistics": ["stat1", "stat2"]
        }},
        "context": {{
            "main_narrative": "",
            "supporting_elements": ["element1", "element2"],
            "key_claims": ["claim1", "claim2"],
            "narrative_structure": ""
        }},
        "fact_checking": {{
            "verifiable_claims": ["claim1", "claim2"],
            "evidence_present": "Yes/No",
            "fact_check_score": "0-100"
        }}
    }}
    Analyze this text and return only the JSON response: {text}"""
    response = model.generate_content(prompt)
    try:
        # Strip Markdown code fences if Gemini wraps the JSON in ```json ... ```
        cleaned_text = response.text.strip()
        if cleaned_text.startswith('```json'):
            cleaned_text = cleaned_text[7:-3]
        return json.loads(cleaned_text)
    except json.JSONDecodeError:
        # Fall back to a minimal structured response if the output is not valid JSON
        return {
            "gemini_analysis": {
                "predicted_classification": "UNCERTAIN",
                "confidence_score": "50",
                "reasoning": ["Analysis failed to generate valid JSON"]
            }
        }
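# Example usage (requires a valid GEMINI_API key in the environment; the text is a placeholder):
#     gemini_model = setup_gemini()
#     analysis = analyze_content_gemini(gemini_model, "Some news text ...")
#     analysis["gemini_analysis"]["predicted_classification"]  # "Real", "Fake", or "UNCERTAIN"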