Krish Patel committed
Commit 94a65e4 · 1 Parent(s): 898162d

Added gemini analysis and knowledge graph

Files changed (6)
  1. .gitignore +1 -0
  2. app.py +90 -196
  3. final.py +418 -48
  4. knowledge_graph_final.pkl +3 -0
  5. prev_final.py +142 -0
  6. test.py +48 -11
.gitignore ADDED
@@ -0,0 +1 @@
+ .env
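
The ignored .env file is where final.py expects the Gemini key to live (it calls dotenv.load_dotenv() and then os.getenv("GEMINI_API")), so locally it would presumably contain a single line such as:

GEMINI_API=<your-gemini-api-key>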
app.py CHANGED
@@ -1,14 +1,9 @@
  # import streamlit as st
  # import torch
  # from transformers import AutoTokenizer, AutoModelForSequenceClassification

  # # Load the model and tokenizer
- # # @st.cache_resource
- # # def load_model():
- # #     tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')
- # #     model = AutoModelForSequenceClassification.from_pretrained("./results/checkpoint-753")
- # #     model.eval()
- # #     return tokenizer, model
  # @st.cache_resource
  # def load_model():
  #     tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small', use_fast=False)
@@ -25,202 +20,101 @@
  #     confidence = probabilities[0][predicted_label].item()
  #     return "FAKE" if predicted_label == 1 else "REAL", confidence

- # def main():
- #     st.title("News Classifier")
-
- #     # Load model
  #     tokenizer, model = load_model()

- #     # Text input
- #     news_text = st.text_area("Enter news text to analyze:", height=200)
-
- #     if st.button("Classify"):
- #         if news_text:
- #             with st.spinner('Analyzing...'):
- #                 prediction, confidence = predict_news(news_text, tokenizer, model)
-
- #                 # Display results
- #                 if prediction == "FAKE":
- #                     st.error(f"⚠️ {prediction} NEWS")
- #                 else:
- #                     st.success(f"✅ {prediction} NEWS")
-
- #                 st.info(f"Confidence: {confidence*100:.2f}%")
-
- # if __name__ == "__main__":
- #     main()
-
-
- # # import streamlit as st
- # # import torch
- # # from transformers import AutoTokenizer, AutoModelForSequenceClassification
- # # from fastapi import FastAPI, Request
- # # from pydantic import BaseModel
- # # from threading import Thread
- # # from streamlit.web import cli
-
- # # # FastAPI app
- # # api_app = FastAPI()
-
- # # # Load the model and tokenizer
- # # @st.cache_resource
- # # def load_model():
- # #     tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small', use_fast=False)
- # #     model = AutoModelForSequenceClassification.from_pretrained("./results/checkpoint-753")
- # #     model.eval()
- # #     return tokenizer, model
-
- # # # Prediction function
- # # def predict_news(text, tokenizer, model):
- # #     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
- # #     with torch.no_grad():
- # #         outputs = model(**inputs)
- # #     probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
- # #     predicted_label = torch.argmax(probabilities, dim=-1).item()
- # #     confidence = probabilities[0][predicted_label].item()
- # #     return "FAKE" if predicted_label == 1 else "REAL", confidence
-
- # # # FastAPI request model
- # # class NewsInput(BaseModel):
- # #     text: str
-
- # # # FastAPI route for POST requests
- # # @api_app.post("/classify")
- # # async def classify_news(data: NewsInput):
- # #     tokenizer, model = load_model()
- # #     prediction, confidence = predict_news(data.text, tokenizer, model)
- # #     return {
- # #         "prediction": prediction,
- # #         "confidence": f"{confidence*100:.2f}%"
- # #     }
-
- # # # Streamlit app
- # # def run_streamlit():
- # #     def main():
- # #         st.title("News Classifier")
-
- # #         # Load model
- # #         tokenizer, model = load_model()
-
- # #         # Text input
- # #         news_text = st.text_area("Enter news text to analyze:", height=200)
-
- # #         if st.button("Classify"):
- # #             if news_text:
- # #                 with st.spinner('Analyzing...'):
- # #                     prediction, confidence = predict_news(news_text, tokenizer, model)
-
- # #                     # Display results
- # #                     if prediction == "FAKE":
- # #                         st.error(f"⚠️ {prediction} NEWS")
- # #                     else:
- # #                         st.success(f"✅ {prediction} NEWS")
-
- # #                     st.info(f"Confidence: {confidence*100:.2f}%")
-
- # #     main()
-
- # # # Threaded execution for FastAPI and Streamlit
- # # def start_fastapi():
- # #     import uvicorn
- # #     uvicorn.run(api_app, host="0.0.0.0", port=8502)
-
- # # if __name__ == "__main__":
- # #     fastapi_thread = Thread(target=start_fastapi, daemon=True)
- # #     fastapi_thread.start()
-
- # #     # Start Streamlit
- # #     cli.main()
-
- # # # from fastapi import FastAPI, HTTPException
- # # # from pydantic import BaseModel
- # # # from transformers import AutoTokenizer, AutoModelForSequenceClassification
- # # # import torch
-
- # # # from fastapi.middleware.cors import CORSMiddleware
-
-
- # # # # Define the FastAPI app
- # # # app = FastAPI()
-
- # # # app.add_middleware(
- # # #     CORSMiddleware,
- # # #     allow_origins=["*"],  # Update with your frontend's URL for security
- # # #     allow_credentials=True,
- # # #     allow_methods=["*"],
- # # #     allow_headers=["*"],
- # # # )
- # # # # Define the input data schema
- # # # class InputText(BaseModel):
- # # #     text: str
-
- # # # # Load the model and tokenizer (ensure these paths are correct in your Space)
- # # # tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small', use_fast=False)
- # # # model = AutoModelForSequenceClassification.from_pretrained("./results/checkpoint-753")
- # # # model.eval()
-
- # # # # Prediction function
- # # # def predict_news(text: str):
- # # #     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
- # # #     with torch.no_grad():
- # # #         outputs = model(**inputs)
- # # #     probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
- # # #     predicted_label = torch.argmax(probabilities, dim=-1).item()
- # # #     confidence = probabilities[0][predicted_label].item()
- # # #     return {
- # # #         "prediction": "FAKE" if predicted_label == 1 else "REAL",
- # # #         "confidence": round(confidence * 100, 2)  # Return confidence as a percentage
- # # #     }
-
- # # # # Define the POST endpoint
- # # # @app.post("/predict")
- # # # async def classify_news(input_text: InputText):
- # # #     try:
- # # #         result = predict_news(input_text.text)
- # # #         return result
- # # #     except Exception as e:
- # # #         raise HTTPException(status_code=500, detail=str(e))

  import streamlit as st
- import torch
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
- import json

- # Load the model and tokenizer
  @st.cache_resource
- def load_model():
-     tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small', use_fast=False)
-     model = AutoModelForSequenceClassification.from_pretrained("./results/checkpoint-753")
-     model.eval()
-     return tokenizer, model
-
- def predict_news(text, tokenizer, model):
-     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
-     with torch.no_grad():
-         outputs = model(**inputs)
-     probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
-     predicted_label = torch.argmax(probabilities, dim=-1).item()
-     confidence = probabilities[0][predicted_label].item()
-     return "FAKE" if predicted_label == 1 else "REAL", confidence

  # Streamlit UI
- st.title("News Classifier API")
-
- # If running as an API, get the request from query parameters
- query_params = st.query_params
- if "text" in query_params:
-     text_input = query_params["text"][0]  # Get text input from URL query
-     tokenizer, model = load_model()
-     prediction, confidence = predict_news(text_input, tokenizer, model)
-
-     # Return JSON response
-     st.json({"prediction": prediction, "confidence": confidence})
-
- # If running in UI mode, show text input
- else:
-     text_input = st.text_area("Enter news text:")
-     if st.button("Classify"):
-         tokenizer, model = load_model()
-         prediction, confidence = predict_news(text_input, tokenizer, model)
-         st.write(f"Prediction: {prediction} (Confidence: {confidence*100:.2f}%)")
  # import streamlit as st
  # import torch
  # from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ # import json

  # # Load the model and tokenizer
  # @st.cache_resource
  # def load_model():
  #     tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small', use_fast=False)
  #     confidence = probabilities[0][predicted_label].item()
  #     return "FAKE" if predicted_label == 1 else "REAL", confidence

+ # # Streamlit UI
+ # st.title("News Classifier API")
+
+ # # If running as an API, get the request from query parameters
+ # query_params = st.query_params
+ # if "text" in query_params:
+ #     text_input = query_params["text"][0]  # Get text input from URL query
  #     tokenizer, model = load_model()
+ #     prediction, confidence = predict_news(text_input, tokenizer, model)

+ #     # Return JSON response
+ #     st.json({"prediction": prediction, "confidence": confidence})

+ # # If running in UI mode, show text input
+ # else:
+ #     text_input = st.text_area("Enter news text:")
+ #     if st.button("Classify"):
+ #         tokenizer, model = load_model()
+ #         prediction, confidence = predict_news(text_input, tokenizer, model)
+ #         st.write(f"Prediction: {prediction} (Confidence: {confidence*100:.2f}%)")

  import streamlit as st
+ from final import *
+ import pandas as pd
+
+ # Page configuration
+ st.set_page_config(
+     page_title="Nexus NLP News Classifier",
+     page_icon="📰",
+     layout="wide"
+ )
+
+ # Cache model loading
  @st.cache_resource
+ def initialize_models():
+     nlp = load_nlp()
+     tokenizer, model = load_model()
+     knowledge_graph = load_knowledge_graph()
+     return nlp, tokenizer, model, knowledge_graph
+
+ # Initialize all models
+ nlp, tokenizer, model, knowledge_graph = initialize_models()

  # Streamlit UI
+ def main():
+     st.title("📰 Nexus NLP News Classifier")
+     st.write("Enter news text below to analyze its authenticity")
+
+     # Text input area
+     news_text = st.text_area("News Text", height=200)
+
+     if st.button("Analyze"):
+         if news_text:
+             with st.spinner("Analyzing..."):
+                 # Get predictions from all models
+                 ml_prediction, ml_confidence = predict_with_model(news_text)
+                 kg_prediction, kg_confidence = predict_with_knowledge_graph(news_text)
+
+                 # Update knowledge graph
+                 update_knowledge_graph(news_text, ml_prediction == "REAL")
+
+                 # Get Gemini analysis
+                 gemini_model = setup_gemini()
+                 gemini_result = analyze_content_gemini(gemini_model, news_text)
+
+                 # Display results in columns
+                 col1, col2, col3 = st.columns(3)
+
+                 with col1:
+                     st.subheader("ML Model Analysis")
+                     st.metric("Prediction", ml_prediction)
+                     st.metric("Confidence", f"{ml_confidence:.2f}%")
+
+                 with col2:
+                     st.subheader("Knowledge Graph Analysis")
+                     st.metric("Prediction", kg_prediction)
+                     st.metric("Confidence", f"{kg_confidence:.2f}%")
+
+                 with col3:
+                     st.subheader("Gemini Analysis")
+                     gemini_pred = gemini_result["gemini_analysis"]["predicted_classification"]
+                     gemini_conf = gemini_result["gemini_analysis"]["confidence_score"]
+                     st.metric("Prediction", gemini_pred)
+                     st.metric("Confidence", f"{gemini_conf}%")
+
+                 # Detailed analysis sections
+                 with st.expander("View Detailed Analysis"):
+                     st.json(gemini_result)
+
+                 with st.expander("Named Entities"):
+                     entities = extract_entities(news_text)
+                     df = pd.DataFrame(entities, columns=["Entity", "Type"])
+                     st.dataframe(df)
+
+         else:
+             st.warning("Please enter some text to analyze")
+
+ if __name__ == "__main__":
+     main()
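
The new UI reports the three verdicts side by side without merging them. For illustration only, a minimal combiner in the spirit of the commented-out predict_news in final.py (which returned the shared label on ML/KG agreement and "UNCERTAIN" otherwise); the function name and the averaged confidence are assumptions, not part of this commit:

def combine_verdicts(ml_pred, ml_conf, kg_pred, kg_conf):
    # Hypothetical: if the ML model and the knowledge graph agree, keep the
    # shared label and average the confidences; otherwise report UNCERTAIN.
    if ml_pred == kg_pred:
        return ml_pred, (ml_conf + kg_conf) / 2
    return "UNCERTAIN", 50.0

# combine_verdicts("REAL", 92.0, "REAL", 65.0) -> ("REAL", 78.5)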
final.py CHANGED
@@ -1,50 +1,396 @@
  import torch
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
  import spacy
  import google.generativeai as genai
  import json
  import os
  import dotenv

  dotenv.load_dotenv()

- # Load spaCy for NER
- nlp = spacy.load("en_core_web_sm")

- # Load the trained ML model
- model_path = "./results/checkpoint-753"  # Replace with the actual path to your model
- # tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')
- # tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small', use_fast=False)
- from transformers import DebertaV2Tokenizer
- tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-small')
- model = AutoModelForSequenceClassification.from_pretrained(model_path)
- model.eval()

  def setup_gemini():
      genai.configure(api_key=os.getenv("GEMINI_API"))
      model = genai.GenerativeModel('gemini-pro')
      return model

  def predict_with_model(text):
-     """Predict whether the news is real or fake using the ML model."""
      inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
      with torch.no_grad():
          outputs = model(**inputs)
      probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
      predicted_label = torch.argmax(probabilities, dim=-1).item()
-     return "FAKE" if predicted_label == 1 else "REAL"

  def extract_entities(text):
-     """Extract named entities from text using spaCy."""
      doc = nlp(text)
      entities = [(ent.text, ent.label_) for ent in doc.ents]
      return entities

- def predict_news(text):
-     """Predict whether the news is real or fake using the ML model."""
-     # Predict with the ML model
-     prediction = predict_with_model(text)
-     return prediction

  def analyze_content_gemini(model, text):
      prompt = f"""Analyze this news text and return a JSON object with the following structure:
@@ -106,37 +452,61 @@ def analyze_content_gemini(model, text):
  }
  }

- def clean_gemini_output(text):
-     """Remove markdown formatting from Gemini output"""
-     text = text.replace('##', '')
-     text = text.replace('**', '')
-     return text

- def get_gemini_analysis(text):
-     """Get detailed content analysis from Gemini."""
-     gemini_model = setup_gemini()
-     gemini_analysis = analyze_content_gemini(gemini_model, text)
-     return gemini_analysis

- def main():
-     print("Welcome to the News Classifier!")
-     print("Enter your news text below. Type 'Exit' to quit.")
-
-     while True:
-         news_text = input("\nEnter news text: ")
-
-         if news_text.lower() == 'exit':
-             print("Thank you for using the News Classifier!")
-             return
-
-         # Get ML prediction
-         prediction = predict_news(news_text)
-         print(f"\nML Analysis: {prediction}")
-
-         # Get Gemini analysis
-         print("\n=== Detailed Gemini Analysis ===")
-         gemini_result = get_gemini_analysis(news_text)
-         print(gemini_result)

  if __name__ == "__main__":
      main()
 
+ # import torch
+ # from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ # import networkx as nx
+ # import spacy
+ # import pickle
+ # import pandas as pd
+ # import google.generativeai as genai
+ # import json
+
+ # # Load spaCy for NER
+ # nlp = spacy.load("en_core_web_sm")
+
+ # # Load the trained ML model
+ # model_path = "./results/checkpoint-5030"  # Replace with the actual path to your model
+ # tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')
+ # model = AutoModelForSequenceClassification.from_pretrained(model_path)
+ # model.eval()
+
+ # #########################
+ # def setup_gemini():
+ #     genai.configure(api_key='<REDACTED>')
+ #     model = genai.GenerativeModel('gemini-pro')
+ #     return model
+ # #########################
+
+ # # Load the knowledge graph
+ # graph_path = "./models/knowledge_graph.pkl"  # Replace with the actual path to your knowledge graph
+ # with open(graph_path, 'rb') as f:
+ #     graph_data = pickle.load(f)
+
+ # knowledge_graph = nx.DiGraph()
+ # knowledge_graph.add_nodes_from(graph_data['nodes'].items())
+ # for u, edges in graph_data['edges'].items():
+ #     for v, data in edges.items():
+ #         knowledge_graph.add_edge(u, v, **data)
+
+ # def predict_with_model(text):
+ #     """Predict whether the news is real or fake using the ML model."""
+ #     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
+ #     with torch.no_grad():
+ #         outputs = model(**inputs)
+ #     probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+ #     predicted_label = torch.argmax(probabilities, dim=-1).item()
+ #     return "FAKE" if predicted_label == 1 else "REAL"
+
+ # def update_knowledge_graph(text, is_real):
+ #     """Update the knowledge graph with the new article."""
+ #     entities = extract_entities(text)
+ #     for entity, entity_type in entities:
+ #         if not knowledge_graph.has_node(entity):
+ #             knowledge_graph.add_node(
+ #                 entity,
+ #                 type=entity_type,
+ #                 real_count=1 if is_real else 0,
+ #                 fake_count=0 if is_real else 1
+ #             )
+ #         else:
+ #             if is_real:
+ #                 knowledge_graph.nodes[entity]['real_count'] += 1
+ #             else:
+ #                 knowledge_graph.nodes[entity]['fake_count'] += 1
+
+ #     for i, (entity1, _) in enumerate(entities):
+ #         for entity2, _ in entities[i+1:]:
+ #             if not knowledge_graph.has_edge(entity1, entity2):
+ #                 knowledge_graph.add_edge(
+ #                     entity1,
+ #                     entity2,
+ #                     weight=1,
+ #                     is_real=is_real
+ #                 )
+ #             else:
+ #                 knowledge_graph[entity1][entity2]['weight'] += 1
+
+ # def extract_entities(text):
+ #     """Extract named entities from text using spaCy."""
+ #     doc = nlp(text)
+ #     entities = [(ent.text, ent.label_) for ent in doc.ents]
+ #     return entities
+
+ # def predict_with_knowledge_graph(text):
+ #     """Predict whether the news is real or fake using the knowledge graph."""
+ #     entities = extract_entities(text)
+ #     real_score = 0
+ #     fake_score = 0
+
+ #     for entity, _ in entities:
+ #         if knowledge_graph.has_node(entity):
+ #             real_count = knowledge_graph.nodes[entity].get('real_count', 0)
+ #             fake_count = knowledge_graph.nodes[entity].get('fake_count', 0)
+ #             total = real_count + fake_count
+ #             if total > 0:
+ #                 real_score += real_count / total
+ #                 fake_score += fake_count / total
+
+ #     if real_score > fake_score:
+ #         return "REAL"
+ #     else:
+ #         return "FAKE"
+
+ # def predict_news(text):
+ #     """Predict whether the news is real or fake using both the ML model and the knowledge graph."""
+ #     # Predict with the ML model
+ #     ml_prediction = predict_with_model(text)
+ #     is_real = ml_prediction == "REAL"
+
+ #     # Update the knowledge graph
+ #     update_knowledge_graph(text, is_real)
+
+ #     # Predict with the knowledge graph
+ #     kg_prediction = predict_with_knowledge_graph(text)
+
+ #     # Combine predictions (for simplicity, we use the ML model's prediction here)
+ #     # You can enhance this by combining the scores from both predictions
+ #     return ml_prediction if ml_prediction == kg_prediction else "UNCERTAIN"
+
+ # #########################
+ # # def analyze_content_gemini(model, text):
+ # #     prompt = f"""Analyze this news text and provide results in the following JSON-like format:
+
+ # #     TEXT: {text}
+
+ # #     Please provide analysis in these specific sections:
+
+ # #     1. GEMINI ANALYSIS:
+ # #     - Predicted Classification: [Real/Fake]
+ # #     - Confidence Score: [0-100%]
+ # #     - Reasoning: [Key points for classification]
+
+ # #     2. TEXT CLASSIFICATION:
+ # #     - Content category/topic
+ # #     - Writing style: [Formal/Informal/Clickbait]
+ # #     - Target audience
+ # #     - Content type: [news/opinion/editorial]
+
+ # #     3. SENTIMENT ANALYSIS:
+ # #     - Primary emotion
+ # #     - Emotional intensity (1-10)
+ # #     - Sensationalism Level: [High/Medium/Low]
+ # #     - Bias Indicators: [List if any]
+ # #     - Tone: (formal/informal), [Professional/Emotional/Neutral]
+ # #     - Key emotional triggers
+
+ # #     4. ENTITY RECOGNITION:
+ # #     - Source Credibility: [High/Medium/Low]
+ # #     - People mentioned
+ # #     - Organizations
+ # #     - Locations
+ # #     - Dates/Time references
+ # #     - Key numbers/statistics
+
+ # #     5. CONTEXT EXTRACTION:
+ # #     - Main narrative/story
+ # #     - Supporting elements
+ # #     - Key claims
+ # #     - Narrative structure
+
+ # #     6. FACT CHECKING:
+ # #     - Verifiable Claims: [List main claims]
+ # #     - Evidence Present: [Yes/No]
+ # #     - Fact Check Score: [0-100%]
+
+ # #     Format the response clearly with distinct sections."""
+
+ # #     response = model.generate_content(prompt)
+ # #     return response.text
+
+ # def analyze_content_gemini(model, text):
+ #     prompt = f"""Analyze this news text and return a JSON object with the following structure:
+ #     {{
+ #         "gemini_analysis": {{
+ #             "predicted_classification": "Real or Fake",
+ #             "confidence_score": "0-100",
+ #             "reasoning": ["point1", "point2"]
+ #         }},
+ #         "text_classification": {{
+ #             "category": "",
+ #             "writing_style": "Formal/Informal/Clickbait",
+ #             "target_audience": "",
+ #             "content_type": "news/opinion/editorial"
+ #         }},
+ #         "sentiment_analysis": {{
+ #             "primary_emotion": "",
+ #             "emotional_intensity": "1-10",
+ #             "sensationalism_level": "High/Medium/Low",
+ #             "bias_indicators": ["bias1", "bias2"],
+ #             "tone": {{"formality": "formal/informal", "style": "Professional/Emotional/Neutral"}},
+ #             "emotional_triggers": ["trigger1", "trigger2"]
+ #         }},
+ #         "entity_recognition": {{
+ #             "source_credibility": "High/Medium/Low",
+ #             "people": ["person1", "person2"],
+ #             "organizations": ["org1", "org2"],
+ #             "locations": ["location1", "location2"],
+ #             "dates": ["date1", "date2"],
+ #             "statistics": ["stat1", "stat2"]
+ #         }},
+ #         "context": {{
+ #             "main_narrative": "",
+ #             "supporting_elements": ["element1", "element2"],
+ #             "key_claims": ["claim1", "claim2"],
+ #             "narrative_structure": ""
+ #         }},
+ #         "fact_checking": {{
+ #             "verifiable_claims": ["claim1", "claim2"],
+ #             "evidence_present": "Yes/No",
+ #             "fact_check_score": "0-100"
+ #         }}
+ #     }}
+
+ #     Analyze this text and return only the JSON response: {text}"""
+
+ #     response = model.generate_content(prompt)
+ #     # return json.loads(response.text)
+ #     # Add error handling and response cleaning
+ #     try:
+ #         # Clean the response text to ensure it's valid JSON
+ #         cleaned_text = response.text.strip()
+ #         if cleaned_text.startswith('```json'):
+ #             cleaned_text = cleaned_text[7:-3]  # Remove ```json and ``` markers
+ #         return json.loads(cleaned_text)
+ #     except json.JSONDecodeError:
+ #         # Return a default structured response if JSON parsing fails
+ #         return {
+ #             "gemini_analysis": {
+ #                 "predicted_classification": "UNCERTAIN",
+ #                 "confidence_score": "50",
+ #                 "reasoning": ["Analysis failed to generate valid JSON"]
+ #             }
+ #         }
+
+
+ # def clean_gemini_output(text):
+ #     """Remove markdown formatting from Gemini output"""
+ #     text = text.replace('##', '')
+ #     text = text.replace('**', '')
+ #     return text
+
+ # def get_gemini_analysis(text):
+ #     """Get detailed content analysis from Gemini."""
+ #     gemini_model = setup_gemini()
+ #     gemini_analysis = analyze_content_gemini(gemini_model, text)
+ #     # cleaned_analysis = clean_gemini_output(gemini_analysis)
+ #     # return cleaned_analysis
+ #     return gemini_analysis
+ # #########################
+
+ # def main():
+ #     print("Welcome to the News Classifier!")
+ #     print("Enter your news text below. Type 'Exit' to quit.")
+
+ #     while True:
+ #         news_text = input("\nEnter news text: ")
+
+ #         if news_text.lower() == 'exit':
+ #             print("Thank you for using the News Classifier!")
+ #             return
+
+ #         # First get ML and Knowledge Graph prediction
+ #         prediction = predict_news(news_text)
+ #         print(f"\nML and Knowledge Graph Analysis: {prediction}")
+
+ #         # Then get Gemini analysis
+ #         print("\n=== Detailed Gemini Analysis ===")
+ #         gemini_result = get_gemini_analysis(news_text)
+ #         print(gemini_result)
+
+
+ # if __name__ == "__main__":
+ #     main()
+
+ import streamlit as st
  import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, DebertaV2Tokenizer
+ import networkx as nx
  import spacy
+ import pickle
  import google.generativeai as genai
  import json
  import os
  import dotenv

+ # Page config
+ st.set_page_config(
+     page_title="Nexus NLP News Classifier",
+     page_icon="📰",
+     layout="wide"
+ )
+
+ # Load environment variables
  dotenv.load_dotenv()

+ # Load models and resources
+ @st.cache_resource
+ def load_nlp():
+     return spacy.load("en_core_web_sm")

+ @st.cache_resource
+ def load_model():
+     model_path = "./results/checkpoint-753"
+     tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-small')
+     model = AutoModelForSequenceClassification.from_pretrained(model_path)
+     model.eval()
+     return tokenizer, model
+
+ @st.cache_resource
+ def load_knowledge_graph():
+     graph_path = "./knowledge_graph_final.pkl"
+     with open(graph_path, 'rb') as f:
+         graph_data = pickle.load(f)
+     knowledge_graph = nx.DiGraph()
+     knowledge_graph.add_nodes_from(graph_data['nodes'].items())
+     for u, edges in graph_data['edges'].items():
+         for v, data in edges.items():
+             knowledge_graph.add_edge(u, v, **data)
+     return knowledge_graph

  def setup_gemini():
      genai.configure(api_key=os.getenv("GEMINI_API"))
      model = genai.GenerativeModel('gemini-pro')
      return model

+ # Initialize resources
+ nlp = load_nlp()
+ tokenizer, model = load_model()
+ knowledge_graph = load_knowledge_graph()
+
  def predict_with_model(text):
      inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
      with torch.no_grad():
          outputs = model(**inputs)
      probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
      predicted_label = torch.argmax(probabilities, dim=-1).item()
+     confidence = probabilities[0][predicted_label].item() * 100
+     return "FAKE" if predicted_label == 1 else "REAL", confidence

  def extract_entities(text):
      doc = nlp(text)
      entities = [(ent.text, ent.label_) for ent in doc.ents]
      return entities

+ def update_knowledge_graph(text, is_real):
+     entities = extract_entities(text)
+     for entity, entity_type in entities:
+         if not knowledge_graph.has_node(entity):
+             knowledge_graph.add_node(
+                 entity,
+                 type=entity_type,
+                 real_count=1 if is_real else 0,
+                 fake_count=0 if is_real else 1
+             )
+         else:
+             if is_real:
+                 knowledge_graph.nodes[entity]['real_count'] += 1
+             else:
+                 knowledge_graph.nodes[entity]['fake_count'] += 1
+
+     for i, (entity1, _) in enumerate(entities):
+         for entity2, _ in entities[i+1:]:
+             if not knowledge_graph.has_edge(entity1, entity2):
+                 knowledge_graph.add_edge(
+                     entity1,
+                     entity2,
+                     weight=1,
+                     is_real=is_real
+                 )
+             else:
+                 knowledge_graph[entity1][entity2]['weight'] += 1
+
+ def predict_with_knowledge_graph(text):
+     entities = extract_entities(text)
+     real_score = 0
+     fake_score = 0
+
+     for entity, _ in entities:
+         if knowledge_graph.has_node(entity):
+             real_count = knowledge_graph.nodes[entity].get('real_count', 0)
+             fake_count = knowledge_graph.nodes[entity].get('fake_count', 0)
+             total = real_count + fake_count
+             if total > 0:
+                 real_score += real_count / total
+                 fake_score += fake_count / total
+
+     total_score = real_score + fake_score
+     if total_score == 0:
+         return "UNCERTAIN", 50.0
+
+     if real_score > fake_score:
+         confidence = (real_score / total_score) * 100
+         return "REAL", confidence
+     else:
+         confidence = (fake_score / total_score) * 100
+         return "FAKE", confidence

  def analyze_content_gemini(model, text):
      prompt = f"""Analyze this news text and return a JSON object with the following structure:
  }
  }

+ def main():
+     st.title("📰 Nexus NLP News Classifier")
+     st.write("Enter news text below to analyze its authenticity")

+     # Query parameters for API functionality
+     query_params = st.query_params
+     if "text" in query_params:
+         text_input = query_params["text"][0]
+         ml_prediction, ml_confidence = predict_with_model(text_input)
+         st.json({"prediction": ml_prediction, "confidence": ml_confidence})
+         return

+     # Regular UI
+     news_text = st.text_area("News Text", height=200)
+
+     if st.button("Analyze"):
+         if news_text:
+             with st.spinner("Analyzing..."):
+                 # Get all predictions
+                 ml_prediction, ml_confidence = predict_with_model(news_text)
+                 kg_prediction, kg_confidence = predict_with_knowledge_graph(news_text)
+                 update_knowledge_graph(news_text, ml_prediction == "REAL")
+
+                 gemini_model = setup_gemini()
+                 gemini_result = analyze_content_gemini(gemini_model, news_text)
+
+                 # Display results
+                 col1, col2, col3 = st.columns(3)
+
+                 with col1:
+                     st.subheader("ML Model Analysis")
+                     st.metric("Prediction", ml_prediction)
+                     st.metric("Confidence", f"{ml_confidence:.2f}%")
+
+                 with col2:
+                     st.subheader("Knowledge Graph Analysis")
+                     st.metric("Prediction", kg_prediction)
+                     st.metric("Confidence", f"{kg_confidence:.2f}%")
+
+                 with col3:
+                     st.subheader("Gemini Analysis")
+                     gemini_pred = gemini_result["gemini_analysis"]["predicted_classification"]
+                     gemini_conf = gemini_result["gemini_analysis"]["confidence_score"]
+                     st.metric("Prediction", gemini_pred)
+                     st.metric("Confidence", f"{gemini_conf}%")
+
+                 with st.expander("View Detailed Analysis"):
+                     st.json(gemini_result)
+
+                 with st.expander("Named Entities"):
+                     entities = extract_entities(news_text)
+                     st.write(entities)
+
+         else:
+             st.warning("Please enter some text to analyze")

  if __name__ == "__main__":
      main()
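
To see how the new knowledge-graph scoring behaves, here is a small self-contained walk-through of the same arithmetic as predict_with_knowledge_graph above; the entities and counts are made up for illustration:

import networkx as nx

kg = nx.DiGraph()
kg.add_node("NASA", real_count=8, fake_count=2)   # hypothetical counts
kg.add_node("Mars", real_count=3, fake_count=3)

real_score = fake_score = 0.0
for entity in ["NASA", "Mars"]:                   # pretend spaCy extracted these
    r = kg.nodes[entity]["real_count"]
    f = kg.nodes[entity]["fake_count"]
    real_score += r / (r + f)                     # NASA adds 0.8, Mars adds 0.5
    fake_score += f / (r + f)                     # NASA adds 0.2, Mars adds 0.5

# real_score = 1.3, fake_score = 0.7 -> "REAL" at 1.3 / 2.0 = 65% confidence
label = "REAL" if real_score > fake_score else "FAKE"
confidence = max(real_score, fake_score) / (real_score + fake_score) * 100
print(label, round(confidence, 1))                # prints: REAL 65.0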
knowledge_graph_final.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f941e2c0b588a89f20e59aefd71c455696b291c88277672d997ea144164f70e8
+ size 10584988
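
The three lines above are a Git LFS pointer: the repository stores only the hash and size, and the ~10.6 MB pickle itself is fetched from LFS storage on checkout (or with git lfs pull). Judging from load_knowledge_graph() in final.py, the pickle holds plain dicts of node and edge attributes rather than a pickled NetworkX graph, roughly like this (the field names are those the rest of the code reads; the overall shape is inferred, not documented in the commit):

graph_data = {
    "nodes": {"NASA": {"type": "ORG", "real_count": 8, "fake_count": 2}},
    "edges": {"NASA": {"Mars": {"weight": 1, "is_real": True}}},
}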
prev_final.py ADDED
@@ -0,0 +1,142 @@
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import spacy
+ import google.generativeai as genai
+ import json
+ import os
+ import dotenv
+
+ dotenv.load_dotenv()
+
+ # Load spaCy for NER
+ nlp = spacy.load("en_core_web_sm")
+
+ # Load the trained ML model
+ model_path = "./results/checkpoint-753"  # Replace with the actual path to your model
+ # tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')
+ # tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small', use_fast=False)
+ from transformers import DebertaV2Tokenizer
+ tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-small')
+ model = AutoModelForSequenceClassification.from_pretrained(model_path)
+ model.eval()
+
+ def setup_gemini():
+     genai.configure(api_key=os.getenv("GEMINI_API"))
+     model = genai.GenerativeModel('gemini-pro')
+     return model
+
+ def predict_with_model(text):
+     """Predict whether the news is real or fake using the ML model."""
+     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
+     with torch.no_grad():
+         outputs = model(**inputs)
+     probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+     predicted_label = torch.argmax(probabilities, dim=-1).item()
+     return "FAKE" if predicted_label == 1 else "REAL"
+
+ def extract_entities(text):
+     """Extract named entities from text using spaCy."""
+     doc = nlp(text)
+     entities = [(ent.text, ent.label_) for ent in doc.ents]
+     return entities
+
+ def predict_news(text):
+     """Predict whether the news is real or fake using the ML model."""
+     # Predict with the ML model
+     prediction = predict_with_model(text)
+     return prediction
+
+ def analyze_content_gemini(model, text):
+     prompt = f"""Analyze this news text and return a JSON object with the following structure:
+     {{
+         "gemini_analysis": {{
+             "predicted_classification": "Real or Fake",
+             "confidence_score": "0-100",
+             "reasoning": ["point1", "point2"]
+         }},
+         "text_classification": {{
+             "category": "",
+             "writing_style": "Formal/Informal/Clickbait",
+             "target_audience": "",
+             "content_type": "news/opinion/editorial"
+         }},
+         "sentiment_analysis": {{
+             "primary_emotion": "",
+             "emotional_intensity": "1-10",
+             "sensationalism_level": "High/Medium/Low",
+             "bias_indicators": ["bias1", "bias2"],
+             "tone": {{"formality": "formal/informal", "style": "Professional/Emotional/Neutral"}},
+             "emotional_triggers": ["trigger1", "trigger2"]
+         }},
+         "entity_recognition": {{
+             "source_credibility": "High/Medium/Low",
+             "people": ["person1", "person2"],
+             "organizations": ["org1", "org2"],
+             "locations": ["location1", "location2"],
+             "dates": ["date1", "date2"],
+             "statistics": ["stat1", "stat2"]
+         }},
+         "context": {{
+             "main_narrative": "",
+             "supporting_elements": ["element1", "element2"],
+             "key_claims": ["claim1", "claim2"],
+             "narrative_structure": ""
+         }},
+         "fact_checking": {{
+             "verifiable_claims": ["claim1", "claim2"],
+             "evidence_present": "Yes/No",
+             "fact_check_score": "0-100"
+         }}
+     }}
+
+     Analyze this text and return only the JSON response: {text}"""
+
+     response = model.generate_content(prompt)
+     try:
+         cleaned_text = response.text.strip()
+         if cleaned_text.startswith('```json'):
+             cleaned_text = cleaned_text[7:-3]
+         return json.loads(cleaned_text)
+     except json.JSONDecodeError:
+         return {
+             "gemini_analysis": {
+                 "predicted_classification": "UNCERTAIN",
+                 "confidence_score": "50",
+                 "reasoning": ["Analysis failed to generate valid JSON"]
+             }
+         }
+
+ def clean_gemini_output(text):
+     """Remove markdown formatting from Gemini output"""
+     text = text.replace('##', '')
+     text = text.replace('**', '')
+     return text
+
+ def get_gemini_analysis(text):
+     """Get detailed content analysis from Gemini."""
+     gemini_model = setup_gemini()
+     gemini_analysis = analyze_content_gemini(gemini_model, text)
+     return gemini_analysis
+
+ def main():
+     print("Welcome to the News Classifier!")
+     print("Enter your news text below. Type 'Exit' to quit.")
+
+     while True:
+         news_text = input("\nEnter news text: ")
+
+         if news_text.lower() == 'exit':
+             print("Thank you for using the News Classifier!")
+             return
+
+         # Get ML prediction
+         prediction = predict_news(news_text)
+         print(f"\nML Analysis: {prediction}")
+
+         # Get Gemini analysis
+         print("\n=== Detailed Gemini Analysis ===")
+         gemini_result = get_gemini_analysis(news_text)
+         print(gemini_result)
+
+ if __name__ == "__main__":
+     main()
test.py CHANGED
@@ -1,14 +1,51 @@
- import requests

- # Replace with your actual Hugging Face Spaces URL
- SPACE_API_URL = "https://your-username-your-app.hf.space/?text=Breaking: Stock market crashes!"

- # Send request to Streamlit API
- response = requests.get(SPACE_API_URL)

- # Parse JSON response
- if response.status_code == 200:
-     result = response.json()
-     print(f"Prediction: {result['prediction']} (Confidence: {result['confidence']*100:.2f}%)")
- else:
-     print("Error: Could not get prediction")
+ # import requests
+ # import json
+
+ # # Replace with your actual Hugging Face Spaces URL
+ # SPACE_API_URL = "https://heheboi0769-nexus-nlp-model.hf.space//?text=Breaking: Stock market crashes!"
+
+ # # Add the text as a query parameter since the app uses st.experimental_get_query_params()
+ # text = "Breaking: Stock market crashes!"
+ # url_with_params = f"{SPACE_API_URL}?text={text}"
+
+ # # Send request to Streamlit API
+ # response = requests.get(url_with_params)
+
+ # # Parse JSON response
+ # if response.status_code == 200:
+ #     result = response.json()
+ #     print(f"Prediction: {result['prediction']} (Confidence: {result['confidence']*100:.2f}%)")
+ # else:
+ #     print("Error: Could not get prediction")
+
+ import requests
+ import urllib.parse
+
+ def test_model():
+     # Base URL for your Streamlit app
+     base_url = "https://heheboi0769-nexus-nlp-model.hf.space/api"
+
+     # Test text
+     text = "Breaking: Stock market crashes!"
+
+     # Make request to the Streamlit app's API endpoint
+     response = requests.post(
+         f"{base_url}/predict",
+         headers={
+             "Content-Type": "application/json",
+             "Authorization": "Bearer your_api_key_here"
+         },
+         json={"text": text}
+     )
+
+     # Print response for debugging
+     print(f"Status Code: {response.status_code}")
+     print(f"Response: {response.text}")
+
+     if response.status_code == 200:
+         result = response.json()
+         print(f"Prediction: {result['prediction']}")
+         print(f"Confidence: {result['confidence']*100:.2f}%")
+
+ if __name__ == "__main__":
+     test_model()