JaishnaCodz committed on
Commit
3ee9037
·
verified ·
1 Parent(s): 4f4b697

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -85
app.py CHANGED
@@ -8,11 +8,26 @@ import nltk
8
  from nltk.tokenize import sent_tokenize
9
  from autogen import AssistantAgent, UserProxyAgent
10
 
11
- nltk.download('punkt')
 
 
 
 
 
 
12
 
13
  # Initialize Hugging Face models
14
- toxicity_classifier = pipeline("text-classification", model="unitary/toxic-bert")
15
- summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-6-6")
 
 
 
 
 
 
 
 
 
16
 
17
  # Define AutoGen Agents (for modularity, but we'll call functions directly)
18
  user_proxy = UserProxyAgent(
@@ -49,19 +64,23 @@ coordinator_agent = AssistantAgent(
49
 
50
  # Task functions
51
  def extract_text(input_type, text_input, url_input):
52
- if input_type == "URL" and url_input:
53
- try:
54
  article = Article(url_input)
55
  article.download()
56
  article.parse()
 
57
  return article.text
58
- except Exception as e:
59
- return f"Error fetching URL: {str(e)}"
60
- return text_input
 
 
61
 
62
  def check_grammar(text):
63
  try:
64
  grammar_tool = language_tool_python.LanguageToolPublicAPI('en-US')
 
65
  matches = grammar_tool.check(text)
66
  return [
67
  {
@@ -73,103 +92,123 @@ def check_grammar(text):
73
  } for match in matches
74
  ]
75
  except Exception as e:
 
76
  return [{"text": "", "error": f"Grammar check failed: {str(e)}", "suggestions": [], "offset": 0, "length": 0}]
77
 
78
  def detect_sensitive_content(text):
79
- sentences = sent_tokenize(text)
80
- sensitive_issues = []
81
- for i, sentence in enumerate(sentences):
82
- result = toxicity_classifier(sentence)
83
- if result[0]['label'] == 'toxic' and result[0]['score'] > 0.7:
84
- sensitive_issues.append({
85
- "sentence": sentence,
86
- "score": result[0]['score'],
87
- "index": i
88
- })
89
- return sensitive_issues
 
 
 
 
 
90
 
91
  def generate_suggestions(text, grammar_issues, sensitive_issues):
92
- suggestions = []
93
- for issue in grammar_issues:
94
- if issue['suggestions']:
95
- suggestions.append(f"Replace '{issue['text']}' with '{issue['suggestions'][0]}' ({issue['error']})")
96
- for issue in sensitive_issues:
97
- try:
98
  summary = summarizer(issue['sentence'], max_length=50, min_length=10, do_sample=False)[0]['summary_text']
99
  suggestions.append(f"Rephrase sensitive content '{issue['sentence']}' to: '{summary}' (Toxicity score: {issue['score']:.2f})")
100
- except Exception as e:
101
- suggestions.append(f"Failed to rephrase '{issue['sentence']}': {str(e)}")
102
- return suggestions
 
 
103
 
104
  def highlight_text(text, grammar_issues, sensitive_issues):
105
- highlighted = text
106
- offset_adjust = 0
107
- for issue in grammar_issues:
108
- start = issue['offset'] + offset_adjust
109
- end = start + issue['length']
110
- error_text = highlighted[start:end]
111
- highlighted = highlighted[:start] + f"<span style='background-color: yellow'>{error_text}</span>" + highlighted[end:]
112
- offset_adjust += len("<span style='background-color: yellow'>") + len("</span>")
113
- sentences = sent_tokenize(text)
114
- offset_adjust = 0
115
- for issue in sensitive_issues:
116
- sentence = issue['sentence']
117
- start = highlighted.find(sentence, offset_adjust)
118
- if start != -1:
119
- end = start + len(sentence)
120
- highlighted = highlighted[:start] + f"<span style='background-color: red'>{sentence}</span>" + highlighted[end:]
121
- offset_adjust = end
122
- return highlighted
 
 
 
 
123
 
124
  # Main function to process input
125
  def review_blog(input_type, text_input, url_input):
126
- if not text_input and not url_input:
127
- return "Please provide text or a URL.", "", []
 
128
 
129
- # Step 1: Text Extraction
130
- text = extract_text(input_type, text_input, url_input)
131
- print(f"Processed text: {text}") # Debug print to check text processing
132
- if text.startswith("Error"):
133
- return text, "", []
134
 
135
- # Step 2: Grammar Check
136
- grammar_issues = check_grammar(text)
137
 
138
- # Step 3: Sensitive Content Detection
139
- sensitive_issues = detect_sensitive_content(text)
140
 
141
- # Step 4: Generate Suggestions
142
- suggestions = generate_suggestions(text, grammar_issues, sensitive_issues)
143
 
144
- # Step 5: Coordinate Output
145
- highlighted_text = highlight_text(text, grammar_issues, sensitive_issues)
146
- suggestions_text = "\n".join([f"{i+1}. {sug}" for i, sug in enumerate(suggestions)])
147
 
148
- return highlighted_text, suggestions_text, suggestions
 
 
 
149
 
150
  def apply_changes(text, suggestions, approved_indices):
151
- sentences = sent_tokenize(text)
152
- for idx in approved_indices.split(','):
153
- try:
154
- idx = int(idx.strip()) - 1
155
- if idx < len(suggestions):
156
- suggestion = suggestions[idx]
157
- match = re.search(r"'([^']+)'$", suggestion)
158
- if match:
159
- new_text = match.group(1)
160
- if "Rephrase sensitive content" in suggestion:
161
- orig_match = re.search(r"'([^']+)'\s+to:", suggestion)
162
- if orig_match:
163
- orig_sentence = orig_match.group(1)
164
- text = text.replace(orig_sentence, new_text)
165
- else:
166
- orig_match = re.search(r"Replace '([^']+)'\s+with\s+'([^']+)'", suggestion)
167
- if orig_match:
168
- orig_text = orig_match.group(1)
169
- text = text.replace(orig_text, new_text)
170
- except ValueError:
171
- continue # Skip invalid indices
172
- return text
 
 
 
 
173
 
174
  # Gradio interface
175
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
8
  from nltk.tokenize import sent_tokenize
9
  from autogen import AssistantAgent, UserProxyAgent
10
 
# Download required NLTK data at startup.
# nltk.download() signals a failed download through its boolean return value
# instead of raising, so the result is checked before reporting success.
try:
    punkt_ok = nltk.download('punkt')
    punkt_tab_ok = nltk.download('punkt_tab')  # Ensure language-specific data is downloaded
    if punkt_ok and punkt_tab_ok:
        print("NLTK data (punkt and punkt_tab) downloaded successfully.")
    else:
        print("Error downloading NLTK data: download reported failure.")
except Exception as e:
    print(f"Error downloading NLTK data: {str(e)}")

# Initialize Hugging Face models
# Both names are bound to None first so that a failed load leaves a defined
# global rather than an unbound name for the task functions to trip over.
toxicity_classifier = None
summarizer = None

try:
    toxicity_classifier = pipeline("text-classification", model="unitary/toxic-bert")
    print("Toxicity classifier loaded successfully.")
except Exception as e:
    print(f"Error loading toxicity classifier: {str(e)}")

try:
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-6-6")
    print("Summarizer loaded successfully.")
except Exception as e:
    print(f"Error loading summarizer: {str(e)}")
31
 
32
  # Define AutoGen Agents (for modularity, but we'll call functions directly)
33
  user_proxy = UserProxyAgent(
 
64
 
65
  # Task functions
def extract_text(input_type, text_input, url_input):
    """Return the text to review, fetched from a URL or taken from the text box.

    Args:
        input_type: Source selector; the URL branch runs only when this is
            "URL" and ``url_input`` is non-empty.
        text_input: Text supplied directly by the user. ``None`` is treated
            as an empty string.
        url_input: Article URL to download and parse.

    Returns:
        The parsed article text or the supplied text. On any failure a message
        starting with "Error fetching URL or processing text:" is returned
        instead of raising.
    """
    try:
        if input_type == "URL" and url_input:
            article = Article(url_input)
            article.download()
            article.parse()
            print(f"Extracted text from URL: {url_input[:50]}...")  # Debug
            return article.text
        # A missing text value must not crash the debug slice below and be
        # misreported as a URL fetch error.
        text = text_input if text_input is not None else ""
        print(f"Using input text: {text[:50]}...")  # Debug
        return text
    except Exception as e:
        print(f"Error in extract_text: {str(e)}")
        return f"Error fetching URL or processing text: {str(e)}"
79
 
80
  def check_grammar(text):
81
  try:
82
  grammar_tool = language_tool_python.LanguageToolPublicAPI('en-US')
83
+ print("Checking grammar...") # Debug
84
  matches = grammar_tool.check(text)
85
  return [
86
  {
 
92
  } for match in matches
93
  ]
94
  except Exception as e:
95
+ print(f"Grammar check failed: {str(e)}")
96
  return [{"text": "", "error": f"Grammar check failed: {str(e)}", "suggestions": [], "offset": 0, "length": 0}]
97
 
def detect_sensitive_content(text):
    """Find sentences that the toxicity classifier labels as toxic.

    The text is split with ``sent_tokenize`` and each sentence is passed to
    the module-level ``toxicity_classifier``. A sentence is reported when the
    first result has label 'toxic' and a score above 0.7.

    Args:
        text: The text to scan.

    Returns:
        A list of dicts with keys "sentence", "score" and "index" (position of
        the sentence in the tokenized list). Returns [] when tokenization
        fails.
    """
    try:
        sentences = sent_tokenize(text)
    except Exception as e:
        print(f"Error in detect_sensitive_content: {str(e)}")
        return []

    sensitive_issues = []
    for i, sentence in enumerate(sentences):
        # Failures are handled per sentence so that one bad input does not
        # throw away the findings for every other sentence.
        try:
            # truncation=True is forwarded to the tokenizer so an over-long
            # sentence is clipped instead of raising.
            result = toxicity_classifier(sentence, truncation=True)
            top = result[0]
            if top['label'] == 'toxic' and top['score'] > 0.7:
                sensitive_issues.append({
                    "sentence": sentence,
                    "score": top['score'],
                    "index": i
                })
        except Exception as e:
            print(f"Error in detect_sensitive_content: {str(e)}")
    print(f"Detected {len(sensitive_issues)} sensitive issues.")  # Debug
    return sensitive_issues
115
 
def generate_suggestions(text, grammar_issues, sensitive_issues):
    """Build human-readable suggestions from grammar and sensitivity findings.

    Args:
        text: The reviewed text (not used; kept for interface compatibility).
        grammar_issues: Dicts with at least "text", "suggestions" and "error".
            Entries with no suggestions are skipped.
        sensitive_issues: Dicts with "sentence" and "score". Each sentence is
            passed to the module-level ``summarizer`` to produce a rephrasing.

    Returns:
        A list of suggestion strings. A sentence that cannot be rephrased
        yields a "Failed to rephrase ..." entry rather than aborting the run.
        Returns [] only on an unexpected error outside the summarizer call.
    """
    try:
        suggestions = [
            f"Replace '{issue['text']}' with '{issue['suggestions'][0]}' ({issue['error']})"
            for issue in grammar_issues
            if issue['suggestions']
        ]
        for issue in sensitive_issues:
            # Summarization is guarded per issue: one failure must not discard
            # the grammar suggestions or the other rephrasings.
            try:
                summary = summarizer(issue['sentence'], max_length=50, min_length=10, do_sample=False)[0]['summary_text']
                suggestions.append(f"Rephrase sensitive content '{issue['sentence']}' to: '{summary}' (Toxicity score: {issue['score']:.2f})")
            except Exception as e:
                suggestions.append(f"Failed to rephrase '{issue['sentence']}': {str(e)}")
        print(f"Generated {len(suggestions)} suggestions.")  # Debug
        return suggestions
    except Exception as e:
        print(f"Error in generate_suggestions: {str(e)}")
        return []
130
 
def highlight_text(text, grammar_issues, sensitive_issues):
    """Return ``text`` as HTML with grammar and sensitive spans highlighted.

    All spans are located in the original text first and the markup is built
    in a single pass, so the result does not depend on the order of the
    issues, and a sensitive sentence is still found when it contains a
    grammar match. Grammar matches are yellow, sensitive sentences red; where
    both apply, the grammar match is shown yellow.

    Args:
        text: The reviewed text.
        grammar_issues: Dicts with integer "offset" and "length" into ``text``.
            Zero-length or out-of-range entries are ignored.
        sensitive_issues: Dicts with a "sentence" string taken from ``text``.

    Returns:
        An HTML string. The text itself is HTML-escaped so that characters
        such as ``<`` in the input cannot break or inject markup. On an
        unexpected error the escaped text is returned without highlights.
    """
    import html  # Local import: only this function needs it.

    yellow_open = "<span style='background-color: yellow'>"
    red_open = "<span style='background-color: red'>"
    span_close = "</span>"
    try:
        length = len(text)

        grammar_spans = []
        for issue in grammar_issues:
            start = max(0, issue['offset'])
            end = min(length, start + issue['length'])
            if start < end:
                grammar_spans.append((start, end))

        # Sentences are searched left to right so repeated sentences map to
        # successive occurrences.
        sensitive_spans = []
        cursor = 0
        for issue in sensitive_issues:
            sentence = issue['sentence']
            if not sentence:
                continue
            start = text.find(sentence, cursor)
            if start == -1:
                continue
            end = start + len(sentence)
            sensitive_spans.append((start, end))
            cursor = end

        # Cut the text at every span boundary; each resulting segment lies
        # entirely inside or entirely outside any given span.
        cuts = {0, length}
        for start, end in grammar_spans + sensitive_spans:
            cuts.update((start, end))
        cuts = sorted(cuts)

        pieces = []
        for left, right in zip(cuts, cuts[1:]):
            segment = html.escape(text[left:right], quote=False)
            if any(s <= left and right <= e for s, e in grammar_spans):
                pieces.append(f"{yellow_open}{segment}{span_close}")
            elif any(s <= left and right <= e for s, e in sensitive_spans):
                pieces.append(f"{red_open}{segment}{span_close}")
            else:
                pieces.append(segment)
        return "".join(pieces)
    except Exception as e:
        print(f"Error in highlight_text: {str(e)}")
        return html.escape(text, quote=False) if isinstance(text, str) else text
154
 
155
  # Main function to process input
def review_blog(input_type, text_input, url_input):
    """Run the full review pipeline on the supplied text or URL.

    Steps: extract the text, check grammar, detect sensitive content, build
    suggestions, then produce the highlighted text.

    Args:
        input_type: Source selector passed through to ``extract_text``.
        text_input: Text supplied directly by the user.
        url_input: URL of an article to fetch.

    Returns:
        A tuple ``(highlighted_text, suggestions_text, suggestions)`` where
        ``suggestions_text`` is the suggestions numbered from 1, one per line.
        When the input is missing, extraction fails, or an unexpected error
        occurs, the first element carries a message and the others are "" and
        [].
    """
    try:
        if not text_input and not url_input:
            return "Please provide text or a URL.", "", []

        # Step 1: Text Extraction
        text = extract_text(input_type, text_input, url_input)
        if not text or not text.strip():
            # Nothing to review (e.g. empty text box with a non-URL input type).
            return "No text could be extracted from the input.", "", []
        print(f"Processed text: {text[:50]}...")  # Debug
        # Match the exact prefix extract_text uses for failures; a bare
        # "Error" check would reject legitimate text that starts with that
        # word.
        if text.startswith("Error fetching URL or processing text:"):
            return text, "", []

        # Step 2: Grammar Check
        grammar_issues = check_grammar(text)

        # Step 3: Sensitive Content Detection
        sensitive_issues = detect_sensitive_content(text)

        # Step 4: Generate Suggestions
        suggestions = generate_suggestions(text, grammar_issues, sensitive_issues)

        # Step 5: Coordinate Output
        highlighted_text = highlight_text(text, grammar_issues, sensitive_issues)
        suggestions_text = "\n".join(f"{i}. {sug}" for i, sug in enumerate(suggestions, start=1))

        return highlighted_text, suggestions_text, suggestions
    except Exception as e:
        print(f"Error in review_blog: {str(e)}")
        return f"Error processing input: {str(e)}", "", []
184
 
def apply_changes(text, suggestions, approved_indices):
    """Apply the approved suggestions to ``text``.

    Suggestions are parsed from the two formats of the form
    "Replace '<old>' with '<new>' (<reason>)" and
    "Rephrase sensitive content '<old>' to: '<new>' (Toxicity score: <n>)".
    Anything else (for example a "Failed to rephrase ..." entry) is ignored.

    Args:
        text: The text to modify.
        suggestions: List of suggestion strings.
        approved_indices: Comma-separated 1-based indices into ``suggestions``.
            Non-numeric and out-of-range entries are skipped; ``None`` means
            nothing is approved.

    Returns:
        The text with every occurrence of each approved "<old>" replaced by
        its "<new>". The suggestion string carries no position, so the
        replacement cannot be limited to the flagged occurrence. On an
        unexpected error the text as modified so far is returned.
    """
    # Both patterns anchor on the fixed wording around the quoted parts rather
    # than on the quotes alone, so apostrophes inside the text still parse.
    replace_pattern = r"^Replace '(.*?)' with '(.*?)' \(.*\)$"
    rephrase_pattern = r"^Rephrase sensitive content '(.*)' to: '(.*)' \(Toxicity score: [^()]*\)$"
    try:
        for raw_idx in (approved_indices or "").split(','):
            try:
                idx = int(raw_idx.strip()) - 1
            except ValueError:
                continue  # Skip invalid indices
            # Reject 0 and negatives explicitly: a negative index would
            # silently wrap around to the end of the list.
            if not 0 <= idx < len(suggestions):
                continue
            suggestion = suggestions[idx]
            parsed = (
                re.match(rephrase_pattern, suggestion, re.DOTALL)
                or re.match(replace_pattern, suggestion, re.DOTALL)
            )
            if not parsed:
                continue
            original, replacement = parsed.groups()
            # str.replace with an empty needle would insert the replacement
            # between every character.
            if original:
                text = text.replace(original, replacement)
        return text
    except Exception as e:
        print(f"Error in apply_changes: {str(e)}")
        return text
212
 
213
  # Gradio interface
214
  with gr.Blocks(theme=gr.themes.Soft()) as demo: