from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
import gradio as gr
from transformers import pipeline
from gradio.themes.utils.colors import red, green
import requests
import json
import os
from dotenv import load_dotenv
import time

# Load environment variables
load_dotenv()

# Initialize the NLP pipeline (sentence splitting and tokenization only)
nlp = English()
nlp.add_pipe("sentencizer")
tokenizer = nlp.tokenizer

# Initialize the text classification pipeline
detector = pipeline(task='text-classification', model='SJTU-CL/RoBERTa-large-ArguGPT-sent')

# Groq API configuration
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise ValueError("Please set your GROQ_API_KEY in the .env file")

GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
GROQ_MODEL = "llama3-70b-8192"

# Color map for highlighted text: green shades for likely human, red for likely AI
color_map = {
    '0%': green.c400,
    '10%': green.c300,
    '20%': green.c200,
    '30%': green.c100,
    '40%': green.c50,
    '50%': red.c50,
    '60%': red.c100,
    '70%': red.c200,
    '80%': red.c300,
    '90%': red.c400,
    '100%': red.c500,
}


def is_stopword(word):
    """Check if a word is a stop word or very short."""
    return word.lower() in STOP_WORDS or len(word) <= 2


def get_synonyms(word):
    """Get simple, human-readable synonyms using the Groq API."""
    if is_stopword(word):
        return [word]  # Don't fetch synonyms for stop words

    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    # JSON mode returns an object, so ask for {"synonyms": [...]} rather than a bare array
    prompt = (
        f"Provide exactly 5 simple synonyms for '{word}'. "
        'Return ONLY a JSON object like {"synonyms": ["use", "employ", "apply", "make use of", "take"]} '
        "without any additional text."
    )
    data = {
        "model": GROQ_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
        "max_tokens": 100,
        "response_format": {"type": "json_object"},
    }
    try:
        response = requests.post(GROQ_API_URL, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        result = response.json()
        content = json.loads(result['choices'][0]['message']['content'])
        # The model may return a bare list, or wrap the list under different keys
        if isinstance(content, list):
            return content[:5]
        if isinstance(content, dict):
            for key in ['synonyms', 'words', 'alternatives']:
                if key in content and isinstance(content[key], list):
                    return content[key][:5]
        return [word]  # Fallback if parsing fails
    except Exception as e:
        print(f"Error getting synonyms: {e}")
        return [word]  # Fallback to the original word if the API fails
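# Usage sketch for get_synonyms (hypothetical output; the actual synonyms
# depend on the model's response):
#   >>> get_synonyms("utilize")
#   ['use', 'employ', 'apply', 'make use of', 'take']
#   >>> get_synonyms("the")   # stop words come back unchanged
#   ['the']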
def identify_problem_words(text):
    """Use the Groq API to identify uncommon, difficult, and AI-sounding words."""
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    prompt = f"""Analyze this text and return ONLY a JSON list of words that are:
1. Uncommon (not in everyday vocabulary)
2. Difficult (complex or technical)
3. Likely AI-generated (overly formal, verbose, or unnatural)
Exclude all stop words (a, an, the, and, but, etc.) and very short words (1-2 letters).
Return format: {{"words": ["word1", "word2", ...]}}
Text: {text}"""
    data = {
        "model": GROQ_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
        "max_tokens": 200,
        "response_format": {"type": "json_object"},
    }
    try:
        response = requests.post(GROQ_API_URL, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        result = response.json()
        content = json.loads(result['choices'][0]['message']['content'])
        if isinstance(content, dict) and 'words' in content:
            # Filter out stop words that slipped through, and lowercase the set
            # so the case-insensitive lookup in predict_word works
            return {word.lower() for word in content['words'] if not is_stopword(word)}
        return set()
    except Exception as e:
        print(f"Error identifying problem words: {e}")
        return set()


def prob_to_label(prob):
    """Map a probability to the 10% bucket label used by color_map."""
    for i in range(10):
        if prob < (i + 1) / 10:
            return f'{i * 10}%'
    return '100%'


def predict_word(word, problem_words):
    """Predict AI probability for a single word, but only for problem words."""
    if len(word) <= 3 or word.lower() not in problem_words or is_stopword(word):
        return 0.0
    try:
        return predict_one_sent(word)
    except Exception:
        return 0.0


def predict_doc(doc):
    start_time = time.time()

    # First identify problem words using Groq
    problem_words = identify_problem_words(doc)
    print(f"Identified problem words: {problem_words}")

    sents = [s.text for s in nlp(doc).sents]
    data = {'sentence': [], 'label': [], 'score': []}
    sent_res = []
    word_highlights = []

    for sent in sents:
        sent_prob = predict_one_sent(sent)

        # Word-level analysis, restricted to the identified problem words
        tokens = [token.text for token in tokenizer(sent)]
        word_probs = [predict_word(token, problem_words) for token in tokens]
        for word, prob in zip(tokens, word_probs):
            # Only highlight words with at least 20% AI probability
            label = prob_to_label(prob) if prob >= 0.2 else None
            word_highlights.append((word, label))

        data['sentence'].append(sent)
        data['score'].append(round(sent_prob, 4))
        data['label'].append('Human' if sent_prob <= 0.5 else 'Machine')
        sent_res.append((sent, prob_to_label(sent_prob)))

    df = pd.DataFrame(data)
    csv_path = 'result.csv'
    df.to_csv(csv_path, index=False)
    print(f"Analysis took {time.time() - start_time:.2f} seconds")

    overall_score = df.score.mean()
    overall_label = 'Human' if overall_score <= 0.5 else 'Machine'
    sum_str = (f'The essay is probably written by {overall_label}. '
               f'The probability of being generated by AI is {overall_score:.2f}')
    return sum_str, sent_res, df, csv_path, word_highlights
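# Shape of predict_doc's return values (illustrative values, not real model output):
#   sum_str         -> 'The essay is probably written by Human. The probability ...'
#   sent_res        -> [('First sentence.', '10%'), ('Second sentence.', '60%'), ...]
#   df / csv_path   -> per-sentence DataFrame and the path of its CSV export
#   word_highlights -> [('word', '40%'), ('the', None), ...]
# sent_res and word_highlights feed gr.HighlightedText components via color_map.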
def predict_one_sent(sent):
    """Return the probability that a sentence is machine-written.

    The detector's LABEL_0 is the human class, so its score is inverted.
    """
    res = detector(sent)[0]
    org_label, prob = res['label'], res['score']
    if org_label == 'LABEL_0':
        prob = 1 - prob
    return prob


def update_text(text, selected_word, replacement, word_highlights):
    """Replace the first occurrence of selected_word and update the highlights."""
    new_text = text.replace(selected_word, replacement, 1)
    # Mark the replacement word as human-written ('0%') in the highlights
    updated_highlights = []
    replaced = False
    for word, label in word_highlights:
        if word == selected_word and not replaced:
            updated_highlights.append((replacement, '0%'))
            replaced = True
        else:
            updated_highlights.append((word, label))
    return new_text, updated_highlights


def process_word_highlights(highlights):
    return highlights


# Custom CSS for a modern look
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.gradio-header {
    background-color: #4CAF50;
    color: white;
    padding: 10px;
    text-align: center;
    border-radius: 8px;
    margin-bottom: 20px;
}
.gradio-button {
    background-color: #4CAF50;
    color: white;
    border: none;
    padding: 10px 20px;
    text-align: center;
    text-decoration: none;
    display: inline-block;
    font-size: 16px;
    margin: 4px 2px;
    cursor: pointer;
    border-radius: 5px;
    transition: background-color 0.3s;
}
.gradio-button:hover {
    background-color: #45a049;
}
.highlighted-word {
    cursor: pointer;
    padding: 2px 4px;
    border-radius: 3px;
    transition: all 0.2s;
}
.highlighted-word:hover {
    text-decoration: underline;
    background-color: #f0f0f0;
    transform: scale(1.05);
}
.replacement-row {
    border: 1px solid #ddd;
    padding: 15px;
    border-radius: 8px;
    margin-top: 10px;
    background-color: #f9f9f9;
}
"""

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("""## AI vs Human Essay Detector""")
    gr.Markdown("""Identify and replace uncommon, difficult, and AI-generated words in your text.""")

    word_highlights = gr.State([])
    selected_word = gr.State("")

    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(
                lines=10,
                label='Essay Input',
                placeholder="Paste your essay here...",
                elem_classes=["text-input"]
            )
            btn = gr.Button('Analyze Text', variant="primary")
        with gr.Column():
            sent_res = gr.HighlightedText(
                label='Sentence-level Analysis',
                color_map=color_map,
                show_legend=True
            )
            word_res = gr.HighlightedText(
                label='Word-level Analysis (Click words to replace)',
                color_map=color_map,
                show_legend=True
            )

    with gr.Row():
        summary = gr.Textbox(label='Overall Analysis', interactive=False)
        csv_f = gr.File(label='Download Detailed Analysis')

    with gr.Row():
        tab = gr.Dataframe(
            label='Detailed Sentence Analysis',
            wrap=True,
            max_rows=10
        )

    with gr.Column(visible=False) as replacement_row:
        gr.Markdown("### Replace Word")
        with gr.Row():
            replacement_dropdown = gr.Dropdown(
                label="Select replacement",
                interactive=True,
                allow_custom_value=True
            )
        with gr.Row():
            replace_btn = gr.Button("Replace", variant="primary")
            cancel_btn = gr.Button("Cancel")

    def on_word_select(evt: gr.SelectData):
        if evt.value:
            synonyms = get_synonyms(evt.value)
            return (
                evt.value,
                gr.Dropdown(choices=synonyms, value=evt.value),
                gr.Column(visible=True)
            )
        return None, None, gr.Column(visible=False)

    word_res.select(
        fn=on_word_select,
        outputs=[selected_word, replacement_dropdown, replacement_row]
    )

    # After a replacement: hide the panel, re-analyze the edited text,
    # then refresh the word-level highlights from the updated state
    replace_btn.click(
        fn=update_text,
        inputs=[text_in, selected_word, replacement_dropdown, word_highlights],
        outputs=[text_in, word_highlights]
    ).then(
        fn=lambda: gr.Column(visible=False),
        outputs=replacement_row
    ).then(
        fn=predict_doc,
        inputs=text_in,
        outputs=[summary, sent_res, tab, csv_f, word_highlights]
    ).then(
        fn=process_word_highlights,
        inputs=word_highlights,
        outputs=word_res
    )
    cancel_btn.click(
        fn=lambda: gr.Column(visible=False),
        outputs=replacement_row
    )

    btn.click(
        fn=predict_doc,
        inputs=text_in,
        outputs=[summary, sent_res, tab, csv_f, word_highlights]
    ).then(
        fn=process_word_highlights,
        inputs=word_highlights,
        outputs=word_res
    )

if __name__ == "__main__":
    demo.launch()
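# Running the app (assumed filename app.py; adjust to the actual script name):
#   1. Create a .env file next to this script containing:
#        GROQ_API_KEY=your-groq-api-key
#   2. pip install spacy pandas gradio transformers torch requests python-dotenv
#   3. python app.py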