"""Streamlit demo app: classify an SMS message as spam or ham.

The script downloads a labelled SMS dataset, fits a TF-IDF + multinomial
naive Bayes model on it, and exposes a small text-area UI for predictions.
"""

import re
import string

import pandas as pd
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split  # not used in this script
from sklearn.naive_bayes import MultinomialNB

# Page config
st.set_page_config(page_title="SMS Spam Detector", layout="centered")
st.title("📩 SMS Spam Detection App")
st.markdown("🔍 Enter a message below to check if it's **Spam** or **Not Spam (Ham)**")

# --- Load dataset ---
csv_url = "https://huggingface.co/spaces/MLDeveloper/Spam_SMS_Detection/resolve/main/spam.csv"

try:
    df = pd.read_csv(csv_url, encoding='latin-1')[['v1', 'v2']]
    df.columns = ['label', 'message']
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})
    # Labels other than ham/spam map to NaN and a missing message cannot be
    # cleaned; either would break training, so drop those rows up front.
    df = df.dropna(subset=['label', 'message'])
    df['label'] = df['label'].astype(int)
except Exception as e:
    # Top-level boundary: report the failure in the UI and halt the script.
    st.error(f"Failed to load CSV: {e}")
    st.stop()

# --- Text Cleaning Function ---
# Patterns are compiled once here because clean_text runs for every row.
_URL_RE = re.compile(r"http\S+|www\S+")
_MENTION_HASH_RE = re.compile(r"@\w+|#")
_NON_WORD_RE = re.compile(r"[^\w\s]")
_DIGITS_RE = re.compile(r"\d+")
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text: str) -> str:
    """Normalise a raw message for vectorisation.

    The text is lower-cased, then URLs, ``@mentions``, ``#`` signs,
    punctuation, digits and underscores are removed, and surrounding
    whitespace is stripped.

    Args:
        text: The raw message.

    Returns:
        The cleaned message (possibly an empty string).
    """
    text = text.lower()
    text = _URL_RE.sub('', text)
    # Mentions must go before the generic punctuation pass, otherwise only
    # the "@" would be removed and the handle itself would remain.
    text = _MENTION_HASH_RE.sub('', text)
    text = _NON_WORD_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)
    # "_" counts as a word character for the regex above, so it is still
    # present here; the translate table removes it.
    text = text.translate(_PUNCT_TABLE)
    return text.strip()


df['cleaned'] = df['message'].apply(clean_text)

# --- Train model ---
# NOTE: Streamlit re-executes the script on every interaction, so the model
# is refitted each time; the note shown at the bottom of the page tells the
# user that it is not saved for reuse.
X = df['cleaned']
y = df['label']

vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

model = MultinomialNB()
model.fit(X_vec, y)


# --- Prediction Function ---
def predict_spam(message: str) -> str:
    """Classify a single message with the fitted model.

    Args:
        message: The raw message text entered by the user.

    Returns:
        ``"Spam"`` if the model predicts label 1, otherwise ``"Not Spam"``.
    """
    cleaned = clean_text(message)
    vector = vectorizer.transform([cleaned])
    prediction = model.predict(vector)
    return "Spam" if prediction[0] == 1 else "Not Spam"


# --- UI ---
user_input = st.text_area("✉️ Enter your SMS message here:")

if st.button("Check Message"):
    if not user_input.strip():
        st.warning("⚠️ Please enter a valid message.")
    else:
        result = predict_spam(user_input)
        if result == "Spam":
            st.error("🚫 This message is classified as **SPAM**.")
        else:
            st.success("✅ This message is classified as **NOT SPAM (HAM)**.")

# Optional: View CSV
with st.expander("📄 View sample dataset (CSV)"):
    st.dataframe(df[['label', 'message']].head())

st.markdown("---")
st.markdown("🔒 **Note**: Model is trained in real-time from CSV and not saved for reuse. Ideal for demo purposes.")