Spaces:
Sleeping
Sleeping
import pandas as pd | |
import mlflow | |
import mlflow.sklearn | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.pipeline import Pipeline | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import accuracy_score | |
# Train a TF-IDF + Multinomial Naive Bayes spam classifier on the SMS spam
# dataset, evaluate it on a held-out split, and log the result to MLflow.

# Location of the dataset CSV.
# NOTE(review): this points at a Kaggle dataset page path rather than an
# obviously raw-file URL, and Kaggle downloads normally require
# authentication -- confirm that this actually returns CSV content, or point
# it at a local copy of spam.csv instead.
DATA_URL = "https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset/spam.csv"

# Smoothing parameter for MultinomialNB. Defined once so that the value used
# for training and the value logged to MLflow cannot drift apart; change it
# here to produce a new model version.
ALPHA = 1.0

# Load data: keep only the label column (v1) and the message column (v2).
df = pd.read_csv(DATA_URL, encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'text']
# Encode the labels as integers: ham -> 0, spam -> 1.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Pipeline consisting of TF-IDF features followed by Naive Bayes.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB(alpha=ALPHA)),
])
pipeline.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)

# Record the hyperparameter, the metric and the fitted pipeline in one MLflow
# run; the model is also registered under the name "SpamClassifier".
with mlflow.start_run():
    mlflow.log_param("alpha", ALPHA)
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(pipeline, "model", registered_model_name="SpamClassifier")

print(f"Logged model with acc={acc}")