Spaces:

vanhai123
/

spam-mlflow-registry-demo

Sleeping

spam-mlflow-registry-demo / train_spam_model.py

Upload 7 files

e2e9623 verified 17 days ago

1.2 kB

	import os

	import mlflow
	import mlflow.sklearn
	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics import accuracy_score
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.pipeline import Pipeline

	# Train a TF-IDF + Multinomial Naive Bayes spam classifier on the SMS Spam
	# Collection data, evaluate it on a held-out 20% split, and log the run and
	# the fitted pipeline to MLflow under the registered model "SpamClassifier".

	# Location of the dataset CSV (column v1 holds the label, v2 the message text).
	# The Kaggle dataset page
	# (https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset) is a web
	# page rather than a raw CSV download, so it cannot be handed to pd.read_csv
	# directly. Download spam.csv from that page and place it next to this script,
	# or set SPAM_DATA_PATH to any local path or URL that serves the raw file.
	DATA_PATH = os.environ.get("SPAM_DATA_PATH", "spam.csv")

	# Smoothing parameter for Naive Bayes; change it to produce a new model version.
	# Defined once so the trained model and the logged MLflow parameter always agree.
	ALPHA = 1.0

	# Load the data and keep only the label and text columns.
	df = pd.read_csv(DATA_PATH, encoding='latin-1')[['v1', 'v2']]
	df.columns = ['label', 'text']
	# Encode labels as integers: ham -> 0, spam -> 1.
	df['label'] = df['label'].map({'ham': 0, 'spam': 1})

	# Fixed random_state keeps the train/test split reproducible across runs.
	X_train, X_test, y_train, y_test = train_test_split(
	    df['text'], df['label'], test_size=0.2, random_state=42
	)

	# Pipeline consisting of TF-IDF features followed by Naive Bayes.
	pipeline = Pipeline([
	    ('tfidf', TfidfVectorizer()),
	    ('clf', MultinomialNB(alpha=ALPHA)),
	])

	pipeline.fit(X_train, y_train)
	y_pred = pipeline.predict(X_test)
	acc = accuracy_score(y_test, y_pred)

	# Record the hyperparameter, the test accuracy and the fitted pipeline in a
	# single MLflow run; registered_model_name registers the logged model under
	# "SpamClassifier".
	with mlflow.start_run():
	    mlflow.log_param("alpha", ALPHA)
	    mlflow.log_metric("accuracy", acc)
	    mlflow.sklearn.log_model(pipeline, "model", registered_model_name="SpamClassifier")
	    print(f"Logged model with acc={acc}")