|
import pandas as pd |
|
from datasets import Dataset |
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer |
|
import torch |
|
import streamlit as st |
|
|
|
def fine_tune_model(uploaded_file):
    """Render the Streamlit fine-tuning demo for an uploaded CSV file.

    The CSV is read with ``pd.read_csv``, its first rows are previewed, and
    the user picks the text column (when no ``text`` column exists) and a
    model. When the "Fine-tune Model" button is pressed, the model and
    tokenizer are loaded and the text column is tokenized. No training is
    run yet: the function only reports that the demo started.

    Args:
        uploaded_file: Anything ``pd.read_csv`` accepts (path or file-like
            object holding CSV data).

    Returns:
        None. All results are reported through Streamlit widgets.
    """
    df = pd.read_csv(uploaded_file)
    st.subheader("Dataset Preview")
    st.write(df.head())

    text_column = "text"
    if "text" not in df.columns:
        st.warning("No 'text' column found. Please select the column to use for fine-tuning.")
        text_column = st.selectbox("Select the column containing text data", df.columns)

    model_name = st.selectbox("Select model for fine-tuning", ["distilbert-base-uncased"])

    # Guard clauses: nothing else to do until the button is pressed.
    if not st.button("Fine-tune Model"):
        return
    if not model_name:
        st.warning("Please select a model for fine-tuning.")
        return

    try:
        # The dataset is built only after the button press so that the
        # conversion is not repeated on every rerun of the page, and so that
        # conversion failures are reported through st.error below.
        dataset = Dataset.from_pandas(_prepare_text_frame(df, text_column))

        # NOTE: the model is loaded but not used further yet; this demo
        # stops after tokenization.
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        def preprocess_function(examples):
            """Tokenize one batch of rows from the ``text`` column."""
            return tokenizer(examples["text"], truncation=True, padding=True)

        tokenized_datasets = dataset.map(preprocess_function, batched=True)

        # NOTE: these settings are not passed to any trainer yet; they only
        # record the intended hyperparameters for the demo.
        train_args = {
            "output_dir": "./results",
            "num_train_epochs": 3,
            "per_device_train_batch_size": 16,
            "logging_dir": "./logs",
        }

        st.success("Fine-tuning started (demo)!")
    except Exception as e:
        # UI boundary: show the failure to the user instead of a traceback.
        st.error(f"Error during fine-tuning: {e}")


def _prepare_text_frame(df, text_column):
    """Return a copy of *df* whose ``text`` column holds only real strings.

    Rows whose *text_column* value is missing are dropped and the remaining
    values are cast to ``str``, because the tokenizer rejects non-string
    inputs such as NaN or numbers.

    Args:
        df: Source DataFrame; it is not modified.
        text_column: Name of the column that holds the text.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and a ``text`` column.

    Raises:
        ValueError: If no rows remain after dropping missing values.
    """
    # reset_index keeps the dropped-row index from leaking into the dataset
    # as an extra column.
    cleaned = df.dropna(subset=[text_column]).reset_index(drop=True)
    if cleaned.empty:
        raise ValueError(f"Column '{text_column}' contains no usable text rows.")
    cleaned["text"] = cleaned[text_column].astype(str)
    return cleaned
|
|