import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import AutoModel, AutoTokenizer
# Note: on newer Keras versions pad_sequences lives at keras.utils.pad_sequences
from keras.preprocessing.sequence import pad_sequences
import json
# from vncorenlp import VnCoreNLP
from underthesea import word_tokenize
# Read the data
def get_data(all_path):
    sentences = []
    labels = []
    for i in all_path:
        try:
            with open(i, "r", encoding='utf-8') as f:
                datastore = json.load(f)
        except FileNotFoundError:
            print(f"Error: File {i} not found")
            continue
        except json.JSONDecodeError:
            print(f"Error: File {i} contains invalid JSON")
            continue
        for item in datastore:
            sentences.append(item["sentences"])
            labels.append(item["toxic"])
    return sentences, labels
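# Expected JSON layout, assumed from the keys accessed above (illustrative only):
# [
#     {"sentences": "một câu ví dụ", "toxic": 1},
#     {"sentences": "một câu bình thường", "toxic": 0}
# ]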
# Vietnamese word segmentation
# rdrsegmenter = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
# def sentences_segment(sentences):
#     for i in range(len(sentences)):
#         tokens = rdrsegmenter.tokenize(sentences[i])
#         statement = ""
#         for token in tokens:
#             statement += " ".join(token)
#         sentences[i] = statement
def sentences_segment(sentences):
    for i in range(len(sentences)):
        # word_tokenize returns the words as a single space-separated string
        sentences[i] = word_tokenize(sentences[i], format="text")
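# Illustrative output (example adapted from underthesea's documentation; exact
# segmentation may differ across versions). Multiword tokens are joined with underscores:
# word_tokenize("Chàng trai khởi nghiệp từ nấm sò", format="text")
# -> "Chàng_trai khởi_nghiệp từ nấm_sò"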
# Encode the sentences into token IDs and pad the sequences to length maxlen
phobert = AutoModel.from_pretrained('vinai/phobert-base')
tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')
def shuffle_and_tokenize(sentences, labels, maxlen):
    sentences, labels = shuffle(sentences, labels)
    # Truncate so no sequence exceeds PhoBERT's position limit
    sequences = [tokenizer.encode(i, truncation=True, max_length=maxlen) for i in sentences]
    labels = [int(i) for i in labels]
    # Pad on the right with PhoBERT's real pad ID so <s> stays the first token;
    # the pad_sequences default value 0 is PhoBERT's <s> token, not its pad token
    padded = pad_sequences(sequences, maxlen=maxlen, padding="post", value=tokenizer.pad_token_id)
    return padded, labels
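# Illustrative (ID values assumed from PhoBERT's RoBERTa-style vocabulary):
# tokenizer.encode("xin chào") -> [0, ..., 2]   # 0 = <s>, 2 = </s>, 1 = <pad>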
def check_maxlen(sentences):
    # Longest sentence in words, capped at 256 (PhoBERT's maximum sequence length)
    sentences_len = [len(i.split()) for i in sentences]
    return min(max(sentences_len), 256)
# Split the data into train/val/test sets (80/10/10)
def split_data(padded, labels):
    padded = torch.tensor(padded)
    labels = torch.tensor(labels)
    X_train, X_, y_train, y_ = train_test_split(padded, labels, random_state=2018, train_size=0.8, stratify=labels)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, random_state=2018, train_size=0.5, stratify=y_)
    return X_train, X_val, X_test, y_train, y_val, y_test
# Build the DataLoaders
def make_dataloaders(X_train, X_val, y_train, y_val):
    train_data = TensorDataset(X_train, y_train)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=2)
    val_data = TensorDataset(X_val, y_val)
    val_sampler = RandomSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=2)
    return train_dataloader, val_dataloader
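# Each batch yielded by these loaders is an (input_ids, labels) pair of tensors, e.g.:
# input_ids, labels = next(iter(train_dataloader))   # shapes (2, maxlen) and (2,)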
# Prepare the data (meant to run only once, from train_model.py)
sentences, labels = get_data(['toxic_dataset.json', 'normal_dataset.json'])
sentences_segment(sentences)
padded, labels = shuffle_and_tokenize(sentences, labels, check_maxlen(sentences))
X_train, X_val, X_test, y_train, y_val, y_test = split_data(padded, labels)
train_dataloader, val_dataloader = make_dataloaders(X_train, X_val, y_train, y_val)
# Freeze PhoBERT so its weights are not updated during training
for param in phobert.parameters():
    param.requires_grad = False
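# Optional sanity check: with PhoBERT frozen, only the classifier head defined
# below remains trainable.
# sum(p.numel() for p in phobert.parameters() if p.requires_grad)   # -> 0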
class classify(nn.Module):
    def __init__(self, phobert, number_of_category):
        super().__init__()
        self.phobert = phobert
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.first_function = nn.Linear(768, 512)
        self.second_function = nn.Linear(512, 32)
        self.third_function = nn.Linear(32, number_of_category)
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self, input_ids):
        # Mask padding so PhoBERT does not attend to pad tokens
        attention_mask = (input_ids != tokenizer.pad_token_id).long()
        outputs = self.phobert(input_ids, attention_mask=attention_mask)
        x = self.first_function(outputs[1])  # outputs[1] is the pooled first-token representation
        x = self.relu(x)
        x = self.dropout(x)
        x = self.second_function(x)
        x = self.relu(x)
        x = self.third_function(x)
        x = self.softmax(x)
        return x
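# Shape walk-through for one batch of size B:
# input_ids (B, maxlen) -> PhoBERT pooled output (B, 768)
# -> Linear 768->512 -> 512->32 -> 32->number_of_category -> LogSoftmax over dim=1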
# NLLLoss pairs with the model's LogSoftmax output (together they equal cross-entropy)
cross_entropy = nn.NLLLoss()
model = classify(phobert, 2)
optimizer = AdamW(model.parameters(), lr=1e-5)
# Train / Evaluate
def train():
    model.train()
    total_loss = 0
    total_preds = []
    for step, batch in enumerate(train_dataloader):
        if step % 50 == 0 and step != 0:
            print("BATCH {} of {}".format(step, len(train_dataloader)))
        input_ids, labels = batch
        model.zero_grad()
        preds = model(input_ids)
        loss = cross_entropy(preds, labels)
        total_loss += loss.item()
        loss.backward()
        # Clip gradients to stabilise training
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        preds = preds.detach().cpu().numpy()
        total_preds.append(preds)
    avg_loss = total_loss / len(train_dataloader)
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds
def evaluate():
    model.eval()
    total_loss = 0
    total_preds = []
    for step, batch in enumerate(val_dataloader):
        if step % 50 == 0 and step != 0:
            print("BATCH {} of {}".format(step, len(val_dataloader)))
        input_ids, labels = batch
        with torch.no_grad():
            preds = model(input_ids)
            loss = cross_entropy(preds, labels)
            total_loss += loss.item()
            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)
    avg_loss = total_loss / len(val_dataloader)
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds
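# The returned predictions are log-probabilities, so predicted classes come from
# np.argmax(total_preds, axis=1), which pairs with the imported classification_report.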
# Full training loop
def run(epochs):
    best_valid_loss = float("inf")
    train_losses = []
    valid_losses = []
    for epoch in range(epochs):
        print("EPOCH {}/{}".format(epoch + 1, epochs))
        train_loss, _ = train()
        valid_loss, _ = evaluate()
        # Keep the checkpoint with the lowest validation loss
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), "save_weights.pt")
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        print(f"Train Loss: {train_loss:.4f}, Val Loss: {valid_loss:.4f}")
if __name__ == "__main__":
    print("Module classify_model.py has been loaded. It is not meant to be run directly.")