# Hugging Face Space: zArabi/Persian-Sentiment-Analysis
# Status: Runtime error
# File size: 4,659 bytes

import gradio as gr
from transformers import BertModel, BertConfig, BertTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import huggingface_hub
from huggingface_hub import hf_hub_download
import hazm
from cleantext import clean
import regex as re

# Hub repository id of this project (the same id is passed to
# `hf_hub_download` further down). Kept as a plain constant: assigning the
# string to `huggingface_hub.Repository` would replace the library's
# `Repository` class with a str for the whole process.
REPO_ID = 'zArabi/Persian-Sentiment-Analysis'

def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag-like span removed.

    The match is non-greedy, so each tag is stripped on its own and the text
    between tags is kept. ``.`` does not match a newline here, so a tag that
    spans several lines is left untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)

# Character class of emoji, pictographs, invisible formatting marks and
# similar symbols that `cleaning` strips. Compiled once at import time rather
# than on every call.
_WEIRD_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    # u"\u200c"  (zero-width non-joiner is deliberately NOT removed;
    #             presumably because Persian orthography relies on it)
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)

# Shared hazm normalizer, created on first use so importing this module does
# not pay for it and repeated calls do not rebuild it.
_NORMALIZER = None


def cleaning(text):
    """Normalise raw user text before it is tokenized.

    Steps, in order:
      1. strip surrounding whitespace;
      2. run clean-text (unicode fix, lowercase, drop line breaks, blank out
         URLs / e-mails / phone numbers / currency symbols; digits and
         punctuation are kept);
      3. remove ``<...>`` tags via ``cleanhtml``;
      4. apply the hazm Persian normalizer;
      5. remove emoji and other symbols matched by ``_WEIRD_PATTERN``;
      6. drop ``#`` characters and collapse whitespace runs to one space.

    Args:
        text: Input string.

    Returns:
        The cleaned string. It may still start or end with a single space if
        a removed symbol sat at the edge of the text.
    """
    global _NORMALIZER

    text = text.strip()

    # regular cleaning
    # https://pypi.org/project/clean-text/ >> works well for eng and de languages
    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,  # Keep the punctuation
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing > https://github.com/sobhe/hazm
    if _NORMALIZER is None:
        _NORMALIZER = hazm.Normalizer()
    text = _NORMALIZER.normalize(text)

    # removing weird patterns (emoji, pictographs, formatting marks)
    text = _WEIRD_PATTERN.sub('', text)

    # removing hashtags markers and extra spaces; the raw string avoids the
    # invalid "\s" escape that newer Python versions warn about
    text = text.replace("#", "")
    text = re.sub(r"\s+", " ", text)

    return text

class SentimentModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head."""

    def __init__(self, config):
        """Build the encoder and the classification head.

        The encoder weights are loaded from the module-level ``modelName``
        checkpoint; ``config`` only supplies ``hidden_size`` and
        ``num_labels`` for the linear layer.
        """
        super().__init__()
        # return_dict=False makes the encoder return a plain tuple.
        self.bert = BertModel.from_pretrained(modelName, return_dict=False)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for the given token ids and mask.

        The encoder is expected to return exactly two values; the second one
        (the pooled output) is passed through dropout and the classifier.
        """
        _sequence_output, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        return self.classifier(self.dropout(pooled))

# Hub id of the pretrained checkpoint used for both the encoder and the tokenizer.
modelName = 'HooshvareLab/bert-fa-base-uncased'
# Sentiment classes; a label's position in this list is its class id.
class_names = ['negative', 'neutral', 'positive']
# Lookups between class ids and label names, in both directions.
id2label = dict(enumerate(class_names))
label2id = {label: idx for idx, label in id2label.items()}

# Configuration of the base checkpoint, extended with the label set above.
config = BertConfig.from_pretrained(
    modelName,
    num_labels=len(class_names),
    id2label=id2label,
    label2id=label2id)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint file holding the state dict for SentimentModel, fetched from the Hub.
path = 'HooshvareLab-bert-fa-base-uncased-3class.bin'
downloadedModelDict = hf_hub_download(repo_id="zArabi/Persian-Sentiment-Analysis", filename=path)
loaded_model = SentimentModel(config=config).to(device)
# NOTE(review): torch.load unpickles the downloaded file; this is only safe
# while the Hub repository above is trusted. Consider `weights_only=True` if
# the installed torch version supports it.
loaded_model.load_state_dict(torch.load(downloadedModelDict, map_location="cpu"))
# nn.Module instances start in training mode, which keeps the Dropout layer
# active; switch to eval mode so predictions are deterministic.
loaded_model.eval()


tokenizer = BertTokenizer.from_pretrained(modelName)
# Maximum number of tokens per input; longer inputs are truncated in predict().
max_len = 512

def predict(text):
    """Classify the sentiment of one piece of text.

    The text is normalised with ``cleaning``, tokenized (truncated to
    ``max_len`` tokens) and passed through ``loaded_model``.

    Args:
        text: Raw input string.

    Returns:
        A dict mapping each name in ``class_names`` to its softmax
        probability, rounded to three decimals.
    """
    text = cleaning(text)
    # Only one sequence is encoded per call, so no padding is requested:
    # padding to max_len would make the encoder process hundreds of masked
    # positions without affecting the prediction.
    encoding = tokenizer(
        text,
        max_length=max_len,
        truncation=True,
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        return_attention_mask=True,
        return_tensors='pt',  # Return PyTorch tensors
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = loaded_model(input_ids, attention_mask)

    # Batch size is 1, so take the single row of class probabilities.
    probs = F.softmax(logits, dim=1)[0].tolist()
    return {name: round(prob, 3) for name, prob in zip(class_names, probs)}

# Build and start the web UI: one text box in, per-class probabilities out.
# `gr.Label` replaces the legacy `gr.outputs.Label`: the `gr.outputs`
# namespace is deprecated and absent from current Gradio releases, and the
# input below already uses the top-level `gr.Textbox` component.
demo = gr.Interface(
    predict,
    inputs=gr.Textbox(label="Explore your sentence!", lines=2, placeholder="Type Here..."),
    outputs=gr.Label(num_top_classes=3),
    title="How are feeling?!!",
)
demo.launch()