Anecedotal_Discourse_Classifier_Multitext

Running

Anecedotal_Discourse_Classifier_Multitext

File size: 6,836 Bytes

import os

# Install the pinned runtime dependencies at start-up, before the third-party
# imports further down are executed. The requirements are installed one
# command at a time, in the order listed here.
_PIP_REQUIREMENTS = (
    "transformers==4.26.1",
    "spacy==3.5.4",
    "spacy-alignments==0.9.1",
    "spacy-legacy==3.0.12",
    "spacy-loggers==1.0.3",
    "torch",
    "seaborn==0.11.2",
    "gradio==3.16.1",
    "typer==0.4.1",
    "pydantic==1.9.2",
    "matplotlib==3.4.3",
    "Flask",
    "sty==1.0.4",
    "numpy==1.26.4",
)
for _requirement in _PIP_REQUIREMENTS:
    os.system("python -m pip install " + _requirement)

from flask import Flask, render_template, request, send_file, after_this_request
import hashlib
from pipeline import *
import csv

# The Flask application; the 'templates' directory is used both for the
# rendered templates and as the static folder.
app = Flask(__name__, template_folder='templates', static_folder='templates')

# Set the folder for saving uploaded files
UPLOAD_FOLDER = 'uploads/'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# Create the directory if it doesn't exist. exist_ok=True avoids the
# check-then-create race of testing os.path.isdir() first; a non-directory
# at that path still raises FileExistsError, as before.
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

# File extensions (lower-case, without the dot) that may be uploaded
ALLOWED_EXTENSIONS = {'txt', 'csv'}

# Tell whether a filename carries one of the allowed extensions
def allowed_file(filename):
    """Return True when the text after the last '.' in ``filename`` is allowed.

    The extension is compared case-insensitively against ALLOWED_EXTENSIONS;
    a name that contains no '.' at all is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS

# Collect the non-empty input texts stored in an uploaded .txt or .csv file
def _read_input_texts(file_path):
    """Return the stripped, non-empty texts found in ``file_path``.

    A path ending in ``.txt`` contributes one text per line; a path ending in
    ``.csv`` contributes the first column of every row.

    Raises:
        ValueError: if ``file_path`` ends with neither ``.txt`` nor ``.csv``.
    """
    with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
        if file_path.endswith(".txt"):
            candidates = [line.strip() for line in f]
        elif file_path.endswith(".csv"):
            # Rows without any field (e.g. blank lines) are skipped rather
            # than raising IndexError on row[0].
            candidates = [row[0].strip() for row in csv.reader(f) if row]
        else:
            raise ValueError(f"Unsupported file type: {file_path}")
    return [text for text in candidates if text]


# Run the pipeline on every text in the file and return a list of dictionaries containing the original input and the processed output
def process_file(file_path):
    """Run ``run_pipeline`` on each non-empty text of ``file_path``.

    Must be called while a Flask request is being handled: the uploaded file
    is deleted through an ``after_this_request`` callback once the response
    has been produced.

    Args:
        file_path: path of the uploaded ``.txt`` or ``.csv`` file.

    Returns:
        A list of ``{'input': text, 'output': run_pipeline(text)}`` dicts, one
        per non-empty text and in file order.

    Raises:
        ValueError: if ``file_path`` has an unsupported extension.
    """
    texts = _read_input_texts(file_path)
    # Each input is paired with its own output as it is processed, so blank
    # lines in the file can no longer shift inputs against outputs.
    result = [{'input': text, 'output': run_pipeline(text)} for text in texts]

    @after_this_request
    def remove_file(response):
        # Best-effort clean-up: a failed removal is logged, never raised.
        try:
            os.remove(file_path)
        except OSError as error:
            app.logger.error("Error removing uploaded file: %s", error)
        return response

    return result

# Home page route that allows users to upload files
@app.route('/')
def index():
    """Render the 'index.html' template for requests to '/'."""
    page = render_template('index.html')
    return page

# Turn the per-feature tallies of one text into the seven proportions written to the CSV
def _feature_proportions(tally):
    """Return the proportions for one text, in CSV column order.

    The order is generic, specific, stative, dynamic, static, episodic,
    habitual. Each value is the feature's count divided by the total of its
    dimension, with the dimension's "NA ..." count included in that total.
    A dimension whose total is zero yields 0.0 instead of dividing by zero.
    """
    genericity_total = tally['generic'] + tally['specific'] + tally['NA genericity']
    eventivity_total = tally['stative'] + tally['dynamic'] + tally['NA eventivity']
    boundedness_total = (tally['static'] + tally['episodic'] + tally['habitual']
                         + tally['NA boundedness'])

    def share(feature, total):
        return tally[feature] / total if total else 0.0

    return [
        share('generic', genericity_total),
        share('specific', genericity_total),
        share('stative', eventivity_total),
        share('dynamic', eventivity_total),
        share('static', boundedness_total),
        share('episodic', boundedness_total),
        share('habitual', boundedness_total),
    ]


# Handle file upload requests
@app.route('/', methods=['POST'])
def upload_file():
    """Process the uploaded file and send back a CSV with per-text statistics.

    Every text returned by ``process_file`` becomes one CSV row holding the
    input, its clauses, the individual clause labels, the feature counts and
    the feature proportions.

    Returns:
        The generated 'result.csv' as an attachment, or a plain error string
        when no file was selected or the extension is not allowed.
    """
    # Get the uploaded file
    file = request.files['file']

    # If the user did not select any file, return an error message
    if not file:
        return 'No file selected'

    # If the file type is not allowed, return an error message
    if not allowed_file(file.filename):
        return 'File type not allowed'

    # Generate a unique hash code for the file name
    hash_code = hashlib.md5(file.read()).hexdigest()
    # NOTE: the upload is always stored with a .txt suffix, whatever its
    # original extension was, and process_file picks its reader from that
    # suffix.
    filename = f"{hash_code}.txt"
    file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)

    # Save the uploaded file (rewind first: hashing consumed the stream)
    file.seek(0)
    file.save(file_path)

    # Process the uploaded file and return the result as a CSV file
    result = process_file(file_path)
    result_file_path = os.path.join(app.config['UPLOAD_FOLDER'], 'result.csv')

    counts = {}
    individual_labels = {}
    props = {}
    for id_, text in enumerate(result):
        individual_labels[id_] = []
        counts[id_] = {"generic":0,"specific":0,"stative":0,"dynamic":0,"static":0,"episodic":0,"habitual":0,"NA genericity":0,"NA eventivity":0,"NA boundedness":0}
        # presumably output[1] holds (clause, label) pairs - verify against run_pipeline
        for clause in text['output'][1]:
            individual_labels[id_].append(clause[1])
            label = labels2attrs[clause[1]]
            # Position in the label decides the dimension an "NA" belongs to:
            # 0 -> genericity, 1 -> eventivity, anything later -> boundedness.
            for id__, feature in enumerate(label):
                if "NA" not in feature:
                    counts[id_][feature] += 1
                elif id__ == 0:
                    counts[id_]["NA genericity"] += 1
                elif id__ == 1:
                    counts[id_]["NA eventivity"] += 1
                else:
                    counts[id_]["NA boundedness"] += 1
        # Computed once per text, after all clauses are tallied, so a text
        # without clauses still gets a row of zero proportions.
        props[id_] = _feature_proportions(counts[id_])

    with open(result_file_path, 'w', encoding='utf-8', errors='ignore', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["input","clauses","individual labels","genericity: generic count","genericity: specific count","eventivity: stative count","eventivity: dynamic count","boundedness: static count","boundedness: episodic count","habitual count","genericity:  proportion generic","genericity: proportion specific","eventivity: proportion stative","eventivity: proportion dynamic","boundedness: proportion static","boundedness: proportion episodic","proportion habitual"])
        for id_ in counts:
            tally = counts[id_]
            # Each entry of output[0] is written as "<item 1>: <item 0>".
            clauses = "\n".join(
                "{}: {}".format(clause[1], clause[0])
                for clause in result[id_]["output"][0]
            )
            ind_labels = "\n".join(individual_labels[id_])
            extracted = [result[id_]["input"], clauses, ind_labels,
                         tally['generic'], tally['specific'],
                         tally['stative'], tally['dynamic'],
                         tally['static'], tally['episodic'],
                         tally['habitual']] + props[id_]
            writer.writerow(extracted)

    # Return the result file as a download once the processing is complete
    return send_file(result_file_path, as_attachment=True)

# When executed as a script, serve the app on all interfaces, port 7860
if __name__ == '__main__':
    app.run(port=7860, host="0.0.0.0")