"""File processing utilities for NER annotation."""
import os
import json
import pandas as pd
from typing import List, Dict, Union, Optional
from .text_processing import tokenize_text, process_text_for_gliner


def process_uploaded_file(file_obj) -> List[str]:
    """Process an uploaded file into a list of sentences.

    Args:
        file_obj: The uploaded file object.

    Returns:
        List of processed sentences.

    Raises:
        ValueError: If no file was uploaded.
        Exception: If file processing fails.
    """
    if file_obj is None:
        raise ValueError("Please upload a file first!")

    try:
        if file_obj.name.endswith('.csv'):
            # Process CSV file: sentences are read from the 'Nội dung'
            # ("Content") column
            df = pd.read_csv(file_obj.name)
            sentences = df['Nội dung'].dropna().tolist()
        else:
            # Process plain-text file: one sentence per non-empty line
            content = file_obj.read().decode('utf-8')
            sentences = [line.strip() for line in content.splitlines() if line.strip()]

        # Process each sentence and flatten the result into one list
        processed_sentences = []
        for sentence in sentences:
            processed_sentences.extend(process_text_for_gliner(sentence))
        return processed_sentences
    except Exception as e:
        raise Exception(f"Error reading file: {e}") from e


def load_from_local_file(
    file_path: str,
    file_format: str = "json"
) -> List[Dict]:
    """Load and convert data from a local file in various formats.

    Args:
        file_path: Path to the file to load.
        file_format: Format of the file ("json", "conll", or "txt").

    Returns:
        List of converted examples.

    Raises:
        Exception: If file loading fails.
    """
    try:
        if file_format == "json":
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, list):
                # Data may already be in the standard format
                if all("tokenized_text" in item and "ner" in item for item in data):
                    return data
                # Otherwise convert from other JSON formats
                return _convert_json_format(data)
            else:
                raise ValueError("JSON file must contain a list of examples")
        elif file_format == "conll":
            return _load_conll_file(file_path)
        elif file_format == "txt":
            return _load_txt_file(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_format}")
    except Exception as e:
        raise Exception(f"Error loading file: {e}") from e


def _tags_to_spans(ner_tags: List[str]) -> List[List]:
    """Convert a sequence of token-level NER tags to [start, end, label] spans.

    Handles plain tags (e.g. "PER") as well as BIO-prefixed tags
    (e.g. "B-PER"/"I-PER"); "O" marks tokens outside any entity.
    Span indices are inclusive.

    Args:
        ner_tags: One tag per token.

    Returns:
        List of [start, end, label] spans.
    """
    spans = []
    current_span = None
    for i, tag in enumerate(ner_tags):
        if tag == "O":
            # Outside any entity: close the open span, if any
            if current_span is not None:
                spans.append(current_span)
                current_span = None
            continue
        # Strip a BIO prefix when present; otherwise the tag is the label
        prefix, _, label = tag.partition("-")
        if prefix in ("B", "I") and label:
            begins_entity = prefix == "B"
        else:
            label, begins_entity = tag, False
        if current_span is not None and current_span[2] == label and not begins_entity:
            # Same entity continues: extend the open span
            current_span[1] = i
        else:
            # A new entity starts: close the open span and open a new one
            if current_span is not None:
                spans.append(current_span)
            current_span = [i, i, label]
    if current_span is not None:
        spans.append(current_span)
    return spans


def _convert_json_format(data: List[Dict]) -> List[Dict]:
    """Convert JSON data from various formats to the standard format.

    Args:
        data: List of examples in various JSON formats.

    Returns:
        List of examples in the standard format. Items in unrecognized
        formats are skipped.
    """
    converted_data = []
    for item in data:
        if "tokens" in item and "ner_tags" in item:
            converted_data.append({
                "tokenized_text": item["tokens"],
                "ner": _tags_to_spans(item["ner_tags"]),
                "validated": False
            })
    return converted_data
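

# A worked example of the tag-to-span conversion (hypothetical data, for
# illustration only):
#
#     _tags_to_spans(["B-PER", "I-PER", "O", "O", "B-LOC"])
#     # -> [[0, 1, "PER"], [4, 4, "LOC"]]
#
# i.e. tokens 0-1 form one PER entity and token 4 a LOC entity, with
# inclusive end indices.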


def _load_conll_file(file_path: str) -> List[Dict]:
    """Load and convert data from a CoNLL format file.

    Args:
        file_path: Path to the CoNLL file.

    Returns:
        List of converted examples.
    """
    converted_data = []
    tokens: List[str] = []
    tags: List[str] = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.startswith("#"):
                # Skip comment lines
                continue
            if line:
                # First column is the token, last column is the NER tag
                parts = line.split()
                if len(parts) >= 2:
                    tokens.append(parts[0])
                    tags.append(parts[-1])
            elif tokens:
                # A blank line ends the current sentence
                converted_data.append({
                    "tokenized_text": tokens,
                    "ner": _tags_to_spans(tags),
                    "validated": False
                })
                tokens, tags = [], []

    # Handle a last example without a trailing blank line
    if tokens:
        converted_data.append({
            "tokenized_text": tokens,
            "ner": _tags_to_spans(tags),
            "validated": False
        })
    return converted_data
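

# Expected CoNLL layout (hypothetical snippet; only the first and last
# columns are used, and blank lines separate sentences):
#
#     John    B-PER
#     Smith   I-PER
#     lives   O
#     in      O
#     Hanoi   B-LOC
#
# which converts to {"tokenized_text": ["John", "Smith", "lives", "in",
# "Hanoi"], "ner": [[0, 1, "PER"], [4, 4, "LOC"]], "validated": False}.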


def _load_txt_file(file_path: str) -> List[Dict]:
    """Load and convert data from a plain-text file, one sentence per line.

    Args:
        file_path: Path to the text file.

    Returns:
        List of converted examples with empty annotations.
    """
converted_data = []
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line:
tokens = tokenize_text(line)
converted_data.append({
"tokenized_text": tokens,
"ner": [],
"validated": False
})
return converted_data
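

if __name__ == "__main__":
    # Minimal smoke test (a sketch: "sample.txt" is a hypothetical path;
    # point it at a real one-sentence-per-line file to try the loader)
    import pprint

    examples = load_from_local_file("sample.txt", file_format="txt")
    pprint.pprint(examples[:3])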