Spaces:
Running
Running
"""File processing utilities for NER annotation.""" | |
import os | |
import json | |
import pandas as pd | |
from typing import List, Dict, Union, Optional | |
from .text_processing import tokenize_text, process_text_for_gliner | |
def process_uploaded_file(file_obj) -> List[str]:
    """Process an uploaded file into a list of sentences.

    Supports CSV files (sentences are taken from the 'Nội dung' column)
    and plain-text files (one sentence per non-empty line). Each raw
    sentence is then expanded via ``process_text_for_gliner`` and the
    results are flattened into a single list.

    Args:
        file_obj: The uploaded file object. Must expose ``name``; for
            non-CSV files its ``read()`` bytes are decoded as UTF-8.

    Returns:
        List of processed sentences.

    Raises:
        ValueError: If no file was uploaded.
        Exception: If reading or processing the file fails.
    """
    if file_obj is None:
        raise ValueError("Please upload a file first!")
    try:
        if file_obj.name.endswith('.csv'):
            # CSV: sentences come from the 'Nội dung' (content) column,
            # dropping empty cells.
            df = pd.read_csv(file_obj.name)
            sentences = df['Nội dung'].dropna().tolist()
        else:
            # Plain text: one sentence per non-empty line.
            content = file_obj.read().decode('utf-8')
            sentences = [line.strip() for line in content.splitlines() if line.strip()]

        # A single raw sentence may expand into several processed ones,
        # so extend rather than append.
        processed_sentences = []
        for sentence in sentences:
            processed_sentences.extend(process_text_for_gliner(sentence))
        return processed_sentences
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Error reading file: {str(e)}") from e
def load_from_local_file(
    file_path: str,
    file_format: str = "json"
) -> List[Dict]:
    """Load and convert annotation data from a local file.

    Args:
        file_path: Path to the file to load.
        file_format: Format of the file ("json", "conll", or "txt").

    Returns:
        List of examples in the standard format:
        ``{"tokenized_text": [...], "ner": [...], ...}``.

    Raises:
        Exception: If file loading fails or the format is unsupported.
    """
    try:
        if file_format == "json":
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, list):
                # Already in the standard format -> return unchanged.
                if all("tokenized_text" in item and "ner" in item for item in data):
                    return data
                # Otherwise convert from the tokens/ner_tags layout.
                return _convert_json_format(data)
            raise ValueError("JSON file must contain a list of examples")
        elif file_format == "conll":
            return _load_conll_file(file_path)
        elif file_format == "txt":
            return _load_txt_file(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_format}")
    except Exception as e:
        # Chain the cause so callers can inspect the original error.
        raise Exception(f"Error loading file: {str(e)}") from e
def _convert_json_format(data: List[Dict]) -> List[Dict]:
    """Convert JSON data from various formats to the standard format.

    Args:
        data: List of examples in various JSON formats

    Returns:
        List of examples in the standard format
    """
    results = []
    for example in data:
        # Only the tokens/ner_tags layout is recognised; anything else
        # is skipped.
        if "tokens" not in example or "ner_tags" not in example:
            continue
        tokens = example["tokens"]
        spans = []
        open_span = None  # [start, end, tag] of the span being built
        for idx, (_, tag) in enumerate(zip(tokens, example["ner_tags"])):
            if tag == "O":
                # Outside any entity: close the span in progress.
                if open_span:
                    spans.append(open_span)
                    open_span = None
            elif open_span and open_span[2] == tag:
                # Same tag continues: grow the current span.
                open_span[1] = idx
            else:
                # New tag: close any previous span, start a fresh one.
                if open_span:
                    spans.append(open_span)
                open_span = [idx, idx, tag]
        if open_span:
            spans.append(open_span)
        results.append({
            "tokenized_text": tokens,
            "ner": spans,
            "validated": False
        })
    return results
def _load_conll_file(file_path: str) -> List[Dict]:
    """Load and convert data from a CoNLL format file.

    Examples are separated by blank lines; comment lines starting with
    '#' are ignored. Each data line uses the first column as the token
    and the last column as the NER tag.

    Args:
        file_path: Path to the CoNLL file

    Returns:
        List of converted examples in the standard format
    """

    def tags_to_spans(tags: List[str]) -> List[list]:
        # Collapse a per-token tag sequence into [start, end, tag] spans.
        # Adjacent identical tags merge; "O" closes the open span.
        spans = []
        current = None
        for i, tag in enumerate(tags):
            if tag != "O":
                if current is None:
                    current = [i, i, tag]
                elif tag == current[2]:
                    current[1] = i
                else:
                    spans.append(current)
                    current = [i, i, tag]
            elif current is not None:
                spans.append(current)
                current = None
        if current is not None:
            spans.append(current)
        return spans

    def flush(tokens: List[str], tags: List[str]) -> None:
        # Emit the accumulated example, if any, in the standard format.
        if tokens:
            converted_data.append({
                "tokenized_text": tokens,
                "ner": tags_to_spans(tags),
                "validated": False
            })

    converted_data: List[Dict] = []
    tokens: List[str] = []
    tags: List[str] = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                if line.startswith("#"):
                    continue
                parts = line.split()
                if len(parts) >= 2:
                    tokens.append(parts[0])
                    tags.append(parts[-1])
            else:
                # Blank line terminates the current example.
                flush(tokens, tags)
                tokens, tags = [], []
    # Handle a trailing example with no terminating blank line.
    flush(tokens, tags)
    return converted_data
def _load_txt_file(file_path: str) -> List[Dict]:
    """Load and convert data from a text file.

    Every non-empty line becomes one example with an empty annotation
    list, ready for manual labelling.

    Args:
        file_path: Path to the text file

    Returns:
        List of converted examples
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return [
            {"tokenized_text": tokenize_text(text), "ner": [], "validated": False}
            for text in stripped
            if text
        ]