Spaces:
Running
Running
"""File processing utilities for NER annotation.""" | |
import os | |
import json | |
import pandas as pd | |
from typing import List, Dict, Union, Optional | |
from .text_processing import tokenize_text, process_text_for_gliner | |
def process_uploaded_file(file_obj) -> List[str]:
    """Process an uploaded file into a list of sentences.

    Supports CSV files (sentences are taken from the 'Nội dung' column)
    and plain-text files (one sentence per non-empty line). Each raw
    sentence is then expanded via ``process_text_for_gliner`` and the
    results are flattened into a single list.

    Args:
        file_obj: The uploaded file object. Must expose ``name``; for
            non-CSV files its ``read()`` bytes are decoded as UTF-8.

    Returns:
        List of processed sentences.

    Raises:
        ValueError: If no file was uploaded.
        Exception: If reading or processing the file fails.
    """
    if file_obj is None:
        raise ValueError("Please upload a file first!")
    try:
        if file_obj.name.endswith('.csv'):
            # CSV: sentences come from the 'Nội dung' (content) column,
            # dropping empty cells.
            df = pd.read_csv(file_obj.name)
            sentences = df['Nội dung'].dropna().tolist()
        else:
            # Plain text: one sentence per non-empty line.
            content = file_obj.read().decode('utf-8')
            sentences = [line.strip() for line in content.splitlines() if line.strip()]

        # A single raw sentence may expand into several processed ones,
        # so extend rather than append.
        processed_sentences = []
        for sentence in sentences:
            processed_sentences.extend(process_text_for_gliner(sentence))
        return processed_sentences
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Error reading file: {str(e)}") from e
def load_from_local_file(
    file_path: str,
    file_format: str = "json"
) -> List[Dict]:
    """Load and convert annotation data from a local file.

    Args:
        file_path: Path to the file to load.
        file_format: Format of the file ("json", "conll", or "txt").

    Returns:
        List of examples in the standard format:
        ``{"tokenized_text": [...], "ner": [...], ...}``.

    Raises:
        Exception: If file loading fails or the format is unsupported.
    """
    try:
        if file_format == "json":
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, list):
                # Already in the standard format -> return unchanged.
                if all("tokenized_text" in item and "ner" in item for item in data):
                    return data
                # Otherwise convert from the tokens/ner_tags layout.
                return _convert_json_format(data)
            raise ValueError("JSON file must contain a list of examples")
        elif file_format == "conll":
            return _load_conll_file(file_path)
        elif file_format == "txt":
            return _load_txt_file(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_format}")
    except Exception as e:
        # Chain the cause so callers can inspect the original error.
        raise Exception(f"Error loading file: {str(e)}") from e
def _convert_json_format(data: List[Dict]) -> List[Dict]:
    """Convert JSON data from various formats to the standard format.

    Args:
        data: List of examples in various JSON formats

    Returns:
        List of examples in the standard format
    """
    results = []
    for example in data:
        # Only the tokens/ner_tags layout is recognised; anything else
        # is skipped.
        if "tokens" not in example or "ner_tags" not in example:
            continue
        tokens = example["tokens"]
        spans = []
        open_span = None  # [start, end, tag] of the span being built
        for idx, (_, tag) in enumerate(zip(tokens, example["ner_tags"])):
            if tag == "O":
                # Outside any entity: close the span in progress.
                if open_span:
                    spans.append(open_span)
                    open_span = None
            elif open_span and open_span[2] == tag:
                # Same tag continues: grow the current span.
                open_span[1] = idx
            else:
                # New tag: close any previous span, start a fresh one.
                if open_span:
                    spans.append(open_span)
                open_span = [idx, idx, tag]
        if open_span:
            spans.append(open_span)
        results.append({
            "tokenized_text": tokens,
            "ner": spans,
            "validated": False
        })
    return results
def _load_conll_file(file_path: str) -> List[Dict]:
    """Load and convert data from a CoNLL format file.

    Examples are separated by blank lines; comment lines starting with
    '#' are ignored. Each data line uses the first column as the token
    and the last column as the NER tag.

    Args:
        file_path: Path to the CoNLL file

    Returns:
        List of converted examples in the standard format
    """

    def tags_to_spans(tags: List[str]) -> List[list]:
        # Collapse a per-token tag sequence into [start, end, tag] spans.
        # Adjacent identical tags merge; "O" closes the open span.
        spans = []
        current = None
        for i, tag in enumerate(tags):
            if tag != "O":
                if current is None:
                    current = [i, i, tag]
                elif tag == current[2]:
                    current[1] = i
                else:
                    spans.append(current)
                    current = [i, i, tag]
            elif current is not None:
                spans.append(current)
                current = None
        if current is not None:
            spans.append(current)
        return spans

    def flush(tokens: List[str], tags: List[str]) -> None:
        # Emit the accumulated example, if any, in the standard format.
        if tokens:
            converted_data.append({
                "tokenized_text": tokens,
                "ner": tags_to_spans(tags),
                "validated": False
            })

    converted_data: List[Dict] = []
    tokens: List[str] = []
    tags: List[str] = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                if line.startswith("#"):
                    continue
                parts = line.split()
                if len(parts) >= 2:
                    tokens.append(parts[0])
                    tags.append(parts[-1])
            else:
                # Blank line terminates the current example.
                flush(tokens, tags)
                tokens, tags = [], []
    # Handle a trailing example with no terminating blank line.
    flush(tokens, tags)
    return converted_data
def _load_txt_file(file_path: str) -> List[Dict]:
    """Load and convert data from a text file.

    Every non-empty line becomes one example with an empty annotation
    list, ready for manual labelling.

    Args:
        file_path: Path to the text file

    Returns:
        List of converted examples
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return [
            {"tokenized_text": tokenize_text(text), "ner": [], "validated": False}
            for text in stripped
            if text
        ]