"""File processing utilities for NER annotation."""

import json
import pandas as pd
from typing import List, Dict, Optional
from .text_processing import tokenize_text, process_text_for_gliner

def process_uploaded_file(file_obj) -> List[str]:
    """Process an uploaded file into a list of sentences.
    
    Args:
        file_obj: The uploaded file object
        
    Returns:
        List of processed sentences
        
    Raises:
        ValueError: If no file was uploaded
        RuntimeError: If the file cannot be read or parsed
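
    Example (illustrative sketch; ``uploaded`` stands for a Gradio-style
    file object exposing a ``.name`` path, not a real fixture):
        >>> sentences = process_uploaded_file(uploaded)  # doctest: +SKIP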
    """
    if file_obj is None:
        raise ValueError("Please upload a file first!")
    
    try:
        if file_obj.name.endswith('.csv'):
            # Process CSV file; the "Nội dung" column ("Content" in
            # Vietnamese) holds the raw sentences.
            df = pd.read_csv(file_obj.name)
            sentences = df['Nội dung'].dropna().tolist()
        else:
            # Process plain-text file: one sentence per non-empty line.
            # Read via the path, consistent with the CSV branch above.
            with open(file_obj.name, 'r', encoding='utf-8') as f:
                content = f.read()
            sentences = [line.strip() for line in content.splitlines() if line.strip()]

        # Process each sentence and flatten the results into one list
        processed_sentences = []
        for sentence in sentences:
            processed_sentences.extend(process_text_for_gliner(sentence))

        return processed_sentences
    except Exception as e:
        # Chain the original exception so the traceback is preserved
        raise RuntimeError(f"Error reading file: {e}") from e

def load_from_local_file(
    file_path: str,
    file_format: str = "json"
) -> List[Dict]:
    """Load and convert data from local file in various formats.
    
    Args:
        file_path: Path to the file to load
        file_format: Format of the file (json, conll, or txt)
        
    Returns:
        List of converted examples
        
    Raises:
        ValueError: If the format is unsupported or the JSON structure is invalid
        RuntimeError: If the file cannot be read or converted
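
    Example (illustrative; the path is hypothetical):
        >>> examples = load_from_local_file("data/train.conll", "conll")  # doctest: +SKIP
        >>> sorted(examples[0])  # doctest: +SKIP
        ['ner', 'tokenized_text', 'validated']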
    """
    try:
        if file_format == "json":
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not isinstance(data, list):
                raise ValueError("JSON file must contain a list of examples")
            # If the data is already in the standard format, return it as-is
            if all("tokenized_text" in item and "ner" in item for item in data):
                return data
            # Otherwise convert from other JSON formats
            return _convert_json_format(data)

        elif file_format == "conll":
            return _load_conll_file(file_path)

        elif file_format == "txt":
            return _load_txt_file(file_path)

        else:
            raise ValueError(f"Unsupported file format: {file_format}")

    except ValueError:
        # Argument and format errors should surface unchanged
        raise
    except Exception as e:
        # Chain the original exception so the traceback is preserved
        raise RuntimeError(f"Error loading file: {e}") from e

def _bio_tags_to_spans(tags: List[str]) -> List[List]:
    """Convert a sequence of NER tags into [start, end, label] spans.

    Supports plain labels ("PER") as well as BIO-prefixed tags
    ("B-PER"/"I-PER"): prefixes are stripped, consecutive tokens with the
    same label are merged into a single span, and a "B-" tag always opens
    a new span.

    Example:
        >>> _bio_tags_to_spans(["B-PER", "I-PER", "O", "B-LOC"])
        [[0, 1, 'PER'], [3, 3, 'LOC']]
    """
    spans: List[List] = []
    current: Optional[List] = None
    for i, tag in enumerate(tags):
        if tag == "O":
            if current is not None:
                spans.append(current)
                current = None
            continue
        prefix, _, label = tag.partition("-")
        if not label:
            # Plain label without a BIO prefix (e.g. "PER")
            prefix, label = "I", tag
        if current is not None and label == current[2] and prefix != "B":
            current[1] = i  # extend the open span
        else:
            if current is not None:
                spans.append(current)
            current = [i, i, label]
    if current is not None:
        spans.append(current)
    return spans

def _convert_json_format(data: List[Dict]) -> List[Dict]:
    """Convert JSON data from various formats to the standard format.
    
    Args:
        data: List of examples in various JSON formats
        
    Returns:
        List of examples in the standard format
    """
    converted_data = []
    for item in data:
        # Currently only the "tokens"/"ner_tags" layout (as used by many
        # token-classification datasets) is recognised; other items are skipped.
        if "tokens" in item and "ner_tags" in item:
            converted_data.append({
                "tokenized_text": item["tokens"],
                "ner": _bio_tags_to_spans(item["ner_tags"]),
                "validated": False,
            })
    return converted_data
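
# Illustrative conversion (not executed): an input item such as
#   {"tokens": ["John", "lives", "in", "Hanoi"],
#    "ner_tags": ["B-PER", "O", "O", "B-LOC"]}
# becomes
#   {"tokenized_text": ["John", "lives", "in", "Hanoi"],
#    "ner": [[0, 0, "PER"], [3, 3, "LOC"]],
#    "validated": False}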

def _load_conll_file(file_path: str) -> List[Dict]:
    """Load and convert data from a CoNLL format file.

    Sentences are separated by blank lines; each non-comment line holds a
    token in its first column and the token's NER tag in its last column.
    
    Args:
        file_path: Path to the CoNLL file
        
    Returns:
        List of converted examples
    """
    converted_data = []
    tokens: List[str] = []
    tags: List[str] = []

    def flush() -> None:
        """Emit the sentence collected so far, if any."""
        if tokens:
            converted_data.append({
                "tokenized_text": list(tokens),
                "ner": _bio_tags_to_spans(tags),
                "validated": False,
            })
            tokens.clear()
            tags.clear()

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                flush()  # a blank line ends the current sentence
                continue
            if line.startswith("#"):
                continue  # skip CoNLL comment/metadata lines
            parts = line.split()
            if len(parts) >= 2:
                tokens.append(parts[0])
                tags.append(parts[-1])

    flush()  # handle a final sentence with no trailing blank line
    return converted_data

def _load_txt_file(file_path: str) -> List[Dict]:
    """Load and convert data from a text file.
    
    Args:
        file_path: Path to the text file
        
    Returns:
        List of converted examples
    """
    converted_data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                tokens = tokenize_text(line)
                converted_data.append({
                    "tokenized_text": tokens,
                    "ner": [],
                    "validated": False
                })
    return converted_data
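
if __name__ == "__main__":
    # Minimal smoke test (sketch): the default path and the format guess
    # below are hypothetical; point them at a real annotation file to try it.
    import sys

    path = sys.argv[1] if len(sys.argv) > 1 else "data/examples.json"
    fmt = path.rsplit(".", 1)[-1] if "." in path else "json"
    examples = load_from_local_file(path, file_format=fmt)
    print(f"Loaded {len(examples)} examples from {path}")
    if examples:
        print("First example:", examples[0])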