Spaces:
Sleeping
Sleeping
Update app.py
Browse files · added batch processing
app.py
CHANGED
@@ -5,6 +5,8 @@ import re
|
|
5 |
from tokenizers import normalizers
|
6 |
from tokenizers.normalizers import Sequence, Replace, Strip, NFKC
|
7 |
from tokenizers import Regex
|
|
|
|
|
8 |
|
9 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
10 |
|
@@ -58,6 +60,23 @@ tokenizer.backend_tokenizer.normalizer = Sequence([
|
|
58 |
Strip()
|
59 |
])
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
def classify_text(text):
|
62 |
cleaned_text = clean_text(text)
|
63 |
if not text.strip():
|
@@ -100,7 +119,16 @@ def classify_text(text):
|
|
100 |
|
101 |
return result_message
|
102 |
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
|
106 |
|
@@ -232,12 +260,19 @@ with iface:
|
|
232 |
gr.Markdown(f"# {title}")
|
233 |
gr.Markdown(description)
|
234 |
text_input = gr.Textbox(label="", placeholder="Type or paste your content here...", elem_id="text_input_box", lines=5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
result_output = gr.Markdown("", elem_id="result_output_box")
|
236 |
text_input.change(classify_text, inputs=text_input, outputs=result_output)
|
237 |
-
|
238 |
-
|
239 |
-
with gr.Tab("Human text examples"):
|
240 |
-
gr.Examples(Human_texts, inputs=text_input)
|
241 |
gr.Markdown(bottom_text, elem_id="bottom_text")
|
242 |
|
243 |
iface.launch(share=True)
|
|
|
5 |
from tokenizers import normalizers
|
6 |
from tokenizers.normalizers import Sequence, Replace, Strip, NFKC
|
7 |
from tokenizers import Regex
|
8 |
+
import fitz # PyMuPDF
|
9 |
+
import os
|
10 |
|
11 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
12 |
|
|
|
60 |
Strip()
|
61 |
])
|
62 |
|
63 |
+
def extract_text_from_file(file):
    """Extract plain text from an uploaded file for classification.

    Supports PDFs (text layer via PyMuPDF), Jupyter notebooks (cell
    sources), and plain-text files. The UI's file picker offers
    ``.pdf/.txt/.py/.ipynb``; the previous version silently returned ""
    for ``.py`` and ``.ipynb`` even though they were accepted uploads.

    Args:
        file: Uploaded-file object exposing a ``.name`` path attribute
            (as provided by ``gr.File``).

    Returns:
        str: Extracted text, or "" when the extension is unsupported or
        the notebook JSON is malformed.
    """
    file_path = file.name
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        doc = fitz.open(file_path)
        try:
            # Concatenate the text layer of every page.
            return "".join(page.get_text() for page in doc)
        finally:
            # Always release the document, even if extraction raises.
            doc.close()
    elif ext == ".ipynb":
        # Notebooks are JSON; extract each cell's source so the classifier
        # sees the actual code/markdown rather than raw JSON noise.
        import json
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            try:
                nb = json.load(f)
            except ValueError:
                return ""  # malformed notebook: degrade to empty text
        cells = nb.get("cells", []) if isinstance(nb, dict) else []
        return "\n".join(
            "".join(cell.get("source", []))
            for cell in cells
            if isinstance(cell, dict)
        )
    elif ext in (".txt", ".py"):
        # Any plain-text upload is read verbatim; errors="ignore" tolerates
        # stray non-UTF-8 bytes rather than failing the whole file.
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    else:
        return ""
|
79 |
+
|
80 |
def classify_text(text):
|
81 |
cleaned_text = clean_text(text)
|
82 |
if not text.strip():
|
|
|
119 |
|
120 |
return result_message
|
121 |
|
122 |
+
def batch_classify(files):
    """Classify every uploaded file and collect one result row per file.

    Args:
        files: Iterable of uploaded-file objects (each with a ``.name``
            path), as delivered by a multi-file ``gr.File`` input.

    Returns:
        list: ``[file_name, result_html_or_error]`` rows suitable for a
        ``gr.Dataframe`` output.
    """
    rows = []
    for upload in files:
        base_name = os.path.basename(upload.name)
        try:
            verdict = classify_text(extract_text_from_file(upload))
        except Exception as e:
            # Best-effort batch: one failing file must not abort the rest.
            verdict = f"Error: {str(e)}"
        rows.append([base_name, verdict])
    return rows
|
132 |
|
133 |
|
134 |
|
|
|
260 |
gr.Markdown(f"# {title}")
|
261 |
gr.Markdown(description)
|
262 |
text_input = gr.Textbox(label="", placeholder="Type or paste your content here...", elem_id="text_input_box", lines=5)
|
263 |
+
|
264 |
+
gr.Markdown("## Or upload multiple files for batch classification")
|
265 |
+
file_input = gr.File(label="Upload PDF or Text Files", file_types=[".pdf", ".txt", ".py", ".ipynb"], file_count="multiple")
|
266 |
+
result_table = gr.Dataframe(headers=["File Name", "Classification Result"], wrap=True)
|
267 |
+
|
268 |
+
file_input.change(fn=batch_classify, inputs=file_input, outputs=result_table)
|
269 |
+
|
270 |
+
|
271 |
+
|
272 |
result_output = gr.Markdown("", elem_id="result_output_box")
|
273 |
text_input.change(classify_text, inputs=text_input, outputs=result_output)
|
274 |
+
|
275 |
+
|
|
|
|
|
276 |
gr.Markdown(bottom_text, elem_id="bottom_text")
|
277 |
|
278 |
iface.launch(share=True)
|