mwiehl committed
Commit 637ec09 · verified · 1 Parent(s): 86c105f

Update app.py


added batch processing

Files changed (1)
  1. app.py +40 -5
app.py CHANGED
@@ -5,6 +5,8 @@ import re
 from tokenizers import normalizers
 from tokenizers.normalizers import Sequence, Replace, Strip, NFKC
 from tokenizers import Regex
+import fitz  # PyMuPDF
+import os
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
@@ -58,6 +60,23 @@ tokenizer.backend_tokenizer.normalizer = Sequence([
     Strip()
 ])
 
+def extract_text_from_file(file):
+    file_path = file.name
+    ext = os.path.splitext(file_path)[1].lower()
+
+    if ext == ".pdf":
+        doc = fitz.open(file_path)
+        text = ""
+        for page in doc:
+            text += page.get_text()
+        doc.close()
+        return text
+    elif ext == ".txt":
+        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+            return f.read()
+    else:
+        return ""
+
 def classify_text(text):
     cleaned_text = clean_text(text)
     if not text.strip():
@@ -100,7 +119,16 @@ def classify_text(text):
 
     return result_message
 
-
+def batch_classify(files):
+    results = []
+    for file in files:
+        try:
+            text = extract_text_from_file(file)
+            result_html = classify_text(text)
+            results.append([os.path.basename(file.name), result_html])
+        except Exception as e:
+            results.append([os.path.basename(file.name), f"Error: {str(e)}"])
+    return results
 
 
 
@@ -232,12 +260,19 @@ with iface:
     gr.Markdown(f"# {title}")
     gr.Markdown(description)
     text_input = gr.Textbox(label="", placeholder="Type or paste your content here...", elem_id="text_input_box", lines=5)
+
+    gr.Markdown("## Or upload multiple files for batch classification")
+    file_input = gr.File(label="Upload PDF or Text Files", file_types=[".pdf", ".txt", ".py", ".ipynb"], file_count="multiple")
+    result_table = gr.Dataframe(headers=["File Name", "Classification Result"], wrap=True)
+
+    file_input.change(fn=batch_classify, inputs=file_input, outputs=result_table)
+
+
+
     result_output = gr.Markdown("", elem_id="result_output_box")
     text_input.change(classify_text, inputs=text_input, outputs=result_output)
-    with gr.Tab("AI text examples"):
-        gr.Examples(AI_texts, inputs=text_input)
-    with gr.Tab("Human text examples"):
-        gr.Examples(Human_texts, inputs=text_input)
+
+
     gr.Markdown(bottom_text, elem_id="bottom_text")
 
 iface.launch(share=True)
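
For reference, the batch flow introduced by this commit can be exercised without the Gradio UI. The sketch below is illustrative only and not part of the commit: classify_text is replaced by a word-count stand-in, only the .txt branch of the extraction helper is kept (so PyMuPDF is not required), and Gradio's uploaded-file objects are faked with SimpleNamespace, since the added code relies on them only for a .name path attribute.

```python
# Illustrative sketch only; not part of app.py. The names below mirror the
# functions added in this commit, with classify_text stubbed out.
import os
import tempfile
from types import SimpleNamespace

def extract_text_from_file(file):
    # Same shape as the committed helper, but only the .txt branch is kept
    # so this sketch runs without PyMuPDF.
    file_path = file.name
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".txt":
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    return ""

def classify_text(text):
    # Stand-in for the model-backed classifier in app.py.
    return f"{len(text.split())} words"

def batch_classify(files):
    # Same per-file try/except pattern as the committed function.
    results = []
    for file in files:
        try:
            text = extract_text_from_file(file)
            results.append([os.path.basename(file.name), classify_text(text)])
        except Exception as e:
            results.append([os.path.basename(file.name), f"Error: {e}"])
    return results

# Fake "uploads": temp .txt files wrapped in objects that expose .name,
# which is all the batch path needs from Gradio's file objects.
uploads = []
for content in ["first sample text", "second sample text goes here"]:
    fd, path = tempfile.mkstemp(suffix=".txt")
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(content)
    uploads.append(SimpleNamespace(name=path))

print(batch_classify(uploads))
# e.g. [['tmpab12cd.txt', '3 words'], ['tmpef34gh.txt', '5 words']]
```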
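
On the UI side, the change amounts to a multi-file gr.File input driving a two-column gr.Dataframe through a .change event. Below is a self-contained approximation of that wiring, with the classifier replaced by a trivial stand-in; the demo-specific names are placeholders, not from the commit.

```python
# Standalone approximation of the wiring added to the Blocks layout; the
# real app routes files through batch_classify and the model.
import os
import gradio as gr

def demo_batch_classify(files):
    # Depending on the Gradio version, each entry is a path string or a
    # tempfile wrapper exposing .name; handle both.
    rows = []
    for f in files:
        path = f if isinstance(f, str) else f.name
        rows.append([os.path.basename(path), "demo result"])
    return rows

with gr.Blocks() as demo:
    file_input = gr.File(label="Upload PDF or Text Files",
                         file_types=[".pdf", ".txt"], file_count="multiple")
    result_table = gr.Dataframe(headers=["File Name", "Classification Result"], wrap=True)
    file_input.change(fn=demo_batch_classify, inputs=file_input, outputs=result_table)

demo.launch()
```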