Lucas ARRIESSE committed on
Commit
f282fc7
·
1 Parent(s): 0925e1c

Add ability to do free form conversion of user files

Browse files
Files changed (3) hide show
  1. api/docs.py +95 -46
  2. static/index.html +29 -4
  3. static/js/app.js +34 -1
api/docs.py CHANGED
@@ -16,7 +16,7 @@ import re
16
  import tempfile
17
  from lxml import etree
18
  from bs4 import BeautifulSoup
19
- from fastapi import Depends, HTTPException
20
  from dependencies import get_http_client, get_llm_router
21
  from fastapi.responses import StreamingResponse
22
  from litellm.router import Router
@@ -105,6 +105,40 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
105
  return out_bytes
106
 
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  # Rate limit of FTP downloads per minute
109
  FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=60, time_period=60)
110
  # Max number of parallel workers downloading
@@ -208,49 +242,9 @@ FORMAT_MIME_TYPES = {
208
  ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation"
209
  }
210
 
211
-
212
async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> list[str]:
    """Download the specified TDoc and convert it to a list of text lines.

    Args:
        doc_id: Identifier of the document; used in log messages and passed
            to ``convert_file`` for legacy-format conversions.
        url: Location of the document archive to download.
        client: HTTP client handed to ``get_doc_archive``.

    Returns:
        A list starting with a ``"\\n"`` element, followed by every non-blank
        line of the extracted text with surrounding whitespace stripped.

    Raises:
        ValueError: If the archive's file extension is not supported.
    """
    # Grab the document archive
    filename, ext, bytes = await get_doc_archive(url, client)

    if ext == ".doc":
        logging.debug(f"Converting {filename} .doc --> .docx")
        converted = await convert_file(bytes, doc_id, "doc", "docx")
        payload, mime_type = converted.read(), FORMAT_MIME_TYPES[".docx"]
    elif ext == ".docx":
        # Applying doc revisions to docx files (especially for pCR / draftCR files)
        logging.debug(f"Updating .docx revisions for {doc_id}.")
        applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
        payload, mime_type = applied_revision.read(), FORMAT_MIME_TYPES[".docx"]
    elif ext == ".ppt":
        logging.debug(f"Converting {filename} .ppt --> .pptx")
        converted = await convert_file(bytes, doc_id, "ppt", "pptx")
        payload, mime_type = converted.read(), FORMAT_MIME_TYPES[".pptx"]
    elif ext in FORMAT_MIME_TYPES:  # file extension is supported as-is
        payload, mime_type = bytes.read(), FORMAT_MIME_TYPES[ext]
    else:
        # ValueError is still an Exception, so broad handlers keep working.
        raise ValueError(
            f"Unsupported file type: {ext}, filename: {filename}")

    extracted_data = await extract_bytes(payload, mime_type, config=KREUZBERG_CONFIG)
    final_text: str = extracted_data.content

    # include an empty line in the beginning
    txt_data = ["\n"] + [line.strip()
                         for line in final_text.splitlines() if line.strip()]

    return txt_data
250
-
251
-
252
  # ============================================= Doc routes =========================================================
253
 
 
254
  @router.post("/get_meetings", response_model=GetMeetingsResponse)
255
  async def get_meetings(req: GetMeetingsRequest, http_client: AsyncClient = Depends(get_http_client)):
256
  """
@@ -358,10 +352,11 @@ async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Dep
358
  text = re.sub(r'[^\w\s-]', '', text).strip()
359
  return text if text else "_unspecified_agenda_item"
360
 
361
- async def _process_single_document(item: DocInfo) -> Tuple[bool, bytes]:
362
  """Attempts to convert a document to text and returns success status and content."""
363
  try:
364
- text_lines = await doc_to_txt(item.document, item.url, http_client)
 
365
  content_bytes = "\n".join(text_lines).encode("utf-8")
366
  return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
367
  except Exception as e:
@@ -399,6 +394,59 @@ async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Dep
399
  # ======================================================================================================================================================================================
400
 
401
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  class ProgressUpdate(BaseModel):
403
  """Defines the structure of a single SSE message."""
404
  status: Literal["progress", "complete"]
@@ -429,8 +477,9 @@ async def extract_requirements_from_docs(req: ExtractRequirementsRequest, llm_ro
429
 
430
  # convert the docx to txt for use
431
  try:
432
- doc = await doc_to_txt(doc_id, url, http_client)
433
- full = "\n".join(doc)
 
434
  except Exception as e:
435
  fmt = "".join(traceback.format_exception(e))
436
  logging.error(f"Failed to process doc {doc_id} : {fmt}")
 
16
  import tempfile
17
  from lxml import etree
18
  from bs4 import BeautifulSoup
19
+ from fastapi import Depends, File, HTTPException, UploadFile
20
  from dependencies import get_http_client, get_llm_router
21
  from fastapi.responses import StreamingResponse
22
  from litellm.router import Router
 
105
  return out_bytes
106
 
107
 
108
async def convert_to_txt(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
    """Convert an in-memory document into a list of stripped, non-blank text lines.

    Args:
        filename: Name of the document; used in log messages and passed to
            ``convert_file`` for legacy-format conversions.
        ext: File extension including the leading dot (e.g. ``".docx"``).
            Matching is case-insensitive.
        bytes: Buffer holding the raw document. The name shadows the builtin
            but is kept so existing callers are unaffected.

    Returns:
        A list whose first element is an empty string, followed by every
        non-blank line of the extracted text with surrounding whitespace
        stripped.

    Raises:
        ValueError: If the extension is not supported.
    """
    # User-supplied files often carry upper-case extensions (".DOCX", ".PDF").
    ext = ext.lower()

    # Legacy binary Office formats are first converted to their OOXML equivalent.
    legacy_targets = {".doc": ".docx", ".ppt": ".pptx"}

    if ext in legacy_targets:
        target_ext = legacy_targets[ext]
        logging.debug(f"Converting {filename} {ext} --> {target_ext}")
        converted = await convert_file(bytes, filename, ext[1:], target_ext[1:])
        payload, mime_type = converted.read(), FORMAT_MIME_TYPES[target_ext]
    elif ext == ".docx":
        # Applying doc revisions to docx files (especially for pCR / draftCR files)
        logging.debug(f"Updating .docx revisions for {filename}.")
        applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
        payload, mime_type = applied_revision.read(), FORMAT_MIME_TYPES[".docx"]
    elif ext in FORMAT_MIME_TYPES:  # file extension is supported as-is
        payload, mime_type = bytes.read(), FORMAT_MIME_TYPES[ext]
    else:
        # ValueError is still an Exception, so broad handlers keep working.
        raise ValueError(
            f"Unsupported file type: {ext}, filename: {filename}")

    extracted_data = await extract_bytes(payload, mime_type, config=KREUZBERG_CONFIG)
    final_text: str = extracted_data.content

    # include an empty line in the beginning
    txt_data = [""] + [line.strip()
                       for line in final_text.splitlines() if line.strip()]

    return txt_data
141
+
142
  # Rate limit of FTP downloads per minute
143
  FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=60, time_period=60)
144
  # Max number of parallel workers downloading
 
242
  ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation"
243
  }
244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  # ============================================= Doc routes =========================================================
246
 
247
+
248
  @router.post("/get_meetings", response_model=GetMeetingsResponse)
249
  async def get_meetings(req: GetMeetingsRequest, http_client: AsyncClient = Depends(get_http_client)):
250
  """
 
352
  text = re.sub(r'[^\w\s-]', '', text).strip()
353
  return text if text else "_unspecified_agenda_item"
354
 
355
+ async def _process_single_document(item: DocInfo):
356
  """Attempts to convert a document to text and returns success status and content."""
357
  try:
358
+ filename, ext, bytes = await get_doc_archive(item.url, http_client)
359
+ text_lines = await convert_to_txt(filename, ext, bytes)
360
  content_bytes = "\n".join(text_lines).encode("utf-8")
361
  return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
362
  except Exception as e:
 
394
  # ======================================================================================================================================================================================
395
 
396
 
397
@router.post("/download_user_docs")
async def download_user_docs(files: list[UploadFile] = File(...)):
    """Convert user-uploaded files to text and return them as a single zip archive.

    Every upload is read into memory and converted with ``convert_to_txt``.
    A file whose conversion fails is still represented in the archive, as a
    ``failed_<name>.txt`` entry containing the error message.

    Args:
        files: The uploaded files from the multipart form field ``files``.

    Returns:
        A ``StreamingResponse`` carrying the zip archive as an attachment
        named ``user_files.zip``.
    """
    file_infos = []

    # retrieve all files
    for file in files:
        # The filename is client-controlled: it may be absent and may contain
        # directory components, which must not leak into zip entry names.
        raw_name = (file.filename or "").replace("\\", "/")
        filename, ext = os.path.splitext(os.path.basename(raw_name))
        file_infos.append({
            "filename": filename or "unnamed",
            # Lower-case so ".DOCX" is treated the same as ".docx".
            "extension": ext.lower(),
            "content": io.BytesIO(await file.read()),
        })

    filenames = [file["filename"] for file in file_infos]
    logging.info(f"Got {len(file_infos)} user files to convert.")
    logging.info(f"Filenames: {filenames}")

    # convert files to text
    async def _process_single_document(item: dict):
        """Convert one upload; on failure return the error text flagged as failed."""
        try:
            text_lines = await convert_to_txt(item["filename"], item["extension"], item["content"])
            content_bytes = "\n".join(text_lines).encode("utf-8")
            return {"doc_id": item["filename"], "content": content_bytes}
        except Exception as e:
            doc = item["filename"]
            logging.warning(
                f"Failed to process document '{doc}': {e}")
            error_message = f"Document '{doc}' text extraction failed: {e}".encode(
                "utf-8")
            return {"doc_id": doc, "content": error_message, "failed": True}

    convert_tasks = await asyncio.gather(*[_process_single_document(file) for file in file_infos], return_exceptions=False)

    zip_buffer = io.BytesIO()
    # Uploads such as "a.doc" and "a.docx" share a stem; track used names so
    # the archive never ends up with two entries of the same name.
    used_names: set[str] = set()
    with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
        for task in convert_tasks:
            prefix = "failed_" if "failed" in task else ""
            stem = f"{prefix}{task['doc_id']}"
            entry_name = f"{stem}.txt"
            suffix = 1
            while entry_name in used_names:
                entry_name = f"{stem}_{suffix}.txt"
                suffix += 1
            used_names.add(entry_name)
            zip_file.writestr(entry_name, task["content"])

    zip_buffer.seek(0)

    return StreamingResponse(
        zip_buffer,
        media_type="application/zip",
        headers={"Content-Disposition": "attachment; filename=user_files.zip"}
    )
446
+
447
+ # ======================================================================================================================================================================================
448
+
449
+
450
  class ProgressUpdate(BaseModel):
451
  """Defines the structure of a single SSE message."""
452
  status: Literal["progress", "complete"]
 
477
 
478
  # convert the docx to txt for use
479
  try:
480
+ filename, ext, bytes = await get_doc_archive(url, http_client)
481
+ txt_data = await convert_to_txt(filename, ext, bytes)
482
+ full = "\n".join(txt_data)
483
  except Exception as e:
484
  fmt = "".join(traceback.format_exception(e))
485
  logging.error(f"Failed to process doc {doc_id} : {fmt}")
static/index.html CHANGED
@@ -13,7 +13,7 @@
13
 
14
  <body class="bg-gray-100 min-h-screen">
15
  <!-- Loading Overlay -->
16
- <div id="loading-overlay" class="fixed inset-0 bg-black/50 flex items-center justify-center z-50 hidden">
17
  <div class="bg-white p-6 rounded-lg shadow-lg text-center">
18
  <span class="loading loading-spinner loading-xl"></span>
19
  <p id="progress-text" class="text-gray-700">Chargement en cours...</p>
@@ -156,17 +156,21 @@
156
  <!-- Data Table Informations -->
157
  <div class="flex justify-between items-center mb-2 pt-5" id="data-table-info-container">
158
  <div class="flex gap-2 items-center">
 
159
  <div class="tooltip" data-tip="Extract requirements from selected pCR / CR documents">
160
  <button id="extract-requirements-btn"
161
  class="btn bg-orange-300 text-white text-sm rounded px-3 py-1 shadow hover:bg-orange-600">💉
162
  Extract Requirements from CRs
163
  </button>
164
  </div>
 
 
165
  <div class="tooltip" data-tip="Download all selected docs as text files">
166
  <div class="dropdown">
167
- <div tabindex="0" role="button" class="btn text-sm rounded px-3 py-1 shadow cursor-pointer">
 
168
  📦 Download </div>
169
- <div tabindex="0" class="dropdown-content card card-sm bg-base-100 z-1 w-64 shadow-md">
170
  <div class="card-body space-y-2">
171
  <label class="label">
172
  <input class="checkbox checkbox-primary" name="download-sorted-files"
@@ -174,13 +178,34 @@
174
  <p class="text-m">Sort files by agenda</p>
175
  </label>
176
  <button id="download-tdocs-btn"
177
- class="text-sm rounded px-3 py-1 shadow cursor-pointer">
178
  📦 Download docs
179
  </button>
180
  </div>
181
  </div>
182
  </div>
183
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  </div>
185
 
186
  <!-- document counts -->
 
13
 
14
  <body class="bg-gray-100 min-h-screen">
15
  <!-- Loading Overlay -->
16
+ <div id="loading-overlay" class="fixed inset-0 bg-black/50 flex items-center justify-center z-500 hidden">
17
  <div class="bg-white p-6 rounded-lg shadow-lg text-center">
18
  <span class="loading loading-spinner loading-xl"></span>
19
  <p id="progress-text" class="text-gray-700">Chargement en cours...</p>
 
156
  <!-- Data Table Informations -->
157
  <div class="flex justify-between items-center mb-2 pt-5" id="data-table-info-container">
158
  <div class="flex gap-2 items-center">
159
+ <!--Extract xCR button-->
160
  <div class="tooltip" data-tip="Extract requirements from selected pCR / CR documents">
161
  <button id="extract-requirements-btn"
162
  class="btn bg-orange-300 text-white text-sm rounded px-3 py-1 shadow hover:bg-orange-600">💉
163
  Extract Requirements from CRs
164
  </button>
165
  </div>
166
+
167
+ <!--Download button-->
168
  <div class="tooltip" data-tip="Download all selected docs as text files">
169
  <div class="dropdown">
170
+ <div tabindex="0" role="button"
171
+ class="btn btn-primary text-sm rounded px-3 py-1 shadow cursor-pointer">
172
  📦 Download </div>
173
+ <div class="dropdown-content card card-sm bg-base-100 z-1 w-64 shadow-md">
174
  <div class="card-body space-y-2">
175
  <label class="label">
176
  <input class="checkbox checkbox-primary" name="download-sorted-files"
 
178
  <p class="text-m">Sort files by agenda</p>
179
  </label>
180
  <button id="download-tdocs-btn"
181
+ class="btn text-sm rounded px-3 py-1 shadow cursor-pointer">
182
  📦 Download docs
183
  </button>
184
  </div>
185
  </div>
186
  </div>
187
  </div>
188
+
189
+ <!--Free form convert zone-->
190
+ <div class="tooltip" data-tip="Convert user uploaded files to text">
191
+ <div class="dropdown">
192
+ <div tabindex="0" role="button"
193
+ class="btn btn-secondary text-sm rounded px-3 py-1 shadow cursor-pointer">
194
+ ⚗ Freeform convert </div>
195
+ <div class="dropdown-content card card-sm bg-base-100 z-1 w-128 shadow-md">
196
+ <div class="card-body space-y-2">
197
+ <fieldset class="fieldset">
198
+ <legend class="fieldset-legend">Select files to convert to text</legend>
199
+ <input id="freeform-convert-files" type="file" class="file-input" multiple />
200
+ </fieldset>
201
+ <button id="freeform-convert-btn"
202
+ class="btn text-sm rounded px-3 py-1 shadow cursor-pointer">
203
+ 📦 Download converted docs
204
+ </button>
205
+ </div>
206
+ </div>
207
+ </div>
208
+ </div>
209
  </div>
210
 
211
  <!-- document counts -->
static/js/app.js CHANGED
@@ -265,6 +265,37 @@ async function downloadTDocs() {
265
  }
266
  }
267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  /**
269
  * Génère un nom de fichier pour le téléchargement
270
  * @returns {string} Nom du fichier
@@ -1104,4 +1135,6 @@ document.getElementById('read-assessment-button').addEventListener('click', _ =>
1104
  // Events des boutons pour le drafting de solutions
1105
  document.getElementById('refine-btn').addEventListener('click', handleDraftRefine);
1106
  document.getElementById('fto-analysis-btn').addEventListener('click', handleFTOAnalysis);
1107
- document.getElementById('export-timeline-btn').addEventListener('click', handleExportDrafts)
 
 
 
265
  }
266
  }
267
 
268
/**
 * Sends the files selected in the freeform-convert file input to the
 * `/docs/download_user_docs` endpoint and downloads the returned archive.
 *
 * Alerts the user when no file is selected or when the request fails.
 * The loading overlay is always hidden once the request settles.
 */
async function downloadFreeformDocs() {
    const files = document.getElementById('freeform-convert-files').files;

    if (!files.length) {
        alert("Please select at least one file to convert.");
        return;
    }

    const formData = new FormData();
    for (const file of files)
        formData.append("files", file);

    try {
        showLoadingOverlay("Converting user docs to text files");
        const response = await fetch("/docs/download_user_docs", {
            method: "POST",
            body: formData
        });

        // fetch() only rejects on network errors; an HTTP error status must be
        // checked explicitly, otherwise the error body is saved as the zip.
        if (!response.ok)
            throw new Error(`Conversion request failed with HTTP status ${response.status}`);

        const blob = await response.blob();
        downloadBlob(blob, "user_files");
    }
    catch (err) {
        console.error("Freeform conversion failed:", err);
        alert(`Failed to convert the selected files: ${err.message}`);
    } finally {
        hideLoadingOverlay();
    }
}
298
+
299
  /**
300
  * Génère un nom de fichier pour le téléchargement
301
  * @returns {string} Nom du fichier
 
1135
  // Events des boutons pour le drafting de solutions
1136
  document.getElementById('refine-btn').addEventListener('click', handleDraftRefine);
1137
  document.getElementById('fto-analysis-btn').addEventListener('click', handleFTOAnalysis);
1138
+ document.getElementById('export-timeline-btn').addEventListener('click', handleExportDrafts);
1139
+
1140
+ document.getElementById('freeform-convert-btn').addEventListener('click', downloadFreeformDocs);