Lucas ARRIESSE
committed on
Commit
·
f282fc7
1
Parent(s):
0925e1c
Add ability to do free form conversion of user files
Browse files- api/docs.py +95 -46
- static/index.html +29 -4
- static/js/app.js +34 -1
api/docs.py
CHANGED
@@ -16,7 +16,7 @@ import re
|
|
16 |
import tempfile
|
17 |
from lxml import etree
|
18 |
from bs4 import BeautifulSoup
|
19 |
-
from fastapi import Depends, HTTPException
|
20 |
from dependencies import get_http_client, get_llm_router
|
21 |
from fastapi.responses import StreamingResponse
|
22 |
from litellm.router import Router
|
@@ -105,6 +105,40 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
|
|
105 |
return out_bytes
|
106 |
|
107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
# Rate limit of FTP downloads per minute
|
109 |
FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=60, time_period=60)
|
110 |
# Max number of parallel workers downloading
|
@@ -208,49 +242,9 @@ FORMAT_MIME_TYPES = {
|
|
208 |
".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
209 |
}
|
210 |
|
211 |
-
|
212 |
-
async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> list[str]:
    """
    Download the specified TDoc and convert it to text.

    Args:
        doc_id: Identifier of the document; passed to `convert_file` as the
            file name for legacy `.doc` / `.ppt` conversions and used in logs.
        url: Location of the document archive, handed to `get_doc_archive`.
        client: HTTP client handed to `get_doc_archive`.

    Returns:
        The stripped, non-empty lines of the extracted text, preceded by a
        single newline entry.

    Raises:
        ValueError: If the file extension reported by `get_doc_archive` is
            neither a legacy Office format handled here nor a key of
            `FORMAT_MIME_TYPES`.
    """

    # Grab the document archive
    filename, ext, archive = await get_doc_archive(url, client)

    # Each branch only decides *what* bytes to extract and under which MIME
    # type; the extraction itself is shared below.
    if ext == ".doc":
        logging.debug(f"Converting {filename} .doc --> .docx")
        converted = await convert_file(archive, doc_id, "doc", "docx")
        payload, mime_type = converted.read(), FORMAT_MIME_TYPES[".docx"]
    elif ext == ".docx":
        # Applying doc revisions to docx files (especially for pCR / draftCR files)
        logging.debug(f"Updating .docx revisions for {doc_id}.")
        revised = apply_docx_revisions(zipfile.ZipFile(archive))
        payload, mime_type = revised.read(), FORMAT_MIME_TYPES[".docx"]
    elif ext == ".ppt":
        logging.debug(f"Converting {filename} .ppt --> .pptx")
        converted = await convert_file(archive, doc_id, "ppt", "pptx")
        payload, mime_type = converted.read(), FORMAT_MIME_TYPES[".pptx"]
    elif ext in FORMAT_MIME_TYPES:  # file extension is supported
        payload, mime_type = archive.read(), FORMAT_MIME_TYPES[ext]
    else:
        raise ValueError(
            f"Unsupported file type: {ext}, filename: {filename}")

    extracted_data = await extract_bytes(payload, mime_type, config=KREUZBERG_CONFIG)
    final_text = extracted_data.content

    # include an empty line in the beginning
    stripped_lines = (line.strip() for line in final_text.splitlines())
    return ["\n"] + [line for line in stripped_lines if line]
|
250 |
-
|
251 |
-
|
252 |
# ============================================= Doc routes =========================================================
|
253 |
|
|
|
254 |
@router.post("/get_meetings", response_model=GetMeetingsResponse)
|
255 |
async def get_meetings(req: GetMeetingsRequest, http_client: AsyncClient = Depends(get_http_client)):
|
256 |
"""
|
@@ -358,10 +352,11 @@ async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Dep
|
|
358 |
text = re.sub(r'[^\w\s-]', '', text).strip()
|
359 |
return text if text else "_unspecified_agenda_item"
|
360 |
|
361 |
-
async def _process_single_document(item: DocInfo)
|
362 |
"""Attempts to convert a document to text and returns success status and content."""
|
363 |
try:
|
364 |
-
|
|
|
365 |
content_bytes = "\n".join(text_lines).encode("utf-8")
|
366 |
return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
|
367 |
except Exception as e:
|
@@ -399,6 +394,59 @@ async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Dep
|
|
399 |
# ======================================================================================================================================================================================
|
400 |
|
401 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
402 |
class ProgressUpdate(BaseModel):
|
403 |
"""Defines the structure of a single SSE message."""
|
404 |
status: Literal["progress", "complete"]
|
@@ -429,8 +477,9 @@ async def extract_requirements_from_docs(req: ExtractRequirementsRequest, llm_ro
|
|
429 |
|
430 |
# convert the docx to txt for use
|
431 |
try:
|
432 |
-
|
433 |
-
|
|
|
434 |
except Exception as e:
|
435 |
fmt = "".join(traceback.format_exception(e))
|
436 |
logging.error(f"Failed to process doc {doc_id} : {fmt}")
|
|
|
16 |
import tempfile
|
17 |
from lxml import etree
|
18 |
from bs4 import BeautifulSoup
|
19 |
+
from fastapi import Depends, File, HTTPException, UploadFile
|
20 |
from dependencies import get_http_client, get_llm_router
|
21 |
from fastapi.responses import StreamingResponse
|
22 |
from litellm.router import Router
|
|
|
105 |
return out_bytes
|
106 |
|
107 |
|
108 |
+
async def convert_to_txt(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
    """Convert a file given as (filename, ext, bytes) to a list of text lines.

    Args:
        filename: Name of the file; used in log/error messages and passed to
            `convert_file` for legacy `.doc` / `.ppt` conversions.
        ext: File extension including the leading dot (e.g. ".docx"). It is
            matched case-insensitively so that "REPORT.DOCX" is handled like
            "report.docx".
        bytes: In-memory file content. The name shadows the builtin but is
            kept because it is part of the function's signature.

    Returns:
        The stripped, non-empty lines of the extracted text, preceded by one
        empty string.

    Raises:
        ValueError: If the extension is neither a legacy Office format handled
            here nor a key of `FORMAT_MIME_TYPES`.
    """
    # Work with a local alias so the builtin `bytes` is not used by accident.
    stream = bytes
    # User-supplied names frequently carry upper-case extensions.
    normalized_ext = ext.lower()

    # Each branch only decides *what* bytes to extract and under which MIME
    # type; the extraction itself is shared below.
    if normalized_ext == ".doc":
        logging.debug(f"Converting {filename} .doc --> .docx")
        converted = await convert_file(stream, filename, "doc", "docx")
        payload, mime_type = converted.read(), FORMAT_MIME_TYPES[".docx"]
    elif normalized_ext == ".docx":
        # Applying doc revisions to docx files (especially for pCR / draftCR files)
        logging.debug(f"Updating .docx revisions for {filename}.")
        revised = apply_docx_revisions(zipfile.ZipFile(stream))
        payload, mime_type = revised.read(), FORMAT_MIME_TYPES[".docx"]
    elif normalized_ext == ".ppt":
        logging.debug(f"Converting {filename} .ppt --> .pptx")
        converted = await convert_file(stream, filename, "ppt", "pptx")
        payload, mime_type = converted.read(), FORMAT_MIME_TYPES[".pptx"]
    else:
        # Prefer an exact match so any existing key keeps working unchanged,
        # then fall back to the case-normalized extension.
        mime_key = ext if ext in FORMAT_MIME_TYPES else normalized_ext
        if mime_key not in FORMAT_MIME_TYPES:
            raise ValueError(
                f"Unsupported file type: {ext}, filename: {filename}")
        payload, mime_type = stream.read(), FORMAT_MIME_TYPES[mime_key]

    extracted_data = await extract_bytes(payload, mime_type, config=KREUZBERG_CONFIG)
    final_text = extracted_data.content

    # include an empty line in the beginning
    stripped_lines = (line.strip() for line in final_text.splitlines())
    return [""] + [line for line in stripped_lines if line]
|
141 |
+
|
142 |
# Rate limit of FTP downloads per minute
|
143 |
FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=60, time_period=60)
|
144 |
# Max number of parallel workers downloading
|
|
|
242 |
".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
243 |
}
|
244 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
# ============================================= Doc routes =========================================================
|
246 |
|
247 |
+
|
248 |
@router.post("/get_meetings", response_model=GetMeetingsResponse)
|
249 |
async def get_meetings(req: GetMeetingsRequest, http_client: AsyncClient = Depends(get_http_client)):
|
250 |
"""
|
|
|
352 |
text = re.sub(r'[^\w\s-]', '', text).strip()
|
353 |
return text if text else "_unspecified_agenda_item"
|
354 |
|
355 |
+
async def _process_single_document(item: DocInfo):
|
356 |
"""Attempts to convert a document to text and returns success status and content."""
|
357 |
try:
|
358 |
+
filename, ext, bytes = await get_doc_archive(item.url, http_client)
|
359 |
+
text_lines = await convert_to_txt(filename, ext, bytes)
|
360 |
content_bytes = "\n".join(text_lines).encode("utf-8")
|
361 |
return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
|
362 |
except Exception as e:
|
|
|
394 |
# ======================================================================================================================================================================================
|
395 |
|
396 |
|
397 |
+
@router.post("/download_user_docs")
async def download_user_docs(files: list[UploadFile] = File(...)):
    """Freeform convert the user files into text and download them as a single zip file.

    Each uploaded file is converted with `convert_to_txt`. A file whose
    conversion raises is still represented in the archive, as
    `failed_<name>.txt` containing the error message, so one bad upload does
    not abort the whole request.

    Args:
        files: The uploaded files (multipart form field "files").

    Returns:
        A `StreamingResponse` carrying the zip archive (`user_files.zip`).
    """
    file_infos = []

    # retrieve all files
    for file in files:
        # The file name is client-controlled: keep only its last path component
        # so directory parts cannot leak into the name handed to the converter
        # or into the zip entry names. A missing name is treated as empty.
        raw_name = (file.filename or "").replace("\\", "/")
        filename, ext = os.path.splitext(os.path.basename(raw_name))
        file_infos.append({
            "filename": filename or "unnamed",
            "extension": ext,
            "content": io.BytesIO(await file.read()),
        })

    filenames = [info["filename"] for info in file_infos]
    logging.info(f"Got {len(file_infos)} user files to convert.")
    logging.info(f"Filenames: {filenames}")

    # convert files to text
    async def _convert_one(item: dict) -> dict:
        doc = item["filename"]
        try:
            text_lines = await convert_to_txt(doc, item["extension"], item["content"])
            return {"doc_id": doc, "content": "\n".join(text_lines).encode("utf-8")}
        except Exception as e:
            # Best effort: report the failure inside the archive instead of
            # failing the whole request.
            logging.warning(
                f"Failed to process document '{doc}': {e}")
            error_message = f"Document '{doc}' text extraction failed: {e}".encode(
                "utf-8")
            return {"doc_id": doc, "content": error_message, "failed": True}

    results = await asyncio.gather(*[_convert_one(info) for info in file_infos])

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
        used_names: set[str] = set()
        for result in results:
            prefix = "failed_" if "failed" in result else ""
            stem = f"{prefix}{result['doc_id']}"

            # Uploads such as "spec.doc" and "spec.docx" share a stem; suffix
            # later ones so no entry hides another inside the archive.
            entry_name = f"{stem}.txt"
            suffix = 1
            while entry_name in used_names:
                entry_name = f"{stem}_{suffix}.txt"
                suffix += 1
            used_names.add(entry_name)

            zip_file.writestr(entry_name, result["content"])

    zip_buffer.seek(0)

    return StreamingResponse(
        zip_buffer,
        media_type="application/zip",
        headers={"Content-Disposition": "attachment; filename=user_files.zip"}
    )
|
446 |
+
|
447 |
+
# ======================================================================================================================================================================================
|
448 |
+
|
449 |
+
|
450 |
class ProgressUpdate(BaseModel):
|
451 |
"""Defines the structure of a single SSE message."""
|
452 |
status: Literal["progress", "complete"]
|
|
|
477 |
|
478 |
# convert the docx to txt for use
|
479 |
try:
|
480 |
+
filename, ext, bytes = await get_doc_archive(url, http_client)
|
481 |
+
txt_data = await convert_to_txt(filename, ext, bytes)
|
482 |
+
full = "\n".join(txt_data)
|
483 |
except Exception as e:
|
484 |
fmt = "".join(traceback.format_exception(e))
|
485 |
logging.error(f"Failed to process doc {doc_id} : {fmt}")
|
static/index.html
CHANGED
@@ -13,7 +13,7 @@
|
|
13 |
|
14 |
<body class="bg-gray-100 min-h-screen">
|
15 |
<!-- Loading Overlay -->
|
16 |
-
<div id="loading-overlay" class="fixed inset-0 bg-black/50 flex items-center justify-center z-
|
17 |
<div class="bg-white p-6 rounded-lg shadow-lg text-center">
|
18 |
<span class="loading loading-spinner loading-xl"></span>
|
19 |
<p id="progress-text" class="text-gray-700">Chargement en cours...</p>
|
@@ -156,17 +156,21 @@
|
|
156 |
<!-- Data Table Informations -->
|
157 |
<div class="flex justify-between items-center mb-2 pt-5" id="data-table-info-container">
|
158 |
<div class="flex gap-2 items-center">
|
|
|
159 |
<div class="tooltip" data-tip="Extract requirements from selected pCR / CR documents">
|
160 |
<button id="extract-requirements-btn"
|
161 |
class="btn bg-orange-300 text-white text-sm rounded px-3 py-1 shadow hover:bg-orange-600">💉
|
162 |
Extract Requirements from CRs
|
163 |
</button>
|
164 |
</div>
|
|
|
|
|
165 |
<div class="tooltip" data-tip="Download all selected docs as text files">
|
166 |
<div class="dropdown">
|
167 |
-
<div tabindex="0" role="button"
|
|
|
168 |
📦 Download </div>
|
169 |
-
<div
|
170 |
<div class="card-body space-y-2">
|
171 |
<label class="label">
|
172 |
<input class="checkbox checkbox-primary" name="download-sorted-files"
|
@@ -174,13 +178,34 @@
|
|
174 |
<p class="text-m">Sort files by agenda</p>
|
175 |
</label>
|
176 |
<button id="download-tdocs-btn"
|
177 |
-
class="text-sm rounded px-3 py-1 shadow cursor-pointer">
|
178 |
📦 Download docs
|
179 |
</button>
|
180 |
</div>
|
181 |
</div>
|
182 |
</div>
|
183 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
</div>
|
185 |
|
186 |
<!-- document counts -->
|
|
|
13 |
|
14 |
<body class="bg-gray-100 min-h-screen">
|
15 |
<!-- Loading Overlay -->
|
16 |
+
<div id="loading-overlay" class="fixed inset-0 bg-black/50 flex items-center justify-center z-500 hidden">
|
17 |
<div class="bg-white p-6 rounded-lg shadow-lg text-center">
|
18 |
<span class="loading loading-spinner loading-xl"></span>
|
19 |
<p id="progress-text" class="text-gray-700">Chargement en cours...</p>
|
|
|
156 |
<!-- Data Table Informations -->
|
157 |
<div class="flex justify-between items-center mb-2 pt-5" id="data-table-info-container">
|
158 |
<div class="flex gap-2 items-center">
|
159 |
+
<!--Extract xCR button-->
|
160 |
<div class="tooltip" data-tip="Extract requirements from selected pCR / CR documents">
|
161 |
<button id="extract-requirements-btn"
|
162 |
class="btn bg-orange-300 text-white text-sm rounded px-3 py-1 shadow hover:bg-orange-600">💉
|
163 |
Extract Requirements from CRs
|
164 |
</button>
|
165 |
</div>
|
166 |
+
|
167 |
+
<!--Download button-->
|
168 |
<div class="tooltip" data-tip="Download all selected docs as text files">
|
169 |
<div class="dropdown">
|
170 |
+
<div tabindex="0" role="button"
|
171 |
+
class="btn btn-primary text-sm rounded px-3 py-1 shadow cursor-pointer">
|
172 |
📦 Download </div>
|
173 |
+
<div class="dropdown-content card card-sm bg-base-100 z-1 w-64 shadow-md">
|
174 |
<div class="card-body space-y-2">
|
175 |
<label class="label">
|
176 |
<input class="checkbox checkbox-primary" name="download-sorted-files"
|
|
|
178 |
<p class="text-m">Sort files by agenda</p>
|
179 |
</label>
|
180 |
<button id="download-tdocs-btn"
|
181 |
+
class="btn text-sm rounded px-3 py-1 shadow cursor-pointer">
|
182 |
📦 Download docs
|
183 |
</button>
|
184 |
</div>
|
185 |
</div>
|
186 |
</div>
|
187 |
</div>
|
188 |
+
|
189 |
+
<!--Free form convert zone-->
|
190 |
+
<div class="tooltip" data-tip="Convert user uploaded files to text">
|
191 |
+
<div class="dropdown">
|
192 |
+
<div tabindex="0" role="button"
|
193 |
+
class="btn btn-secondary text-sm rounded px-3 py-1 shadow cursor-pointer">
|
194 |
+
⚗ Freeform convert </div>
|
195 |
+
<div class="dropdown-content card card-sm bg-base-100 z-1 w-128 shadow-md">
|
196 |
+
<div class="card-body space-y-2">
|
197 |
+
<fieldset class="fieldset">
|
198 |
+
<legend class="fieldset-legend">Select files to convert to text</legend>
|
199 |
+
<input id="freeform-convert-files" type="file" class="file-input" multiple />
|
200 |
+
</fieldset>
|
201 |
+
<button id="freeform-convert-btn"
|
202 |
+
class="btn text-sm rounded px-3 py-1 shadow cursor-pointer">
|
203 |
+
📦 Download converted docs
|
204 |
+
</button>
|
205 |
+
</div>
|
206 |
+
</div>
|
207 |
+
</div>
|
208 |
+
</div>
|
209 |
</div>
|
210 |
|
211 |
<!-- document counts -->
|
static/js/app.js
CHANGED
@@ -265,6 +265,37 @@ async function downloadTDocs() {
|
|
265 |
}
|
266 |
}
|
267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
/**
|
269 |
* Génère un nom de fichier pour le téléchargement
|
270 |
* @returns {string} Nom du fichier
|
@@ -1104,4 +1135,6 @@ document.getElementById('read-assessment-button').addEventListener('click', _ =>
|
|
1104 |
// Events des boutons pour le drafting de solutions
|
1105 |
document.getElementById('refine-btn').addEventListener('click', handleDraftRefine);
|
1106 |
document.getElementById('fto-analysis-btn').addEventListener('click', handleFTOAnalysis);
|
1107 |
-
document.getElementById('export-timeline-btn').addEventListener('click', handleExportDrafts)
|
|
|
|
|
|
265 |
}
|
266 |
}
|
267 |
|
268 |
+
/**
 * Uploads the files selected in the "freeform-convert-files" input to the
 * backend, which converts them to text, then downloads the returned zip archive.
 *
 * Shows the loading overlay for the duration of the request. Asks the user to
 * pick a file when none is selected, and reports network or HTTP failures
 * instead of failing silently.
 *
 * @returns {Promise<void>}
 */
async function downloadFreeformDocs() {
    const files = document.getElementById('freeform-convert-files').files;

    if (!files.length) {
        alert("Please select at least one file to convert.");
        return;
    }

    const formData = new FormData();
    for (const file of files)
        formData.append("files", file);

    try {
        showLoadingOverlay("Converting user docs to text files");
        const response = await fetch("/docs/download_user_docs", {
            method: "POST",
            body: formData
        });

        // fetch() only rejects on network failure; an HTTP error status must be
        // checked explicitly, otherwise the error body would be saved as the zip.
        if (!response.ok)
            throw new Error(`Server responded with ${response.status} ${response.statusText}`);

        const blob = await response.blob();
        downloadBlob(blob, "user_files");
    }
    catch (err) {
        console.error("Freeform conversion failed:", err);
        alert(`Failed to convert the selected files: ${err.message}`);
    } finally {
        hideLoadingOverlay();
    }
}
|
298 |
+
|
299 |
/**
|
300 |
* Génère un nom de fichier pour le téléchargement
|
301 |
* @returns {string} Nom du fichier
|
|
|
1135 |
// Events des boutons pour le drafting de solutions
|
1136 |
document.getElementById('refine-btn').addEventListener('click', handleDraftRefine);
|
1137 |
document.getElementById('fto-analysis-btn').addEventListener('click', handleFTOAnalysis);
|
1138 |
+
document.getElementById('export-timeline-btn').addEventListener('click', handleExportDrafts);
|
1139 |
+
|
1140 |
+
document.getElementById('freeform-convert-btn').addEventListener('click', downloadFreeformDocs);
|