Lucas ARRIESSE
committed on
Commit
·
9f79248
1
Parent(s):
dc794b1
Place files in agenda item directory when downloading tdocs
Browse files
- api/docs.py +26 -12
- schemas.py +2 -0
- static/js/app.js +2 -2
api/docs.py
CHANGED
@@ -22,7 +22,7 @@ from fastapi.responses import StreamingResponse
|
|
22 |
from litellm.router import Router
|
23 |
from kreuzberg import ExtractionConfig, extract_bytes
|
24 |
|
25 |
-
from schemas import GetMeetingDocsRequest, GetMeetingDocsResponse, DocRequirements, DownloadDocsRequest, GetMeetingsRequest, GetMeetingsResponse, ExtractRequirementsRequest, ExtractRequirementsResponse
|
26 |
|
27 |
# API router for requirement extraction from docs / doc list retrieval / download
|
28 |
router = APIRouter(tags=["document extraction"])
|
@@ -237,8 +237,9 @@ async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
|
|
237 |
raise Exception(
|
238 |
f"Unsupported file type: {ext}, filename: {filename}")
|
239 |
|
240 |
-
|
241 |
-
|
|
|
242 |
|
243 |
return txt_data
|
244 |
|
@@ -344,28 +345,41 @@ async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Dep
|
|
344 |
|
345 |
logging.info(f"Downloading TDocs: {document_ids}")
|
346 |
|
347 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
348 |
"""Attempts to convert a document to text and returns success status and content."""
|
349 |
try:
|
350 |
-
text_lines = await doc_to_txt(
|
351 |
content_bytes = "\n".join(text_lines).encode("utf-8")
|
352 |
-
return {"doc_id":
|
353 |
except Exception as e:
|
354 |
logging.warning(
|
355 |
-
f"Failed to process document '{
|
356 |
-
error_message = f"Document '{
|
357 |
"utf-8")
|
358 |
-
return {"doc_id":
|
359 |
|
360 |
-
convert_tasks = await asyncio.gather(*[_process_single_document(doc
|
361 |
|
362 |
zip_buffer = io.BytesIO()
|
363 |
with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
|
364 |
for task in convert_tasks:
|
|
|
|
|
|
|
|
|
365 |
failed = "failed" in task
|
366 |
doc_id = task["doc_id"]
|
367 |
-
|
368 |
-
|
|
|
369 |
|
370 |
zip_buffer.seek(0)
|
371 |
|
|
|
22 |
from litellm.router import Router
|
23 |
from kreuzberg import ExtractionConfig, extract_bytes
|
24 |
|
25 |
+
from schemas import DocInfo, GetMeetingDocsRequest, GetMeetingDocsResponse, DocRequirements, DownloadDocsRequest, GetMeetingsRequest, GetMeetingsResponse, ExtractRequirementsRequest, ExtractRequirementsResponse
|
26 |
|
27 |
# API router for requirement extraction from docs / doc list retrieval / download
|
28 |
router = APIRouter(tags=["document extraction"])
|
|
|
237 |
raise Exception(
|
238 |
f"Unsupported file type: {ext}, filename: {filename}")
|
239 |
|
240 |
+
# include an empty line in the beginning
|
241 |
+
txt_data = ["\n"] + [line.strip()
|
242 |
+
for line in final_text.splitlines() if line.strip()]
|
243 |
|
244 |
return txt_data
|
245 |
|
|
|
345 |
|
346 |
logging.info(f"Downloading TDocs: {document_ids}")
|
347 |
|
348 |
+
# quick function for normalizing agenda item names
|
349 |
+
def __normalize_for_path(text: str) -> str:
|
350 |
+
if not text:
|
351 |
+
return "_unspecified_agenda_item"
|
352 |
+
text = re.sub(r'\s+', '_', text)
|
353 |
+
text = re.sub(r'[^\w\s-]', '', text).strip()
|
354 |
+
return text if text else "_unspecified_agenda_item"
|
355 |
+
|
356 |
+
async def _process_single_document(item: DocInfo) -> Tuple[bool, bytes]:
|
357 |
"""Attempts to convert a document to text and returns success status and content."""
|
358 |
try:
|
359 |
+
text_lines = await doc_to_txt(item.document, item.url, http_client)
|
360 |
content_bytes = "\n".join(text_lines).encode("utf-8")
|
361 |
+
return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
|
362 |
except Exception as e:
|
363 |
logging.warning(
|
364 |
+
f"Failed to process document '{item.document}' from URL '{item.url}': {e}")
|
365 |
+
error_message = f"Document '{item.document}' text extraction failed: {e}".encode(
|
366 |
"utf-8")
|
367 |
+
return {"doc_id": item.document, "content": error_message, "failed": True, "agenda_item": item.agenda_item}
|
368 |
|
369 |
+
convert_tasks = await asyncio.gather(*[_process_single_document(doc) for doc in req.documents], return_exceptions=False)
|
370 |
|
371 |
zip_buffer = io.BytesIO()
|
372 |
with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
|
373 |
for task in convert_tasks:
|
374 |
+
# get agenda item directory
|
375 |
+
agenda_item_str = task.get("agenda_item") or ""
|
376 |
+
directory_name = __normalize_for_path(agenda_item_str)
|
377 |
+
|
378 |
failed = "failed" in task
|
379 |
doc_id = task["doc_id"]
|
380 |
+
base_filename = f"failed_{doc_id}.txt" if failed else f"{doc_id}.txt"
|
381 |
+
full_file_path = f"{directory_name}/{base_filename}"
|
382 |
+
zip_file.writestr(full_file_path, task["content"])
|
383 |
|
384 |
zip_buffer.seek(0)
|
385 |
|
schemas.py
CHANGED
@@ -33,6 +33,8 @@ class DocInfo(BaseModel):
|
|
33 |
url: str
|
34 |
# Document type
|
35 |
type: str
|
|
|
|
|
36 |
|
37 |
|
38 |
class DownloadDocsRequest(BaseModel):
|
|
|
33 |
url: str
|
34 |
# Document type
|
35 |
type: str
|
36 |
+
# Agenda item this document belongs to.
|
37 |
+
agenda_item: str
|
38 |
|
39 |
|
40 |
class DownloadDocsRequest(BaseModel):
|
static/js/app.js
CHANGED
@@ -235,7 +235,7 @@ async function downloadTDocs() {
|
|
235 |
try {
|
236 |
// Extraire les données du tableau avec le format suivant pour la requete backend
|
237 |
// { document: "nom_doc", url: "url_doc", type: "type_de_doc"}
|
238 |
-
const selectedData = extractTableData({ 'TDoc': 'document', 'URL': 'url', 'Type': "type" });
|
239 |
|
240 |
if (selectedData.length === 0) {
|
241 |
alert('Please select at least one document');
|
@@ -319,7 +319,7 @@ function downloadBlob(blob, filename) {
|
|
319 |
* Extrait les requirements des documents sélectionnés
|
320 |
*/
|
321 |
async function extractRequirements() {
|
322 |
-
const selectedData = extractTableData({ 'TDoc': 'document', 'URL': 'url', 'Type': 'type' });
|
323 |
|
324 |
console.log("Selected docs data");
|
325 |
console.log(selectedData);
|
|
|
235 |
try {
|
236 |
// Extraire les données du tableau avec le format suivant pour la requete backend
|
237 |
// { document: "nom_doc", url: "url_doc", type: "type_de_doc"}
|
238 |
+
const selectedData = extractTableData({ 'TDoc': 'document', 'URL': 'url', 'Type': "type", "Agenda": "agenda_item" });
|
239 |
|
240 |
if (selectedData.length === 0) {
|
241 |
alert('Please select at least one document');
|
|
|
319 |
* Extrait les requirements des documents sélectionnés
|
320 |
*/
|
321 |
async function extractRequirements() {
|
322 |
+
const selectedData = extractTableData({ 'TDoc': 'document', 'URL': 'url', 'Type': 'type', "Agenda": "agenda_item" });
|
323 |
|
324 |
console.log("Selected docs data");
|
325 |
console.log(selectedData);
|