Lucas ARRIESSE committed on
Commit
9f79248
·
1 Parent(s): dc794b1

Place files in agenda item directory when downloading tdocs

Browse files
Files changed (3) hide show
  1. api/docs.py +26 -12
  2. schemas.py +2 -0
  3. static/js/app.js +2 -2
api/docs.py CHANGED
@@ -22,7 +22,7 @@ from fastapi.responses import StreamingResponse
22
  from litellm.router import Router
23
  from kreuzberg import ExtractionConfig, extract_bytes
24
 
25
- from schemas import GetMeetingDocsRequest, GetMeetingDocsResponse, DocRequirements, DownloadDocsRequest, GetMeetingsRequest, GetMeetingsResponse, ExtractRequirementsRequest, ExtractRequirementsResponse
26
 
27
  # API router for requirement extraction from docs / doc list retrieval / download
28
  router = APIRouter(tags=["document extraction"])
@@ -237,8 +237,9 @@ async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
237
  raise Exception(
238
  f"Unsupported file type: {ext}, filename: {filename}")
239
 
240
- txt_data = [line.strip()
241
- for line in final_text.splitlines() if line.strip()]
 
242
 
243
  return txt_data
244
 
@@ -344,28 +345,41 @@ async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Dep
344
 
345
  logging.info(f"Downloading TDocs: {document_ids}")
346
 
347
- async def _process_single_document(doc_id: str, doc_url: str) -> Tuple[bool, bytes]:
 
 
 
 
 
 
 
 
348
  """Attempts to convert a document to text and returns success status and content."""
349
  try:
350
- text_lines = await doc_to_txt(doc_id, doc_url, http_client)
351
  content_bytes = "\n".join(text_lines).encode("utf-8")
352
- return {"doc_id": doc_id, "content": content_bytes}
353
  except Exception as e:
354
  logging.warning(
355
- f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
356
- error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
357
  "utf-8")
358
- return {"doc_id": doc_id, "content": error_message, "failed": True}
359
 
360
- convert_tasks = await asyncio.gather(*[_process_single_document(doc.document, doc.url) for doc in req.documents], return_exceptions=False)
361
 
362
  zip_buffer = io.BytesIO()
363
  with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
364
  for task in convert_tasks:
 
 
 
 
365
  failed = "failed" in task
366
  doc_id = task["doc_id"]
367
- safe_filename = f"failed_{doc_id}.txt" if failed else f"{doc_id}.txt"
368
- zip_file.writestr(safe_filename, task["content"])
 
369
 
370
  zip_buffer.seek(0)
371
 
 
22
  from litellm.router import Router
23
  from kreuzberg import ExtractionConfig, extract_bytes
24
 
25
+ from schemas import DocInfo, GetMeetingDocsRequest, GetMeetingDocsResponse, DocRequirements, DownloadDocsRequest, GetMeetingsRequest, GetMeetingsResponse, ExtractRequirementsRequest, ExtractRequirementsResponse
26
 
27
  # API router for requirement extraction from docs / doc list retrieval / download
28
  router = APIRouter(tags=["document extraction"])
 
237
  raise Exception(
238
  f"Unsupported file type: {ext}, filename: {filename}")
239
 
240
+ # include an empty line at the beginning
241
+ txt_data = ["\n"] + [line.strip()
242
+ for line in final_text.splitlines() if line.strip()]
243
 
244
  return txt_data
245
 
 
345
 
346
  logging.info(f"Downloading TDocs: {document_ids}")
347
 
348
+ # quick function for normalizing agenda item names
349
+ def __normalize_for_path(text: str) -> str:
350
+ if not text:
351
+ return "_unspecified_agenda_item"
352
+ text = re.sub(r'\s+', '_', text)
353
+ text = re.sub(r'[^\w\s-]', '', text).strip()
354
+ return text if text else "_unspecified_agenda_item"
355
+
356
+ async def _process_single_document(item: DocInfo) -> Tuple[bool, bytes]:
357
  """Attempts to convert a document to text and returns success status and content."""
358
  try:
359
+ text_lines = await doc_to_txt(item.document, item.url, http_client)
360
  content_bytes = "\n".join(text_lines).encode("utf-8")
361
+ return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
362
  except Exception as e:
363
  logging.warning(
364
+ f"Failed to process document '{item.document}' from URL '{item.url}': {e}")
365
+ error_message = f"Document '{item.document}' text extraction failed: {e}".encode(
366
  "utf-8")
367
+ return {"doc_id": item.document, "content": error_message, "failed": True, "agenda_item": item.agenda_item}
368
 
369
+ convert_tasks = await asyncio.gather(*[_process_single_document(doc) for doc in req.documents], return_exceptions=False)
370
 
371
  zip_buffer = io.BytesIO()
372
  with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
373
  for task in convert_tasks:
374
+ # get agenda item directory
375
+ agenda_item_str = task.get("agenda_item") or ""
376
+ directory_name = __normalize_for_path(agenda_item_str)
377
+
378
  failed = "failed" in task
379
  doc_id = task["doc_id"]
380
+ base_filename = f"failed_{doc_id}.txt" if failed else f"{doc_id}.txt"
381
+ full_file_path = f"{directory_name}/{base_filename}"
382
+ zip_file.writestr(full_file_path, task["content"])
383
 
384
  zip_buffer.seek(0)
385
 
schemas.py CHANGED
@@ -33,6 +33,8 @@ class DocInfo(BaseModel):
33
  url: str
34
  # Document type
35
  type: str
 
 
36
 
37
 
38
  class DownloadDocsRequest(BaseModel):
 
33
  url: str
34
  # Document type
35
  type: str
36
+ # Agenda item this document belongs to.
37
+ agenda_item: str
38
 
39
 
40
  class DownloadDocsRequest(BaseModel):
static/js/app.js CHANGED
@@ -235,7 +235,7 @@ async function downloadTDocs() {
235
  try {
236
  // Extraire les données du tableau avec le format suivant pour la requete backend
237
  // { document: "nom_doc", url: "url_doc", type: "type_de_doc"}
238
- const selectedData = extractTableData({ 'TDoc': 'document', 'URL': 'url', 'Type': "type" });
239
 
240
  if (selectedData.length === 0) {
241
  alert('Please select at least one document');
@@ -319,7 +319,7 @@ function downloadBlob(blob, filename) {
319
  * Extrait les requirements des documents sélectionnés
320
  */
321
  async function extractRequirements() {
322
- const selectedData = extractTableData({ 'TDoc': 'document', 'URL': 'url', 'Type': 'type' });
323
 
324
  console.log("Selected docs data");
325
  console.log(selectedData);
 
235
  try {
236
  // Extraire les données du tableau avec le format suivant pour la requete backend
237
  // { document: "nom_doc", url: "url_doc", type: "type_de_doc"}
238
+ const selectedData = extractTableData({ 'TDoc': 'document', 'URL': 'url', 'Type': "type", "Agenda": "agenda_item" });
239
 
240
  if (selectedData.length === 0) {
241
  alert('Please select at least one document');
 
319
  * Extrait les requirements des documents sélectionnés
320
  */
321
  async function extractRequirements() {
322
+ const selectedData = extractTableData({ 'TDoc': 'document', 'URL': 'url', 'Type': 'type', "Agenda": "agenda_item" });
323
 
324
  console.log("Selected docs data");
325
  console.log(selectedData);