Spaces:

OrganizedProgrammers
/

Docxtract

Running

App Files Community

Lucas ARRIESSE committed 4 days ago

Commit

9f79248

1 Parent(s): dc794b1

Place files in agenda item directory when downloading tdocs

Browse files

Files changed (3) hide show

api/docs.py +26 -12
schemas.py +2 -0
static/js/app.js +2 -2

api/docs.py CHANGED Viewed

@@ -22,7 +22,7 @@ from fastapi.responses import StreamingResponse
 from litellm.router import Router
 from kreuzberg import ExtractionConfig, extract_bytes
-from schemas import GetMeetingDocsRequest, GetMeetingDocsResponse, DocRequirements, DownloadDocsRequest, GetMeetingsRequest, GetMeetingsResponse, ExtractRequirementsRequest, ExtractRequirementsResponse
 # API router for requirement extraction from docs / doc list retrieval / download
 router = APIRouter(tags=["document extraction"])
@@ -237,8 +237,9 @@ async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
             raise Exception(
                 f"Unsupported file type: {ext}, filename: {filename}")
-    txt_data = [line.strip()
-                for line in final_text.splitlines() if line.strip()]
     return txt_data
@@ -344,28 +345,41 @@ async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Dep
     logging.info(f"Downloading TDocs: {document_ids}")
-    async def _process_single_document(doc_id: str, doc_url: str) -> Tuple[bool, bytes]:
         """Attempts to convert a document to text and returns success status and content."""
         try:
-            text_lines = await doc_to_txt(doc_id, doc_url, http_client)
             content_bytes = "\n".join(text_lines).encode("utf-8")
-            return {"doc_id": doc_id, "content": content_bytes}
         except Exception as e:
             logging.warning(
-                f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
-            error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
                 "utf-8")
-            return {"doc_id": doc_id, "content": error_message, "failed": True}
-    convert_tasks = await asyncio.gather(*[_process_single_document(doc.document, doc.url) for doc in req.documents], return_exceptions=False)
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
         for task in convert_tasks:
             failed = "failed" in task
             doc_id = task["doc_id"]
-            safe_filename = f"failed_{doc_id}.txt" if failed else f"{doc_id}.txt"
-            zip_file.writestr(safe_filename, task["content"])
     zip_buffer.seek(0)

 from litellm.router import Router
 from kreuzberg import ExtractionConfig, extract_bytes
+from schemas import DocInfo, GetMeetingDocsRequest, GetMeetingDocsResponse, DocRequirements, DownloadDocsRequest, GetMeetingsRequest, GetMeetingsResponse, ExtractRequirementsRequest, ExtractRequirementsResponse
 # API router for requirement extraction from docs / doc list retrieval / download
 router = APIRouter(tags=["document extraction"])
             raise Exception(
                 f"Unsupported file type: {ext}, filename: {filename}")
+    # include an empty line in the beginning
+    txt_data = ["\n"] + [line.strip()
+                         for line in final_text.splitlines() if line.strip()]
     return txt_data
     logging.info(f"Downloading TDocs: {document_ids}")
+    # quick function for normalizing agenda item names
+    def __normalize_for_path(text: str) -> str:
+        if not text:
+            return "_unspecified_agenda_item"
+        text = re.sub(r'\s+', '_', text)
+        text = re.sub(r'[^\w\s-]', '', text).strip()
+        return text if text else "_unspecified_agenda_item"
+    async def _process_single_document(item: DocInfo) -> Tuple[bool, bytes]:
         """Attempts to convert a document to text and returns success status and content."""
         try:
+            text_lines = await doc_to_txt(item.document, item.url, http_client)
             content_bytes = "\n".join(text_lines).encode("utf-8")
+            return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
         except Exception as e:
             logging.warning(
+                f"Failed to process document '{item.document}' from URL '{item.url}': {e}")
+            error_message = f"Document '{item.document}' text extraction failed: {e}".encode(
                 "utf-8")
+            return {"doc_id": item.document, "content": error_message, "failed": True, "agenda_item": item.agenda_item}
+    convert_tasks = await asyncio.gather(*[_process_single_document(doc) for doc in req.documents], return_exceptions=False)
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
         for task in convert_tasks:
+            # get agenda item directory
+            agenda_item_str = task.get("agenda_item") or ""
+            directory_name = __normalize_for_path(agenda_item_str)
             failed = "failed" in task
             doc_id = task["doc_id"]
+            base_filename = f"failed_{doc_id}.txt" if failed else f"{doc_id}.txt"
+            full_file_path = f"{directory_name}/{base_filename}"
+            zip_file.writestr(full_file_path, task["content"])
     zip_buffer.seek(0)

schemas.py CHANGED Viewed

@@ -33,6 +33,8 @@ class DocInfo(BaseModel):
     url: str
     # Document type
     type: str
 class DownloadDocsRequest(BaseModel):

     url: str
     # Document type
     type: str
+    # Agenda item this document belongs to.
+    agenda_item: str
 class DownloadDocsRequest(BaseModel):

static/js/app.js CHANGED Viewed

@@ -235,7 +235,7 @@ async function downloadTDocs() {
     try {
         // Extraire les données du tableau avec le format suivant pour la requete backend
         // { document: "nom_doc", url: "url_doc", type: "type_de_doc"}
-        const selectedData = extractTableData({ 'TDoc': 'document', 'URL': 'url', 'Type': "type" });
         if (selectedData.length === 0) {
             alert('Please select at least one document');
@@ -319,7 +319,7 @@ function downloadBlob(blob, filename) {
  * Extrait les requirements des documents sélectionnés
  */
 async function extractRequirements() {
-    const selectedData = extractTableData({ 'TDoc': 'document', 'URL': 'url', 'Type': 'type' });
     console.log("Selected docs data");
     console.log(selectedData);

     try {
         // Extraire les données du tableau avec le format suivant pour la requete backend
         // { document: "nom_doc", url: "url_doc", type: "type_de_doc"}
+        const selectedData = extractTableData({ 'TDoc': 'document', 'URL': 'url', 'Type': "type", "Agenda": "agenda_item" });
         if (selectedData.length === 0) {
             alert('Please select at least one document');
  * Extrait les requirements des documents sélectionnés
  */
 async function extractRequirements() {
+    const selectedData = extractTableData({ 'TDoc': 'document', 'URL': 'url', 'Type': 'type', "Agenda": "agenda_item" });
     console.log("Selected docs data");
     console.log(selectedData);