Spaces:

OrganizedProgrammers
/

Docxtract

Running

App Files Files Community

Lucas ARRIESSE committed about 8 hours ago

Commit

f282fc7

1 Parent(s): 0925e1c

Add ability to do free form conversion of user files

Browse files

Files changed (3) hide show

api/docs.py +95 -46
static/index.html +29 -4
static/js/app.js +34 -1

api/docs.py CHANGED Viewed

@@ -16,7 +16,7 @@ import re
 import tempfile
 from lxml import etree
 from bs4 import BeautifulSoup
-from fastapi import Depends, HTTPException
 from dependencies import get_http_client, get_llm_router
 from fastapi.responses import StreamingResponse
 from litellm.router import Router
@@ -105,6 +105,40 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
         return out_bytes
 # Rate limit of FTP downloads per minute
 FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=60, time_period=60)
 # Max number of parallel workers downloading
@@ -208,49 +242,9 @@ FORMAT_MIME_TYPES = {
     ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 }
-async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
-    """
-    Download the specified TDoc and convert it to text.
-
-    NOTE(review): the return annotation says ``str`` but the value returned is
-    a list: a newline-only first entry followed by the stripped, non-blank
-    lines of the extracted text.
-
-    Raises Exception when the extension is none of .doc / .docx / .ppt and is
-    not a key of FORMAT_MIME_TYPES.
-    """
-    # Grab the document archive
-    filename, ext, bytes = await get_doc_archive(url, client)
-    final_text: str = None
-    if ext == ".doc":
-        # .doc is first converted to .docx, then extracted as .docx
-        logging.debug(f"Converting {filename} .doc --> .docx")
-        docx_bytes = await convert_file(bytes, doc_id, "doc", "docx")
-        extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
-        final_text = extracted_data.content
-    elif ext == ".docx":
-        # Applying doc revisions to docx files (especially for pCR / draftCR files)
-        logging.debug(f"Updating .docx revisions for {doc_id}.")
-        applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
-        extracted_data = await extract_bytes(applied_revision.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
-        final_text = extracted_data.content
-    elif ext == ".ppt":
-        # .ppt is first converted to .pptx; despite its name, docx_bytes holds the .pptx output here
-        logging.debug(f"Converting {filename} .ppt --> .pptx")
-        docx_bytes = await convert_file(bytes, doc_id, "ppt", "pptx")
-        extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
-        final_text = extracted_data.content
-    else:
-        # Any other extension is extracted directly, provided a MIME type is registered for it
-        if ext in FORMAT_MIME_TYPES:  # file extension is supported
-            extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
-            final_text = extracted_data.content
-        else:
-            raise Exception(
-                f"Unsupported file type: {ext}, filename: {filename}")
-    # include an empty line in the beginning
-    # blank lines are dropped and the remaining lines are stripped of surrounding whitespace
-    txt_data = ["\n"] + [line.strip()
-                         for line in final_text.splitlines() if line.strip()]
-    return txt_data
 # ============================================= Doc routes =========================================================
 @router.post("/get_meetings", response_model=GetMeetingsResponse)
 async def get_meetings(req: GetMeetingsRequest, http_client: AsyncClient = Depends(get_http_client)):
     """
@@ -358,10 +352,11 @@ async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Dep
         text = re.sub(r'[^\w\s-]', '', text).strip()
         return text if text else "_unspecified_agenda_item"
-    async def _process_single_document(item: DocInfo) -> Tuple[bool, bytes]:
         """Attempts to convert a document to text and returns success status and content."""
         try:
-            text_lines = await doc_to_txt(item.document, item.url, http_client)
             content_bytes = "\n".join(text_lines).encode("utf-8")
             return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
         except Exception as e:
@@ -399,6 +394,59 @@ async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Dep
 # ======================================================================================================================================================================================
 class ProgressUpdate(BaseModel):
     """Defines the structure of a single SSE message."""
     status: Literal["progress", "complete"]
@@ -429,8 +477,9 @@ async def extract_requirements_from_docs(req: ExtractRequirementsRequest, llm_ro
         # convert the docx to txt for use
         try:
-            doc = await doc_to_txt(doc_id, url, http_client)
-            full = "\n".join(doc)
         except Exception as e:
             fmt = "".join(traceback.format_exception(e))
             logging.error(f"Failed to process doc {doc_id} : {fmt}")

 import tempfile
 from lxml import etree
 from bs4 import BeautifulSoup
+from fastapi import Depends, File, HTTPException, UploadFile
 from dependencies import get_http_client, get_llm_router
 from fastapi.responses import StreamingResponse
 from litellm.router import Router
         return out_bytes
+async def convert_to_txt(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
+    """Convert given file represented as a (filename, ext, bytes) to a list of lines"""
+    final_text: str = None
+    if ext == ".doc":
+        logging.debug(f"Converting {filename} .doc --> .docx")
+        docx_bytes = await convert_file(bytes, filename, "doc", "docx")
+        extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
+        final_text = extracted_data.content
+    elif ext == ".docx":
+        # Applying doc revisions to docx files (especially for pCR / draftCR files)
+        logging.debug(f"Updating .docx revisions for {filename}.")
+        applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
+        extracted_data = await extract_bytes(applied_revision.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
+        final_text = extracted_data.content
+    elif ext == ".ppt":
+        logging.debug(f"Converting {filename} .ppt --> .pptx")
+        docx_bytes = await convert_file(bytes, filename, "ppt", "pptx")
+        extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
+        final_text = extracted_data.content
+    else:
+        if ext in FORMAT_MIME_TYPES:  # file extension is supported
+            extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
+            final_text = extracted_data.content
+        else:
+            raise Exception(
+                f"Unsupported file type: {ext}, filename: {filename}")
+    # include an empty line in the beginning
+    txt_data = [""] + [line.strip()
+                       for line in final_text.splitlines() if line.strip()]
+    return txt_data
 # Rate limit of FTP downloads per minute
 FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=60, time_period=60)
 # Max number of parallel workers downloading
     ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 }
 # ============================================= Doc routes =========================================================
 @router.post("/get_meetings", response_model=GetMeetingsResponse)
 async def get_meetings(req: GetMeetingsRequest, http_client: AsyncClient = Depends(get_http_client)):
     """
         text = re.sub(r'[^\w\s-]', '', text).strip()
         return text if text else "_unspecified_agenda_item"
+    async def _process_single_document(item: DocInfo):
         """Attempts to convert a document to text and returns success status and content."""
         try:
+            filename, ext, bytes = await get_doc_archive(item.url, http_client)
+            text_lines = await convert_to_txt(filename, ext, bytes)
             content_bytes = "\n".join(text_lines).encode("utf-8")
             return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
         except Exception as e:
 # ======================================================================================================================================================================================
+@router.post("/download_user_docs")
+async def download_user_docs(files: list[UploadFile] = File(...)):
+    """Freeform convert the user files into text and downloads them as a single zip file."""
+    file_infos = []
+    # retrieve all files
+    for file in files:
+        filename, ext = os.path.splitext(file.filename)
+        file_infos.append({
+            "filename": filename,
+            "extension": ext,
+            "content": io.BytesIO(await file.read())
+        })
+    filenames = [file["filename"] for file in file_infos]
+    logging.info(f"Got {len(file_infos)} user files to convert.")
+    logging.info(f"Filenames: {filenames}")
+    # convert files to text
+    async def _process_single_document(item: dict):
+        try:
+            text_lines = await convert_to_txt(item["filename"], item["extension"], item["content"])
+            content_bytes = "\n".join(text_lines).encode("utf-8")
+            return {"doc_id": item["filename"], "content": content_bytes}
+        except Exception as e:
+            doc = item["filename"]
+            logging.warning(
+                f"Failed to process document '{doc}': {e}")
+            error_message = f"Document '{doc}' text extraction failed: {e}".encode(
+                "utf-8")
+            return {"doc_id": doc, "content": error_message, "failed": True}
+    convert_tasks = await asyncio.gather(*[_process_single_document(file) for file in file_infos], return_exceptions=False)
+    zip_buffer = io.BytesIO()
+    with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
+        for task in convert_tasks:
+            failed = "failed" in task
+            doc_id = task["doc_id"]
+            base_filename = f"failed_{doc_id}.txt" if failed else f"{doc_id}.txt"
+            zip_file.writestr(base_filename, task["content"])
+    zip_buffer.seek(0)
+    return StreamingResponse(
+        zip_buffer,
+        media_type="application/zip",
+        headers={"Content-Disposition": "attachment; filename=user_files.zip"}
+    )
+# ======================================================================================================================================================================================
 class ProgressUpdate(BaseModel):
     """Defines the structure of a single SSE message."""
     status: Literal["progress", "complete"]
         # convert the docx to txt for use
         try:
+            filename, ext, bytes = await get_doc_archive(url, http_client)
+            txt_data = await convert_to_txt(filename, ext, bytes)
+            full = "\n".join(txt_data)
         except Exception as e:
             fmt = "".join(traceback.format_exception(e))
             logging.error(f"Failed to process doc {doc_id} : {fmt}")

static/index.html CHANGED Viewed

@@ -13,7 +13,7 @@
 <body class="bg-gray-100 min-h-screen">
     <!-- Loading Overlay -->
-    <div id="loading-overlay" class="fixed inset-0 bg-black/50 flex items-center justify-center z-50 hidden">
         <div class="bg-white p-6 rounded-lg shadow-lg text-center">
             <span class="loading loading-spinner loading-xl"></span>
             <p id="progress-text" class="text-gray-700">Chargement en cours...</p>
@@ -156,17 +156,21 @@
             <!-- Data Table Informations -->
             <div class="flex justify-between items-center mb-2 pt-5" id="data-table-info-container">
                 <div class="flex gap-2 items-center">
                     <div class="tooltip" data-tip="Extract requirements from selected pCR / CR documents">
                         <button id="extract-requirements-btn"
                             class="btn bg-orange-300 text-white text-sm rounded px-3 py-1 shadow hover:bg-orange-600">💉
                             Extract Requirements from CRs
                         </button>
                     </div>
                     <div class="tooltip" data-tip="Download all selected docs as text files">
                         <div class="dropdown">
-                            <div tabindex="0" role="button" class="btn text-sm rounded px-3 py-1 shadow cursor-pointer">
                                 📦 Download </div>
-                            <div tabindex="0" class="dropdown-content card card-sm bg-base-100 z-1 w-64 shadow-md">
                                 <div class="card-body space-y-2">
                                     <label class="label">
                                         <input class="checkbox checkbox-primary" name="download-sorted-files"
@@ -174,13 +178,34 @@
                                         <p class="text-m">Sort files by agenda</p>
                                     </label>
                                     <button id="download-tdocs-btn"
-                                        class="text-sm rounded px-3 py-1 shadow cursor-pointer">
                                         📦 Download docs
                                     </button>
                                 </div>
                             </div>
                         </div>
                     </div>
                 </div>
                 <!-- document counts -->

 <body class="bg-gray-100 min-h-screen">
     <!-- Loading Overlay -->
+    <div id="loading-overlay" class="fixed inset-0 bg-black/50 flex items-center justify-center z-500 hidden">
         <div class="bg-white p-6 rounded-lg shadow-lg text-center">
             <span class="loading loading-spinner loading-xl"></span>
             <p id="progress-text" class="text-gray-700">Chargement en cours...</p>
             <!-- Data Table Informations -->
             <div class="flex justify-between items-center mb-2 pt-5" id="data-table-info-container">
                 <div class="flex gap-2 items-center">
+                    <!--Extract xCR button-->
                     <div class="tooltip" data-tip="Extract requirements from selected pCR / CR documents">
                         <button id="extract-requirements-btn"
                             class="btn bg-orange-300 text-white text-sm rounded px-3 py-1 shadow hover:bg-orange-600">💉
                             Extract Requirements from CRs
                         </button>
                     </div>
+                    <!--Download button-->
                     <div class="tooltip" data-tip="Download all selected docs as text files">
                         <div class="dropdown">
+                            <div tabindex="0" role="button"
+                                class="btn btn-primary text-sm rounded px-3 py-1 shadow cursor-pointer">
                                 📦 Download </div>
+                            <div class="dropdown-content card card-sm bg-base-100 z-1 w-64 shadow-md">
                                 <div class="card-body space-y-2">
                                     <label class="label">
                                         <input class="checkbox checkbox-primary" name="download-sorted-files"
                                         <p class="text-m">Sort files by agenda</p>
                                     </label>
                                     <button id="download-tdocs-btn"
+                                        class="btn text-sm rounded px-3 py-1 shadow cursor-pointer">
                                         📦 Download docs
                                     </button>
                                 </div>
                             </div>
                         </div>
                     </div>
+                    <!--Free form convert zone-->
+                    <div class="tooltip" data-tip="Convert user uploaded files to text">
+                        <div class="dropdown">
+                            <div tabindex="0" role="button"
+                                class="btn btn-secondary text-sm rounded px-3 py-1 shadow cursor-pointer">
+                                ⚗ Freeform convert </div>
+                            <div class="dropdown-content card card-sm bg-base-100 z-1 w-128 shadow-md">
+                                <div class="card-body space-y-2">
+                                    <fieldset class="fieldset">
+                                        <legend class="fieldset-legend">Select files to convert to text</legend>
+                                        <input id="freeform-convert-files" type="file" class="file-input" multiple />
+                                    </fieldset>
+                                    <button id="freeform-convert-btn"
+                                        class="btn text-sm rounded px-3 py-1 shadow cursor-pointer">
+                                        📦 Download converted docs
+                                    </button>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
                 </div>
                 <!-- document counts -->

static/js/app.js CHANGED Viewed

@@ -265,6 +265,37 @@ async function downloadTDocs() {
     }
 }
 /**
  * Génère un nom de fichier pour le téléchargement
  * @returns {string} Nom du fichier
@@ -1104,4 +1135,6 @@ document.getElementById('read-assessment-button').addEventListener('click', _ =>
 // Events des boutons pour le drafting de solutions
 document.getElementById('refine-btn').addEventListener('click', handleDraftRefine);
 document.getElementById('fto-analysis-btn').addEventListener('click', handleFTOAnalysis);
-document.getElementById('export-timeline-btn').addEventListener('click', handleExportDrafts)

     }
 }
+/**
+ *
+ */
+async function downloadFreeformDocs() {
+    const files = document.getElementById('freeform-convert-files').files;
+    if (!files.length) {
+        alert("Please select at least one file to convert.");
+        return;
+    }
+    const formData = new FormData();
+    for (let i = 0; i < files.length; i++)
+        formData.append("files", files[i]);
+    try {
+        showLoadingOverlay("Converting user docs to text files");
+        const response = await fetch("/docs/download_user_docs", {
+            method: "POST",
+            body: formData
+        });
+        const blob = await response.blob();
+        downloadBlob(blob, "user_files");
+    }
+    catch (err) {
+    } finally {
+        hideLoadingOverlay();
+    }
+}
 /**
  * Génère un nom de fichier pour le téléchargement
  * @returns {string} Nom du fichier
 // Events des boutons pour le drafting de solutions
 document.getElementById('refine-btn').addEventListener('click', handleDraftRefine);
 document.getElementById('fto-analysis-btn').addEventListener('click', handleFTOAnalysis);
+document.getElementById('export-timeline-btn').addEventListener('click', handleExportDrafts);
+document.getElementById('freeform-convert-btn').addEventListener('click', downloadFreeformDocs);