Spaces:

OrganizedProgrammers
/

Docxtract

Running

App Files Files Community

Lucas ARRIESSE committed 4 days ago

Commit

dc794b1

1 Parent(s): 4633840

Hotfix: change max DL limit 120 -> 60 per minute and limit parallel download workers to 4

Browse files

Files changed (1) hide show

api/docs.py +21 -18

api/docs.py CHANGED Viewed

@@ -106,35 +106,38 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
 # Rate limit of FTP downloads per minute
-FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=120, time_period=60)
 async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
     """Récupère le docx depuis l'URL et le retourne un tuple (nom, extension, contenu)"""
     async with FTP_DOWNLOAD_RATE_LIMITER:
-        if not url.endswith("zip"):
-            raise ValueError("URL doit pointer vers un fichier ZIP")
-        doc_id = os.path.splitext(os.path.basename(url))[0]
-        resp = await client.get(url, headers={
-            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-        })
-        resp.raise_for_status()
-        with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
-            # there should be a single file per file
-            for entry in zf.infolist():
-                if entry.is_dir():
-                    continue
-                file_name = entry.filename
-                root, ext = os.path.splitext(file_name)
-                doc_bytes = zf.read(file_name)
-                return (root, ext.lower(), io.BytesIO(doc_bytes))
-        raise ValueError("Aucun fichier trouvé dans l'archive")
 def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:

 # Rate limit of FTP downloads per minute
+FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=60, time_period=60)
+# Max number of parallel workers downloading
+FTP_MAX_PARALLEL_WORKERS = asyncio.Semaphore(4)
 async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
     """Récupère le docx depuis l'URL et le retourne un tuple (nom, extension, contenu)"""
     async with FTP_DOWNLOAD_RATE_LIMITER:
+        async with FTP_MAX_PARALLEL_WORKERS:
+            if not url.endswith("zip"):
+                raise ValueError("URL doit pointer vers un fichier ZIP")
+            doc_id = os.path.splitext(os.path.basename(url))[0]
+            resp = await client.get(url, headers={
+                "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            })
+            resp.raise_for_status()
+            with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
+                # there should be a single file per file
+                for entry in zf.infolist():
+                    if entry.is_dir():
+                        continue
+                    file_name = entry.filename
+                    root, ext = os.path.splitext(file_name)
+                    doc_bytes = zf.read(file_name)
+                    return (root, ext.lower(), io.BytesIO(doc_bytes))
+            raise ValueError("Aucun fichier trouvé dans l'archive")
 def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO: