Lucas ARRIESSE
committed on
Commit
·
dc794b1
1
Parent(s):
4633840
Hotfix : change max DL limit 120 -> 30 and limi
Browse files- api/docs.py +21 -18
api/docs.py
CHANGED
@@ -106,35 +106,38 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
|
|
106 |
|
107 |
|
108 |
# Rate limit of FTP downloads per minute
|
109 |
-
FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=
|
|
|
|
|
110 |
|
111 |
|
112 |
async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
|
113 |
"""Récupère le docx depuis l'URL et le retourne un tuple (nom, extension, contenu)"""
|
114 |
|
115 |
async with FTP_DOWNLOAD_RATE_LIMITER:
|
116 |
-
|
117 |
-
|
|
|
118 |
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
|
124 |
-
|
125 |
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
|
137 |
-
|
138 |
|
139 |
|
140 |
def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
|
|
|
106 |
|
107 |
|
108 |
# Rate limit of FTP downloads per minute
|
109 |
+
FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=60, time_period=60)
|
110 |
+
# Max number of parallel workers downloading
|
111 |
+
FTP_MAX_PARALLEL_WORKERS = asyncio.Semaphore(4)
|
112 |
|
113 |
|
114 |
async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
|
115 |
"""Récupère le docx depuis l'URL et le retourne un tuple (nom, extension, contenu)"""
|
116 |
|
117 |
async with FTP_DOWNLOAD_RATE_LIMITER:
|
118 |
+
async with FTP_MAX_PARALLEL_WORKERS:
|
119 |
+
if not url.endswith("zip"):
|
120 |
+
raise ValueError("URL doit pointer vers un fichier ZIP")
|
121 |
|
122 |
+
doc_id = os.path.splitext(os.path.basename(url))[0]
|
123 |
+
resp = await client.get(url, headers={
|
124 |
+
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
125 |
+
})
|
126 |
|
127 |
+
resp.raise_for_status()
|
128 |
|
129 |
+
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
|
130 |
+
# there should be a single file per archive
|
131 |
+
for entry in zf.infolist():
|
132 |
+
if entry.is_dir():
|
133 |
+
continue
|
134 |
|
135 |
+
file_name = entry.filename
|
136 |
+
root, ext = os.path.splitext(file_name)
|
137 |
+
doc_bytes = zf.read(file_name)
|
138 |
+
return (root, ext.lower(), io.BytesIO(doc_bytes))
|
139 |
|
140 |
+
raise ValueError("Aucun fichier trouvé dans l'archive")
|
141 |
|
142 |
|
143 |
def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
|