Lucas ARRIESSE committed on
Commit
1b57e39
·
1 Parent(s): 4e54efb

Fix TDocs downloading not working

Browse files
Files changed (2) hide show
  1. app.py +22 -18
  2. static/script.js +26 -16
app.py CHANGED
@@ -1,20 +1,24 @@
1
- from typing import Literal
2
- from bs4 import BeautifulSoup
 
 
3
  import warnings
4
  import io
 
5
  import zipfile
6
- from lxml import etree
7
  import os
8
- from dotenv import load_dotenv
9
  import requests
10
  import subprocess
11
- import string
 
 
 
 
12
  from nltk.tokenize import word_tokenize
 
13
  from nltk.corpus import stopwords
14
  from nltk.stem import WordNetLemmatizer
15
- from concurrent.futures import ThreadPoolExecutor, as_completed
16
- import json
17
- import traceback
18
  from fastapi import FastAPI, BackgroundTasks, HTTPException, Request
19
  from fastapi.staticfiles import StaticFiles
20
  from schemas import *
@@ -22,11 +26,6 @@ from fastapi.middleware.cors import CORSMiddleware
22
  from fastapi.responses import FileResponse, StreamingResponse
23
  from litellm.router import Router
24
  from aiolimiter import AsyncLimiter
25
- import pandas as pd
26
- import asyncio
27
- import logging
28
- import re
29
- import nltk
30
 
31
  load_dotenv()
32
 
@@ -36,6 +35,7 @@ logging.basicConfig(
36
  datefmt='%Y-%m-%d %H:%M:%S'
37
  )
38
 
 
39
  nltk.download('stopwords')
40
  nltk.download('punkt_tab')
41
  nltk.download('wordnet')
@@ -227,11 +227,14 @@ def get_meetings(req: MeetingsRequest):
227
  working_group = req.working_group
228
  tsg = re.sub(r"\d+", "", working_group)
229
  wg_number = re.search(r"\d", working_group).group(0)
 
230
  logging.debug(tsg, wg_number)
231
  url = "https://www.3gpp.org/ftp/tsg_" + tsg
232
  logging.debug(url)
 
233
  resp = requests.get(url, verify=False)
234
  soup = BeautifulSoup(resp.text, "html.parser")
 
235
  meeting_folders = []
236
  all_meetings = []
237
  wg_folders = [item.get_text() for item in soup.select("tr td a")]
@@ -309,15 +312,18 @@ def download_tdocs(req: DownloadRequest):
309
  data=json.dumps({"doc_id": doc_id}),
310
  verify=False
311
  )
312
- print(url.status_code)
 
313
  url = url.json()['url']
314
- print(url)
 
315
  try:
316
  txt = "\n".join(docx_to_txt(doc_id, url))
317
  except Exception as e:
318
  txt = f"Document {doc_id} text extraction failed: {e}"
319
  return doc_id, txt.encode("utf-8")
320
 
 
321
  def process_batch(batch):
322
  results = {}
323
  for doc in batch:
@@ -420,8 +426,6 @@ async def gen_reqs(req: RequirementsRequest, background_tasks: BackgroundTasks):
420
 
421
  # ======================================================================================================================================================================================
422
 
423
- SUBPROCESS_SEMAPHORE = asyncio.Semaphore(32)
424
-
425
 
426
  class ProgressUpdate(BaseModel):
427
  """Defines the structure of a single SSE message."""
@@ -431,7 +435,7 @@ class ProgressUpdate(BaseModel):
431
  processed_docs: int
432
 
433
 
434
- @app.post("/generate_requirements/v2")
435
  async def gen_reqs(req: RequirementsRequest, con: Request):
436
  """Extract requirements from the specified TDocs using a LLM and returns SSE events about the progress of ongoing operations"""
437
 
 
1
+ import asyncio
2
+ import logging
3
+ import nltk
4
+ import string
5
  import warnings
6
  import io
7
+ import traceback
8
  import zipfile
9
+ import json
10
  import os
 
11
  import requests
12
  import subprocess
13
+ import pandas as pd
14
+ import re
15
+ from lxml import etree
16
+ from typing import Literal
17
+ from dotenv import load_dotenv
18
  from nltk.tokenize import word_tokenize
19
+ from bs4 import BeautifulSoup
20
  from nltk.corpus import stopwords
21
  from nltk.stem import WordNetLemmatizer
 
 
 
22
  from fastapi import FastAPI, BackgroundTasks, HTTPException, Request
23
  from fastapi.staticfiles import StaticFiles
24
  from schemas import *
 
26
  from fastapi.responses import FileResponse, StreamingResponse
27
  from litellm.router import Router
28
  from aiolimiter import AsyncLimiter
 
 
 
 
 
29
 
30
  load_dotenv()
31
 
 
35
  datefmt='%Y-%m-%d %H:%M:%S'
36
  )
37
 
38
+ # Download required packages for NLTK
39
  nltk.download('stopwords')
40
  nltk.download('punkt_tab')
41
  nltk.download('wordnet')
 
227
  working_group = req.working_group
228
  tsg = re.sub(r"\d+", "", working_group)
229
  wg_number = re.search(r"\d", working_group).group(0)
230
+
231
  logging.debug(tsg, wg_number)
232
  url = "https://www.3gpp.org/ftp/tsg_" + tsg
233
  logging.debug(url)
234
+
235
  resp = requests.get(url, verify=False)
236
  soup = BeautifulSoup(resp.text, "html.parser")
237
+
238
  meeting_folders = []
239
  all_meetings = []
240
  wg_folders = [item.get_text() for item in soup.select("tr td a")]
 
312
  data=json.dumps({"doc_id": doc_id}),
313
  verify=False
314
  )
315
+ logging.info(
316
+ f"Retrieving URL for doc {doc_id} returned http status {url.status_code}")
317
  url = url.json()['url']
318
+ logging.debug(f"Doc URL for {doc_id} is {url}")
319
+
320
  try:
321
  txt = "\n".join(docx_to_txt(doc_id, url))
322
  except Exception as e:
323
  txt = f"Document {doc_id} text extraction failed: {e}"
324
  return doc_id, txt.encode("utf-8")
325
 
326
+ # PERF: use asyncio?
327
  def process_batch(batch):
328
  results = {}
329
  for doc in batch:
 
426
 
427
  # ======================================================================================================================================================================================
428
 
 
 
429
 
430
  class ProgressUpdate(BaseModel):
431
  """Defines the structure of a single SSE message."""
 
435
  processed_docs: int
436
 
437
 
438
+ @app.post("/generate_requirements/sse")
439
  async def gen_reqs(req: RequirementsRequest, con: Request):
440
  """Extract requirements from the specified TDocs using a LLM and returns SSE events about the progress of ongoing operations"""
441
 
static/script.js CHANGED
@@ -520,7 +520,7 @@ async function downloadTDocs() {
520
  const blob = await response.blob();
521
  downloadBlob(blob, generateDownloadFilename());
522
  } catch (error) {
523
- console.error('Erreur lors du téléchargement:', error);
524
  alert('Erreur lors du téléchargement des TDocs');
525
  } finally {
526
  hideLoadingOverlay();
@@ -535,19 +535,29 @@ async function downloadTDocs() {
535
  function generateDownloadFilename() {
536
  let filename = document.getElementById('meeting-select').value || 'documents';
537
 
538
- const agendaItem = document.getElementById('agenda-item-filter').value;
539
- const docStatus = document.getElementById('doc-status-filter').value;
540
- const docType = document.getElementById('doc-type-filter').value;
541
 
542
- if (agendaItem && agendaItem !== 'Tous') {
543
- filename += `_${agendaItem}`;
 
 
 
544
  }
545
- if (docStatus && docStatus !== 'Tous') {
546
- filename += `_${docStatus}`;
 
 
 
 
547
  }
548
- if (docType && docType !== 'Tous') {
 
 
549
  filename = `${docType}_${filename}`;
550
  }
 
551
  if (hasRequirementsExtracted) {
552
  filename = `requirements_${filename}`;
553
  }
@@ -585,7 +595,7 @@ async function extractRequirements() {
585
  toggleElementsEnabled(['extract-requirements-btn'], false);
586
 
587
  try {
588
- const response = await postWithSSE('/generate_requirements/v2', { documents: selectedData }, {
589
  onMessage: (msg) => {
590
  console.log("SSE message:");
591
  console.log(msg);
@@ -599,11 +609,11 @@ async function extractRequirements() {
599
  });
600
 
601
 
602
- // const response = await fetch('/generate_requirements/', {
603
- // method: 'POST',
604
- // headers: { 'Content-Type': 'application/json' },
605
- // body: req
606
- // });
607
 
608
  const data = response.data; // data in the SSE message contains the requirements response
609
  requirements = data.requirements;
@@ -619,7 +629,7 @@ async function extractRequirements() {
619
  req_id++;
620
  })
621
  })
622
-
623
  displayRequirements(requirements);
624
 
625
  toggleContainersVisibility(['requirements-container', 'query-requirements-container'], true);
 
520
  const blob = await response.blob();
521
  downloadBlob(blob, generateDownloadFilename());
522
  } catch (error) {
523
+ console.error(error);
524
  alert('Erreur lors du téléchargement des TDocs');
525
  } finally {
526
  hideLoadingOverlay();
 
535
  function generateDownloadFilename() {
536
  let filename = document.getElementById('meeting-select').value || 'documents';
537
 
538
+ const agendaItems = selectedAgenda;
539
+ const docStatuses = selectedStatus
540
+ const docType = selectedType;
541
 
542
+ // empty set means "Tous" is selected
543
+ if (agendaItems) {
544
+ for (aItem of agendaItems) {
545
+ filename += `_${aItem}`;
546
+ }
547
  }
548
+
549
+ // empty set means "Tous" is selected
550
+ if (docStatuses) {
551
+ for (docStatus of docStatuses) {
552
+ filename += `_${docStatus}`;
553
+ }
554
  }
555
+
556
+ // empty means "Tous"
557
+ if (docType && docType !== "") {
558
  filename = `${docType}_${filename}`;
559
  }
560
+
561
  if (hasRequirementsExtracted) {
562
  filename = `requirements_${filename}`;
563
  }
 
595
  toggleElementsEnabled(['extract-requirements-btn'], false);
596
 
597
  try {
598
+ const response = await postWithSSE('/generate_requirements/sse', { documents: selectedData }, {
599
  onMessage: (msg) => {
600
  console.log("SSE message:");
601
  console.log(msg);
 
609
  });
610
 
611
 
612
+ // const response = await fetch('/generate_requirements/', {
613
+ // method: 'POST',
614
+ // headers: { 'Content-Type': 'application/json' },
615
+ // body: req
616
+ // });
617
 
618
  const data = response.data; // data in the SSE message contains the requirements response
619
  requirements = data.requirements;
 
629
  req_id++;
630
  })
631
  })
632
+
633
  displayRequirements(requirements);
634
 
635
  toggleContainersVisibility(['requirements-container', 'query-requirements-container'], true);