Yeetek committed on
Commit
efa5b1a
·
verified ·
1 Parent(s): 1b63618

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -56
app.py CHANGED
@@ -1,28 +1,25 @@
1
- # ---------- BEGIN app.py ----------
2
- import os
3
- # 1️⃣ Hard-disable numba’s on-disk cache OR redirect it:
4
- os.environ["NUMBA_DISABLE_CACHE"] = "1" # always works
5
- os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba") # keeps option to re-enable
6
- os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
7
 
8
- # (optional) quick diagnostics; delete once confirmed
9
- import sys, json, uuid, sys
10
- print("ENV snapshot (first 20):",
11
- json.dumps(dict(list(os.environ.items())[:20]))); sys.stdout.flush()
12
 
13
- # 2️⃣ *Now* it’s safe to import big libs
14
  from typing import List
15
  from fastapi import FastAPI, HTTPException
16
  from pydantic import BaseModel
17
  from bertopic import BERTopic
18
  from sentence_transformers import SentenceTransformer
19
 
20
- # ---- configuration via env vars ----
21
  MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
22
  MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))
23
  MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
24
 
25
- # ---- model initialisation (runs once at container start) ----
26
  embeddings = SentenceTransformer(MODEL_NAME)
27
  topic_model = BERTopic(
28
  embedding_model=embeddings,
@@ -30,7 +27,7 @@ topic_model = BERTopic(
30
  calculate_probabilities=True,
31
  )
32
 
33
- # -------- FastAPI schema ----------
34
  class Sentence(BaseModel):
35
  text: str
36
  start: float
@@ -39,45 +36,4 @@ class Sentence(BaseModel):
39
 
40
  class Segment(BaseModel):
41
  topic_id: int
42
- label: str | None
43
- keywords: List[str]
44
- start: float
45
- end: float
46
- probability: float | None
47
- sentences: List[int]
48
-
49
- class SegmentationResponse(BaseModel):
50
- run_id: str
51
- segments: List[Segment]
52
-
53
- app = FastAPI(title="CZ Topic Segmenter", version="1.0")
54
-
55
- @app.post("/segment", response_model=SegmentationResponse)
56
- def segment(sentences: List[Sentence]):
57
- if len(sentences) > MAX_DOCS:
58
- raise HTTPException(413, f"Too many sentences ({len(sentences)} > {MAX_DOCS})")
59
-
60
- docs = [s.text for s in sentences]
61
- topics, probs = topic_model.fit_transform(docs)
62
-
63
- segments, cur = [], None
64
- for idx, (t_id, prob) in enumerate(zip(topics, probs)):
65
- if cur is None or t_id != cur["topic_id"]:
66
- if cur:
67
- segments.append(cur)
68
- words = [w for w, _ in topic_model.get_topic(t_id)[:5]]
69
- cur = dict(topic_id=t_id,
70
- label=" ".join(words) if t_id != -1 else None,
71
- keywords=words,
72
- start=sentences[idx].start,
73
- end=sentences[idx].end,
74
- probability=float(prob or 0),
75
- sentences=[idx])
76
- else:
77
- cur["end"] = sentences[idx].end
78
- cur["sentences"].append(idx)
79
- if cur:
80
- segments.append(cur)
81
-
82
- return {"run_id": str(uuid.uuid4()), "segments": segments}
83
- # ---------- END app.py ----------
 
1
+ # ---------- BEGIN app.py (diagnostic build) ----------
2
+ import os, sys, json, uuid
3
+
4
+ # DEBUG: capture the first 20 env-vars Hugging Face passes in
5
+ print("ENV-snapshot:", json.dumps(dict(list(os.environ.items())[:20])))
6
+ sys.stdout.flush() # make sure it appears in HF build logs
7
 
8
+ # Optional: leave the numba lines in place for the real fix
9
+ os.environ["NUMBA_DISABLE_CACHE"] = "1"
10
+ os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba")
11
+ os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
12
 
 
13
  from typing import List
14
  from fastapi import FastAPI, HTTPException
15
  from pydantic import BaseModel
16
  from bertopic import BERTopic
17
  from sentence_transformers import SentenceTransformer
18
 
 
19
  MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
20
  MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))
21
  MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
22
 
 
23
  embeddings = SentenceTransformer(MODEL_NAME)
24
  topic_model = BERTopic(
25
  embedding_model=embeddings,
 
27
  calculate_probabilities=True,
28
  )
29
 
30
+ # ----- FastAPI schema & endpoint (unchanged) -----
31
  class Sentence(BaseModel):
32
  text: str
33
  start: float
 
36
 
37
  class Segment(BaseModel):
38
  topic_id: int
39
+ label: