# ---------- BEGIN app.py ----------
import os, sys, json, uuid, types

# ── 0. Quick env print – delete later if you like ───────────────────────
print("ENV-snapshot:", json.dumps(dict(list(os.environ.items())[:25])))
sys.stdout.flush()

# ── 1. Ensure a writable dir (good housekeeping) ────────────────────────
os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba_cache")
os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)

# ── 2. FINAL numba cache kill-switch  ────────────────────────────────────
try:
    from numba.core import dispatcher, caching
    import numba.np.ufunc.ufuncbuilder as ufuncbuilder

    # 2-a  UMAP path: no-op dispatcher method
    dispatcher.Dispatcher.enable_caching = lambda self: None

    # 2-b  Build a stub that pretends to be a FunctionCache
    class _NoCache(types.SimpleNamespace):
        def __init__(self, *_, **__): pass
        load_overload   = lambda *_, **__: False
        save_overload   = lambda *_, **__: None
        enable_caching  = lambda *_, **__: None

    # 2-c  Patch *every* place that still holds a reference
    caching.FunctionCache           = _NoCache        # core path
    ufuncbuilder.FunctionCache      = _NoCache        # PyNNDescent path

    # 2-d  Extra belt-and-braces flag
    os.environ["NUMBA_DISABLE_CACHE"] = "1"

except ImportError:
    # numba isn't installed yet during first pip install – harmless
    pass
# ─────────────────────────────────────────────────────────────────────────


# ── 3. Heavy imports (UMAP, BERTopic, FastAPI, …) ───────────────────────
from typing import List
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer


# ── 4. Configuration via env vars ─────────────────────────────────────────────
MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
MIN_TOPIC  = int(os.getenv("MIN_TOPIC_SIZE", "10"))
MAX_DOCS   = int(os.getenv("MAX_DOCS", "5000"))
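
# Example override at container start (hypothetical values):
#   docker run -e EMBED_MODEL=paraphrase-multilingual-MiniLM-L12-v2 \
#              -e MIN_TOPIC_SIZE=20 -e MAX_DOCS=2000 <image>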

# ── 5. Initialise models once at container start ─────────────────────────────
embeddings = SentenceTransformer(MODEL_NAME, cache_folder="/tmp/hfcache")
topic_model = BERTopic(
    embedding_model=embeddings,
    min_topic_size=MIN_TOPIC,
    calculate_probabilities=True,
)
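
# Optional warm-up – a sketch: embedding one dummy sentence at start-up loads
# the model weights into memory so the first /segment request doesn't pay that cost.
_ = embeddings.encode(["warm-up"])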

# ── 6. Pydantic schemas ──────────────────────────────────────────────────────
class Sentence(BaseModel):
    text: str
    start: float
    end: float
    speaker: str | None = None

class Segment(BaseModel):
    topic_id: int
    label: str | None
    keywords: List[str]
    start: float
    end: float
    probability: float | None
    sentences: List[int]

class SegmentationResponse(BaseModel):
    run_id: str
    segments: List[Segment]
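
# Example request body for POST /segment (hypothetical values):
# [
#   {"text": "Dobrý den, vítám vás.", "start": 0.0, "end": 2.1, "speaker": "A"},
#   {"text": "Dnes probereme rozpočet.", "start": 2.1, "end": 4.8, "speaker": "A"}
# ]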

# ── 7. FastAPI app and endpoint ──────────────────────────────────────────────
app = FastAPI(title="CZ Topic Segmenter", version="1.0")

@app.post("/segment", response_model=SegmentationResponse)
def segment(sentences: List[Sentence]):
    # Guardrail: avoid oversize requests
    if len(sentences) > MAX_DOCS:
        raise HTTPException(
            status_code=413,
            detail=f"Too many sentences ({len(sentences)} > {MAX_DOCS})"
        )

    docs = [s.text for s in sentences]
    topics, probs = topic_model.fit_transform(docs)

    segments, cur = [], None
    for idx, (t_id, prob) in enumerate(zip(topics, probs)):
        if cur is None or t_id != cur["topic_id"]:
            if cur:
                segments.append(cur)

            # Top-5 keywords for this topic (get_topic returns (word, score) pairs)
            words = [w for w, _ in topic_model.get_topic(t_id)[:5]]

            cur = dict(
                topic_id=int(t_id),  # numpy int -> plain int for JSON
                label=" ".join(words) if t_id != -1 else None,  # -1 = outlier topic
                keywords=words,
                start=sentences[idx].start,
                end=sentences[idx].end,
                # With calculate_probabilities=True each row of `probs` is a full
                # topic distribution, so index it by the assigned topic id
                probability=float(prob[t_id]) if t_id != -1 else None,
                sentences=[idx],
            )
        else:
            cur["end"] = sentences[idx].end
            cur["sentences"].append(idx)

    if cur:
        segments.append(cur)

    return {"run_id": str(uuid.uuid4()), "segments": segments}
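
# Local smoke test – a sketch assuming uvicorn is installed in the image
# (it is not imported above, so treat this as optional):
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)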
# ---------- END app.py ----------