Spaces:
Runtime error
Runtime error
# ---------- BEGIN app.py ---------- | |
import os, sys, json, uuid, types | |
# ── 0. Quick env print — delete later if you like ──────────────────────────
# Substrings that mark an environment variable as likely holding a credential.
# Matching is deliberately broad: masking a harmless value costs nothing,
# while printing a real token into the logs does.
_SECRET_MARKERS = (
    "TOKEN",
    "SECRET",
    "KEY",
    "PASSWORD",
    "PASSWD",
    "CREDENTIAL",
    "AUTH",
)


def _env_snapshot(environ, limit=25):
    """Return the first *limit* entries of *environ* with secrets masked.

    Args:
        environ: Mapping of variable names to string values.
        limit: Maximum number of entries to include (in mapping order).

    Returns:
        A new dict; the value of any variable whose upper-cased name
        contains one of ``_SECRET_MARKERS`` is replaced by ``"***"``.
    """
    snapshot = {}
    for name, value in list(environ.items())[:limit]:
        looks_secret = any(marker in name.upper() for marker in _SECRET_MARKERS)
        snapshot[name] = "***" if looks_secret else value
    return snapshot


# Startup diagnostic. Values of secret-looking variables are masked so that
# credentials do not end up in the process output.
print("ENV-snapshot:", json.dumps(_env_snapshot(os.environ)))
sys.stdout.flush()

# ── 1. Ensure a writable dir (good housekeeping) ───────────────────────────
# Respect a NUMBA_CACHE_DIR that is already set; otherwise point it at /tmp.
os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba_cache")
os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
# ── 2. FINAL numba cache kill-switch ───────────────────────────────────────
class _NoCache(types.SimpleNamespace):
    """Stand-in for numba's ``FunctionCache`` that never touches the disk.

    Every operation is a no-op. ``load_overload`` returns ``None`` because
    numba's callers test the result with ``is not None``; any other value
    (including ``False``) would be taken for a cached compile result.
    """

    # NOTE(review): mirrors the attribute numba's own null cache exposes and
    # that dispatcher statistics read; confirm against the installed version.
    cache_path = None

    def __init__(self, *_args, **_kwargs):
        # Accept and ignore whatever arguments the real cache would receive.
        pass

    def load_overload(self, *_args, **_kwargs):
        """Report a cache miss."""
        return None

    def save_overload(self, *_args, **_kwargs):
        """Discard the compiled overload instead of persisting it."""
        return None

    def enable_caching(self, *_args, **_kwargs):
        """Ignore requests to turn caching on."""
        return None

    # No-op counterparts of the enable/disable/flush trio.
    # NOTE(review): assumed to match numba's cache interface; confirm.
    def enable(self, *_args, **_kwargs):
        return None

    def disable(self, *_args, **_kwargs):
        return None

    def flush(self, *_args, **_kwargs):
        return None


try:
    import numba
    from numba.core import dispatcher, caching
    import numba.np.ufunc.ufuncbuilder as ufuncbuilder
except ImportError:
    # numba isn't installed yet during first pip install — harmless
    pass
else:
    # 2-a UMAP path: no-op dispatcher method
    dispatcher.Dispatcher.enable_caching = lambda self: None
    # 2-c Patch *every* place that still holds a reference
    caching.FunctionCache = _NoCache       # core path
    ufuncbuilder.FunctionCache = _NoCache  # PyNNDescent path
    # 2-d Extra belt-and-braces flag
    # NOTE(review): numba is already imported at this point; confirm that
    # this variable is actually consulted by the installed version.
    os.environ["NUMBA_DISABLE_CACHE"] = "1"
# ───────────────────────────────────────────────────────────────────────────
# ── 3. Heavy imports (UMAP, BERTopic, FastAPI, …) ──────────────────────────
from typing import List | |
from fastapi import FastAPI, HTTPException | |
from pydantic import BaseModel | |
from bertopic import BERTopic | |
from sentence_transformers import SentenceTransformer | |
# ---------- the rest of your file (config, model init, endpoint) stays unchanged ---------- | |
... | |
# ---------- rest of the file unchanged ---------- | |
# ── 4. Configuration via env vars ──────────────────────────────────────────
# Each setting is read from the environment once at import time and falls
# back to the literal default shown here.
MODEL_NAME = os.environ.get("EMBED_MODEL", "Seznam/simcse-small-e-czech")
MIN_TOPIC = int(os.environ.get("MIN_TOPIC_SIZE", "10"))  # passed to BERTopic below
MAX_DOCS = int(os.environ.get("MAX_DOCS", "5000"))       # request-size cap

# ── 5. Initialise models once at container start ───────────────────────────
# The sentence encoder stores its downloaded files under /tmp/hfcache.
embeddings = SentenceTransformer(MODEL_NAME, cache_folder="/tmp/hfcache")

# The topic model reuses the encoder above for its embeddings.
topic_model = BERTopic(
    embedding_model=embeddings,
    min_topic_size=MIN_TOPIC,
    calculate_probabilities=True,
)
# ── 6. Pydantic schemas ────────────────────────────────────────────────────
class Sentence(BaseModel):
    # One input sentence of the request body.
    text: str                    # sentence content; this is what gets clustered
    start: float                 # start position of the sentence
    end: float                   # end position of the sentence
    speaker: str | None = None   # optional; omitted values default to None
class Segment(BaseModel):
    # One run of consecutive sentences that share a topic.
    topic_id: int               # topic assigned to the run
    label: str | None           # keywords joined by spaces; None for topic -1
    keywords: list[str]         # up to five top words of the topic
    start: float                # start of the first sentence in the run
    end: float                  # end of the last sentence in the run
    probability: float | None   # probability recorded when the run was opened
    sentences: list[int]        # indices into the request's sentence list
class SegmentationResponse(BaseModel):
    # Response envelope: a per-call identifier plus the ordered segments.
    run_id: str
    segments: list[Segment]
# ── 7. FastAPI app and endpoint ────────────────────────────────────────────
# ASGI application object for this module.
app = FastAPI(
    title="CZ Topic Segmenter",
    version="1.0",
)
def _assigned_probability(prob, topic_id):
    """Collapse one entry of the model's probability output to a float.

    Args:
        prob: ``None``, a scalar, or a per-topic vector for one document.
        topic_id: Topic assigned to that document (``-1`` for outliers).

    Returns:
        ``0.0`` when no probability is available or the topic has no entry
        in the vector; otherwise the matching value as a plain ``float``.
    """
    if prob is None:
        return 0.0
    try:
        size = len(prob)
    except TypeError:
        # Scalar (plain or 0-d): already a single probability.
        return float(prob)
    # NOTE(review): assumes entry i of the vector belongs to topic i, as in
    # BERTopic's documented (n_docs, n_topics) output — confirm.
    if 0 <= topic_id < size:
        return float(prob[topic_id])
    return 0.0


# NOTE(review): the function was not registered on `app` and
# SegmentationResponse was unused, so the service exposed no route.
# The path "/segment" is an assumption — adjust to what clients call.
@app.post("/segment", response_model=SegmentationResponse)
def segment(sentences: List[Sentence]):
    """Group consecutive sentences that were assigned the same topic.

    Args:
        sentences: Ordered input sentences.

    Returns:
        A dict with a fresh ``run_id`` and a ``segments`` list. Each segment
        spans from the start of its first sentence to the end of its last
        and carries the probability of the sentence that opened it.

    Raises:
        HTTPException: 413 when more than ``MAX_DOCS`` sentences are sent.
    """
    # Guardrail: avoid oversize requests
    if len(sentences) > MAX_DOCS:
        raise HTTPException(
            status_code=413,
            detail=f"Too many sentences ({len(sentences)} > {MAX_DOCS})"
        )
    # Nothing to cluster: answer directly instead of fitting on no documents.
    if not sentences:
        return {"run_id": str(uuid.uuid4()), "segments": []}

    docs = [s.text for s in sentences]
    # NOTE(review): this refits the module-level model on every call;
    # confirm that overlapping requests cannot interfere with each other.
    topics, probs = topic_model.fit_transform(docs)
    if probs is None:
        # Keep zip() below working when the model returns no probabilities.
        probs = [None] * len(topics)

    segments, cur = [], None
    for idx, (t_id, prob) in enumerate(zip(topics, probs)):
        t_id = int(t_id)  # plain int so the response serialises cleanly
        sentence = sentences[idx]

        # Same topic as the open segment: extend it and move on.
        if cur is not None and t_id == cur["topic_id"]:
            cur["end"] = sentence.end
            cur["sentences"].append(idx)
            continue

        if cur is not None:
            segments.append(cur)
        # Top-5 keywords for this topic; `or []` covers a falsy result for
        # an unknown topic id.
        words = [w for w, _ in (topic_model.get_topic(t_id) or [])[:5]]
        cur = dict(
            topic_id=t_id,
            label=" ".join(words) if t_id != -1 else None,
            keywords=words,
            start=sentence.start,
            end=sentence.end,
            probability=_assigned_probability(prob, t_id),
            sentences=[idx],
        )

    if cur is not None:
        segments.append(cur)
    return {"run_id": str(uuid.uuid4()), "segments": segments}
# ---------- END app.py ---------- |