# ---------- BEGIN app.py ----------
import os, sys, json, uuid, types
# ── 0. Quick env print – delete later if you like ───────────────────────
print("ENV-snapshot:", json.dumps(dict(list(os.environ.items())[:25])))
sys.stdout.flush()
# ── 1. Ensure a writable dir (good housekeeping) ────────────────────────
os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba_cache")
os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
# ── 2. FINAL numba cache kill-switch ────────────────────────────────────
try:
    import numba
    from numba.core import dispatcher, caching
    import numba.np.ufunc.ufuncbuilder as ufuncbuilder

    # 2-a UMAP path: turn the dispatcher's cache hook into a no-op
    dispatcher.Dispatcher.enable_caching = lambda self: None

    # 2-b Build a stub that pretends to be a FunctionCache
    class _NoCache(types.SimpleNamespace):
        def __init__(self, *_, **__): pass
        load_overload = lambda *_, **__: False
        save_overload = lambda *_, **__: None
        enable_caching = lambda *_, **__: None

    # 2-c Patch *every* place that still holds a reference
    caching.FunctionCache = _NoCache       # core path
    ufuncbuilder.FunctionCache = _NoCache  # PyNNDescent path

    # 2-d Extra belt-and-braces flag
    os.environ["NUMBA_DISABLE_CACHE"] = "1"
except ImportError:
    # numba isn't installed yet during the first pip install; harmless
    pass
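# Quick self-check sketch (an assumption, not part of the original file):
# with the stubs above, a function that *requests* caching should still
# compile in memory and never touch the on-disk cache. Kept commented so
# importing this module stays side-effect free:
#
#     import numba
#
#     @numba.njit(cache=True)   # cache=True becomes a no-op here
#     def _probe(x):
#         return x + 1
#
#     assert _probe(1) == 2     # compiles fine, writes no cache files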
# ─────────────────────────────────────────────────────────────────────────
# ── 3. Heavy imports (UMAP, BERTopic, FastAPI, …) ───────────────────────
from typing import List
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
# ── 4. Configuration via env vars ─────────────────────────────────────────────
MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))
MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
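# Example overrides (hypothetical values, shown only to illustrate the knobs):
#
#     EMBED_MODEL=paraphrase-multilingual-MiniLM-L12-v2 \
#     MIN_TOPIC_SIZE=15 MAX_DOCS=2000 uvicorn app:app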
# ── 5. Initialise models once at container start ─────────────────────────────
embeddings = SentenceTransformer(MODEL_NAME, cache_folder="/tmp/hfcache")
topic_model = BERTopic(
    embedding_model=embeddings,
    min_topic_size=MIN_TOPIC,
    calculate_probabilities=True,
)
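# Smoke-test sketch (kept commented: fitting needs a few dozen documents,
# since UMAP's default neighbourhood is larger than a toy corpus, and
# load_some_czech_sentences is a hypothetical helper, not defined here):
#
#     _docs = load_some_czech_sentences()   # >= 50 short texts
#     _topics, _probs = topic_model.fit_transform(_docs)
#     print(set(_topics))                   # topic ids; -1 marks outliers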
# ── 6. Pydantic schemas ──────────────────────────────────────────────────────
class Sentence(BaseModel):
    text: str
    start: float
    end: float
    speaker: str | None = None

class Segment(BaseModel):
    topic_id: int
    label: str | None
    keywords: List[str]
    start: float
    end: float
    probability: float | None
    sentences: List[int]

class SegmentationResponse(BaseModel):
    run_id: str
    segments: List[Segment]
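# Example request body for POST /segment, matching the Sentence schema above
# (the values are invented for illustration):
#
#     [
#       {"text": "Dobrý den, vítejte.", "start": 0.0, "end": 2.1, "speaker": "A"},
#       {"text": "Dnes probereme rozpočet.", "start": 2.1, "end": 4.8, "speaker": "A"}
#     ]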
# ── 7. FastAPI app and endpoint ──────────────────────────────────────────────
app = FastAPI(title="CZ Topic Segmenter", version="1.0")
@app.post("/segment", response_model=SegmentationResponse)
def segment(sentences: List[Sentence]):
    # Guardrail: avoid oversize requests
    if len(sentences) > MAX_DOCS:
        raise HTTPException(
            status_code=413,
            detail=f"Too many sentences ({len(sentences)} > {MAX_DOCS})",
        )
    docs = [s.text for s in sentences]
    topics, probs = topic_model.fit_transform(docs)
    segments, cur = [], None
    for idx, (t_id, prob) in enumerate(zip(topics, probs)):
        if cur is None or t_id != cur["topic_id"]:
            if cur:
                segments.append(cur)
            # Top-5 keywords for this topic (get_topic can return False
            # for an unknown id, so fall back to an empty list)
            words = [w for w, _ in (topic_model.get_topic(t_id) or [])[:5]]
            cur = dict(
                topic_id=int(t_id),
                label=" ".join(words) if t_id != -1 else None,
                keywords=words,
                start=sentences[idx].start,
                end=sentences[idx].end,
                # With calculate_probabilities=True, `prob` is this document's
                # distribution over topics; keep its peak as the confidence.
                probability=float(max(prob)) if prob is not None else None,
                sentences=[idx],
            )
        else:
            cur["end"] = sentences[idx].end
            cur["sentences"].append(idx)
    if cur:
        segments.append(cur)
    return {"run_id": str(uuid.uuid4()), "segments": segments}
# ---------- END app.py ----------