# ---------- BEGIN app.py ----------
import os, sys, json, uuid, types
# ── 0. Quick env print (delete later if you like) ───────────────────────
print("ENV-snapshot:", json.dumps(dict(list(os.environ.items())[:25])))
sys.stdout.flush()
# ── 1. Ensure a writable dir (good housekeeping) ────────────────────────
os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba_cache")
os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
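# (Optional, same idea - an assumption, not something this file requires:
#  point the Hugging Face download cache at /tmp as well, in case $HOME is
#  read-only in the container.)
# os.environ.setdefault("HF_HOME", "/tmp/hfcache")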
# ── 2. FINAL numba cache kill-switch ────────────────────────────────────
try:
    import numba, types
    from numba.core import dispatcher, caching
    import numba.np.ufunc.ufuncbuilder as ufuncbuilder

    # 2-a UMAP path: no-op dispatcher method
    dispatcher.Dispatcher.enable_caching = lambda self: None

    # 2-b Build a stub that pretends to be a FunctionCache
    class _NoCache(types.SimpleNamespace):
        def __init__(self, *_, **__): pass
        load_overload = lambda *_, **__: False
        save_overload = lambda *_, **__: None
        enable_caching = lambda *_, **__: None

    # 2-c Patch *every* place that still holds a reference
    caching.FunctionCache = _NoCache        # core path
    ufuncbuilder.FunctionCache = _NoCache   # PyNNDescent path

    # 2-d Extra belt-and-braces flag
    os.environ["NUMBA_DISABLE_CACHE"] = "1"
except ImportError:
    # numba isn't installed yet during the first pip install - harmless
    pass
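# Optional smoke test (safe to delete): if numba imported, verify that the
# stubs from step 2 actually took. This only checks our own patch points,
# nothing numba-internal.
if "numba" in sys.modules:
    assert caching.FunctionCache is _NoCache
    assert ufuncbuilder.FunctionCache is _NoCache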
# ─────────────────────────────────────────────────────────────────────────
# ── 3. Heavy imports (UMAP, BERTopic, FastAPI, …) ───────────────────────
from typing import List
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
# ── 4. Configuration via env vars ───────────────────────────────────────
MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))
MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
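# Example override at deploy time (hypothetical values, shown for a local
# run; on Spaces these would be set as repository variables instead):
#   EMBED_MODEL=intfloat/multilingual-e5-small MIN_TOPIC_SIZE=15 MAX_DOCS=2000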
# ── 5. Initialise models once at container start ────────────────────────
embeddings = SentenceTransformer(MODEL_NAME, cache_folder="/tmp/hfcache")
topic_model = BERTopic(
embedding_model=embeddings,
min_topic_size=MIN_TOPIC,
calculate_probabilities=True,
)
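# Optional warm-up (an assumption about latency, not required): one tiny
# encode at boot moves model-load cost out of the first real request.
# embeddings.encode(["warmup"])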
# ── 6. Pydantic schemas ─────────────────────────────────────────────────
class Sentence(BaseModel):
    text: str
    start: float
    end: float
    speaker: str | None = None

class Segment(BaseModel):
    topic_id: int
    label: str | None
    keywords: List[str]
    start: float
    end: float
    probability: float | None
    sentences: List[int]

class SegmentationResponse(BaseModel):
    run_id: str
    segments: List[Segment]
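# Example request body for POST /segment (hypothetical two-sentence
# transcript matching the Sentence schema above):
# [
#   {"text": "Dobrý den, vítejte u podcastu.", "start": 0.0, "end": 2.1, "speaker": "A"},
#   {"text": "Dnes probereme rozpočet.", "start": 2.2, "end": 4.0, "speaker": "B"}
# ]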
# ── 7. FastAPI app and endpoint ─────────────────────────────────────────
app = FastAPI(title="CZ Topic Segmenter", version="1.0")
@app.post("/segment", response_model=SegmentationResponse)
def segment(sentences: List[Sentence]):
    # Guardrail: avoid oversize requests
    if len(sentences) > MAX_DOCS:
        raise HTTPException(
            status_code=413,
            detail=f"Too many sentences ({len(sentences)} > {MAX_DOCS})",
        )
    docs = [s.text for s in sentences]
    topics, probs = topic_model.fit_transform(docs)
    segments, cur = [], None
    for idx, (t_id, prob) in enumerate(zip(topics, probs)):
        if cur is None or t_id != cur["topic_id"]:
            if cur:
                segments.append(cur)
            # Top-5 keywords for this topic
            words = [w for w, _ in topic_model.get_topic(t_id)[:5]]
            # With calculate_probabilities=True, fit_transform returns a
            # per-document *vector* of topic probabilities, so take the
            # probability of the assigned topic (None for outliers).
            cur = dict(
                topic_id=t_id,
                label=" ".join(words) if t_id != -1 else None,
                keywords=words,
                start=sentences[idx].start,
                end=sentences[idx].end,
                probability=float(prob[t_id]) if t_id != -1 else None,
                sentences=[idx],
            )
        else:
            # Same topic as the previous sentence: extend the open segment
            cur["end"] = sentences[idx].end
            cur["sentences"].append(idx)
    if cur:
        segments.append(cur)
    return {"run_id": str(uuid.uuid4()), "segments": segments}
# ---------- END app.py ---------- |
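# ---------- usage sketch (not part of app.py) ----------
# A minimal client call - assumes a local run on the default Spaces port
# 7860, e.g. `uvicorn app:app --host 0.0.0.0 --port 7860`:
#
#   curl -X POST http://localhost:7860/segment \
#        -H "Content-Type: application/json" \
#        -d '[{"text": "Ahoj, vítejte.", "start": 0.0, "end": 1.5}]'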