# ---------- BEGIN app.py ----------
import os, sys, json, uuid, types
# ── 0. Quick env print – delete later if you like ───────────────────────
print("ENV-snapshot:", json.dumps(dict(list(os.environ.items())[:25])))
sys.stdout.flush()
# ── 1. Ensure a writable dir (good housekeeping) ────────────────────────
os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba_cache")
os.makedirs(os.environ["NUMBA_CACHE_DIR"], exist_ok=True)
# ── 2. FINAL numba cache kill-switch ────────────────────────────────────
try:
    import numba
    from numba.core import dispatcher, caching
    import numba.np.ufunc.ufuncbuilder as ufuncbuilder

    # 2-a UMAP path: turn the dispatcher's cache hook into a no-op
    dispatcher.Dispatcher.enable_caching = lambda self: None

    # 2-b Build a stub that pretends to be a FunctionCache
    class _NoCache(types.SimpleNamespace):
        def __init__(self, *_, **__): pass
        load_overload = lambda *_, **__: False
        save_overload = lambda *_, **__: None
        enable_caching = lambda *_, **__: None

    # 2-c Patch *every* place that still holds a reference
    caching.FunctionCache = _NoCache       # core path
    ufuncbuilder.FunctionCache = _NoCache  # PyNNDescent path

    # 2-d Extra belt-and-braces flag
    os.environ["NUMBA_DISABLE_CACHE"] = "1"
except ImportError:
    # numba isn't installed yet during the first pip install; harmless
    pass
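# Quick self-check sketch (an assumption, not part of the original file):
# with the stubs above, a function that *requests* caching should still
# compile in memory and never touch the on-disk cache. Kept commented so
# importing this module stays side-effect free:
#
#     import numba
#
#     @numba.njit(cache=True)   # cache=True becomes a no-op here
#     def _probe(x):
#         return x + 1
#
#     assert _probe(1) == 2     # compiles fine, writes no cache files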
# ─────────────────────────────────────────────────────────────────────────
# ── 3. Heavy imports (UMAP, BERTopic, FastAPI, …) ───────────────────────
from typing import List
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
# ── 4. Configuration via env vars ─────────────────────────────────────────────
MODEL_NAME = os.getenv("EMBED_MODEL", "Seznam/simcse-small-e-czech")
MIN_TOPIC = int(os.getenv("MIN_TOPIC_SIZE", "10"))
MAX_DOCS = int(os.getenv("MAX_DOCS", "5000"))
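# Example overrides (hypothetical values, shown only to illustrate the knobs):
#
#     EMBED_MODEL=paraphrase-multilingual-MiniLM-L12-v2 \
#     MIN_TOPIC_SIZE=15 MAX_DOCS=2000 uvicorn app:app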
# ── 5. Initialise models once at container start ─────────────────────────────
embeddings = SentenceTransformer(MODEL_NAME, cache_folder="/tmp/hfcache")
topic_model = BERTopic(
    embedding_model=embeddings,
    min_topic_size=MIN_TOPIC,
    calculate_probabilities=True,
)
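# Smoke-test sketch (kept commented: fitting needs a few dozen documents,
# since UMAP's default neighbourhood is larger than a toy corpus, and
# load_some_czech_sentences is a hypothetical helper, not defined here):
#
#     _docs = load_some_czech_sentences()   # >= 50 short texts
#     _topics, _probs = topic_model.fit_transform(_docs)
#     print(set(_topics))                   # topic ids; -1 marks outliers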
# ── 6. Pydantic schemas ──────────────────────────────────────────────────────
class Sentence(BaseModel):
    text: str
    start: float
    end: float
    speaker: str | None = None

class Segment(BaseModel):
    topic_id: int
    label: str | None
    keywords: List[str]
    start: float
    end: float
    probability: float | None
    sentences: List[int]

class SegmentationResponse(BaseModel):
    run_id: str
    segments: List[Segment]
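# Example request body for POST /segment, matching the Sentence schema above
# (the values are invented for illustration):
#
#     [
#       {"text": "Dobrý den, vítejte.", "start": 0.0, "end": 2.1, "speaker": "A"},
#       {"text": "Dnes probereme rozpočet.", "start": 2.1, "end": 4.8, "speaker": "A"}
#     ]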
# ── 7. FastAPI app and endpoint ──────────────────────────────────────────────
app = FastAPI(title="CZ Topic Segmenter", version="1.0")
@app.post("/segment", response_model=SegmentationResponse)
def segment(sentences: List[Sentence]):
    # Guardrail: avoid oversize requests
    if len(sentences) > MAX_DOCS:
        raise HTTPException(
            status_code=413,
            detail=f"Too many sentences ({len(sentences)} > {MAX_DOCS})",
        )
    docs = [s.text for s in sentences]
    topics, probs = topic_model.fit_transform(docs)
    segments, cur = [], None
    for idx, (t_id, prob) in enumerate(zip(topics, probs)):
        if cur is None or t_id != cur["topic_id"]:
            if cur:
                segments.append(cur)
            # Top-5 keywords for this topic (get_topic can return False
            # for an unknown id, so fall back to an empty list)
            words = [w for w, _ in (topic_model.get_topic(t_id) or [])[:5]]
            cur = dict(
                topic_id=int(t_id),
                label=" ".join(words) if t_id != -1 else None,
                keywords=words,
                start=sentences[idx].start,
                end=sentences[idx].end,
                # With calculate_probabilities=True, `prob` is this document's
                # distribution over topics; keep its peak as the confidence.
                probability=float(max(prob)) if prob is not None else None,
                sentences=[idx],
            )
        else:
            cur["end"] = sentences[idx].end
            cur["sentences"].append(idx)
    if cur:
        segments.append(cur)
    return {"run_id": str(uuid.uuid4()), "segments": segments}
# ---------- END app.py ----------