Spaces:
Runtime error
Runtime error
Commit
·
07af1d2
1
Parent(s):
a5a5de8
Add files via upload
Browse files- app.py +356 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import os
|
3 |
+
from pinecone import Pinecone
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
from typing import List, Dict
|
6 |
+
import re # For parsing timestamp and extracting video ID
|
7 |
+
import streamlit.components.v1 as components # For embedding HTML
|
8 |
+
from openai import OpenAI # Import OpenAI library
|
9 |
+
import logging
|
10 |
+
|
11 |
+
# Setup logging
|
12 |
+
logging.basicConfig(level=logging.INFO)
|
13 |
+
logger = logging.getLogger(__name__)
|
14 |
+
|
15 |
+
# --- Helper Functions (Existing: parse_timestamp_to_seconds, get_youtube_video_id, add_timestamp_to_youtube_url, generate_youtube_embed_html) ---
|
16 |
+
def parse_timestamp_to_seconds(timestamp: str) -> int | None:
|
17 |
+
"""HH:MM:SS ๋๋ HH:MM:SS.ms ํ์์ ํ์์คํฌํ๋ฅผ ์ด ๋จ์๋ก ๋ณํํฉ๋๋ค."""
|
18 |
+
if not isinstance(timestamp, str):
|
19 |
+
return None
|
20 |
+
# Remove milliseconds part if present
|
21 |
+
timestamp_no_ms = timestamp.split('.')[0]
|
22 |
+
parts = timestamp_no_ms.split(':')
|
23 |
+
try:
|
24 |
+
if len(parts) == 3:
|
25 |
+
h, m, s = map(int, parts)
|
26 |
+
return h * 3600 + m * 60 + s
|
27 |
+
elif len(parts) == 2:
|
28 |
+
m, s = map(int, parts)
|
29 |
+
return m * 60 + s
|
30 |
+
elif len(parts) == 1:
|
31 |
+
return int(parts[0])
|
32 |
+
else:
|
33 |
+
return None
|
34 |
+
except ValueError:
|
35 |
+
return None
|
36 |
+
|
37 |
+
def get_youtube_video_id(url: str) -> str | None:
|
38 |
+
"""YouTube URL์์ ๋น๋์ค ID๋ฅผ ์ถ์ถํฉ๋๋ค."""
|
39 |
+
if not isinstance(url, str):
|
40 |
+
return None
|
41 |
+
# Standard YouTube URLs (youtube.com/watch?v=...), shortened URLs (youtu.be/...), etc.
|
42 |
+
match = re.search(r"(?:v=|/|youtu\.be/|embed/|shorts/)([0-9A-Za-z_-]{11})", url)
|
43 |
+
return match.group(1) if match else None
|
44 |
+
|
45 |
+
def add_timestamp_to_youtube_url(youtube_url: str, timestamp: str) -> str:
    """Return ``youtube_url`` with its ``t`` query parameter set to ``timestamp``.

    Any existing ``t`` parameter is removed (whatever its value looks like),
    the remaining query parameters keep their order and spelling, and a
    ``#fragment`` stays at the end of the URL.

    Args:
        youtube_url: The URL to decorate.
        timestamp: A timestamp understood by ``parse_timestamp_to_seconds``.

    Returns:
        The URL with ``t=<seconds>s`` appended to the query string. The
        original ``youtube_url`` is returned unchanged when the timestamp
        cannot be parsed, the URL is empty, or the URL cannot be split.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit, urlunsplit

    seconds = parse_timestamp_to_seconds(timestamp)
    if seconds is None or not youtube_url:
        return youtube_url

    try:
        parts = urlsplit(youtube_url)
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 host): leave it alone.
        return youtube_url

    # Work on whole "key=value" pairs so that removing a leading "t=..." can
    # never orphan the remaining parameters or leave half a value behind.
    kept_params = [
        pair
        for pair in parts.query.split("&")
        if pair and pair.split("=", 1)[0] != "t"
    ]
    kept_params.append(f"t={seconds}s")
    return urlunsplit(parts._replace(query="&".join(kept_params)))
|
56 |
+
|
57 |
+
def generate_youtube_embed_html(youtube_url: str, timestamp: str) -> str | None:
    """Build the HTML for a YouTube embed that starts at ``timestamp``.

    The player is wrapped in a fixed 800px-wide container whose height comes
    from 450px of bottom padding (16:9).

    Args:
        youtube_url: URL from which the video ID is extracted.
        timestamp: Start position; when it cannot be parsed the ``start``
            parameter is simply omitted.

    Returns:
        The HTML snippet, or ``None`` when no video ID can be extracted from
        ``youtube_url``.
    """
    video_id = get_youtube_video_id(youtube_url)
    if not video_id:
        logger.warning(f"Could not extract video ID from URL: {youtube_url}")
        return None  # Cannot generate embed code without video ID

    # Assemble the query from a list so a missing start time does not leave a
    # dangling "?&" in the src attribute.
    query_params = []
    start_seconds = parse_timestamp_to_seconds(timestamp)
    if start_seconds is not None:
        query_params.append(f"start={start_seconds}")
    query_params.extend(["autoplay=0", "rel=0"])
    query_string = "&".join(query_params)

    # Use aspect ratio approach with fixed width 800px
    return f'''
    <div style="position: relative; width: 800px; padding-bottom: 450px; /* 800px * 9 / 16 = 450px */ height: 0; overflow: hidden;">
        <iframe
            src="https://www.youtube.com/embed/{video_id}?{query_string}"
            frameborder="0"
            allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
            referrerpolicy="strict-origin-when-cross-origin"
            allowfullscreen
            style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;">
        </iframe>
    </div>
    '''
|
81 |
+
|
82 |
+
# --- Configuration ---
# SECURITY: API keys are read from the environment only. Never commit a key
# (or use one as a fallback default) in source control; any key that has ever
# appeared in a commit must be treated as compromised and revoked/rotated.

# Pinecone settings
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")  # None when unset
PINECONE_ENV = os.getenv("PINECONE_ENV", "us-east-1")
INDEX_NAME = "video-embeddings"
EMBEDDING_MODEL = "jhgan/ko-sroberta-multitask"

# OpenAI settings
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # None when unset
|
91 |
+
|
92 |
+
# --- ๋ฆฌ์์ค ๋ก๋ฉ (์บ์ฑ ํ์ฉ) ---
|
93 |
+
@st.cache_resource
|
94 |
+
def init_pinecone():
|
95 |
+
"""Pinecone ํด๋ผ์ด์ธํธ๋ฅผ ์ด๊ธฐํํฉ๋๋ค."""
|
96 |
+
api_key = PINECONE_API_KEY
|
97 |
+
if not api_key:
|
98 |
+
st.error("Pinecone API ํค๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. ํ๊ฒฝ ๋ณ์๋ฅผ ํ์ธํ์ธ์.")
|
99 |
+
st.stop()
|
100 |
+
try:
|
101 |
+
pc = Pinecone(api_key=api_key)
|
102 |
+
logger.info("Successfully connected to Pinecone.")
|
103 |
+
return pc
|
104 |
+
except Exception as e:
|
105 |
+
st.error(f"Pinecone ์ด๊ธฐํ ์ค ์ค๋ฅ ๋ฐ์: {e}")
|
106 |
+
st.stop()
|
107 |
+
|
108 |
+
@st.cache_resource
|
109 |
+
def load_embedding_model():
|
110 |
+
"""Sentence Transformer ๋ชจ๋ธ์ ๋ก๋ํฉ๋๋ค."""
|
111 |
+
try:
|
112 |
+
model = SentenceTransformer(EMBEDDING_MODEL)
|
113 |
+
logger.info(f"Successfully loaded embedding model: {EMBEDDING_MODEL}")
|
114 |
+
return model
|
115 |
+
except Exception as e:
|
116 |
+
st.error(f"์๋ฒ ๋ฉ ๋ชจ๋ธ ๋ก๋ฉ ์ค ์ค๋ฅ ๏ฟฝ๏ฟฝ๏ฟฝ์: {e}")
|
117 |
+
st.stop()
|
118 |
+
|
119 |
+
@st.cache_resource
|
120 |
+
def get_pinecone_index(_pc: Pinecone, index_name: str):
|
121 |
+
"""Pinecone ์ธ๋ฑ์ค ๊ฐ์ฒด๋ฅผ ๊ฐ์ ธ์ต๋๋ค."""
|
122 |
+
try:
|
123 |
+
index = _pc.Index(index_name)
|
124 |
+
# Optionally, do a quick check like index.describe_index_stats() to confirm connection
|
125 |
+
stats = index.describe_index_stats()
|
126 |
+
logger.info(f"Successfully connected to Pinecone index '{index_name}'. Stats: {stats.get('total_vector_count', 'N/A')} vectors")
|
127 |
+
return index
|
128 |
+
except Exception as e:
|
129 |
+
st.error(f"Pinecone ์ธ๋ฑ์ค '{index_name}' ์ฐ๊ฒฐ ์ค ์ค๋ฅ ๋ฐ์: {e}. ์ธ๋ฑ์ค๊ฐ ์กด์ฌํ๊ณ ํ์ฑ ์ํ์ธ์ง ํ์ธํ์ธ์.")
|
130 |
+
st.stop()
|
131 |
+
|
132 |
+
@st.cache_resource
|
133 |
+
def init_openai_client():
|
134 |
+
"""OpenAI ํด๋ผ์ด์ธํธ๋ฅผ ์ด๊ธฐํํฉ๋๋ค."""
|
135 |
+
if not OPENAI_API_KEY:
|
136 |
+
st.error("OpenAI API ํค๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. ํ๊ฒฝ ๋ณ์๋ฅผ ํ์ธํ์ธ์.")
|
137 |
+
st.stop()
|
138 |
+
try:
|
139 |
+
client = OpenAI(api_key=OPENAI_API_KEY)
|
140 |
+
# Test connection (optional, but recommended)
|
141 |
+
client.models.list()
|
142 |
+
logger.info("Successfully connected to OpenAI.")
|
143 |
+
return client
|
144 |
+
except Exception as e:
|
145 |
+
st.error(f"OpenAI ํด๋ผ์ด์ธํธ ์ด๊ธฐํ ๋๋ ์ฐ๊ฒฐ ํ
์คํธ ์ค ์ค๋ฅ ๋ฐ์: {e}")
|
146 |
+
st.stop()
|
147 |
+
|
148 |
+
# --- ๊ฒ์ ํจ์ ---
|
149 |
+
def search(query: str, top_k: int = 5, _index=None, _model=None) -> List[Dict]:
|
150 |
+
"""Pinecone ์ธ๋ฑ์ค์์ ๊ฒ์์ ์ํํ๊ณ title๊ณผ original_text๋ฅผ ํฌํจํฉ๋๋ค."""
|
151 |
+
if not query or _index is None or _model is None:
|
152 |
+
return []
|
153 |
+
|
154 |
+
try:
|
155 |
+
query_vec = _model.encode(query, convert_to_numpy=True).tolist()
|
156 |
+
result = _index.query(vector=query_vec, top_k=top_k, include_metadata=True)
|
157 |
+
matches = result.get("matches", [])
|
158 |
+
|
159 |
+
search_results = []
|
160 |
+
for m in matches:
|
161 |
+
metadata = m.get("metadata", {})
|
162 |
+
search_results.append({
|
163 |
+
"URL": metadata.get("url", "N/A"),
|
164 |
+
"ํ์์คํฌํ": metadata.get("timestamp", "N/A"),
|
165 |
+
"ํ์
": metadata.get("type", "N/A"),
|
166 |
+
"์ ๋ชฉ": metadata.get("title", "N/A"), # ์ ๋ชฉ ์ถ๊ฐ
|
167 |
+
"์์ฝ": metadata.get("summary", "N/A"),
|
168 |
+
"์๋ณธํ
์คํธ": metadata.get("original_text", "N/A"), # ์ปจํ
์คํธ๋ก ํ์ฉํ ์๋ณธ ํ
์คํธ
|
169 |
+
"์ ์": m.get("score", 0.0)
|
170 |
+
})
|
171 |
+
logger.info(f"Pinecone search returned {len(search_results)} results for query: '{query[:50]}...'")
|
172 |
+
return search_results
|
173 |
+
except Exception as e:
|
174 |
+
st.error(f"Pinecone ๊ฒ์ ์ค ์ค๋ฅ ๋ฐ์: {e}")
|
175 |
+
logger.error(f"Error during Pinecone search: {e}", exc_info=True)
|
176 |
+
return []
|
177 |
+
|
178 |
+
# --- OpenAI ๋ต๋ณ ์์ฑ ํจ์ ---
|
179 |
+
def generate_khan_answer(query: str, search_results: List[Dict], client: OpenAI) -> str:
|
180 |
+
"""์ฌ์ฉ์ ์ง๋ฌธ๊ณผ ๊ฒ์ ๊ฒฐ๊ณผ๋ฅผ ๋ฐํ์ผ๋ก Khan ํ๋ฅด์๋ ๋ต๋ณ์ ์์ฑํฉ๋๋ค."""
|
181 |
+
if not search_results:
|
182 |
+
# Return a persona-consistent message even when no results are found
|
183 |
+
return "ํ์ฌ ์ง๋ฌธ์ ๋ํด ์ฐธ๊ณ ํ ๋งํ ๊ด๋ จ ์์์ ์ฐพ์ง ๋ชปํ์ต๋๋ค. ์ง๋ฌธ์ ์กฐ๊ธ ๋ ๋ช
ํํ๊ฒ ํด์ฃผ์๊ฑฐ๋ ๋ค๋ฅธ ๋ฐฉ์์ผ๋ก ์ง๋ฌธํด์ฃผ์๋ฉด ๋์์ด ๋ ๊ฒ ๊ฐ์ต๋๋ค."
|
184 |
+
|
185 |
+
# Build context string for OpenAI more robustly, including timestamped URL
|
186 |
+
context_parts = []
|
187 |
+
for i, r in enumerate(search_results):
|
188 |
+
original_text_snippet = ""
|
189 |
+
if r.get('์๋ณธํ
์คํธ'):
|
190 |
+
snippet = r['์๋ณธํ
์คํธ'][:200]
|
191 |
+
original_text_snippet = f"\n(์๋ณธ ๋ด์ฉ ์ผ๋ถ: {snippet}...)"
|
192 |
+
|
193 |
+
# Generate timestamped URL if possible
|
194 |
+
timestamped_url_str = "N/A"
|
195 |
+
url = r.get('URL', 'N/A')
|
196 |
+
timestamp = r.get('ํ์์คํฌํ', 'N/A')
|
197 |
+
is_youtube = url and isinstance(url, str) and ('youtube.com' in url or 'youtu.be' in url)
|
198 |
+
has_valid_timestamp = timestamp and timestamp != 'N/A' and parse_timestamp_to_seconds(timestamp) is not None
|
199 |
+
if is_youtube and has_valid_timestamp:
|
200 |
+
try:
|
201 |
+
timestamped_url_str = add_timestamp_to_youtube_url(url, timestamp)
|
202 |
+
except Exception:
|
203 |
+
timestamped_url_str = url # Fallback to original URL on error
|
204 |
+
elif url != "N/A":
|
205 |
+
timestamped_url_str = url # Use original URL if not YouTube/no timestamp
|
206 |
+
|
207 |
+
context_parts.append(
|
208 |
+
f"๊ด๋ จ ์ ๋ณด {i+1}:\n"
|
209 |
+
f"์ ๋ชฉ: {r.get('์ ๋ชฉ', 'N/A')}\n"
|
210 |
+
f"์์ URL (์๋ณธ): {url}\n"
|
211 |
+
f"ํ์์คํฌํ: {timestamp}\n"
|
212 |
+
f"ํ์์คํฌํ ์ ์ฉ URL: {timestamped_url_str}\n" # Add the timestamped URL here
|
213 |
+
f"๋ด์ฉ ํ์
: {r.get('ํ์
', 'N/A')}\n"
|
214 |
+
f"์์ฝ: {r.get('์์ฝ', 'N/A')}"
|
215 |
+
f"{original_text_snippet}" # Append the snippet safely
|
216 |
+
)
|
217 |
+
context = "\n\n---\n\n".join(context_parts) # Join the parts
|
218 |
+
|
219 |
+
# Updated system prompt to instruct Markdown link usage
|
220 |
+
system_prompt = """๋๋ ํ์ค์ ์ธ ์กฐ์ธ์ ์ํ๋ PM ๋ฉํ Khan์ด๋ค.
|
221 |
+
- ๋งํฌ๋ ๋จํธํ์ง๋ง ๊ณต๊ฐ๋ ฅ์ด ์๋ค. "~์
๋๋ค." ๋๋ "~์ฃ ." ์ ๊ฐ์ด ๋ช
ํํ๊ฒ ๋๋งบ๋๋ค. ์กด๋๋ง์ ์ฌ์ฉํ๋ค.
|
222 |
+
- ๋ชจํธํ ์๋ก๋ณด๋ค๋ ๊ตฌ์กฐ์ ์ด๊ณ ์ค์ฉ์ ์ธ ์ ์์ ํ๋ค. ๋ฌธ์ ์ ํต์ฌ์ ํ์
ํ๊ณ ๊ตฌ์ฒด์ ์ธ ํด๊ฒฐ์ฑ
์ด๋ ๋ค์ ๋จ๊ณ๋ฅผ ์ ์ํ๋ค.
|
223 |
+
- ์ง๋ฌธ์ด ๋ง์ฐํ๋ฉด ๊ตฌ์ฒดํํด์ ๋๋ฌผ์ด๋ณธ๋ค.
|
224 |
+
- (์ด์ ๋ํ ๊ธฐ๋ก์ ์์ผ๋ฏ๋ก) ๋ฐ๋ณต ์ง๋ฌธ์๋ "์ด์ ์ ์ ์ฌํ ๋ด์ฉ์ ์ฐพ์๋ดค์์ฃ . ๋ค์ ํ๋ฒ ์ดํด๋ณด๋ฉด..." ๊ณผ ๊ฐ์ด ์ธ๊ธํ ์ ์๋ค.
|
225 |
+
- ๊ธด ์ค๋ช
๋ณด๋จ ํต์ฌ์ ๋น ๋ฅด๊ฒ ์ ๋ฌํ๊ณ , ํ์ํ๋ค๋ฉด ๊ฐ๊ฒฐํ ๋น์ ๋ฅผ ํ์ฉํ๋ค.
|
226 |
+
- ์ฃผ์ด์ง '๊ด๋ จ ์ ๋ณด' (์์ ์ ๋ชฉ, ์์ฝ, ์๋ณธ ๋ด์ฉ, ํ์์คํฌํ ์ ์ฉ URL ๋ฑ)๋ฅผ ๋ฐํ์ผ๋ก ๋ต๋ณํด์ผ ํ๋ค. ์ ๋ณด๊ฐ ๋ถ์กฑํ๊ฑฐ๋ ์ง๋ฌธ๊ณผ ๊ด๋ จ์ฑ์ด ๋ฎ์ผ๋ฉด, ๊ทธ ์ ์ ๋ช
ํํ ๋ฐํ๊ณ ์ถ๊ฐ ์ ๋ณด๋ฅผ ์์ฒญํ๊ฑฐ๋ ์ง๋ฌธ์ ๊ตฌ์ฒดํํ๋๋ก ์ ๋ํ๋ค.
|
227 |
+
- **๋ต๋ณ ์ค ๊ด๋ จ ์ ๋ณด๋ฅผ ์ฐธ์กฐํ ๋๋, ๋ฐ๋์ 'ํ์์คํฌํ ์ ์ฉ URL'์ ์ฌ์ฉํ์ฌ ๋ค์๊ณผ ๊ฐ์ Markdown ๋งํฌ ํ์์ผ๋ก ๋ง๋ค์ด์ผ ํ๋ค: `[์์ ์ ๋ชฉ](ํ์์คํฌํ_์ ์ฉ_URL)`. ์๋ฅผ ๋ค์ด, "์์ธํ ๋ด์ฉ์ [๋น๊ฐ๋ฐ์๊ฐ ์ฐ๋ด 2์ต์ ๋ฐ๋ ํ์ค์ ์ธ ๋ฐฉ๋ฒ](https://www.youtube.com/watch?v=VIDEO_ID&t=178s) ์์์ ์ฐธ๊ณ ํ์๋ฉด ๋์์ด ๋ ๊ฒ๋๋ค." ์ ๊ฐ์ด ํ์ํ๋ค.**
|
228 |
+
- ๋ต๋ณ์ ํ๊ตญ์ด๋ก ํ๋ค."""
|
229 |
+
|
230 |
+
# Use triple quotes for the multi-line f-string
|
231 |
+
user_message = f"""์ฌ์ฉ์ ์ง๋ฌธ: {query}
|
232 |
+
|
233 |
+
์๋ ๊ด๋ จ ์ ๋ณด๋ฅผ ๋ฐํ์ผ๋ก Khan ๋ฉํ ๋ก์ ๋ต๋ณํด์ฃผ์ธ์:
|
234 |
+
{context}"""
|
235 |
+
|
236 |
+
try:
|
237 |
+
logger.info("Calling OpenAI API...")
|
238 |
+
completion = client.chat.completions.create(
|
239 |
+
model="gpt-4o", # Use gpt-4 if available and preferred
|
240 |
+
messages=[
|
241 |
+
{"role": "system", "content": system_prompt},
|
242 |
+
{"role": "user", "content": user_message}
|
243 |
+
],
|
244 |
+
temperature=0.5, # Slightly less creative, more focused on instructions
|
245 |
+
)
|
246 |
+
answer = completion.choices[0].message.content
|
247 |
+
logger.info("Received response from OpenAI.")
|
248 |
+
return answer.strip()
|
249 |
+
except Exception as e:
|
250 |
+
st.error(f"OpenAI ๋ต๋ณ ์์ฑ ์ค ์ค๋ฅ ๋ฐ์: {e}")
|
251 |
+
logger.error(f"Error during OpenAI API call: {e}", exc_info=True)
|
252 |
+
return "๋ต๋ณ์ ์์ฑํ๋ ์ค์ ๋ฌธ์ ๊ฐ ๋ฐ์ํ์ต๋๋ค. OpenAI API ํค ๋๋ ์๋น์ค ์ํ๋ฅผ ํ์ธํด์ฃผ์ธ์."
|
253 |
+
|
254 |
+
# --- Streamlit ์ฑ UI ---
|
255 |
+
st.set_page_config(page_title="Khan ๋ฉํ (PM ์์ ๊ธฐ๋ฐ)", layout="wide")
|
256 |
+
st.title("โจ Khan ๋ฉํ ์๊ฒ ์ง๋ฌธํ๊ธฐ")
|
257 |
+
st.markdown("PM ๊ด๋ จ ์์ ๋ด์ฉ์ ๊ธฐ๋ฐ์ผ๋ก Khan ๋ฉํ ๊ฐ ๋ต๋ณํด ๋๋ฆฝ๋๋ค.")
|
258 |
+
|
259 |
+
# --- API ํค ํ์ธ ๋ฐ ๋ฆฌ์์ค ์ด๊ธฐํ ---
|
260 |
+
openai_client = init_openai_client()
|
261 |
+
pc = init_pinecone()
|
262 |
+
model = load_embedding_model()
|
263 |
+
index = get_pinecone_index(pc, INDEX_NAME)
|
264 |
+
|
265 |
+
# --- ์ฌ์ฉ์ ์
๋ ฅ ---
|
266 |
+
query = st.text_input("๋ฉํ ์๊ฒ ์ง๋ฌธํ ๋ด์ฉ์ ์
๋ ฅํ์ธ์:", placeholder="์: ์ ์
PM์ด ๊ฐ์ฅ ๋จผ์ ํด์ผ ํ ์ผ์ ๋ฌด์์ธ๊ฐ์?")
|
267 |
+
|
268 |
+
# --- ๊ฒ์ ๋ฐ ๋ต๋ณ ์์ฑ ์คํ ---
|
269 |
+
if st.button("Khan ๋ฉํ ์๊ฒ ์ง๋ฌธํ๊ธฐ"):
|
270 |
+
# Retrieve the top 5 matches from Pinecone (search is called with top_k=5 below)
|
271 |
+
if query and index and model and openai_client:
|
272 |
+
with st.spinner("๊ด๋ จ ์์์ ์ฐพ๊ณ Khan ๋ฉํ ๊ฐ ๋ต๋ณ์ ์ค๋นํ๋ ์ค..."):
|
273 |
+
# 1. Pinecone ๊ฒ์ (Always use top_k=3)
|
274 |
+
pinecone_results = search(query, top_k=5, _index=index, _model=model)
|
275 |
+
|
276 |
+
# 2. OpenAI ๋ต๋ณ ์์ฑ
|
277 |
+
khan_answer = generate_khan_answer(query, pinecone_results, openai_client)
|
278 |
+
|
279 |
+
# 3. ๊ฒฐ๊ณผ ํ์
|
280 |
+
st.subheader("๐ก Khan ๋ฉํ ์ ๋ต๋ณ")
|
281 |
+
st.markdown(khan_answer) # ์์ฑ๋ ๋ต๋ณ ํ์
|
282 |
+
|
283 |
+
# 4. ์ฐธ๊ณ ์๋ฃ (Pinecone ๊ฒ์ ๊ฒฐ๊ณผ) ํ์
|
284 |
+
if pinecone_results:
|
285 |
+
with st.expander("๋ต๋ณ์ ์ฐธ๊ณ ํ ์์ ์ ๋ณด ๋ณด๊ธฐ"):
|
286 |
+
displayed_urls = set() # Keep track of displayed video URLs
|
287 |
+
# Display one entry per unique URL; results whose URL was already shown are skipped
|
288 |
+
for i, r in enumerate(pinecone_results):
|
289 |
+
url = r.get('URL', 'N/A')
|
290 |
+
|
291 |
+
# Skip if this video URL has already been displayed
|
292 |
+
if url in displayed_urls or url == 'N/A':
|
293 |
+
continue
|
294 |
+
|
295 |
+
# Add the URL to the set of displayed URLs
|
296 |
+
displayed_urls.add(url)
|
297 |
+
|
298 |
+
# --- Display unique video info ---
|
299 |
+
st.markdown(f"--- **์ฐธ๊ณ ์๋ฃ {len(displayed_urls)} (์ ์ฌ๋: {r['์ ์']:.4f})** ---") # Use length of set for counter
|
300 |
+
st.markdown(f"**์ ๋ชฉ:** {r.get('์ ๋ชฉ', 'N/A')}")
|
301 |
+
st.markdown(f"**์์ฝ:** {r.get('์์ฝ', 'N/A')}")
|
302 |
+
|
303 |
+
timestamp = r.get('ํ์์คํฌํ', 'N/A')
|
304 |
+
is_youtube = url and isinstance(url, str) and ('youtube.com' in url or 'youtu.be' in url)
|
305 |
+
start_seconds = None # Initialize start_seconds
|
306 |
+
|
307 |
+
# Try to calculate start_seconds if timestamp is valid
|
308 |
+
if is_youtube and timestamp and timestamp != 'N/A':
|
309 |
+
start_seconds = parse_timestamp_to_seconds(timestamp)
|
310 |
+
|
311 |
+
# Display timestamped link (still useful for user)
|
312 |
+
if is_youtube and start_seconds is not None:
|
313 |
+
try:
|
314 |
+
# We still generate timestamped URL for the link text
|
315 |
+
timestamped_link_url = add_timestamp_to_youtube_url(url, timestamp)
|
316 |
+
st.markdown(f"**์์ ๋งํฌ (ํ์์คํฌํ ํฌํจ):** [{timestamped_link_url}]({timestamped_link_url})")
|
317 |
+
except Exception as e:
|
318 |
+
logger.error(f"Error creating timestamped URL for link: {e}")
|
319 |
+
st.markdown(f"**์์ ๋งํฌ (์๋ณธ):** [{url}]({url})") # Fallback link
|
320 |
+
elif url != "N/A" and isinstance(url, str) and url.startswith("http"):
|
321 |
+
st.markdown(f"**URL:** [{url}]({url})")
|
322 |
+
else:
|
323 |
+
st.markdown(f"**URL:** {url}")
|
324 |
+
|
325 |
+
# Use st.video with original URL and start_time parameter
|
326 |
+
if is_youtube and url != "N/A":
|
327 |
+
# Create columns to control width, place video in the first column (50% width)
|
328 |
+
col1, col2 = st.columns(2)
|
329 |
+
with col1:
|
330 |
+
try:
|
331 |
+
# Pass original URL and calculated start_seconds to st.video
|
332 |
+
st.video(url, start_time=start_seconds or 0)
|
333 |
+
except Exception as e:
|
334 |
+
st.error(f"๋น๋์ค({url}) ์ฌ์ ์ค ์ค๋ฅ ๋ฐ์: {e}")
|
335 |
+
# Fallback link uses the original URL here as start_time likely failed
|
336 |
+
st.markdown(f"[YouTube์์ ๋ณด๊ธฐ]({url})")
|
337 |
+
elif url != "N/A": # Try st.video for other potential video URLs (no start_time)
|
338 |
+
# Create columns for non-YouTube videos as well
|
339 |
+
col1, col2 = st.columns(2)
|
340 |
+
with col1:
|
341 |
+
try:
|
342 |
+
st.video(url)
|
343 |
+
except Exception as e:
|
344 |
+
logger.warning(f"st.video failed for non-YouTube URL {url}: {e}")
|
345 |
+
|
346 |
+
# Remove the display of original timestamp and type
|
347 |
+
# st.markdown(f"**ํ์์คํฌํ (์๋ณธ):** {timestamp}")
|
348 |
+
# st.markdown(f"**๋ด์ฉ ํ์
:** {r.get('ํ์
', 'N/A')}")
|
349 |
+
|
350 |
+
|
351 |
+
elif not query:
|
352 |
+
st.warning("์ง๋ฌธ ๋ด์ฉ์ ์
๋ ฅํด์ฃผ์ธ์.")
|
353 |
+
# API ํค ๋ฑ ๋ค๋ฅธ ์์ ๋ถ์ฌ ์ ์๋ฌ๋ ๊ฐ init ํจ์์์ ์ฒ๋ฆฌ๋จ
|
354 |
+
|
355 |
+
st.markdown("---")
|
356 |
+
st.caption("Powered by Pinecone, Sentence Transformers, and OpenAI")
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
pinecone
|
3 |
+
sentence-transformers
|
4 |
+
torch
|
5 |
+
pandas
|
6 |
+
openai
|