morethanair committed on
Commit
07af1d2
·
1 Parent(s): a5a5de8

Add files via upload

Browse files
Files changed (2) hide show
  1. app.py +356 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from pinecone import Pinecone
4
+ from sentence_transformers import SentenceTransformer
5
+ from typing import List, Dict
6
+ import re # For parsing timestamp and extracting video ID
7
+ import streamlit.components.v1 as components # For embedding HTML
8
+ from openai import OpenAI # Import OpenAI library
9
+ import logging
10
+
11
+ # Setup logging
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # --- Helper Functions (Existing: parse_timestamp_to_seconds, get_youtube_video_id, add_timestamp_to_youtube_url, generate_youtube_embed_html) ---
16
+ def parse_timestamp_to_seconds(timestamp: str) -> int | None:
17
+ """HH:MM:SS ๋˜๋Š” HH:MM:SS.ms ํ˜•์‹์˜ ํƒ€์ž„์Šคํƒฌํ”„๋ฅผ ์ดˆ ๋‹จ์œ„๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค."""
18
+ if not isinstance(timestamp, str):
19
+ return None
20
+ # Remove milliseconds part if present
21
+ timestamp_no_ms = timestamp.split('.')[0]
22
+ parts = timestamp_no_ms.split(':')
23
+ try:
24
+ if len(parts) == 3:
25
+ h, m, s = map(int, parts)
26
+ return h * 3600 + m * 60 + s
27
+ elif len(parts) == 2:
28
+ m, s = map(int, parts)
29
+ return m * 60 + s
30
+ elif len(parts) == 1:
31
+ return int(parts[0])
32
+ else:
33
+ return None
34
+ except ValueError:
35
+ return None
36
+
37
+ def get_youtube_video_id(url: str) -> str | None:
38
+ """YouTube URL์—์„œ ๋น„๋””์˜ค ID๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค."""
39
+ if not isinstance(url, str):
40
+ return None
41
+ # Standard YouTube URLs (youtube.com/watch?v=...), shortened URLs (youtu.be/...), etc.
42
+ match = re.search(r"(?:v=|/|youtu\.be/|embed/|shorts/)([0-9A-Za-z_-]{11})", url)
43
+ return match.group(1) if match else None
44
+
45
def add_timestamp_to_youtube_url(youtube_url: str, timestamp: str) -> str:
    """Return ``youtube_url`` with its ``t`` query parameter set to ``timestamp``.

    Any existing ``t`` parameter is replaced, whatever its position or format
    (``t=90``, ``t=90s``, ``t=1m30s``). Other query parameters and the URL
    fragment are preserved, and the new ``t=<seconds>s`` is placed inside the
    query string rather than appended after a fragment.

    Args:
        youtube_url: The URL to annotate.
        timestamp: Timestamp text understood by ``parse_timestamp_to_seconds``.

    Returns:
        The rewritten URL, or ``youtube_url`` unchanged when the URL is empty,
        the timestamp cannot be parsed, or the URL itself cannot be split.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    seconds = parse_timestamp_to_seconds(timestamp)
    if seconds is None or not youtube_url:
        return youtube_url  # Nothing sensible to add; hand the input back.

    try:
        parts = urlsplit(youtube_url)
    except ValueError:
        return youtube_url

    # Rebuild the query without any previous "t" so the parameter is never
    # duplicated and the leading "?" is always regenerated correctly.
    params = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key != "t"
    ]
    params.append(("t", f"{seconds}s"))
    return urlunsplit(parts._replace(query=urlencode(params)))
56
+
57
def generate_youtube_embed_html(youtube_url: str, timestamp: str) -> str | None:
    """Build the HTML for an embedded YouTube player starting at ``timestamp``.

    The player sits in a fixed 800px-wide wrapper whose 450px bottom padding
    gives it a 16:9 aspect ratio.

    Args:
        youtube_url: URL from which the video ID is extracted.
        timestamp: Start position; when it cannot be parsed, the ``start``
            parameter is simply left out of the player URL.

    Returns:
        The HTML snippet, or ``None`` when no video ID can be extracted.
    """
    video_id = get_youtube_video_id(youtube_url)
    if not video_id:
        logger.warning("Could not extract video ID from URL: %s", youtube_url)
        return None  # Cannot generate embed code without a video ID.

    start_seconds = parse_timestamp_to_seconds(timestamp)

    # Assemble the query from a list so a missing start time cannot leave a
    # dangling "?&" in the src attribute.
    params = []
    if start_seconds is not None:
        params.append(f"start={start_seconds}")
    params.extend(["autoplay=0", "rel=0"])
    player_query = "&".join(params)

    return f'''
<div style="position: relative; width: 800px; padding-bottom: 450px; /* 800px * 9 / 16 = 450px */ height: 0; overflow: hidden;">
<iframe
src="https://www.youtube.com/embed/{video_id}?{player_query}"
frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
referrerpolicy="strict-origin-when-cross-origin"
allowfullscreen
style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;">
</iframe>
</div>
'''
81
+
82
# --- Configuration ---
# Credentials are read from the environment only and must never be committed
# to source control. NOTE(review): the keys that were previously hard-coded
# here are exposed in the repository history and should be revoked/rotated.
# When a key is missing, the init_* functions report it and stop the app.

# Pinecone settings
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV", "us-east-1")
INDEX_NAME = "video-embeddings"
EMBEDDING_MODEL = "jhgan/ko-sroberta-multitask"

# OpenAI settings
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
91
+
92
+ # --- ๋ฆฌ์†Œ์Šค ๋กœ๋”ฉ (์บ์‹ฑ ํ™œ์šฉ) ---
93
+ @st.cache_resource
94
+ def init_pinecone():
95
+ """Pinecone ํด๋ผ์ด์–ธํŠธ๋ฅผ ์ดˆ๊ธฐํ™”ํ•ฉ๋‹ˆ๋‹ค."""
96
+ api_key = PINECONE_API_KEY
97
+ if not api_key:
98
+ st.error("Pinecone API ํ‚ค๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ํ™˜๊ฒฝ ๋ณ€์ˆ˜๋ฅผ ํ™•์ธํ•˜์„ธ์š”.")
99
+ st.stop()
100
+ try:
101
+ pc = Pinecone(api_key=api_key)
102
+ logger.info("Successfully connected to Pinecone.")
103
+ return pc
104
+ except Exception as e:
105
+ st.error(f"Pinecone ์ดˆ๊ธฐํ™” ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
106
+ st.stop()
107
+
108
+ @st.cache_resource
109
+ def load_embedding_model():
110
+ """Sentence Transformer ๋ชจ๋ธ์„ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค."""
111
+ try:
112
+ model = SentenceTransformer(EMBEDDING_MODEL)
113
+ logger.info(f"Successfully loaded embedding model: {EMBEDDING_MODEL}")
114
+ return model
115
+ except Exception as e:
116
+ st.error(f"์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘ ์˜ค๋ฅ˜ ๏ฟฝ๏ฟฝ๏ฟฝ์ƒ: {e}")
117
+ st.stop()
118
+
119
+ @st.cache_resource
120
+ def get_pinecone_index(_pc: Pinecone, index_name: str):
121
+ """Pinecone ์ธ๋ฑ์Šค ๊ฐ์ฒด๋ฅผ ๊ฐ€์ ธ์˜ต๋‹ˆ๋‹ค."""
122
+ try:
123
+ index = _pc.Index(index_name)
124
+ # Optionally, do a quick check like index.describe_index_stats() to confirm connection
125
+ stats = index.describe_index_stats()
126
+ logger.info(f"Successfully connected to Pinecone index '{index_name}'. Stats: {stats.get('total_vector_count', 'N/A')} vectors")
127
+ return index
128
+ except Exception as e:
129
+ st.error(f"Pinecone ์ธ๋ฑ์Šค '{index_name}' ์—ฐ๊ฒฐ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}. ์ธ๋ฑ์Šค๊ฐ€ ์กด์žฌํ•˜๊ณ  ํ™œ์„ฑ ์ƒํƒœ์ธ์ง€ ํ™•์ธํ•˜์„ธ์š”.")
130
+ st.stop()
131
+
132
+ @st.cache_resource
133
+ def init_openai_client():
134
+ """OpenAI ํด๋ผ์ด์–ธํŠธ๋ฅผ ์ดˆ๊ธฐํ™”ํ•ฉ๋‹ˆ๋‹ค."""
135
+ if not OPENAI_API_KEY:
136
+ st.error("OpenAI API ํ‚ค๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ํ™˜๊ฒฝ ๋ณ€์ˆ˜๋ฅผ ํ™•์ธํ•˜์„ธ์š”.")
137
+ st.stop()
138
+ try:
139
+ client = OpenAI(api_key=OPENAI_API_KEY)
140
+ # Test connection (optional, but recommended)
141
+ client.models.list()
142
+ logger.info("Successfully connected to OpenAI.")
143
+ return client
144
+ except Exception as e:
145
+ st.error(f"OpenAI ํด๋ผ์ด์–ธํŠธ ์ดˆ๊ธฐํ™” ๋˜๋Š” ์—ฐ๊ฒฐ ํ…Œ์ŠคํŠธ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
146
+ st.stop()
147
+
148
+ # --- ๊ฒ€์ƒ‰ ํ•จ์ˆ˜ ---
149
+ def search(query: str, top_k: int = 5, _index=None, _model=None) -> List[Dict]:
150
+ """Pinecone ์ธ๋ฑ์Šค์—์„œ ๊ฒ€์ƒ‰์„ ์ˆ˜ํ–‰ํ•˜๊ณ  title๊ณผ original_text๋ฅผ ํฌํ•จํ•ฉ๋‹ˆ๋‹ค."""
151
+ if not query or _index is None or _model is None:
152
+ return []
153
+
154
+ try:
155
+ query_vec = _model.encode(query, convert_to_numpy=True).tolist()
156
+ result = _index.query(vector=query_vec, top_k=top_k, include_metadata=True)
157
+ matches = result.get("matches", [])
158
+
159
+ search_results = []
160
+ for m in matches:
161
+ metadata = m.get("metadata", {})
162
+ search_results.append({
163
+ "URL": metadata.get("url", "N/A"),
164
+ "ํƒ€์ž„์Šคํƒฌํ”„": metadata.get("timestamp", "N/A"),
165
+ "ํƒ€์ž…": metadata.get("type", "N/A"),
166
+ "์ œ๋ชฉ": metadata.get("title", "N/A"), # ์ œ๋ชฉ ์ถ”๊ฐ€
167
+ "์š”์•ฝ": metadata.get("summary", "N/A"),
168
+ "์›๋ณธํ…์ŠคํŠธ": metadata.get("original_text", "N/A"), # ์ปจํ…์ŠคํŠธ๋กœ ํ™œ์šฉํ•  ์›๋ณธ ํ…์ŠคํŠธ
169
+ "์ ์ˆ˜": m.get("score", 0.0)
170
+ })
171
+ logger.info(f"Pinecone search returned {len(search_results)} results for query: '{query[:50]}...'")
172
+ return search_results
173
+ except Exception as e:
174
+ st.error(f"Pinecone ๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
175
+ logger.error(f"Error during Pinecone search: {e}", exc_info=True)
176
+ return []
177
+
178
+ # --- OpenAI ๋‹ต๋ณ€ ์ƒ์„ฑ ํ•จ์ˆ˜ ---
179
+ def generate_khan_answer(query: str, search_results: List[Dict], client: OpenAI) -> str:
180
+ """์‚ฌ์šฉ์ž ์งˆ๋ฌธ๊ณผ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ Khan ํŽ˜๋ฅด์†Œ๋‚˜ ๋‹ต๋ณ€์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค."""
181
+ if not search_results:
182
+ # Return a persona-consistent message even when no results are found
183
+ return "ํ˜„์žฌ ์งˆ๋ฌธ์— ๋Œ€ํ•ด ์ฐธ๊ณ ํ•  ๋งŒํ•œ ๊ด€๋ จ ์˜์ƒ์„ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ์งˆ๋ฌธ์„ ์กฐ๊ธˆ ๋” ๋ช…ํ™•ํ•˜๊ฒŒ ํ•ด์ฃผ์‹œ๊ฑฐ๋‚˜ ๋‹ค๋ฅธ ๋ฐฉ์‹์œผ๋กœ ์งˆ๋ฌธํ•ด์ฃผ์‹œ๋ฉด ๋„์›€์ด ๋  ๊ฒƒ ๊ฐ™์Šต๋‹ˆ๋‹ค."
184
+
185
+ # Build context string for OpenAI more robustly, including timestamped URL
186
+ context_parts = []
187
+ for i, r in enumerate(search_results):
188
+ original_text_snippet = ""
189
+ if r.get('์›๋ณธํ…์ŠคํŠธ'):
190
+ snippet = r['์›๋ณธํ…์ŠคํŠธ'][:200]
191
+ original_text_snippet = f"\n(์›๋ณธ ๋‚ด์šฉ ์ผ๋ถ€: {snippet}...)"
192
+
193
+ # Generate timestamped URL if possible
194
+ timestamped_url_str = "N/A"
195
+ url = r.get('URL', 'N/A')
196
+ timestamp = r.get('ํƒ€์ž„์Šคํƒฌํ”„', 'N/A')
197
+ is_youtube = url and isinstance(url, str) and ('youtube.com' in url or 'youtu.be' in url)
198
+ has_valid_timestamp = timestamp and timestamp != 'N/A' and parse_timestamp_to_seconds(timestamp) is not None
199
+ if is_youtube and has_valid_timestamp:
200
+ try:
201
+ timestamped_url_str = add_timestamp_to_youtube_url(url, timestamp)
202
+ except Exception:
203
+ timestamped_url_str = url # Fallback to original URL on error
204
+ elif url != "N/A":
205
+ timestamped_url_str = url # Use original URL if not YouTube/no timestamp
206
+
207
+ context_parts.append(
208
+ f"๊ด€๋ จ ์ •๋ณด {i+1}:\n"
209
+ f"์ œ๋ชฉ: {r.get('์ œ๋ชฉ', 'N/A')}\n"
210
+ f"์˜์ƒ URL (์›๋ณธ): {url}\n"
211
+ f"ํƒ€์ž„์Šคํƒฌํ”„: {timestamp}\n"
212
+ f"ํƒ€์ž„์Šคํƒฌํ”„ ์ ์šฉ URL: {timestamped_url_str}\n" # Add the timestamped URL here
213
+ f"๋‚ด์šฉ ํƒ€์ž…: {r.get('ํƒ€์ž…', 'N/A')}\n"
214
+ f"์š”์•ฝ: {r.get('์š”์•ฝ', 'N/A')}"
215
+ f"{original_text_snippet}" # Append the snippet safely
216
+ )
217
+ context = "\n\n---\n\n".join(context_parts) # Join the parts
218
+
219
+ # Updated system prompt to instruct Markdown link usage
220
+ system_prompt = """๋„ˆ๋Š” ํ˜„์‹ค์ ์ธ ์กฐ์–ธ์„ ์ž˜ํ•˜๋Š” PM ๋ฉ˜ํ†  Khan์ด๋‹ค.
221
+ - ๋งํˆฌ๋Š” ๋‹จํ˜ธํ•˜์ง€๋งŒ ๊ณต๊ฐ๋ ฅ์ด ์žˆ๋‹ค. "~์ž…๋‹ˆ๋‹ค." ๋˜๋Š” "~์ฃ ." ์™€ ๊ฐ™์ด ๋ช…ํ™•ํ•˜๊ฒŒ ๋๋งบ๋Š”๋‹ค. ์กด๋Œ“๋ง์„ ์‚ฌ์šฉํ•œ๋‹ค.
222
+ - ๋ชจํ˜ธํ•œ ์œ„๋กœ๋ณด๋‹ค๋Š” ๊ตฌ์กฐ์ ์ด๊ณ  ์‹ค์šฉ์ ์ธ ์ œ์•ˆ์„ ํ•œ๋‹ค. ๋ฌธ์ œ์˜ ํ•ต์‹ฌ์„ ํŒŒ์•…ํ•˜๊ณ  ๊ตฌ์ฒด์ ์ธ ํ•ด๊ฒฐ์ฑ…์ด๋‚˜ ๋‹ค์Œ ๋‹จ๊ณ„๋ฅผ ์ œ์‹œํ•œ๋‹ค.
223
+ - ์งˆ๋ฌธ์ด ๋ง‰์—ฐํ•˜๋ฉด ๊ตฌ์ฒดํ™”ํ•ด์„œ ๋˜๋ฌผ์–ด๋ณธ๋‹ค.
224
+ - (์ด์ „ ๋Œ€ํ™” ๊ธฐ๋ก์€ ์—†์œผ๋ฏ€๋กœ) ๋ฐ˜๋ณต ์งˆ๋ฌธ์—๋Š” "์ด์ „์— ์œ ์‚ฌํ•œ ๋‚ด์šฉ์„ ์ฐพ์•„๋ดค์—ˆ์ฃ . ๋‹ค์‹œ ํ•œ๋ฒˆ ์‚ดํŽด๋ณด๋ฉด..." ๊ณผ ๊ฐ™์ด ์–ธ๊ธ‰ํ•  ์ˆ˜ ์žˆ๋‹ค.
225
+ - ๊ธด ์„ค๋ช…๋ณด๋‹จ ํ•ต์‹ฌ์„ ๋น ๋ฅด๊ฒŒ ์ „๋‹ฌํ•˜๊ณ , ํ•„์š”ํ•˜๋‹ค๋ฉด ๊ฐ„๊ฒฐํ•œ ๋น„์œ ๋ฅผ ํ™œ์šฉํ•œ๋‹ค.
226
+ - ์ฃผ์–ด์ง„ '๊ด€๋ จ ์ •๋ณด' (์˜์ƒ ์ œ๋ชฉ, ์š”์•ฝ, ์›๋ณธ ๋‚ด์šฉ, ํƒ€์ž„์Šคํƒฌํ”„ ์ ์šฉ URL ๋“ฑ)๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ๋‹ต๋ณ€ํ•ด์•ผ ํ•œ๋‹ค. ์ •๋ณด๊ฐ€ ๋ถ€์กฑํ•˜๊ฑฐ๋‚˜ ์งˆ๋ฌธ๊ณผ ๊ด€๋ จ์„ฑ์ด ๋‚ฎ์œผ๋ฉด, ๊ทธ ์ ์„ ๋ช…ํ™•ํžˆ ๋ฐํžˆ๊ณ  ์ถ”๊ฐ€ ์ •๋ณด๋ฅผ ์š”์ฒญํ•˜๊ฑฐ๋‚˜ ์งˆ๋ฌธ์„ ๊ตฌ์ฒดํ™”ํ•˜๋„๋ก ์œ ๋„ํ•œ๋‹ค.
227
+ - **๋‹ต๋ณ€ ์ค‘ ๊ด€๋ จ ์ •๋ณด๋ฅผ ์ฐธ์กฐํ•  ๋•Œ๋Š”, ๋ฐ˜๋“œ์‹œ 'ํƒ€์ž„์Šคํƒฌํ”„ ์ ์šฉ URL'์„ ์‚ฌ์šฉํ•˜์—ฌ ๋‹ค์Œ๊ณผ ๊ฐ™์€ Markdown ๋งํฌ ํ˜•์‹์œผ๋กœ ๋งŒ๋“ค์–ด์•ผ ํ•œ๋‹ค: `[์˜์ƒ ์ œ๋ชฉ](ํƒ€์ž„์Šคํƒฌํ”„_์ ์šฉ_URL)`. ์˜ˆ๋ฅผ ๋“ค์–ด, "์ž์„ธํ•œ ๋‚ด์šฉ์€ [๋น„๊ฐœ๋ฐœ์ž๊ฐ€ ์—ฐ๋ด‰ 2์–ต์„ ๋ฐ›๋Š” ํ˜„์‹ค์ ์ธ ๋ฐฉ๋ฒ•](https://www.youtube.com/watch?v=VIDEO_ID&t=178s) ์˜์ƒ์„ ์ฐธ๊ณ ํ•˜์‹œ๋ฉด ๋„์›€์ด ๋  ๊ฒ๋‹ˆ๋‹ค." ์™€ ๊ฐ™์ด ํ‘œ์‹œํ•œ๋‹ค.**
228
+ - ๋‹ต๋ณ€์€ ํ•œ๊ตญ์–ด๋กœ ํ•œ๋‹ค."""
229
+
230
+ # Use triple quotes for the multi-line f-string
231
+ user_message = f"""์‚ฌ์šฉ์ž ์งˆ๋ฌธ: {query}
232
+
233
+ ์•„๋ž˜ ๊ด€๋ จ ์ •๋ณด๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ Khan ๋ฉ˜ํ† ๋กœ์„œ ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”:
234
+ {context}"""
235
+
236
+ try:
237
+ logger.info("Calling OpenAI API...")
238
+ completion = client.chat.completions.create(
239
+ model="gpt-4o", # Use gpt-4 if available and preferred
240
+ messages=[
241
+ {"role": "system", "content": system_prompt},
242
+ {"role": "user", "content": user_message}
243
+ ],
244
+ temperature=0.5, # Slightly less creative, more focused on instructions
245
+ )
246
+ answer = completion.choices[0].message.content
247
+ logger.info("Received response from OpenAI.")
248
+ return answer.strip()
249
+ except Exception as e:
250
+ st.error(f"OpenAI ๋‹ต๋ณ€ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
251
+ logger.error(f"Error during OpenAI API call: {e}", exc_info=True)
252
+ return "๋‹ต๋ณ€์„ ์ƒ์„ฑํ•˜๋Š” ์ค‘์— ๋ฌธ์ œ๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. OpenAI API ํ‚ค ๋˜๋Š” ์„œ๋น„์Šค ์ƒํƒœ๋ฅผ ํ™•์ธํ•ด์ฃผ์„ธ์š”."
253
+
254
+ # --- Streamlit ์•ฑ UI ---
255
+ st.set_page_config(page_title="Khan ๋ฉ˜ํ†  (PM ์˜์ƒ ๊ธฐ๋ฐ˜)", layout="wide")
256
+ st.title("โœจ Khan ๋ฉ˜ํ† ์—๊ฒŒ ์งˆ๋ฌธํ•˜๊ธฐ")
257
+ st.markdown("PM ๊ด€๋ จ ์˜์ƒ ๋‚ด์šฉ์„ ๊ธฐ๋ฐ˜์œผ๋กœ Khan ๋ฉ˜ํ† ๊ฐ€ ๋‹ต๋ณ€ํ•ด ๋“œ๋ฆฝ๋‹ˆ๋‹ค.")
258
+
259
+ # --- API ํ‚ค ํ™•์ธ ๋ฐ ๋ฆฌ์†Œ์Šค ์ดˆ๊ธฐํ™” ---
260
+ openai_client = init_openai_client()
261
+ pc = init_pinecone()
262
+ model = load_embedding_model()
263
+ index = get_pinecone_index(pc, INDEX_NAME)
264
+
265
+ # --- ์‚ฌ์šฉ์ž ์ž…๋ ฅ ---
266
+ query = st.text_input("๋ฉ˜ํ† ์—๊ฒŒ ์งˆ๋ฌธํ•  ๋‚ด์šฉ์„ ์ž…๋ ฅํ•˜์„ธ์š”:", placeholder="์˜ˆ: ์‹ ์ž… PM์ด ๊ฐ€์žฅ ๋จผ์ € ํ•ด์•ผ ํ•  ์ผ์€ ๋ฌด์—‡์ธ๊ฐ€์š”?")
267
+
268
+ # --- ๊ฒ€์ƒ‰ ๋ฐ ๋‹ต๋ณ€ ์ƒ์„ฑ ์‹คํ–‰ ---
269
+ if st.button("Khan ๋ฉ˜ํ† ์—๊ฒŒ ์งˆ๋ฌธํ•˜๊ธฐ"):
270
+ # Retrieve the top 5 Pinecone matches (top_k=5); duplicates are collapsed by URL when displayed
271
+ if query and index and model and openai_client:
272
+ with st.spinner("๊ด€๋ จ ์˜์ƒ์„ ์ฐพ๊ณ  Khan ๋ฉ˜ํ† ๊ฐ€ ๋‹ต๋ณ€์„ ์ค€๋น„ํ•˜๋Š” ์ค‘..."):
273
+ # 1. Pinecone search (top_k=5)
274
+ pinecone_results = search(query, top_k=5, _index=index, _model=model)
275
+
276
+ # 2. OpenAI ๋‹ต๋ณ€ ์ƒ์„ฑ
277
+ khan_answer = generate_khan_answer(query, pinecone_results, openai_client)
278
+
279
+ # 3. ๊ฒฐ๊ณผ ํ‘œ์‹œ
280
+ st.subheader("๐Ÿ’ก Khan ๋ฉ˜ํ† ์˜ ๋‹ต๋ณ€")
281
+ st.markdown(khan_answer) # ์ƒ์„ฑ๋œ ๋‹ต๋ณ€ ํ‘œ์‹œ
282
+
283
+ # 4. ์ฐธ๊ณ  ์ž๋ฃŒ (Pinecone ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ) ํ‘œ์‹œ
284
+ if pinecone_results:
285
+ with st.expander("๋‹ต๋ณ€์— ์ฐธ๊ณ ํ•œ ์˜์ƒ ์ •๋ณด ๋ณด๊ธฐ"):
286
+ displayed_urls = set() # Keep track of displayed video URLs
287
+ # Display each unique video URL once (no cap beyond the top_k search results)
288
+ for i, r in enumerate(pinecone_results):
289
+ url = r.get('URL', 'N/A')
290
+
291
+ # Skip if this video URL has already been displayed
292
+ if url in displayed_urls or url == 'N/A':
293
+ continue
294
+
295
+ # Add the URL to the set of displayed URLs
296
+ displayed_urls.add(url)
297
+
298
+ # --- Display unique video info ---
299
+ st.markdown(f"--- **์ฐธ๊ณ  ์ž๋ฃŒ {len(displayed_urls)} (์œ ์‚ฌ๋„: {r['์ ์ˆ˜']:.4f})** ---") # Use length of set for counter
300
+ st.markdown(f"**์ œ๋ชฉ:** {r.get('์ œ๋ชฉ', 'N/A')}")
301
+ st.markdown(f"**์š”์•ฝ:** {r.get('์š”์•ฝ', 'N/A')}")
302
+
303
+ timestamp = r.get('ํƒ€์ž„์Šคํƒฌํ”„', 'N/A')
304
+ is_youtube = url and isinstance(url, str) and ('youtube.com' in url or 'youtu.be' in url)
305
+ start_seconds = None # Initialize start_seconds
306
+
307
+ # Try to calculate start_seconds if timestamp is valid
308
+ if is_youtube and timestamp and timestamp != 'N/A':
309
+ start_seconds = parse_timestamp_to_seconds(timestamp)
310
+
311
+ # Display timestamped link (still useful for user)
312
+ if is_youtube and start_seconds is not None:
313
+ try:
314
+ # We still generate timestamped URL for the link text
315
+ timestamped_link_url = add_timestamp_to_youtube_url(url, timestamp)
316
+ st.markdown(f"**์˜์ƒ ๋งํฌ (ํƒ€์ž„์Šคํƒฌํ”„ ํฌํ•จ):** [{timestamped_link_url}]({timestamped_link_url})")
317
+ except Exception as e:
318
+ logger.error(f"Error creating timestamped URL for link: {e}")
319
+ st.markdown(f"**์˜์ƒ ๋งํฌ (์›๋ณธ):** [{url}]({url})") # Fallback link
320
+ elif url != "N/A" and isinstance(url, str) and url.startswith("http"):
321
+ st.markdown(f"**URL:** [{url}]({url})")
322
+ else:
323
+ st.markdown(f"**URL:** {url}")
324
+
325
+ # Use st.video with original URL and start_time parameter
326
+ if is_youtube and url != "N/A":
327
+ # Create columns to control width, place video in the first column (50% width)
328
+ col1, col2 = st.columns(2)
329
+ with col1:
330
+ try:
331
+ # Pass original URL and calculated start_seconds to st.video
332
+ st.video(url, start_time=start_seconds or 0)
333
+ except Exception as e:
334
+ st.error(f"๋น„๋””์˜ค({url}) ์žฌ์ƒ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
335
+ # Fallback link uses the original URL here as start_time likely failed
336
+ st.markdown(f"[YouTube์—์„œ ๋ณด๊ธฐ]({url})")
337
+ elif url != "N/A": # Try st.video for other potential video URLs (no start_time)
338
+ # Create columns for non-YouTube videos as well
339
+ col1, col2 = st.columns(2)
340
+ with col1:
341
+ try:
342
+ st.video(url)
343
+ except Exception as e:
344
+ logger.warning(f"st.video failed for non-YouTube URL {url}: {e}")
345
+
346
+ # Remove the display of original timestamp and type
347
+ # st.markdown(f"**ํƒ€์ž„์Šคํƒฌํ”„ (์›๋ณธ):** {timestamp}")
348
+ # st.markdown(f"**๋‚ด์šฉ ํƒ€์ž…:** {r.get('ํƒ€์ž…', 'N/A')}")
349
+
350
+
351
+ elif not query:
352
+ st.warning("์งˆ๋ฌธ ๋‚ด์šฉ์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”.")
353
+ # API ํ‚ค ๋“ฑ ๋‹ค๋ฅธ ์š”์†Œ ๋ถ€์žฌ ์‹œ ์—๋Ÿฌ๋Š” ๊ฐ init ํ•จ์ˆ˜์—์„œ ์ฒ˜๋ฆฌ๋จ
354
+
355
+ st.markdown("---")
356
+ st.caption("Powered by Pinecone, Sentence Transformers, and OpenAI")
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pinecone
3
+ sentence-transformers
4
+ torch
5
+ pandas
6
+ openai