Spaces:

Sodagraph
/

YouTube_Transcript_Extraction

Build error

App Files Community

Sodagraph committed on Jul 9

Commit

f6b1133

1 Parent(s): 5fd4118

cli 0.3

Browse files

Files changed (3) hide show

backend/app/main.py +19 -75
backend/app/rag_core.py +100 -46
backend/app/youtube_parser.py +111 -161

backend/app/main.py CHANGED Viewed

@@ -2,21 +2,20 @@
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.staticfiles import StaticFiles # StaticFiles 임포트
 import os
 from pydantic import BaseModel
-import httpx
 from youtube_parser import process_youtube_video_data
-from rag_core import perform_rag_query
 app = FastAPI()
-# CORS 설정: 프론트엔드와 백엔드가 다른 포트에서 실행될 때 필요
 origins = [
-    "http://localhost:8080", # Vue 개발 서버 기본 포트
-    "http://localhost:5173", # Vue Vite 개발 서버 기본 포트
-    "https://sodagraph-po.hf.space", # 여러분의 Hugging Face Space URL
 ]
 app.add_middleware(
@@ -32,102 +31,47 @@ current_file_dir = os.path.dirname(os.path.abspath(__file__))
 project_root_dir = os.path.join(current_file_dir, "..", "..")
 static_files_dir = os.path.join(project_root_dir, "static")
-# OLLAMA_API_BASE_URL 환경 변수 설정
-OLLAMA_API_BASE_URL = os.getenv("OLLAMA_API_BASE_URL", "http://127.0.0.1:11434")
-async def generate_answer_with_ollama(model_name: str, prompt: str) -> str:
-    """
-    Ollama 서버에 질의하여 답변을 생성합니다.
-    """
-    url = f"{OLLAMA_API_BASE_URL}/api/generate"
-    headers = {"Content-Type": "application/json"}
-    data = {
-        "model": model_name,
-        "prompt": prompt,
-        "stream": False # 스트리밍을 사용하지 않고 한 번에 답변을 받습니다.
-    }
-    print(f"INFO: Ollama API 호출 시작. 모델: {model_name}")
-    print(f"INFO: 프롬프트 미리보기: {prompt[:200]}...")
-    try:
-        async with httpx.AsyncClient(timeout=600.0) as client:
-            response = await client.post(url, headers=headers, json=data)
-            response.raise_for_status()
-            response_data = response.json()
-            full_response = response_data.get("response", "").strip()
-            return full_response
-    except httpx.HTTPStatusError as e:
-        print(f"ERROR: Ollama API 호출 실패: {e}")
-        raise HTTPException(status_code=500, detail="Ollama API 호출 실패")
-    except httpx.RequestError as e:
-        print(f"ERROR: 네트워크 오류: {e}")
-        raise HTTPException(status_code=500, detail="네트워크 오류가 발생했습니다. 잠시 후 다시 시도해주세요.")
-    except Exception as e:
-        print(f"ERROR: 알 수 없는 오류: {e}")
-        raise HTTPException(status_code=500, detail="알 수 없는 오류가 발생했습니다. 잠시 후 다시 시도해주세요.")
 class VideoProcessRequest(BaseModel):
     video_url: str
     query: str
-    # 사용자가 사용할 Ollama 모델 이름을 지정할 수 있도록 추가
     ollama_model_name: str = "hf.co/DevQuasar/naver-hyperclovax.HyperCLOVAX-SEED-Text-Instruct-0.5B-GGUF:F16"
 # ✅ 유튜브 영상 처리 API
 @app.post("/api/process_youtube_video")
 async def process_youtube_video(request: VideoProcessRequest):
     try:
         processed_chunks_with_timestamps = await process_youtube_video_data(request.video_url)
         if not processed_chunks_with_timestamps:
             return {"message": "자막 또는 내용을 추출할 수 없습니다.", "results": []}
-        # 1. RAG 검색 수행
-        rag_results = await perform_rag_query(
-            chunks_with_timestamps=processed_chunks_with_timestamps,
             query=request.query,
             top_k=50
         )
-        if not rag_results:
-            return {
-                "status": "error",
-                "message": "검색 결과가 없습니다.",
-                "video_url": request.video_url,
-                "query": request.query,
-                "results": []
-            }
-        # 2. 검색 결과를 프롬프트에 추가
-        context = "\\n\\n".join([chunk["text"] for chunk in rag_results])
-        prompt = f"다음 정보와 대화 내용을 참고하여 사용자의 질문에 답변하세요. 제공된 정보에서 답을 찾을 수 없다면 '정보 부족'이라고 명시하세요.\\n\\n참고 정보:\\n{context}\\n\\n사용자 질문: {request.query}\\n\\n답변:"
-        # 3. Ollama 모델에 질의하여 답변 생성
-        generated_answer = await generate_answer_with_ollama(
-            model_name=request.ollama_model_name,
-            prompt=prompt
-        )
         return {
-            "status": "success",
-            "message": "성공적으로 영상을 처리하고 RAG 검색을 수행했습니다.",
             "video_url": request.video_url,
             "query": request.query,
             "ollama_model_used": request.ollama_model_name,
-            "retrieved_chunks": rag_results,
-            "generated_answer": generated_answer
         }
     except Exception as e:
         print(f"ERROR: 서버 처리 중 오류 발생: {str(e)}")
-        raise HTTPException(status_code=500, detail="서버 처리 중 오류가 발생했습니다. 잠시 후 다시 시도해주세요.")
 # ✅ 정적 파일은 마지막에 mount
 app.mount("/", StaticFiles(directory=static_files_dir, html=True), name="static")
-# 서버 실행을 위한 메인 진입점 (Docker에서는 Uvicorn이 직접 호출하므로 필수는 아님)
 if __name__ == "__main__":
     import uvicorn
-    import os
-    port = int(os.environ.get("PORT", 7860))  # Hugging Face가 전달하는 포트를 우선 사용
-    uvicorn.run(app, host="0.0.0.0", port=port)

 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
 import os
 from pydantic import BaseModel
 from youtube_parser import process_youtube_video_data
+from rag_core import perform_rag_and_generate # 수정된 함수를 임포트
 app = FastAPI()
+# CORS 설정
 origins = [
+    "http://localhost:8080",
+    "http://localhost:5173",
+    "https://sodagraph-po.hf.space",
 ]
 app.add_middleware(
 project_root_dir = os.path.join(current_file_dir, "..", "..")
 static_files_dir = os.path.join(project_root_dir, "static")
 class VideoProcessRequest(BaseModel):
     video_url: str
     query: str
     ollama_model_name: str = "hf.co/DevQuasar/naver-hyperclovax.HyperCLOVAX-SEED-Text-Instruct-0.5B-GGUF:F16"
 # ✅ 유튜브 영상 처리 API
 @app.post("/api/process_youtube_video")
 async def process_youtube_video(request: VideoProcessRequest):
     try:
+        # 1. 유튜브 영상에서 자막/콘텐츠 추출
         processed_chunks_with_timestamps = await process_youtube_video_data(request.video_url)
         if not processed_chunks_with_timestamps:
             return {"message": "자막 또는 내용을 추출할 수 없습니다.", "results": []}
+        # 2. RAG 프로세스 실행 (검색 + 생성)
+        rag_result = await perform_rag_and_generate(
             query=request.query,
+            chunks_with_timestamps=processed_chunks_with_timestamps,
+            ollama_model_name=request.ollama_model_name,
             top_k=50
         )
+        # 3. 최종 결과 반환
         return {
+            **rag_result, # rag_core에서 반환된 결과 딕셔너리를 그대로 사용
             "video_url": request.video_url,
             "query": request.query,
             "ollama_model_used": request.ollama_model_name,
         }
     except Exception as e:
         print(f"ERROR: 서버 처리 중 오류 발생: {str(e)}")
+        # 실제 Exception의 세부 정보를 로깅하는 것이 좋음
+        raise HTTPException(status_code=500, detail=f"서버 처리 중 오류가 발생했습니다: {e}")
 # ✅ 정적 파일은 마지막에 mount
 app.mount("/", StaticFiles(directory=static_files_dir, html=True), name="static")
+# 서버 실행을 위한 메인 진입점
 if __name__ == "__main__":
     import uvicorn
+    port = int(os.environ.get("PORT", 7860))
+    uvicorn.run(app, host="0.0.0.0", port=port)

backend/app/rag_core.py CHANGED Viewed

@@ -1,19 +1,21 @@
 # ./backend/app/rag_core.py
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
 from typing import List, Dict, Tuple
 # 전역 변수로 모델 로드 (앱 시작 시 한 번만 로드되도록)
-# 네이버 클로바의 한국어 SentenceBERT 모델을 로드합니다.
 try:
-    # 네이버 클로바 HyperCLOVAX-SEED-Text-Instruct-0.5B 모델 로드
     model = SentenceTransformer('jhgan/ko-sroberta-multitask', device='cpu')
     print("INFO: 임베딩 모델 'jhgan/ko-sroberta-multitask' 로드 완료.")
 except Exception as e:
     print(f"ERROR: 임베딩 모델 'jhgan/ko-sroberta-multitask' 로드 실패: {e}. 다국어 모델로 시도합니다.")
-    # 대체 모델 로직 (필요하다면 유지하거나 제거할 수 있습니다)
     try:
         model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device='cpu')
         print("INFO: 임베딩 모델 'sentence-transformers/paraphrase-multilingual-L12-v2' 로드 완료.")
@@ -21,79 +23,131 @@ except Exception as e:
         print(f"ERROR: 대체 임베딩 모델 로드 실패: {e}. RAG 기능을 사용할 수 없습니다.")
         raise
-async def perform_rag_query(chunks_with_timestamps: List[Dict], query: str, top_k: int = 5) -> List[Dict]:
     """
-    제공된 텍스트 청크들과 쿼리를 사용하여 RAG(Retrieval-Augmented Generation) 검색을 수행합니다.
-    현재는 임베딩 기반 유사도 검색만 수행하며, LLM 호출은 추후 추가됩니다.
-    Args:
-        chunks_with_timestamps: [{"text": "...", "timestamp": "...", "start_seconds": ...}] 형태의 리스트.
-        query: 사용자 쿼리 문자열.
-        top_k: 쿼리와 가장 유사한 상위 N개의 청크를 반환.
-    Returns:
-        쿼리와 가장 관련성 높은 상위 N개의 청크 (Dict) 리스트.
     """
     if not chunks_with_timestamps:
         print("WARNING: RAG 검색을 위한 텍스트 청크가 없습니다.")
         return []
-    # 1. 텍스트 임베딩 생성
-    # 모든 청크의 텍스트만 추출
     texts = [chunk["text"] for chunk in chunks_with_timestamps]
     print(f"INFO: 총 {len(texts)}개의 텍스트 청크 임베딩 시작.")
-    # 모델의 encode 메서드는 비동기가 아니므로, 직접 호출.
-    # 만약 시간이 오래 걸린다면 FastAPI의 `run_in_threadpool` 등을 고려.
     try:
         chunk_embeddings = model.encode(texts, convert_to_numpy=True)
-        print("INFO: 텍스트 청크 임베딩 완료.")
     except Exception as e:
         print(f"ERROR: 텍스트 청크 임베딩 중 오류 발생: {e}")
         return []
-    # 2. FAISS 인덱스 생성 및 청크 임베딩 추가
-    dimension = chunk_embeddings.shape[1] # 임베딩 벡터의 차원
-    index = faiss.IndexFlatL2(dimension) # L2 유클리드 거리를 사용하는 간단한 인덱스
     index.add(chunk_embeddings)
-    print("INFO: FAISS 인덱스 생성 및 임베딩 추가 완료.")
-    # 3. 쿼리 임베딩
     query_embedding = model.encode([query], convert_to_numpy=True)
-    print("INFO: 쿼리 임베딩 완료.")
-    # 4. 유사도 검색 (FAISS)
-    # D: 거리 (Distance), I: 인덱스 (Index)
-    distances, indices = index.search(query_embedding, top_k)
-    print(f"INFO: FAISS 유사도 검색 완료. 상위 {top_k}개 결과.")
     retrieved_chunks = []
-    # ✨✨✨ 유사도 임계값 설정 및 필터링 추가 ✨✨✨
-    # 이 값은 실험을 통해 최적의 값을 찾아야 합니다.
-    # 거리가 낮을수록 유사하므로, 이 값보다 '거리(score)'가 낮아야만 결과를 포함합니다.
-    # 예를 들어, 0.5는 '거리가 0.5 미만인 경우에만 결과에 포함하라'는 의미입니다.
-    # 거리가 0이면 완벽히 일치합니다.
-    MIN_DISTANCE_THRESHOLD = 150 # 예시 값: 이 값보다 거리가 작아야 합니다 (더 유사해야 함)
     for i in range(len(indices[0])):
         idx = indices[0][i]
         original_chunk = chunks_with_timestamps[idx]
-        score = float(distances[0][i]) # FAISS에서 반환된 거리 값 (낮을수록 유사)
-        # 설정된 임계값보다 거리가 작을 때만 (즉, 유사도가 높을 때만) 결과에 포함
-        if score < MIN_DISTANCE_THRESHOLD:
             retrieved_chunks.append({
                 "text": original_chunk["text"],
                 "timestamp": original_chunk["timestamp"],
-                "score": score
             })
         else:
-            # 디버깅용: 임계값 때문에 제외된 청크를 로그로 확인
-            print(f"DEBUG: 유사도 임계값({MIN_DISTANCE_THRESHOLD:.4f}) 초과로 제외된 청크 (거리: {score:.4f}): {original_chunk['text'][:50]}...")
-    # 거리가 작은 순서(유사도가 높은 순서)로 정렬하여 반환
-    retrieved_chunks.sort(key=lambda x: x['timestamp'])
     print(f"DEBUG: 최종 검색된 청크 수: {len(retrieved_chunks)}")
-    return retrieved_chunks

 # ./backend/app/rag_core.py
+import os
+import httpx
+from fastapi import HTTPException
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
 from typing import List, Dict, Tuple
+# OLLAMA_API_BASE_URL 환경 변수 설정
+OLLAMA_API_BASE_URL = os.getenv("OLLAMA_API_BASE_URL", "http://127.0.0.1:11434")
 # 전역 변수로 모델 로드 (앱 시작 시 한 번만 로드되도록)
 try:
     model = SentenceTransformer('jhgan/ko-sroberta-multitask', device='cpu')
     print("INFO: 임베딩 모델 'jhgan/ko-sroberta-multitask' 로드 완료.")
 except Exception as e:
     print(f"ERROR: 임베딩 모델 'jhgan/ko-sroberta-multitask' 로드 실패: {e}. 다국어 모델로 시도합니다.")
     try:
         model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device='cpu')
         print("INFO: 임베딩 모델 'sentence-transformers/paraphrase-multilingual-L12-v2' 로드 완료.")
         print(f"ERROR: 대체 임베딩 모델 로드 실패: {e}. RAG 기능을 사용할 수 없습니다.")
         raise
+async def generate_answer_with_ollama(model_name: str, prompt: str) -> str:
+    """
+    Ollama 서버에 질의하여 답변을 생성합니다.
     """
+    url = f"{OLLAMA_API_BASE_URL}/api/generate"
+    headers = {"Content-Type": "application/json"}
+    data = {
+        "model": model_name,
+        "prompt": prompt,
+        "stream": False
+    }
+    print(f"INFO: Ollama API 호출 시작. 모델: {model_name}")
+    print(f"INFO: 프롬프트 미리보기: {prompt[:200]}...")
+    try:
+        async with httpx.AsyncClient(timeout=600.0) as client:
+            response = await client.post(url, headers=headers, json=data)
+            response.raise_for_status()
+            response_data = response.json()
+            full_response = response_data.get("response", "").strip()
+            return full_response
+    except httpx.HTTPStatusError as e:
+        print(f"ERROR: Ollama API 호출 실패: {e}")
+        raise HTTPException(status_code=500, detail="Ollama API 호출 실패")
+    except httpx.RequestError as e:
+        print(f"ERROR: 네트워크 오류: {e}")
+        raise HTTPException(status_code=500, detail="네트워크 오류가 발생했습니다. 잠시 후 다시 시도해주세요.")
+    except Exception as e:
+        print(f"ERROR: 알 수 없는 오류: {e}")
+        raise HTTPException(status_code=500, detail="알 수 없는 오류가 발생했습니다. 잠시 후 다시 시도해주세요.")
+async def perform_retrieval(chunks_with_timestamps: List[Dict], query: str, top_k: int = 5) -> List[Dict]:
+    """
+    제공된 텍스트 청크에서 쿼리와 가장 유사한 부분을 검색합니다. (Retrieval-only)
     """
     if not chunks_with_timestamps:
         print("WARNING: RAG 검색을 위한 텍스트 청크가 없습니다.")
         return []
     texts = [chunk["text"] for chunk in chunks_with_timestamps]
     print(f"INFO: 총 {len(texts)}개의 텍스트 청크 임베딩 시작.")
     try:
         chunk_embeddings = model.encode(texts, convert_to_numpy=True)
     except Exception as e:
         print(f"ERROR: 텍스트 청크 임베딩 중 오류 발생: {e}")
         return []
+    dimension = chunk_embeddings.shape[1]
+    index = faiss.IndexFlatIP(dimension)
+    faiss.normalize_L2(chunk_embeddings)
     index.add(chunk_embeddings)
     query_embedding = model.encode([query], convert_to_numpy=True)
+    faiss.normalize_L2(query_embedding)
+    similarities, indices = index.search(query_embedding, top_k)
     retrieved_chunks = []
+    MIN_SIMILARITY_THRESHOLD = 0.35 # 임계값
     for i in range(len(indices[0])):
         idx = indices[0][i]
         original_chunk = chunks_with_timestamps[idx]
+        score = float(similarities[0][i])
+        if score > MIN_SIMILARITY_THRESHOLD:
             retrieved_chunks.append({
                 "text": original_chunk["text"],
                 "timestamp": original_chunk["timestamp"],
+                "score": score,
+                "start_seconds": original_chunk["start_seconds"]
             })
         else:
+            print(f"DEBUG: 유사도 임계값({MIN_SIMILARITY_THRESHOLD:.4f}) 미만으로 제외된 청크 (유사도: {score:.4f}): {original_chunk['text'][:50]}...")
+    retrieved_chunks.sort(key=lambda x: x['start_seconds'])
     print(f"DEBUG: 최종 검색된 청크 수: {len(retrieved_chunks)}")
+    return retrieved_chunks
+async def perform_rag_and_generate(query: str, chunks_with_timestamps: List[Dict], ollama_model_name: str, top_k: int = 50) -> Dict:
+    """
+    RAG의 전체 프로세스(검색, 프롬프트 구성, 생성)를 수행합니다.
+    """
+    # 1. RAG 검색 수행
+    retrieved_chunks = await perform_retrieval(
+        chunks_with_timestamps=chunks_with_timestamps,
+        query=query,
+        top_k=top_k
+    )
+    if not retrieved_chunks:
+        return {
+            "status": "error",
+            "message": "검색 결과가 없습니다.",
+            "retrieved_chunks": [],
+            "generated_answer": "관련 정보를 찾지 못해 답변을 생성할 수 없습니다."
+        }
+    # 2. 검색 결과를 프롬프트에 추가
+    context = "\n\n".join([chunk["text"] for chunk in retrieved_chunks])
+    prompt = f"""당신은 유튜브 영상 내용을 완벽하게 이해하고 사용자의 질문에 답변하는 AI 어시스턴트입니다.
+아래는 분석한 유튜브 영상의 자막 내용입니다. 이 정보를 바탕으로 사용자의 질문에 대해 상세하고 친절하게 답변하세요.
+답변은 반드시 영상 내용에 근거해야 하며, 내용과 관련 없는 질문에는 '영상 내용과 관련이 없어 답변할 수 없습니다'라고 솔직하게 말해야 합니다.
+--- 유튜브 영상 자막 내용 ---
+{context}
+--------------------------
+사용자 질문: {query}
+답변:"""
+    # 3. Ollama 모델에 질의하여 답변 생성
+    generated_answer = await generate_answer_with_ollama(
+        model_name=ollama_model_name,
+        prompt=prompt
+    )
+    return {
+        "status": "success",
+        "message": "성공적으로 영상을 처리하고 RAG 검색을 수행했습니다.",
+        "retrieved_chunks": retrieved_chunks,
+        "generated_answer": generated_answer
+    }

backend/app/youtube_parser.py CHANGED Viewed

@@ -37,120 +37,79 @@ def validate_youtube_url(url):
 async def get_youtube_video_id(url: str) -> str | None:
     """
     유튜브 URL에서 비디오 ID를 추출합니다.
-    표준 유튜브 URL (youtube.com/watch?v=..., youtu.be/...)을 처리합니다.
     """
     parsed_url = urlparse(url)
-    # 표준 YouTube Watch 페이지 도메인 확인
-    # www.youtube.com, m.youtube.com, youtube.com 등을 포함합니다.
-    # 'www.youtube.com', 'm.youtube.com', 'youtube.com'은 실제 YouTube 도메인을 의미합니다.
     if parsed_url.hostname and any(domain in parsed_url.hostname for domain in ['www.youtube.com', 'm.youtube.com', 'youtube.com']):
         query_params = parse_qs(parsed_url.query)
         if 'v' in query_params:
             return query_params['v'][0]
-    # 짧은 YouTube URL (youtu.be/VIDEO_ID)
     elif parsed_url.hostname == 'youtu.be':
-        # path가 /VIDEO_ID 형태이므로 맨 앞의 '/'를 제거
         video_id = parsed_url.path.strip('/')
-        # 유튜브 비디오 ID는 보통 11자리이므로, 유효성 검사
         if len(video_id) == 11:
             return video_id
     logger.warning(f"알 수 없는 형식의 YouTube URL: {url}")
     return None
 def clean_caption_text(text: str) -> str:
-    # <00:00:00.000><c>...</c> 또는 </c> 같은 태그 제거
     cleaned_text = re.sub(r'<[^>]+>', '', text)
-    # 여러 공백을 하나의 공백으로 줄이고 양쪽 끝 공백 제거
     cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
     return cleaned_text
 async def get_transcript_with_timestamps(video_id: str) -> list[dict] | None:
     logger.info(f"비디오 ID '{video_id}'에 대한 자막 가져오기 시도.")
     processed_chunks = []
-    # yt-dlp 옵션 설정
     ydl_opts = {
-        'writesubtitles': True,          # 사용자가 업로드한 수동 자막 파일 쓰기 활성화
-        'writeautomaticsub': True,       # YouTube에서 자동으로 생성된 자막 파일 쓰기 활성화
-        'subtitleslangs': ['ko', 'en'],  # 다운로드할 자막 언어 목록 (한국어 우선, 없으면 영어)
-        'skip_download': True,           # 동영상 자체는 다운로드하지 않고 자막만 다운로드
-        'outtmpl': '%(id)s.%(language)s.%(ext)s', # 다운로드될 파일 이름 템플릿 (temp_dir 안에서 상대 경로로 저장됨)
-        'quiet': False,                  # 콘솔 출력 활성화 (디버깅용)
-        'no_warnings': False,            # 경고 메시지 활성화 (디버깅용)
-        'extractor_args': {              # 특정 extractor (예: 유튜브)에 대한 추가 인자
-            'youtube': {'skip': ['dash']} # dash manifest 관련 오류 회피 시도 (유튜브 관련)
         }
     }
     logger.info("yt-dlp에 프록시가 적용되지 않았습니다.")
-    temp_dir = "./temp_captions"
     os.makedirs(temp_dir, exist_ok=True)
     original_cwd = os.getcwd()
     try:
-        with contextlib.chdir(temp_dir): # 임시 디렉토리로 작업 디렉토리 변경
-            # outtmpl은 현재 chdir된 디렉토리 내의 상대 경로로 지정
-            # yt-dlp가 파일을 temp_dir 안에 바로 생성하도록 함
-            ydl_opts['outtmpl'] = '%(id)s.%(ext)s'
-            logger.debug(f"yt-dlp 실행 전 현재 작업 디렉토리: {os.getcwd()}")
-            logger.debug(f"yt-dlp 옵션: {ydl_opts}")
             with YoutubeDL(ydl_opts) as ydl:
-                # download=True 설정으로 자막 다운로드 시도
-                # 비디오 ID만 전달해도 yt-dlp가 알아서 처리합니다.
                 info_dict = await asyncio.to_thread(ydl.extract_info, video_id, download=True)
-            logger.debug(f"yt-dlp extract_info 결과 (자세한 정보는 debug_yt_dlp.log 파일 확인): {json.dumps(info_dict.get('requested_subtitles', 'No subtitles requested'), indent=2, ensure_ascii=False)}")
             caption_file_path = None
-            # 1. info_dict에서 직접 자막 파일 경로를 찾으려는 시도 (가장 정확)
-            # yt-dlp 0.0.12 버전 이상에서는 _download_lock이 반환됨. info_dict에서 직접 파일을 찾아야 함
             if 'requested_subtitles' in info_dict and info_dict['requested_subtitles']:
                 for lang_code in ydl_opts['subtitleslangs']:
                     if lang_code in info_dict['requested_subtitles']:
                         sub_info = info_dict['requested_subtitles'][lang_code]
-                        # 'filepath' 키가 없거나 None일 수 있으므로 확인
                         if 'filepath' in sub_info and sub_info['filepath']:
-                            # filepath는 이미 현재 작업 디렉토리(temp_dir) 기준으로 되어 있을 것
                             caption_file_path = sub_info['filepath']
                             logger.info(f"yt-dlp가 '{lang_code}' 자막 파일을 info_dict에서 찾았습니다: {caption_file_path}")
-                            break # 찾았으면 루프 종료
-            # 2. info_dict에서 찾지 못했을 경우, 폴백으로 임시 디렉토리를 탐색
             if not caption_file_path:
-                logger.debug(f"info_dict에서 자막 파일 경로를 찾지 못했습니다. 임시 디렉토리 스캔 시작.")
-                downloaded_files = [f for f in os.listdir('.') if f.startswith(video_id) and ('sub' in f or 'vtt' in f or 'json' in f or 'srt' in f)]
-                logger.debug(f"임시 디렉토리의 파일 목록: {downloaded_files}")
-                # 한국어 자막 우선 검색 (vtt, srt, json 순)
                 for ext in ['vtt', 'srt', 'json']:
                     ko_file = next((f for f in downloaded_files if f.endswith(f'.ko.{ext}')), None)
                     if ko_file:
-                        caption_file_path = os.path.join(os.getcwd(), ko_file) # 현재 작업 디렉토리 기준으로 경로 조합
-                        logger.info(f"폴백: yt-dlp로 한국어 {ext.upper()} 자막 파일 '{ko_file}'을 다운로드했습니다.")
                         break
                 if not caption_file_path:
-                    # 한국어 없으면 첫 번째 사용 가능한 자막 찾기
                     for ext in ['vtt', 'srt', 'json']:
                         any_file = next((f for f in downloaded_files if f.endswith(f'.{ext}')), None)
                         if any_file:
-                            caption_file_path = os.path.join(os.getcwd(), any_file) # 현재 작업 디렉토리 기준으로 경로 조합
-                            logger.warning(f"폴백: 한국어 자막이 없어 첫 번째 {ext.upper()} 자막 파일 '{any_file}'을 사용합니다.")
                             break
-            # 3. 자막 파일이 찾아졌으면 파싱 시작
             if caption_file_path and os.path.exists(caption_file_path):
                 if caption_file_path.endswith('.vtt'):
                     with open(caption_file_path, 'r', encoding='utf-8') as f:
                         vtt_content = f.read()
-                    # WEBVTT 파싱
                     segments = vtt_content.split('\n\n')
                     for segment in segments:
                         if '-->' in segment:
@@ -158,41 +117,33 @@ async def get_transcript_with_timestamps(video_id: str) -> list[dict] | None:
                             time_str = lines[0].strip()
                             text_content = ' '.join(lines[1:]).strip()
                             text_content = clean_caption_text(text_content)
                             try:
-                                # VTT 시간은 HH:MM:SS.ms 형태로 제공되므로, 밀리초를 float로 처리 후 정수로 변환
                                 start_time_parts = time_str.split(' --> ')[0].split(':')
-                                if len(start_time_parts) == 3: # HH:MM:SS.ms
                                     hours = int(start_time_parts[0])
                                     minutes = int(start_time_parts[1])
                                     seconds = int(float(start_time_parts[2].split('.')[0]))
-                                elif len(start_time_parts) == 2: # MM:SS.ms
                                     hours = 0
                                     minutes = int(start_time_parts[0])
                                     seconds = int(float(start_time_parts[1].split('.')[0]))
-                                else:
-                                    raise ValueError("Unsupported time format")
-                                # HH:MM:SS 포맷으로 맞춤
-                                if hours > 0:
-                                    timestamp_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
-                                else:
-                                    timestamp_str = f"{minutes:02d}:{seconds:02d}"
                                 processed_chunks.append({
                                     "text": text_content,
-                                    "timestamp": timestamp_str
                                 })
                             except Exception as e:
                                 logger.warning(f"VTT 시간 파싱 오류: {time_str} - {e}")
-                    logger.info(f"yt-dlp로 VTT 자막 {len(processed_chunks)}개 청크 처리 완료.")
                 elif caption_file_path.endswith('.json'):
-                    # JSON 자막 파싱 (yt-dlp가 가끔 JSON 포맷으로도 다운로드함)
                     with open(caption_file_path, 'r', encoding='utf-8') as f:
                         json_content = json.load(f)
-                    # yt-dlp의 JSON 자막 형식에 맞춰 파싱 (예시, 실제 구조는 info_dict를 통해 확인 필요)
                     for entry in json_content:
                         if 'start' in entry and 'text' in entry:
                             total_seconds = int(entry['start'])
@@ -202,52 +153,43 @@ async def get_transcript_with_timestamps(video_id: str) -> list[dict] | None:
                             text = entry['text']
                             text = clean_caption_text(text)
-                            if hours > 0:
-                                timestamp_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
-                            else:
-                                timestamp_str = f"{minutes:02d}:{seconds:02d}"
                             processed_chunks.append({
                                 "text": text,
-                                "timestamp": timestamp_str
                             })
-                    logger.info(f"yt-dlp로 JSON 자막 {len(processed_chunks)}개 청크 처리 완료.")
                 elif caption_file_path.endswith('.srt'):
-                    # SRT 자막 파싱 (간단한 예시, 실제로는 정규식 등으로 파싱)
                     with open(caption_file_path, 'r', encoding='utf-8') as f:
                         srt_content = f.read()
-                    # SRT 파싱 로직 (매우 간단한 예시, 실제론 srt 라이브러리 사용 권장)
                     blocks = srt_content.strip().split('\n\n')
                     for block in blocks:
                         lines = block.split('\n')
-                        # 최소한 순번, 시간, 텍스트가 있어야 함
                         if len(lines) >= 3 and '-->' in lines[1]:
                             time_str = lines[1].strip()
                             text_content = ' '.join(lines[2:]).strip()
                             text_content = clean_caption_text(text_content)
                             try:
-                                # SRT 시간은 HH:MM:SS,ms 형태로 제공
                                 start_time_parts = time_str.split(' --> ')[0].split(':')
-                                seconds_ms = float(start_time_parts[-1].replace(',', '.')) # 밀리초 처리
                                 seconds = int(seconds_ms)
                                 minutes = int(start_time_parts[-2])
                                 hours = int(start_time_parts[0]) if len(start_time_parts) == 3 else 0
-                                if hours > 0:
-                                    timestamp_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
-                                else:
-                                    timestamp_str = f"{minutes:02d}:{seconds:02d}"
                                 processed_chunks.append({
                                     "text": text_content,
-                                    "timestamp": timestamp_str
                                 })
                             except Exception as e:
                                 logger.warning(f"SRT 시간 파싱 오류: {time_str} - {e}")
-                    logger.info(f"yt-dlp로 SRT 자막 {len(processed_chunks)}개 청크 처리 완료.")
                 else:
                     logger.warning(f"지원하지 않는 자막 파일 형식입니다: {caption_file_path}")
@@ -255,83 +197,82 @@ async def get_transcript_with_timestamps(video_id: str) -> list[dict] | None:
                 logger.warning(f"비디오 ID '{video_id}'에 대한 yt-dlp 자막 파일을 찾을 수 없습니다. 최종 시도 경로: {caption_file_path}")
     except Exception as e:
-        logger.error(f"yt-dlp 자막 추출 중 예기치 않은 오류 발생 for video ID '{video_id}': {type(e).__name__}: {e}")
         return []
     finally:
         if os.path.exists(temp_dir):
-            for file_name in os.listdir(temp_dir):
-                file_path = os.path.join(temp_dir, file_name)
-                try:
-                    if os.path.isfile(file_path):
-                        os.remove(file_path)
-                except Exception as e:
-                    logger.error(f"임시 파일 삭제 실패 {file_path}: {e}")
-            os.rmdir(temp_dir)
             logger.info(f"임시 자막 디렉토리 '{temp_dir}' 정리 완료.")
-        os.chdir(original_cwd) # 원래 작업 디렉토리로 돌아옴
     return processed_chunks
-def parse_srt_content(srt_content: str) -> list[dict]:
-    chunks = []
-    # 간단한 SRT 파싱 로직 (yt-dlp의 SRT 출력은 더 간단할 수 있음)
-    # 실제 프로덕션에서는 더 견고한 SRT 파서 라이브러리를 사용하는 것이 좋습니다.
-    import re
-    # SRT 패턴: 1\n00:00:01,000 --> 00:00:03,000\nHello World\n\n
-    blocks = re.split(r'\n\s*\n', srt_content.strip())
-    for block in blocks:
-        lines = block.split('\n')
-        if len(lines) >= 3:
-            # 첫 번째 라인은 순번, 두 번째 라인은 시간, 나머지는 텍스트
-            time_str = lines[1]
-            text = " ".join(lines[2:]).strip()
-            # 시간 형식: 00:00:01,000 --> 00:00:03,000
-            time_parts = time_str.split(' --> ')
-            if len(time_parts) == 2:
-                start_time = time_parts[0].replace(',', '.') # yt-dlp의 VTT 파서와 일관성을 위해 쉼표를 점으로 변경
-                chunks.append({"text": text, "timestamp": start_time})
-    return chunks
-def parse_vtt_content(vtt_content: str) -> list[dict]:
-    chunks = []
-    lines = vtt_content.split('\n')
-    i = 0
-    while i < len(lines):
-        line = lines[i].strip()
-        if '-->' in line:
-            # 시간 정보 라인
-            time_str = line.split(' ')[0] # 예: 00:00:01.000
-            # 다음 라인부터 텍스트 시작
-            text_lines = []
-            j = i + 1
-            while j < len(lines) and lines[j].strip() != '':
-                text_lines.append(lines[j].strip())
-                j += 1
-            text = ' '.join(text_lines)
-            if text:
-                chunks.append({"text": text, "timestamp": time_str})
-            i = j # 다음 자막 블록으로 이동
-        i += 1
-    return chunks
-def parse_json_content(json_content: dict) -> list[dict]:
-    chunks = []
-    for entry in json_content.get('events', []):
-        if 'segs' in entry:
-            text = "".join([seg.get('utf8', '') for seg in entry['segs']])
-            # JSON3 형식은 밀리초까지 표현된 시작 시간이 't' 키에 있을 수 있음
-            # yt-dlp가 생성하는 json3 파일 구조에 따라 유연하게 처리 필요
-            start_ms = entry.get('t', 0)
-            # 밀리초를 HH:MM:SS.mmm 형식으로 변환 (yt-dlp의 VTT timestamp와 유사하게)
-            total_seconds = start_ms / 1000
-            hours = int(total_seconds // 3600)
-            minutes = int((total_seconds % 3600) // 60)
-            seconds = total_seconds % 60
-            timestamp = f"{hours:02d}:{minutes:02d}:{seconds:06.3f}"
-            chunks.append({"text": text, "timestamp": timestamp})
-    return chunks
 async def process_youtube_video_data(video_url: str) -> list[dict] | None:
     video_id = await get_youtube_video_id(video_url)
@@ -339,4 +280,13 @@ async def process_youtube_video_data(video_url: str) -> list[dict] | None:
         logger.error(f"유효하지 않은 YouTube URL: {video_url}")
         return None
-    return await get_transcript_with_timestamps(video_id)

 async def get_youtube_video_id(url: str) -> str | None:
     """Extract the video ID from a YouTube URL.
     Recognised forms:
       * ``watch?v=<id>`` on ``youtube.com`` or any of its subdomains
       * ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>`` on
         the same hosts (the ID must be 11 characters long)
       * ``youtu.be/<id>`` (the ID must be 11 characters long)
     Args:
         url: The URL to inspect.
     Returns:
         The video ID, or ``None`` (after logging a warning) when the URL is
         not a recognised YouTube URL.
     """
     parsed_url = urlparse(url)
     hostname = parsed_url.hostname or ''
     # Compare the host exactly / by dot-delimited suffix: a plain substring
     # test would also accept look-alike hosts such as "evilyoutube.com" or
     # "youtube.com.example.org".
     if hostname == 'youtube.com' or hostname.endswith('.youtube.com'):
         query_params = parse_qs(parsed_url.query)
         if 'v' in query_params:
             return query_params['v'][0]
         path_parts = [part for part in parsed_url.path.split('/') if part]
         if (
             len(path_parts) >= 2
             and path_parts[0] in ('shorts', 'embed', 'live', 'v')
             and len(path_parts[1]) == 11
         ):
             return path_parts[1]
     elif hostname == 'youtu.be':
         video_id = parsed_url.path.strip('/')
         if len(video_id) == 11:
             return video_id
     logger.warning(f"알 수 없는 형식의 YouTube URL: {url}")
     return None
 def clean_caption_text(text: str) -> str:
     """Return *text* with markup tags removed and whitespace normalised.
     Anything of the form ``<...>`` is deleted, every run of whitespace is
     collapsed to a single space, and leading/trailing whitespace is trimmed.
     """
     without_tags = re.sub(r'<[^>]+>', '', text)
     single_spaced = re.sub(r'\s+', ' ', without_tags)
     return single_spaced.strip()
 async def get_transcript_with_timestamps(video_id: str) -> list[dict] | None:
     """Download and parse subtitles for ``video_id`` using yt-dlp.
     Subtitles (manual and automatic, languages ``ko`` then ``en``) are
     requested with the media download skipped, and written while the working
     directory is ``./temp_captions_<video_id>``. One caption file is chosen
     and parsed by extension (``.vtt``, ``.json`` or ``.srt``) into chunk
     dicts with ``text``, ``timestamp`` and ``start_seconds`` keys. Start
     times are truncated to whole seconds. The temp directory is removed in
     the ``finally`` clause.
     Returns:
         The parsed chunks (possibly empty), or ``[]`` if any exception is
         raised during extraction or parsing.
     NOTE(review): the annotation allows ``None`` but no visible path returns
     it - confirm whether callers rely on ``None``.
     """
     logger.info(f"비디오 ID '{video_id}'에 대한 자막 가져오기 시도.")
     processed_chunks = []
     ydl_opts = {
+        'writesubtitles': True,
+        'writeautomaticsub': True,
+        'subtitleslangs': ['ko', 'en'],
+        'skip_download': True,
+        'outtmpl': '%(id)s.%(language)s.%(ext)s',
+        'quiet': False,
+        'no_warnings': False,
+        'extractor_args': {
+            'youtube': {'skip': ['dash']}
         }
     }
     logger.info("yt-dlp에 프록시가 적용되지 않았습니다.")
+    temp_dir = f"./temp_captions_{video_id}" # create a dedicated directory for each request
     os.makedirs(temp_dir, exist_ok=True)
     original_cwd = os.getcwd()
     try:
         # contextlib.chdir switches the working directory of the whole process
         # for the duration of the block.
         # NOTE(review): concurrent requests would share that cwd - confirm
         # this function is never run concurrently.
+        with contextlib.chdir(temp_dir):
             # Replaces the 'outtmpl' value given in the options dict above.
+            ydl_opts['outtmpl'] = '%(id)s.%(ext)s'
             with YoutubeDL(ydl_opts) as ydl:
                 # extract_info is executed in a worker thread via asyncio.to_thread.
                 info_dict = await asyncio.to_thread(ydl.extract_info, video_id, download=True)
             caption_file_path = None
             # First choice: the path yt-dlp reports in 'requested_subtitles',
             # trying languages in 'subtitleslangs' order.
             if 'requested_subtitles' in info_dict and info_dict['requested_subtitles']:
                 for lang_code in ydl_opts['subtitleslangs']:
                     if lang_code in info_dict['requested_subtitles']:
                         sub_info = info_dict['requested_subtitles'][lang_code]
                         if 'filepath' in sub_info and sub_info['filepath']:
                             caption_file_path = sub_info['filepath']
                             logger.info(f"yt-dlp가 '{lang_code}' 자막 파일을 info_dict에서 찾았습니다: {caption_file_path}")
+                            break
             # Fallback: scan the current directory for files that start with
             # the video ID, preferring '.ko.<ext>' and then any '.<ext>'.
             if not caption_file_path:
+                downloaded_files = [f for f in os.listdir('.') if f.startswith(video_id) and any(ext in f for ext in ['vtt', 'srt', 'json'])]
                 for ext in ['vtt', 'srt', 'json']:
                     ko_file = next((f for f in downloaded_files if f.endswith(f'.ko.{ext}')), None)
                     if ko_file:
+                        caption_file_path = os.path.join(os.getcwd(), ko_file)
                         break
                 if not caption_file_path:
                     for ext in ['vtt', 'srt', 'json']:
                         any_file = next((f for f in downloaded_files if f.endswith(f'.{ext}')), None)
                         if any_file:
+                            caption_file_path = os.path.join(os.getcwd(), any_file)
                             break
             if caption_file_path and os.path.exists(caption_file_path):
                 if caption_file_path.endswith('.vtt'):
                     with open(caption_file_path, 'r', encoding='utf-8') as f:
                         vtt_content = f.read()
                     # Cues are separated by blank lines.
                     segments = vtt_content.split('\n\n')
                     for segment in segments:
                         if '-->' in segment:
                             # NOTE(review): `lines` is not assigned in this branch in
                             # the visible text; presumably a
                             # `lines = segment.split('\n')` line is missing - confirm.
                             time_str = lines[0].strip()
                             text_content = ' '.join(lines[1:]).strip()
                             text_content = clean_caption_text(text_content)
+                            if not text_content: continue
                             try:
                                 start_time_parts = time_str.split(' --> ')[0].split(':')
+                                if len(start_time_parts) == 3:
                                     hours = int(start_time_parts[0])
                                     minutes = int(start_time_parts[1])
                                     seconds = int(float(start_time_parts[2].split('.')[0]))
+                                else: # MM:SS.ms
                                     hours = 0
                                     minutes = int(start_time_parts[0])
                                     seconds = int(float(start_time_parts[1].split('.')[0]))
+                                total_seconds = hours * 3600 + minutes * 60 + seconds
                                 # The hour field is shown only when it is non-zero.
+                                timestamp_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}" if hours > 0 else f"{minutes:02d}:{seconds:02d}"
                                 processed_chunks.append({
                                     "text": text_content,
+                                    "timestamp": timestamp_str,
+                                    "start_seconds": total_seconds # Added
                                 })
                             except Exception as e:
                                 logger.warning(f"VTT 시간 파싱 오류: {time_str} - {e}")
+                    logger.info(f"VTT 자막 {len(processed_chunks)}개 청크 처리 완료.")
                 elif caption_file_path.endswith('.json'):
                     with open(caption_file_path, 'r', encoding='utf-8') as f:
                         json_content = json.load(f)
                     # Iterates the loaded JSON directly and reads 'start' / 'text'
                     # from each entry.
                     # NOTE(review): presumably a list of dicts - confirm against
                     # the file yt-dlp actually writes.
                     for entry in json_content:
                         if 'start' in entry and 'text' in entry:
                             total_seconds = int(entry['start'])
                             text = entry['text']
                             text = clean_caption_text(text)
                             # NOTE(review): `hours`, `minutes` and `seconds` are not
                             # assigned in this branch in the visible text; presumably
                             # they should be derived from `total_seconds` - confirm.
+                            timestamp_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}" if hours > 0 else f"{minutes:02d}:{seconds:02d}"
                             processed_chunks.append({
                                 "text": text,
+                                "timestamp": timestamp_str,
+                                "start_seconds": total_seconds # Added
                             })
+                    logger.info(f"JSON 자막 {len(processed_chunks)}개 청크 처리 완료.")
                 elif caption_file_path.endswith('.srt'):
                     with open(caption_file_path, 'r', encoding='utf-8') as f:
                         srt_content = f.read()
                     blocks = srt_content.strip().split('\n\n')
                     for block in blocks:
                         lines = block.split('\n')
                         # Line 0 is skipped, line 1 must hold the timing, and the
                         # remaining lines are the caption text.
                         if len(lines) >= 3 and '-->' in lines[1]:
                             time_str = lines[1].strip()
                             text_content = ' '.join(lines[2:]).strip()
                             text_content = clean_caption_text(text_content)
                             try:
                                 start_time_parts = time_str.split(' --> ')[0].split(':')
                                 # The comma before the milliseconds is turned into a
                                 # dot so the field parses as a float.
+                                seconds_ms = float(start_time_parts[-1].replace(',', '.'))
                                 seconds = int(seconds_ms)
                                 minutes = int(start_time_parts[-2])
                                 hours = int(start_time_parts[0]) if len(start_time_parts) == 3 else 0
+                                total_seconds = hours * 3600 + minutes * 60 + seconds
+                                timestamp_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}" if hours > 0 else f"{minutes:02d}:{seconds:02d}"
                                 processed_chunks.append({
                                     "text": text_content,
+                                    "timestamp": timestamp_str,
+                                    "start_seconds": total_seconds # Added
                                 })
                             except Exception as e:
                                 logger.warning(f"SRT 시간 파싱 오류: {time_str} - {e}")
+                    logger.info(f"SRT 자막 {len(processed_chunks)}개 청크 처리 완료.")
                 else:
                     logger.warning(f"지원하지 않는 자막 파일 형식입니다: {caption_file_path}")
                 # NOTE(review): as shown, this warning sits inside the
                 # "caption file exists" branch and so is logged even after a
                 # successful parse; presumably it belongs to an `else:` of the
                 # outer `if` that is missing here - confirm.
                 logger.warning(f"비디오 ID '{video_id}'에 대한 yt-dlp 자막 파일을 찾을 수 없습니다. 최종 시도 경로: {caption_file_path}")
     except Exception as e:
         # Any failure is logged and reported to the caller as an empty list.
+        logger.error(f"자막 추출 중 오류: {e}")
         return []
     finally:
         # Restore the saved working directory before deleting the temp
         # directory.
+        os.chdir(original_cwd)
         if os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir) # delete the temp directory together with its contents
             logger.info(f"임시 자막 디렉토리 '{temp_dir}' 정리 완료.")
     return processed_chunks
+def remove_duplicate_captions(chunks: list[dict]) -> list[dict]:
+    """Collapse repeated and progressively growing captions.
+    Each chunk is compared (on stripped text) with the most recently kept
+    chunk:
+      * identical text  -> the chunk is dropped;
+      * text that extends the kept text (e.g. "hello" -> "hello my name")
+        -> the kept chunk is replaced by the longer one;
+      * anything else   -> the chunk is appended.
+    Args:
+        chunks: Caption dicts, each with a ``"text"`` key.
+    Returns:
+        A new list holding the surviving chunk dicts (not copies); ``[]``
+        for empty input.
+    """
+    if not chunks:
+        return []
+    # The first chunk is always kept.
+    first, *rest = chunks
+    kept = [first]
+    for candidate in rest:
+        last_text = kept[-1]["text"].strip()
+        new_text = candidate["text"].strip()
+        if new_text == last_text:
+            # Exact repeat of the previous caption.
+            continue
+        if new_text.startswith(last_text):
+            # Differs from, yet starts with, the previous text, so it is
+            # strictly longer: swap in the more complete caption.
+            kept[-1] = candidate
+        else:
+            kept.append(candidate)
+    logger.info(f"중복 제거 후 최종 청크 수: {len(kept)}")
+    return kept
+def merge_incomplete_sentences(chunks: list[dict], max_gap_seconds: float = 0.5) -> list[dict]:
+    """Join caption fragments that belong to the same unfinished sentence.
+    A fragment is appended (space-separated) to the chunk being built when
+    that chunk does not yet end in ``.``, ``?`` or ``!`` and the fragment
+    starts no more than ``max_gap_seconds`` after the previous fragment. The
+    merged chunk keeps every field of its first fragment (including its
+    timestamp); only ``"text"`` grows. Fragments whose stripped text is
+    empty are ignored.
+    Args:
+        chunks: Caption dicts with a ``"text"`` key and, optionally, a
+            numeric ``"start_seconds"`` key.
+        max_gap_seconds: Largest allowed start-time difference between two
+            consecutive fragments for them to be merged.
+    Returns:
+        A new list of (shallow-copied) chunk dicts; the input dicts are not
+        modified. ``[]`` for empty input.
+    """
+    if not chunks:
+        return []
+    merged_chunks = []
+    current_merged_chunk = None
+    # Start time of the fragment most recently folded into the current
+    # chunk. The gap is measured against this, not against the chunk's first
+    # fragment; otherwise a run of closely spaced fragments would stop
+    # merging as soon as it spans more than `max_gap_seconds` in total.
+    last_start = None
+    for chunk in chunks:
+        text = chunk["text"].strip()
+        if not text:
+            continue
+        start = chunk.get("start_seconds")
+        if current_merged_chunk is not None:
+            # A trailing "..." also ends with ".", so one tuple covers it.
+            sentence_finished = current_merged_chunk["text"].strip().endswith(('.', '?', '!'))
+            # The gap check is skipped when either start time is unknown.
+            gap_known = last_start is not None and start is not None
+            close_enough = not gap_known or start - last_start <= max_gap_seconds
+            if not sentence_finished and close_enough:
+                current_merged_chunk["text"] += " " + text
+                if start is not None:
+                    last_start = start
+                continue
+            # Not mergeable: flush what has been built and start afresh.
+            merged_chunks.append(current_merged_chunk)
+        current_merged_chunk = chunk.copy()
+        last_start = start
+    # Flush the chunk still under construction.
+    if current_merged_chunk is not None:
+        merged_chunks.append(current_merged_chunk)
+    logger.info(f"스마트 병합 후 청크 수: {len(merged_chunks)}")
+    return merged_chunks
 async def process_youtube_video_data(video_url: str) -> list[dict] | None:
     """Turn a YouTube URL into cleaned transcript chunks.
     Resolves the video ID, fetches the timestamped transcript and, when it
     is non-empty, removes duplicate captions and then merges incomplete
     sentences. A falsy transcript result is returned unchanged.
     """
     video_id = await get_youtube_video_id(video_url)
     # NOTE(review): the condition guarding the next two lines is not visible
     # in this view; presumably `if not video_id:` - confirm.
         logger.error(f"유효하지 않은 YouTube URL: {video_url}")
         return None
+    processed_chunks = await get_transcript_with_timestamps(video_id)
+    if processed_chunks:
+        # 1. Remove duplicates
+        deduplicated_chunks = remove_duplicate_captions(processed_chunks)
+        # 2. Merge incomplete sentences
+        final_chunks = merge_incomplete_sentences(deduplicated_chunks)
+        return final_chunks
+    else:
+        return processed_chunks