from flask import Flask, request, jsonify, render_template, send_file
from flask_cors import CORS
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
import yt_dlp
import re
import os
import tempfile
import json
import ssl
import time
import hashlib
import threading
import concurrent.futures
from datetime import datetime
# lru_cache가 사용되지 않으므로 제거
# from functools import lru_cache

# Work around SSL certificate verification problems.
# NOTE(review): this block runs unconditionally at import time, not only in
# development -- confirm that is intended before deploying.
try:
    # Disable certificate verification for default HTTPS contexts. This is a
    # security risk and is only acceptable in a development environment.
    ssl._create_default_https_context = ssl._create_unverified_context
    print("SSL certificate verification disabled for development purposes")

    # Try to point the CA-bundle environment variables at certifi's bundle.
    try:
        import certifi
        os.environ['SSL_CERT_FILE'] = certifi.where()
        os.environ['REQUESTS_CA_BUNDLE'] = certifi.where()
        print(f"Using certifi certificates: {certifi.where()}")
    except ImportError:
        print("certifi not installed, using system certificates")
        # Well-known system CA bundle locations to probe, in order.
        system_cert_paths = [
            "/etc/ssl/certs/ca-certificates.crt",  # Debian/Ubuntu
            "/etc/pki/tls/certs/ca-bundle.crt",    # RHEL/CentOS
            "/etc/ssl/ca-bundle.pem",              # OpenSUSE
            "/etc/ssl/cert.pem",                   # macOS
            "/usr/local/etc/openssl/cert.pem",     # Homebrew OpenSSL
            "/usr/local/share/certs/ca-root-nss.crt",  # FreeBSD
        ]

        # Use the first bundle that exists on this machine.
        for path in system_cert_paths:
            if os.path.exists(path):
                os.environ['SSL_CERT_FILE'] = path
                os.environ['REQUESTS_CA_BUNDLE'] = path
                print(f"Using system certificate: {path}")
                break
except Exception as e:
    # Best effort: a failure here is reported but does not stop start-up.
    print(f"Failed to configure SSL: {e}")

app = Flask(__name__)
CORS(app)  # Enable CORS for all routes

# Path of the YouTube cookies file, stored next to this module.
COOKIES_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'youtube_cookies.txt')

# Create a placeholder cookies file (header line only) if none exists.
# The cookie checks in this file treat a single-line file as "no cookies".
if not os.path.exists(COOKIES_FILE):
    with open(COOKIES_FILE, 'w') as f:
        f.write('# HTTP Cookie File for youtube.com by Netscape Navigator')

# Cache directory, stored next to this module and created on import.
CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cache')
os.makedirs(CACHE_DIR, exist_ok=True)

# Cache settings.
CACHE_ENABLED = True
CACHE_EXPIRY = 60 * 60 * 24 * 30  # 30 days, in seconds

# Shared thread pool used by the extraction routes.
MAX_WORKERS = 8  # raised from 4 to 8
thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)

# 캐시 관리 함수
def get_cache_path(video_id, language, format_type):
    """Return the cache file path for a video/language/format combination.

    The three values are joined with underscores, MD5-hashed, and the hex
    digest is used as the ``.json`` file name inside ``CACHE_DIR``.
    """
    raw_key = "_".join((f"{video_id}", f"{language}", f"{format_type}"))
    digest = hashlib.md5(raw_key.encode()).hexdigest()
    file_name = digest + ".json"
    return os.path.join(CACHE_DIR, file_name)

def save_to_cache(video_id, language, format_type, data):
    """Persist an extraction result to the on-disk cache.

    Does nothing when caching is disabled. The payload is wrapped in an
    envelope carrying the current time so readers can check expiry. Write
    failures are printed and otherwise ignored.
    """
    if CACHE_ENABLED:
        destination = get_cache_path(video_id, language, format_type)
        try:
            envelope = {
                'timestamp': time.time(),
                'video_id': video_id,
                'language': language,
                'format': format_type,
                'data': data,
            }
            with open(destination, 'w', encoding='utf-8') as handle:
                json.dump(envelope, handle, ensure_ascii=False)
            print(f"Cached result for video {video_id}")
        except Exception as exc:
            print(f"Error saving to cache: {exc}")

def get_from_cache(video_id, language, format_type):
    """Return the cached payload for the given key, or None.

    None is returned when caching is disabled, when no cache file exists,
    when the entry is older than ``CACHE_EXPIRY`` (the file is then deleted),
    or when reading the file fails (the error is printed).
    """
    if not CACHE_ENABLED:
        return None

    location = get_cache_path(video_id, language, format_type)
    if os.path.exists(location):
        try:
            with open(location, 'r', encoding='utf-8') as handle:
                entry = json.load(handle)

            age = time.time() - entry['timestamp']
            expired = age > CACHE_EXPIRY
            if not expired:
                print(f"Cache hit for video {video_id}")
                return entry['data']

            # Stale entry: drop the file and report a miss.
            os.remove(location)
        except Exception as exc:
            print(f"Error reading from cache: {exc}")
    return None

# 캐시 정리 함수 (오래된 캐시 파일 삭제)
def clean_cache():
    """Delete ``.json`` cache files whose modification time is past expiry.

    Any error during the sweep is printed and ends the sweep.
    """
    try:
        reference_time = time.time()
        for entry in os.listdir(CACHE_DIR):
            candidate = os.path.join(CACHE_DIR, entry)
            if not (os.path.isfile(candidate) and entry.endswith('.json')):
                continue
            # Expiry here is judged by file modification time.
            age = reference_time - os.path.getmtime(candidate)
            if age > CACHE_EXPIRY:
                os.remove(candidate)
                print(f"Removed expired cache file: {entry}")
    except Exception as exc:
        print(f"Error cleaning cache: {exc}")

# 백그라운드에서 주기적으로 캐시 정리
def cache_cleaner():
    """Loop forever, sweeping expired cache files once per hour."""
    interval_seconds = 3600
    while True:
        # Sleep first, so the first sweep happens one interval after start.
        time.sleep(interval_seconds)
        clean_cache()

# Start the cache-cleaning loop in a daemon thread at import time, so the
# thread does not keep the process alive on exit.
cleaner_thread = threading.Thread(target=cache_cleaner, daemon=True)
cleaner_thread.start()

# Browser-style User-Agent string (Chrome on Windows) placed in the yt-dlp options.
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'

# Available subtitle languages: language code -> English display name.
# NOTE(review): not referenced elsewhere in the visible part of this file;
# presumably consumed by templates or a client -- confirm.
SUBTITLE_LANGUAGES = {
    'en': 'English',
    'ar': 'Arabic',
    'zh': 'Chinese',
    'nl': 'Dutch',
    'fi': 'Finnish',
    'fr': 'French',
    'de': 'German',
    'hi': 'Hindi',
    'hu': 'Hungarian',
    'id': 'Indonesian',
    'ga': 'Irish',
    'it': 'Italian',
    'ja': 'Japanese',
    'ko': 'Korean',
    'pl': 'Polish',
    'pt': 'Portuguese',
    'ru': 'Russian',
    'es': 'Spanish',
    'sv': 'Swedish',
    'tr': 'Turkish',
    'uk': 'Ukrainian',
    'vi': 'Vietnamese'
}

# Function to extract video ID from YouTube URL
def extract_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    Supported forms (scheme and ``www.``/``m.``/``music.`` prefix optional):
    ``watch?v=ID`` (with ``v`` anywhere in the query string), ``embed/ID``,
    ``v/ID``, ``shorts/ID``, ``live/ID`` and the bare ``youtu.be/ID`` form.

    Args:
        url: Candidate URL. Non-string values are rejected.

    Returns:
        The video ID string, or None when the input is not a recognised
        YouTube video URL.
    """
    if not isinstance(url, str):
        return None

    host_match = re.match(
        r'(?:https?://)?(?:(?:www|m|music)\.)?'
        r'(?:youtube|youtu|youtube-nocookie)\.(?:com|be)/(.*)',
        url.strip(),
        re.IGNORECASE,
    )
    if not host_match:
        return None
    remainder = host_match.group(1)

    # Exactly 11 ID characters that are not followed by a further ID
    # character; this keeps path words such as "channel/UC..." or
    # "subscriptions" from being mistaken for an ID.
    video_id = r'([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])'
    candidates = (
        r'(?:embed|v|shorts|live)/' + video_id,  # path-style URLs
        r'[^#]*?[?&]v=' + video_id,              # v= anywhere in the query
        video_id,                                # bare ID (youtu.be/ID)
    )
    for pattern in candidates:
        found = re.match(pattern, remainder)
        if found:
            return found.group(1)
    return None

# Function to format timestamp
def format_timestamp(seconds, ms_separator=None):
    """Format an offset in seconds as ``HH:MM:SS``, optionally with milliseconds.

    Args:
        seconds: Offset in seconds (int or float).
        ms_separator: When None (default) the fractional part is truncated
            and ``HH:MM:SS`` is returned. When a string is given (``','``
            for SRT, ``'.'`` for WebVTT) a three-digit millisecond field is
            appended after that separator.

    Returns:
        The formatted timestamp string.
    """
    if ms_separator is None:
        hours, remainder = divmod(seconds, 3600)
        minutes, secs = divmod(remainder, 60)
        return f"{int(hours):02d}:{int(minutes):02d}:{int(secs):02d}"

    # Work in whole milliseconds so rounding carries into the seconds field
    # (e.g. 59.9996 s becomes 00:01:00.000 rather than 00:00:59.1000).
    total_ms = int(round(seconds * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}{ms_separator}{millis:03d}"


def _cue_times(transcript):
    """Yield ``(segment, start, end)`` for each transcript segment.

    A cue ends where the next segment starts; the last cue ends after its
    own ``duration`` (5 seconds when the key is missing).
    """
    for position, segment in enumerate(transcript):
        start = segment['start']
        if position + 1 < len(transcript):
            end = transcript[position + 1]['start']
        else:
            end = start + segment.get('duration', 5)
        yield segment, start, end


# Function to convert transcript to SRT format
def transcript_to_srt(transcript):
    """Render transcript segments as SubRip (SRT) text.

    Args:
        transcript: Sequence of dicts with ``text`` and ``start`` keys
            (``duration`` optional), start times in seconds.

    Returns:
        SRT text with 1-based cue numbers and ``HH:MM:SS,mmm`` timestamps;
        an empty string for an empty transcript.
    """
    cues = []
    for number, (segment, start, end) in enumerate(_cue_times(transcript), 1):
        start_time = format_timestamp(start, ',')
        end_time = format_timestamp(end, ',')
        cues.append(f"{number}\n{start_time} --> {end_time}\n{segment['text']}\n\n")
    return "".join(cues)


# Function to convert transcript to VTT format
def transcript_to_vtt(transcript):
    """Render transcript segments as WebVTT text.

    Args:
        transcript: Sequence of dicts with ``text`` and ``start`` keys
            (``duration`` optional), start times in seconds.

    Returns:
        WebVTT text starting with the ``WEBVTT`` header, using
        ``HH:MM:SS.mmm`` timestamps (the millisecond field is mandatory in
        WebVTT).
    """
    cues = ["WEBVTT\n\n"]
    for segment, start, end in _cue_times(transcript):
        start_time = format_timestamp(start, '.')
        end_time = format_timestamp(end, '.')
        cues.append(f"{start_time} --> {end_time}\n{segment['text']}\n\n")
    return "".join(cues)

# Function to convert transcript to JSON format
def transcript_to_json(transcript):
    """Serialize the transcript as 2-space-indented JSON, keeping non-ASCII text as-is."""
    serialized = json.dumps(transcript, ensure_ascii=False, indent=2)
    return serialized



# Function to convert transcript to plain text
def transcript_to_text(transcript):
    """Return the text of every segment, one segment per line."""
    lines = []
    for entry in transcript:
        lines.append(entry['text'])
    return "\n".join(lines)

# Function to get transcript using youtube-transcript-api
def get_transcript_with_api(video_id, language='en'):
    """Fetch a transcript through youtube-transcript-api.

    Returns whatever ``YouTubeTranscriptApi.get_transcript`` returns for the
    requested language, or None when the library reports that no transcript
    was found or that transcripts are disabled. Other exceptions propagate.
    """
    requested_languages = [language]
    try:
        return YouTubeTranscriptApi.get_transcript(video_id, languages=requested_languages)
    except (NoTranscriptFound, TranscriptsDisabled):
        # Expected "nothing available" outcomes; callers fall back to yt-dlp.
        return None

# Function to get transcript using yt-dlp (as a fallback)
def _json3_to_transcript(subtitle_data):
    """Convert a parsed json3 subtitle document into transcript segments.

    Each event that has ``segs`` and non-blank text becomes a dict with
    ``text``, ``start`` and ``duration`` (both in seconds), matching the
    segment shape used by the formatters in this file.
    """
    transcript = []
    for event in subtitle_data.get('events', []):
        if 'segs' not in event:
            continue
        text = ' '.join(seg.get('utf8', '') for seg in event.get('segs', []))
        if text.strip():
            transcript.append({
                'text': text,
                'start': event.get('tStartMs', 0) / 1000,
                'duration': event.get('dDurationMs', 0) / 1000,
            })
    return transcript


def get_transcript_with_ytdlp(video_id, language='en'):
    """Fetch subtitles with yt-dlp (fallback path).

    Downloads the json3 subtitle track (manual or automatic) for ``language``
    into a temporary directory and converts it to transcript segments.

    Args:
        video_id: YouTube video ID.
        language: Subtitle language code.

    Returns:
        A list of ``{'text', 'start', 'duration'}`` dicts, or None when no
        subtitles could be obtained. Errors are printed, never raised.
    """
    try:
        url = f"https://www.youtube.com/watch?v={video_id}"
        print(f"Attempting to extract subtitles for video {video_id} with language {language} using yt-dlp")

        # Check if cookies file has actual cookies
        has_valid_cookies = False
        try:
            with open(COOKIES_FILE, 'r') as f:
                cookie_content = f.read()
                # Check if file has more than just the header line
                has_valid_cookies = len(cookie_content.strip().split('\n')) > 1
        except Exception as cookie_err:
            print(f"Error reading cookies file: {cookie_err}")

        if not has_valid_cookies:
            print("Warning: No valid YouTube cookies found. You may encounter anti-bot measures.")
            print("Please follow the instructions in YOUTUBE_COOKIES_GUIDE.md to set up proper cookies.")

        ydl_opts = {
            'skip_download': True,  # subtitles only, no media
            'writesubtitles': True,
            'writeautomaticsub': True,  # also accept auto-generated captions
            'subtitleslangs': [language],
            'subtitlesformat': 'json3',
            'quiet': True,
            # The YoutubeDL API takes request headers via 'http_headers';
            # a top-level 'user_agent' key is not a recognised option.
            'http_headers': {'User-Agent': DEFAULT_USER_AGENT},
            'nocheckcertificate': True,
        }
        if has_valid_cookies:
            # The API option is 'cookiefile' ('cookies' is only the CLI flag
            # name). It is passed only for a populated file because yt-dlp
            # writes its cookie jar back to this path on close, which would
            # otherwise fill the placeholder file.
            ydl_opts['cookiefile'] = COOKIES_FILE

        with tempfile.TemporaryDirectory() as tmpdir:
            ydl_opts['paths'] = {'home': tmpdir}
            try:
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info = ydl.extract_info(url, download=False) or {}

                    # Check if we're being asked to verify we're not a bot
                    if info.get('_type') == 'url' and 'accounts.google.com' in info.get('url', ''):
                        print("YouTube is requesting verification. Please update your cookies file.")
                        return None

                    if not (info.get('subtitles') or info.get('automatic_captions')):
                        print(f"No subtitles available for video {video_id} with language {language}")
                        return None

                    print(f"Found subtitles for video {video_id}, downloading...")
                    ydl.download([url])

                    # Find the subtitle file for the requested language
                    subtitle_files = [f for f in os.listdir(tmpdir) if f.endswith(f'.{language}.json3')]
                    if not subtitle_files:
                        print(f"No subtitle files found in {tmpdir} for language {language}")
                        # Report any other tracks to help diagnose a language mismatch
                        all_subtitle_files = [f for f in os.listdir(tmpdir) if f.endswith('.json3')]
                        if all_subtitle_files:
                            print(f"Found other subtitle files: {all_subtitle_files}")
                        return None

                    subtitle_file = subtitle_files[0]
                    print(f"Processing subtitle file: {subtitle_file}")

                    with open(os.path.join(tmpdir, subtitle_file), 'r', encoding='utf-8') as f:
                        subtitle_data = json.load(f)

                    transcript = _json3_to_transcript(subtitle_data)
                    if transcript:
                        print(f"Successfully extracted {len(transcript)} subtitle segments")
                        return transcript
                    print("No subtitle segments found in the file")
            except yt_dlp.utils.DownloadError as dl_err:
                print(f"yt-dlp download error: {dl_err}")
                if 'Sign in to confirm' in str(dl_err) or 'Please sign in' in str(dl_err):
                    print("YouTube is requesting verification. Please update your cookies file.")
                return None
        return None
    except Exception as e:
        print(f"Error with yt-dlp: {e}")
        import traceback
        traceback.print_exc()
        return None

@app.route('/')
def index():
    """Serve the single-video page (``index.html`` template)."""
    page = render_template('index.html')
    return page

@app.route('/batch')
def batch():
    """Serve the batch page (``batch.html`` template)."""
    page = render_template('batch.html')
    return page

@app.route('/api/extract', methods=['POST'])
def extract_subtitles():
    """Extract subtitles for one YouTube video.

    Expects a JSON object with ``url`` (required), ``format``
    (``text``/``srt``/``vtt``/``json``, default ``text``) and ``language``
    (default ``en``).

    Returns:
        200 with the formatted transcript (possibly from cache), or with a
        ``status: processing`` payload if extraction exceeds 30 seconds;
        400 for a bad request or unconfigured cookies; 404 when no subtitles
        are found; 500 on an unexpected error.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    url = data.get('url')
    format_type = data.get('format', 'text')  # Default to text format
    language = data.get('language', 'en')  # Default to English

    if not url:
        return jsonify({'error': 'URL is required'}), 400

    video_id = extract_video_id(url) if isinstance(url, str) else None
    if not video_id:
        return jsonify({'error': 'Invalid YouTube URL'}), 400

    print(f"Processing request for video {video_id}, language: {language}, format: {format_type}")

    # Serve from cache when possible
    cached_result = get_from_cache(video_id, language, format_type)
    if cached_result:
        print(f"Returning cached result for video {video_id}")
        return jsonify(cached_result)

    # Check if cookies file has actual cookies
    has_valid_cookies = False
    try:
        with open(COOKIES_FILE, 'r') as f:
            cookie_content = f.read()
            # Check if file has more than just the header line
            has_valid_cookies = len(cookie_content.strip().split('\n')) > 1
    except Exception as cookie_err:
        print(f"Error reading cookies file: {cookie_err}")

    if not has_valid_cookies:
        print("Warning: No valid YouTube cookies found. You may encounter anti-bot measures.")
        # Return a specific error message about cookies
        return jsonify({
            'error': 'YouTube cookies not configured',
            'message': 'YouTube requires authentication to extract subtitles. Please follow the instructions in YOUTUBE_COOKIES_GUIDE.md to set up proper cookies.',
            'details': 'This is likely why the extraction is failing or getting stuck in a loop.',
            'solution': 'Set up YouTube cookies as described in the guide to bypass anti-bot measures.'
        }), 400

    def process_video():
        """Run the extraction; always return a ``(payload, http_status)`` pair."""
        try:
            print(f"Starting subtitle extraction for video {video_id}")
            # Try youtube-transcript-api first, then fall back to yt-dlp
            transcript = get_transcript_with_api(video_id, language)
            if transcript:
                print("Successfully extracted subtitles using youtube-transcript-api")

            if transcript is None:
                print("youtube-transcript-api failed, trying yt-dlp...")
                transcript = get_transcript_with_ytdlp(video_id, language)

            if transcript is None:
                print(f"Failed to extract subtitles for video {video_id}")
                return {'error': 'No subtitles found for this video',
                        'message': 'Could not find subtitles in the requested language. Try another language or check if the video has subtitles.'}, 404

            print(f"Formatting transcript to {format_type} format")
            if format_type == 'srt':
                formatted_transcript = transcript_to_srt(transcript)
            elif format_type == 'vtt':
                formatted_transcript = transcript_to_vtt(transcript)
            elif format_type == 'json':
                formatted_transcript = transcript_to_json(transcript)
            else:  # Default to text
                formatted_transcript = transcript_to_text(transcript)

            # Look up the video title with yt-dlp; any failure falls back to
            # a title derived from the video ID.
            default_title = f'youtube_{video_id}'
            video_title = default_title
            try:
                print(f"Getting video title for {video_id}")
                ydl_opts = {
                    'skip_download': True,
                    'quiet': True,
                    # YoutubeDL API option names: 'cookiefile' and
                    # 'http_headers' ('cookies'/'user_agent' are ignored).
                    # Cookies are known to be configured at this point.
                    'cookiefile': COOKIES_FILE,
                    'http_headers': {'User-Agent': DEFAULT_USER_AGENT},
                    'nocheckcertificate': True,
                }
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info = ydl.extract_info(f"https://www.youtube.com/watch?v={video_id}", download=False)
                if info and info.get('title'):
                    video_title = str(info['title'])
                    print(f"Video title: {video_title}")
                else:
                    print(f"Could not get video title, using default: {video_title}")
            except Exception as e:
                video_title = default_title
                print(f"Error getting video info: {e}, using default title: {video_title}")

            # Sanitize the title for use as a filename; fall back to the
            # default when nothing survives sanitising.
            safe_title = re.sub(r'[^\w\s-]', '', video_title).strip().replace(' ', '_') or default_title

            response_data = {
                'video_id': video_id,
                'video_title': video_title,
                'transcript': formatted_transcript,
                'format': format_type,
                'language': language,
                'filename': f"{safe_title}.{format_type}",
                'cached': True,
                'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }

            save_to_cache(video_id, language, format_type, response_data)
            print(f"Extraction complete for video {video_id}")

            return response_data, 200
        except Exception as e:
            print(f"Unexpected error in process_video: {e}")
            import traceback
            traceback.print_exc()
            return {'error': 'An unexpected error occurred', 'message': str(e)}, 500

    print("Submitting extraction task to thread pool")
    future = thread_pool.submit(process_video)

    try:
        print("Waiting for extraction result (timeout: 30 seconds)")
        payload, status = future.result(timeout=30)
    except concurrent.futures.TimeoutError:
        # The task keeps running in the pool and will populate the cache;
        # tell the client to retry.
        print("Extraction timeout - continuing in background")
        return jsonify({
            'video_id': video_id,
            'status': 'processing',
            'message': 'Your request is being processed in the background. Please try again in a few seconds.',
            'retry_after': 5,
            'note': 'If this keeps happening, you may need to set up YouTube cookies. See YOUTUBE_COOKIES_GUIDE.md for instructions.'
        })

    print("Extraction completed within timeout period")
    # Send the worker's status code with its payload so error results are
    # not serialised as a JSON array under HTTP 200.
    return jsonify(payload), status

@app.route('/api/download', methods=['POST'])
def download_subtitles():
    """Return posted transcript text as a file download.

    Expects a JSON object with ``transcript`` (required string) and
    ``filename`` (optional, default ``subtitles.txt``).

    Returns:
        The transcript as a ``text/plain`` attachment, or a 400 JSON error
        when the transcript is missing or not a string.
    """
    import io  # local import: only this handler needs an in-memory buffer

    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Transcript is required'}), 400

    transcript = data.get('transcript')
    filename = data.get('filename', 'subtitles.txt')

    if not transcript:
        return jsonify({'error': 'Transcript is required'}), 400
    if not isinstance(transcript, str):
        return jsonify({'error': 'Transcript must be a string'}), 400
    if not isinstance(filename, str) or not filename:
        filename = 'subtitles.txt'

    # Serve from memory: a NamedTemporaryFile(delete=False) would be left on
    # disk after every request.
    buffer = io.BytesIO(transcript.encode('utf-8'))
    return send_file(
        buffer,
        as_attachment=True,
        download_name=filename,
        mimetype='text/plain'
    )

@app.route('/api/batch-extract', methods=['POST'])
def batch_extract_subtitles():
    """Extract subtitles for several YouTube URLs in parallel.

    Expects a JSON object with ``urls`` (non-empty list), ``format``
    (default ``text``) and ``language`` (default ``en``).

    Returns:
        JSON with ``results``, ``errors`` and ``processing`` lists plus their
        counts. URLs that have not finished within 60 seconds are listed
        under ``processing`` together with a retry hint. 400 when ``urls``
        is missing or not a list.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'URLs array is required'}), 400

    urls = data.get('urls', [])
    format_type = data.get('format', 'text')  # Default to text format
    language = data.get('language', 'en')  # Default to English

    if not urls or not isinstance(urls, list):
        return jsonify({'error': 'URLs array is required'}), 400

    # Decide once per request whether the cookies file has real content
    # (more than the header line).
    has_valid_cookies = False
    try:
        with open(COOKIES_FILE, 'r') as f:
            has_valid_cookies = len(f.read().strip().split('\n')) > 1
    except Exception as cookie_err:
        print(f"Error reading cookies file: {cookie_err}")

    def process_url(url):
        """Process one URL; return ``(error, None)`` or ``(None, result)``."""
        try:
            video_id = extract_video_id(url) if isinstance(url, str) else None
            if not video_id:
                return {'url': url, 'error': 'Invalid YouTube URL'}, None

            # Serve from cache when possible
            cached_result = get_from_cache(video_id, language, format_type)
            if cached_result:
                cached_result['url'] = url
                return None, cached_result

            # Try youtube-transcript-api first, then fall back to yt-dlp
            transcript = get_transcript_with_api(video_id, language)
            if transcript is None:
                transcript = get_transcript_with_ytdlp(video_id, language)

            if transcript is None:
                return {'url': url, 'error': 'No subtitles found for this video'}, None

            if format_type == 'srt':
                formatted_transcript = transcript_to_srt(transcript)
            elif format_type == 'vtt':
                formatted_transcript = transcript_to_vtt(transcript)
            elif format_type == 'json':
                formatted_transcript = transcript_to_json(transcript)
            else:  # Default to text
                formatted_transcript = transcript_to_text(transcript)

            # Look up the video title with yt-dlp; any failure falls back to
            # a title derived from the video ID.
            default_title = f'youtube_{video_id}'
            video_title = default_title
            try:
                ydl_opts = {
                    'skip_download': True,
                    'quiet': True,
                    # YoutubeDL API option name for request headers
                    # (a top-level 'user_agent' key is ignored).
                    'http_headers': {'User-Agent': DEFAULT_USER_AGENT},
                    'nocheckcertificate': True,
                }
                if has_valid_cookies:
                    # API option is 'cookiefile', not 'cookies'. Passed only
                    # for a populated file because yt-dlp writes its cookie
                    # jar back to this path on close.
                    ydl_opts['cookiefile'] = COOKIES_FILE
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    info = ydl.extract_info(f"https://www.youtube.com/watch?v={video_id}", download=False)
                if info and info.get('title'):
                    video_title = str(info['title'])
            except Exception as e:
                video_title = default_title
                print(f"Error getting video info: {e}, using default title: {video_title}")

            # Sanitize the title for use as a filename
            safe_title = re.sub(r'[^\w\s-]', '', video_title).strip().replace(' ', '_') or default_title

            result = {
                'url': url,
                'video_id': video_id,
                'video_title': video_title,
                'transcript': formatted_transcript,
                'format': format_type,
                'language': language,
                'filename': f"{safe_title}.{format_type}",
                'cached': True,
                'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }

            save_to_cache(video_id, language, format_type, result)

            return None, result

        except Exception as e:
            return {'url': url, 'error': str(e)}, None

    # Submit every URL and remember which future belongs to which URL;
    # as_completed yields in completion order, not submission order.
    future_to_url = {thread_pool.submit(process_url, url): url for url in urls}

    results = []
    errors = []
    processing = []

    def collect(future):
        """File a finished future's outcome under results or errors."""
        try:
            error, result = future.result()
        except Exception as e:
            errors.append({'url': future_to_url[future], 'error': f"Unexpected error: {str(e)}"})
            return
        if error:
            errors.append(error)
        elif result:
            results.append(result)

    collected = set()
    try:
        for future in concurrent.futures.as_completed(future_to_url, timeout=60):
            collected.add(future)
            collect(future)
    except concurrent.futures.TimeoutError:
        # The timeout is raised by the as_completed iterator itself.
        # Anything that finished in the meantime is still collected; the
        # rest keeps running in the pool and is reported as processing.
        for future, url in future_to_url.items():
            if future in collected:
                continue
            if future.done():
                collect(future)
            else:
                processing.append({'url': url, 'status': 'processing'})

    response = {
        'results': results,
        'errors': errors,
        'processing': processing,
        'total_processed': len(results),
        'total_errors': len(errors),
        'total_processing': len(processing)
    }

    # Ask the client to retry when some work is still running
    if processing:
        response['message'] = 'Some videos are still being processed. Please try again in a few seconds.'
        response['retry_after'] = 5

    return jsonify(response)

if __name__ == '__main__':
    # Port comes from the PORT environment variable when set; default 8081.
    port = int(os.environ.get('PORT', 8081))

    # Enable debug mode only when FLASK_ENV is 'development'.
    debug_mode = os.environ.get('FLASK_ENV') == 'development'

    # Bind to 0.0.0.0 (all interfaces).
    app.run(host='0.0.0.0', port=port, debug=debug_mode)
