# -*- coding: utf-8 -*-
"""
YouTube Comment Sentiment Analyzer - TOP 50 COMMENTS VERSION
"""

import gradio as gr
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
from collections import Counter
from textblob import TextBlob
from langdetect import detect, DetectorFactory
from wordcloud import WordCloud
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import warnings
warnings.filterwarnings('ignore')

from transformers import pipeline

# The emoji package is optional: assume it is present, then clear the flag
# when the import fails so callers can skip emoji extraction.
EMOJI_AVAILABLE = True
try:
    import emoji
except ImportError:
    EMOJI_AVAILABLE = False

# Fix langdetect's seed so repeated runs classify the same text the same way.
DetectorFactory.seed = 0
# Global plot styling, applied once at import time.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# YouTube Data API key read from the "GoogleAPIKey" environment variable;
# None when the variable is not set.
YOUTUBE_API_KEY = os.environ.get("GoogleAPIKey")

# Load the Urdu sentiment model once at import time. Any failure (missing
# dependency, download error, ...) is reported and recorded in
# URDU_MODEL_AVAILABLE instead of aborting start-up.
print("Loading Urdu sentiment model...")
try:
    urdu_sentiment_pipeline = pipeline(
        "text-classification",
        model="Khubaib01/roman-urdu-sentiment-xlm-r",
        truncation=True,
        device=-1,  # run on CPU
    )
    URDU_MODEL_AVAILABLE = True
    print("✅ Urdu sentiment model loaded")
except Exception as e:
    print(f"⚠️ Urdu model error: {e}")
    # Keep the name bound so a stray reference cannot raise NameError.
    urdu_sentiment_pipeline = None
    URDU_MODEL_AVAILABLE = False

# Extra stop words (topic names and filler words) kept as a set for O(1)
# membership tests.
CUSTOM_STOPWORDS = {
    'also',
    'could',
    'even',
    'get',
    'got',
    'imran',
    'just',
    'khan',
    'like',
    'pakistan',
    'people',
    'pti',
    'really',
    'say',
    'should',
    'well',
    'would',
}


class YouTubeSentimentAnalyzer:
    """Fetch YouTube comments and label each with a language and a sentiment.

    Comments detected as English are scored with TextBlob; every other comment
    is passed to the module-level Urdu pipeline when that model is available.
    """

    # Path markers (besides ``v=`` and ``youtu.be/``) that are immediately
    # followed by the video id in a YouTube URL.
    _PATH_MARKERS = ('/shorts/', '/embed/', '/live/')

    # Largest ``maxResults`` value commentThreads.list accepts per request.
    _MAX_PAGE_SIZE = 100

    def __init__(self):
        """Build the YouTube client when an API key is configured.

        ``self.youtube`` stays ``None`` when the key is missing or the client
        cannot be built; the fetch methods check for that.
        """
        self.youtube = None
        if YOUTUBE_API_KEY:
            try:
                self.youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
                print("✅ YouTube API initialized")
            except Exception as e:
                print(f"❌ API Error: {e}")

    @staticmethod
    def _extract_video_id(video_url):
        """Return the video id contained in *video_url*.

        Handles ``...watch?v=ID``, ``youtu.be/ID``, ``/shorts/ID``,
        ``/embed/ID`` and ``/live/ID``. Anything else is assumed to already be
        a bare id and is returned with surrounding whitespace removed.
        """
        url = str(video_url).strip()
        if 'v=' in url:
            tail = url.split('v=')[-1]
        elif 'youtu.be/' in url:
            tail = url.split('youtu.be/')[-1]
        else:
            for marker in YouTubeSentimentAnalyzer._PATH_MARKERS:
                if marker in url:
                    tail = url.split(marker)[-1]
                    break
            else:
                return url
        # Cut at the first query/fragment/path separator following the id.
        return re.split(r'[?&#/]', tail, maxsplit=1)[0]

    def get_video_details(self, video_url):
        """Return ``{'title', 'total_comments'}`` for the video, or ``None``.

        ``None`` is returned when the client is not configured, the API
        returns no item for the id, or the request fails.
        """
        if not self.youtube:
            return None

        try:
            video_id = self._extract_video_id(video_url)
            request = self.youtube.videos().list(part='statistics,snippet', id=video_id)
            response = request.execute()

            items = response.get('items') or []
            if not items:
                return None

            stats = items[0].get('statistics', {})
            snippet = items[0].get('snippet', {})
            return {
                'title': snippet.get('title', 'N/A'),
                'total_comments': int(stats.get('commentCount', 0))
            }
        except Exception as e:
            # Video details are best-effort: report the failure, keep going.
            print(f"⚠️ Could not fetch video details: {e}")
            return None

    def extract_top_comments(self, video_url, max_comments=50):
        """Fetch up to *max_comments* top-level comments ordered by relevance.

        Requests are paginated because a single API call returns at most
        ``_MAX_PAGE_SIZE`` threads.

        Returns:
            ``(comments, error)``: *comments* is a list of dicts with keys
            ``author``, ``text``, ``likes`` and ``time``; *error* is ``None``
            on success or a message string on failure (then *comments* is
            empty).
        """
        if not self.youtube:
            return [], "YouTube API not configured."

        try:
            video_id = self._extract_video_id(video_url)
            comments = []
            page_token = None

            while len(comments) < max_comments:
                params = {
                    'part': 'snippet',
                    'videoId': video_id,
                    'maxResults': min(self._MAX_PAGE_SIZE, max_comments - len(comments)),
                    'order': 'relevance',
                    'textFormat': 'plainText',
                }
                if page_token:
                    params['pageToken'] = page_token

                response = self.youtube.commentThreads().list(**params).execute()

                for item in response.get('items', []):
                    comment_data = item['snippet']['topLevelComment']['snippet']
                    comments.append({
                        'author': comment_data.get('authorDisplayName', 'Anonymous'),
                        'text': comment_data.get('textDisplay', ''),
                        'likes': comment_data.get('likeCount', 0),
                        'time': comment_data.get('publishedAt', '')
                    })

                page_token = response.get('nextPageToken')
                if not page_token:
                    break

            return comments[:max_comments], None
        except Exception as e:
            return [], str(e)

    def clean_text(self, text):
        """Strip URLs, HTML tags and unsupported symbols from *text*.

        Word characters, whitespace, the Arabic-script ranges used for Urdu
        and the punctuation ``. , ! ? '`` are kept; whitespace runs collapse
        to one space. Non-string or empty input yields ``""``.
        """
        if not text or not isinstance(text, str):
            return ""
        text = re.sub(r'http\S+|www\S+', '', text)
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'[^\w\s\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\.\,\!\?\']', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def detect_language(self, text):
        """Classify *text* as ``'urdu'``, ``'english'``, ``'other'`` or ``'unknown'``.

        Any Arabic-script character marks the text as Urdu without consulting
        langdetect. Text shorter than 3 characters, or text langdetect cannot
        handle, is ``'unknown'``.
        """
        try:
            if not text or len(text) < 3:
                return 'unknown'
            if re.search(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]', text):
                return 'urdu'
            lang = detect(text)
            return 'english' if lang == 'en' else 'other'
        except Exception:
            # langdetect raises on text with no usable features.
            return 'unknown'

    def extract_emojis(self, text):
        """Return the characters of *text* that ``emoji.is_emoji`` accepts.

        Returns ``[]`` when the emoji package is unavailable or *text* is not
        a string.
        """
        if not EMOJI_AVAILABLE or not isinstance(text, str):
            return []
        return [char for char in text if emoji.is_emoji(char)]

    def analyze_sentiment_english(self, text):
        """Score *text* with TextBlob and return ``(label, polarity)``.

        Polarity above 0.1 is ``'Positive'``, below -0.1 is ``'Negative'``,
        otherwise ``'Neutral'``. Any failure yields ``('Neutral', 0.0)``.
        """
        try:
            polarity = TextBlob(text).sentiment.polarity
        except Exception:
            return 'Neutral', 0.0

        if polarity > 0.1:
            return 'Positive', polarity
        if polarity < -0.1:
            return 'Negative', polarity
        return 'Neutral', polarity

    def analyze_sentiment_urdu(self, text):
        """Score *text* with the Urdu pipeline and return ``(label, score)``.

        The score is the model's confidence for the predicted label, not a
        signed polarity. ``('Neutral', 0.5)`` is returned when the model is
        unavailable or the call fails.
        """
        if not URDU_MODEL_AVAILABLE:
            return 'Neutral', 0.5

        try:
            result = urdu_sentiment_pipeline(text)[0]
            # Compare case-insensitively so 'positive'/'POSITIVE' also match.
            label = str(result['label']).lower()
            score = result['score']
        except Exception:
            return 'Neutral', 0.5

        # NOTE(review): LABEL_0 -> Positive / LABEL_1 -> Negative is carried
        # over from the original mapping; confirm against the model config.
        if label in ('label_0', 'positive'):
            return 'Positive', score
        if label in ('label_1', 'negative'):
            return 'Negative', score
        return 'Neutral', score

    def process_comments(self, comments):
        """Clean, filter and score *comments*.

        Args:
            comments: list of dicts as produced by ``extract_top_comments``.

        Returns:
            ``(df, filtered_out)``: *df* holds the surviving comments with
            the added columns ``clean_text``, ``text_for_wc``, ``language``,
            ``sentiment`` and ``polarity`` (plus ``emojis``, ``emoji_count``
            and ``has_emoji`` when the emoji package is available);
            *filtered_out* counts comments dropped because their cleaned text
            had fewer than 3 characters.
        """
        if not comments:
            return pd.DataFrame(), 0

        df = pd.DataFrame(comments)
        df['clean_text'] = df['text'].apply(self.clean_text)
        df['text_for_wc'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))

        before_filter = len(df)
        # .copy() so the column assignments below write to an independent
        # frame rather than to a slice of the unfiltered one.
        df = df[df['clean_text'].str.len() > 2].copy()
        filtered_out = before_filter - len(df)

        if len(df) == 0:
            return df, filtered_out

        df['language'] = df['clean_text'].apply(self.detect_language)

        # English goes to TextBlob; urdu/other/unknown go to the Urdu model.
        results = [
            self.analyze_sentiment_english(text) if language == 'english'
            else self.analyze_sentiment_urdu(text)
            for language, text in zip(df['language'], df['clean_text'])
        ]
        df['sentiment'] = [label for label, _ in results]
        df['polarity'] = [score for _, score in results]

        if EMOJI_AVAILABLE:
            df['emojis'] = df['text'].apply(self.extract_emojis)
            df['emoji_count'] = df['emojis'].apply(len)
            df['has_emoji'] = df['emoji_count'] > 0

        return df, filtered_out


def _table_safe(value, limit=None):
    """Return *value* as single-line text a matplotlib table can draw.

    Whitespace runs (including newlines) collapse to one space, text longer
    than *limit* is cut and suffixed with ``...``, and dollar signs are
    escaped so matplotlib does not try to parse the cell as mathtext.
    """
    text = ' '.join(str(value).split())
    if limit is not None and len(text) > limit:
        text = text[:limit] + '...'
    return text.replace('$', r'\$')


def create_visualizations(df):
    """Build the result figures for a processed comment DataFrame.

    Args:
        df: DataFrame with at least the columns ``sentiment``, ``language``,
            ``author``, ``text``, ``likes`` and ``clean_text``.

    Returns:
        An 8-tuple ``(sentiment pie, sentiment bar, language pie, top-comment
        table, word cloud, None, None, None)``. The last three slots are
        always ``None``; all eight are ``None`` when *df* is empty.
    """
    if df is None or len(df) == 0:
        return (None,) * 8

    # Drop figures a previous call left registered with pyplot.
    plt.close('all')

    sentiment_colors = {'Positive': '#2ecc71', 'Negative': '#e74c3c', 'Neutral': '#95a5a6'}
    counts = df['sentiment'].value_counts()
    palette = [sentiment_colors.get(label, '#95a5a6') for label in counts.index]

    # 1. Sentiment pie
    pie_fig, pie_ax = plt.subplots(figsize=(10, 8))
    pie_ax.pie(counts.values, labels=counts.index, autopct='%1.1f%%',
               colors=palette, startangle=90, explode=[0.05] * len(counts))
    pie_ax.set_title('Sentiment Distribution (Top 50 Comments)', fontsize=16, fontweight='bold')
    pie_fig.tight_layout()

    # 2. Sentiment bar
    bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
    bars = bar_ax.bar(counts.index, counts.values, color=palette)
    # Padding is in points, so labels sit just above each bar whatever the
    # count scale is (a fixed data-unit offset overshoots for small counts).
    bar_ax.bar_label(bars, padding=3)
    bar_ax.set_title('Sentiment Bar Chart', fontsize=14, fontweight='bold')
    bar_ax.grid(axis='y', alpha=0.3)
    bar_fig.tight_layout()

    # 3. Language pie
    lang_fig, lang_ax = plt.subplots(figsize=(10, 8))
    lang_counts = df['language'].value_counts()
    lang_labels = {'english': 'English', 'urdu': 'Urdu', 'other': 'Other'}
    lang_ax.pie(lang_counts.values,
                labels=[lang_labels.get(code, code) for code in lang_counts.index],
                autopct='%1.1f%%')
    lang_ax.set_title('Language Distribution', fontsize=14, fontweight='bold')
    lang_fig.tight_layout()

    # 4. Top comments table
    table_fig, table_ax = plt.subplots(figsize=(14, 8))
    table_ax.axis('tight')
    table_ax.axis('off')
    top = df.nlargest(10, 'likes')[['author', 'text', 'likes', 'sentiment']].copy()
    # Author names and comment bodies are untrusted text: sanitise both.
    top['author'] = top['author'].map(_table_safe)
    top['text'] = top['text'].map(lambda value: _table_safe(value, limit=70))
    table = table_ax.table(cellText=top.values,
                           colLabels=['Author', 'Comment', 'Likes', 'Sentiment'],
                           cellLoc='left', loc='center',
                           colWidths=[0.15, 0.55, 0.1, 0.1])
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    # Row 0 is the header, so data rows start at 1; column 3 is the sentiment.
    for row_idx, sentiment in enumerate(top['sentiment'].values, start=1):
        if sentiment == 'Positive':
            table[(row_idx, 3)].set_facecolor('#90EE90')
        elif sentiment == 'Negative':
            table[(row_idx, 3)].set_facecolor('#FFB6C1')
    table_ax.set_title('Top 10 Most Engaging Comments', fontsize=16, fontweight='bold')
    table_fig.tight_layout()

    # 5. Word cloud
    cloud_fig, cloud_ax = plt.subplots(figsize=(12, 6))
    all_text = ' '.join(df['clean_text'].tolist())
    if all_text.strip():
        try:
            cloud = WordCloud(width=800, height=400, background_color='white',
                              max_words=100).generate(all_text)
            cloud_ax.imshow(cloud, interpolation='bilinear')
            cloud_ax.axis('off')
            cloud_ax.set_title('Word Cloud', fontsize=14, fontweight='bold')
        except Exception:
            # e.g. no words remain after stop-word removal.
            cloud_ax.text(0.5, 0.5, 'Could not generate', ha='center', va='center')
    cloud_fig.tight_layout()

    return pie_fig, bar_fig, lang_fig, table_fig, cloud_fig, None, None, None


def _error_outputs(message):
    """Return *message* followed by ``None`` for each of the seven plots.

    The click handler is wired to eight output components, so every return
    path of the handler must yield exactly eight values.
    """
    return (message,) + (None,) * 7


def analyze_youtube_video(video_url, progress=gr.Progress()):
    """Analyse the top comments of one video and return the UI outputs.

    Args:
        video_url: URL (or bare id) entered by the user.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        An 8-tuple: a Markdown report (or an error message), the five
        figures from ``create_visualizations``, and two ``None`` fillers.
        On any failure the seven trailing values are all ``None``.
    """
    if not video_url or not video_url.strip():
        return _error_outputs("❌ Enter a valid URL")
    video_url = video_url.strip()

    if not YOUTUBE_API_KEY:
        return _error_outputs("❌ Add GoogleAPIKey to Secrets")

    try:
        progress(0.1, desc="Getting video info...")
        analyzer = YouTubeSentimentAnalyzer()

        if not analyzer.youtube:
            return _error_outputs("❌ API failed")

        video_info = analyzer.get_video_details(video_url)
        video_title = video_info['title'] if video_info else "Unknown"

        progress(0.3, desc="Extracting top 50 comments...")
        comments, error = analyzer.extract_top_comments(video_url, max_comments=50)

        if error:
            return _error_outputs(f"❌ {error}")

        if not comments:
            return _error_outputs("❌ No comments found")

        progress(0.6, desc="Analyzing sentiment...")
        df, filtered = analyzer.process_comments(comments)

        if len(df) == 0:
            return _error_outputs("❌ No valid comments")

        # The five most-liked negative comments are quoted in the report.
        negative_comments = df[df['sentiment'] == 'Negative'].nlargest(5, 'likes')[['author', 'text', 'likes']]

        if len(negative_comments) > 0:
            parts = ["\n### 👎 Sample Negative Comments:\n"]
            for _, row in negative_comments.iterrows():
                text_preview = row['text'][:150] + '...' if len(row['text']) > 150 else row['text']
                parts.append(f"- **@{row['author']}** (👍 {row['likes']}): {text_preview}\n")
            negative_text = "".join(parts)
        else:
            negative_text = "\n### 👎 No negative comments found in top 50.\n"

        total = len(df)

        def tally(column, value):
            """Format ``count (share%)`` for rows where *column* equals *value*."""
            count = int((df[column] == value).sum())
            return f"{count} ({count / total * 100:.1f}%)"

        stats = f"""
##  Analysis Complete - Top 50 Most Relevant Comments

**Video:** {video_title}

###  Results:
- **Comments Analyzed:** {total} most relevant comments
- **Positive:** {tally('sentiment', 'Positive')}
- **Negative:** {tally('sentiment', 'Negative')}
- **Neutral:** {tally('sentiment', 'Neutral')}
- **Urdu Comments:** {tally('language', 'urdu')}
- **English Comments:** {tally('language', 'english')}

{negative_text}

---
###  Methodology:
Analyzed the **50 most relevant comments** (sorted by YouTube's relevance algorithm, which prioritizes likes and engagement). Comments with only emojis or less than 3 characters were excluded. Replies to other comments are not included.

*Urdu phrases like "Khan zinda bad" are correctly classified as Positive*
"""

        progress(0.8, desc="Creating visualizations...")
        # Only the first five slots carry figures; the rest are fillers.
        pie, bar, lang_pie, top_table, wc = create_visualizations(df)[:5]

        progress(1.0, desc="Complete!")

        return stats, pie, bar, lang_pie, top_table, wc, None, None

    except Exception as e:
        # Top-level boundary: surface the failure in the UI, do not crash.
        return _error_outputs(f"❌ Error: {str(e)}")


# Build the Gradio UI: a URL box with an Analyze button, a Markdown report
# area, and the plot panels arranged in rows.
with gr.Blocks(title="YouTube Sentiment Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎬 YouTube Comment Sentiment Analyzer
    
    **Analyzes the 50 most important comments (by relevance/likes)**
    
    """)

    with gr.Row():
        url_input = gr.Textbox(
            label="YouTube URL",
            placeholder="https://www.youtube.com/watch?v=...",
            scale=4,
        )
        analyze_btn = gr.Button(
            "🔍 Analyze",
            variant="primary",
            size="lg",
            scale=1,
        )

    stats_output = gr.Markdown("### Enter a URL above")

    with gr.Row():
        sentiment_pie = gr.Plot(label="Sentiment Distribution")
        sentiment_bar = gr.Plot(label="Sentiment Bar Chart")

    with gr.Row():
        language_pie = gr.Plot(label="Language Distribution")
        top_table = gr.Plot(label="Top 10 Comments")

    with gr.Row():
        wordcloud = gr.Plot(label="Word Cloud")

    # Two invisible plots pad the outputs to eight components, matching the
    # eight values analyze_youtube_video returns on success.
    hidden_plot_a = gr.Plot(visible=False)
    hidden_plot_b = gr.Plot(visible=False)

    analyze_btn.click(
        fn=analyze_youtube_video,
        inputs=[url_input],
        outputs=[
            stats_output,
            sentiment_pie,
            sentiment_bar,
            language_pie,
            top_table,
            wordcloud,
            hidden_plot_a,
            hidden_plot_b,
        ],
    )

# Listen on all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)