# -*- coding: utf-8 -*-
"""
YouTube Comment Sentiment Analyzer - TOP 50 COMMENTS VERSION

Gradio app that fetches the most relevant top-level comments of a YouTube
video through the YouTube Data API, classifies each comment as Positive /
Negative / Neutral (TextBlob for English, a transformer model for everything
else) and renders summary statistics and charts.
"""
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')  # headless backend; must be selected before pyplot is imported
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
from collections import Counter
from textblob import TextBlob
from langdetect import detect, DetectorFactory
from wordcloud import WordCloud
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import warnings
warnings.filterwarnings('ignore')
from transformers import pipeline

try:
    import emoji
    EMOJI_AVAILABLE = True
except ImportError:
    EMOJI_AVAILABLE = False

# Fixed seed so langdetect returns the same language for the same text.
DetectorFactory.seed = 0
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

YOUTUBE_API_KEY = os.environ.get("GoogleAPIKey")

print("Loading Urdu sentiment model...")
try:
    urdu_sentiment_pipeline = pipeline(
        "text-classification",
        model="Khubaib01/roman-urdu-sentiment-xlm-r",
        truncation=True,
        device=-1,
    )
    URDU_MODEL_AVAILABLE = True
    print("✅ Urdu sentiment model loaded")
except Exception as e:
    print(f"⚠️ Urdu model error: {e}")
    URDU_MODEL_AVAILABLE = False

# NOTE(review): this set is not referenced anywhere in this file; it looks
# like it was meant to be passed to WordCloud(stopwords=...) - confirm.
CUSTOM_STOPWORDS = {'imran', 'khan', 'pti', 'pakistan', 'people', 'say', 'would',
                    'could', 'should', 'like', 'just', 'get', 'really', 'got',
                    'even', 'also', 'well'}

# The click handler is wired to one Markdown output plus this many plot outputs.
_N_PLOT_OUTPUTS = 7

# The commentThreads.list endpoint accepts at most this many results per page.
_API_PAGE_LIMIT = 100

_SENTIMENT_COLORS = {'Positive': '#2ecc71', 'Negative': '#e74c3c', 'Neutral': '#95a5a6'}
_DEFAULT_COLOR = '#95a5a6'

# Patterns used by clean_text / detect_language, compiled once.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+')
_TAG_RE = re.compile(r'<.*?>')
_DISALLOWED_RE = re.compile(r'[^\w\s\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\.\,\!\?\']')
_WHITESPACE_RE = re.compile(r'\s+')
_ARABIC_SCRIPT_RE = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]')


class YouTubeSentimentAnalyzer:
    """Fetches YouTube comments and labels them with a sentiment.

    ``self.youtube`` is the API client, or ``None`` when no API key is set or
    the client could not be built; methods that need the API check for that.
    """

    def __init__(self):
        self.youtube = None
        if YOUTUBE_API_KEY:
            try:
                self.youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
                print("✅ YouTube API initialized")
            except Exception as e:
                print(f"❌ API Error: {e}")

    @staticmethod
    def _extract_video_id(video_url):
        """Return the video id contained in *video_url*.

        Handles ``...watch?v=ID``, ``youtu.be/ID``, ``/shorts/ID``,
        ``/embed/ID`` and ``/live/ID``; anything else is assumed to already
        be a bare video id and is returned stripped of surrounding whitespace.
        """
        url = video_url.strip()
        if 'v=' in url:
            return re.split(r'[&#]', url.split('v=')[-1])[0]
        for marker in ('youtu.be/', '/shorts/', '/embed/', '/live/'):
            if marker in url:
                return re.split(r'[?&/#]', url.split(marker)[-1])[0]
        return url

    def get_video_details(self, video_url):
        """Return ``{'title', 'total_comments'}`` for the video, or ``None``.

        ``None`` is returned when the API client is not configured, the video
        is not found, or the request fails for any reason (best effort: the
        caller falls back to placeholder values).
        """
        if not self.youtube:
            return None
        try:
            video_id = self._extract_video_id(video_url)
            response = self.youtube.videos().list(
                part='statistics,snippet', id=video_id
            ).execute()
            items = response['items']
            if not items:
                return None
            stats = items[0]['statistics']
            snippet = items[0]['snippet']
            return {
                'title': snippet.get('title', 'N/A'),
                'total_comments': int(stats.get('commentCount', 0)),
            }
        except Exception:
            return None

    def extract_top_comments(self, video_url, max_comments=50):
        """Fetch up to *max_comments* top-level comments in relevance order.

        Requests are paginated because a single API call returns at most
        100 threads.

        Returns:
            ``(comments, error)`` where *comments* is a list of dicts with the
            keys ``author``, ``text``, ``likes`` and ``time``, and *error* is
            ``None`` on success or a message string on failure (in which case
            *comments* is empty).
        """
        if not self.youtube:
            return [], "YouTube API not configured."
        try:
            video_id = self._extract_video_id(video_url)
            comments = []
            page_token = None
            while len(comments) < max_comments:
                params = {
                    'part': 'snippet',
                    'videoId': video_id,
                    'maxResults': min(_API_PAGE_LIMIT, max_comments - len(comments)),
                    'order': 'relevance',
                    'textFormat': 'plainText',
                }
                if page_token:
                    params['pageToken'] = page_token
                response = self.youtube.commentThreads().list(**params).execute()
                for item in response['items']:
                    comment_data = item['snippet']['topLevelComment']['snippet']
                    comments.append({
                        'author': comment_data.get('authorDisplayName', 'Anonymous'),
                        'text': comment_data.get('textDisplay', ''),
                        'likes': comment_data.get('likeCount', 0),
                        'time': comment_data.get('publishedAt', ''),
                    })
                page_token = response.get('nextPageToken')
                if not page_token:
                    break
            return comments[:max_comments], None
        except Exception as e:
            return [], str(e)

    def clean_text(self, text):
        """Strip URLs, HTML tags and disallowed symbols from *text*.

        Word characters, whitespace, the Arabic-script Unicode blocks listed
        in the pattern and ``. , ! ? '`` are kept; everything else becomes a
        space, then runs of whitespace are collapsed. Non-string or empty
        input yields ``""``.
        """
        if not text or not isinstance(text, str):
            return ""
        text = _URL_RE.sub('', text)
        text = _TAG_RE.sub('', text)
        text = _DISALLOWED_RE.sub(' ', text)
        return _WHITESPACE_RE.sub(' ', text).strip()

    def detect_language(self, text):
        """Classify *text* as ``'urdu'``, ``'english'``, ``'other'`` or ``'unknown'``.

        Any Arabic-script character makes the text ``'urdu'``; otherwise
        langdetect decides between English and everything else. Texts shorter
        than 3 characters, and detection failures, give ``'unknown'``.
        """
        try:
            if not text or len(text) < 3:
                return 'unknown'
            if _ARABIC_SCRIPT_RE.search(text):
                return 'urdu'
            return 'english' if detect(text) == 'en' else 'other'
        except Exception:
            return 'unknown'

    def extract_emojis(self, text):
        """Return the characters of *text* that ``emoji.is_emoji`` accepts.

        Returns an empty list when the ``emoji`` package is unavailable or
        *text* is not a string.
        """
        if not EMOJI_AVAILABLE or not isinstance(text, str):
            return []
        return [char for char in text if emoji.is_emoji(char)]

    def analyze_sentiment_english(self, text):
        """Return ``(label, polarity)`` from TextBlob.

        Polarity above 0.1 is Positive, below -0.1 is Negative, otherwise
        Neutral. Any TextBlob failure yields ``('Neutral', 0.0)``.
        """
        try:
            polarity = TextBlob(text).sentiment.polarity
        except Exception:
            return 'Neutral', 0.0
        if polarity > 0.1:
            return 'Positive', polarity
        if polarity < -0.1:
            return 'Negative', polarity
        return 'Neutral', polarity

    def analyze_sentiment_urdu(self, text):
        """Return ``(label, score)`` from the transformer pipeline.

        *score* is the ``score`` field of the pipeline result, not a signed
        polarity. Falls back to ``('Neutral', 0.5)`` when the model is not
        loaded or inference fails.
        """
        if not URDU_MODEL_AVAILABLE:
            return 'Neutral', 0.5
        try:
            result = urdu_sentiment_pipeline(text)[0]
            label = result['label']
            score = result['score']
        except Exception:
            return 'Neutral', 0.5
        # NOTE(review): assumes LABEL_0 is positive and LABEL_1 is negative;
        # confirm against the model's id2label configuration.
        if label in ('LABEL_0', 'Positive'):
            return 'Positive', score
        if label in ('LABEL_1', 'Negative'):
            return 'Negative', score
        return 'Neutral', score

    def process_comments(self, comments):
        """Clean, language-tag and score *comments*.

        Args:
            comments: list of dicts as produced by ``extract_top_comments``.

        Returns:
            ``(df, filtered_out)``. *df* has the input columns plus
            ``clean_text``, ``text_for_wc``, ``language``, ``sentiment`` and
            ``polarity`` (and ``emojis``, ``emoji_count``, ``has_emoji`` when
            the ``emoji`` package is available). *filtered_out* is the number
            of comments dropped because their cleaned text was shorter than
            3 characters.
        """
        if not comments:
            return pd.DataFrame(), 0

        df = pd.DataFrame(comments)
        df['clean_text'] = df['text'].apply(self.clean_text)
        df['text_for_wc'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))

        before_filter = len(df)
        # .copy() so the column assignments below write to an independent
        # frame instead of a slice of the unfiltered one.
        df = df[df['clean_text'].str.len() > 2].copy()
        filtered_out = before_filter - len(df)
        if len(df) == 0:
            return df, filtered_out

        df['language'] = df['clean_text'].apply(self.detect_language)

        # English goes to TextBlob; every other language tag (urdu, other,
        # unknown) goes to the transformer model.
        results = [
            self.analyze_sentiment_english(text) if language == 'english'
            else self.analyze_sentiment_urdu(text)
            for text, language in zip(df['clean_text'], df['language'])
        ]
        df['sentiment'] = [label for label, _ in results]
        df['polarity'] = [score for _, score in results]

        if EMOJI_AVAILABLE:
            df['emojis'] = df['text'].apply(self.extract_emojis)
            df['emoji_count'] = df['emojis'].apply(len)
            df['has_emoji'] = df['emoji_count'] > 0

        return df, filtered_out


def create_visualizations(df):
    """Build the dashboard figures for a processed comments frame.

    Args:
        df: frame returned by ``YouTubeSentimentAnalyzer.process_comments``;
            the columns ``sentiment``, ``language``, ``author``, ``text``,
            ``likes`` and ``clean_text`` are read.

    Returns:
        An 8-tuple ``(sentiment_pie, sentiment_bar, language_pie, top_table,
        word_cloud, None, None, None)`` of matplotlib figures; all eight
        entries are ``None`` when *df* is empty.
    """
    if len(df) == 0:
        return (None,) * 8

    # Release figures left over from the previous analysis.
    plt.close('all')

    counts = df['sentiment'].value_counts()
    palette = [_SENTIMENT_COLORS.get(s, _DEFAULT_COLOR) for s in counts.index]

    # 1. Sentiment Pie
    pie_fig, pie_ax = plt.subplots(figsize=(10, 8))
    pie_ax.pie(counts.values, labels=counts.index, autopct='%1.1f%%',
               colors=palette, startangle=90, explode=[0.05] * len(counts))
    pie_ax.set_title('Sentiment Distribution (Top 50 Comments)',
                     fontsize=16, fontweight='bold')
    pie_fig.tight_layout()

    # 2. Sentiment Bar
    bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
    bars = bar_ax.bar(counts.index, counts.values, color=palette)
    # Padding is in points, so the label sits just above each bar regardless
    # of how small the counts are.
    bar_ax.bar_label(bars, fmt='%d', padding=3)
    bar_ax.margins(y=0.1)
    bar_ax.set_title('Sentiment Bar Chart', fontsize=14, fontweight='bold')
    bar_ax.grid(axis='y', alpha=0.3)
    bar_fig.tight_layout()

    # 3. Language Pie
    lang_fig, lang_ax = plt.subplots(figsize=(10, 8))
    lang_counts = df['language'].value_counts()
    lang_labels = {'english': 'English', 'urdu': 'Urdu', 'other': 'Other'}
    lang_ax.pie(lang_counts.values,
                labels=[lang_labels.get(name, name) for name in lang_counts.index],
                autopct='%1.1f%%')
    lang_ax.set_title('Language Distribution', fontsize=14, fontweight='bold')
    lang_fig.tight_layout()

    # 4. Top Comments Table
    table_fig, table_ax = plt.subplots(figsize=(14, 8))
    table_ax.axis('tight')
    table_ax.axis('off')
    top = df.nlargest(10, 'likes')[['author', 'text', 'likes', 'sentiment']].copy()
    top['text'] = top['text'].apply(
        lambda x: str(x)[:70] + '...' if len(str(x)) > 70 else str(x)
    )
    table = table_ax.table(cellText=top.values,
                           colLabels=['Author', 'Comment', 'Likes', 'Sentiment'],
                           cellLoc='left', loc='center',
                           colWidths=[0.15, 0.55, 0.1, 0.1])
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    # Row 0 is the header, so data rows start at 1; column 3 is Sentiment.
    for row_idx, sentiment in enumerate(top['sentiment'].values, start=1):
        if sentiment == 'Positive':
            table[(row_idx, 3)].set_facecolor('#90EE90')
        elif sentiment == 'Negative':
            table[(row_idx, 3)].set_facecolor('#FFB6C1')
    table_ax.set_title('Top 10 Most Engaging Comments', fontsize=16, fontweight='bold')
    table_fig.tight_layout()

    # 5. Word Cloud
    cloud_fig, cloud_ax = plt.subplots(figsize=(12, 6))
    all_text = ' '.join(df['clean_text'].tolist())
    if all_text.strip():
        try:
            cloud = WordCloud(width=800, height=400, background_color='white',
                              max_words=100).generate(all_text)
            cloud_ax.imshow(cloud, interpolation='bilinear')
            cloud_ax.axis('off')
            cloud_ax.set_title('Word Cloud', fontsize=14, fontweight='bold')
        except Exception:
            cloud_ax.text(0.5, 0.5, 'Could not generate', ha='center', va='center')
    cloud_fig.tight_layout()

    return pie_fig, bar_fig, lang_fig, table_fig, cloud_fig, None, None, None


def _failure(message):
    """Return a handler result that shows *message* and clears every plot."""
    return (message,) + (None,) * _N_PLOT_OUTPUTS


def _format_negative_samples(df):
    """Return the Markdown section listing up to 5 most-liked negative comments."""
    negative = df[df['sentiment'] == 'Negative'].nlargest(5, 'likes')
    if len(negative) == 0:
        return "\n### 👎 No negative comments found in top 50.\n"
    section = "\n### 👎 Sample Negative Comments:\n"
    for author, text, likes in zip(negative['author'], negative['text'], negative['likes']):
        preview = text[:150] + '...' if len(text) > 150 else text
        section += f"- **@{author}** (👍 {likes}): {preview}\n"
    return section


def analyze_youtube_video(video_url, progress=gr.Progress()):
    """Gradio click handler: analyze a video's top comments.

    Args:
        video_url: YouTube URL (or bare video id) typed by the user.
        progress: Gradio progress tracker.

    Returns:
        An 8-tuple matching the handler's outputs: the Markdown report
        followed by seven plot values (five figures and two ``None``
        placeholders). On any failure the first element is an error message
        and all plot values are ``None``.
    """
    if not video_url or not video_url.strip():
        return _failure("❌ Enter a valid URL")
    if not YOUTUBE_API_KEY:
        return _failure("❌ Add GoogleAPIKey to Secrets")

    try:
        progress(0.1, desc="Getting video info...")
        analyzer = YouTubeSentimentAnalyzer()
        if not analyzer.youtube:
            return _failure("❌ API failed")

        video_info = analyzer.get_video_details(video_url)
        video_title = video_info['title'] if video_info else "Unknown"

        progress(0.3, desc="Extracting top 50 comments...")
        comments, error = analyzer.extract_top_comments(video_url, max_comments=50)
        if error:
            return _failure(f"❌ {error}")
        if not comments:
            return _failure("❌ No comments found")

        progress(0.6, desc="Analyzing sentiment...")
        df, _ = analyzer.process_comments(comments)
        if len(df) == 0:
            return _failure("❌ No valid comments")

        total = len(df)

        def share(column, value):
            """Format 'count (percent%)' of rows where *column* equals *value*."""
            count = int((df[column] == value).sum())
            return f"{count} ({count / total * 100:.1f}%)"

        # Assembled line by line so no source indentation leaks into the
        # Markdown (indented lines would render as a code block).
        stats = "\n".join([
            "## Analysis Complete - Top 50 Most Relevant Comments",
            "",
            f"**Video:** {video_title}",
            "",
            "### Results:",
            f"- **Comments Analyzed:** {total} most relevant comments",
            f"- **Positive:** {share('sentiment', 'Positive')}",
            f"- **Negative:** {share('sentiment', 'Negative')}",
            f"- **Neutral:** {share('sentiment', 'Neutral')}",
            f"- **Urdu Comments:** {share('language', 'urdu')}",
            f"- **English Comments:** {share('language', 'english')}",
            _format_negative_samples(df),
            "---",
            "### Methodology:",
            "Analyzed the **50 most relevant comments** (sorted by YouTube's "
            "relevance algorithm, which prioritizes likes and engagement). "
            "Comments with only emojis or less than 3 characters were excluded. "
            "Replies to other comments are not included.",
            "",
            "*Urdu phrases like \"Khan zinda bad\" are correctly classified as Positive*",
        ])

        progress(0.8, desc="Creating visualizations...")
        pie, bar, lang_pie, top_comments, cloud, _, _, _ = create_visualizations(df)

        progress(1.0, desc="Complete!")
        return stats, pie, bar, lang_pie, top_comments, cloud, None, None
    except Exception as e:
        return _failure(f"❌ Error: {str(e)}")


# Create Interface
with gr.Blocks(title="YouTube Sentiment Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# 🎬 YouTube Comment Sentiment Analyzer\n"
        "**Analyzes the 50 most important comments (by relevance/likes)**"
    )

    with gr.Row():
        url_input = gr.Textbox(label="YouTube URL",
                               placeholder="https://www.youtube.com/watch?v=...",
                               scale=4)
        analyze_btn = gr.Button("🔍 Analyze", variant="primary", size="lg", scale=1)

    stats_output = gr.Markdown("### Enter a URL above")

    with gr.Row():
        sentiment_pie = gr.Plot(label="Sentiment Distribution")
        sentiment_bar = gr.Plot(label="Sentiment Bar Chart")

    with gr.Row():
        language_pie = gr.Plot(label="Language Distribution")
        top_table = gr.Plot(label="Top 10 Comments")

    with gr.Row():
        wordcloud = gr.Plot(label="Word Cloud")

    # Two hidden placeholders keep the handler's 8-value return shape.
    hidden_plot_a = gr.Plot(visible=False)
    hidden_plot_b = gr.Plot(visible=False)

    analyze_btn.click(
        analyze_youtube_video,
        [url_input],
        [stats_output, sentiment_pie, sentiment_bar, language_pie, top_table,
         wordcloud, hidden_plot_a, hidden_plot_b],
    )

demo.launch(server_name="0.0.0.0", server_port=7860)