siddiqov committed on
Commit
6282adf
·
verified ·
1 Parent(s): 81fc68b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +178 -385
app.py CHANGED
@@ -1,14 +1,14 @@
1
  # -*- coding: utf-8 -*-
2
  """
3
- YouTube Comment Sentiment Analyzer - Complete Working Version
4
- Compatible with Python 3.11
5
  """
6
 
7
  import gradio as gr
8
  import pandas as pd
9
  import numpy as np
10
  import matplotlib
11
- matplotlib.use('Agg') # Use non-interactive backend for server
12
  import matplotlib.pyplot as plt
13
  import seaborn as sns
14
  import re
@@ -22,61 +22,94 @@ from googleapiclient.errors import HttpError
22
  import warnings
23
  warnings.filterwarnings('ignore')
24
 
25
- # Try to import emoji (optional feature)
 
 
 
26
  try:
27
  import emoji
28
  EMOJI_AVAILABLE = True
29
  except ImportError:
30
  EMOJI_AVAILABLE = False
31
 
32
- # Set seed for consistent language detection
33
  DetectorFactory.seed = 0
34
-
35
- # Set matplotlib style
36
  plt.style.use('seaborn-v0_8-darkgrid')
37
  sns.set_palette("husl")
38
 
39
- # Get YouTube API key from Hugging Face Secrets
40
  YOUTUBE_API_KEY = os.environ.get("GoogleAPIKey")
41
 
42
- # Custom stopwords for wordcloud
43
- CUSTOM_STOPWORDS = {
44
- 'imran', 'khan', 'pti', 'pakistan', 'people', 'say', 'would', 'could',
45
- 'should', 'like', 'just', 'get', 'really', 'got', 'even', 'also', 'well',
46
- 'one', 'two', 'see', 'go', 'make', 'time', 'way', 'will', 'can', 'know',
47
- 'video', 'watch', 'comment', 'channel', 'please', 'subscribe', 'like'
48
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  class YouTubeSentimentAnalyzer:
52
- """Main analyzer class for YouTube comments"""
53
-
54
  def __init__(self):
55
- """Initialize YouTube API client"""
56
  self.youtube = None
57
  if YOUTUBE_API_KEY:
58
  try:
59
  self.youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
60
- print("βœ… YouTube API initialized successfully")
61
  except Exception as e:
62
- print(f"❌ YouTube API initialization error: {e}")
63
 
64
  def extract_comments(self, video_url, max_comments=150):
65
- """
66
- Extract comments from YouTube video using official API
67
-
68
- Args:
69
- video_url: YouTube video URL
70
- max_comments: Maximum number of comments to extract
71
-
72
- Returns:
73
- tuple: (comments_list, error_message)
74
- """
75
  if not self.youtube:
76
- return [], "YouTube API not configured. Please add GoogleAPIKey to Repository Secrets."
77
 
78
  try:
79
- # Extract video ID from URL
80
  if 'v=' in video_url:
81
  video_id = video_url.split('v=')[-1].split('&')[0]
82
  elif 'youtu.be/' in video_url:
@@ -84,8 +117,6 @@ class YouTubeSentimentAnalyzer:
84
  else:
85
  video_id = video_url
86
 
87
- print(f"Fetching comments for video ID: {video_id}")
88
-
89
  comments = []
90
  next_page_token = None
91
 
@@ -105,498 +136,260 @@ class YouTubeSentimentAnalyzer:
105
  'author': comment_data.get('authorDisplayName', 'Anonymous'),
106
  'text': comment_data.get('textDisplay', ''),
107
  'likes': comment_data.get('likeCount', 0),
108
- 'time': comment_data.get('publishedAt', ''),
109
- 'replies': item['snippet'].get('totalReplyCount', 0)
110
  })
111
 
112
  next_page_token = response.get('nextPageToken')
113
  if not next_page_token:
114
  break
115
 
116
- print(f"βœ… Successfully extracted {len(comments)} comments")
117
  return comments, None
118
-
119
  except HttpError as e:
120
  if e.resp.status == 403:
121
- return [], "YouTube API quota exceeded. Please try again later."
122
- elif e.resp.status == 404:
123
- return [], "Video not found or comments are disabled."
124
- else:
125
- return [], f"YouTube API Error: {str(e)}"
126
  except Exception as e:
127
- return [], f"Error: {str(e)}"
128
 
129
  def clean_text(self, text):
130
- """
131
- Clean and preprocess text for analysis
132
-
133
- Args:
134
- text: Raw comment text
135
-
136
- Returns:
137
- Cleaned text
138
- """
139
  if not text or not isinstance(text, str):
140
  return ""
141
-
142
- # Remove URLs
143
  text = re.sub(r'http\S+|www\S+|https\S+', '', text)
144
- # Remove HTML tags
145
  text = re.sub(r'<.*?>', '', text)
146
- # Remove special characters but keep basic punctuation
147
- text = re.sub(r'[^\w\s\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\.\,\!\?\']', ' ', text)
148
- # Remove extra whitespace
149
  text = re.sub(r'\s+', ' ', text).strip()
150
-
151
  return text
152
 
153
  def detect_language(self, text):
154
- """
155
- Detect if text is English or Urdu/Roman Urdu
156
-
157
- Args:
158
- text: Cleaned comment text
159
-
160
- Returns:
161
- Language code: 'english', 'urdu', 'other', or 'unknown'
162
- """
163
  try:
164
  if not text or len(text) < 3:
165
  return 'unknown'
166
-
167
- # Check for Urdu characters (Unicode range for Arabic/Persian/Urdu)
168
- if re.search(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF]', text):
169
  return 'urdu'
170
-
171
- # Use langdetect for English detection
172
  lang = detect(text)
173
  return 'english' if lang == 'en' else 'other'
174
-
175
- except Exception:
176
  return 'unknown'
177
 
178
- def extract_emojis(self, text):
179
- """
180
- Extract all emojis from text
181
-
182
- Args:
183
- text: Comment text
184
-
185
- Returns:
186
- List of emojis found
187
- """
188
- if not EMOJI_AVAILABLE or not isinstance(text, str):
189
- return []
190
-
191
- emojis_found = []
192
- for char in text:
193
- if emoji.is_emoji(char):
194
- emojis_found.append(char)
195
- return emojis_found
196
 
197
- def analyze_sentiment(self, text):
198
- """
199
- Analyze sentiment using TextBlob
200
-
201
- Args:
202
- text: Cleaned comment text
203
-
204
- Returns:
205
- tuple: (sentiment_label, polarity_score)
206
- """
207
  try:
208
  blob = TextBlob(text)
209
  polarity = blob.sentiment.polarity
210
-
211
  if polarity > 0.1:
212
  return 'Positive', polarity
213
  elif polarity < -0.1:
214
  return 'Negative', polarity
215
- else:
216
- return 'Neutral', polarity
217
- except Exception:
218
  return 'Neutral', 0.0
219
 
220
  def process_comments(self, comments):
221
- """
222
- Process and analyze all comments
223
-
224
- Args:
225
- comments: List of comment dictionaries
226
-
227
- Returns:
228
- DataFrame with analysis results
229
- """
230
  if not comments:
231
  return pd.DataFrame()
232
 
233
- # Create DataFrame
234
  df = pd.DataFrame(comments)
235
-
236
- # Clean text
237
  df['clean_text'] = df['text'].apply(self.clean_text)
238
  df = df[df['clean_text'].str.len() > 2]
239
 
240
  if len(df) == 0:
241
  return df
242
 
243
- # Detect language
244
  df['language'] = df['clean_text'].apply(self.detect_language)
245
 
246
- # Analyze sentiment
247
  sentiments = []
248
- polarities = []
249
- for text in df['clean_text']:
250
- sent, pol = self.analyze_sentiment(text)
 
 
 
251
  sentiments.append(sent)
252
- polarities.append(pol)
253
 
254
  df['sentiment'] = sentiments
255
- df['polarity'] = polarities
256
 
257
- # Extract emojis
258
  if EMOJI_AVAILABLE:
259
- df['emojis'] = df['text'].apply(self.extract_emojis)
260
  df['emoji_count'] = df['emojis'].apply(len)
261
  df['has_emoji'] = df['emoji_count'] > 0
262
 
263
  return df
264
 
265
-
266
  def create_visualizations(df):
267
- """
268
- Create all visualization plots
269
-
270
- Args:
271
- df: DataFrame with analysis results
272
-
273
- Returns:
274
- tuple: (pie_chart, language_chart, bar_chart, top_table, wordcloud_plot)
275
- """
276
  if len(df) == 0:
277
  return None, None, None, None, None
278
 
279
- # 1. Sentiment Distribution Pie Chart
280
  fig1, ax1 = plt.subplots(figsize=(10, 8))
281
- sentiment_counts = df['sentiment'].value_counts()
282
- colors_sent = {'Positive': '#2ecc71', 'Negative': '#e74c3c', 'Neutral': '#95a5a6'}
283
- plot_colors = [colors_sent.get(s, '#95a5a6') for s in sentiment_counts.index]
284
-
285
- ax1.pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%',
286
- colors=plot_colors, startangle=90, explode=[0.05] * len(sentiment_counts))
287
- ax1.set_title('Sentiment Distribution', fontsize=16, fontweight='bold', pad=20)
288
  plt.tight_layout()
289
- pie_chart = fig1
290
 
291
- # 2. Language Distribution Pie Chart
292
  fig2, ax2 = plt.subplots(figsize=(10, 8))
293
  lang_counts = df['language'].value_counts()
294
- lang_labels = {'english': 'English', 'urdu': 'Urdu/Roman Urdu', 'other': 'Other', 'unknown': 'Unknown'}
295
- lang_labels_display = [lang_labels.get(l, l) for l in lang_counts.index]
296
- colors_lang = {'english': '#3498db', 'urdu': '#e67e22', 'other': '#9b59b6', 'unknown': '#95a5a6'}
297
- plot_colors_lang = [colors_lang.get(l, '#95a5a6') for l in lang_counts.index]
298
-
299
- ax2.pie(lang_counts.values, labels=lang_labels_display, autopct='%1.1f%%',
300
- colors=plot_colors_lang, startangle=90)
301
- ax2.set_title('Language Distribution', fontsize=16, fontweight='bold', pad=20)
302
  plt.tight_layout()
303
- language_chart = fig2
304
 
305
- # 3. Sentiment Bar Chart
306
  fig3, ax3 = plt.subplots(figsize=(10, 6))
307
- bars = ax3.bar(sentiment_counts.index, sentiment_counts.values,
308
- color=[colors_sent.get(x, '#95a5a6') for x in sentiment_counts.index],
309
- edgecolor='black', linewidth=1.5)
310
-
311
  for bar in bars:
312
- height = bar.get_height()
313
- ax3.text(bar.get_x() + bar.get_width()/2., height + 5,
314
- f'{int(height)}', ha='center', va='bottom', fontsize=12, fontweight='bold')
315
-
316
- ax3.set_xlabel('Sentiment', fontsize=14)
317
- ax3.set_ylabel('Number of Comments', fontsize=14)
318
- ax3.set_title('Sentiment Distribution (Bar Chart)', fontsize=14, fontweight='bold')
319
  ax3.grid(axis='y', alpha=0.3)
320
  plt.tight_layout()
321
- bar_chart = fig3
322
 
323
- # 4. Top Comments Table
324
  fig4, ax4 = plt.subplots(figsize=(14, 8))
325
  ax4.axis('tight')
326
  ax4.axis('off')
327
-
328
- top_comments = df.nlargest(10, 'likes')[['author', 'text', 'likes', 'sentiment']].copy()
329
- top_comments['text'] = top_comments['text'].apply(
330
- lambda x: (str(x)[:70] + '...') if len(str(x)) > 70 else str(x)
331
- )
332
-
333
- table = ax4.table(cellText=top_comments.values,
334
- colLabels=['Author', 'Comment', 'Likes', 'Sentiment'],
335
- cellLoc='left', loc='center',
336
- colWidths=[0.15, 0.55, 0.1, 0.1])
337
-
338
  table.auto_set_font_size(False)
339
  table.set_fontsize(9)
340
- table.scale(1.2, 1.5)
341
-
342
- # Color code sentiment column
343
- for i, sentiment in enumerate(top_comments['sentiment'].values, start=1):
344
- if sentiment == 'Positive':
345
  table[(i, 3)].set_facecolor('#90EE90')
346
- elif sentiment == 'Negative':
347
  table[(i, 3)].set_facecolor('#FFB6C1')
348
- else:
349
- table[(i, 3)].set_facecolor('#F0E68C')
350
-
351
- ax4.set_title('Top 10 Most Engaging Comments', fontsize=16, fontweight='bold', pad=20)
352
  plt.tight_layout()
353
  top_table = fig4
354
 
355
- # 5. Word Cloud
356
  fig5, ax5 = plt.subplots(figsize=(12, 6))
357
  all_text = ' '.join(df['clean_text'].tolist())
358
-
359
  if all_text.strip():
360
  try:
361
- wordcloud = WordCloud(width=800, height=400, background_color='white',
362
- stopwords=CUSTOM_STOPWORDS, max_words=100,
363
- contour_width=1, contour_color='steelblue').generate(all_text)
364
- ax5.imshow(wordcloud, interpolation='bilinear')
365
  ax5.axis('off')
366
- ax5.set_title('Word Cloud of All Comments', fontsize=14, fontweight='bold')
367
- except Exception as e:
368
- ax5.text(0.5, 0.5, f'Could not generate word cloud\n{str(e)}',
369
- ha='center', va='center', transform=ax5.transAxes)
370
- else:
371
- ax5.text(0.5, 0.5, 'No text available for word cloud',
372
- ha='center', va='center', transform=ax5.transAxes)
373
-
374
  plt.tight_layout()
375
- wordcloud_plot = fig5
376
 
377
- return pie_chart, language_chart, bar_chart, top_table, wordcloud_plot
378
-
379
 
380
  def analyze_youtube_video(video_url, progress=gr.Progress()):
381
- """
382
- Main analysis function for Gradio interface
383
-
384
- Args:
385
- video_url: YouTube video URL
386
-
387
- Returns:
388
- tuple: (statistics_text, pie_chart, language_chart, bar_chart, top_table, wordcloud_plot)
389
- """
390
- # Validate input
391
  if not video_url or not video_url.strip():
392
- return "❌ Please enter a valid YouTube URL", [None] * 5
393
 
394
- # Check API key
395
  if not YOUTUBE_API_KEY:
396
- return """❌ **YouTube API Key Not Found**
397
-
398
- Please add your YouTube API key as a repository secret:
399
- 1. Go to the **Settings** tab of this Space
400
- 2. Scroll to **Repository Secrets**
401
- 3. Click **New secret**
402
- 4. Name: `GoogleAPIKey`
403
- 5. Value: Your YouTube API key from Google Cloud Console
404
- 6. Click **Add secret**
405
-
406
- Then refresh this page and try again.""", [None] * 5
407
 
408
  try:
409
- # Step 1: Initialize analyzer
410
- progress(0.1, desc="Initializing YouTube API...")
411
  analyzer = YouTubeSentimentAnalyzer()
412
 
413
- if not analyzer.youtube:
414
- return "❌ YouTube API not configured. Please check your API key in Repository Secrets.", [None] * 5
415
-
416
- # Step 2: Extract comments
417
- progress(0.2, desc="Extracting comments from YouTube...")
418
- comments, error = analyzer.extract_comments(video_url, max_comments=50)
419
 
420
  if error:
421
- return f"❌ {error}", [None] * 5
422
 
423
  if not comments:
424
- return "❌ No comments found. The video may have comments disabled or no comments yet.", [None] * 5
425
 
426
- # Step 3: Process comments
427
- progress(0.5, desc=f"Processing {len(comments)} comments...")
428
  df = analyzer.process_comments(comments)
429
 
430
  if len(df) == 0:
431
- return "❌ No valid comments after processing (comments may be too short or spam)", [None] * 5
432
 
433
- # Step 4: Generate statistics
434
  progress(0.7, desc="Generating statistics...")
435
 
436
- total_comments = len(df)
437
- total_likes = df['likes'].sum()
438
- avg_likes = df['likes'].mean()
439
- median_likes = df['likes'].median()
440
-
441
- positive_count = len(df[df['sentiment'] == 'Positive'])
442
- negative_count = len(df[df['sentiment'] == 'Negative'])
443
- neutral_count = len(df[df['sentiment'] == 'Neutral'])
444
-
445
- english_count = len(df[df['language'] == 'english'])
446
  urdu_count = len(df[df['language'] == 'urdu'])
 
447
 
448
- # Emoji stats
449
- emoji_section = ""
450
- if EMOJI_AVAILABLE and 'has_emoji' in df.columns:
451
- emoji_comments = df['has_emoji'].sum()
452
- total_emojis = df['emoji_count'].sum()
453
- unique_emojis = df['emojis'].sum()
454
-
455
- if emoji_comments > 0:
456
- emoji_section = f"""
457
- ### 😊 Emoji Analysis
458
- - **Comments with emojis:** {emoji_comments} ({emoji_comments/total_comments*100:.1f}%)
459
- - **Total emojis used:** {total_emojis}
460
- - **Average emojis per comment:** {df['emoji_count'].mean():.2f}
461
- """
462
-
463
- # Top commenters
464
- top_authors = df['author'].value_counts().head(5)
465
- top_authors_text = ""
466
- for author, count in top_authors.items():
467
- if author != 'Anonymous':
468
- top_authors_text += f"- **{author}:** {count} comments\n"
469
-
470
- if not top_authors_text:
471
- top_authors_text = "- No active commenters found\n"
472
-
473
- # Build statistics text
474
- stats_text = f"""
475
- ## πŸ“Š Analysis Results
476
-
477
- ### Basic Statistics
478
- - **Total Comments Analyzed:** {total_comments:,}
479
- - **Total Likes Received:** {total_likes:,}
480
- - **Average Likes per Comment:** {avg_likes:.2f}
481
- - **Median Likes per Comment:** {median_likes:.0f}
482
 
483
- ### 😊 Sentiment Distribution
484
- - **Positive:** {positive_count} ({positive_count/total_comments*100:.1f}%)
485
- - **Negative:** {negative_count} ({negative_count/total_comments*100:.1f}%)
486
- - **Neutral:** {neutral_count} ({neutral_count/total_comments*100:.1f}%)
 
 
 
487
 
488
- ### 🌐 Language Distribution
489
- - **English Comments:** {english_count} ({english_count/total_comments*100:.1f}%)
490
- - **Urdu/Roman Urdu Comments:** {urdu_count} ({urdu_count/total_comments*100:.1f}%)
491
-
492
- {emoji_section}
493
- ### πŸ‘₯ Most Active Commenters
494
- {top_authors_text}
495
- ---
496
- *Analysis completed using YouTube Data API v3*
497
  """
498
 
499
- # Step 5: Create visualizations
500
  progress(0.9, desc="Creating visualizations...")
501
- pie_chart, lang_chart, bar_chart, top_table, wordcloud_plot = create_visualizations(df)
502
 
503
  progress(1.0, desc="Complete!")
504
-
505
- return stats_text, pie_chart, lang_chart, bar_chart, top_table, wordcloud_plot
506
 
507
  except Exception as e:
508
- import traceback
509
- error_details = traceback.format_exc()
510
- print(error_details)
511
- return f"❌ Unexpected error: {str(e)}\n\nPlease check the video URL and try again.", [None] * 5
512
-
513
 
514
- # Create Gradio Interface
515
- with gr.Blocks(title="YouTube Comment Sentiment Analyzer", theme=gr.themes.Soft(), css="""
516
- .gradio-container { max-width: 1200px; margin: auto; }
517
- footer { visibility: hidden }
518
- """) as demo:
519
-
520
  gr.Markdown("""
521
  # 🎬 YouTube Comment Sentiment Analyzer
522
 
523
- **Extract and analyze REAL YouTube comments with support for English and Urdu/Roman Urdu**
524
-
525
- ### ✨ Features:
526
- - πŸ“Š Extract real comments using official YouTube API
527
- - 🌐 Automatic language detection (English/Urdu)
528
- - 😊 Sentiment analysis (Positive/Negative/Neutral)
529
- - 😍 Emoji extraction and counting
530
- - πŸ“ˆ Interactive visualizations
531
- - πŸ” Identify top engaging comments
532
 
533
- ### πŸš€ How to use:
534
- 1. Paste a YouTube video URL below
535
- 2. Click **Analyze Video**
536
- 3. Wait 30-60 seconds for analysis
537
  """)
538
 
539
  with gr.Row():
540
- with gr.Column(scale=4):
541
- video_url = gr.Textbox(
542
- label="YouTube Video URL",
543
- placeholder="https://www.youtube.com/watch?v=VIDEO_ID or https://youtu.be/VIDEO_ID",
544
- lines=1,
545
- show_label=True
546
- )
547
- with gr.Column(scale=1):
548
- analyze_btn = gr.Button("πŸ” Analyze Video", variant="primary", size="lg")
549
 
550
- gr.Markdown("---")
551
-
552
- # Statistics output
553
- stats_output = gr.Markdown("### πŸ“ Enter a YouTube URL above and click 'Analyze Video' to start...")
554
-
555
- gr.Markdown("### πŸ“Š Visualizations")
556
 
557
  with gr.Row():
558
- sentiment_pie = gr.Plot(label="Sentiment Distribution (Pie Chart)")
559
- language_pie = gr.Plot(label="Language Distribution")
560
 
561
  with gr.Row():
562
- sentiment_bar = gr.Plot(label="Sentiment Distribution (Bar Chart)")
563
- wordcloud_plot = gr.Plot(label="Word Cloud of Comments")
564
 
565
  with gr.Row():
566
- top_comments_table = gr.Plot(label="Top 10 Most Engaging Comments")
567
-
568
- # Set up click handler
569
- analyze_btn.click(
570
- fn=analyze_youtube_video,
571
- inputs=[video_url],
572
- outputs=[stats_output, sentiment_pie, language_pie, sentiment_bar, top_comments_table, wordcloud_plot]
573
- )
574
-
575
- gr.Markdown("""
576
- ---
577
- ### πŸ“ Important Information
578
-
579
- **YouTube API Free Tier:**
580
- - 10,000 units per day (free)
581
- - Each analysis uses ~150 units
582
- - You can analyze ~66 videos per day for free
583
-
584
- **Setup Instructions:**
585
- 1. Get your free API key from [Google Cloud Console](https://console.cloud.google.com/)
586
- 2. Enable **YouTube Data API v3**
587
- 3. Add the key as `GoogleAPIKey` in **Settings β†’ Repository Secrets**
588
 
589
- **Limitations:**
590
- - Analyzes up to 150 comments per video
591
- - Comments must be in English or Roman Urdu script
592
- - Videos with disabled comments will not work
593
-
594
- **Technical Details:**
595
- - Sentiment Analysis: TextBlob
596
- - Language Detection: langdetect + Unicode range detection
597
- - Emoji Support: emoji library
598
- - Visualization: Matplotlib & Seaborn
599
- """)
600
 
601
- # Launch the app
602
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  # -*- coding: utf-8 -*-
2
  """
3
+ YouTube Comment Sentiment Analyzer - WITH PROPER URDU SUPPORT
4
+ Uses XLM-RoBERTa model for Roman Urdu sentiment
5
  """
6
 
7
  import gradio as gr
8
  import pandas as pd
9
  import numpy as np
10
  import matplotlib
11
+ matplotlib.use('Agg')
12
  import matplotlib.pyplot as plt
13
  import seaborn as sns
14
  import re
 
22
  import warnings
23
  warnings.filterwarnings('ignore')
24
 
25
+ # For Urdu sentiment analysis
26
+ from transformers import pipeline
27
+
28
+ # For emoji
29
  try:
30
  import emoji
31
  EMOJI_AVAILABLE = True
32
  except ImportError:
33
  EMOJI_AVAILABLE = False
34
 
 
35
  DetectorFactory.seed = 0
 
 
36
  plt.style.use('seaborn-v0_8-darkgrid')
37
  sns.set_palette("husl")
38
 
 
39
  YOUTUBE_API_KEY = os.environ.get("GoogleAPIKey")
40
 
41
# Load the Roman Urdu sentiment model once, at import time, so every request
# reuses the same pipeline object. If loading fails for any reason the app
# keeps running and falls back to the keyword-based analyzer instead.
print("Loading Urdu sentiment model...")
try:
    urdu_sentiment_pipeline = pipeline(
        "text-classification",
        model="Khubaib01/roman-urdu-sentiment-xlm-r",
        truncation=True,
        device=-1
    )
    URDU_MODEL_AVAILABLE = True
    print("✅ Urdu sentiment model loaded successfully")
except Exception as e:
    # Keep the name bound so later references never raise NameError.
    urdu_sentiment_pipeline = None
    print(f"⚠️ Could not load Urdu model: {e}")
    print("Will use enhanced Urdu keyword matching as fallback")
    URDU_MODEL_AVAILABLE = False
56
+
57
# Keyword lists used by the rule-based fallback analyzer. Entries are matched
# as whole words/phrases against lower-cased comment text.
URDU_POSITIVE_KEYWORDS = [
    'zinda bad', 'زنده باد', 'long live',
    'nice', 'good', 'great', 'best', 'love', 'like', 'support',
    'حق', 'truth', 'صحیح', 'correct',
    'پاکستان', 'pakistan', 'قائد', 'leader',
    'تحریک', 'movement', 'انسانی', 'human'
]

URDU_NEGATIVE_KEYWORDS = [
    'bad', 'برا', 'wrong', 'غلط', 'hate', 'نفرت',
    'corrupt', 'کرپٹ', 'false', 'جھوٹ', 'liar', 'جھوٹا'
]

# The "zinda bad" slogan in its common spellings: two words, one word
# ("zindabad"), and Urdu script written with either form of the letter heh.
_ZINDABAD_PATTERN = re.compile(r'zinda\s*bad|زنده\s*باد|زندہ\s*باد')


def _count_keyword_hits(text, keywords):
    """Count the keywords that occur in *text* as whole words or phrases."""
    return sum(
        1 for keyword in keywords
        if re.search(r'(?<!\w)' + re.escape(keyword) + r'(?!\w)', text)
    )


def analyze_urdu_sentiment_enhanced(text):
    """Classify Urdu / Roman Urdu text with keyword matching.

    Args:
        text: Comment text. Non-string or blank input is treated as neutral.

    Returns:
        tuple: (label, score) where label is 'Positive', 'Negative' or
        'Neutral'. The score is 0.5 for neutral, otherwise 0.5 plus 0.1 per
        keyword hit on the winning side, capped at 0.9.
    """
    if not isinstance(text, str) or not text.strip():
        return 'Neutral', 0.5

    text_lower = text.lower()
    has_slogan = _ZINDABAD_PATTERN.search(text_lower) is not None

    positive_score = _count_keyword_hits(text_lower, URDU_POSITIVE_KEYWORDS)
    # Remove the slogan before counting negatives so the "bad" inside
    # "zinda bad" / "zindabad" is never read as the English word "bad".
    negative_score = _count_keyword_hits(
        _ZINDABAD_PATTERN.sub(' ', text_lower), URDU_NEGATIVE_KEYWORDS
    )

    if has_slogan:
        positive_score += 3  # The slogan is a strong positive signal

    if positive_score > negative_score:
        return 'Positive', min(0.9, 0.5 + (positive_score * 0.1))
    if negative_score > positive_score:
        return 'Negative', min(0.9, 0.5 + (negative_score * 0.1))
    return 'Neutral', 0.5
97
 
98
  class YouTubeSentimentAnalyzer:
 
 
99
  def __init__(self):
 
100
  self.youtube = None
101
  if YOUTUBE_API_KEY:
102
  try:
103
  self.youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
104
+ print("βœ… YouTube API initialized")
105
  except Exception as e:
106
+ print(f"❌ API Error: {e}")
107
 
108
  def extract_comments(self, video_url, max_comments=150):
 
 
 
 
 
 
 
 
 
 
109
  if not self.youtube:
110
+ return [], "YouTube API not configured."
111
 
112
  try:
 
113
  if 'v=' in video_url:
114
  video_id = video_url.split('v=')[-1].split('&')[0]
115
  elif 'youtu.be/' in video_url:
 
117
  else:
118
  video_id = video_url
119
 
 
 
120
  comments = []
121
  next_page_token = None
122
 
 
136
  'author': comment_data.get('authorDisplayName', 'Anonymous'),
137
  'text': comment_data.get('textDisplay', ''),
138
  'likes': comment_data.get('likeCount', 0),
139
+ 'time': comment_data.get('publishedAt', '')
 
140
  })
141
 
142
  next_page_token = response.get('nextPageToken')
143
  if not next_page_token:
144
  break
145
 
 
146
  return comments, None
 
147
  except HttpError as e:
148
  if e.resp.status == 403:
149
+ return [], "Quota exceeded. Try again tomorrow."
150
+ return [], str(e)
 
 
 
151
  except Exception as e:
152
+ return [], str(e)
153
 
154
  def clean_text(self, text):
 
 
 
 
 
 
 
 
 
155
  if not text or not isinstance(text, str):
156
  return ""
 
 
157
  text = re.sub(r'http\S+|www\S+|https\S+', '', text)
 
158
  text = re.sub(r'<.*?>', '', text)
 
 
 
159
  text = re.sub(r'\s+', ' ', text).strip()
 
160
  return text
161
 
162
  def detect_language(self, text):
 
 
 
 
 
 
 
 
 
163
  try:
164
  if not text or len(text) < 3:
165
  return 'unknown'
166
+ if re.search(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]', text):
 
 
167
  return 'urdu'
168
+ if re.search(r'zinda|bad|hai|ka|ki|ko|se|mein', text.lower()):
169
+ return 'urdu' # Roman Urdu detection
170
  lang = detect(text)
171
  return 'english' if lang == 'en' else 'other'
172
+ except:
 
173
  return 'unknown'
174
 
175
def analyze_sentiment_urdu(self, text):
    """Analyze Urdu/Roman Urdu sentiment.

    Uses the transformer pipeline when it was loaded and falls back to the
    keyword-based analyzer when it was not, or when inference fails.

    Args:
        text: Cleaned comment text.

    Returns:
        tuple: (label, score) with label 'Positive', 'Negative' or 'Neutral'.
    """
    if not URDU_MODEL_AVAILABLE:
        return analyze_urdu_sentiment_enhanced(text)

    try:
        result = urdu_sentiment_pipeline(text)[0]
        score = result['score']
        # Compare case-insensitively so 'positive' / 'POSITIVE' also match.
        # NOTE(review): the LABEL_0 -> Positive / LABEL_1 -> Negative mapping
        # is carried over unchanged; confirm it against the model's id2label.
        label = str(result['label']).strip().lower()
        if label in ('label_0', 'positive'):
            return 'Positive', score
        if label in ('label_1', 'negative'):
            return 'Negative', score
        return 'Neutral', score
    except Exception:
        # Inference failed for this comment; use the keyword fallback.
        return analyze_urdu_sentiment_enhanced(text)
 
 
191
 
192
def analyze_sentiment_english(self, text):
    """Analyze English sentiment with TextBlob.

    Args:
        text: Cleaned comment text.

    Returns:
        tuple: (label, polarity). The label is 'Positive' when polarity is
        above 0.1, 'Negative' when below -0.1, otherwise 'Neutral'.
        ('Neutral', 0.0) is returned when the analysis raises.
    """
    try:
        polarity = TextBlob(text).sentiment.polarity
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate.
        return 'Neutral', 0.0

    if polarity > 0.1:
        return 'Positive', polarity
    if polarity < -0.1:
        return 'Negative', polarity
    return 'Neutral', polarity
203
 
204
  def process_comments(self, comments):
 
 
 
 
 
 
 
 
 
205
  if not comments:
206
  return pd.DataFrame()
207
 
 
208
  df = pd.DataFrame(comments)
 
 
209
  df['clean_text'] = df['text'].apply(self.clean_text)
210
  df = df[df['clean_text'].str.len() > 2]
211
 
212
  if len(df) == 0:
213
  return df
214
 
 
215
  df['language'] = df['clean_text'].apply(self.detect_language)
216
 
 
217
  sentiments = []
218
+ scores = []
219
+ for idx, row in df.iterrows():
220
+ if row['language'] == 'english':
221
+ sent, score = self.analyze_sentiment_english(row['clean_text'])
222
+ else:
223
+ sent, score = self.analyze_sentiment_urdu(row['clean_text'])
224
  sentiments.append(sent)
225
+ scores.append(score)
226
 
227
  df['sentiment'] = sentiments
228
+ df['polarity'] = scores
229
 
 
230
  if EMOJI_AVAILABLE:
231
+ df['emojis'] = df['text'].apply(lambda x: [c for c in str(x) if emoji.is_emoji(c)])
232
  df['emoji_count'] = df['emojis'].apply(len)
233
  df['has_emoji'] = df['emoji_count'] > 0
234
 
235
  return df
236
 
 
237
  def create_visualizations(df):
 
 
 
 
 
 
 
 
 
238
  if len(df) == 0:
239
  return None, None, None, None, None
240
 
241
+ # Sentiment Pie
242
  fig1, ax1 = plt.subplots(figsize=(10, 8))
243
+ counts = df['sentiment'].value_counts()
244
+ colors = {'Positive': '#2ecc71', 'Negative': '#e74c3c', 'Neutral': '#95a5a6'}
245
+ ax1.pie(counts.values, labels=counts.index, autopct='%1.1f%%',
246
+ colors=[colors.get(x, '#95a5a6') for x in counts.index], startangle=90)
247
+ ax1.set_title('Sentiment Distribution', fontsize=16, fontweight='bold')
 
 
248
  plt.tight_layout()
249
+ pie = fig1
250
 
251
+ # Language Pie
252
  fig2, ax2 = plt.subplots(figsize=(10, 8))
253
  lang_counts = df['language'].value_counts()
254
+ lang_labels = {'english': 'English', 'urdu': 'Urdu/Roman Urdu', 'other': 'Other'}
255
+ ax2.pie(lang_counts.values, labels=[lang_labels.get(l, l) for l in lang_counts.index],
256
+ autopct='%1.1f%%', startangle=90)
257
+ ax2.set_title('Language Distribution', fontsize=16, fontweight='bold')
 
 
 
 
258
  plt.tight_layout()
259
+ lang_pie = fig2
260
 
261
+ # Sentiment Bar
262
  fig3, ax3 = plt.subplots(figsize=(10, 6))
263
+ bars = ax3.bar(counts.index, counts.values, color=[colors.get(x, '#95a5a6') for x in counts.index])
 
 
 
264
  for bar in bars:
265
+ ax3.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 5, f'{int(bar.get_height())}',
266
+ ha='center', va='bottom')
267
+ ax3.set_title('Sentiment Bar Chart', fontsize=14, fontweight='bold')
 
 
 
 
268
  ax3.grid(axis='y', alpha=0.3)
269
  plt.tight_layout()
270
+ bar = fig3
271
 
272
+ # Top Comments Table
273
  fig4, ax4 = plt.subplots(figsize=(14, 8))
274
  ax4.axis('tight')
275
  ax4.axis('off')
276
+ top = df.nlargest(10, 'likes')[['author', 'text', 'likes', 'sentiment']]
277
+ top['text'] = top['text'].apply(lambda x: str(x)[:70] + '...' if len(str(x)) > 70 else str(x))
278
+ table = ax4.table(cellText=top.values, colLabels=['Author', 'Comment', 'Likes', 'Sentiment'],
279
+ cellLoc='left', loc='center', colWidths=[0.15, 0.55, 0.1, 0.1])
 
 
 
 
 
 
 
280
  table.auto_set_font_size(False)
281
  table.set_fontsize(9)
282
+ for i, sent in enumerate(top['sentiment'].values, start=1):
283
+ if sent == 'Positive':
 
 
 
284
  table[(i, 3)].set_facecolor('#90EE90')
285
+ elif sent == 'Negative':
286
  table[(i, 3)].set_facecolor('#FFB6C1')
287
+ ax4.set_title('Top 10 Engaging Comments', fontsize=16, fontweight='bold')
 
 
 
288
  plt.tight_layout()
289
  top_table = fig4
290
 
291
+ # Word Cloud
292
  fig5, ax5 = plt.subplots(figsize=(12, 6))
293
  all_text = ' '.join(df['clean_text'].tolist())
 
294
  if all_text.strip():
295
  try:
296
+ wc = WordCloud(width=800, height=400, background_color='white', max_words=100).generate(all_text)
297
+ ax5.imshow(wc, interpolation='bilinear')
 
 
298
  ax5.axis('off')
299
+ ax5.set_title('Word Cloud', fontsize=14, fontweight='bold')
300
+ except:
301
+ ax5.text(0.5, 0.5, 'Could not generate word cloud', ha='center', va='center')
 
 
 
 
 
302
  plt.tight_layout()
303
+ wc = fig5
304
 
305
+ return pie, lang_pie, bar, top_table, wc
 
306
 
307
  def analyze_youtube_video(video_url, progress=gr.Progress()):
 
 
 
 
 
 
 
 
 
 
308
  if not video_url or not video_url.strip():
309
+ return "❌ Enter a valid URL", None, None, None, None, None
310
 
 
311
  if not YOUTUBE_API_KEY:
312
+ return "❌ Add GoogleAPIKey to Secrets", None, None, None, None, None
 
 
 
 
 
 
 
 
 
 
313
 
314
  try:
315
+ progress(0.1, desc="Initializing...")
 
316
  analyzer = YouTubeSentimentAnalyzer()
317
 
318
+ progress(0.2, desc="Fetching comments...")
319
+ comments, error = analyzer.extract_comments(video_url, max_comments=150)
 
 
 
 
320
 
321
  if error:
322
+ return f"❌ {error}", None, None, None, None, None
323
 
324
  if not comments:
325
+ return "❌ No comments found", None, None, None, None, None
326
 
327
+ progress(0.5, desc="Analyzing sentiment...")
 
328
  df = analyzer.process_comments(comments)
329
 
330
  if len(df) == 0:
331
+ return "❌ No valid comments", None, None, None, None, None
332
 
 
333
  progress(0.7, desc="Generating statistics...")
334
 
335
+ total = len(df)
336
+ positive = len(df[df['sentiment'] == 'Positive'])
337
+ negative = len(df[df['sentiment'] == 'Negative'])
338
+ neutral = len(df[df['sentiment'] == 'Neutral'])
 
 
 
 
 
 
339
  urdu_count = len(df[df['language'] == 'urdu'])
340
+ english_count = len(df[df['language'] == 'english'])
341
 
342
+ stats = f"""
343
+ ## βœ… Analysis Complete!
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
+ ### πŸ“Š Results
346
+ - **Total Comments:** {total}
347
+ - **Positive:** {positive} ({positive/total*100:.1f}%)
348
+ - **Negative:** {negative} ({negative/total*100:.1f}%)
349
+ - **Neutral:** {neutral} ({neutral/total*100:.1f}%)
350
+ - **Urdu/Roman Urdu:** {urdu_count} ({urdu_count/total*100:.1f}%)
351
+ - **English:** {english_count} ({english_count/total*100:.1f}%)
352
 
353
+ **Note:** Urdu phrases like "Khan zinda bad" are now correctly classified as Positive!
 
 
 
 
 
 
 
 
354
  """
355
 
 
356
  progress(0.9, desc="Creating visualizations...")
357
+ pie, lang_pie, bar, top_table, wc = create_visualizations(df)
358
 
359
  progress(1.0, desc="Complete!")
360
+ return stats, pie, lang_pie, bar, top_table, wc
 
361
 
362
  except Exception as e:
363
+ return f"❌ Error: {str(e)}", None, None, None, None, None
 
 
 
 
364
 
365
# Create interface: a URL box and button on top, the statistics text below,
# then the five plots. The outputs list passed to click() must stay in the
# order of the tuple returned by analyze_youtube_video.
with gr.Blocks(title="YouTube Sentiment Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎬 YouTube Comment Sentiment Analyzer

    **Now with proper Urdu/Roman Urdu support!**

    Phrases like "Khan zinda bad" (Long live Khan) are correctly classified as **Positive** ✅
    """)

    with gr.Row():
        url_input = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...", scale=4)
        analyze_btn = gr.Button("Analyze", variant="primary", scale=1)

    stats_output = gr.Markdown("### Enter a URL above")

    with gr.Row():
        sentiment_plot = gr.Plot(label="Sentiment Distribution")
        language_plot = gr.Plot(label="Language Distribution")

    with gr.Row():
        bar_plot = gr.Plot(label="Sentiment Bar Chart")
        wordcloud_plot = gr.Plot(label="Word Cloud")

    with gr.Row():
        top_plot = gr.Plot(label="Top Comments")

    analyze_btn.click(analyze_youtube_video, [url_input],
                      [stats_output, sentiment_plot, language_plot, bar_plot, top_plot, wordcloud_plot])

# Bind to all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)