siddqamar committed on
Commit
40ec98a
·
verified ·
1 Parent(s): c2246d1

Upload scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +90 -0
scraper.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import requests
3
+ import csv
4
+ from urllib.parse import urlparse, parse_qs
5
+
6
def extract_video_id(youtube_url):
    """Return the video id embedded in a YouTube URL.

    Recognised forms:
      * ``https://youtu.be/<id>``
      * ``https://(www.|m.)youtube.com/watch?v=<id>``
      * ``https://(www.|m.)youtube.com/shorts/<id>``
      * ``https://(www.|m.)youtube.com/embed/<id>`` and ``/live/<id>``

    Args:
        youtube_url: Absolute URL of the video, including the scheme.

    Returns:
        The video id as a non-empty string.

    Raises:
        ValueError: If the host is not a recognised YouTube host or no
            non-empty id can be found in the URL.
    """
    parsed = urlparse(youtube_url)
    # urlparse lower-cases the hostname; it is None when the URL has no scheme.
    host = parsed.hostname
    # Dropping empty segments means leading, trailing or doubled slashes can
    # never produce an empty id (e.g. "/shorts/<id>/").
    segments = [part for part in parsed.path.split("/") if part]

    video_id = None
    if host in ("youtu.be", "www.youtu.be"):
        if segments:
            video_id = segments[0]
    elif host in ("www.youtube.com", "youtube.com", "m.youtube.com"):
        qs = parse_qs(parsed.query)
        if "v" in qs:
            video_id = qs["v"][0]
        else:
            # Path-style URLs carry the id in the segment after the marker.
            for marker in ("shorts", "embed", "live"):
                if marker in segments:
                    following = segments[segments.index(marker) + 1:]
                    if following:
                        video_id = following[0]
                    break

    if not video_id:
        raise ValueError("could not extract video id from url: " + youtube_url)
    return video_id
18
+
19
def get_video_title(api_key, video_id):
    """Fetch a video's title and reduce it to a filename-friendly string.

    Calls the YouTube Data API ``videos`` endpoint with ``part=snippet`` and
    keeps only alphanumeric characters, spaces, hyphens and underscores from
    the returned title.

    Args:
        api_key: YouTube Data API key sent as the ``key`` query parameter.
        video_id: Id of the video to look up.

    Returns:
        The sanitised title with trailing whitespace removed, or
        ``"youtube_comments"`` when nothing is left after sanitising.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        IndexError: If the response contains no items for ``video_id``.
        KeyError: If the response lacks the expected ``items``/``snippet``/
            ``title`` keys.
    """
    url = "https://www.googleapis.com/youtube/v3/videos"
    params = {"part": "snippet", "id": video_id, "key": api_key}
    # Without a timeout requests may wait forever on a stalled connection;
    # 30 seconds matches the other request helper in this module.
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    items = data["items"]
    if not items:
        # Same exception type an empty list would raise on indexing, but with
        # a message that tells the caller what actually went wrong.
        raise IndexError(f"no video found for id {video_id!r}")

    title = items[0]["snippet"]["title"]
    safe_title = "".join(
        c for c in title if c.isalnum() or c in (" ", "-", "_")
    ).rstrip()
    return safe_title or "youtube_comments"
28
+
29
def exponential_backoff_request(url, params, max_retries=6, backoff_base=1.5):
    """GET ``url`` and retry with growing delays on selected status codes.

    A 200 response is returned immediately. Responses with status 403, 429,
    500 or 503 are retried after ``backoff_base ** attempt + attempt * 0.5``
    seconds. Any other status is passed to ``raise_for_status()``.

    Args:
        url: Endpoint to request.
        params: Query parameters passed to ``requests.get``.
        max_retries: Maximum number of requests to send.
        backoff_base: Base of the exponential part of the delay.

    Returns:
        The ``requests.Response`` of the first request that returned 200.

    Raises:
        requests.HTTPError: For an error status that is not retried.
        RuntimeError: When no attempt returned 200 (including the case where
            ``max_retries`` is zero or negative and nothing was sent).
    """
    # Tracked separately from the response object so the final error message
    # is well-defined even when the loop body never runs.
    last_status = None
    for attempt in range(max_retries):
        resp = requests.get(url, params=params, timeout=30)
        if resp.status_code == 200:
            return resp
        last_status = resp.status_code

        # NOTE(review): 403 is retried alongside throttling/server errors;
        # presumably aimed at rate-limit style 403s - confirm that retrying
        # other 403 causes is intended.
        if last_status not in (403, 429, 500, 503):
            # Raises for 4xx/5xx; for any other non-200 status it is a no-op
            # and the next attempt is made straight away.
            resp.raise_for_status()
            continue

        # Sleeping after the final attempt would only delay the error below.
        if attempt < max_retries - 1:
            wait = (backoff_base ** attempt) + (attempt * 0.5)
            time.sleep(wait)

    raise RuntimeError(f"max retries reached for url {url} (last status {last_status})")
40
+
41
def fetch_all_comment_threads(api_key, video_id):
    """Collect every comment-thread item for a video, following pagination.

    Requests the ``commentThreads`` endpoint 100 items at a time, ordered by
    time and with plain-text bodies, and keeps following ``nextPageToken``
    until the API stops returning one.

    Args:
        api_key: YouTube Data API key sent as the ``key`` query parameter.
        video_id: Id of the video whose comment threads are fetched.

    Returns:
        A list of the raw ``items`` dictionaries from every page, in the
        order the pages were received.

    Raises:
        Whatever ``exponential_backoff_request`` raises when a page cannot be
        fetched.
    """
    endpoint = "https://www.googleapis.com/youtube/v3/commentThreads"
    params = {
        "part": "snippet",
        "videoId": video_id,
        "key": api_key,
        "maxResults": 100,
        "textFormat": "plainText",
        "order": "time",
    }

    comments = []
    while True:
        response = exponential_backoff_request(endpoint, params)
        data = response.json()
        comments.extend(data.get("items", []))

        next_token = data.get("nextPageToken")
        if not next_token:
            break

        # The first request carries no pageToken; every later one carries the
        # token returned by the page before it.
        params["pageToken"] = next_token
        # Short pause between pages to avoid hammering the API.
        time.sleep(0.1)

    return comments
74
+
75
def flatten_comment_thread(thread_item):
    """Reduce one comment-thread item to a flat dictionary.

    Args:
        thread_item: A mapping with a ``snippet`` entry that in turn contains
            ``topLevelComment`` -> ``snippet``.

    Returns:
        A dict with the keys ``text``, ``published_at``, ``like_count`` and
        ``reply_count``. The first three are ``None`` when the corresponding
        field is absent from the top-level comment; ``reply_count`` falls
        back to 0 when ``totalReplyCount`` is absent.

    Raises:
        KeyError: If ``snippet`` or ``topLevelComment`` is missing.
    """
    thread_snippet = thread_item["snippet"]
    top = thread_snippet["topLevelComment"]["snippet"]
    return {
        "text": top.get("textDisplay"),
        "published_at": top.get("publishedAt"),
        "like_count": top.get("likeCount"),
        "reply_count": thread_snippet.get("totalReplyCount", 0),
    }
84
+
85
def scrape_comments(api_key, video_url):
    """Fetch the title and flattened top-level comments for a video URL.

    Args:
        api_key: YouTube Data API key used for every request.
        video_url: URL of the video; its id is obtained with
            ``extract_video_id``.

    Returns:
        A ``(title, comments)`` tuple: the sanitised title from
        ``get_video_title`` and a list with one flattened dict per comment
        thread.

    Raises:
        ValueError: If no video id can be extracted from ``video_url``.
        Any exception raised by the title lookup or the comment fetch.
    """
    video_id = extract_video_id(video_url)
    title = get_video_title(api_key, video_id)
    raw_comments = fetch_all_comment_threads(api_key, video_id)
    flat_comments = [flatten_comment_thread(c) for c in raw_comments]
    return title, flat_comments