Spaces:
Sleeping
Sleeping
Upload scraper.py
Browse files- scraper.py +90 -0
scraper.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import requests
|
| 3 |
+
import csv
|
| 4 |
+
from urllib.parse import urlparse, parse_qs
|
| 5 |
+
|
| 6 |
+
def extract_video_id(youtube_url):
    """Return the video id embedded in a YouTube URL.

    Recognised forms:

    * ``youtu.be/<id>`` short links
    * ``youtube.com/watch?v=<id>`` (also the ``www.``, ``m.`` and ``music.``
      hosts)
    * ``youtube.com/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and
      ``/v/<id>``

    Args:
        youtube_url: Absolute URL, scheme included.

    Returns:
        The non-empty video id string.

    Raises:
        ValueError: If the host is not a recognised YouTube host or no
            non-empty id can be found in the URL.
    """
    parsed = urlparse(youtube_url)
    host = parsed.hostname
    # Dropping empty segments makes leading, trailing and doubled slashes
    # harmless ("/shorts/abc/" -> ["shorts", "abc"]).
    segments = [part for part in parsed.path.split("/") if part]
    video_id = ""

    if host in ("youtu.be", "www.youtu.be"):
        if segments:
            video_id = segments[0]
    elif host in (
        "www.youtube.com",
        "youtube.com",
        "m.youtube.com",
        "music.youtube.com",
    ):
        # The query parameter wins over any path-based id.
        # parse_qs omits blank values, so "?v=" yields no "v" key at all.
        v_values = parse_qs(parsed.query).get("v")
        if v_values:
            video_id = v_values[0]
        elif len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
            # Take the segment right after the marker, not the last one, so
            # trailing path components cannot be mistaken for the id.
            video_id = segments[1]

    if not video_id:
        raise ValueError(f"could not extract video id from url: {youtube_url}")
    return video_id
|
| 18 |
+
|
| 19 |
+
def get_video_title(api_key, video_id):
    """Fetch a video's title and reduce it to a filename-friendly string.

    Calls the YouTube Data API ``videos`` endpoint with ``part=snippet`` and
    keeps only alphanumeric characters, spaces, ``-`` and ``_`` from the
    title; trailing whitespace is stripped.

    Args:
        api_key: YouTube Data API key, sent as the ``key`` query parameter.
        video_id: Id of the video to look up.

    Returns:
        The sanitised title, or ``"youtube_comments"`` when nothing is left
        after sanitising.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        IndexError: If the response's ``items`` list is empty, i.e. no video
            matched ``video_id``.
    """
    url = "https://www.googleapis.com/youtube/v3/videos"
    params = {"part": "snippet", "id": video_id, "key": api_key}
    # requests waits indefinitely without a timeout; use the same 30 s bound
    # as the comment-thread requests elsewhere in this file.
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()

    items = resp.json()["items"]
    if not items:
        # Same exception type the bare items[0] lookup would raise, but with
        # a message that says what actually went wrong.
        raise IndexError(f"no video found for id: {video_id}")

    title = items[0]["snippet"]["title"]
    safe_title = "".join(
        c for c in title if c.isalnum() or c in (" ", "-", "_")
    ).rstrip()
    return safe_title or "youtube_comments"
|
| 28 |
+
|
| 29 |
+
def exponential_backoff_request(url, params, max_retries=6, backoff_base=1.5):
    """GET ``url`` and retry with growing delays on throttling/server errors.

    Status handling per attempt:

    * 200: the response is returned.
    * 403, 429, 500, 503: wait ``backoff_base ** attempt + attempt * 0.5``
      seconds, then try again (no wait after the final attempt).
    * any other 4xx/5xx: ``raise_for_status()`` raises immediately.
    * any other status: retried straight away without waiting.

    Args:
        url: Endpoint to request.
        params: Query parameters passed to ``requests.get``.
        max_retries: Maximum number of attempts.
        backoff_base: Base of the exponential part of the delay.

    Returns:
        The first ``requests.Response`` with status 200.

    Raises:
        requests.HTTPError: For a non-retryable 4xx/5xx status.
        RuntimeError: If no attempt succeeded, including the case
            ``max_retries <= 0`` where no request is made at all.
    """
    retryable_statuses = (403, 429, 500, 503)
    # Tracked separately from the response object so the final error message
    # is well-defined even when the loop body never runs.
    last_status = None

    for attempt in range(max_retries):
        resp = requests.get(url, params=params, timeout=30)
        if resp.status_code == 200:
            return resp

        last_status = resp.status_code
        if last_status not in retryable_statuses:
            # NOTE(review): the HTTPError message contains the full request
            # URL, query string included - confirm the API key in `params`
            # is not exposed if these errors are shown to end users.
            resp.raise_for_status()
            continue

        # Sleeping after the last attempt would only delay the error below.
        if attempt < max_retries - 1:
            time.sleep((backoff_base ** attempt) + (attempt * 0.5))

    raise RuntimeError(
        f"max retries reached for url {url} (last status {last_status})"
    )
|
| 40 |
+
|
| 41 |
+
def fetch_all_comment_threads(api_key, video_id):
    """Download every page of comment threads for a video.

    Requests the ``commentThreads`` endpoint with ``part=snippet``,
    ``maxResults=100``, ``textFormat=plainText`` and ``order=time``, following
    ``nextPageToken`` until the API stops returning one.

    Args:
        api_key: YouTube Data API key, sent as the ``key`` query parameter.
        video_id: Id of the video whose comment threads are fetched.

    Returns:
        A list with the raw ``items`` entries of all pages, in the order the
        pages were received.

    Raises:
        Whatever ``exponential_backoff_request`` raises for a failed page.
    """
    endpoint = "https://www.googleapis.com/youtube/v3/commentThreads"
    params = {
        "part": "snippet",
        "videoId": video_id,
        "key": api_key,
        "maxResults": 100,
        "textFormat": "plainText",
        "order": "time",
    }

    comments = []
    while True:
        data = exponential_backoff_request(endpoint, params).json()
        comments.extend(data.get("items", []))

        next_token = data.get("nextPageToken")
        if not next_token:
            return comments

        # The first request carries no pageToken; every later one carries the
        # token handed out by the previous page.
        params["pageToken"] = next_token
        # Brief pause between consecutive page requests.
        time.sleep(0.1)
|
| 74 |
+
|
| 75 |
+
def flatten_comment_thread(thread_item):
    """Reduce one comment-thread item to a flat dict of four fields.

    Args:
        thread_item: Mapping with a ``"snippet"`` entry that in turn holds
            ``"topLevelComment" -> "snippet"``.

    Returns:
        A dict with keys ``text``, ``published_at`` and ``like_count`` (taken
        from the top-level comment's ``textDisplay``, ``publishedAt`` and
        ``likeCount``; ``None`` when absent) and ``reply_count`` (the
        thread's ``totalReplyCount``; ``0`` when absent).

    Raises:
        KeyError: If ``"snippet"`` or the nested
            ``"topLevelComment"``/``"snippet"`` entries are missing.
    """
    thread_snippet = thread_item["snippet"]
    top_comment = thread_snippet["topLevelComment"]["snippet"]
    return {
        "text": top_comment.get("textDisplay"),
        "published_at": top_comment.get("publishedAt"),
        "like_count": top_comment.get("likeCount"),
        "reply_count": thread_snippet.get("totalReplyCount", 0),
    }
|
| 84 |
+
|
| 85 |
+
def scrape_comments(api_key, video_url):
    """Collect a video's title and flattened top-level comments.

    Resolves the video id from ``video_url``, looks up the title, downloads
    all comment threads and flattens each one.

    Args:
        api_key: YouTube Data API key used for both API calls.
        video_url: URL of the video to scrape.

    Returns:
        A ``(title, comments)`` tuple: the value returned by
        ``get_video_title`` and a list with one ``flatten_comment_thread``
        result per fetched thread.

    Raises:
        Whatever the helper functions raise; nothing is caught here.
    """
    video_id = extract_video_id(video_url)
    # The title is fetched before the comments, so a failing title lookup
    # aborts before any comment pages are requested.
    title = get_video_title(api_key, video_id)
    threads = fetch_all_comment_threads(api_key, video_id)
    return title, [flatten_comment_thread(thread) for thread in threads]
|