empriselab.github.io/scrape_yt.py at master · empriselab/empriselab.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from bs4 import BeautifulSoup
import requests
import yaml
import os

# Channel RSS feed to scrape for video data
url = "https://www.youtube.com/feeds/videos.xml?channel_id=UC964lfWIMojN48cA6FK2Fhg"
# Bound the request so a stalled connection cannot hang the script forever,
# and fail on an HTTP error instead of parsing an error page, which would
# yield no entries and overwrite the existing video list with an empty one.
html = requests.get(url, timeout=30)
html.raise_for_status()
# NOTE(review): the feed is parsed with the "lxml" parser; the lowercase
# "yt:videoid" lookup below presumably relies on that parser lowercasing
# tag names -- confirm before switching parsers.
soup = BeautifulSoup(html.text, "lxml")

# One dict per feed <entry>, holding the keys "title" and "ytid"
vidlist = []
for entry in soup.find_all("entry"):
    video = {}
    # We want just the title and the video ID
    for title in entry.find_all("title"):
        print(title.text)
        video["title"] = title.text
    # Named video_id rather than id so the builtin id() is not shadowed
    for video_id in entry.find_all("yt:videoid"):
        print(video_id.text)
        video["ytid"] = video_id.text
    vidlist.append(video)

# Saved to vidlist yaml file that is read by the website generator
vidlist_path = os.path.join(os.path.dirname(__file__), "_data", "vidlist.yml")
# The context manager flushes and closes the file as soon as the dump is
# done, instead of leaving the handle open until garbage collection.
with open(vidlist_path, "w") as vidlist_file:
    yaml.dump(vidlist, vidlist_file)