From 4e3bc48a2163e31a7ad4d35e7fa23a5093f21141 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 27 Feb 2026 14:19:59 +0100 Subject: [PATCH 1/3] fix: replace the CNN exercise with EZ population --- .../11_scraping_variants.md | 19 ++++----- .../exercises/cnn_sports_shortest_article.mjs | 40 ------------------ .../exercises/eurozone_population.mjs | 41 +++++++++++++++++++ .../exercises/test.bats | 6 +-- .../11_scraping_variants.md | 19 ++++----- .../exercises/cnn_sports_shortest_article.py | 32 --------------- .../exercises/eurozone_population.py | 28 +++++++++++++ .../exercises/test.bats | 6 +-- 8 files changed, 93 insertions(+), 98 deletions(-) delete mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs create mode 100644 sources/academy/webscraping/scraping_basics_javascript/exercises/eurozone_population.mjs delete mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py create mode 100644 sources/academy/webscraping/scraping_basics_python/exercises/eurozone_population.py diff --git a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md index 5c256f17ae..022be427ea 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md @@ -9,7 +9,7 @@ import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; import JsLlmProjectsExercise from '!!raw-loader!roa-loader!./exercises/js_llm_projects.mjs'; -import CnnSportsShortestArticleExercise from '!!raw-loader!roa-loader!./exercises/cnn_sports_shortest_article.mjs'; +import EurozonePopulationExercise from '!!raw-loader!roa-loader!./exercises/eurozone_population.mjs'; @@ 
-394,19 +394,18 @@ Your output should look something like this: {JsLlmProjectsExercise.code} -### Find the shortest CNN article which made it to the Sports homepage +### Count eurozone population from country pages -Scrape the [CNN Sports](https://edition.cnn.com/sport) homepage. For each linked article, calculate its length in characters: +Scrape the [Countries using the euro](https://european-union.europa.eu/institutions-law-budget/euro/countries-using-euro_en) page. -- Locate the element that holds the main content of the article. -- Use `.text()` to extract all the content as plain text. -- Use `.length` to calculate the character count. +1. Locate links for countries in the **Euro area countries** section. +1. Visit each linked country detail page. +1. Find the value labeled **Population**. +1. Convert it to a number and sum all country populations. -Skip pages without text (like those that only have a video). Sort the results and print the URL of the shortest article that made it to the homepage. - -At the time of writing, the shortest article on the CNN Sports homepage is [about a donation to the Augusta National Golf Club](https://edition.cnn.com/2024/10/03/sport/masters-donation-hurricane-helene-relief-spt-intl/), which is just 1,642 characters long. +Print one number – the total population of all countries using euro as their currency.
Solution - {CnnSportsShortestArticleExercise.code} + {EurozonePopulationExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs deleted file mode 100644 index c9e0bad89a..0000000000 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs +++ /dev/null @@ -1,40 +0,0 @@ -import * as cheerio from 'cheerio'; - -async function download(url) { - const response = await fetch(url); - if (!response.ok) { - throw new Error(`HTTP ${response.status}`); - } - const html = await response.text(); - return cheerio.load(html); -} - -const listingUrl = 'https://edition.cnn.com/sport'; -const $ = await download(listingUrl); - -const results = await Promise.all( - $('.layout__main .card').toArray().map(async (element) => { - const $element = $(element); - const $link = $element.find('a').first(); - if (!$link.length) { - return null; - } - - const articleUrl = new URL($link.attr('href'), listingUrl).href; - const $article = await download(articleUrl); - const content = $article('.article__content').text().trim(); - - if (!content) { - return null; - } - - return { url: articleUrl, length: content.length }; - }), -); - -const nonEmpty = results.filter((item) => item && item.length > 0); -nonEmpty.sort((a, b) => a.length - b.length); - -if (nonEmpty.length > 0) { - console.log(nonEmpty[0].url); -} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/eurozone_population.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/eurozone_population.mjs new file mode 100644 index 0000000000..1f9c26753f --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/eurozone_population.mjs @@ -0,0 +1,41 @@ +import * as cheerio from 'cheerio'; + +async function download(url) { + const response = await fetch(url); + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + const html = await 
response.text(); + return cheerio.load(html); +} + +function parsePopulation($) { + for (const element of $('li').toArray()) { + const text = $(element).text(); + if (text.includes('Population')) { + const digits = text + .replace("Population:", "") + .replaceAll(" ", ""); + return Number.parseInt(digits, 10); + } + } +} + +const listingUrl = 'https://european-union.europa.eu/institutions-law-budget/euro/countries-using-euro_en'; +const $ = await download(listingUrl); + +const $euroCountriesAccordion = $('.ecl-accordion__item').first(); +const $countryLinks = $euroCountriesAccordion.find('li a'); + +const promises = $countryLinks.toArray().map(async (element) => { + const countryUrl = new URL($(element).attr('href'), listingUrl).href; + const $country = await download(countryUrl); + return parsePopulation($country); +}); + +const populations = await Promise.all(promises); +const totalPopulation = populations + .filter((population) => Number.isInteger(population)) + .reduce((sum, population) => sum + population, 0); + +console.log(totalPopulation); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index caa3c9c75f..bb6d8cc0dc 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -144,10 +144,10 @@ teardown_file() { [[ "$output" == *' updatedOn: '* ]] } -@test "finds the shortest CNN sports article" { - run node cnn_sports_shortest_article.mjs +@test "counts total eurozone population" { + run node eurozone_population.mjs - [[ "$output" == 'https://edition.cnn.com/'* ]] + [[ "$output" -gt 300000000 ]] } @test "scrapes F1 Academy driver details with Crawlee" { diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md index 
e654ee34eb..f5135e9ddc 100644 --- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md @@ -8,7 +8,7 @@ slug: /scraping-basics-python/scraping-variants import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; import PythonJobsDatabaseExercise from '!!raw-loader!roa-loader!./exercises/python_jobs_database.py'; -import CnnSportsShortestArticleExercise from '!!raw-loader!roa-loader!./exercises/cnn_sports_shortest_article.py'; +import EurozonePopulationExercise from '!!raw-loader!roa-loader!./exercises/eurozone_population.py'; **In this lesson, we'll scrape the product detail pages to represent each product variant as a separate item in our dataset.** @@ -348,19 +348,18 @@ You can find everything you need for working with dates and times in Python's [` {PythonJobsDatabaseExercise.code} -### Find the shortest CNN article which made it to the Sports homepage +### Count eurozone population from country pages -Scrape the [CNN Sports](https://edition.cnn.com/sport) homepage. For each linked article, calculate its length in characters: +Scrape the [Countries using the euro](https://european-union.europa.eu/institutions-law-budget/euro/countries-using-euro_en) page. -- Locate the element that holds the main content of the article. -- Use [`get_text()`](https://beautiful-soup-4.readthedocs.io/en/latest/index.html#get-text) to extract all the content as plain text. -- Use `len()` to calculate the character count. +1. Locate links for countries in the **Euro area countries** section. +1. Visit each linked country detail page. +1. Find the value labeled `Population`. +1. Convert it to a number and sum all country populations. -Skip pages without text (like those that only have a video). Sort the results and print the URL of the shortest article that made it to the homepage. 
- -At the time of writing, the shortest article on the CNN Sports homepage is [about a donation to the Augusta National Golf Club](https://edition.cnn.com/2024/10/03/sport/masters-donation-hurricane-helene-relief-spt-intl/), which is just 1,642 characters long. +Print one number - the total population of all countries in the euro area.
Solution - {CnnSportsShortestArticleExercise.code} + {EurozonePopulationExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py b/sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py deleted file mode 100644 index bf8c03f07b..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py +++ /dev/null @@ -1,32 +0,0 @@ -import httpx -from bs4 import BeautifulSoup -from urllib.parse import urljoin - - -def download(url: str) -> BeautifulSoup: - response = httpx.get(url) - response.raise_for_status() - return BeautifulSoup(response.text, "html.parser") - - -listing_url = "https://edition.cnn.com/sport" -listing_soup = download(listing_url) - -results: list[tuple[int, str]] = [] -for card in listing_soup.select('.layout__main .card'): - link = card.select_one('.container__link') - if not link or 'href' not in link.attrs: - continue - - article_url = urljoin(listing_url, link['href']) - article_soup = download(article_url) - content = article_soup.select_one('.article__content') - - if not content: - continue - - results.append((len(content.get_text()), article_url)) - -results.sort() -if results: - print(results[0][1]) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/eurozone_population.py b/sources/academy/webscraping/scraping_basics_python/exercises/eurozone_population.py new file mode 100644 index 0000000000..b469dbb8b1 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/eurozone_population.py @@ -0,0 +1,28 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +def download(url: str) -> BeautifulSoup: + response = httpx.get(url) + response.raise_for_status() + return BeautifulSoup(response.text, "html.parser") + + +def parse_population(country_soup: BeautifulSoup) -> int | None: + for item in country_soup.select("li"): + if "Population" in item.text: + digits = item.text.replace("Population:", "").replace(" ", "") + 
return int(digits) + + +listing_url = "https://european-union.europa.eu/institutions-law-budget/euro/countries-using-euro_en" +listing_soup = download(listing_url) + +total_population = 0 +euro_countries_accordion = listing_soup.select(".ecl-accordion__item")[0] +for country_link in euro_countries_accordion.select("li a"): + country_url = urljoin(listing_url, country_link["href"]) + country_soup = download(country_url) + total_population += parse_population(country_soup) +print(total_population) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 9832cb0634..1a0b2844ed 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -134,10 +134,10 @@ teardown() { [[ "$output" == *"'posted_on': datetime.date("* ]] } -@test "finds the shortest CNN sports article" { - run uv run -q --with=httpx --with=beautifulsoup4 python cnn_sports_shortest_article.py +@test "counts total eurozone population" { + run uv run -q --with=httpx --with=beautifulsoup4 python eurozone_population.py - [[ "$output" == 'https://edition.cnn.com/'* ]] + [[ "$output" -gt 300000000 ]] } @test "scrapes F1 Academy driver details with Crawlee" { From 28be7a5d4b201af16fb9609ff5434939422b24f1 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 27 Feb 2026 14:22:37 +0100 Subject: [PATCH 2/3] reword --- .../scraping_basics_javascript/11_scraping_variants.md | 7 +------ .../scraping_basics_python/11_scraping_variants.md | 7 +------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md index 022be427ea..41cae1637b 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md +++ 
b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md @@ -398,12 +398,7 @@ Your output should look something like this: Scrape the [Countries using the euro](https://european-union.europa.eu/institutions-law-budget/euro/countries-using-euro_en) page. -1. Locate links for countries in the **Euro area countries** section. -1. Visit each linked country detail page. -1. Find the value labeled **Population**. -1. Convert it to a number and sum all country populations. - -Print one number – the total population of all countries using euro as their currency. +Locate links for countries in the **Euro area countries** section. Visit each linked country detail page, find the value labeled **Population**, and sum them all to get the total population of all countries using the euro as their currency. Print one number, the sum.
Solution diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md index f5135e9ddc..cb0ddc2b8c 100644 --- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md @@ -352,12 +352,7 @@ You can find everything you need for working with dates and times in Python's [` Scrape the [Countries using the euro](https://european-union.europa.eu/institutions-law-budget/euro/countries-using-euro_en) page. -1. Locate links for countries in the **Euro area countries** section. -1. Visit each linked country detail page. -1. Find the value labeled `Population`. -1. Convert it to a number and sum all country populations. - -Print one number - the total population of all countries in the euro area. +Locate links for countries in the **Euro area countries** section. Visit each linked country detail page, find the value labeled **Population**, and sum them all to get the total population of all countries using the euro as their currency. Print one number, the sum.
Solution From b4e6d5347a6b5948497f0856587488c82406f9d8 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 27 Feb 2026 14:32:44 +0100 Subject: [PATCH 3/3] make linters happy --- .../exercises/eurozone_population.mjs | 5 +++-- .../scraping_basics_python/exercises/eurozone_population.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/eurozone_population.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/eurozone_population.mjs index 1f9c26753f..8ebf980d60 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/eurozone_population.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/eurozone_population.mjs @@ -14,11 +14,12 @@ function parsePopulation($) { const text = $(element).text(); if (text.includes('Population')) { const digits = text - .replace("Population:", "") - .replaceAll(" ", ""); + .replace('Population:', '') + .replaceAll(' ', ''); return Number.parseInt(digits, 10); } } + throw new Error('Population not found'); } const listingUrl = 'https://european-union.europa.eu/institutions-law-budget/euro/countries-using-euro_en'; diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/eurozone_population.py b/sources/academy/webscraping/scraping_basics_python/exercises/eurozone_population.py index b469dbb8b1..4cc1030afa 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/eurozone_population.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/eurozone_population.py @@ -14,6 +14,7 @@ def parse_population(country_soup: BeautifulSoup) -> int | None: if "Population" in item.text: digits = item.text.replace("Population:", "").replace(" ", "") return int(digits) + raise ValueError("Population not found") listing_url = "https://european-union.europa.eu/institutions-law-budget/euro/countries-using-euro_en"