diff --git a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md index 5c256f17ae..41cae1637b 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_javascript/11_scraping_variants.md @@ -9,7 +9,7 @@ import CodeBlock from '@theme/CodeBlock'; import LegacyJsCourseAdmonition from '@site/src/components/LegacyJsCourseAdmonition'; import Exercises from '../scraping_basics/_exercises.mdx'; import JsLlmProjectsExercise from '!!raw-loader!roa-loader!./exercises/js_llm_projects.mjs'; -import CnnSportsShortestArticleExercise from '!!raw-loader!roa-loader!./exercises/cnn_sports_shortest_article.mjs'; +import EurozonePopulationExercise from '!!raw-loader!roa-loader!./exercises/eurozone_population.mjs'; @@ -394,19 +394,13 @@ Your output should look something like this: {JsLlmProjectsExercise.code} -### Find the shortest CNN article which made it to the Sports homepage +### Count eurozone population from country pages -Scrape the [CNN Sports](https://edition.cnn.com/sport) homepage. For each linked article, calculate its length in characters: +Scrape the [Countries using the euro](https://european-union.europa.eu/institutions-law-budget/euro/countries-using-euro_en) page. -- Locate the element that holds the main content of the article. -- Use `.text()` to extract all the content as plain text. -- Use `.length` to calculate the character count. - -Skip pages without text (like those that only have a video). Sort the results and print the URL of the shortest article that made it to the homepage. - -At the time of writing, the shortest article on the CNN Sports homepage is [about a donation to the Augusta National Golf Club](https://edition.cnn.com/2024/10/03/sport/masters-donation-hurricane-helene-relief-spt-intl/), which is just 1,642 characters long. 
+Locate links for countries in the **Euro area countries** section. Visit each linked country detail page, find the value labeled **Population**, and sum them all to get the total population of all countries using the euro as their currency. Print one number, the sum.
Solution - {CnnSportsShortestArticleExercise.code} + {EurozonePopulationExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs deleted file mode 100644 index c9e0bad89a..0000000000 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/cnn_sports_shortest_article.mjs +++ /dev/null @@ -1,40 +0,0 @@ -import * as cheerio from 'cheerio'; - -async function download(url) { - const response = await fetch(url); - if (!response.ok) { - throw new Error(`HTTP ${response.status}`); - } - const html = await response.text(); - return cheerio.load(html); -} - -const listingUrl = 'https://edition.cnn.com/sport'; -const $ = await download(listingUrl); - -const results = await Promise.all( - $('.layout__main .card').toArray().map(async (element) => { - const $element = $(element); - const $link = $element.find('a').first(); - if (!$link.length) { - return null; - } - - const articleUrl = new URL($link.attr('href'), listingUrl).href; - const $article = await download(articleUrl); - const content = $article('.article__content').text().trim(); - - if (!content) { - return null; - } - - return { url: articleUrl, length: content.length }; - }), -); - -const nonEmpty = results.filter((item) => item && item.length > 0); -nonEmpty.sort((a, b) => a.length - b.length); - -if (nonEmpty.length > 0) { - console.log(nonEmpty[0].url); -} diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/eurozone_population.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/eurozone_population.mjs new file mode 100644 index 0000000000..8ebf980d60 --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/eurozone_population.mjs @@ -0,0 +1,42 @@ +import * as cheerio from 'cheerio'; + +async function download(url) { + const response = await fetch(url); + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + const html = await 
response.text(); + return cheerio.load(html); +} + +function parsePopulation($) { + for (const element of $('li').toArray()) { + const text = $(element).text(); + if (text.includes('Population')) { + const digits = text + .replace('Population:', '') + .replaceAll(' ', ''); + return Number.parseInt(digits, 10); + } + } + throw new Error('Population not found'); +} + +const listingUrl = 'https://european-union.europa.eu/institutions-law-budget/euro/countries-using-euro_en'; +const $ = await download(listingUrl); + +const $euroCountriesAccordion = $('.ecl-accordion__item').first(); +const $countryLinks = $euroCountriesAccordion.find('li a'); + +const promises = $countryLinks.toArray().map(async (element) => { + const countryUrl = new URL($(element).attr('href'), listingUrl).href; + const $country = await download(countryUrl); + return parsePopulation($country); +}); + +const populations = await Promise.all(promises); +const totalPopulation = populations + .filter((population) => Number.isInteger(population)) + .reduce((sum, population) => sum + population, 0); + +console.log(totalPopulation); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index caa3c9c75f..bb6d8cc0dc 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -144,10 +144,10 @@ teardown_file() { [[ "$output" == *' updatedOn: '* ]] } -@test "finds the shortest CNN sports article" { - run node cnn_sports_shortest_article.mjs +@test "counts total eurozone population" { + run node eurozone_population.mjs - [[ "$output" == 'https://edition.cnn.com/'* ]] + [[ "$output" -gt 300000000 ]] } @test "scrapes F1 Academy driver details with Crawlee" { diff --git a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md 
b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md index e654ee34eb..cb0ddc2b8c 100644 --- a/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md +++ b/sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md @@ -8,7 +8,7 @@ slug: /scraping-basics-python/scraping-variants import CodeBlock from '@theme/CodeBlock'; import Exercises from '../scraping_basics/_exercises.mdx'; import PythonJobsDatabaseExercise from '!!raw-loader!roa-loader!./exercises/python_jobs_database.py'; -import CnnSportsShortestArticleExercise from '!!raw-loader!roa-loader!./exercises/cnn_sports_shortest_article.py'; +import EurozonePopulationExercise from '!!raw-loader!roa-loader!./exercises/eurozone_population.py'; **In this lesson, we'll scrape the product detail pages to represent each product variant as a separate item in our dataset.** @@ -348,19 +348,13 @@ You can find everything you need for working with dates and times in Python's [` {PythonJobsDatabaseExercise.code} -### Find the shortest CNN article which made it to the Sports homepage +### Count eurozone population from country pages -Scrape the [CNN Sports](https://edition.cnn.com/sport) homepage. For each linked article, calculate its length in characters: +Scrape the [Countries using the euro](https://european-union.europa.eu/institutions-law-budget/euro/countries-using-euro_en) page. -- Locate the element that holds the main content of the article. -- Use [`get_text()`](https://beautiful-soup-4.readthedocs.io/en/latest/index.html#get-text) to extract all the content as plain text. -- Use `len()` to calculate the character count. - -Skip pages without text (like those that only have a video). Sort the results and print the URL of the shortest article that made it to the homepage. 
- -At the time of writing, the shortest article on the CNN Sports homepage is [about a donation to the Augusta National Golf Club](https://edition.cnn.com/2024/10/03/sport/masters-donation-hurricane-helene-relief-spt-intl/), which is just 1,642 characters long. +Locate links for countries in the **Euro area countries** section. Visit each linked country detail page, find the value labeled **Population**, and sum them all to get the total population of all countries using the euro as their currency. Print one number, the sum.
Solution - {CnnSportsShortestArticleExercise.code} + {EurozonePopulationExercise.code}
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py b/sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py deleted file mode 100644 index bf8c03f07b..0000000000 --- a/sources/academy/webscraping/scraping_basics_python/exercises/cnn_sports_shortest_article.py +++ /dev/null @@ -1,32 +0,0 @@ -import httpx -from bs4 import BeautifulSoup -from urllib.parse import urljoin - - -def download(url: str) -> BeautifulSoup: - response = httpx.get(url) - response.raise_for_status() - return BeautifulSoup(response.text, "html.parser") - - -listing_url = "https://edition.cnn.com/sport" -listing_soup = download(listing_url) - -results: list[tuple[int, str]] = [] -for card in listing_soup.select('.layout__main .card'): - link = card.select_one('.container__link') - if not link or 'href' not in link.attrs: - continue - - article_url = urljoin(listing_url, link['href']) - article_soup = download(article_url) - content = article_soup.select_one('.article__content') - - if not content: - continue - - results.append((len(content.get_text()), article_url)) - -results.sort() -if results: - print(results[0][1]) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/eurozone_population.py b/sources/academy/webscraping/scraping_basics_python/exercises/eurozone_population.py new file mode 100644 index 0000000000..4cc1030afa --- /dev/null +++ b/sources/academy/webscraping/scraping_basics_python/exercises/eurozone_population.py @@ -0,0 +1,29 @@ +import httpx +from bs4 import BeautifulSoup +from urllib.parse import urljoin + + +def download(url: str) -> BeautifulSoup: + response = httpx.get(url) + response.raise_for_status() + return BeautifulSoup(response.text, "html.parser") + + +def parse_population(country_soup: BeautifulSoup) -> int | None: + for item in country_soup.select("li"): + if "Population" in item.text: + digits = item.text.replace("Population:", "").replace(" ", "") + 
return int(digits) + raise ValueError("Population not found") + + +listing_url = "https://european-union.europa.eu/institutions-law-budget/euro/countries-using-euro_en" +listing_soup = download(listing_url) + +total_population = 0 +euro_countries_accordion = listing_soup.select(".ecl-accordion__item")[0] +for country_link in euro_countries_accordion.select("li a"): + country_url = urljoin(listing_url, country_link["href"]) + country_soup = download(country_url) + total_population += parse_population(country_soup) +print(total_population) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 9832cb0634..1a0b2844ed 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -134,10 +134,10 @@ teardown() { [[ "$output" == *"'posted_on': datetime.date("* ]] } -@test "finds the shortest CNN sports article" { - run uv run -q --with=httpx --with=beautifulsoup4 python cnn_sports_shortest_article.py +@test "counts total eurozone population" { + run uv run -q --with=httpx --with=beautifulsoup4 python eurozone_population.py - [[ "$output" == 'https://edition.cnn.com/'* ]] + [[ "$output" -gt 300000000 ]] } @test "scrapes F1 Academy driver details with Crawlee" {