Skip to content

Commit 8ab9357

Browse files
authored
Merge pull request #46 from cuappdev/claire/articleSportTypeFix
Implement sport type extraction from article titles
2 parents aeb28ff + 5a71190 commit 8ab9357

2 files changed

Lines changed: 50 additions & 6 deletions

File tree

src/scrapers/daily_sun_scrape.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from dotenv import load_dotenv
55
from ..services import ArticleService
66
from ..utils.constants import ARTICLE_IMG_TAG
7+
from ..utils.helpers import extract_sport_type_from_title
78
import logging
89
from bs4 import BeautifulSoup
910
import base64
@@ -35,11 +36,10 @@ def fetch_news():
3536
published_at_dt = published_at_dt.replace(tzinfo=timezone.utc)
3637
published_at = published_at_dt.isoformat().replace('+00:00', 'Z')
3738

38-
if published_at_dt >= three_days_ago:
39-
sports_type = next(
40-
(tag["name"] for tag in article["tags"] if tag["name"] not in ["Sports", "Top Stories"]),
41-
"General"
42-
)
39+
if published_at >= three_days_ago:
40+
# Extract sport type from title
41+
title = article["headline"]
42+
sports_type = extract_sport_type_from_title(title)
4343
article_url = f"https://cornellsun.com/article/{article['slug']}"
4444

4545
article_image = None

src/utils/helpers.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,4 +94,48 @@ def is_cornell_loss(result: str):
9494

9595
# Common loss indicators in result strings
9696
loss_indicators = ["L", "Loss", "loss", "Defeated", "defeated"]
97-
return any(indicator in result for indicator in loss_indicators)
97+
return any(indicator in result for indicator in loss_indicators)
98+
99+
def extract_sport_type_from_title(title: str):
    """
    Extract the sport type from an article title by matching against known sports.

    Candidate names are the stripped, non-empty ``"sport"`` values of every
    entry in ``SPORT_URLS``. Matching is case-insensitive and requires the
    name to appear as a whole word/phrase, so a sport such as "Rowing" is not
    reported for a title that merely contains "growing".

    Args:
        title (str): The article title to analyze

    Returns:
        str: The sport name if found, otherwise "sports" as default
    """
    import re

    from .constants import SPORT_URLS

    if not title:
        return "sports"

    # Get all unique sport names from SPORT_URLS
    sport_names = {
        name
        for name in (sport_data["sport"].strip() for sport_data in SPORT_URLS.values())
        if name
    }

    title_lower = title.lower()

    def _mentions(phrase):
        # Lookarounds rather than \b so that phrases beginning or ending with
        # a non-word character are still anchored correctly.
        pattern = r"(?<!\w)" + re.escape(phrase.lower()) + r"(?!\w)"
        return re.search(pattern, title_lower) is not None

    # Longest first so "Swimming & Diving" wins over "Swimming"; ties are
    # broken alphabetically so the result never depends on set iteration order.
    for sport_name in sorted(sport_names, key=lambda name: (-len(name), name)):
        if _mentions(sport_name):
            return sport_name

    # Special mappings for common variations in titles
    # Only checked if no exact match found above
    # e.g., "Hockey" in title should match "Ice Hockey" in sport names
    special_mappings = {
        "hockey": "Ice Hockey",  # "Men's Hockey" or "Women's Hockey" → "Ice Hockey"
    }

    for keyword, sport_name in special_mappings.items():
        # Only map to a sport that actually exists in SPORT_URLS
        if sport_name in sport_names and _mentions(keyword):
            return sport_name

    return "sports"

0 commit comments

Comments
 (0)