Merge pull request #46 from cuappdev/claire/articleSportTypeFix

claiireyu · web-flow · commit 8ab93577165f · 2025-11-19T18:23:56.000-05:00
Implement sport type extraction from article titles
diff --git a/src/scrapers/daily_sun_scrape.py b/src/scrapers/daily_sun_scrape.py
@@ -4,6 +4,7 @@
 from dotenv import load_dotenv
 from ..services import ArticleService
 from ..utils.constants import ARTICLE_IMG_TAG
+from ..utils.helpers import extract_sport_type_from_title
 import logging
 from bs4 import BeautifulSoup
 import base64
@@ -35,11 +36,10 @@ def fetch_news():
             published_at_dt = published_at_dt.replace(tzinfo=timezone.utc)
             published_at = published_at_dt.isoformat().replace('+00:00', 'Z')
             
-            if published_at_dt >= three_days_ago:
-                sports_type = next(
-                    (tag["name"] for tag in article["tags"] if tag["name"] not in ["Sports", "Top Stories"]),
-                    "General"
-                )
+            if published_at >= three_days_ago:
+                # Extract sport type from title
+                title = article["headline"]
+                sports_type = extract_sport_type_from_title(title)
                 article_url = f"https://cornellsun.com/article/{article['slug']}"
 
                 article_image = None
diff --git a/src/utils/helpers.py b/src/utils/helpers.py
@@ -94,4 +94,48 @@ def is_cornell_loss(result: str):
     
     # Common loss indicators in result strings
     loss_indicators = ["L", "Loss", "loss", "Defeated", "defeated"]
-    return any(indicator in result for indicator in loss_indicators)
+    return any(indicator in result for indicator in loss_indicators)
+
+def extract_sport_type_from_title(title: str):
+    """
+    Extract the sport type from an article title by matching known sports.
+
+    Sport names come from the "sport" field of each SPORT_URLS entry and are
+    matched case-insensitively. A name must begin at a word boundary, so
+    "Rowing" is not found inside "growing"; a trailing suffix is still
+    accepted ("Golf" matches "golfers").
+
+    Args:
+        title (str): The article title to analyze
+
+    Returns:
+        str: The sport name if found, otherwise "sports" as default
+    """
+    import re
+
+    from .constants import SPORT_URLS
+
+    if not title:
+        return "sports"
+
+    def starts_a_word(phrase: str) -> bool:
+        # (?<!\w) rejects hits that begin in the middle of another word.
+        return re.search(r"(?<!\w)" + re.escape(phrase), title_lower) is not None
+
+    title_lower = title.lower()
+    names = {entry["sport"].strip() for entry in SPORT_URLS.values()} - {""}
+
+    # Longest first so "Swimming & Diving" wins over "Swimming"; ties are
+    # broken alphabetically to keep the result independent of set order.
+    for sport_name in sorted(names, key=lambda name: (-len(name), name)):
+        if starts_a_word(sport_name.lower()):
+            return sport_name
+
+    # Title shorthand, consulted only when no sport name matched directly:
+    # "Men's Hockey" / "Women's Hockey" refer to "Ice Hockey".
+    special_mappings = {"hockey": "Ice Hockey"}
+    for keyword, sport_name in special_mappings.items():
+        if sport_name in names and starts_a_word(keyword):
+            return sport_name
+
+    return "sports"