Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ run:
@echo "==============================================================================="
@echo "Running solution ..."
@echo "==============================================================================="
make docker-run
make python-fetch-data
make python-build-dataset

# destroy all Docker build and local artifacts
# takes around 1 minute to complete
Expand All @@ -61,7 +62,6 @@ tear-down:
@echo "Tearing down solution ..."
@echo "==============================================================================="
make python-clean
make docker-prune

pre-commit-init:
@echo "==============================================================================="
Expand Down Expand Up @@ -140,7 +140,6 @@ python-requirements:
pip install pip==25.3 setuptools wheel pip-tools
pip-compile requirements/in/base.in -o requirements/base.txt
pip-compile requirements/in/local.in -o requirements/local.txt
pip-compile requirements/in/docker.in -o requirements/docker.txt

python-fetch-data:
@echo "==============================================================================="
Expand All @@ -150,7 +149,7 @@ python-fetch-data:

python-build-dataset:
@echo "==============================================================================="
@echo "Building dataset from fetched data ..."
@echo "Building enriched Netflix dataset from fetched data ..."
@echo "==============================================================================="
$(ACTIVATE_VENV) && python -m netflix.fetch.dataset

Expand All @@ -177,4 +176,6 @@ help:
@echo 'python-lint - Run Python linting using pre-commit and pylint'
@echo 'python-clean - Destroy the Python virtual environment and remove __pycache__ directories'
@echo 'python-requirements - Compile and update Python dependency files'
@echo 'python-fetch-data - Fetch data from external APIs and save to local files'
@echo 'python-build-dataset - Build enriched Netflix dataset from fetched data'
@echo '===================================================================='
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,41 @@ Netflix AI Greenlight Challenge: Can Data Science Predict the Next Hit Drama?

## Quickstart

Install required system packages for your operating system:

- [Windows](./setup/windows/setup.ps1)
- [macOS](./setup/macos/setup.sh)
- [Linux](./setup/linux/setup.sh)

Initialize your environment. This includes creating and activating a Python virtual
environment, and then downloading data files for Netflix, IMDb and The Movie
Database (TMDB). The final dataset will be located at `./netflix/db/netflix_enriched_dataset.csv`.

**The setup process takes between 5 and 15 minutes, depending on your computer
and your Internet connection.**

```console
make python-init
make run
```

Other helpful commands:

```console
source venv/bin/activate
which python3
which pip3
python --version # you should see Python 3.13.x
pip --version # you should see pip 25.3.x
```

## Completely Remove This Project

```console
make tear-down
deactivate
```

Set up your [Kaggle API Key](./docs/KAGGLE.md)

## Support
Expand Down
6 changes: 6 additions & 0 deletions changelogs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/) and this p



## [0.1.1-alpha.1](https://github.com/FullStackWithLawrence/netflix-writers/compare/v0.1.0...v0.1.1-alpha.1) (2026-06-16)

### Bug Fixes

* add os-specific setup scripts ([8d39cb0](https://github.com/FullStackWithLawrence/netflix-writers/commit/8d39cb04c762e5f3c096e5ac7b8be2a9f1d4b603))

## [0.1.0](https://github.com/FullStackWithLawrence/netflix-writers/compare/v0.0.1...v0.1.0) (2026-06-16)

### Features
Expand Down
2 changes: 1 addition & 1 deletion netflix/__version__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# DO NOT EDIT.
# Managed via automated CI/CD in .github/workflows/semanticVersionBump.yml.
__version__ = "0.1.0"
__version__ = "0.1.1-alpha.1"

__all__ = ["__version__"]
28 changes: 18 additions & 10 deletions netflix/fetch/fetch_imdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- title.ratings.csv
"""

import logging
import os
from pathlib import Path

Expand All @@ -22,6 +23,13 @@
IMDB_DIR = os.path.join(DB_DIR, "imdb")
IMDB_TITLE_TYPES = ["movie", "tvSeries"]

logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)


def export_titles(con: duckdb.DuckDBPyConnection) -> None:
"""
Expand All @@ -39,7 +47,7 @@ def export_titles(con: duckdb.DuckDBPyConnection) -> None:
TO '{output_path}'
(FORMAT csv, HEADER true)
""")
print(f"Exported title_basics to {output_path}.")
logger.info("Exported title_basics to %s.", output_path)


def export_ratings(con: duckdb.DuckDBPyConnection) -> None:
Expand All @@ -58,7 +66,7 @@ def export_ratings(con: duckdb.DuckDBPyConnection) -> None:
TO '{output_path}'
(FORMAT csv, HEADER true)
""")
print(f"Exported title_ratings to {output_path}.")
logger.info("Exported title_ratings to %s.", output_path)


def build_titles_table(con: duckdb.DuckDBPyConnection, filename: Path) -> None:
Expand All @@ -72,7 +80,7 @@ def build_titles_table(con: duckdb.DuckDBPyConnection, filename: Path) -> None:
.. returns:
None
"""
print(f"Building title_basics table from {filename}...")
logger.info("Building title_basics table from %s...", filename)
title_types = ", ".join(f"'{t}'" for t in IMDB_TITLE_TYPES)
con.execute(
f"""
Expand All @@ -85,7 +93,7 @@ def build_titles_table(con: duckdb.DuckDBPyConnection, filename: Path) -> None:
""",
[str(filename)],
)
print("built title_basics table.")
logger.info("Built title_basics table.")


def build_ratings_table(con: duckdb.DuckDBPyConnection, filename: Path) -> None:
Expand All @@ -99,7 +107,7 @@ def build_ratings_table(con: duckdb.DuckDBPyConnection, filename: Path) -> None:
.. returns:
None
"""
print(f"Building title_ratings table from {filename}...")
logger.info("Building title_ratings table from %s...", filename)
con.execute(
"""
CREATE OR REPLACE TABLE title_ratings AS
Expand All @@ -110,7 +118,7 @@ def build_ratings_table(con: duckdb.DuckDBPyConnection, filename: Path) -> None:
""",
[str(filename)],
)
print("built title_ratings table.")
logger.info("Built title_ratings table.")


def fetch_title_basics(with_cleanup: bool = False) -> None:
Expand Down Expand Up @@ -140,8 +148,8 @@ def fetch_title_basics(with_cleanup: bool = False) -> None:
if with_cleanup:
cleanup(titles_file)

print("Generated titles.basics.csv")
print("-" * 40)
logger.info("Generated titles.basics.csv")
logger.info("-" * 40)


def fetch_title_ratings(with_cleanup: bool = False) -> None:
Expand Down Expand Up @@ -171,8 +179,8 @@ def fetch_title_ratings(with_cleanup: bool = False) -> None:
if with_cleanup:
cleanup(ratings_file)

print("Generated title.ratings.csv")
print("-" * 40)
logger.info("Generated title.ratings.csv")
logger.info("-" * 40)


def main() -> None:
Expand Down
11 changes: 9 additions & 2 deletions netflix/fetch/fetch_kaggle_netflix.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
https://www.kaggle.com/datasets/dhruvildave/netflix-top-10-tv-shows-and-films
"""

import logging
import os

from kaggle.api.kaggle_api_extended import KaggleApi # type: ignore[import-untyped]
Expand All @@ -18,6 +19,12 @@
DATASET = "dhruvildave/netflix-top-10-tv-shows-and-films"

api = KaggleApi()
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)


def main():
Expand All @@ -32,8 +39,8 @@ def main():
"""
api.authenticate()
api.dataset_download_files(DATASET, path=KAGGLE_DIR, unzip=True)
print("Dataset downloaded successfully.")
print("-" * 40)
logger.info("Dataset downloaded successfully.")
logger.info("-" * 40)


if __name__ == "__main__":
Expand Down
11 changes: 9 additions & 2 deletions netflix/fetch/fetch_kaggle_tmdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
https://www.kaggle.com/datasets/asaniczka/full-tmdb-tv-shows-dataset-2023-150k-shows
"""

import logging
import os

from kaggle.api.kaggle_api_extended import KaggleApi # type: ignore[import-untyped]
Expand All @@ -17,6 +18,12 @@
DATASET = "asaniczka/full-tmdb-tv-shows-dataset-2023-150k-shows"

api = KaggleApi()
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)


def main():
Expand All @@ -31,8 +38,8 @@ def main():
"""
api.authenticate()
api.dataset_download_files(DATASET, path=KAGGLE_DIR, unzip=True)
print("Dataset downloaded successfully.")
print("-" * 40)
logger.info("Dataset downloaded successfully.")
logger.info("-" * 40)


if __name__ == "__main__":
Expand Down
11 changes: 9 additions & 2 deletions netflix/fetch/fetch_polti.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,19 @@
"""

import csv
import logging
import os
from pathlib import Path

from .const import DB_DIR

POLTI_DIR = os.path.join(DB_DIR, "polti")
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)


def write_situations_csv(output_path: Path) -> None:
Expand Down Expand Up @@ -98,13 +105,13 @@ def write_situations_csv(output_path: Path) -> None:
writer.writerow(["Number", "Situation", "Description"])
writer.writerows(rows)

print(f"CSV written to: {output_path}")
logger.info("CSV written to: %s", output_path)


def main():
path = Path(os.path.join(POLTI_DIR, "situations.csv"))
write_situations_csv(path)
print("-" * 40)
logger.info("-" * 40)


if __name__ == "__main__":
Expand Down
28 changes: 19 additions & 9 deletions netflix/fetch/lib.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
"""Utility functions for downloading and managing files."""

import json
import logging
from pathlib import Path
from typing import Any, Optional

import pandas as pd
import requests
from tqdm import tqdm

logging.basicConfig(
level=logging.INFO,
format="%(asctime)s | %(levelname)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

def fetch_url(url: str, output_dir: str | Path, timeout: int = 60) -> Path | None:

def fetch_url(url: str, output_dir: str | Path, timeout: int = 60) -> Optional[Path]:
"""
Download a file and return its local path.

Expand All @@ -26,12 +35,12 @@ def fetch_url(url: str, output_dir: str | Path, timeout: int = 60) -> Path | Non
filename = url.rsplit("/", maxsplit=1)[-1]
output_path = output_dir / filename
if output_path.exists():
print(f"File {output_path} already exists, skipping download.")
logger.info("File %s already exists, skipping download.", output_path)
return output_path

# Reuse an existing download.
if output_path.exists():
print(f"File {output_path} already exists, skipping download.")
logger.info("File %s already exists, skipping download.", output_path)
return output_path

try:
Expand All @@ -53,7 +62,7 @@ def fetch_url(url: str, output_dir: str | Path, timeout: int = 60) -> Path | Non
pbar.update(len(chunk))

except (requests.RequestException, OSError) as exc:
print(f"Failed to download {url}: {exc}")
logger.error("Failed to download %s: %s", url, exc)

# Avoid leaving behind a partial download.
output_path.unlink(missing_ok=True)
Expand All @@ -68,10 +77,10 @@ def cleanup(filename: Path) -> None:
try:
filename.unlink(missing_ok=True)
except OSError as exc:
print(f"Failed to remove {filename}: {exc}")
logger.error("Failed to remove %s: %s", filename, exc)


def safe_cast(x) -> list:
def safe_cast(x: Any) -> list[Any]:
"""
Always returns a list safely, even if input is:

Expand All @@ -96,8 +105,8 @@ def safe_cast(x) -> list:
if pd.isna(x):
return []
# pylint: disable=broad-except
except Exception:
pass
except Exception as exc:
logger.error("Failed to check NaN: %s", exc)

if isinstance(x, str):
try:
Expand All @@ -106,7 +115,8 @@ def safe_cast(x) -> list:
return [c.get("name") for c in parsed if isinstance(c, dict) and "name" in c]
return []
# pylint: disable=broad-except
except Exception:
except Exception as exc:
logger.error("Failed to parse JSON: %s", exc)
return []

return []
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "netflix-writers"
version = "0.1.0"
version = "0.1.1-alpha.1"
requires-python = ">=3.12"
description = "Netflix Writers: An AI-powered storytelling assistant for content creators."
authors = [{ name = "Lawrence McDaniel", email = "lpm0073@gmail.com" }]
Expand Down
Loading