Merge pull request #30 from pythonhealthdatascience:dev

amyheather · web-flow · commit 55b1561d67da · 2026-03-02T11:02:25.000Z
Dev
diff --git a/environment.yaml b/environment.yaml
@@ -3,6 +3,8 @@ channels:
   - conda-forge
 dependencies:
   - flake8=7.3.0
+  - ipython=9.10.0
+  - itables=2.7.0
   - lintquarto=0.7.0
   - numpy=2.4.1
   - pandas=2.3.3
diff --git a/examples/python_package/tests/test_data_mock.py b/examples/python_package/tests/test_data_mock.py
@@ -6,6 +6,7 @@
 
 from waitingtimes.patient_analysis import import_patient_data
 
+
 def test_mocking(monkeypatch):
     """Providing data to a test via mocking"""
 
diff --git a/examples/python_package/tests/test_data_real.py b/examples/python_package/tests/test_data_real.py
@@ -6,6 +6,7 @@
 
 from waitingtimes.patient_analysis import import_patient_data
 
+
 def test_real_data_file():
     """Importing a real data file to a test"""
 
diff --git a/examples/python_package/tests/test_data_temp.py b/examples/python_package/tests/test_data_temp.py
@@ -6,6 +6,7 @@
 
 from waitingtimes.patient_analysis import import_patient_data
 
+
 def test_temporary_file(tmp_path):
     """Providing data to a test via a temporary file"""
 
diff --git a/examples/python_package/tests/test_unit.py b/examples/python_package/tests/test_unit.py
@@ -8,27 +8,29 @@
 from waitingtimes.patient_analysis import import_patient_data
 
 
-def test_import_success(tmp_path):
+def test_import_success(monkeypatch):
     """Small CSV with correct columns should work."""
 
+    # Create sample patient data
     expected_cols = [
         "PATIENT_ID", "ARRIVAL_DATE", "ARRIVAL_TIME",
         "SERVICE_DATE", "SERVICE_TIME",
     ]
-
-    # Create temporary CSV file
-    df_in = pd.DataFrame(
+    testdata = pd.DataFrame(
         [["p1", "2024-01-01", "08:00", "2024-01-01", "09:00"]],
         columns=expected_cols,
     )
-    csv_path = tmp_path / "patients.csv"
-    df_in.to_csv(csv_path, index=False)
 
-    # Run function and check it looks correct
-    result = import_patient_data(csv_path)
+    # Call function (with mocking for pd.read_csv())
+    def mock_read_csv(*args, **kwargs):
+        return testdata
+    monkeypatch.setattr(pd, "read_csv", mock_read_csv)
+    result = import_patient_data("path.csv")
+
+    # Check the result looks correct
     assert isinstance(result, pd.DataFrame)
     assert list(result.columns) == expected_cols
-    pd.testing.assert_frame_equal(result, df_in)
+    pd.testing.assert_frame_equal(result, testdata)
 
 
 @pytest.mark.parametrize(
@@ -50,41 +52,44 @@ def test_import_success(tmp_path):
         ],
     ],
 )
-def test_import_errors(tmp_path, columns):
+def test_import_errors(monkeypatch, columns):
     """Incorrect columns should trigger ValueError."""
 
-    # Create temporary CSV file
-    df_in = pd.DataFrame([range(len(columns))], columns=columns)
-    csv_path = tmp_path / "patients.csv"
-    df_in.to_csv(csv_path, index=False)
+    # Create sample patient data
+    testdata = pd.DataFrame([range(len(columns))], columns=columns)
 
-    # Check it raises ValueError
+    # Call function (with mocking for pd.read_csv()), should raise an error
+    def mock_read_csv(*args, **kwargs):
+        return testdata
+    monkeypatch.setattr(pd, "read_csv", mock_read_csv)
     with pytest.raises(ValueError):
-        import_patient_data(csv_path)
+        import_patient_data("path.csv")
 
 
-def test_import_empty_csv(tmp_path):
+def test_import_empty_csv(monkeypatch):
     """Empty CSV with correct columns should succeed."""
 
+    # Create empty dataframe with the correct columns
     expected_cols = [
         "PATIENT_ID", "ARRIVAL_DATE", "ARRIVAL_TIME",
         "SERVICE_DATE", "SERVICE_TIME",
     ]
+    testdata = pd.DataFrame(columns=expected_cols)
 
-    # Create empty CSV with correct header
-    df_in = pd.DataFrame(columns=expected_cols)
-    csv_path = tmp_path / "patients.csv"
-    df_in.to_csv(csv_path, index=False)
+    # Call function (with mocking for pd.read_csv())
+    def mock_read_csv(*args, **kwargs):
+        return testdata
+    monkeypatch.setattr(pd, "read_csv", mock_read_csv)
+    result = import_patient_data("path.csv")
 
-    # Should succeed and return empty DataFrame
-    result = import_patient_data(csv_path)
+    # Should succeed and return an empty dataframe
     assert len(result) == 0
     assert list(result.columns) == expected_cols
 
 
 def test_import_path_types(tmp_path):
     """str and Path inputs should behave identically."""
-    # Create temporary CSV file
+    # Create sample patient data
     expected_cols = [
         "PATIENT_ID",
         "ARRIVAL_DATE", "ARRIVAL_TIME",
@@ -94,6 +99,9 @@ def test_import_path_types(tmp_path):
         [["p1", "2024-01-01", "08:00", "2024-01-01", "09:00"]],
         columns=expected_cols,
     )
+
+    # Create temporary file (not mocking, as this is about checking
+    # pd.read_csv is working as expected)
     csv_path = tmp_path / "patients.csv"
     df_in.to_csv(csv_path, index=False)
 
diff --git a/pages/case_study.qmd b/pages/case_study.qmd
@@ -28,6 +28,8 @@ In the example we will:
 <!-- We don't use code/patient_analysis__imports.py as we have some additional imports used for displaying things nicely -->
 
 ```{python}
+from IPython.display import HTML
+from itables import to_html_datatable
 import json
 from pathlib import Path
 
@@ -42,9 +44,14 @@ pd.set_option("display.max_columns", 8)
 
 ::: {.r-content}
 
+<!-- We don't use code/patient_analysis__imports.R as we have some additional imports used for displaying things nicely -->
+
 ```{r}
-#| file: code/patient_analysis__imports.R
 #| output: false
+library(dplyr)
+library(knitr)
+library(lubridate)
+library(readr)
 ```
 
 :::
@@ -83,7 +90,7 @@ You can download a copy of this data here:
 raw_data = import_patient_data(
    "../examples/python_package/data/patient_data.csv"
 )
-raw_data
+HTML(to_html_datatable(raw_data))
 ```
 
 :::
@@ -96,7 +103,7 @@ raw_data <- import_patient_data(
     "..", "examples", "r_package", "inst", "extdata", "patient_data.csv"
   )
 )
-raw_data
+kable(raw_data)
 ```
 
 :::
@@ -127,7 +134,7 @@ We then apply this function to the raw data.
 
 ```{python}
 processed_data = calculate_wait_times(raw_data)
-processed_data
+HTML(to_html_datatable(processed_data))
 ```
 
 :::
@@ -136,7 +143,7 @@ processed_data
 
 ```{r}
 processed_data <- calculate_wait_times(raw_data)
-processed_data
+kable(processed_data)
 ```
 
 :::
diff --git a/pages/code/test_unit__test_import_empty_csv.py b/pages/code/test_unit__test_import_empty_csv.py
@@ -1,17 +1,19 @@
-def test_import_empty_csv(tmp_path):
+def test_import_empty_csv(monkeypatch):
     """Empty CSV with correct columns should succeed."""
 
+    # Create empty dataframe with the correct columns
     expected_cols = [
         "PATIENT_ID", "ARRIVAL_DATE", "ARRIVAL_TIME",
         "SERVICE_DATE", "SERVICE_TIME",
     ]
+    testdata = pd.DataFrame(columns=expected_cols)
 
-    # Create empty CSV with correct header
-    df_in = pd.DataFrame(columns=expected_cols)
-    csv_path = tmp_path / "patients.csv"
-    df_in.to_csv(csv_path, index=False)
+    # Call function (with mocking for pd.read_csv())
+    def mock_read_csv(*args, **kwargs):
+        return testdata
+    monkeypatch.setattr(pd, "read_csv", mock_read_csv)
+    result = import_patient_data("path.csv")
 
-    # Should succeed and return empty DataFrame
-    result = import_patient_data(csv_path)
+    # Should succeed and return an empty dataframe
     assert len(result) == 0
     assert list(result.columns) == expected_cols
diff --git a/pages/code/test_unit__test_import_errors.py b/pages/code/test_unit__test_import_errors.py
@@ -17,14 +17,15 @@
         ],
     ],
 )
-def test_import_errors(tmp_path, columns):
+def test_import_errors(monkeypatch, columns):
     """Incorrect columns should trigger ValueError."""
 
-    # Create temporary CSV file
-    df_in = pd.DataFrame([range(len(columns))], columns=columns)
-    csv_path = tmp_path / "patients.csv"
-    df_in.to_csv(csv_path, index=False)
+    # Create sample patient data
+    testdata = pd.DataFrame([range(len(columns))], columns=columns)
 
-    # Check it raises ValueError
+    # Call function (with mocking for pd.read_csv()), should raise an error
+    def mock_read_csv(*args, **kwargs):
+        return testdata
+    monkeypatch.setattr(pd, "read_csv", mock_read_csv)
     with pytest.raises(ValueError):
-        import_patient_data(csv_path)
+        import_patient_data("path.csv")
diff --git a/pages/code/test_unit__test_import_path_types.py b/pages/code/test_unit__test_import_path_types.py
@@ -1,6 +1,6 @@
 def test_import_path_types(tmp_path):
     """str and Path inputs should behave identically."""
-    # Create temporary CSV file
+    # Create sample patient data
     expected_cols = [
         "PATIENT_ID",
         "ARRIVAL_DATE", "ARRIVAL_TIME",
@@ -10,6 +10,9 @@ def test_import_path_types(tmp_path):
         [["p1", "2024-01-01", "08:00", "2024-01-01", "09:00"]],
         columns=expected_cols,
     )
+
+    # Create temporary file (not mocking, as this is about checking
+    # pd.read_csv is working as expected)
     csv_path = tmp_path / "patients.csv"
     df_in.to_csv(csv_path, index=False)
 
diff --git a/pages/code/test_unit__test_import_success.py b/pages/code/test_unit__test_import_success.py
@@ -1,21 +1,23 @@
-def test_import_success(tmp_path):
+def test_import_success(monkeypatch):
     """Small CSV with correct columns should work."""
 
+    # Create sample patient data
     expected_cols = [
         "PATIENT_ID", "ARRIVAL_DATE", "ARRIVAL_TIME",
         "SERVICE_DATE", "SERVICE_TIME",
     ]
-
-    # Create temporary CSV file
-    df_in = pd.DataFrame(
+    testdata = pd.DataFrame(
         [["p1", "2024-01-01", "08:00", "2024-01-01", "09:00"]],
         columns=expected_cols,
     )
-    csv_path = tmp_path / "patients.csv"
-    df_in.to_csv(csv_path, index=False)
 
-    # Run function and check it looks correct
-    result = import_patient_data(csv_path)
+    # Call function (with mocking for pd.read_csv())
+    def mock_read_csv(*args, **kwargs):
+        return testdata
+    monkeypatch.setattr(pd, "read_csv", mock_read_csv)
+    result = import_patient_data("path.csv")
+
+    # Check the result looks correct
     assert isinstance(result, pd.DataFrame)
     assert list(result.columns) == expected_cols
-    pd.testing.assert_frame_equal(result, df_in)
+    pd.testing.assert_frame_equal(result, testdata)
diff --git a/pages/run_tests.qmd b/pages/run_tests.qmd
@@ -107,6 +107,13 @@ In an R package, tests will be automatically discovered and run if they follow t
 1. Stored within `tests/testthat/`.
 2. In R files starting with `test_` or `test-`.
 
+You may have created these structures with `usethis` (as explained on [the prior page](write_basic_test.qmd)) - for example:
+
+```{.r}
+usethis::use_testthat()
+usethis::use_test("intro_simple")
+```
+
 For example, your project might look like:
 
 ```
diff --git a/pages/temp_mock.qmd b/pages/temp_mock.qmd
@@ -214,7 +214,7 @@ Because `pd.read_csv` is patched, the file path passed to `import_patient_data`
 
 ::: {.r-content}
 
-Mocking an external function like `readr::read_csv()` is only really only practical when your R code is structured as a package.
+Mocking an external function like `readr::read_csv()` is only really practical when your R code is structured as a package.
 
 This is because `testthat::local_mocked_bindings()` can only replace names that live in your package namespace. For an external function, that means you must either:
 
@@ -273,10 +273,32 @@ testthat::test_dir(
 
 ## When to use each option
 
-You do not need to pick a single approach for your whole project. Instead, choose the best option for the goal of each test:
+You do not need to pick a single approach for your whole project. Instead, choose the best option for the goal of each test.
 
-* **Real data file** - best when you want to run your workflow on a specific dataset. For example, we will use this in [regression tests](regression_tests.qmd), where check that results stay consistent over time by comparing to a saved analysis output.
+::: {.python-content}
+
+In Python, we often mix all three patterns:
+
+:::
+
+::: {.r-content}
+
+In R, your choice will also be impacted by whether you are working inside a package:
+
+:::
+
+* **Real data file** - best when you want to run your workflow on a specific dataset. For example, we will use this in [regression tests](regression_tests.qmd), where you compare current results to a saved output to check that behaviour stays stable over time.
+
+* **Temporary file** - a good default for many tests where you just need a small, representative dataset with the right structure. For example, this works well in [smoke tests](smoke_tests.qmd) and [system tests](system_tests.qmd).
+
+::: {.python-content}
+
+* **Mocking** - most useful when your code lives in a package and you want fast [unit tests](unit_tests.qmd) which **isolate your own logic from external libraries and the filesystem**, and avoid file I/O entirely. However, not every unit test needs mocking: sometimes you deliberately call the real library function because you want to confirm you are using it correctly (for example, that you pass the right arguments, or that it can handle your expected input format).
+
+:::
+
+::: {.r-content}
 
-* **Temporary file** - a good default for many tests where you just need a small, representative dataset with the right structure. We will use in our [smoke tests](smoke_tests.qmd) and [system tests](system_tests.qmd).
+* **Mocking** - most useful when your code lives in a **package** and you want fast [unit tests](unit_tests.qmd) which **isolate your own logic from external libraries and the filesystem**, and avoid file I/O entirely. However, not every unit test needs mocking: sometimes you deliberately call the real library function because you want to confirm you are using it correctly (for example, that you pass the right arguments, or that it can handle your expected input format). Also, for non‑package workflows (scripts, notebooks), mocking infrastructure is usually more effort than it's worth. In these cases, it's clearer and more robust to use temporary or real data files.
 
-* **Mocking** - most appropriate for small, fast [unit tests](unit_tests.qmd) where you want to **isolate your own logic from external libraries and the filesystem**, and avoid file I/O entirely.
+:::
diff --git a/pages/unit_tests.qmd b/pages/unit_tests.qmd
@@ -100,7 +100,7 @@ Each of these becomes something you can check with a test.
 
 Pick the simplest input that should work.
 
-In our case, we can create a small CSV with correct columns in the right order and one or two data rows. We can then write tests that confirm:
+In our case, we can create a small table with correct columns in the right order and one or two data rows. We can then write tests that confirm:
 
 ::: {.python-content}
 
diff --git a/pages/write_basic_test.qmd b/pages/write_basic_test.qmd
@@ -63,9 +63,20 @@ install.packages("testthat")
 If you are structuring your research as a package, we suggest also installing `usethis` and running:
 
 ```{.r}
+install.packages("usethis")
 usethis::use_testthat()
 ```
 
+This will set up the testing infrastructure for you: it creates a `tests/testthat/` folder, adds the files needed to run tests, and records `testthat` in your package metadata.
+
+You can then create individual test files with:
+
+```{.r}
+usethis::use_test("filename")
+```
+
+This creates a new file called `tests/testthat/test-filename.R`, and opens it in your editor so you can start adding tests. We explain how this folder structure is used when you [run tests on the next page](run_tests.qmd).
+
 ## What a testthat test looks like
 
 Tests are created using `test_that()`. They are built around expectations like `expect_true()`, `expect_false()`, `expect_equal()`, `expect_error()`, and others (see [package index](https://testthat.r-lib.org/reference/index.html) for more). If an expectation fails, testthat will return an error message explaining what went wrong.