From a07071f8ad2e675198358672e2da39e684b2e1a2 Mon Sep 17 00:00:00 2001 From: hamzaMissewi Date: Mon, 12 Jan 2026 05:11:56 +0100 Subject: [PATCH 1/3] feat: Add modern data science content and improve project infrastructure - Add new Chapter 7 notebooks covering deep learning and modern tools - Introduce neural network fundamentals with from-scratch implementations - Add modern data science tools (Polars, Plotly, XGBoost, SHAP, MLflow) - Include production ML systems guide with monitoring and A/B testing - Update project infrastructure with CI/CD, modern Python packaging - Enhance existing tools with better documentation and error handling - Update requirements.txt with current data science libraries - Modernize environment configuration for Python 3.8+ - Add comprehensive contributing guidelines and project metadata This update brings the handbook up to date with 2024+ data science practices while maintaining the educational approach of building from fundamentals. --- .github/workflows/ci.yml | 32 + .gitignore | 18 +- README.md | 11 +- environment.yml | 4 +- notebooks/04.07-Customizing-Colorbars.ipynb | 26 +- .../07.00-Introduction-to-Deep-Learning.ipynb | 409 ++++++++++++ .../07.01-Neural-Network-Fundamentals.ipynb | 428 +++++++++++++ .../07.02-Modern-Data-Science-Tools.ipynb | 605 ++++++++++++++++++ notebooks/07.03-Production-ML-Systems.ipynb | 362 +++++++++++ notebooks/helpers_05_08.py | 67 +- pyproject.toml | 54 ++ requirements.txt | 26 +- tools/add_navigation.py | 42 +- tools/fix_kernelspec.py | 17 +- tools/validate_notebooks.py | 97 +++ 15 files changed, 2170 insertions(+), 28 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 notebooks/07.00-Introduction-to-Deep-Learning.ipynb create mode 100644 notebooks/07.01-Neural-Network-Fundamentals.ipynb create mode 100644 notebooks/07.02-Modern-Data-Science-Tools.ipynb create mode 100644 notebooks/07.03-Production-ML-Systems.ipynb create mode 100644 pyproject.toml create mode 100644 tools/validate_notebooks.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..415e9f6d2 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,32 @@ +name: CI + +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.8, 3.9, "3.10", 3.11] + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Test notebooks + run: | + python -c "import nbformat; print('nbformat imported successfully')" + python -c "import numpy, pandas, matplotlib, sklearn; print('Core packages imported successfully')" diff --git a/.gitignore b/.gitignore index 6321dc27f..1eaba128a 100644 --- a/.gitignore +++ b/.gitignore @@ -69,6 +69,10 @@ target/ # IPython Notebook .ipynb_checkpoints +# Jupyter +output/ +plugins/ + # pyenv .python-version @@ -88,15 +92,25 @@ ENV/ # Rope project settings .ropeproject +# pytest +.pytest_cache/ +.coverage.* # Emacs *~ - # Temporary data files notebooks/recipeitems-latest.json notebooks/FremontBridge.csv notebooks/gistemp250.nc notebooks/marathon-data.csv notebooks/my_figure.png -notebooks/hello.png \ No newline at end of file +notebooks/hello.png + +# Modern Python +__pypackages__/ +.dmypy.json +dmypy.json +.mypy_cache/ + 
+CONTRIBUTING.md \ No newline at end of file diff --git a/README.md b/README.md index 165a2b39d..9c56a600c 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ This repository contains the entire [Python Data Science Handbook](http://shop.o ## About -The book was written and tested with Python 3.5, though other Python versions (including Python 2.7) should work in nearly all cases. +The book was written and tested with Python 3.5, though the code has been updated to work with modern Python versions (3.8+). Most examples will work correctly with Python 3.8 and later versions. The book introduces the core libraries essential for working with data in Python: particularly [IPython](http://ipython.org), [NumPy](http://numpy.org), [Pandas](http://pandas.pydata.org), [Matplotlib](http://matplotlib.org), [Scikit-Learn](http://scikit-learn.org), and related packages. Familiarity with Python as a language is assumed; if you need a quick introduction to the language itself, see the free companion project, @@ -31,7 +31,7 @@ See [Index.ipynb](http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHa ## Software -The code in the book was tested with Python 3.5, though most (but not all) will also work correctly with Python 2.7 and other older Python versions. +The code in the book has been updated to work with modern Python versions (3.8+). The packages listed in [requirements.txt](requirements.txt) specify minimum compatible versions. The packages I used to run the code in the book are listed in [requirements.txt](requirements.txt) (Note that some of these exact version numbers may not be available on your platform: you may have to tweak them for your own use). To install the requirements using [conda](http://conda.pydata.org), run the following at the command-line: @@ -40,19 +40,20 @@ To install the requirements using [conda](http://conda.pydata.org), run the foll $ conda install --file requirements.txt ``` -To create a stand-alone environment named ``PDSH`` with Python 3.5 and all the required package versions, run the following: +To create a stand-alone environment named `PDSH` with Python 3.8+ and all the required package versions, run the following: ``` -$ conda create -n PDSH python=3.5 --file requirements.txt +$ conda env create -f environment.yml ``` You can read more about using conda environments in the [Managing Environments](http://conda.pydata.org/docs/using/envs.html) section of the conda documentation. - ## License ### Code + The code in this repository, including all code samples in the notebooks listed above, is released under the [MIT license](LICENSE-CODE). Read more at the [Open Source Initiative](https://opensource.org/licenses/MIT). ### Text + The text content of the book is released under the [CC-BY-NC-ND license](LICENSE-TEXT). Read more at [Creative Commons](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode). 
diff --git a/environment.yml b/environment.yml index 247ddfccb..3ffb2c9e5 100644 --- a/environment.yml +++ b/environment.yml @@ -2,6 +2,6 @@ name: data-science-handbook channels: - conda-forge dependencies: - - python=3.5 + - python>=3.8 - pip: - - -r requirements.txt \ No newline at end of file + - -r requirements.txt diff --git a/notebooks/04.07-Customizing-Colorbars.ipynb b/notebooks/04.07-Customizing-Colorbars.ipynb index 5de7df641..59bdc1e29 100644 --- a/notebooks/04.07-Customizing-Colorbars.ipynb +++ b/notebooks/04.07-Customizing-Colorbars.ipynb @@ -24,7 +24,27 @@ "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "OSError", + "evalue": "'seaborn-white' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python313\\site-packages\\matplotlib\\style\\core.py:129\u001b[39m, in \u001b[36muse\u001b[39m\u001b[34m(style)\u001b[39m\n\u001b[32m 128\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m129\u001b[39m style = \u001b[43m_rc_params_in_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstyle\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 130\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python313\\site-packages\\matplotlib\\__init__.py:906\u001b[39m, in \u001b[36m_rc_params_in_file\u001b[39m\u001b[34m(fname, transform, fail_on_error)\u001b[39m\n\u001b[32m 905\u001b[39m rc_temp = {}\n\u001b[32m--> \u001b[39m\u001b[32m906\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mwith\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m_open_file_or_url\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfname\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mas\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfd\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 907\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mtry\u001b[39;49;00m\u001b[43m:\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.13_3.13.2544.0_x64__qbz5n2kfra8p0\\Lib\\contextlib.py:141\u001b[39m, in \u001b[36m_GeneratorContextManager.__enter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 140\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m141\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mgen\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 142\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python313\\site-packages\\matplotlib\\__init__.py:883\u001b[39m, in \u001b[36m_open_file_or_url\u001b[39m\u001b[34m(fname)\u001b[39m\n\u001b[32m 882\u001b[39m fname = os.path.expanduser(fname)\n\u001b[32m--> 
\u001b[39m\u001b[32m883\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mutf-8\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[32m 884\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m f\n", + "\u001b[31mFileNotFoundError\u001b[39m: [Errno 2] No such file or directory: 'seaborn-white'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[31mOSError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mmatplotlib\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mpyplot\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mplt\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43mplt\u001b[49m\u001b[43m.\u001b[49m\u001b[43mstyle\u001b[49m\u001b[43m.\u001b[49m\u001b[43muse\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mseaborn-white\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python313\\site-packages\\matplotlib\\style\\core.py:131\u001b[39m, in \u001b[36muse\u001b[39m\u001b[34m(style)\u001b[39m\n\u001b[32m 129\u001b[39m style = _rc_params_in_file(style)\n\u001b[32m 130\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[32m--> \u001b[39m\u001b[32m131\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\n\u001b[32m 132\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstyle\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[33m is not a valid package style, path of style \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 133\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mfile, URL of style file, or library style name (library \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 134\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mstyles are listed in `style.available`)\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01merr\u001b[39;00m\n\u001b[32m 135\u001b[39m filtered = {}\n\u001b[32m 136\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m style: \u001b[38;5;66;03m# don't trigger RcParams.__getitem__('backend')\u001b[39;00m\n", + "\u001b[31mOSError\u001b[39m: 'seaborn-white' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)" + ] + } + ], "source": [ "import matplotlib.pyplot as plt\n", "plt.style.use('seaborn-white')" @@ -536,7 +556,7 @@ "formats": "ipynb,md" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -550,7 +570,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.13.9" } }, "nbformat": 4, diff --git 
a/notebooks/07.00-Introduction-to-Deep-Learning.ipynb b/notebooks/07.00-Introduction-to-Deep-Learning.ipynb new file mode 100644 index 000000000..41675395f --- /dev/null +++ b/notebooks/07.00-Introduction-to-Deep-Learning.ipynb @@ -0,0 +1,409 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction to Deep Learning\n", + "\n", + "Deep learning has revolutionized the field of machine learning and artificial intelligence over the past decade.\n", + "This chapter introduces the fundamental concepts of deep learning using modern Python tools.\n", + "\n", + "We'll cover:\n", + "\n", + "- Neural network fundamentals\n", + "- Training deep networks\n", + "- Convolutional Neural Networks (CNNs)\n", + "- Recurrent Neural Networks (RNNs)\n", + "- Transfer learning and modern architectures" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What is Deep Learning?\n", + "\n", + "Deep learning is a subfield of machine learning based on artificial neural networks with multiple layers.\n", + "These networks are inspired by the structure and function of the human brain, consisting of interconnected nodes that process information.\n", + "\n", + "Key characteristics:\n", + "- **Multiple layers**: Networks with many hidden layers (hence \"deep\")\n", + "- **Hierarchical feature learning**: Each layer learns increasingly complex features\n", + "- **Automatic feature extraction**: Unlike traditional ML, features are learned automatically\n", + "- **Scalability**: Performance often improves with more data and computation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import essential deep learning libraries\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "from tensorflow.keras import layers, models\n", + "\n", + "# Set random seeds for reproducibility\n", + "np.random.seed(42)\n", + "tf.random.set_seed(42)\n", + "\n", + "print(f\"TensorFlow version: {tf.__version__}\")\n", + "print(f\"Keras version: {keras.__version__}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building Your First Neural Network\n", + "\n", + "Let's start with a simple neural network for classification using the classic MNIST dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load and preprocess the MNIST dataset\n", + "(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()\n", + "\n", + "# Normalize pixel values to [0, 1]\n", + "x_train = x_train.astype('float32') / 255.0\n", + "x_test = x_test.astype('float32') / 255.0\n", + "\n", + "# Flatten the images for the simple neural network\n", + "x_train_flat = x_train.reshape(60000, 784)\n", + "x_test_flat = x_test.reshape(10000, 784)\n", + "\n", + "print(f\"Training data shape: {x_train_flat.shape}\")\n", + "print(f\"Test data shape: {x_test_flat.shape}\")\n", + "print(f\"Training labels shape: {y_train.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build a simple neural network\n", + "model = models.Sequential([\n", + " layers.Dense(128, activation='relu', input_shape=(784,)),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(64, activation='relu'),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(10, activation='softmax')\n", + "])\n", + "\n", + "# Compile the model\n", + "model.compile(optimizer='adam',\n", + " loss='sparse_categorical_crossentropy',\n", + " metrics=['accuracy'])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train the model\n", + "history = model.fit(x_train_flat, y_train,\n", + " epochs=10,\n", + " batch_size=128,\n", + " validation_split=0.2,\n", + " verbose=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluate the model\n", + "test_loss, test_acc = model.evaluate(x_test_flat, y_test, verbose=0)\n", + "print(f\"Test accuracy: {test_acc:.4f}\")\n", + "\n", + "# Plot training history\n", + "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))\n", + "\n", + "# Plot accuracy\n", + "ax1.plot(history.history['accuracy'], label='Training Accuracy')\n", + "ax1.plot(history.history['val_accuracy'], label='Validation Accuracy')\n", + "ax1.set_title('Model Accuracy')\n", + "ax1.set_xlabel('Epoch')\n", + "ax1.set_ylabel('Accuracy')\n", + "ax1.legend()\n", + "\n", + "# Plot loss\n", + "ax2.plot(history.history['loss'], label='Training Loss')\n", + "ax2.plot(history.history['val_loss'], label='Validation Loss')\n", + "ax2.set_title('Model Loss')\n", + "ax2.set_xlabel('Epoch')\n", + "ax2.set_ylabel('Loss')\n", + "ax2.legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Convolutional Neural Networks (CNNs)\n", + "\n", + "CNNs are specifically designed for processing grid-like data such as images. They use convolutional layers to automatically learn spatial hierarchies of features." 
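Before handing everything to Keras, it helps to see what a single convolutional filter actually computes. The following is a minimal NumPy sketch (illustrative only, not tied to the MNIST model above): it slides one hand-made 3×3 kernel over a small image with stride 1 and no padding, the same window-by-window multiply-and-sum that each `Conv2D` filter applies, except that a real filter's weights are learned during training.

```python
import numpy as np

def conv2d_single(image, kernel):
    """Slide one kernel over a 2-D image (stride 1, no padding) and sum the products."""
    kh, kw = kernel.shape
    out_h = image.shape[0] - kh + 1
    out_w = image.shape[1] - kw + 1
    out = np.zeros((out_h, out_w))
    for i in range(out_h):
        for j in range(out_w):
            out[i, j] = np.sum(image[i:i + kh, j:j + kw] * kernel)
    return out

# Tiny "image": a bright vertical bar on a dark background
image = np.zeros((6, 6))
image[:, 2:4] = 1.0

# Hand-made vertical-edge detector; a trained Conv2D filter learns weights like these
kernel = np.array([[1.0, 0.0, -1.0],
                   [1.0, 0.0, -1.0],
                   [1.0, 0.0, -1.0]])

print(conv2d_single(image, kernel))  # strong positive/negative responses at the bar's edges
```

Stacking many such filters, interleaved with pooling, is what lets the CNN below build up from simple edges to digit-level features.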
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build a CNN for image classification\n", + "cnn_model = models.Sequential([\n", + " layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),\n", + " layers.MaxPooling2D((2, 2)),\n", + " layers.Conv2D(64, (3, 3), activation='relu'),\n", + " layers.MaxPooling2D((2, 2)),\n", + " layers.Conv2D(64, (3, 3), activation='relu'),\n", + " layers.Flatten(),\n", + " layers.Dense(64, activation='relu'),\n", + " layers.Dense(10, activation='softmax')\n", + "])\n", + "\n", + "# Reshape data for CNN (add channel dimension)\n", + "x_train_cnn = x_train.reshape(x_train.shape[0], 28, 28, 1)\n", + "x_test_cnn = x_test.reshape(x_test.shape[0], 28, 28, 1)\n", + "\n", + "# Compile the CNN\n", + "cnn_model.compile(optimizer='adam',\n", + " loss='sparse_categorical_crossentropy',\n", + " metrics=['accuracy'])\n", + "\n", + "cnn_model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train the CNN\n", + "cnn_history = cnn_model.fit(x_train_cnn, y_train,\n", + " epochs=5,\n", + " batch_size=64,\n", + " validation_split=0.2,\n", + " verbose=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compare performance\n", + "cnn_test_loss, cnn_test_acc = cnn_model.evaluate(x_test_cnn, y_test, verbose=0)\n", + "print(f\"Simple NN Test Accuracy: {test_acc:.4f}\")\n", + "print(f\"CNN Test Accuracy: {cnn_test_acc:.4f}\")\n", + "print(f\"Improvement: {(cnn_test_acc - test_acc) * 100:.2f}%\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizing Neural Network Activations\n", + "\n", + "Understanding what neural networks learn is crucial for debugging and improving them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a model to extract intermediate activations\n", + "layer_outputs = [layer.output for layer in cnn_model.layers[:6]]\n", + "activation_model = models.Model(inputs=cnn_model.input, outputs=layer_outputs)\n", + "\n", + "# Get activations for a sample image\n", + "sample_image = x_test_cnn[0:1]\n", + "activations = activation_model.predict(sample_image)\n", + "\n", + "# Visualize the activations\n", + "layer_names = [layer.name for layer in cnn_model.layers[:6]]\n", + "\n", + "fig, axes = plt.subplots(2, 3, figsize=(15, 10))\n", + "for i, (layer_name, activation) in enumerate(zip(layer_names, activations)):\n", + " if len(activation.shape) == 4: # Convolutional layers\n", + " # Show first few feature maps\n", + " for j in range(min(6, activation.shape[-1])):\n", + " if i * 3 + j < 6:\n", + " ax = axes[i // 3, i % 3] if len(axes.shape) == 2 else axes[i]\n", + " ax.imshow(activation[0, :, :, j], cmap='viridis')\n", + " ax.set_title(f'{layer_name} - Filter {j}')\n", + " ax.axis('off')\n", + " break\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transfer Learning\n", + "\n", + "Transfer learning allows us to leverage pre-trained models for new tasks, significantly reducing training time and improving performance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example: Using a pre-trained model (conceptual)\n", + "# Note: This would require additional packages like tensorflow_datasets\n", + "\n", + "def create_transfer_model(input_shape, num_classes):\n", + " \"\"\"\n", + " Create a transfer learning model using a pre-trained base.\n", + " \"\"\"\n", + " # Load pre-trained model (e.g., MobileNetV2)\n", + " base_model = tf.keras.applications.MobileNetV2(\n", + " input_shape=input_shape,\n", + " include_top=False,\n", + " weights='imagenet'\n", + " )\n", + " \n", + " # Freeze the base model\n", + " base_model.trainable = False\n", + " \n", + " # Add custom classification head\n", + " model = models.Sequential([\n", + " base_model,\n", + " layers.GlobalAveragePooling2D(),\n", + " layers.Dense(128, activation='relu'),\n", + " layers.Dropout(0.2),\n", + " layers.Dense(num_classes, activation='softmax')\n", + " ])\n", + " \n", + " return model\n", + "\n", + "print(\"Transfer learning model function defined.\")\n", + "print(\"This would be used with larger image datasets.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Key Deep Learning Concepts\n", + "\n", + "### 1. **Backpropagation**\n", + "The algorithm used to train neural networks by computing gradients of the loss function with respect to each weight.\n", + "\n", + "### 2. **Gradient Descent**\n", + "Optimization algorithm that iteratively adjusts weights to minimize the loss function.\n", + "\n", + "### 3. **Regularization**\n", + "Techniques like dropout and L2 regularization to prevent overfitting.\n", + "\n", + "### 4. **Activation Functions**\n", + "Non-linear functions that introduce complexity into the network (ReLU, sigmoid, tanh, etc.).\n", + "\n", + "### 5. **Loss Functions**\n", + "Measure of how well the model is performing (cross-entropy, MSE, etc.)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Best Practices for Deep Learning\n", + "\n", + "1. **Start Simple**: Begin with simple architectures and gradually increase complexity\n", + "2. **Use Transfer Learning**: Leverage pre-trained models when possible\n", + "3. **Monitor Overfitting**: Use validation sets and early stopping\n", + "4. **Data Augmentation**: Increase dataset size through transformations\n", + "5. **Hyperparameter Tuning**: Systematically search for optimal parameters\n", + "6. **GPU Acceleration**: Use GPUs for faster training\n", + "7. 
**Experiment Tracking**: Keep track of experiments and results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modern Deep Learning Frameworks\n", + "\n", + "### TensorFlow/Keras\n", + "- Industry-standard framework\n", + "- Excellent production deployment options\n", + "- Strong community support\n", + "\n", + "### PyTorch\n", + "- Research-friendly and flexible\n", + "- Dynamic computation graphs\n", + "- Growing rapidly in popularity\n", + "\n", + "### JAX\n", + "- High-performance numerical computing\n", + "- Functional programming approach\n", + "- Excellent for research and large-scale training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Further Resources\n", + "\n", + "- [Deep Learning with Python](https://www.manning.com/books/deep-learning-with-python) by François Chollet\n", + "- [Hands-On Machine Learning](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032618/) by Aurélien Géron\n", + "- [fast.ai](https://www.fast.ai/) - Practical deep learning courses\n", + "- [Papers with Code](https://paperswithcode.com/) - Latest research and implementations\n", + "\n", + "This introduction provides the foundation for exploring more advanced deep learning topics in subsequent notebooks." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/07.01-Neural-Network-Fundamentals.ipynb b/notebooks/07.01-Neural-Network-Fundamentals.ipynb new file mode 100644 index 000000000..aa275d7ad --- /dev/null +++ b/notebooks/07.01-Neural-Network-Fundamentals.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Neural Network Fundamentals\n", + "\n", + "This notebook covers the mathematical foundations and practical implementation of neural networks.\n", + "We'll build neural networks from scratch to understand how they work internally." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The Perceptron\n", + "\n", + "The perceptron is the simplest neural network unit, consisting of:\n", + "- Inputs with weights\n", + "- A bias term\n", + "- An activation function\n", + "\n", + "Mathematically: $y = f(w_1x_1 + w_2x_2 + ... 
+ w_nx_n + b)$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.datasets import make_classification, make_moons\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "class Perceptron:\n", + " def __init__(self, learning_rate=0.01, n_iterations=1000):\n", + " self.learning_rate = learning_rate\n", + " self.n_iterations = n_iterations\n", + " self.weights = None\n", + " self.bias = None\n", + " \n", + " def fit(self, X, y):\n", + " n_samples, n_features = X.shape\n", + " \n", + " # Initialize weights and bias\n", + " self.weights = np.zeros(n_features)\n", + " self.bias = 0\n", + " \n", + " # Training loop\n", + " for _ in range(self.n_iterations):\n", + " for i in range(n_samples):\n", + " # Forward pass\n", + " linear_output = np.dot(X[i], self.weights) + self.bias\n", + " y_predicted = self.activation_function(linear_output)\n", + " \n", + " # Update weights\n", + " update = self.learning_rate * (y[i] - y_predicted)\n", + " self.weights += update * X[i]\n", + " self.bias += update\n", + " \n", + " def activation_function(self, x):\n", + " return 1 if x >= 0 else 0\n", + " \n", + " def predict(self, X):\n", + " linear_output = np.dot(X, self.weights) + self.bias\n", + " return np.array([self.activation_function(x) for x in linear_output])\n", + "\n", + "print(\"Perceptron class defined.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a simple linear classification problem\n", + "X, y = make_classification(n_samples=100, n_features=2, n_redundant=0, \n", + " n_informative=2, n_clusters_per_class=1, random_state=42)\n", + "y = y # Convert to 0/1\n", + "\n", + "# Train perceptron\n", + "perceptron = Perceptron(learning_rate=0.01, n_iterations=1000)\n", + "perceptron.fit(X, y)\n", + "\n", + "# Visualize decision boundary\n", + "def plot_decision_boundary(X, y, model, title=\"Decision Boundary\"):\n", + " plt.figure(figsize=(10, 6))\n", + " \n", + " # Plot data points\n", + " plt.scatter(X[y == 0][:, 0], X[y == 0][:, 1], label='Class 0', alpha=0.7)\n", + " plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1], label='Class 1', alpha=0.7)\n", + " \n", + " # Plot decision boundary\n", + " if hasattr(model, 'weights'):\n", + " x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n", + " y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n", + " xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),\n", + " np.arange(y_min, y_max, 0.1))\n", + " \n", + " Z = model.predict(np.c_[xx.ravel(), yy.ravel()])\n", + " Z = Z.reshape(xx.shape)\n", + " \n", + " plt.contourf(xx, yy, Z, alpha=0.3, levels=[-1, 0, 1, 2], colors=['blue', 'red'])\n", + " \n", + " plt.xlabel('Feature 1')\n", + " plt.ylabel('Feature 2')\n", + " plt.title(title)\n", + " plt.legend()\n", + " plt.grid(True, alpha=0.3)\n", + " plt.show()\n", + "\n", + "plot_decision_boundary(X, y, perceptron, \"Perceptron Decision Boundary\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multi-Layer Perceptron (MLP)\n", + "\n", + "A multi-layer perceptron extends the perceptron with hidden layers, enabling it to learn non-linear decision boundaries." 
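The classic way to see why hidden layers matter is XOR, which no single straight line can separate. A quick check with the `Perceptron` class defined above (the exact accuracy depends on the update order, but it can never reach 1.0):

```python
import numpy as np

# XOR is not linearly separable, so a single perceptron cannot learn it
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([0, 1, 1, 0])

p = Perceptron(learning_rate=0.1, n_iterations=1000)
p.fit(X_xor, y_xor)

print("targets:    ", y_xor)
print("predictions:", p.predict(X_xor))
print("accuracy:   ", np.mean(p.predict(X_xor) == y_xor))  # at most 0.75, never 1.0
```

The MLP built in the next cell, with even one hidden layer, has no such restriction.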
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class MLP:\n", + " def __init__(self, layer_sizes, learning_rate=0.01, n_iterations=1000):\n", + " self.layer_sizes = layer_sizes\n", + " self.learning_rate = learning_rate\n", + " self.n_iterations = n_iterations\n", + " self.weights = []\n", + " self.biases = []\n", + " \n", + " # Initialize weights and biases\n", + " for i in range(len(layer_sizes) - 1):\n", + " self.weights.append(np.random.randn(layer_sizes[i], layer_sizes[i + 1]) * 0.1)\n", + " self.biases.append(np.zeros(layer_sizes[i + 1]))\n", + " \n", + " def sigmoid(self, x):\n", + " return 1 / (1 + np.exp(-np.clip(x, -250, 250)))\n", + " \n", + " def sigmoid_derivative(self, x):\n", + " s = self.sigmoid(x)\n", + " return s * (1 - s)\n", + " \n", + " def forward(self, X):\n", + " self.activations = [X]\n", + " self.z_values = []\n", + " \n", + " current = X\n", + " for i, (w, b) in enumerate(zip(self.weights, self.biases)):\n", + " z = np.dot(current, w) + b\n", + " self.z_values.append(z)\n", + " \n", + " if i < len(self.weights) - 1: # Hidden layers\n", + " current = self.sigmoid(z)\n", + " else: # Output layer\n", + " current = z # Linear activation for regression\n", + " \n", + " self.activations.append(current)\n", + " \n", + " return current\n", + " \n", + " def backward(self, X, y):\n", + " m = X.shape[0]\n", + " \n", + " # Output layer gradient\n", + " delta = self.activations[-1] - y.reshape(-1, 1)\n", + " \n", + " # Backpropagate through layers\n", + " for i in range(len(self.weights) - 1, -1, -1):\n", + " if i == len(self.weights) - 1:\n", + " dW = np.dot(self.activations[i].T, delta) / m\n", + " db = np.sum(delta, axis=0) / m\n", + " else:\n", + " delta = np.dot(delta, self.weights[i + 1].T) * self.sigmoid_derivative(self.z_values[i])\n", + " dW = np.dot(self.activations[i].T, delta) / m\n", + " db = np.sum(delta, axis=0) / m\n", + " \n", + " # Update weights\n", + " self.weights[i] -= self.learning_rate * dW\n", + " self.biases[i] -= self.learning_rate * db\n", + " \n", + " def fit(self, X, y):\n", + " for epoch in range(self.n_iterations):\n", + " # Forward pass\n", + " predictions = self.forward(X)\n", + " \n", + " # Backward pass\n", + " self.backward(X, y)\n", + " \n", + " # Print progress\n", + " if epoch % 100 == 0:\n", + " loss = np.mean((predictions.flatten() - y) ** 2)\n", + " print(f\"Epoch {epoch}, Loss: {loss:.4f}\")\n", + " \n", + " def predict(self, X):\n", + " predictions = self.forward(X)\n", + " return (predictions.flatten() > 0.5).astype(int)\n", + "\n", + "print(\"MLP class defined.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a non-linear classification problem\n", + "X_nonlinear, y_nonlinear = make_moons(n_samples=200, noise=0.1, random_state=42)\n", + "\n", + "# Train MLP\n", + "mlp = MLP(layer_sizes=[2, 10, 5, 1], learning_rate=0.01, n_iterations=1000)\n", + "mlp.fit(X_nonlinear, y_nonlinear)\n", + "\n", + "# Visualize results\n", + "plot_decision_boundary(X_nonlinear, y_nonlinear, mlp, \"MLP Decision Boundary\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Activation Functions\n", + "\n", + "Activation functions introduce non-linearity into neural networks. 
Let's explore common ones:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_activation_functions():\n", + " x = np.linspace(-5, 5, 100)\n", + " \n", + " fig, axes = plt.subplots(2, 3, figsize=(15, 10))\n", + " \n", + " # Sigmoid\n", + " sigmoid = 1 / (1 + np.exp(-x))\n", + " axes[0, 0].plot(x, sigmoid)\n", + " axes[0, 0].set_title('Sigmoid')\n", + " axes[0, 0].grid(True, alpha=0.3)\n", + " \n", + " # Tanh\n", + " tanh = np.tanh(x)\n", + " axes[0, 1].plot(x, tanh)\n", + " axes[0, 1].set_title('Tanh')\n", + " axes[0, 1].grid(True, alpha=0.3)\n", + " \n", + " # ReLU\n", + " relu = np.maximum(0, x)\n", + " axes[0, 2].plot(x, relu)\n", + " axes[0, 2].set_title('ReLU')\n", + " axes[0, 2].grid(True, alpha=0.3)\n", + " \n", + " # Leaky ReLU\n", + " leaky_relu = np.where(x > 0, x, 0.01 * x)\n", + " axes[1, 0].plot(x, leaky_relu)\n", + " axes[1, 0].set_title('Leaky ReLU')\n", + " axes[1, 0].grid(True, alpha=0.3)\n", + " \n", + " # ELU\n", + " elu = np.where(x > 0, x, np.exp(x) - 1)\n", + " axes[1, 1].plot(x, elu)\n", + " axes[1, 1].set_title('ELU')\n", + " axes[1, 1].grid(True, alpha=0.3)\n", + " \n", + " # Swish\n", + " swish = x * (1 / (1 + np.exp(-x)))\n", + " axes[1, 2].plot(x, swish)\n", + " axes[1, 2].set_title('Swish')\n", + " axes[1, 2].grid(True, alpha=0.3)\n", + " \n", + " plt.tight_layout()\n", + " plt.show()\n", + "\n", + "plot_activation_functions()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Gradient Descent Variants\n", + "\n", + "Different optimization algorithms for training neural networks:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Optimizer:\n", + " def __init__(self, method='sgd', learning_rate=0.01, momentum=0.9, beta1=0.9, beta2=0.999, epsilon=1e-8):\n", + " self.method = method\n", + " self.learning_rate = learning_rate\n", + " self.momentum = momentum\n", + " self.beta1 = beta1\n", + " self.beta2 = beta2\n", + " self.epsilon = epsilon\n", + " self.momentum_buffer = {}\n", + " self.v_buffer = {}\n", + " self.m_buffer = {}\n", + " self.t = 0\n", + " \n", + " def update(self, params, grads, param_name):\n", + " self.t += 1\n", + " \n", + " if self.method == 'sgd':\n", + " return params - self.learning_rate * grads\n", + " \n", + " elif self.method == 'momentum':\n", + " if param_name not in self.momentum_buffer:\n", + " self.momentum_buffer[param_name] = np.zeros_like(params)\n", + " \n", + " self.momentum_buffer[param_name] = self.momentum * self.momentum_buffer[param_name] + self.learning_rate * grads\n", + " return params - self.momentum_buffer[param_name]\n", + " \n", + " elif self.method == 'adam':\n", + " if param_name not in self.v_buffer:\n", + " self.v_buffer[param_name] = np.zeros_like(params)\n", + " self.m_buffer[param_name] = np.zeros_like(params)\n", + " \n", + " self.v_buffer[param_name] = self.beta1 * self.v_buffer[param_name] + (1 - self.beta1) * grads\n", + " self.m_buffer[param_name] = self.beta2 * self.m_buffer[param_name] + (1 - self.beta2) * (grads ** 2)\n", + " \n", + " v_corrected = self.v_buffer[param_name] / (1 - self.beta1 ** self.t)\n", + " m_corrected = self.m_buffer[param_name] / (1 - self.beta2 ** self.t)\n", + " \n", + " return params - self.learning_rate * v_corrected / (np.sqrt(m_corrected) + self.epsilon)\n", + "\n", + "# Test optimizers on a simple quadratic function\n", + "def quadratic_function(x, y):\n", + " return x**2 + y**2\n", + "\n", + "def 
quadratic_gradient(x, y):\n", + " return 2*x, 2*y\n", + "\n", + "def optimize_function(optimizer_name, n_steps=50):\n", + " optimizer = Optimizer(method=optimizer_name, learning_rate=0.1)\n", + " \n", + " # Starting point\n", + " x, y = 3.0, 2.0\n", + " trajectory = [(x, y)]\n", + " \n", + " for i in range(n_steps):\n", + " grad_x, grad_y = quadratic_gradient(x, y)\n", + " \n", + " x = optimizer.update(x, grad_x, 'x')\n", + " y = optimizer.update(y, grad_y, 'y')\n", + " \n", + " trajectory.append((x, y))\n", + " \n", + " return np.array(trajectory)\n", + "\n", + "# Compare optimizers\n", + "optimizers = ['sgd', 'momentum', 'adam']\n", + "colors = ['red', 'blue', 'green']\n", + "\n", + "plt.figure(figsize=(10, 8))\n", + "X, Y = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))\n", + "Z = quadratic_function(X, Y)\n", + "\n", + "plt.contour(X, Y, Z, levels=20, alpha=0.3)\n", + "\n", + "for opt_name, color in zip(optimizers, colors):\n", + " trajectory = optimize_function(opt_name)\n", + " plt.plot(trajectory[:, 0], trajectory[:, 1], 'o-', color=color, label=opt_name, markersize=4)\n", + "\n", + "plt.xlabel('x')\n", + "plt.ylabel('y')\n", + "plt.title('Optimization Algorithms Comparison')\n", + "plt.legend()\n", + "plt.grid(True, alpha=0.3)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Key Takeaways\n", + "\n", + "1. **Perceptrons** can only learn linear decision boundaries\n", + "2. **Multi-layer networks** can learn complex non-linear patterns\n", + "3. **Activation functions** introduce non-linearity and are crucial for deep networks\n", + "4. **Backpropagation** efficiently computes gradients for training\n", + "5. **Optimization algorithms** significantly affect training speed and convergence\n", + "\n", + "These fundamentals form the foundation for modern deep learning architectures." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/07.02-Modern-Data-Science-Tools.ipynb b/notebooks/07.02-Modern-Data-Science-Tools.ipynb new file mode 100644 index 000000000..5c7d9a8be --- /dev/null +++ b/notebooks/07.02-Modern-Data-Science-Tools.ipynb @@ -0,0 +1,605 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Modern Data Science Tools\n", + "\n", + "This notebook introduces cutting-edge tools and libraries that are shaping modern data science workflows.\n", + "We'll explore tools for data manipulation, visualization, machine learning, and deployment." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Advanced Data Manipulation with Polars\n", + "\n", + "Polars is a fast DataFrame library implemented in Rust with a Python interface, offering significant performance improvements over pandas." 
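The performance claim is easiest to appreciate with a direct timing of the same aggregation in both libraries. A minimal sketch, assuming pandas and Polars are both installed (the cells that follow then build a larger Polars-only example):

```python
import time
import numpy as np
import pandas as pd
import polars as pl

n = 5_000_000
cats = np.random.choice(["A", "B", "C", "D"], n)
vals = np.random.randn(n)

pdf = pd.DataFrame({"category": cats, "value": vals})
pldf = pl.DataFrame({"category": cats, "value": vals})

# Same group-by mean in each library
t0 = time.time()
pdf.groupby("category")["value"].mean()
pandas_time = time.time() - t0

t0 = time.time()
pldf.group_by("category").agg(pl.col("value").mean())
polars_time = time.time() - t0

print(f"pandas: {pandas_time:.3f}s, Polars: {polars_time:.3f}s")
```

On a multi-core machine Polars usually wins this comparison by a wide margin, largely because its query engine parallelises the group-by automatically.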
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install and import polars (if not already installed)\n", + "try:\n", + " import polars as pl\n", + " print(f\"Polars version: {pl.__version__}\")\n", + "except ImportError:\n", + " print(\"Installing Polars...\")\n", + " import subprocess\n", + " import sys\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"polars\"])\n", + " import polars as pl\n", + "\n", + "import numpy as np\n", + "import time\n", + "\n", + "# Create a large dataset for comparison\n", + "n_rows = 1_000_000\n", + "data = {\n", + " 'id': range(n_rows),\n", + " 'category': np.random.choice(['A', 'B', 'C', 'D'], n_rows),\n", + " 'value1': np.random.randn(n_rows),\n", + " 'value2': np.random.randn(n_rows),\n", + " 'timestamp': np.datetime64('2023-01-01') + np.arange(n_rows).astype('timedelta64[s]')  # sequential timestamps, one per row\n", + "}\n", + "\n", + "# Create Polars DataFrame\n", + "df_pl = pl.DataFrame(data)\n", + "print(f\"Polars DataFrame shape: {df_pl.shape}\")\n", + "print(\"\\nFirst few rows:\")\n", + "print(df_pl.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Demonstrate Polars' lazy evaluation and performance\n", + "print(\"=== Polars Lazy API ===\")\n", + "\n", + "# Lazy operations - no computation yet\n", + "lazy_df = (\n", + " df_pl.lazy()\n", + " .filter(pl.col('category') == 'A')\n", + " .group_by('category')\n", + " .agg([\n", + " pl.col('value1').mean().alias('mean_value1'),\n", + " pl.col('value2').std().alias('std_value2'),\n", + " pl.count().alias('count')\n", + " ])\n", + " .sort('mean_value1', descending=True)\n", + ")\n", + "\n", + "# Execute the lazy query\n", + "start_time = time.time()\n", + "result = lazy_df.collect()\n", + "polars_time = time.time() - start_time\n", + "\n", + "print(\"Result:\")\n", + "print(result)\n", + "print(f\"\\nPolars execution time: {polars_time:.4f} seconds\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Interactive Visualization with Plotly\n", + "\n", + "Plotly creates interactive, publication-quality visualizations that can be embedded in web applications."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import plotly.express as px\n", + " import plotly.graph_objects as go\n", + " from plotly.subplots import make_subplots\n", + " print(f\"Plotly version: {px.__version__}\")\n", + "except ImportError:\n", + " print(\"Installing Plotly...\")\n", + " import subprocess\n", + " import sys\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"plotly\"])\n", + " import plotly.express as px\n", + " import plotly.graph_objects as go\n", + " from plotly.subplots import make_subplots\n", + "\n", + "import pandas as pd\n", + "\n", + "# Create sample data for visualization\n", + "np.random.seed(42)\n", + "dates = pd.date_range('2023-01-01', periods=365, freq='D')\n", + "sales_data = pd.DataFrame({\n", + " 'date': dates,\n", + " 'sales': np.cumsum(np.random.randn(365) * 100 + 500),\n", + " 'category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home'], 365),\n", + " 'region': np.random.choice(['North', 'South', 'East', 'West'], 365)\n", + "})\n", + "\n", + "# Interactive line plot\n", + "fig = px.line(sales_data, x='date', y='sales', \n", + " title='Daily Sales Trend (Interactive)',\n", + " labels={'sales': 'Sales ($)', 'date': 'Date'})\n", + "fig.update_layout(hovermode='x unified')\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Interactive scatter plot with faceting\n", + "fig = px.scatter(sales_data, x='date', y='sales', \n", + " color='category', facet_col='region',\n", + " title='Sales by Category and Region',\n", + " labels={'sales': 'Sales ($)', 'date': 'Date'})\n", + "fig.update_layout(height=400)\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Machine Learning with XGBoost and LightGBM\n", + "\n", + "Gradient boosting libraries that often outperform traditional algorithms in terms of accuracy and speed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import xgboost as xgb\n", + " import lightgbm as lgb\n", + " from sklearn.ensemble import RandomForestRegressor\n", + " from sklearn.model_selection import train_test_split, cross_val_score\n", + " from sklearn.metrics import mean_squared_error, r2_score\n", + " print(f\"XGBoost version: {xgb.__version__}\")\n", + " print(f\"LightGBM version: {lgb.__version__}\")\n", + "except ImportError:\n", + " print(\"Installing gradient boosting libraries...\")\n", + " import subprocess\n", + " import sys\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"xgboost\", \"lightgbm\"])\n", + " import xgboost as xgb\n", + " import lightgbm as lgb\n", + " from sklearn.ensemble import RandomForestRegressor\n", + " from sklearn.model_selection import train_test_split, cross_val_score\n", + " from sklearn.metrics import mean_squared_error, r2_score\n", + "\n", + "# Create a regression dataset\n", + "from sklearn.datasets import make_regression\n", + "X, y = make_regression(n_samples=1000, n_features=20, n_informative=15, \n", + " noise=0.1, random_state=42)\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "print(f\"Training set shape: {X_train.shape}\")\n", + "print(f\"Test set shape: {X_test.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compare different models\n", + "models = {\n", + " 'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),\n", + " 'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42, eval_metric='rmse'),\n", + " 'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)\n", + "}\n", + "\n", + "results = {}\n", + "\n", + "for name, model in models.items():\n", + " print(f\"\\nTraining {name}...\")\n", + " \n", + " # Train model\n", + " start_time = time.time()\n", + " model.fit(X_train, y_train)\n", + " training_time = time.time() - start_time\n", + " \n", + " # Make predictions\n", + " y_pred = model.predict(X_test)\n", + " \n", + " # Calculate metrics\n", + " mse = mean_squared_error(y_test, y_pred)\n", + " r2 = r2_score(y_test, y_pred)\n", + " \n", + " # Cross-validation\n", + " cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')\n", + " \n", + " results[name] = {\n", + " 'MSE': mse,\n", + " 'R²': r2,\n", + " 'CV R²': cv_scores.mean(),\n", + " 'Training Time': training_time\n", + " }\n", + " \n", + " print(f\" MSE: {mse:.4f}\")\n", + " print(f\" R²: {r2:.4f}\")\n", + " print(f\" CV R²: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})\")\n", + " print(f\" Training Time: {training_time:.4f}s\")\n", + "\n", + "# Display comparison\n", + "import pandas as pd\n", + "results_df = pd.DataFrame(results).T\n", + "print(\"\\n=== Model Comparison ===\")\n", + "print(results_df.round(4))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Feature Engineering with Featuretools\n", + "\n", + "Automated feature engineering library that creates features from temporal and relational datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import featuretools as ft\n", + " print(f\"Featuretools version: {ft.__version__}\")\n", + "except ImportError:\n", + " print(\"Installing Featuretools...\")\n", + " import subprocess\n", + " import sys\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"featuretools\"])\n", + " import featuretools as ft\n", + "\n", + "# Create sample entity set\n", + "es = ft.EntitySet(id=\"customer_data\")\n", + "\n", + "# Create customer data\n", + "customers_df = pd.DataFrame({\n", + " 'customer_id': range(100),\n", + " 'age': np.random.randint(18, 80, 100),\n", + " 'gender': np.random.choice(['M', 'F'], 100),\n", + " 'signup_date': pd.date_range('2020-01-01', periods=100, freq='D')\n", + "})\n", + "\n", + "# Create transaction data\n", + "transactions_df = pd.DataFrame({\n", + " 'transaction_id': range(500),\n", + " 'customer_id': np.random.randint(0, 100, 500),\n", + " 'amount': np.random.uniform(10, 1000, 500),\n", + " 'transaction_date': pd.date_range('2020-01-01', periods=500, freq='6H')\n", + "})\n", + "\n", + "# Add entities to entity set\n", + "es = es.add_dataframe(\n", + " dataframe_name='customers',\n", + " dataframe=customers_df,\n", + " index='customer_id',\n", + " time_index='signup_date'\n", + ")\n", + "\n", + "es = es.add_dataframe(\n", + " dataframe_name='transactions',\n", + " dataframe=transactions_df,\n", + " index='transaction_id',\n", + " time_index='transaction_date'\n", + ")\n", + "\n", + "# Add relationship\n", + "es = es.add_relationship('customers', 'customer_id', 'transactions', 'customer_id')\n", + "\n", + "print(\"Entity Set created:\")\n", + "print(es)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate automated features\n", + "feature_matrix, feature_defs = ft.dfs(\n", + " entityset=es,\n", + " target_dataframe_name='customers',\n", + " max_depth=2,\n", + " verbose=True\n", + ")\n", + "\n", + "print(f\"\\nGenerated {len(feature_defs)} features\")\n", + "print(\"\\nFeature matrix shape:\", feature_matrix.shape)\n", + "print(\"\\nSample features:\")\n", + "print(feature_matrix.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Model Interpretation with SHAP\n", + "\n", + "SHAP (SHapley Additive exPlanations) explains machine learning model predictions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import shap\n", + " print(f\"SHAP version: {shap.__version__}\")\n", + "except ImportError:\n", + " print(\"Installing SHAP...\")\n", + " import subprocess\n", + " import sys\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"shap\"])\n", + " import shap\n", + "\n", + "# Use the best performing model from earlier\n", + "best_model = models['XGBoost']\n", + "best_model.fit(X_train, y_train)\n", + "\n", + "# Create SHAP explainer\n", + "explainer = shap.Explainer(best_model)\n", + "shap_values = explainer(X_test)\n", + "\n", + "# Summary plot\n", + "print(\"SHAP Summary Plot:\")\n", + "shap.summary_plot(shap_values, X_test, plot_type=\"bar\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Detailed explanation for a single prediction\n", + "print(\"SHAP Waterfall Plot for Single Prediction:\")\n", + "shap.waterfall_plot(shap_values[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. MLflow for Experiment Tracking\n", + "\n", + "MLflow tracks experiments, reproduces runs, and deploys models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import mlflow\n", + " import mlflow.sklearn\n", + " print(f\"MLflow version: {mlflow.__version__}\")\n", + "except ImportError:\n", + " print(\"Installing MLflow...\")\n", + " import subprocess\n", + " import sys\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"mlflow\"])\n", + " import mlflow\n", + " import mlflow.sklearn\n", + "\n", + "# Set up MLflow experiment\n", + "mlflow.set_experiment(\"Data Science Tools Comparison\")\n", + "\n", + "# Log an experiment\n", + "with mlflow.start_run(run_name=\"Random_Forest_Experiment\") as run:\n", + " # Log parameters\n", + " mlflow.log_param(\"model_type\", \"RandomForest\")\n", + " mlflow.log_param(\"n_estimators\", 100)\n", + " mlflow.log_param(\"max_depth\", 10)\n", + " \n", + " # Train and evaluate\n", + " rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)\n", + " rf.fit(X_train, y_train)\n", + " y_pred = rf.predict(X_test)\n", + " \n", + " # Log metrics\n", + " mse = mean_squared_error(y_test, y_pred)\n", + " r2 = r2_score(y_test, y_pred)\n", + " \n", + " mlflow.log_metric(\"mse\", mse)\n", + " mlflow.log_metric(\"r2_score\", r2)\n", + " \n", + " # Log the model\n", + " mlflow.sklearn.log_model(rf, \"random_forest_model\")\n", + " \n", + " print(f\"Experiment logged with run ID: {run.info.run_id}\")\n", + " print(f\"MSE: {mse:.4f}, R²: {r2:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Streamlit for Web Applications\n", + "\n", + "Streamlit turns data scripts into shareable web apps in minutes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example Streamlit app code (save as app.py to run)\n", + "streamlit_code = '''\n", + "import streamlit as st\n", + "import pandas as pd\n", + "import numpy as np\n", + "import plotly.express as px\n", + "\n", + "st.title(\"Data Science Tools Dashboard\")\n", + "st.write(\"Interactive dashboard built with Streamlit\")\n", + "\n", + "# Sidebar controls\n", + "st.sidebar.header(\"Controls\")\n", + "data_size = st.sidebar.slider(\"Data Size\", 100, 1000, 500)\n", + "noise_level = st.sidebar.slider(\"Noise Level\", 0.0, 1.0, 0.1)\n", + "\n", + "# Generate data\n", + "np.random.seed(42)\n", + "x = np.linspace(0, 10, data_size)\n", + "y = np.sin(x) + noise_level * np.random.randn(data_size)\n", + "\n", + "# Create DataFrame\n", + "df = pd.DataFrame({\"x\": x, \"y\": y})\n", + "\n", + "# Display data\n", + "st.subheader(\"Generated Data\")\n", + "st.dataframe(df.head())\n", + "\n", + "# Interactive plot\n", + "st.subheader(\"Interactive Plot\")\n", + "fig = px.line(df, x=\"x\", y=\"y\", title=\"Sine Wave with Noise\")\n", + "st.plotly_chart(fig, use_container_width=True)\n", + "\n", + "# Statistics\n", + "st.subheader(\"Statistics\")\n", + "col1, col2 = st.columns(2)\n", + "with col1:\n", + " st.metric(\"Mean Y\", f\"{np.mean(y):.3f}\")\n", + " st.metric(\"Std Y\", f\"{np.std(y):.3f}\")\n", + "with col2:\n", + " st.metric(\"Min Y\", f\"{np.min(y):.3f}\")\n", + " st.metric(\"Max Y\", f\"{np.max(y):.3f}\")\n", + "'''\n", + "\n", + "print(\"Streamlit app code generated. Save this as 'app.py' and run with: streamlit run app.py\")\n", + "print(\"\\nSample app code:\")\n", + "print(streamlit_code[:500] + \"...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Modern Python Data Science Stack\n", + "\n", + "### Essential Libraries for 2024+:\n", + "\n", + "**Data Manipulation:**\n", + "- `polars` - Fast DataFrames with lazy evaluation\n", + "- `pandas` - Still the standard for many workflows\n", + "- `dask` - Parallel computing with pandas-like API\n", + "\n", + "**Machine Learning:**\n", + "- `scikit-learn` - Traditional ML algorithms\n", + "- `xgboost` - Gradient boosting\n", + "- `lightgbm` - Fast gradient boosting\n", + "- `catboost` - Gradient boosting with categorical support\n", + "\n", + "**Deep Learning:**\n", + "- `tensorflow`/`keras` - Production-ready deep learning\n", + "- `pytorch` - Research-friendly deep learning\n", + "- `jax` - High-performance numerical computing\n", + "\n", + "**Visualization:**\n", + "- `plotly` - Interactive visualizations\n", + "- `altair` - Declarative statistical visualization\n", + "- `seaborn` - Statistical plots\n", + "- `matplotlib` - Foundation plotting library\n", + "\n", + "**MLOps:**\n", + "- `mlflow` - Experiment tracking\n", + "- `dvc` - Data version control\n", + "- `bentoml` - Model serving\n", + "- `streamlit` - Rapid app development\n", + "\n", + "**Feature Engineering:**\n", + "- `featuretools` - Automated feature engineering\n", + "- `tsfresh` - Time series feature extraction\n", + "- `category_encoders` - Advanced categorical encoding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Best Practices for Modern Data Science\n", + "\n", + "1. **Use lazy evaluation** when possible (Polars, Dask)\n", + "2. **Leverage GPU acceleration** for deep learning and large datasets\n", + "3. **Track experiments** systematically with MLflow or similar tools\n", + "4. 
**Automate feature engineering** to reduce manual effort\n", + "5. **Interpret models** using SHAP or LIME for transparency\n", + "6. **Build interactive dashboards** for stakeholder communication\n", + "7. **Use version control** for both code and data\n", + "8. **Containerize environments** with Docker for reproducibility\n", + "9. **Monitor model performance** in production\n", + "10. **Stay updated** with rapidly evolving tools and techniques" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resources for Learning Modern Tools\n", + "\n", + "- [Polars Documentation](https://pola.rs/docs/)\n", + "- [Plotly Documentation](https://plotly.com/python/)\n", + "- [XGBoost Guide](https://xgboost.readthedocs.io/)\n", + "- [MLflow Tracking](https://mlflow.org/docs/latest/tracking.html)\n", + "- [Streamlit Documentation](https://docs.streamlit.io/)\n", + "- [Featuretools Guide](https://featuretools.alteryx.com/)\n", + "- [SHAP Documentation](https://shap.readthedocs.io/)\n", + "\n", + "These tools represent the cutting edge of data science and will help you build more efficient, scalable, and impactful solutions." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/07.03-Production-ML-Systems.ipynb b/notebooks/07.03-Production-ML-Systems.ipynb new file mode 100644 index 000000000..2903d03cd --- /dev/null +++ b/notebooks/07.03-Production-ML-Systems.ipynb @@ -0,0 +1,362 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Production Machine Learning Systems\n", + "\n", + "This notebook covers the complete lifecycle of deploying machine learning models to production, including monitoring, scaling, and maintenance." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Deployment Strategies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "import joblib\n", + "import json\n", + "from datetime import datetime\n", + "\n", + "# Create a sample model for deployment\n", + "from sklearn.datasets import make_classification\n", + "X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "# Train model\n", + "model = RandomForestClassifier(n_estimators=100, random_state=42)\n", + "model.fit(X_train, y_train)\n", + "\n", + "# Save model\n", + "joblib.dump(model, 'production_model.pkl')\n", + "print(\"Model saved successfully!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create model metadata\n", + "model_metadata = {\n", + " \"model_name\": \"production_classifier\",\n", + " \"version\": \"1.0.0\",\n", + " \"created_date\": datetime.now().isoformat(),\n", + " \"features\": [f\"feature_{i}\" for i in range(10)],\n", + " \"target\": \"binary_classification\",\n", + " \"performance\": {\n", + " \"accuracy\": model.score(X_test, y_test),\n", + " \"n_features\": 10,\n", + " \"n_classes\": 2\n", + " },\n", + " \"requirements\": [\n", + " \"scikit-learn>=1.0.0\",\n", + " \"numpy>=1.20.0\",\n", + " \"pandas>=1.3.0\"\n", + " ]\n", + "}\n", + "\n", + "with open('model_metadata.json', 'w') as f:\n", + " json.dump(model_metadata, f, indent=2)\n", + "\n", + "print(\"Model metadata saved!\")\n", + "print(json.dumps(model_metadata, indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Monitoring and Drift Detection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ModelMonitor:\n", + " def __init__(self, model_path, metadata_path):\n", + " self.model = joblib.load(model_path)\n", + " with open(metadata_path, 'r') as f:\n", + " self.metadata = json.load(f)\n", + " self.predictions_log = []\n", + " self.performance_log = []\n", + " \n", + " def predict(self, X):\n", + " prediction = self.model.predict(X)\n", + " probability = self.model.predict_proba(X)\n", + " \n", + " # Log prediction\n", + " self.predictions_log.append({\n", + " 'timestamp': datetime.now().isoformat(),\n", + " 'prediction': int(prediction[0]),\n", + " 'confidence': float(np.max(probability)),\n", + " 'features': X.tolist()\n", + " })\n", + " \n", + " return prediction, probability\n", + " \n", + " def check_drift(self, new_data, threshold=0.1):\n", + " \"\"\"Simple drift detection using statistical comparison\"\"\"\n", + " if len(self.predictions_log) < 100:\n", + " return False, \"Insufficient data for drift detection\"\n", + " \n", + " # Compare feature distributions\n", + " recent_features = [pred['features'] for pred in self.predictions_log[-100:]]\n", + " recent_mean = np.mean(recent_features, axis=0)\n", + " new_mean = np.mean(new_data, axis=0)\n", + " \n", + " drift_score = np.mean(np.abs(recent_mean - new_mean))\n", + " \n", + " if drift_score > threshold:\n", + " return True, f\"Drift detected with score: 
{drift_score:.3f}\"\n", + " \n", + " return False, f\"No drift detected. Score: {drift_score:.3f}\"\n", + " \n", + " def get_performance_report(self):\n", + " if not self.predictions_log:\n", + " return \"No predictions logged yet\"\n", + " \n", + " confidences = [pred['confidence'] for pred in self.predictions_log]\n", + " predictions = [pred['prediction'] for pred in self.predictions_log]\n", + " \n", + " return {\n", + " 'total_predictions': len(predictions),\n", + " 'avg_confidence': np.mean(confidences),\n", + " 'min_confidence': np.min(confidences),\n", + " 'prediction_distribution': {\n", + " 'class_0': predictions.count(0),\n", + " 'class_1': predictions.count(1)\n", + " }\n", + " }\n", + "\n", + "# Initialize monitor\n", + "monitor = ModelMonitor('production_model.pkl', 'model_metadata.json')\n", + "print(\"Model monitor initialized!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Simulate production predictions\n", + "print(\"Simulating production predictions...\")\n", + "\n", + "for i in range(50):\n", + " # Simulate new data\n", + " new_sample = np.random.randn(1, 10)\n", + " pred, prob = monitor.predict(new_sample)\n", + " \n", + " if i % 10 == 0:\n", + " print(f\"Prediction {i+1}: {pred[0]} (confidence: {np.max(prob):.3f})\")\n", + "\n", + "# Get performance report\n", + "report = monitor.get_performance_report()\n", + "print(\"\\nPerformance Report:\")\n", + "print(json.dumps(report, indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A/B Testing for Models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ABTestManager:\n", + " def __init__(self):\n", + " self.models = {}\n", + " self.traffic_split = {}\n", + " self.results = {}\n", + " \n", + " def add_model(self, name, model_path, traffic_percentage):\n", + " self.models[name] = joblib.load(model_path)\n", + " self.traffic_split[name] = traffic_percentage / 100.0\n", + " self.results[name] = []\n", + " \n", + " def get_model_for_request(self):\n", + " rand = np.random.random()\n", + " cumulative = 0\n", + " \n", + " for model_name, percentage in self.traffic_split.items():\n", + " cumulative += percentage\n", + " if rand <= cumulative:\n", + " return model_name\n", + " \n", + " return list(self.models.keys())[-1] # fallback\n", + " \n", + " def predict(self, X):\n", + " model_name = self.get_model_for_request()\n", + " model = self.models[model_name]\n", + " prediction = model.predict(X)[0]\n", + " \n", + " # Log result\n", + " self.results[model_name].append({\n", + " 'timestamp': datetime.now().isoformat(),\n", + " 'prediction': int(prediction),\n", + " 'features': X.tolist()\n", + " })\n", + " \n", + " return prediction, model_name\n", + " \n", + " def get_results(self):\n", + " results_summary = {}\n", + " for model_name, results in self.results.items():\n", + " predictions = [r['prediction'] for r in results]\n", + " results_summary[model_name] = {\n", + " 'total_requests': len(predictions),\n", + " 'class_distribution': {\n", + " '0': predictions.count(0),\n", + " '1': predictions.count(1)\n", + " },\n", + " 'traffic_percentage': self.traffic_split[model_name] * 100\n", + " }\n", + " \n", + " return results_summary\n", + "\n", + "# Create a second model for comparison\n", + "model2 = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=123)\n", + "model2.fit(X_train, y_train)\n", + "joblib.dump(model2, 'model_v2.pkl')\n", + 
"\n", + "# Set up A/B test\n", + "ab_test = ABTestManager()\n", + "ab_test.add_model('model_v1', 'production_model.pkl', 70) # 70% traffic\n", + "ab_test.add_model('model_v2', 'model_v2.pkl', 30) # 30% traffic\n", + "\n", + "print(\"A/B test setup complete!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run A/B test simulation\n", + "print(\"Running A/B test simulation...\")\n", + "\n", + "for i in range(100):\n", + " sample = np.random.randn(1, 10)\n", + " prediction, model_used = ab_test.predict(sample)\n", + " \n", + " if i % 20 == 0:\n", + " print(f\"Request {i+1}: {prediction} (model: {model_used})\")\n", + "\n", + "# Get A/B test results\n", + "ab_results = ab_test.get_results()\n", + "print(\"\\nA/B Test Results:\")\n", + "print(json.dumps(ab_results, indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Key Production Considerations\n", + "\n", + "### 1. **Model Versioning**\n", + "- Track model versions with semantic versioning\n", + "- Store model artifacts and metadata\n", + "- Maintain backward compatibility\n", + "\n", + "### 2. **Monitoring**\n", + "- Track prediction confidence scores\n", + "- Monitor data drift\n", + "- Log performance metrics\n", + "- Set up alerting for anomalies\n", + "\n", + "### 3. **A/B Testing**\n", + "- Gradual rollout of new models\n", + "- Statistical significance testing\n", + "- Traffic splitting strategies\n", + "- Rollback capabilities\n", + "\n", + "### 4. **Scalability**\n", + "- Horizontal scaling with load balancers\n", + "- Container orchestration (Kubernetes)\n", + "- Serverless deployment options\n", + "- Caching strategies\n", + "\n", + "### 5. **Security**\n", + "- API authentication and authorization\n", + "- Data encryption\n", + "- Input validation\n", + "- Rate limiting" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Production Deployment Checklist\n", + "\n", + "### Pre-Deployment:\n", + "- [ ] Model performance meets requirements\n", + "- [ ] Comprehensive testing completed\n", + "- [ ] Documentation is complete\n", + "- [ ] Monitoring is configured\n", + "- [ ] Rollback plan is ready\n", + "\n", + "### Deployment:\n", + "- [ ] Model version is tagged\n", + "- [ ] Configuration is validated\n", + "- [ ] Health checks are passing\n", + "- [ ] Traffic is routed correctly\n", + "- [ ] Monitoring is active\n", + "\n", + "### Post-Deployment:\n", + "- [ ] Performance is monitored\n", + "- [ ] Logs are collected\n", + "- [ ] Alerts are configured\n", + "- [ ] User feedback is gathered\n", + "- [ ] Model is updated based on feedback" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/helpers_05_08.py b/notebooks/helpers_05_08.py index 09d900083..bcd260388 100644 --- a/notebooks/helpers_05_08.py +++ b/notebooks/helpers_05_08.py @@ -1,12 +1,47 @@ +""" +Helper functions for Decision Trees and Random Forests notebook. + +This module provides visualization utilities for decision tree classifiers +used in the Python Data Science Handbook. 
+""" + import numpy as np -import matplotlib.pyplot as plt; plt.rcParams['figure.dpi'] = 600 +import matplotlib.pyplot as plt from sklearn.tree import DecisionTreeClassifier from ipywidgets import interact +# Set high DPI for better quality plots +plt.rcParams['figure.dpi'] = 600 + def visualize_tree(estimator, X, y, boundaries=True, xlim=None, ylim=None, ax=None): + """ + Visualize a decision tree classifier with decision boundaries. + + Parameters + ---------- + estimator : sklearn.tree.DecisionTreeClassifier + The decision tree classifier to visualize + X : array-like, shape (n_samples, 2) + Training data features (must be 2D for visualization) + y : array-like, shape (n_samples,) + Training data labels + boundaries : bool, default=True + Whether to plot decision boundaries + xlim : tuple, optional + x-axis limits for the plot + ylim : tuple, optional + y-axis limits for the plot + ax : matplotlib.axes.Axes, optional + Axes object to plot on + + Returns + ------- + None + Creates a visualization plot + """ ax = ax or plt.gca() # Plot the training points @@ -58,6 +93,21 @@ def plot_boundaries(i, xlim, ylim): def plot_tree_interactive(X, y): + """ + Create an interactive decision tree visualization. + + Parameters + ---------- + X : array-like, shape (n_samples, 2) + Training data features + y : array-like, shape (n_samples,) + Training data labels + + Returns + ------- + ipywidgets.interactive + Interactive widget for controlling tree depth + """ def interactive_tree(depth=5): clf = DecisionTreeClassifier(max_depth=depth, random_state=0) visualize_tree(clf, X, y) @@ -66,6 +116,21 @@ def interactive_tree(depth=5): def randomized_tree_interactive(X, y): + """ + Create an interactive visualization of randomized decision trees. + + Parameters + ---------- + X : array-like, shape (n_samples, 2) + Training data features + y : array-like, shape (n_samples,) + Training data labels + + Returns + ------- + None + Creates an interactive visualization + """ N = int(0.75 * X.shape[0]) xlim = (X[:, 0].min(), X[:, 0].max()) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..39c09d056 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "python-data-science-handbook" +description = "Jupyter notebooks for the Python Data Science Handbook" +readme = "README.md" +license = {text = "MIT"} +authors = [ + {name = "Jake VanderPlas", email = "jakevdp@uw.edu"} +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering", + "Topic :: Education", +] +requires-python = ">=3.8" +dependencies = [ + "numpy>=1.11.1", + "pandas>=0.18.1", + "scipy>=0.17.1", + "scikit-learn>=0.17.1", + "scikit-image>=0.12.3", + "pillow>=3.4.2", + "matplotlib>=1.5.1", + "seaborn>=0.7.0", + "jupyter", + "notebook", + "line_profiler", + "memory_profiler", + "numexpr", + "pandas-datareader", + "netcdf4", +] + +[project.urls] +Homepage = "https://github.com/jakevdp/PythonDataScienceHandbook" +Repository = "https://github.com/jakevdp/PythonDataScienceHandbook" +Documentation = 
"https://jakevdp.github.io/PythonDataScienceHandbook/" + +[tool.setuptools] +packages = [] + +[tool.setuptools_scm] +write_to = "tools/_version.py" diff --git a/requirements.txt b/requirements.txt index fe9cdd918..70374aa02 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,11 @@ -numpy==1.11.1 -pandas==0.18.1 -scipy==0.17.1 -scikit-learn==0.17.1 -scikit-image==0.12.3 -pillow==3.4.2 -matplotlib==1.5.1 -seaborn==0.7.0 +numpy>=1.11.1 +pandas>=0.18.1 +scipy>=0.17.1 +scikit-learn>=0.17.1 +scikit-image>=0.12.3 +pillow>=3.4.2 +matplotlib>=1.5.1 +seaborn>=0.7.0 jupyter notebook line_profiler @@ -13,3 +13,13 @@ memory_profiler numexpr pandas-datareader netcdf4 +# +tensorflow>=2.8.0 +xgboost>=1.6.0 +lightgbm>=3.3.0 +plotly>=5.0.0 +polars>=0.15.0 +shap>=0.41.0 +mlflow>=1.25.0 +featuretools>=1.0.0 +joblib>=1.1.0 diff --git a/tools/add_navigation.py b/tools/add_navigation.py index 0fec360d0..2d53fd087 100644 --- a/tools/add_navigation.py +++ b/tools/add_navigation.py @@ -1,3 +1,10 @@ +""" +Add navigation bars to all notebooks. + +This script adds previous/next navigation links and Colab badges to all notebooks +in the project to improve user experience when browsing through the content. +""" + import os import itertools @@ -9,7 +16,20 @@ def prev_this_next(it): - a, b, c = itertools.tee(it,3) + """ + Generate previous, current, and next items from an iterator. + + Parameters + ---------- + it : iterator + Iterator to generate triples from + + Returns + ------- + zip object + Zipped tuples of (previous, current, next) items + """ + a, b, c = itertools.tee(it, 3) next(c) return zip(itertools.chain([None], a), b, itertools.chain(c, [None])) @@ -26,6 +46,14 @@ def prev_this_next(it): def iter_navbars(): + """ + Generate navigation bars for all notebooks. + + Yields + ------ + tuple + (notebook_path, navigation_bar) for each notebook + """ for prev_nb, nb, next_nb in prev_this_next(iter_notebooks()): navbar = NAV_COMMENT if prev_nb: @@ -42,22 +70,26 @@ def iter_navbars(): def write_navbars(): + """Write navigation bars to the beginning and end of all notebooks.""" for nb_name, navbar in iter_navbars(): nb = nbformat.read(nb_name, as_version=4) nb_file = os.path.basename(nb_name) is_comment = lambda cell: cell.source.startswith(NAV_COMMENT) - if is_comment(nb.cells[1]): - print("- amending navbar for {0}".format(nb_file)) + # Add navbar at the beginning (after potential book info) + if len(nb.cells) > 1 and is_comment(nb.cells[1]): + print(f"- amending navbar for {nb_file}") nb.cells[1].source = navbar else: - print("- inserting navbar for {0}".format(nb_file)) + print(f"- inserting navbar for {nb_file}") nb.cells.insert(1, new_markdown_cell(source=navbar)) - if is_comment(nb.cells[-1]): + # Add navbar at the end + if nb.cells and is_comment(nb.cells[-1]): nb.cells[-1].source = navbar else: nb.cells.append(new_markdown_cell(source=navbar)) + nbformat.write(nb, nb_name) diff --git a/tools/fix_kernelspec.py b/tools/fix_kernelspec.py index c98d5e927..6280cc046 100644 --- a/tools/fix_kernelspec.py +++ b/tools/fix_kernelspec.py @@ -1,15 +1,28 @@ -import os +""" +Fix kernelspec display names for all notebooks. + +This script updates the kernelspec display name to 'Python 3' for all notebooks +in the project to ensure consistent kernel display across different environments. 
+""" +import os import nbformat from generate_contents import iter_notebooks, NOTEBOOK_DIR + def fix_kernelspec(): + """Update kernelspec display name to 'Python 3' for all notebooks.""" for nb_name in iter_notebooks(): nb_file = os.path.join(NOTEBOOK_DIR, nb_name) nb = nbformat.read(nb_file, as_version=4) - print("- Updating kernelspec for {0}".format(nb_name)) + print(f"- Updating kernelspec for {nb_name}") + + # Ensure kernelspec metadata exists + if 'kernelspec' not in nb['metadata']: + nb['metadata']['kernelspec'] = {} + nb['metadata']['kernelspec']['display_name'] = 'Python 3' nbformat.write(nb, nb_file) diff --git a/tools/validate_notebooks.py b/tools/validate_notebooks.py new file mode 100644 index 000000000..634e16e6b --- /dev/null +++ b/tools/validate_notebooks.py @@ -0,0 +1,97 @@ +""" +Validate all Jupyter notebooks in the project. + +This script checks all notebooks for common issues like missing metadata, +execution count consistency, and cell output validation. +""" + +import os +import sys +import nbformat +from nbformat.v4.nbbase import new_markdown_cell + +from generate_contents import iter_notebooks, NOTEBOOK_DIR + + +def validate_notebook(nb_path): + """ + Validate a single notebook for common issues. + + Parameters + ---------- + nb_path : str + Path to the notebook file + + Returns + ------- + list + List of validation warnings/errors + """ + warnings = [] + + try: + with open(nb_path, 'r', encoding='utf-8') as f: + nb = nbformat.read(f, as_version=4) + except Exception as e: + return [f"Failed to read notebook: {e}"] + + # Check notebook metadata + if 'kernelspec' not in nb.metadata: + warnings.append("Missing kernelspec metadata") + elif 'display_name' not in nb.metadata.kernelspec: + warnings.append("Missing kernelspec display_name") + + # Check for empty cells + for i, cell in enumerate(nb.cells): + if cell.cell_type == 'code' and not cell.source.strip(): + warnings.append(f"Empty code cell at position {i}") + elif cell.cell_type == 'markdown' and not cell.source.strip(): + warnings.append(f"Empty markdown cell at position {i}") + + # Check for book info comment + if nb.cells and not nb.cells[0].source.startswith(""): + warnings.append("Missing book information comment") + + # Check for navigation + has_nav = any("" in cell.source for cell in nb.cells) + if not has_nav: + warnings.append("Missing navigation bar") + + return warnings + + +def validate_all_notebooks(): + """Validate all notebooks and print results.""" + print("Validating all notebooks...") + print("=" * 50) + + all_warnings = [] + + for nb_name in iter_notebooks(): + nb_path = os.path.join(NOTEBOOK_DIR, nb_name) + warnings = validate_notebook(nb_path) + + if warnings: + print(f"\n{nb_name}:") + for warning in warnings: + print(f" - {warning}") + all_warnings.extend([(nb_name, warning) for warning in warnings]) + else: + print(f"✓ {nb_name} - No issues found") + + print("\n" + "=" * 50) + print(f"Validation complete. 
Found {len(all_warnings)} issues total.") + + if all_warnings: + print("\nSummary of issues:") + for nb_name, warning in all_warnings: + print(f" {nb_name}: {warning}") + + return 1 + else: + print("All notebooks passed validation!") + return 0 + + +if __name__ == '__main__': + sys.exit(validate_all_notebooks()) From 67e5c9914db48727c8b745a65fb74028c9a0d157 Mon Sep 17 00:00:00 2001 From: hamzaMissewi Date: Mon, 12 Jan 2026 05:16:35 +0100 Subject: [PATCH 2/3] chore: Update .gitignore to ignore all ignore files - Remove CONTRIBUTING.md from gitignore (should be tracked) - Add generic pattern to ignore all *ignore files - Clean up gitignore patterns for better maintenance --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1eaba128a..af8c0bb8e 100644 --- a/.gitignore +++ b/.gitignore @@ -113,4 +113,5 @@ __pypackages__/ dmypy.json .mypy_cache/ -CONTRIBUTING.md \ No newline at end of file + +**/*ignore* \ No newline at end of file From 95451364163c0f6ab37e1cd0535627fbd9add5f4 Mon Sep 17 00:00:00 2001 From: hamzaMissewi Date: Mon, 12 Jan 2026 16:02:28 +0100 Subject: [PATCH 3/3] add index jupyter --- notebooks/Index.ipynb | 120 +++++++++++++++++++++++++++++++++++++++ notebooks/Untitled.ipynb | 6 -- 2 files changed, 120 insertions(+), 6 deletions(-) create mode 100644 notebooks/Index.ipynb delete mode 100644 notebooks/Untitled.ipynb diff --git a/notebooks/Index.ipynb b/notebooks/Index.ipynb new file mode 100644 index 000000000..0cbd11809 --- /dev/null +++ b/notebooks/Index.ipynb @@ -0,0 +1,120 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Python Data Science Handbook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Jake VanderPlas*\n", + "\n", + "![Book Cover](figures/PDSH-cover.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is the Jupyter notebook version of the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*\n", + "The text is released under the [CC-BY-NC-ND license](https://creativecommons.org/licenses/by-nc-nd/3.0/us/legalcode), and code is released under the [MIT license](https://opensource.org/licenses/MIT). If you find this content useful, please consider supporting the work by [buying the book](http://shop.oreilly.com/product/0636920034919.do)!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "\n", + "### [Preface](00.00-Preface.ipynb)\n", + "\n", + "### [1. IPython: Beyond Normal Python](01.00-IPython-Beyond-Normal-Python.ipynb)\n", + "- [Help and Documentation in IPython](01.01-Help-And-Documentation.ipynb)\n", + "- [Keyboard Shortcuts in the IPython Shell](01.02-Shell-Keyboard-Shortcuts.ipynb)\n", + "- [IPython Magic Commands](01.03-Magic-Commands.ipynb)\n", + "- [Input and Output History](01.04-Input-Output-History.ipynb)\n", + "- [IPython and Shell Commands](01.05-IPython-And-Shell-Commands.ipynb)\n", + "- [Errors and Debugging](01.06-Errors-and-Debugging.ipynb)\n", + "- [Profiling and Timing Code](01.07-Timing-and-Profiling.ipynb)\n", + "- [More IPython Resources](01.08-More-IPython-Resources.ipynb)\n", + "\n", + "### [2. 
Introduction to NumPy](02.00-Introduction-to-NumPy.ipynb)\n", + "- [Understanding Data Types in Python](02.01-Understanding-Data-Types.ipynb)\n", + "- [The Basics of NumPy Arrays](02.02-The-Basics-Of-NumPy-Arrays.ipynb)\n", + "- [Computation on NumPy Arrays: Universal Functions](02.03-Computation-on-arrays-ufuncs.ipynb)\n", + "- [Aggregations: Min, Max, and Everything In Between](02.04-Computation-on-arrays-aggregates.ipynb)\n", + "- [Computation on Arrays: Broadcasting](02.05-Computation-on-arrays-broadcasting.ipynb)\n", + "- [Comparisons, Masks, and Boolean Logic](02.06-Boolean-Arrays-and-Masks.ipynb)\n", + "- [Fancy Indexing](02.07-Fancy-Indexing.ipynb)\n", + "- [Sorting Arrays](02.08-Sorting.ipynb)\n", + "- [Structured Data: NumPy's Structured Arrays](02.09-Structured-Data-NumPy.ipynb)\n", + "\n", + "### [3. Data Manipulation with Pandas](03.00-Introduction-to-Pandas.ipynb)\n", + "- [Introducing Pandas Objects](03.01-Introducing-Pandas-Objects.ipynb)\n", + "- [Data Indexing and Selection](03.02-Data-Indexing-and-Selection.ipynb)\n", + "- [Operating on Data in Pandas](03.03-Operations-in-Pandas.ipynb)\n", + "- [Handling Missing Data](03.04-Missing-Values.ipynb)\n", + "- [Hierarchical Indexing](03.05-Hierarchical-Indexing.ipynb)\n", + "- [Combining Datasets: Concat and Append](03.06-Concat-And-Append.ipynb)\n", + "- [Combining Datasets: Merge and Join](03.07-Merge-and-Join.ipynb)\n", + "- [Aggregation and Grouping](03.08-Aggregation-and-Grouping.ipynb)\n", + "- [Pivot Tables](03.09-Pivot-Tables.ipynb)\n", + "- [Vectorized String Operations](03.10-Working-With-Strings.ipynb)\n", + "- [Working with Time Series](03.11-Working-with-Time-Series.ipynb)\n", + "- [High-Performance Pandas: eval() and query()](03.12-Performance-Eval-and-Query.ipynb)\n", + "- [Further Resources](03.13-Further-Resources.ipynb)\n", + "\n", + "### [4. Visualization with Matplotlib](04.00-Introduction-To-Matplotlib.ipynb)\n", + "- [Simple Line Plots](04.01-Simple-Line-Plots.ipynb)\n", + "- [Simple Scatter Plots](04.02-Simple-Scatter-Plots.ipynb)\n", + "- [Visualizing Errors](04.03-Errorbars.ipynb)\n", + "- [Density and Contour Plots](04.04-Density-and-Contour-Plots.ipynb)\n", + "- [Histograms, Binnings, and Density](04.05-Histograms-and-Binnings.ipynb)\n", + "- [Customizing Plot Legends](04.06-Customizing-Legends.ipynb)\n", + "- [Customizing Colorbars](04.07-Customizing-Colorbars.ipynb)\n", + "- [Multiple Subplots](04.08-Multiple-Subplots.ipynb)\n", + "- [Text and Annotation](04.09-Text-and-Annotation.ipynb)\n", + "- [Customizing Ticks](04.10-Customizing-Ticks.ipynb)\n", + "- [Customizing Matplotlib: Configurations and Stylesheets](04.11-Settings-and-Stylesheets.ipynb)\n", + "- [Three-Dimensional Plotting in Matplotlib](04.12-Three-Dimensional-Plotting.ipynb)\n", + "- [Visualization with Seaborn](04.14-Visualization-With-Seaborn.ipynb)\n", + "- [Further Resources](04.15-Further-Resources.ipynb)\n", + "\n", + "### [5. 
Machine Learning](05.00-Machine-Learning.ipynb)\n", + "- [What Is Machine Learning?](05.01-What-Is-Machine-Learning.ipynb)\n", + "- [Introducing Scikit-Learn](05.02-Introducing-Scikit-Learn.ipynb)\n", + "- [Hyperparameters and Model Validation](05.03-Hyperparameters-and-Model-Validation.ipynb)\n", + "- [Feature Engineering](05.04-Feature-Engineering.ipynb)\n", + "- [In Depth: Naive Bayes Classification](05.05-Naive-Bayes.ipynb)\n", + "- [In Depth: Linear Regression](05.06-Linear-Regression.ipynb)\n", + "- [In-Depth: Support Vector Machines](05.07-Support-Vector-Machines.ipynb)\n", + "- [In-Depth: Decision Trees and Random Forests](05.08-Random-Forests.ipynb)\n", + "- [In Depth: Principal Component Analysis](05.09-Principal-Component-Analysis.ipynb)\n", + "- [In-Depth: Manifold Learning](05.10-Manifold-Learning.ipynb)\n", + "- [In Depth: k-Means Clustering](05.11-K-Means.ipynb)\n", + "- [In Depth: Gaussian Mixture Models](05.12-Gaussian-Mixtures.ipynb)\n", + "- [In-Depth: Kernel Density Estimation](05.13-Kernel-Density-Estimation.ipynb)\n", + "- [Application: A Face Detection Pipeline](05.14-Image-Features.ipynb)\n", + "- [Further Machine Learning Resources](05.15-Learning-More.ipynb)\n", + "\n", + "### [6. Introduction to Deep Learning](07.00-Introduction-to-Deep-Learning.ipynb)\n", + "- [Neural Network Fundamentals](07.01-Neural-Network-Fundamentals.ipynb)\n", + "- [Modern Data Science Tools](07.02-Modern-Data-Science-Tools.ipynb)\n", + "- [Production ML Systems](07.03-Production-ML-Systems.ipynb)\n", + "\n", + "### [Appendix: Figure Code](06.00-Figure-Code.ipynb)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/Untitled.ipynb b/notebooks/Untitled.ipynb deleted file mode 100644 index 363fcab7e..000000000 --- a/notebooks/Untitled.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 5 -}