|
| 1 | +\documentclass{beamer} |
| 2 | +\usetheme{Madrid} |
| 3 | + |
| 4 | +\usepackage{amsmath,amssymb,amsfonts,bm,mathtools} |
| 5 | +\usepackage{physics} |
| 6 | +\usepackage{bbm} |
| 7 | +\usepackage{mathrsfs} |
| 8 | + |
| 9 | +\title{The Mathematics of Convolutional Neural Networks} |
| 10 | +\subtitle{Operator Theory, Symmetry, PDE Limits, and Expressivity} |
| 11 | +\author{Morten Hjorth-Jensen} |
| 12 | +\date{Spring 2026} |
| 13 | + |
| 14 | +\begin{document} |
| 15 | + |
| 16 | +%========================================================= |
| 17 | +\begin{frame} |
| 18 | +\titlepage |
| 19 | +\end{frame} |
| 20 | + |
| 21 | +%========================================================= |
| 22 | +\section*{Lecture 1: Convolution as an Operator} |
| 23 | + |
| 24 | +%--------------------------------------------------------- |
| 25 | +\begin{frame}{Linear Shift-Invariant Operators} |
| 26 | +Let $T_a f(x)=f(x-a)$ on $\ell^2(\mathbb{Z}^d)$. |
| 27 | + |
| 28 | +\textbf{Definition:} |
| 29 | +$L$ is shift-invariant if |
| 30 | +\[ |
| 31 | +L T_a = T_a L. |
| 32 | +\] |
| 33 | + |
| 34 | +\textbf{Theorem (Structure theorem):} |
| 35 | +Every bounded linear shift-invariant operator |
| 36 | +\[ |
| 37 | +L:\ell^2(\mathbb{Z}^d)\to\ell^2(\mathbb{Z}^d) |
| 38 | +\] |
| 39 | +is convolution with some kernel $k$; boundedness is equivalent to $\hat k\in L^\infty$, and holds in particular whenever $k\in\ell^1$. |
| 40 | + |
| 41 | +\[ |
| 42 | +(Lf)(x)=\sum_y k(x-y)f(y). |
| 43 | +\] |
| 44 | +\end{frame} |
| 45 | + |
| 46 | +%--------------------------------------------------------- |
| 47 | +\begin{frame}{Proof Sketch} |
| 48 | +Let $\delta_0$ be Kronecker delta. |
| 49 | + |
| 50 | +Define |
| 51 | +\[ |
| 52 | +k(x)=L\delta_0(x). |
| 53 | +\] |
| 54 | + |
| 55 | +Using shift invariance: |
| 56 | +\[ |
| 57 | +L\delta_y = T_y L\delta_0 = k(\cdot - y). |
| 58 | +\] |
| 59 | + |
| 60 | +For general $f$, |
| 61 | +\[ |
| 62 | +Lf=\sum_y f(y)L\delta_y |
| 63 | +=\sum_y f(y)k(\cdot - y). |
| 64 | +\] |
| 65 | + |
| 66 | +Hence $Lf=f*k$. |
| 67 | +\end{frame} |
| 68 | + |
| 69 | +%--------------------------------------------------------- |
| 70 | +\begin{frame}{Fourier Diagonalization} |
| 71 | +Fourier transform: |
| 72 | +\[ |
| 73 | +\hat f(\omega)=\sum_x f(x)e^{-i\omega x}. |
| 74 | +\] |
| 75 | + |
| 76 | +Then |
| 77 | +\[ |
| 78 | +\widehat{f*k}=\hat k \hat f. |
| 79 | +\] |
| 80 | + |
| 81 | +Thus convolution operators are diagonal in Fourier basis. |
| 82 | + |
| 83 | +Eigenfunctions: |
| 84 | +\[ |
| 85 | +e^{i\omega x} |
| 86 | +\] |
| 87 | +with eigenvalues $\hat k(\omega)$. |
| 88 | +\end{frame} |
| 89 | + |
| 90 | +%========================================================= |
| 91 | +\section*{Lecture 2: Variational and Functional View} |
| 92 | + |
| 93 | +%--------------------------------------------------------- |
| 94 | +\begin{frame}{CNN Layers as Nonlinear Operators} |
| 95 | +A CNN layer: |
| 96 | +\[ |
| 97 | +\Phi(f)=\sigma(K*f). |
| 98 | +\] |
| 99 | + |
| 100 | +Interpret as operator on Banach space |
| 101 | +\[ |
| 102 | +\Phi: L^p(\Omega)\to L^p(\Omega). |
| 103 | +\] |
| 104 | +\end{frame} |
| 105 | + |
| 106 | +%--------------------------------------------------------- |
| 107 | +\begin{frame}{Variational Interpretation} |
| 108 | +Consider functional |
| 109 | +\[ |
| 110 | +\mathcal{J}(f)=\int_\Omega |f(x)|^2 dx. |
| 111 | +\] |
| 112 | + |
| 113 | +Convolution acts as quadratic form: |
| 114 | +\[ |
| 115 | +\langle f, K*f\rangle |
| 116 | += |
| 117 | +\int \hat k(\omega)|\hat f(\omega)|^2 d\omega. |
| 118 | +\] |
| 119 | + |
| 120 | +Thus kernels define spectral penalties. |
| 121 | +\end{frame} |
| 122 | + |
| 123 | +%--------------------------------------------------------- |
| 124 | +\begin{frame}{Expressivity Theorem} |
| 125 | +\textbf{Theorem:} |
| 126 | +Deep convolutional networks with ReLU activation can approximate, uniformly on compact sets, any continuous translation-equivariant map |
| 127 | +\[ |
| 128 | +F:C(\Omega)\to C(\Omega). |
| 129 | +\] |
| 130 | + |
| 131 | +Idea: |
| 132 | +\begin{itemize} |
| 133 | + \item Finite receptive fields |
| 134 | + \item Increasing depth expands support |
| 135 | +\end{itemize} |
| 136 | +\end{frame} |
| 137 | + |
| 138 | +%========================================================= |
| 139 | +\section*{Lecture 3: Continuous Limit and Neural PDEs} |
| 140 | + |
| 141 | +%--------------------------------------------------------- |
| 142 | +\begin{frame}{Continuous Convolution} |
| 143 | +In continuum: |
| 144 | +\[ |
| 145 | +(K*f)(x)=\int K(x-y)f(y)dy. |
| 146 | +\] |
| 147 | + |
| 148 | +For small kernels: |
| 149 | +\[ |
| 150 | +K(x)=\delta(x)+\epsilon \kappa(x). |
| 151 | +\] |
| 152 | + |
| 153 | +Then |
| 154 | +\[ |
| 155 | +K*f=f+\epsilon \kappa*f. |
| 156 | +\] |
| 157 | +\end{frame} |
| 158 | + |
| 159 | +%--------------------------------------------------------- |
| 160 | +\begin{frame}{Diffusion Limit} |
| 161 | +If $\kappa$ is symmetric, localized, and has zero mean (second moment $2c$): |
| 162 | + |
| 163 | +\[ |
| 164 | +\kappa*f \approx c \Delta f. |
| 165 | +\] |
| 166 | + |
| 167 | +Thus one layer: |
| 168 | +\[ |
| 169 | +f_{l+1}=f_l + \epsilon c \Delta f_l. |
| 170 | +\] |
| 171 | + |
| 172 | +In deep limit: |
| 173 | +\[ |
| 174 | +\partial_t f = c \Delta f. |
| 175 | +\] |
| 176 | + |
| 177 | +CNN approximates diffusion PDE. |
| 178 | +\end{frame} |
| 179 | + |
| 180 | +%--------------------------------------------------------- |
| 181 | +\begin{frame}{Neural ODE Limit} |
| 182 | +Let depth $\to\infty$, step $\to0$: |
| 183 | + |
| 184 | +\[ |
| 185 | +f_{l+1}-f_l = h \Phi(f_l). |
| 186 | +\] |
| 187 | + |
| 188 | +Continuous limit: |
| 189 | +\[ |
| 190 | +\partial_t f = \Phi(f). |
| 191 | +\] |
| 192 | + |
| 193 | +Thus CNN approximates nonlinear PDE. |
| 194 | +\end{frame} |
| 195 | + |
| 196 | +%========================================================= |
| 197 | +\section*{Lecture 4: Renormalization Group View} |
| 198 | + |
| 199 | +%--------------------------------------------------------- |
| 200 | +\begin{frame}{Hierarchical Representation} |
| 201 | +CNN layers increase receptive field. |
| 202 | + |
| 203 | +Analogy: |
| 204 | +\[ |
| 205 | +\text{RG step: integrate short-scale modes} |
| 206 | +\] |
| 207 | + |
| 208 | +Pooling: |
| 209 | +\[ |
| 210 | +f(x) \mapsto f(2x) |
| 211 | +\] |
| 212 | + |
| 213 | +acts as coarse-graining. |
| 214 | +\end{frame} |
| 215 | + |
| 216 | +%--------------------------------------------------------- |
| 217 | +\begin{frame}{Scale Decomposition} |
| 218 | +Fourier perspective: |
| 219 | + |
| 220 | +Early layers: high frequency filtering. |
| 221 | + |
| 222 | +Deeper layers: low frequency structure. |
| 223 | + |
| 224 | +This mirrors Wilsonian RG: |
| 225 | +\[ |
| 226 | +\Lambda \to \Lambda/b. |
| 227 | +\] |
| 228 | +\end{frame} |
| 229 | + |
| 230 | +%========================================================= |
| 231 | +\section*{Lecture 5: Group Equivariance} |
| 232 | + |
| 233 | +%--------------------------------------------------------- |
| 234 | +\begin{frame}{General Group Convolution} |
| 235 | +For group $G$: |
| 236 | + |
| 237 | +\[ |
| 238 | +(f * k)(g)=\int_G f(h)k(h^{-1}g)dh. |
| 239 | +\] |
| 240 | + |
| 241 | +Equivariant if |
| 242 | +\[ |
| 243 | +\Phi(L_g f)=L_g\Phi(f). |
| 244 | +\] |
| 245 | +\end{frame} |
| 246 | + |
| 247 | +%--------------------------------------------------------- |
| 248 | +\begin{frame}{Representation Theory} |
| 249 | +Decompose |
| 250 | +\[ |
| 251 | +L^2(G)=\bigoplus_{\pi} V_\pi. |
| 252 | +\] |
| 253 | + |
| 254 | +Equivariant operators block-diagonal in irreducible representations. |
| 255 | + |
| 256 | +Explains: |
| 257 | +\begin{itemize} |
| 258 | + \item SO(3) CNNs |
| 259 | + \item Gauge equivariant nets |
| 260 | +\end{itemize} |
| 261 | +\end{frame} |
| 262 | + |
| 263 | +%========================================================= |
| 264 | +\section*{Lecture 6: Fourier Neural Operators} |
| 265 | + |
| 266 | +%--------------------------------------------------------- |
| 267 | +\begin{frame}{FNO Definition} |
| 268 | +FNO layer: |
| 269 | +\[ |
| 270 | +f_{l+1}(x)=\sigma\left(\mathcal{F}^{-1}\left(R(\omega)\hat f_l(\omega)\right)\right). |
| 271 | +\] |
| 272 | + |
| 273 | +Truncated Fourier modes. |
| 274 | + |
| 275 | +Global operator learning. |
| 276 | +\end{frame} |
| 277 | + |
| 278 | +%--------------------------------------------------------- |
| 279 | +\begin{frame}{Comparison to CNN} |
| 280 | +CNN: |
| 281 | +\[ |
| 282 | +\text{local kernel} |
| 283 | +\] |
| 284 | + |
| 285 | +FNO: |
| 286 | +\[ |
| 287 | +\text{global spectral multiplier} |
| 288 | +\] |
| 289 | + |
| 290 | +CNN approximates local PDEs. |
| 291 | + |
| 292 | +FNO approximates integral operators. |
| 293 | +\end{frame} |
| 294 | + |
| 295 | +%========================================================= |
| 296 | +\section*{Lecture 7: Optimization and Stability} |
| 297 | + |
| 298 | +%--------------------------------------------------------- |
| 299 | +\begin{frame}{Gradient of Convolution} |
| 300 | +\[ |
| 301 | +\frac{\partial \mathcal{L}}{\partial K} |
| 302 | += |
| 303 | +\tilde f * \frac{\partial \mathcal{L}}{\partial (K*f)}, \qquad \tilde f(x)=f(-x). |
| 304 | +\] |
| 305 | + |
| 306 | +Adjoint: |
| 307 | +\[ |
| 308 | +\tilde K(x)=K(-x). |
| 309 | +\] |
| 310 | +\end{frame} |
| 311 | + |
| 312 | +%--------------------------------------------------------- |
| 313 | +\begin{frame}{Stability} |
| 314 | +If |
| 315 | +\[ |
| 316 | +\|\hat k\|_\infty < 1 |
| 317 | +\] |
| 318 | + |
| 319 | +then convolution is contraction. |
| 320 | + |
| 321 | +Deep nets stable under spectral normalization. |
| 322 | +\end{frame} |
| 323 | + |
| 324 | +%========================================================= |
| 325 | +\section*{Exercises} |
| 326 | + |
| 327 | +%--------------------------------------------------------- |
| 328 | +\begin{frame}{Exercise 1} |
| 329 | +Prove that every bounded translation-invariant operator on $\ell^2$ is convolution. |
| 330 | +\end{frame} |
| 331 | + |
| 332 | +%--------------------------------------------------------- |
| 333 | +\begin{frame}{Exercise 2} |
| 334 | +Show diffusion limit of symmetric kernel explicitly via Taylor expansion. |
| 335 | +\end{frame} |
| 336 | + |
| 337 | +%--------------------------------------------------------- |
| 338 | +\begin{frame}{Exercise 3} |
| 339 | +Derive continuous depth limit leading to reaction-diffusion PDE. |
| 340 | +\end{frame} |
| 341 | + |
| 342 | +%--------------------------------------------------------- |
| 343 | +\begin{frame}{Exercise 4} |
| 344 | +Show that group convolution preserves equivariance. |
| 345 | +\end{frame} |
| 346 | + |
| 347 | +%--------------------------------------------------------- |
| 348 | +\begin{frame}{Summary} |
| 349 | +CNNs are: |
| 350 | +\begin{itemize} |
| 351 | + \item Convolution operators |
| 352 | + \item Spectral filters |
| 353 | + \item Nonlinear PDE discretizations |
| 354 | + \item Hierarchical renormalization flows |
| 355 | + \item Symmetry-preserving operator learners |
| 356 | +\end{itemize} |
| 357 | +\end{frame} |
| 358 | + |
| 359 | +\end{document} |
0 commit comments