diff --git a/src/TorchVision/models/DenseNet.cs b/src/TorchVision/models/DenseNet.cs
new file mode 100644
index 000000000..a636b62cd
--- /dev/null
+++ b/src/TorchVision/models/DenseNet.cs
@@ -0,0 +1,367 @@
// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.

// A number of implementation details in this file have been translated from the Python version of torchvision,
// largely located in the files found in this folder:
//
// https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py
//
// The origin has the following copyright notice and license:
//
// https://github.com/pytorch/vision/blob/main/LICENSE
//

using System;
using System.Collections.Generic;
using static TorchSharp.torch;
using static TorchSharp.torch.nn;

#nullable enable
namespace TorchSharp
{
    public static partial class torchvision
    {
        public static partial class models
        {
            /// <summary>
            /// DenseNet-121 model from "Densely Connected Convolutional Networks".
            /// </summary>
            /// <param name="num_classes">The number of output classes.</param>
            /// <param name="growth_rate">How many filters to add each layer.</param>
            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers (i.e. bn_size * k features in the bottleneck layer).</param>
            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
            /// <param name="device">The device to locate the model on.</param>
            /// <remarks>
            /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
            /// using the exportsd.py script, then loading into the .NET instance:
            ///
            /// <code>
            /// from torchvision import models
            /// import exportsd
            ///
            /// model = models.densenet121(pretrained=True)
            /// f = open("model_weights.dat", "wb")
            /// exportsd.save_state_dict(model.state_dict(), f)
            /// f.close()
            /// </code>
            ///
            /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
            ///
            /// In order for the weights to be loaded, the number of classes has to be the same as
            /// in the pre-trained model, which is 1000.
            ///
            /// It is also possible to skip loading the last linear layer and use it for transfer-learning
            /// with a different number of output classes. To do so, pass skipfc=true.
            ///
            /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB
            /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded
            /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
            /// </remarks>
            public static Modules.DenseNet densenet121(
                int num_classes = 1000,
                int growth_rate = 32,
                int bn_size = 4,
                float drop_rate = 0,
                string? weights_file = null,
                bool skipfc = true,
                Device? device = null)
            {
                return new Modules.DenseNet(growth_rate, new int[] { 6, 12, 24, 16 }, 64, bn_size, drop_rate,
                    num_classes, weights_file, skipfc, device);
            }

            /// <summary>
            /// DenseNet-161 model from "Densely Connected Convolutional Networks".
            /// </summary>
            /// <param name="num_classes">The number of output classes.</param>
            /// <param name="growth_rate">How many filters to add each layer.</param>
            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers.</param>
            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
            /// <param name="device">The device to locate the model on.</param>
            public static Modules.DenseNet densenet161(
                int num_classes = 1000,
                int growth_rate = 48,
                int bn_size = 4,
                float drop_rate = 0,
                string? weights_file = null,
                bool skipfc = true,
                Device? device = null)
            {
                return new Modules.DenseNet(growth_rate, new int[] { 6, 12, 36, 24 }, 96, bn_size, drop_rate,
                    num_classes, weights_file, skipfc, device);
            }

            /// <summary>
            /// DenseNet-169 model from "Densely Connected Convolutional Networks".
            /// </summary>
            /// <param name="num_classes">The number of output classes.</param>
            /// <param name="growth_rate">How many filters to add each layer.</param>
            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers.</param>
            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
            /// <param name="device">The device to locate the model on.</param>
            public static Modules.DenseNet densenet169(
                int num_classes = 1000,
                int growth_rate = 32,
                int bn_size = 4,
                float drop_rate = 0,
                string? weights_file = null,
                bool skipfc = true,
                Device? device = null)
            {
                return new Modules.DenseNet(growth_rate, new int[] { 6, 12, 32, 32 }, 64, bn_size, drop_rate,
                    num_classes, weights_file, skipfc, device);
            }

            /// <summary>
            /// DenseNet-201 model from "Densely Connected Convolutional Networks".
            /// </summary>
            /// <param name="num_classes">The number of output classes.</param>
            /// <param name="growth_rate">How many filters to add each layer.</param>
            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers.</param>
            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
            /// <param name="device">The device to locate the model on.</param>
            public static Modules.DenseNet densenet201(
                int num_classes = 1000,
                int growth_rate = 32,
                int bn_size = 4,
                float drop_rate = 0,
                string? weights_file = null,
                Device? device = null)
            {
                return new Modules.DenseNet(growth_rate, new int[] { 6, 12, 48, 32 }, 64, bn_size, drop_rate,
                    num_classes, weights_file, skipfc, device);
            }
        }
    }

    namespace Modules
    {
        // Based on https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py
        // License: https://github.com/pytorch/vision/blob/main/LICENSE

        public class DenseNet : Module<Tensor, Tensor>
        {
            /// <summary>
            /// A single dense layer (BN-ReLU-Conv1x1-BN-ReLU-Conv3x3) as described in the paper.
            /// </summary>
            private class DenseLayer : Module<Tensor, Tensor>
            {
                private readonly Module<Tensor, Tensor> norm1;
                private readonly Module<Tensor, Tensor> relu1;
                private readonly Module<Tensor, Tensor> conv1;
                private readonly Module<Tensor, Tensor> norm2;
                private readonly Module<Tensor, Tensor> relu2;
                private readonly Module<Tensor, Tensor> conv2;
                private readonly float drop_rate;

                public DenseLayer(string name, int num_input_features, int growth_rate, int bn_size, float drop_rate)
                    : base(name)
                {
                    norm1 = BatchNorm2d(num_input_features);
                    relu1 = ReLU(inplace: true);
                    conv1 = Conv2d(num_input_features, bn_size * growth_rate, kernel_size: 1, stride: 1, bias: false);
                    norm2 = BatchNorm2d(bn_size * growth_rate);
                    relu2 = ReLU(inplace: true);
                    conv2 = Conv2d(bn_size * growth_rate, growth_rate, kernel_size: 3, stride: 1, padding: 1, bias: false);
                    this.drop_rate = drop_rate;
                    RegisterComponents();
                }

                protected override void Dispose(bool disposing)
                {
                    if (disposing) {
                        norm1.Dispose(); relu1.Dispose(); conv1.Dispose();
                        norm2.Dispose(); relu2.Dispose(); conv2.Dispose();
                    }
                    base.Dispose(disposing);
                }

                public override Tensor forward(Tensor input)
                {
                    // Bottleneck (1x1) followed by spatial (3x3) convolution, each preceded by BN-ReLU.
                    var bottleneck_output = conv1.call(relu1.call(norm1.call(input)));
                    var new_features = conv2.call(relu2.call(norm2.call(bottleneck_output)));
                    if (drop_rate > 0 && training)
                        new_features = nn.functional.dropout(new_features, drop_rate, training);
                    return new_features;
                }
            }

            /// <summary>
            /// A dense block consisting of multiple dense layers with progressive feature concatenation.
+ /// + private class DenseBlock : Module + { + private readonly Module[] denselayers; + + public DenseBlock(string name, int num_layers, int num_input_features, int bn_size, int growth_rate, float drop_rate) + : base(name) + { + denselayers = new Module[num_layers]; + for (int i = 0; i < num_layers; i++) { + var layer = new DenseLayer($"denselayer{i + 1}", + num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate); + denselayers[i] = layer; + // Use register_module to ensure correct named hierarchy for state_dict compatibility + register_module($"denselayer{i + 1}", layer); + } + } + + protected override void Dispose(bool disposing) + { + if (disposing) { + foreach (var layer in denselayers) + layer.Dispose(); + } + base.Dispose(disposing); + } + + public override Tensor forward(Tensor init_features) + { + var features = new List { init_features }; + foreach (var layer in denselayers) { + var concat_features = torch.cat(features.ToArray(), 1); + var new_features = layer.call(concat_features); + features.Add(new_features); + } + return torch.cat(features.ToArray(), 1); + } + } + + /// + /// A transition layer (BN-ReLU-Conv1x1-AvgPool) that reduces feature map size. 
+ /// + private class Transition : Module + { + private readonly Module norm; + private readonly Module relu; + private readonly Module conv; + private readonly Module pool; + + public Transition(string name, int num_input_features, int num_output_features) : base(name) + { + norm = BatchNorm2d(num_input_features); + relu = ReLU(inplace: true); + conv = Conv2d(num_input_features, num_output_features, kernel_size: 1, stride: 1, bias: false); + pool = AvgPool2d(kernel_size: 2, stride: 2); + RegisterComponents(); + } + + protected override void Dispose(bool disposing) + { + if (disposing) { + norm.Dispose(); relu.Dispose(); conv.Dispose(); pool.Dispose(); + } + base.Dispose(disposing); + } + + public override Tensor forward(Tensor x) + { + return pool.call(conv.call(relu.call(norm.call(x)))); + } + } + + private readonly Module features; + private readonly Module classifier; + + protected override void Dispose(bool disposing) + { + if (disposing) { + features.Dispose(); + classifier.Dispose(); + } + base.Dispose(disposing); + } + + /// + /// DenseNet model class. + /// + /// How many filters to add each layer. + /// Number of layers in each dense block. + /// Number of filters in the first convolution layer. + /// Multiplicative factor for number of bottleneck layers. + /// Dropout rate after each dense layer. + /// Number of output classes. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer will not be loaded from the weights file. + /// The device to locate the model on. + public DenseNet( + int growth_rate = 32, + int[]? block_config = null, + int num_init_features = 64, + int bn_size = 4, + float drop_rate = 0, + int num_classes = 1000, + string? weights_file = null, + bool skipfc = true, + Device? 
device = null) : base(nameof(DenseNet)) + { + if (block_config == null) + block_config = new int[] { 6, 12, 24, 16 }; + + // Build the features Sequential with named children + var f = Sequential(); + f.append("conv0", Conv2d(3, num_init_features, kernel_size: 7, stride: 2, padding: 3, bias: false)); + f.append("norm0", BatchNorm2d(num_init_features)); + f.append("relu0", ReLU(inplace: true)); + f.append("pool0", MaxPool2d(kernel_size: 3, stride: 2, padding: 1)); + + int num_features = num_init_features; + for (int i = 0; i < block_config.Length; i++) { + var block = new DenseBlock("DenseBlock", + block_config[i], num_features, bn_size, growth_rate, drop_rate); + f.append($"denseblock{i + 1}", block); + num_features = num_features + block_config[i] * growth_rate; + if (i != block_config.Length - 1) { + var trans = new Transition("Transition", + num_features, num_features / 2); + f.append($"transition{i + 1}", trans); + num_features = num_features / 2; + } + } + + f.append("norm5", BatchNorm2d(num_features)); + features = f; + + classifier = Linear(num_features, num_classes); + + RegisterComponents(); + + // Weight initialization + if (string.IsNullOrEmpty(weights_file)) { + foreach (var (_, m) in named_modules()) { + if (m is Modules.Conv2d conv) { + nn.init.kaiming_normal_(conv.weight); + } else if (m is Modules.BatchNorm2d bn) { + nn.init.constant_(bn.weight, 1); + nn.init.constant_(bn.bias, 0); + } else if (m is Modules.Linear linear) { + nn.init.constant_(linear.bias, 0); + } + } + } else { + this.load(weights_file!, skip: skipfc ? 
new[] { "classifier.weight", "classifier.bias" } : null); + } + + if (device != null && device.type != DeviceType.CPU) + this.to(device); + } + + public override Tensor forward(Tensor x) + { + using (var _ = NewDisposeScope()) { + x = features.call(x); + x = nn.functional.relu(x); + x = nn.functional.adaptive_avg_pool2d(x, new long[] { 1, 1 }); + x = torch.flatten(x, 1); + return classifier.call(x).MoveToOuterDisposeScope(); + } + } + } + } +} diff --git a/src/TorchVision/models/EfficientNet.cs b/src/TorchVision/models/EfficientNet.cs new file mode 100644 index 000000000..ded461949 --- /dev/null +++ b/src/TorchVision/models/EfficientNet.cs @@ -0,0 +1,819 @@ +// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. + +// A number of implementation details in this file have been translated from the Python version of torchvision, +// largely located in the files found in this folder: +// +// https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py +// +// The origin has the following copyright notice and license: +// +// https://github.com/pytorch/vision/blob/main/LICENSE +// + +using System; +using System.Collections.Generic; +using static TorchSharp.torch; +using static TorchSharp.torch.nn; +using static TorchSharp.torchvision.models._utils; +using static TorchSharp.torchvision.ops; +using TorchSharp.Modules; + +#nullable enable +namespace TorchSharp +{ + namespace Modules + { + public class EfficientNet : nn.Module + { + internal enum BlockType { MBConv, FusedMBConv } + + /// + /// Stores information listed at Tables 1 and 4 of the EfficientNet papers. 
+ /// + internal class _MBConvConfig + { + public double expand_ratio; + public long kernel; + public long stride; + public long input_channels; + public long out_channels; + public long num_layers; + public BlockType block_type; + + public _MBConvConfig( + double expand_ratio, long kernel, long stride, + long input_channels, long out_channels, long num_layers, + BlockType block_type) + { + this.expand_ratio = expand_ratio; + this.kernel = kernel; + this.stride = stride; + this.input_channels = input_channels; + this.out_channels = out_channels; + this.num_layers = num_layers; + this.block_type = block_type; + } + + public static long adjust_channels(long channels, double width_mult, long? min_value = null) + { + return _make_divisible(channels * width_mult, 8, min_value); + } + + public _MBConvConfig ShallowCopy() + { + return (_MBConvConfig)this.MemberwiseClone(); + } + } + + /// + /// Config for MBConv blocks (EfficientNet B0-B7). + /// Applies width and depth multipliers for compound scaling. + /// + internal class MBConvConfig : _MBConvConfig + { + public MBConvConfig( + double expand_ratio, long kernel, long stride, + long input_channels, long out_channels, long num_layers, + double width_mult = 1.0, double depth_mult = 1.0) + : base(expand_ratio, kernel, stride, + adjust_channels(input_channels, width_mult), + adjust_channels(out_channels, width_mult), + adjust_depth(num_layers, depth_mult), + BlockType.MBConv) + { + } + + public static long adjust_depth(long num_layers, double depth_mult) + { + return (long)Math.Ceiling(num_layers * depth_mult); + } + } + + /// + /// Config for FusedMBConv blocks (EfficientNet V2). 
+ /// + internal class FusedMBConvConfig : _MBConvConfig + { + public FusedMBConvConfig( + double expand_ratio, long kernel, long stride, + long input_channels, long out_channels, long num_layers) + : base(expand_ratio, kernel, stride, + input_channels, out_channels, num_layers, + BlockType.FusedMBConv) + { + } + } + + /// + /// MBConv block: Mobile Inverted Bottleneck Conv with Squeeze-and-Excitation. + /// + private class MBConv : nn.Module + { + private readonly nn.Module block; + private readonly torchvision.StochasticDepth stochastic_depth; + private readonly bool use_res_connect; + + protected override void Dispose(bool disposing) + { + if (disposing) { + block.Dispose(); + stochastic_depth.Dispose(); + } + base.Dispose(disposing); + } + + public MBConv( + string name, + _MBConvConfig cnf, + double stochastic_depth_prob, + Func> norm_layer) : base(name) + { + if (!(1 <= cnf.stride && cnf.stride <= 2)) + throw new ArgumentException("illegal stride value"); + + use_res_connect = cnf.stride == 1 && cnf.input_channels == cnf.out_channels; + + var layers = new List>(); + Func> activation_layer = (inplace) => nn.SiLU(inplace); + + // expand + var expanded_channels = _MBConvConfig.adjust_channels(cnf.input_channels, cnf.expand_ratio); + if (expanded_channels != cnf.input_channels) { + layers.Add(Conv2dNormActivation( + cnf.input_channels, expanded_channels, + kernel_size: 1, + norm_layer: norm_layer, + activation_layer: activation_layer)); + } + + // depthwise + layers.Add(Conv2dNormActivation( + expanded_channels, expanded_channels, + kernel_size: cnf.kernel, + stride: cnf.stride, + groups: expanded_channels, + norm_layer: norm_layer, + activation_layer: activation_layer)); + + // squeeze and excitation + var squeeze_channels = Math.Max(1, cnf.input_channels / 4); + layers.Add( + torchvision.ops.SqueezeExcitation( + expanded_channels, + squeeze_channels, + activation: () => nn.SiLU(inplace: true))); + + // project + layers.Add(Conv2dNormActivation( + 
expanded_channels, cnf.out_channels, + kernel_size: 1, + norm_layer: norm_layer, + activation_layer: null)); + + block = nn.Sequential(layers); + stochastic_depth = torchvision.ops.StochasticDepth(stochastic_depth_prob, torchvision.StochasticDepth.Mode.Row); + + RegisterComponents(); + } + + public override Tensor forward(Tensor input) + { + var result = block.call(input); + if (use_res_connect) { + result = stochastic_depth.call(result); + result += input; + } + return result; + } + } + + /// + /// FusedMBConv block: Fused Mobile Inverted Bottleneck Conv (no depthwise or SE). + /// + private class FusedMBConv : nn.Module + { + private readonly nn.Module block; + private readonly torchvision.StochasticDepth stochastic_depth; + private readonly bool use_res_connect; + + protected override void Dispose(bool disposing) + { + if (disposing) { + block.Dispose(); + stochastic_depth.Dispose(); + } + base.Dispose(disposing); + } + + public FusedMBConv( + string name, + _MBConvConfig cnf, + double stochastic_depth_prob, + Func> norm_layer) : base(name) + { + if (!(1 <= cnf.stride && cnf.stride <= 2)) + throw new ArgumentException("illegal stride value"); + + use_res_connect = cnf.stride == 1 && cnf.input_channels == cnf.out_channels; + + var layers = new List>(); + Func> activation_layer = (inplace) => nn.SiLU(inplace); + + var expanded_channels = _MBConvConfig.adjust_channels(cnf.input_channels, cnf.expand_ratio); + if (expanded_channels != cnf.input_channels) { + // fused expand + layers.Add(Conv2dNormActivation( + cnf.input_channels, expanded_channels, + kernel_size: cnf.kernel, + stride: cnf.stride, + norm_layer: norm_layer, + activation_layer: activation_layer)); + + // project + layers.Add(Conv2dNormActivation( + expanded_channels, cnf.out_channels, + kernel_size: 1, + norm_layer: norm_layer, + activation_layer: null)); + } else { + layers.Add(Conv2dNormActivation( + cnf.input_channels, cnf.out_channels, + kernel_size: cnf.kernel, + stride: cnf.stride, + norm_layer: 
norm_layer, + activation_layer: activation_layer)); + } + + block = nn.Sequential(layers); + stochastic_depth = torchvision.ops.StochasticDepth(stochastic_depth_prob, torchvision.StochasticDepth.Mode.Row); + + RegisterComponents(); + } + + public override Tensor forward(Tensor input) + { + var result = block.call(input); + if (use_res_connect) { + result = stochastic_depth.call(result); + result += input; + } + return result; + } + } + + private readonly nn.Module features; + private readonly nn.Module avgpool; + private readonly nn.Module classifier; + + protected override void Dispose(bool disposing) + { + if (disposing) { + features.Dispose(); + avgpool.Dispose(); + classifier.Dispose(); + } + base.Dispose(disposing); + } + + /// + /// EfficientNet V1 and V2 main class + /// + /// + /// Network structure + /// The dropout probability + /// The stochastic depth probability + /// Number of classes + /// Module specifying the normalization layer to use + /// The number of channels on the penultimate layer + internal EfficientNet( + string name, + _MBConvConfig[] inverted_residual_setting, + double dropout, + double stochastic_depth_prob = 0.2, + long num_classes = 1000, + Func>? norm_layer = null, + long? 
last_channel = null) : base(name) + { + if (inverted_residual_setting == null || inverted_residual_setting.Length == 0) + throw new ArgumentException("The inverted_residual_setting should not be empty"); + + if (norm_layer == null) + norm_layer = (features) => nn.BatchNorm2d(features); + + var layers = new List>(); + + // building first layer + var firstconv_output_channels = inverted_residual_setting[0].input_channels; + layers.Add(Conv2dNormActivation( + 3, firstconv_output_channels, + kernel_size: 3, stride: 2, + norm_layer: norm_layer, + activation_layer: (inplace) => nn.SiLU(inplace))); + + // building inverted residual blocks + long total_stage_blocks = 0; + foreach (var cnf in inverted_residual_setting) + total_stage_blocks += cnf.num_layers; + + long stage_block_id = 0; + foreach (var cnf in inverted_residual_setting) { + var stage = new List>(); + for (int i = 0; i < cnf.num_layers; i++) { + var block_cnf = cnf.ShallowCopy(); + + // overwrite info if not the first conv in the stage + if (stage.Count > 0) { + block_cnf.input_channels = block_cnf.out_channels; + block_cnf.stride = 1; + } + + // adjust stochastic depth probability based on the depth of the stage block + var sd_prob = stochastic_depth_prob * (double)stage_block_id / total_stage_blocks; + + if (block_cnf.block_type == BlockType.FusedMBConv) { + stage.Add(new FusedMBConv("FusedMBConv", block_cnf, sd_prob, norm_layer)); + } else { + stage.Add(new MBConv("MBConv", block_cnf, sd_prob, norm_layer)); + } + stage_block_id++; + } + layers.Add(nn.Sequential(stage)); + } + + // building last several layers + var lastconv_input_channels = inverted_residual_setting[inverted_residual_setting.Length - 1].out_channels; + var lastconv_output_channels = last_channel.HasValue ? 
last_channel.Value : 4 * lastconv_input_channels; + layers.Add(Conv2dNormActivation( + lastconv_input_channels, lastconv_output_channels, + kernel_size: 1, + norm_layer: norm_layer, + activation_layer: (inplace) => nn.SiLU(inplace))); + + features = nn.Sequential(layers); + avgpool = nn.AdaptiveAvgPool2d(1); + classifier = nn.Sequential( + nn.Dropout(p: dropout, inplace: true), + nn.Linear(lastconv_output_channels, num_classes)); + + RegisterComponents(); + + foreach (var (_, m) in this.named_modules()) { + if (m is Modules.Conv2d) { + var conv = (Modules.Conv2d)m; + nn.init.kaiming_normal_(conv.weight, mode: nn.init.FanInOut.FanOut); + if (conv.bias is not null) { + nn.init.zeros_(conv.bias); + } + } else if (m is Modules.BatchNorm2d) { + var norm = (Modules.BatchNorm2d)m; + nn.init.ones_(norm.weight); + nn.init.zeros_(norm.bias); + } else if (m is Modules.GroupNorm) { + var norm = (Modules.GroupNorm)m; + nn.init.ones_(norm.weight); + nn.init.zeros_(norm.bias); + } else if (m is Modules.Linear) { + var linear = (Modules.Linear)m; + var init_range = 1.0 / Math.Sqrt(linear.weight.shape[0]); + nn.init.uniform_(linear.weight, -init_range, init_range); + nn.init.zeros_(linear.bias); + } + } + } + + public override Tensor forward(Tensor x) + { + using (var _ = NewDisposeScope()) { + x = features.call(x); + x = avgpool.call(x); + x = torch.flatten(x, 1); + x = classifier.call(x); + return x.MoveToOuterDisposeScope(); + } + } + } + } + + public static partial class torchvision + { + public static partial class models + { + private static (EfficientNet._MBConvConfig[], long?) _efficientnet_conf(string arch, double width_mult = 1.0, double depth_mult = 1.0) + { + EfficientNet._MBConvConfig[] inverted_residual_setting; + long? 
last_channel; + + if (arch.StartsWith("efficientnet_b")) { + EfficientNet._MBConvConfig bneck_conf( + double expand_ratio, long kernel, long stride, + long input_channels, long out_channels, long num_layers) => + new EfficientNet.MBConvConfig(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, width_mult, depth_mult); + + inverted_residual_setting = new EfficientNet._MBConvConfig[] { + bneck_conf(1, 3, 1, 32, 16, 1), + bneck_conf(6, 3, 2, 16, 24, 2), + bneck_conf(6, 5, 2, 24, 40, 2), + bneck_conf(6, 3, 2, 40, 80, 3), + bneck_conf(6, 5, 1, 80, 112, 3), + bneck_conf(6, 5, 2, 112, 192, 4), + bneck_conf(6, 3, 1, 192, 320, 1), + }; + last_channel = null; + } else if (arch.StartsWith("efficientnet_v2_s")) { + inverted_residual_setting = new EfficientNet._MBConvConfig[] { + new EfficientNet.FusedMBConvConfig(1, 3, 1, 24, 24, 2), + new EfficientNet.FusedMBConvConfig(4, 3, 2, 24, 48, 4), + new EfficientNet.FusedMBConvConfig(4, 3, 2, 48, 64, 4), + new EfficientNet.MBConvConfig(4, 3, 2, 64, 128, 6), + new EfficientNet.MBConvConfig(6, 3, 1, 128, 160, 9), + new EfficientNet.MBConvConfig(6, 3, 2, 160, 256, 15), + }; + last_channel = 1280; + } else if (arch.StartsWith("efficientnet_v2_m")) { + inverted_residual_setting = new EfficientNet._MBConvConfig[] { + new EfficientNet.FusedMBConvConfig(1, 3, 1, 24, 24, 3), + new EfficientNet.FusedMBConvConfig(4, 3, 2, 24, 48, 5), + new EfficientNet.FusedMBConvConfig(4, 3, 2, 48, 80, 5), + new EfficientNet.MBConvConfig(4, 3, 2, 80, 160, 7), + new EfficientNet.MBConvConfig(6, 3, 1, 160, 176, 14), + new EfficientNet.MBConvConfig(6, 3, 2, 176, 304, 18), + new EfficientNet.MBConvConfig(6, 3, 1, 304, 512, 5), + }; + last_channel = 1280; + } else if (arch.StartsWith("efficientnet_v2_l")) { + inverted_residual_setting = new EfficientNet._MBConvConfig[] { + new EfficientNet.FusedMBConvConfig(1, 3, 1, 32, 32, 4), + new EfficientNet.FusedMBConvConfig(4, 3, 2, 32, 64, 7), + new EfficientNet.FusedMBConvConfig(4, 3, 2, 64, 96, 7), + 
new EfficientNet.MBConvConfig(4, 3, 2, 96, 192, 10), + new EfficientNet.MBConvConfig(6, 3, 1, 192, 224, 19), + new EfficientNet.MBConvConfig(6, 3, 2, 224, 384, 25), + new EfficientNet.MBConvConfig(6, 3, 1, 384, 640, 7), + }; + last_channel = 1280; + } else { + throw new ArgumentException($"Unsupported model type {arch}"); + } + + return (inverted_residual_setting, last_channel); + } + + private static Modules.EfficientNet _efficientnet( + EfficientNet._MBConvConfig[] inverted_residual_setting, + double dropout, + long? last_channel, + long num_classes = 1000, + Func>? norm_layer = null, + string? weights_file = null, + bool skipfc = true, + Device? device = null) + { + var model = new EfficientNet("EfficientNet", inverted_residual_setting, dropout, num_classes: num_classes, norm_layer: norm_layer, last_channel: last_channel); + + if (!string.IsNullOrEmpty(weights_file)) { + model.load(weights_file!, skip: skipfc ? new[] { "classifier.1.weight", "classifier.1.bias" } : null); + } + + if (device != null && device.type != DeviceType.CPU) + model.to(device); + + return model; + } + + /// + /// EfficientNet B0 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b0(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + /// In order for the weights to be loaded, the number of classes has to be the same as + /// in the pre-trained model, which is 1000. + /// + /// It is also possible to skip loading the last linear layer and use it for transfer-learning + /// with a different number of output classes. To do so, pass skipfc=true. + /// + /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB + /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded + /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225]. + /// + public static Modules.EfficientNet efficientnet_b0(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b0", width_mult: 1.0, depth_mult: 1.0); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B1 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. 
+ /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b1(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b1(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b1", width_mult: 1.0, depth_mult: 1.1); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B2 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b2(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b2(int num_classes = 1000, float dropout = 0.3f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b2", width_mult: 1.1, depth_mult: 1.2); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B3 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b3(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b3(int num_classes = 1000, float dropout = 0.3f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b3", width_mult: 1.2, depth_mult: 1.4); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B4 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b4(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b4(int num_classes = 1000, float dropout = 0.4f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b4", width_mult: 1.4, depth_mult: 1.8); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B5 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b5(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b5(int num_classes = 1000, float dropout = 0.4f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b5", width_mult: 1.6, depth_mult: 2.2); + Func> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001, momentum: 0.01); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B6 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b6(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b6(int num_classes = 1000, float dropout = 0.5f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b6", width_mult: 1.8, depth_mult: 2.6); + Func> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001, momentum: 0.01); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B7 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b7(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b7(int num_classes = 1000, float dropout = 0.5f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b7", width_mult: 2.0, depth_mult: 3.1); + Func> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001, momentum: 0.01); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// Constructs an EfficientNetV2-S architecture from + /// EfficientNetV2: Smaller Models and Faster Training. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_v2_s(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_v2_s(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_v2_s"); + Func> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// Constructs an EfficientNetV2-M architecture from + /// EfficientNetV2: Smaller Models and Faster Training. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_v2_m(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_v2_m(int num_classes = 1000, float dropout = 0.3f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_v2_m"); + Func> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// Constructs an EfficientNetV2-L architecture from + /// EfficientNetV2: Smaller Models and Faster Training. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_v2_l(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_v2_l(int num_classes = 1000, float dropout = 0.4f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_v2_l"); + Func> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device); + } + } + } +} diff --git a/src/TorchVision/models/MNASNet.cs b/src/TorchVision/models/MNASNet.cs new file mode 100644 index 000000000..7210f3268 --- /dev/null +++ b/src/TorchVision/models/MNASNet.cs @@ -0,0 +1,299 @@ +// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. 
+ +// A number of implementation details in this file have been translated from the Python version of torchvision, +// largely located in the files found in this folder: +// +// https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py +// +// The origin has the following copyright notice and license: +// +// https://github.com/pytorch/vision/blob/main/LICENSE +// + +using System; +using System.Collections.Generic; + +using static TorchSharp.torch; +using static TorchSharp.torch.nn; + +#nullable enable +namespace TorchSharp +{ + public static partial class torchvision + { + public static partial class models + { + /// + /// MNASNet with depth multiplier of 0.5 from + /// "MnasNet: Platform-Aware Neural Architecture Search for Mobile". + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.mnasnet0_5(pretrained=True) + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + /// In order for the weights to be loaded, the number of classes has to be the same as + /// in the pre-trained model, which is 1000. + /// + /// It is also possible to skip loading the last linear layer and use it for transfer-learning + /// with a different number of output classes. To do so, pass skipfc=true. + /// + /// All pre-trained models expect input images normalized in the same way, i.e. 
mini-batches of 3-channel RGB + /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded + /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225]. + /// + public static Modules.MNASNet mnasnet0_5(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + return new Modules.MNASNet(0.5, num_classes, dropout, weights_file, skipfc, device); + } + + /// + /// MNASNet with depth multiplier of 0.75 from + /// "MnasNet: Platform-Aware Neural Architecture Search for Mobile". + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + public static Modules.MNASNet mnasnet0_75(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + return new Modules.MNASNet(0.75, num_classes, dropout, weights_file, skipfc, device); + } + + /// + /// MNASNet with depth multiplier of 1.0 from + /// "MnasNet: Platform-Aware Neural Architecture Search for Mobile". + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + public static Modules.MNASNet mnasnet1_0(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + return new Modules.MNASNet(1.0, num_classes, dropout, weights_file, skipfc, device); + } + + /// + /// MNASNet with depth multiplier of 1.3 from + /// "MnasNet: Platform-Aware Neural Architecture Search for Mobile". 
+ /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + public static Modules.MNASNet mnasnet1_3(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + return new Modules.MNASNet(1.3, num_classes, dropout, weights_file, skipfc, device); + } + } + } + + namespace Modules + { + // Based on https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py + // License: https://github.com/pytorch/vision/blob/main/LICENSE + + /// + /// MNASNet, as described in https://arxiv.org/abs/1807.11626. + /// This implements the B1 variant of the model. + /// + public class MNASNet : Module + { + // Paper suggests 0.9997 momentum, for TensorFlow. Equivalent PyTorch momentum is 1.0 - tensorflow. + private const double _BN_MOMENTUM = 1.0 - 0.9997; + + private class _InvertedResidual : Module + { + private readonly bool apply_residual; + private readonly Module layers; + + public _InvertedResidual(string name, long in_ch, long out_ch, long kernel_size, long stride, long expansion_factor, double bn_momentum) + : base(name) + { + if (stride != 1 && stride != 2) + throw new ArgumentOutOfRangeException($"stride should be 1 or 2 instead of {stride}"); + if (kernel_size != 3 && kernel_size != 5) + throw new ArgumentOutOfRangeException($"kernel_size should be 3 or 5 instead of {kernel_size}"); + + var mid_ch = in_ch * expansion_factor; + apply_residual = in_ch == out_ch && stride == 1; + layers = Sequential( + // Pointwise + Conv2d(in_ch, mid_ch, 1, bias: false), + BatchNorm2d(mid_ch, momentum: bn_momentum), + ReLU(inplace: true), + // Depthwise + Conv2d(mid_ch, mid_ch, kernel_size, padding: kernel_size / 2, stride: stride, groups: mid_ch, bias: false), + BatchNorm2d(mid_ch, momentum: 
bn_momentum), + ReLU(inplace: true), + // Linear pointwise. Note that there's no activation. + Conv2d(mid_ch, out_ch, 1, bias: false), + BatchNorm2d(out_ch, momentum: bn_momentum) + ); + RegisterComponents(); + } + + protected override void Dispose(bool disposing) + { + if (disposing) { + layers.Dispose(); + } + base.Dispose(disposing); + } + + public override Tensor forward(Tensor input) + { + if (apply_residual) { + return layers.call(input) + input; + } else { + return layers.call(input); + } + } + } + + /// + /// Creates a stack of inverted residuals. + /// + private static Module _stack(long in_ch, long out_ch, long kernel_size, long stride, long exp_factor, int repeats, double bn_momentum) + { + if (repeats < 1) + throw new ArgumentOutOfRangeException($"repeats should be >= 1, instead got {repeats}"); + + var modules = new List>(); + // First one has no skip, because feature map size changes. + modules.Add(new _InvertedResidual("_InvertedResidual", in_ch, out_ch, kernel_size, stride, exp_factor, bn_momentum)); + for (int i = 1; i < repeats; i++) { + modules.Add(new _InvertedResidual("_InvertedResidual", out_ch, out_ch, kernel_size, 1, exp_factor, bn_momentum)); + } + return Sequential(modules); + } + + /// + /// Asymmetric rounding to make val divisible by divisor. + /// With default bias, will round up, unless the number is no more than 10% greater + /// than the smaller divisible value, i.e. (83, 8) -> 80, but (84, 8) -> 88. + /// + private static int _round_to_multiple_of(double val, int divisor, double round_up_bias = 0.9) + { + if (round_up_bias <= 0.0 || round_up_bias >= 1.0) + throw new ArgumentOutOfRangeException($"round_up_bias should be greater than 0.0 and smaller than 1.0 instead of {round_up_bias}"); + var new_val = Math.Max(divisor, (int)(val + divisor / 2) / divisor * divisor); + return new_val >= round_up_bias * val ? 
new_val : new_val + divisor; + } + + /// + /// Scales tensor depths as in reference MobileNet code, prefers rounding up rather than down. + /// + private static int[] _get_depths(double alpha) + { + var depths = new int[] { 32, 16, 24, 40, 80, 96, 192, 320 }; + var result = new int[depths.Length]; + for (int i = 0; i < depths.Length; i++) { + result[i] = _round_to_multiple_of(depths[i] * alpha, 8); + } + return result; + } + + private readonly Module layers; + private readonly Module classifier; + + protected override void Dispose(bool disposing) + { + if (disposing) { + layers.Dispose(); + classifier.Dispose(); + } + base.Dispose(disposing); + } + + public MNASNet(double alpha, int num_classes = 1000, float dropout = 0.2f, + string? weights_file = null, bool skipfc = true, Device? device = null) + : base(nameof(MNASNet)) + { + if (alpha <= 0.0) + throw new ArgumentOutOfRangeException($"alpha should be greater than 0.0 instead of {alpha}"); + + var depths = _get_depths(alpha); + var layerList = new List> { + // First layer: regular conv. + Conv2d(3, depths[0], 3, padding: 1, stride: 2, bias: false), + BatchNorm2d(depths[0], momentum: _BN_MOMENTUM), + ReLU(inplace: true), + // Depthwise separable, no skip. + Conv2d(depths[0], depths[0], 3, padding: 1, stride: 1, groups: depths[0], bias: false), + BatchNorm2d(depths[0], momentum: _BN_MOMENTUM), + ReLU(inplace: true), + Conv2d(depths[0], depths[1], 1, padding: 0L, stride: 1, bias: false), + BatchNorm2d(depths[1], momentum: _BN_MOMENTUM), + // MNASNet blocks: stacks of inverted residuals. + _stack(depths[1], depths[2], 3, 2, 3, 3, _BN_MOMENTUM), + _stack(depths[2], depths[3], 5, 2, 3, 3, _BN_MOMENTUM), + _stack(depths[3], depths[4], 5, 2, 6, 3, _BN_MOMENTUM), + _stack(depths[4], depths[5], 3, 1, 6, 2, _BN_MOMENTUM), + _stack(depths[5], depths[6], 5, 2, 6, 4, _BN_MOMENTUM), + _stack(depths[6], depths[7], 3, 1, 6, 1, _BN_MOMENTUM), + // Final mapping to classifier input. 
+ Conv2d(depths[7], 1280, 1, padding: 0L, stride: 1, bias: false), + BatchNorm2d(1280, momentum: _BN_MOMENTUM), + ReLU(inplace: true), + }; + layers = Sequential(layerList); + classifier = Sequential( + Dropout(p: dropout, inplace: true), + Linear(1280, num_classes) + ); + + RegisterComponents(); + + // Weight initialization + foreach (var (_, m) in named_modules()) { + if (m is Modules.Conv2d conv) { + init.kaiming_normal_(conv.weight, mode: init.FanInOut.FanOut); + if (conv.bias is not null) + init.zeros_(conv.bias); + } else if (m is Modules.BatchNorm2d norm) { + init.ones_(norm.weight); + init.zeros_(norm.bias); + } else if (m is Modules.Linear linear) { + init.kaiming_uniform_(linear.weight, mode: init.FanInOut.FanOut, nonlinearity: init.NonlinearityType.Sigmoid); + init.zeros_(linear.bias); + } + } + + if (!string.IsNullOrEmpty(weights_file)) { + this.load(weights_file!, skip: skipfc ? new[] { "classifier.1.weight", "classifier.1.bias" } : null); + } + + if (device != null && device.type != DeviceType.CPU) + this.to(device); + } + + public override Tensor forward(Tensor x) + { + using (var _ = NewDisposeScope()) { + x = layers.call(x); + // Equivalent to global avgpool and removing H and W dimensions. + x = x.mean(new long[] { 2, 3 }); + return classifier.call(x).MoveToOuterDisposeScope(); + } + } + } + } +} diff --git a/src/TorchVision/models/ShuffleNetV2.cs b/src/TorchVision/models/ShuffleNetV2.cs new file mode 100644 index 000000000..3c7b5348e --- /dev/null +++ b/src/TorchVision/models/ShuffleNetV2.cs @@ -0,0 +1,316 @@ +// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. 
+ +// A number of implementation details in this file have been translated from the Python version of torchvision, +// largely located in the files found in this folder: +// +// https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py +// +// The origin has the following copyright notice and license: +// +// https://github.com/pytorch/vision/blob/main/LICENSE +// + +using System; +using System.Collections.Generic; +using static TorchSharp.torch; +using static TorchSharp.torch.nn; + +#nullable enable +namespace TorchSharp +{ + public static partial class torchvision + { + public static partial class models + { + /// + /// ShuffleNet V2 with 0.5x output channels, as described in + /// "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design". + /// + /// The number of output classes. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.shufflenet_v2_x0_5(pretrained=True) + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB + /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded + /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225]. + /// + public static Modules.ShuffleNetV2 shufflenet_v2_x0_5( + int num_classes = 1000, + string? 
weights_file = null, + bool skipfc = true, + Device? device = null) + { + return new Modules.ShuffleNetV2( + new int[] { 4, 8, 4 }, + new int[] { 24, 48, 96, 192, 1024 }, + num_classes, weights_file, skipfc, device); + } + + /// + /// ShuffleNet V2 with 1.0x output channels, as described in + /// "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design". + /// + /// The number of output classes. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + public static Modules.ShuffleNetV2 shufflenet_v2_x1_0( + int num_classes = 1000, + string? weights_file = null, + bool skipfc = true, + Device? device = null) + { + return new Modules.ShuffleNetV2( + new int[] { 4, 8, 4 }, + new int[] { 24, 116, 232, 464, 1024 }, + num_classes, weights_file, skipfc, device); + } + + /// + /// ShuffleNet V2 with 1.5x output channels, as described in + /// "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design". + /// + /// The number of output classes. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + public static Modules.ShuffleNetV2 shufflenet_v2_x1_5( + int num_classes = 1000, + string? weights_file = null, + bool skipfc = true, + Device? device = null) + { + return new Modules.ShuffleNetV2( + new int[] { 4, 8, 4 }, + new int[] { 24, 176, 352, 704, 1024 }, + num_classes, weights_file, skipfc, device); + } + + /// + /// ShuffleNet V2 with 2.0x output channels, as described in + /// "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design". + /// + /// The number of output classes. + /// The location of a file containing pre-trained weights for the model. 
+ /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + public static Modules.ShuffleNetV2 shufflenet_v2_x2_0( + int num_classes = 1000, + string? weights_file = null, + bool skipfc = true, + Device? device = null) + { + return new Modules.ShuffleNetV2( + new int[] { 4, 8, 4 }, + new int[] { 24, 244, 488, 976, 2048 }, + num_classes, weights_file, skipfc, device); + } + } + } + + namespace Modules + { + // Based on https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py + // License: https://github.com/pytorch/vision/blob/main/LICENSE + + public class ShuffleNetV2 : Module + { + private static Tensor channel_shuffle(Tensor x, int groups) + { + var batchsize = x.shape[0]; + var num_channels = x.shape[1]; + var height = x.shape[2]; + var width = x.shape[3]; + var channels_per_group = num_channels / groups; + + x = x.view(batchsize, groups, channels_per_group, height, width); + x = x.transpose(1, 2).contiguous(); + x = x.view(batchsize, num_channels, height, width); + return x; + } + + private static Module depthwise_conv( + long i, long o, long kernel_size, long stride = 1, long padding = 0, bool bias = false) + { + return Conv2d(i, o, kernel_size: kernel_size, stride: stride, padding: padding, bias: bias, groups: i); + } + + private class InvertedResidual : Module + { + private readonly Module branch1; + private readonly Module branch2; + private readonly int _stride; + + public InvertedResidual(string name, long inp, long oup, int stride) : base(name) + { + if (stride < 1 || stride > 3) + throw new ArgumentException("illegal stride value", nameof(stride)); + + _stride = stride; + var branch_features = oup / 2; + + if (stride > 1) { + branch1 = Sequential( + depthwise_conv(inp, inp, kernel_size: 3, stride: stride, padding: 1), + BatchNorm2d(inp), + Conv2d(inp, branch_features, kernel_size: 1, stride: 1, padding: 0L, bias: false), + BatchNorm2d(branch_features), 
+                    ReLU(inplace: true)
+                );
+            } else {
+                branch1 = Sequential();
+            }
+
+            branch2 = Sequential(
+                Conv2d(stride > 1 ? inp : branch_features, branch_features, kernel_size: 1, stride: 1, padding: 0L, bias: false),
+                BatchNorm2d(branch_features),
+                ReLU(inplace: true),
+                depthwise_conv(branch_features, branch_features, kernel_size: 3, stride: stride, padding: 1),
+                BatchNorm2d(branch_features),
+                Conv2d(branch_features, branch_features, kernel_size: 1, stride: 1, padding: 0L, bias: false),
+                BatchNorm2d(branch_features),
+                ReLU(inplace: true)
+            );
+
+            RegisterComponents();
+        }
+
+        protected override void Dispose(bool disposing)
+        {
+            if (disposing) {
+                branch1.Dispose();
+                branch2.Dispose();
+            }
+            base.Dispose(disposing);
+        }
+
+        public override Tensor forward(Tensor x)
+        {
+            Tensor @out;
+            if (_stride == 1) {
+                // Stride-1 unit: split channels in half, transform one half, keep the other.
+                var chunks = x.chunk(2, dim: 1);
+                @out = torch.cat(new[] { chunks[0], branch2.call(chunks[1]) }, 1);
+            } else {
+                // Downsampling unit: both branches see the full input.
+                @out = torch.cat(new[] { branch1.call(x), branch2.call(x) }, 1);
+            }
+            @out = channel_shuffle(@out, 2);
+            return @out;
+        }
+    }
+
+    private readonly Module<Tensor, Tensor> conv1;
+    private readonly Module<Tensor, Tensor> maxpool;
+    private readonly Module<Tensor, Tensor> stage2;
+    private readonly Module<Tensor, Tensor> stage3;
+    private readonly Module<Tensor, Tensor> stage4;
+    private readonly Module<Tensor, Tensor> conv5;
+    private readonly Module<Tensor, Tensor> fc;
+
+    protected override void Dispose(bool disposing)
+    {
+        if (disposing) {
+            conv1.Dispose(); maxpool.Dispose();
+            stage2.Dispose(); stage3.Dispose(); stage4.Dispose();
+            conv5.Dispose(); fc.Dispose();
+        }
+        base.Dispose(disposing);
+    }
+
+    // Builds one ShuffleNetV2 stage: a stride-2 downsampling unit followed by (repeats - 1) stride-1 units.
+    private static Module<Tensor, Tensor> MakeStage(long input_channels, long output_channels, int repeats)
+    {
+        var modules = new List<Module<Tensor, Tensor>>();
+        modules.Add(new InvertedResidual("InvertedResidual", input_channels, output_channels, 2));
+        for (int i = 0; i < repeats - 1; i++) {
+            modules.Add(new InvertedResidual("InvertedResidual", output_channels, output_channels, 1));
+        }
+        return Sequential(modules.ToArray());
+    }
+
+    /// <summary>
+    /// ShuffleNet V2 main class.
+    /// </summary>
+    /// <param name="stages_repeats">Number of repeated blocks in each stage.</param>
+    /// <param name="stages_out_channels">Output channels for each stage.</param>
+    /// <param name="num_classes">Number of output classes.</param>
+    /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+    /// <param name="skipfc">If true, the last linear layer will not be loaded from the weights file.</param>
+    /// <param name="device">The device to locate the model on.</param>
+    public ShuffleNetV2(
+        int[] stages_repeats,
+        int[] stages_out_channels,
+        int num_classes = 1000,
+        string? weights_file = null,
+        bool skipfc = true,
+        Device? device = null) : base(nameof(ShuffleNetV2))
+    {
+        if (stages_repeats.Length != 3)
+            throw new ArgumentException("expected stages_repeats to have 3 elements");
+        if (stages_out_channels.Length != 5)
+            throw new ArgumentException("expected stages_out_channels to have 5 elements");
+
+        long input_channels = 3;
+        long output_channels = stages_out_channels[0];
+
+        conv1 = Sequential(
+            Conv2d(input_channels, output_channels, kernel_size: 3, stride: 2, padding: 1, bias: false),
+            BatchNorm2d(output_channels),
+            ReLU(inplace: true)
+        );
+        input_channels = output_channels;
+
+        maxpool = MaxPool2d(kernel_size: 3, stride: 2, padding: 1);
+
+        stage2 = MakeStage(input_channels, stages_out_channels[1], stages_repeats[0]);
+        stage3 = MakeStage(stages_out_channels[1], stages_out_channels[2], stages_repeats[1]);
+        stage4 = MakeStage(stages_out_channels[2], stages_out_channels[3], stages_repeats[2]);
+
+        output_channels = stages_out_channels[4];
+        conv5 = Sequential(
+            Conv2d(stages_out_channels[3], output_channels, kernel_size: 1, stride: 1, padding: 0L, bias: false),
+            BatchNorm2d(output_channels),
+            ReLU(inplace: true)
+        );
+
+        fc = Linear(output_channels, num_classes);
+
+        RegisterComponents();
+
+        if (!string.IsNullOrEmpty(weights_file)) {
+            this.load(weights_file!, skip: skipfc ? new[] { "fc.weight", "fc.bias" } : null);
+        }
+
+        if (device != null && device.type != DeviceType.CPU)
+            this.to(device);
+    }
+
+    public override Tensor forward(Tensor x)
+    {
+        using (var _ = NewDisposeScope()) {
+            x = conv1.call(x);
+            x = maxpool.call(x);
+            x = stage2.call(x);
+            x = stage3.call(x);
+            x = stage4.call(x);
+            x = conv5.call(x);
+            x = x.mean(new long[] { 2, 3 }); // global pool
+            x = fc.call(x);
+            return x.MoveToOuterDisposeScope();
+        }
+    }
+}
+}
+}
diff --git a/src/TorchVision/models/SqueezeNet.cs b/src/TorchVision/models/SqueezeNet.cs
new file mode 100644
index 000000000..34df94020
--- /dev/null
+++ b/src/TorchVision/models/SqueezeNet.cs
@@ -0,0 +1,257 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python version of torchvision,
+// largely located in the files found in this folder:
+//
+// https://github.com/pytorch/vision/blob/main/torchvision/models/squeezenet.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/pytorch/vision/blob/main/LICENSE
+//
+
+using System;
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+
+#nullable enable
+namespace TorchSharp
+{
+    public static partial class torchvision
+    {
+        public static partial class models
+        {
+            /// <summary>
+            /// SqueezeNet 1.0 model from
+            /// "SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and less than 0.5MB model size".
+            /// </summary>
+            /// <param name="num_classes">The number of output classes.</param>
+            /// <param name="dropout">The dropout ratio.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last convolutional layer of the classifier will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            /// <remarks>
+            /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+            /// using the exportsd.py script, then loading into the .NET instance:
+            ///
+            /// from torchvision import models
+            /// import exportsd
+            ///
+            /// model = models.squeezenet1_0(pretrained=True)
+            /// f = open("model_weights.dat", "wb")
+            /// exportsd.save_state_dict(model.state_dict(), f)
+            /// f.close()
+            ///
+            /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+            ///
+            /// In order for the weights to be loaded, the number of classes has to be the same as
+            /// in the pre-trained model, which is 1000.
+            ///
+            /// It is also possible to skip loading the last classifier layer and use it for transfer-learning
+            /// with a different number of output classes. To do so, pass skipfc=true.
+            ///
+            /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB
+            /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded
+            /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
+            /// </remarks>
+            public static Modules.SqueezeNet squeezenet1_0(
+                int num_classes = 1000,
+                float dropout = 0.5f,
+                string? weights_file = null,
+                bool skipfc = true,
+                Device? device = null)
+            {
+                return new Modules.SqueezeNet("1_0", num_classes, dropout, weights_file, skipfc, device);
+            }
+
+            /// <summary>
+            /// SqueezeNet 1.1 model from the official SqueezeNet repo.
+            /// SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters than SqueezeNet 1.0.
+            /// </summary>
+            /// <param name="num_classes">The number of output classes.</param>
+            /// <param name="dropout">The dropout ratio.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last convolutional layer of the classifier will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            /// <remarks>
+            /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+            /// using the exportsd.py script, then loading into the .NET instance:
+            ///
+            /// from torchvision import models
+            /// import exportsd
+            ///
+            /// model = models.squeezenet1_1(pretrained=True)
+            /// f = open("model_weights.dat", "wb")
+            /// exportsd.save_state_dict(model.state_dict(), f)
+            /// f.close()
+            ///
+            /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+            ///
+            /// In order for the weights to be loaded, the number of classes has to be the same as
+            /// in the pre-trained model, which is 1000.
+            ///
+            /// It is also possible to skip loading the last classifier layer and use it for transfer-learning
+            /// with a different number of output classes. To do so, pass skipfc=true.
+            ///
+            /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB
+            /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded
+            /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
+            /// </remarks>
+            public static Modules.SqueezeNet squeezenet1_1(
+                int num_classes = 1000,
+                float dropout = 0.5f,
+                string? weights_file = null,
+                bool skipfc = true,
+                Device? device = null)
+            {
+                return new Modules.SqueezeNet("1_1", num_classes, dropout, weights_file, skipfc, device);
+            }
+        }
+    }
+
+    namespace Modules
+    {
+        // Based on https://github.com/pytorch/vision/blob/main/torchvision/models/squeezenet.py
+        // License: https://github.com/pytorch/vision/blob/main/LICENSE
+
+        public class SqueezeNet : Module<Tensor, Tensor>
+        {
+            // Fire module: a 1x1 "squeeze" convolution feeding parallel 1x1 and 3x3 "expand" convolutions,
+            // whose outputs are concatenated along the channel dimension.
+            private class Fire : Module<Tensor, Tensor>
+            {
+                private readonly Module<Tensor, Tensor> squeeze;
+                private readonly Module<Tensor, Tensor> squeeze_activation;
+                private readonly Module<Tensor, Tensor> expand1x1;
+                private readonly Module<Tensor, Tensor> expand1x1_activation;
+                private readonly Module<Tensor, Tensor> expand3x3;
+                private readonly Module<Tensor, Tensor> expand3x3_activation;
+
+                public Fire(string name, int inplanes, int squeeze_planes, int expand1x1_planes, int expand3x3_planes)
+                    : base(name)
+                {
+                    squeeze = Conv2d(inplanes, squeeze_planes, kernel_size: 1);
+                    squeeze_activation = ReLU(inplace: true);
+                    expand1x1 = Conv2d(squeeze_planes, expand1x1_planes, kernel_size: 1);
+                    expand1x1_activation = ReLU(inplace: true);
+                    expand3x3 = Conv2d(squeeze_planes, expand3x3_planes, kernel_size: 3, padding: 1);
+                    expand3x3_activation = ReLU(inplace: true);
+                    RegisterComponents();
+                }
+
+                protected override void Dispose(bool disposing)
+                {
+                    if (disposing) {
+                        squeeze.Dispose();
+                        squeeze_activation.Dispose();
+                        expand1x1.Dispose();
+                        expand1x1_activation.Dispose();
+                        expand3x3.Dispose();
+                        expand3x3_activation.Dispose();
+                    }
+                    base.Dispose(disposing);
+                }
+
+                public override Tensor forward(Tensor x)
+                {
+                    x = squeeze_activation.call(squeeze.call(x));
+                    return torch.cat(new[] {
+                        expand1x1_activation.call(expand1x1.call(x)),
+                        expand3x3_activation.call(expand3x3.call(x))
+                    }, 1);
+                }
+            }
+
+            private readonly Module<Tensor, Tensor> features;
+            private readonly Module<Tensor, Tensor> classifier;
+
+            protected override void Dispose(bool disposing)
+            {
+                if (disposing) {
+                    features.Dispose();
+                    classifier.Dispose();
+                }
+                base.Dispose(disposing);
+            }
+
+            public SqueezeNet(string version, int num_classes = 1000, float dropout = 0.5f,
+                string? weights_file = null, bool skipfc = true, Device? device = null)
+                : base(nameof(SqueezeNet))
+            {
+                Module<Tensor, Tensor> final_conv;
+
+                if (version == "1_0") {
+                    features = Sequential(
+                        Conv2d(3, 96, kernel_size: 7, stride: 2),
+                        ReLU(inplace: true),
+                        MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+                        new Fire("Fire", 96, 16, 64, 64),
+                        new Fire("Fire", 128, 16, 64, 64),
+                        new Fire("Fire", 128, 32, 128, 128),
+                        MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+                        new Fire("Fire", 256, 32, 128, 128),
+                        new Fire("Fire", 256, 48, 192, 192),
+                        new Fire("Fire", 384, 48, 192, 192),
+                        new Fire("Fire", 384, 64, 256, 256),
+                        MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+                        new Fire("Fire", 512, 64, 256, 256)
+                    );
+                } else if (version == "1_1") {
+                    features = Sequential(
+                        Conv2d(3, 64, kernel_size: 3, stride: 2),
+                        ReLU(inplace: true),
+                        MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+                        new Fire("Fire", 64, 16, 64, 64),
+                        new Fire("Fire", 128, 16, 64, 64),
+                        MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+                        new Fire("Fire", 128, 32, 128, 128),
+                        new Fire("Fire", 256, 32, 128, 128),
+                        MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+                        new Fire("Fire", 256, 48, 192, 192),
+                        new Fire("Fire", 384, 48, 192, 192),
+                        new Fire("Fire", 384, 64, 256, 256),
+                        new Fire("Fire", 512, 64, 256, 256)
+                    );
+                } else {
+                    throw new ArgumentException($"Unsupported SqueezeNet version {version}: 1_0 or 1_1 expected");
+                }
+
+                final_conv = Conv2d(512, num_classes, kernel_size: 1);
+                classifier = Sequential(
+                    Dropout(p: dropout),
+                    final_conv,
+                    ReLU(inplace: true),
+                    AdaptiveAvgPool2d(new long[] { 1, 1 })
+                );
+
+                RegisterComponents();
+
+                if (string.IsNullOrEmpty(weights_file)) {
+                    // No weights file: initialize convolutions the same way torchvision does.
+                    foreach (var (_, m) in named_modules()) {
+                        if (m is Modules.Conv2d conv) {
+                            if (object.ReferenceEquals(m, final_conv)) {
+                                nn.init.normal_(conv.weight, mean: 0.0, std: 0.01);
+                            } else {
+                                nn.init.kaiming_uniform_(conv.weight);
+                            }
+                            if (conv.bias is not null)
+                                nn.init.constant_(conv.bias, 0);
+                        }
+                    }
+                } else {
+                    this.load(weights_file!, skip: skipfc ? new[] { "classifier.1.weight", "classifier.1.bias" } : null);
+                }
+
+                if (device != null && device.type != DeviceType.CPU)
+                    this.to(device);
+            }
+
+            public override Tensor forward(Tensor x)
+            {
+                using (var _ = NewDisposeScope()) {
+                    x = features.call(x);
+                    x = classifier.call(x);
+                    return torch.flatten(x, 1).MoveToOuterDisposeScope();
+                }
+            }
+        }
+    }
+}
diff --git a/test/TorchSharpTest/TestTorchVision.cs b/test/TorchSharpTest/TestTorchVision.cs
index c8f1bc341..e534ef36a 100644
--- a/test/TorchSharpTest/TestTorchVision.cs
+++ b/test/TorchSharpTest/TestTorchVision.cs
@@ -799,6 +799,203 @@ public void TestMobileNetV3()
             }
         }
 
+        [Fact]
+        public void TestSqueezeNet()
+        {
+            {
+                using var model = squeezenet1_0();
+                var sd = model.state_dict();
+                Assert.Equal(52, sd.Count);
+                var names = model.named_children().Select(nm => nm.name).ToArray();
+                Assert.Multiple(
+                    () => Assert.Equal("features", names[0]),
+                    () => Assert.Equal("classifier", names[1])
+                );
+
+                using var input = torch.randn(2, 3, 224, 224);
+                using var output = model.call(input);
+
+                Assert.Equal(new long[] { 2, 1000 }, output.shape);
+            }
+            {
+                using var model = squeezenet1_1();
+                var sd = model.state_dict();
+                Assert.Equal(52, sd.Count);
+                var names = model.named_children().Select(nm => nm.name).ToArray();
+                Assert.Multiple(
+                    () => Assert.Equal("features", names[0]),
+                    () => Assert.Equal("classifier", names[1])
+                );
+
+                using var input = torch.randn(2, 3, 224, 224);
+                using var output = model.call(input);
+
+                Assert.Equal(new long[] { 2, 1000 }, output.shape);
+            }
+        }
+
+        [Fact]
+        public void TestDenseNet121()
+        {
+            using var model = densenet121();
+            var sd = model.state_dict();
+            Assert.Equal(727, sd.Count);
+            var names = model.named_children().Select(nm => nm.name).ToArray();
+            Assert.Multiple(
+                () => Assert.Equal("features", names[0]),
+                () => Assert.Equal("classifier", names[1])
+            );
+
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestDenseNet161()
+        {
+            using var model = densenet161();
+            var names = model.named_children().Select(nm => nm.name).ToArray();
+            Assert.Multiple(
+                () => Assert.Equal("features", names[0]),
+                () => Assert.Equal("classifier", names[1])
+            );
+
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestDenseNet169()
+        {
+            using var model = densenet169();
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestDenseNet201()
+        {
+            using var model = densenet201();
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
+        [Fact]
+        public void TestShuffleNetV2()
+        {
+            using (var model = shufflenet_v2_x1_0()) {
+                var names = model.named_children().Select(nm => nm.name).ToArray();
+                Assert.Multiple(
+                    () => Assert.Equal("conv1", names[0]),
+                    () => Assert.Equal("maxpool", names[1]),
+                    () => Assert.Equal("stage2", names[2]),
+                    () => Assert.Equal("stage3", names[3]),
+                    () => Assert.Equal("stage4", names[4]),
+                    () => Assert.Equal("conv5", names[5]),
+                    () => Assert.Equal("fc", names[6])
+                );
+
+                using var input = torch.randn(2, 3, 224, 224);
+                using var output = model.call(input);
+
+                Assert.Equal(new long[] { 2, 1000 }, output.shape);
+            }
+
+            using (var model = shufflenet_v2_x0_5()) {
+                using var input = torch.randn(2, 3, 224, 224);
+                using var output = model.call(input);
+                Assert.Equal(new long[] { 2, 1000 }, output.shape);
+            }
+        }
+
+        [Fact]
+        public void TestEfficientNetB0()
+        {
+            using var model = efficientnet_b0();
+            var sd = model.state_dict();
+            Assert.Equal(360, sd.Count);
+            var names = model.named_children().Select(nm => nm.name).ToArray();
+            Assert.Multiple(
+                () => Assert.Equal("features", names[0]),
+                () => Assert.Equal("avgpool", names[1]),
+                () => Assert.Equal("classifier", names[2])
+            );
+
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetV2S()
+        {
+            using var model = efficientnet_v2_s();
+            var sd = model.state_dict();
+            Assert.Equal(782, sd.Count);
+            var names = model.named_children().Select(nm => nm.name).ToArray();
+            Assert.Multiple(
+                () => Assert.Equal("features", names[0]),
+                () => Assert.Equal("avgpool", names[1]),
+                () => Assert.Equal("classifier", names[2])
+            );
+
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB1() { using var model = efficientnet_b1(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB2() { using var model = efficientnet_b2(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB3() { using var model = efficientnet_b3(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB4() { using var model = efficientnet_b4(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB5() { using var model = efficientnet_b5(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB6() { using var model = efficientnet_b6(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB7() { using var model = efficientnet_b7(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetV2M() { using var model = efficientnet_v2_m(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetV2L() { using var model = efficientnet_v2_l(); }
+
+        [Fact]
+        public void TestMNASNet()
+        {
+            using var model = mnasnet1_0();
+            var sd = model.state_dict();
+            var names = model.named_children().Select(nm => nm.name).ToArray();
+            Assert.Multiple(
+                () => Assert.Equal("layers", names[0]),
+                () => Assert.Equal("classifier", names[1])
+            );
+
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
         [Fact]
         public void TestReadingAndWritingImages()
         {