diff --git a/src/TorchVision/models/DenseNet.cs b/src/TorchVision/models/DenseNet.cs
new file mode 100644
index 000000000..a636b62cd
--- /dev/null
+++ b/src/TorchVision/models/DenseNet.cs
@@ -0,0 +1,367 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python version of torchvision,
+// largely located in the files found in this folder:
+//
+// https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/pytorch/vision/blob/main/LICENSE
+//
+
+using System;
+using System.Collections.Generic;
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+
+#nullable enable
+namespace TorchSharp
+{
+ public static partial class torchvision
+ {
+ public static partial class models
+ {
+            /// <summary>
+            /// DenseNet-121 model from "Densely Connected Convolutional Networks".
+            /// </summary>
+            /// <param name="num_classes">The number of output classes.</param>
+            /// <param name="growth_rate">How many filters to add each layer.</param>
+            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers (i.e. bn_size * k features in the bottleneck layer).</param>
+            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            /// <remarks>
+            /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+            /// using the exportsd.py script, then loading into the .NET instance:
+            /// <code>
+            /// from torchvision import models
+            /// import exportsd
+            ///
+            /// model = models.densenet121(pretrained=True)
+            /// f = open("model_weights.dat", "wb")
+            /// exportsd.save_state_dict(model.state_dict(), f)
+            /// f.close()
+            /// </code>
+            /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+            ///
+            /// In order for the weights to be loaded, the number of classes has to be the same as
+            /// in the pre-trained model, which is 1000.
+            ///
+            /// It is also possible to skip loading the last linear layer and use it for transfer-learning
+            /// with a different number of output classes. To do so, pass skipfc=true.
+            ///
+            /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB
+            /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded
+            /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
+            /// </remarks>
+            public static Modules.DenseNet densenet121(
+                int num_classes = 1000,
+                int growth_rate = 32,
+                int bn_size = 4,
+                float drop_rate = 0,
+                string? weights_file = null,
+                bool skipfc = true,
+                Device? device = null)
+            {
+                return new Modules.DenseNet(growth_rate, new int[] { 6, 12, 24, 16 }, 64, bn_size, drop_rate,
+                    num_classes, weights_file, skipfc, device);
+            }
+
+            /// <summary>
+            /// DenseNet-161 model from "Densely Connected Convolutional Networks".
+            /// </summary>
+            /// <param name="num_classes">The number of output classes.</param>
+            /// <param name="growth_rate">How many filters to add each layer.</param>
+            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers.</param>
+            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            public static Modules.DenseNet densenet161(
+                int num_classes = 1000,
+                int growth_rate = 48,
+                int bn_size = 4,
+                float drop_rate = 0,
+                string? weights_file = null,
+                bool skipfc = true,
+                Device? device = null)
+            {
+                return new Modules.DenseNet(growth_rate, new int[] { 6, 12, 36, 24 }, 96, bn_size, drop_rate,
+                    num_classes, weights_file, skipfc, device);
+            }
+
+            /// <summary>
+            /// DenseNet-169 model from "Densely Connected Convolutional Networks".
+            /// </summary>
+            /// <param name="num_classes">The number of output classes.</param>
+            /// <param name="growth_rate">How many filters to add each layer.</param>
+            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers.</param>
+            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            public static Modules.DenseNet densenet169(
+                int num_classes = 1000,
+                int growth_rate = 32,
+                int bn_size = 4,
+                float drop_rate = 0,
+                string? weights_file = null,
+                bool skipfc = true,
+                Device? device = null)
+            {
+                return new Modules.DenseNet(growth_rate, new int[] { 6, 12, 32, 32 }, 64, bn_size, drop_rate,
+                    num_classes, weights_file, skipfc, device);
+            }
+
+            /// <summary>
+            /// DenseNet-201 model from "Densely Connected Convolutional Networks".
+            /// </summary>
+            /// <param name="num_classes">The number of output classes.</param>
+            /// <param name="growth_rate">How many filters to add each layer.</param>
+            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers.</param>
+            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            public static Modules.DenseNet densenet201(
+                int num_classes = 1000,
+                int growth_rate = 32,
+                int bn_size = 4,
+                float drop_rate = 0,
+                string? weights_file = null,
+                bool skipfc = true,
+                Device? device = null)
+            {
+                return new Modules.DenseNet(growth_rate, new int[] { 6, 12, 48, 32 }, 64, bn_size, drop_rate,
+                    num_classes, weights_file, skipfc, device);
+            }
+ }
+ }
+
+ namespace Modules
+ {
+ // Based on https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py
+ // License: https://github.com/pytorch/vision/blob/main/LICENSE
+
+        public class DenseNet : Module<Tensor, Tensor>
+        {
+            /// <summary>
+            /// A single dense layer (BN-ReLU-Conv1x1-BN-ReLU-Conv3x3) as described in the paper.
+            /// </summary>
+            private class DenseLayer : Module<Tensor, Tensor>
+            {
+                private readonly Module<Tensor, Tensor> norm1;
+                private readonly Module<Tensor, Tensor> relu1;
+                private readonly Module<Tensor, Tensor> conv1;
+                private readonly Module<Tensor, Tensor> norm2;
+                private readonly Module<Tensor, Tensor> relu2;
+                private readonly Module<Tensor, Tensor> conv2;
+                private readonly float drop_rate;
+
+                public DenseLayer(string name, int num_input_features, int growth_rate, int bn_size, float drop_rate)
+                    : base(name)
+                {
+                    norm1 = BatchNorm2d(num_input_features);
+                    relu1 = ReLU(inplace: true);
+                    conv1 = Conv2d(num_input_features, bn_size * growth_rate, kernel_size: 1, stride: 1, bias: false);
+                    norm2 = BatchNorm2d(bn_size * growth_rate);
+                    relu2 = ReLU(inplace: true);
+                    conv2 = Conv2d(bn_size * growth_rate, growth_rate, kernel_size: 3, stride: 1, padding: 1, bias: false);
+                    this.drop_rate = drop_rate;
+                    RegisterComponents();
+                }
+
+                protected override void Dispose(bool disposing)
+                {
+                    if (disposing) {
+                        norm1.Dispose(); relu1.Dispose(); conv1.Dispose();
+                        norm2.Dispose(); relu2.Dispose(); conv2.Dispose();
+                    }
+                    base.Dispose(disposing);
+                }
+
+                public override Tensor forward(Tensor input)
+                {
+                    var bottleneck_output = conv1.call(relu1.call(norm1.call(input)));
+                    var new_features = conv2.call(relu2.call(norm2.call(bottleneck_output)));
+                    if (drop_rate > 0 && training)
+                        new_features = nn.functional.dropout(new_features, drop_rate, training);
+                    return new_features;
+                }
+            }
+
+            /// <summary>
+            /// A dense block consisting of multiple dense layers with progressive feature concatenation.
+            /// </summary>
+            private class DenseBlock : Module<Tensor, Tensor>
+            {
+                private readonly Module<Tensor, Tensor>[] denselayers;
+
+                public DenseBlock(string name, int num_layers, int num_input_features, int bn_size, int growth_rate, float drop_rate)
+                    : base(name)
+                {
+                    denselayers = new Module<Tensor, Tensor>[num_layers];
+                    for (int i = 0; i < num_layers; i++) {
+                        var layer = new DenseLayer($"denselayer{i + 1}",
+                            num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate);
+                        denselayers[i] = layer;
+                        // Use register_module to ensure correct named hierarchy for state_dict compatibility
+                        register_module($"denselayer{i + 1}", layer);
+                    }
+                }
+
+                protected override void Dispose(bool disposing)
+                {
+                    if (disposing) {
+                        foreach (var layer in denselayers)
+                            layer.Dispose();
+                    }
+                    base.Dispose(disposing);
+                }
+
+                public override Tensor forward(Tensor init_features)
+                {
+                    var features = new List<Tensor> { init_features };
+                    foreach (var layer in denselayers) {
+                        var concat_features = torch.cat(features.ToArray(), 1);
+                        var new_features = layer.call(concat_features);
+                        features.Add(new_features);
+                    }
+                    return torch.cat(features.ToArray(), 1);
+                }
+            }
+
+            /// <summary>
+            /// A transition layer (BN-ReLU-Conv1x1-AvgPool) that reduces feature map size.
+            /// </summary>
+            private class Transition : Module<Tensor, Tensor>
+            {
+                private readonly Module<Tensor, Tensor> norm;
+                private readonly Module<Tensor, Tensor> relu;
+                private readonly Module<Tensor, Tensor> conv;
+                private readonly Module<Tensor, Tensor> pool;
+
+                public Transition(string name, int num_input_features, int num_output_features) : base(name)
+                {
+                    norm = BatchNorm2d(num_input_features);
+                    relu = ReLU(inplace: true);
+                    conv = Conv2d(num_input_features, num_output_features, kernel_size: 1, stride: 1, bias: false);
+                    pool = AvgPool2d(kernel_size: 2, stride: 2);
+                    RegisterComponents();
+                }
+
+                protected override void Dispose(bool disposing)
+                {
+                    if (disposing) {
+                        norm.Dispose(); relu.Dispose(); conv.Dispose(); pool.Dispose();
+                    }
+                    base.Dispose(disposing);
+                }
+
+                public override Tensor forward(Tensor x)
+                {
+                    return pool.call(conv.call(relu.call(norm.call(x))));
+                }
+            }
+
+            private readonly Module<Tensor, Tensor> features;
+            private readonly Module<Tensor, Tensor> classifier;
+
+            protected override void Dispose(bool disposing)
+            {
+                if (disposing) {
+                    features.Dispose();
+                    classifier.Dispose();
+                }
+                base.Dispose(disposing);
+            }
+
+            /// <summary>
+            /// DenseNet model class.
+            /// </summary>
+            /// <param name="growth_rate">How many filters to add each layer.</param>
+            /// <param name="block_config">Number of layers in each dense block.</param>
+            /// <param name="num_init_features">Number of filters in the first convolution layer.</param>
+            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers.</param>
+            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
+            /// <param name="num_classes">Number of output classes.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last linear layer will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            public DenseNet(
+                int growth_rate = 32,
+                int[]? block_config = null,
+                int num_init_features = 64,
+                int bn_size = 4,
+                float drop_rate = 0,
+                int num_classes = 1000,
+                string? weights_file = null,
+                bool skipfc = true,
+                Device? device = null) : base(nameof(DenseNet))
+            {
+                if (block_config == null)
+                    block_config = new int[] { 6, 12, 24, 16 };
+
+                // Build the features Sequential with named children
+                var f = Sequential();
+                f.append("conv0", Conv2d(3, num_init_features, kernel_size: 7, stride: 2, padding: 3, bias: false));
+                f.append("norm0", BatchNorm2d(num_init_features));
+                f.append("relu0", ReLU(inplace: true));
+                f.append("pool0", MaxPool2d(kernel_size: 3, stride: 2, padding: 1));
+
+                int num_features = num_init_features;
+                for (int i = 0; i < block_config.Length; i++) {
+                    var block = new DenseBlock("DenseBlock",
+                        block_config[i], num_features, bn_size, growth_rate, drop_rate);
+                    f.append($"denseblock{i + 1}", block);
+                    num_features = num_features + block_config[i] * growth_rate;
+                    if (i != block_config.Length - 1) {
+                        var trans = new Transition("Transition",
+                            num_features, num_features / 2);
+                        f.append($"transition{i + 1}", trans);
+                        num_features = num_features / 2;
+                    }
+                }
+
+                f.append("norm5", BatchNorm2d(num_features));
+                features = f;
+
+                classifier = Linear(num_features, num_classes);
+
+                RegisterComponents();
+
+                // Weight initialization
+                if (string.IsNullOrEmpty(weights_file)) {
+                    foreach (var (_, m) in named_modules()) {
+                        if (m is Modules.Conv2d conv) {
+                            nn.init.kaiming_normal_(conv.weight);
+                        } else if (m is Modules.BatchNorm2d bn) {
+                            nn.init.constant_(bn.weight, 1);
+                            nn.init.constant_(bn.bias, 0);
+                        } else if (m is Modules.Linear linear) {
+                            nn.init.constant_(linear.bias, 0);
+                        }
+                    }
+                } else {
+                    this.load(weights_file!, skip: skipfc ? new[] { "classifier.weight", "classifier.bias" } : null);
+                }
+
+                if (device != null && device.type != DeviceType.CPU)
+                    this.to(device);
+            }
+
+            public override Tensor forward(Tensor x)
+            {
+                using (var _ = NewDisposeScope()) {
+                    x = features.call(x);
+                    x = nn.functional.relu(x);
+                    x = nn.functional.adaptive_avg_pool2d(x, new long[] { 1, 1 });
+                    x = torch.flatten(x, 1);
+                    return classifier.call(x).MoveToOuterDisposeScope();
+                }
+            }
+        }
+ }
+}
diff --git a/src/TorchVision/models/EfficientNet.cs b/src/TorchVision/models/EfficientNet.cs
new file mode 100644
index 000000000..ded461949
--- /dev/null
+++ b/src/TorchVision/models/EfficientNet.cs
@@ -0,0 +1,819 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python version of torchvision,
+// largely located in the files found in this folder:
+//
+// https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/pytorch/vision/blob/main/LICENSE
+//
+
+using System;
+using System.Collections.Generic;
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+using static TorchSharp.torchvision.models._utils;
+using static TorchSharp.torchvision.ops;
+using TorchSharp.Modules;
+
+#nullable enable
+namespace TorchSharp
+{
+ namespace Modules
+ {
+        public class EfficientNet : nn.Module<Tensor, Tensor>
+        {
+            internal enum BlockType { MBConv, FusedMBConv }
+
+            /// <summary>
+            /// Stores information listed at Tables 1 and 4 of the EfficientNet papers.
+            /// </summary>
+            internal class _MBConvConfig
+            {
+                public double expand_ratio;
+                public long kernel;
+                public long stride;
+                public long input_channels;
+                public long out_channels;
+                public long num_layers;
+                public BlockType block_type;
+
+                public _MBConvConfig(
+                    double expand_ratio, long kernel, long stride,
+                    long input_channels, long out_channels, long num_layers,
+                    BlockType block_type)
+                {
+                    this.expand_ratio = expand_ratio;
+                    this.kernel = kernel;
+                    this.stride = stride;
+                    this.input_channels = input_channels;
+                    this.out_channels = out_channels;
+                    this.num_layers = num_layers;
+                    this.block_type = block_type;
+                }
+
+                public static long adjust_channels(long channels, double width_mult, long? min_value = null)
+                {
+                    return _make_divisible(channels * width_mult, 8, min_value);
+                }
+
+                public _MBConvConfig ShallowCopy()
+                {
+                    return (_MBConvConfig)this.MemberwiseClone();
+                }
+            }
+
+            /// <summary>
+            /// Config for MBConv blocks (EfficientNet B0-B7).
+            /// Applies width and depth multipliers for compound scaling.
+            /// </summary>
+            internal class MBConvConfig : _MBConvConfig
+            {
+                public MBConvConfig(
+                    double expand_ratio, long kernel, long stride,
+                    long input_channels, long out_channels, long num_layers,
+                    double width_mult = 1.0, double depth_mult = 1.0)
+                    : base(expand_ratio, kernel, stride,
+                        adjust_channels(input_channels, width_mult),
+                        adjust_channels(out_channels, width_mult),
+                        adjust_depth(num_layers, depth_mult),
+                        BlockType.MBConv)
+                {
+                }
+
+                public static long adjust_depth(long num_layers, double depth_mult)
+                {
+                    return (long)Math.Ceiling(num_layers * depth_mult);
+                }
+            }
+
+            /// <summary>
+            /// Config for FusedMBConv blocks (EfficientNet V2).
+            /// </summary>
+            internal class FusedMBConvConfig : _MBConvConfig
+            {
+                public FusedMBConvConfig(
+                    double expand_ratio, long kernel, long stride,
+                    long input_channels, long out_channels, long num_layers)
+                    : base(expand_ratio, kernel, stride,
+                        input_channels, out_channels, num_layers,
+                        BlockType.FusedMBConv)
+                {
+                }
+            }
+
+            /// <summary>
+            /// MBConv block: Mobile Inverted Bottleneck Conv with Squeeze-and-Excitation.
+            /// </summary>
+            private class MBConv : nn.Module<Tensor, Tensor>
+            {
+                private readonly nn.Module<Tensor, Tensor> block;
+                private readonly torchvision.StochasticDepth stochastic_depth;
+                private readonly bool use_res_connect;
+
+                protected override void Dispose(bool disposing)
+                {
+                    if (disposing) {
+                        block.Dispose();
+                        stochastic_depth.Dispose();
+                    }
+                    base.Dispose(disposing);
+                }
+
+                public MBConv(
+                    string name,
+                    _MBConvConfig cnf,
+                    double stochastic_depth_prob,
+                    Func<long, nn.Module<Tensor, Tensor>> norm_layer) : base(name)
+                {
+                    if (!(1 <= cnf.stride && cnf.stride <= 2))
+                        throw new ArgumentException("illegal stride value");
+
+                    use_res_connect = cnf.stride == 1 && cnf.input_channels == cnf.out_channels;
+
+                    var layers = new List<nn.Module<Tensor, Tensor>>();
+                    Func<bool, nn.Module<Tensor, Tensor>> activation_layer = (inplace) => nn.SiLU(inplace);
+
+                    // expand
+                    var expanded_channels = _MBConvConfig.adjust_channels(cnf.input_channels, cnf.expand_ratio);
+                    if (expanded_channels != cnf.input_channels) {
+                        layers.Add(Conv2dNormActivation(
+                            cnf.input_channels, expanded_channels,
+                            kernel_size: 1,
+                            norm_layer: norm_layer,
+                            activation_layer: activation_layer));
+                    }
+
+                    // depthwise
+                    layers.Add(Conv2dNormActivation(
+                        expanded_channels, expanded_channels,
+                        kernel_size: cnf.kernel,
+                        stride: cnf.stride,
+                        groups: expanded_channels,
+                        norm_layer: norm_layer,
+                        activation_layer: activation_layer));
+
+                    // squeeze and excitation
+                    var squeeze_channels = Math.Max(1, cnf.input_channels / 4);
+                    layers.Add(
+                        torchvision.ops.SqueezeExcitation(
+                            expanded_channels,
+                            squeeze_channels,
+                            activation: () => nn.SiLU(inplace: true)));
+
+                    // project
+                    layers.Add(Conv2dNormActivation(
+                        expanded_channels, cnf.out_channels,
+                        kernel_size: 1,
+                        norm_layer: norm_layer,
+                        activation_layer: null));
+
+                    block = nn.Sequential(layers);
+                    stochastic_depth = torchvision.ops.StochasticDepth(stochastic_depth_prob, torchvision.StochasticDepth.Mode.Row);
+
+                    RegisterComponents();
+                }
+
+                public override Tensor forward(Tensor input)
+                {
+                    var result = block.call(input);
+                    if (use_res_connect) {
+                        result = stochastic_depth.call(result);
+                        result += input;
+                    }
+                    return result;
+                }
+            }
+
+            /// <summary>
+            /// FusedMBConv block: Fused Mobile Inverted Bottleneck Conv (no depthwise or SE).
+            /// </summary>
+            private class FusedMBConv : nn.Module<Tensor, Tensor>
+            {
+                private readonly nn.Module<Tensor, Tensor> block;
+                private readonly torchvision.StochasticDepth stochastic_depth;
+                private readonly bool use_res_connect;
+
+                protected override void Dispose(bool disposing)
+                {
+                    if (disposing) {
+                        block.Dispose();
+                        stochastic_depth.Dispose();
+                    }
+                    base.Dispose(disposing);
+                }
+
+                public FusedMBConv(
+                    string name,
+                    _MBConvConfig cnf,
+                    double stochastic_depth_prob,
+                    Func<long, nn.Module<Tensor, Tensor>> norm_layer) : base(name)
+                {
+                    if (!(1 <= cnf.stride && cnf.stride <= 2))
+                        throw new ArgumentException("illegal stride value");
+
+                    use_res_connect = cnf.stride == 1 && cnf.input_channels == cnf.out_channels;
+
+                    var layers = new List<nn.Module<Tensor, Tensor>>();
+                    Func<bool, nn.Module<Tensor, Tensor>> activation_layer = (inplace) => nn.SiLU(inplace);
+
+                    var expanded_channels = _MBConvConfig.adjust_channels(cnf.input_channels, cnf.expand_ratio);
+                    if (expanded_channels != cnf.input_channels) {
+                        // fused expand
+                        layers.Add(Conv2dNormActivation(
+                            cnf.input_channels, expanded_channels,
+                            kernel_size: cnf.kernel,
+                            stride: cnf.stride,
+                            norm_layer: norm_layer,
+                            activation_layer: activation_layer));
+
+                        // project
+                        layers.Add(Conv2dNormActivation(
+                            expanded_channels, cnf.out_channels,
+                            kernel_size: 1,
+                            norm_layer: norm_layer,
+                            activation_layer: null));
+                    } else {
+                        layers.Add(Conv2dNormActivation(
+                            cnf.input_channels, cnf.out_channels,
+                            kernel_size: cnf.kernel,
+                            stride: cnf.stride,
+                            norm_layer: norm_layer,
+                            activation_layer: activation_layer));
+                    }
+
+                    block = nn.Sequential(layers);
+                    stochastic_depth = torchvision.ops.StochasticDepth(stochastic_depth_prob, torchvision.StochasticDepth.Mode.Row);
+
+                    RegisterComponents();
+                }
+
+                public override Tensor forward(Tensor input)
+                {
+                    var result = block.call(input);
+                    if (use_res_connect) {
+                        result = stochastic_depth.call(result);
+                        result += input;
+                    }
+                    return result;
+                }
+            }
+
+            private readonly nn.Module<Tensor, Tensor> features;
+            private readonly nn.Module<Tensor, Tensor> avgpool;
+            private readonly nn.Module<Tensor, Tensor> classifier;
+
+            protected override void Dispose(bool disposing)
+            {
+                if (disposing) {
+                    features.Dispose();
+                    avgpool.Dispose();
+                    classifier.Dispose();
+                }
+                base.Dispose(disposing);
+            }
+
+            /// <summary>
+            /// EfficientNet V1 and V2 main class
+            /// </summary>
+            /// <param name="name">The model name.</param>
+            /// <param name="inverted_residual_setting">Network structure</param>
+            /// <param name="dropout">The dropout probability</param>
+            /// <param name="stochastic_depth_prob">The stochastic depth probability</param>
+            /// <param name="num_classes">Number of classes</param>
+            /// <param name="norm_layer">Module specifying the normalization layer to use</param>
+            /// <param name="last_channel">The number of channels on the penultimate layer</param>
+            internal EfficientNet(
+                string name,
+                _MBConvConfig[] inverted_residual_setting,
+                double dropout,
+                double stochastic_depth_prob = 0.2,
+                long num_classes = 1000,
+                Func<long, nn.Module<Tensor, Tensor>>? norm_layer = null,
+                long? last_channel = null) : base(name)
+            {
+                if (inverted_residual_setting == null || inverted_residual_setting.Length == 0)
+                    throw new ArgumentException("The inverted_residual_setting should not be empty");
+
+                if (norm_layer == null)
+                    norm_layer = (features) => nn.BatchNorm2d(features);
+
+                var layers = new List<nn.Module<Tensor, Tensor>>();
+
+                // building first layer
+                var firstconv_output_channels = inverted_residual_setting[0].input_channels;
+                layers.Add(Conv2dNormActivation(
+                    3, firstconv_output_channels,
+                    kernel_size: 3, stride: 2,
+                    norm_layer: norm_layer,
+                    activation_layer: (inplace) => nn.SiLU(inplace)));
+
+                // building inverted residual blocks
+                long total_stage_blocks = 0;
+                foreach (var cnf in inverted_residual_setting)
+                    total_stage_blocks += cnf.num_layers;
+
+                long stage_block_id = 0;
+                foreach (var cnf in inverted_residual_setting) {
+                    var stage = new List<nn.Module<Tensor, Tensor>>();
+                    for (int i = 0; i < cnf.num_layers; i++) {
+                        var block_cnf = cnf.ShallowCopy();
+
+                        // overwrite info if not the first conv in the stage
+                        if (stage.Count > 0) {
+                            block_cnf.input_channels = block_cnf.out_channels;
+                            block_cnf.stride = 1;
+                        }
+
+                        // adjust stochastic depth probability based on the depth of the stage block
+                        var sd_prob = stochastic_depth_prob * (double)stage_block_id / total_stage_blocks;
+
+                        if (block_cnf.block_type == BlockType.FusedMBConv) {
+                            stage.Add(new FusedMBConv("FusedMBConv", block_cnf, sd_prob, norm_layer));
+                        } else {
+                            stage.Add(new MBConv("MBConv", block_cnf, sd_prob, norm_layer));
+                        }
+                        stage_block_id++;
+                    }
+                    layers.Add(nn.Sequential(stage));
+                }
+
+                // building last several layers
+                var lastconv_input_channels = inverted_residual_setting[inverted_residual_setting.Length - 1].out_channels;
+                var lastconv_output_channels = last_channel.HasValue ? last_channel.Value : 4 * lastconv_input_channels;
+                layers.Add(Conv2dNormActivation(
+                    lastconv_input_channels, lastconv_output_channels,
+                    kernel_size: 1,
+                    norm_layer: norm_layer,
+                    activation_layer: (inplace) => nn.SiLU(inplace)));
+
+                features = nn.Sequential(layers);
+                avgpool = nn.AdaptiveAvgPool2d(1);
+                classifier = nn.Sequential(
+                    nn.Dropout(p: dropout, inplace: true),
+                    nn.Linear(lastconv_output_channels, num_classes));
+
+                RegisterComponents();
+
+                foreach (var (_, m) in this.named_modules()) {
+                    if (m is Modules.Conv2d) {
+                        var conv = (Modules.Conv2d)m;
+                        nn.init.kaiming_normal_(conv.weight, mode: nn.init.FanInOut.FanOut);
+                        if (conv.bias is not null) {
+                            nn.init.zeros_(conv.bias);
+                        }
+                    } else if (m is Modules.BatchNorm2d) {
+                        var norm = (Modules.BatchNorm2d)m;
+                        nn.init.ones_(norm.weight);
+                        nn.init.zeros_(norm.bias);
+                    } else if (m is Modules.GroupNorm) {
+                        var norm = (Modules.GroupNorm)m;
+                        nn.init.ones_(norm.weight);
+                        nn.init.zeros_(norm.bias);
+                    } else if (m is Modules.Linear) {
+                        var linear = (Modules.Linear)m;
+                        var init_range = 1.0 / Math.Sqrt(linear.weight.shape[0]);
+                        nn.init.uniform_(linear.weight, -init_range, init_range);
+                        nn.init.zeros_(linear.bias);
+                    }
+                }
+            }
+
+            public override Tensor forward(Tensor x)
+            {
+                using (var _ = NewDisposeScope()) {
+                    x = features.call(x);
+                    x = avgpool.call(x);
+                    x = torch.flatten(x, 1);
+                    x = classifier.call(x);
+                    return x.MoveToOuterDisposeScope();
+                }
+            }
+        }
+ }
+
+ public static partial class torchvision
+ {
+ public static partial class models
+ {
+            private static (EfficientNet._MBConvConfig[], long?) _efficientnet_conf(string arch, double width_mult = 1.0, double depth_mult = 1.0)
+            {
+                EfficientNet._MBConvConfig[] inverted_residual_setting;
+                long? last_channel;
+
+                if (arch.StartsWith("efficientnet_b", StringComparison.Ordinal)) {
+                    EfficientNet._MBConvConfig bneck_conf(
+                        double expand_ratio, long kernel, long stride,
+                        long input_channels, long out_channels, long num_layers) =>
+                        new EfficientNet.MBConvConfig(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, width_mult, depth_mult);
+
+                    inverted_residual_setting = new EfficientNet._MBConvConfig[] {
+                        bneck_conf(1, 3, 1, 32, 16, 1),
+                        bneck_conf(6, 3, 2, 16, 24, 2),
+                        bneck_conf(6, 5, 2, 24, 40, 2),
+                        bneck_conf(6, 3, 2, 40, 80, 3),
+                        bneck_conf(6, 5, 1, 80, 112, 3),
+                        bneck_conf(6, 5, 2, 112, 192, 4),
+                        bneck_conf(6, 3, 1, 192, 320, 1),
+                    };
+                    last_channel = null;
+                } else if (arch.StartsWith("efficientnet_v2_s", StringComparison.Ordinal)) {
+                    inverted_residual_setting = new EfficientNet._MBConvConfig[] {
+                        new EfficientNet.FusedMBConvConfig(1, 3, 1, 24, 24, 2),
+                        new EfficientNet.FusedMBConvConfig(4, 3, 2, 24, 48, 4),
+                        new EfficientNet.FusedMBConvConfig(4, 3, 2, 48, 64, 4),
+                        new EfficientNet.MBConvConfig(4, 3, 2, 64, 128, 6),
+                        new EfficientNet.MBConvConfig(6, 3, 1, 128, 160, 9),
+                        new EfficientNet.MBConvConfig(6, 3, 2, 160, 256, 15),
+                    };
+                    last_channel = 1280;
+                } else if (arch.StartsWith("efficientnet_v2_m", StringComparison.Ordinal)) {
+                    inverted_residual_setting = new EfficientNet._MBConvConfig[] {
+                        new EfficientNet.FusedMBConvConfig(1, 3, 1, 24, 24, 3),
+                        new EfficientNet.FusedMBConvConfig(4, 3, 2, 24, 48, 5),
+                        new EfficientNet.FusedMBConvConfig(4, 3, 2, 48, 80, 5),
+                        new EfficientNet.MBConvConfig(4, 3, 2, 80, 160, 7),
+                        new EfficientNet.MBConvConfig(6, 3, 1, 160, 176, 14),
+                        new EfficientNet.MBConvConfig(6, 3, 2, 176, 304, 18),
+                        new EfficientNet.MBConvConfig(6, 3, 1, 304, 512, 5),
+                    };
+                    last_channel = 1280;
+                } else if (arch.StartsWith("efficientnet_v2_l", StringComparison.Ordinal)) {
+                    inverted_residual_setting = new EfficientNet._MBConvConfig[] {
+                        new EfficientNet.FusedMBConvConfig(1, 3, 1, 32, 32, 4),
+                        new EfficientNet.FusedMBConvConfig(4, 3, 2, 32, 64, 7),
+                        new EfficientNet.FusedMBConvConfig(4, 3, 2, 64, 96, 7),
+                        new EfficientNet.MBConvConfig(4, 3, 2, 96, 192, 10),
+                        new EfficientNet.MBConvConfig(6, 3, 1, 192, 224, 19),
+                        new EfficientNet.MBConvConfig(6, 3, 2, 224, 384, 25),
+                        new EfficientNet.MBConvConfig(6, 3, 1, 384, 640, 7),
+                    };
+                    last_channel = 1280;
+                } else {
+                    throw new ArgumentException($"Unsupported model type {arch}");
+                }
+
+                return (inverted_residual_setting, last_channel);
+            }
+
+            private static Modules.EfficientNet _efficientnet(
+                EfficientNet._MBConvConfig[] inverted_residual_setting,
+                double dropout,
+                long? last_channel,
+                long num_classes = 1000,
+                Func<long, nn.Module<Tensor, Tensor>>? norm_layer = null,
+                string? weights_file = null,
+                bool skipfc = true,
+                Device? device = null)
+            {
+                var model = new EfficientNet("EfficientNet", inverted_residual_setting, dropout, num_classes: num_classes, norm_layer: norm_layer, last_channel: last_channel);
+
+                if (!string.IsNullOrEmpty(weights_file)) {
+                    model.load(weights_file!, skip: skipfc ? new[] { "classifier.1.weight", "classifier.1.bias" } : null);
+                }
+
+                if (device != null && device.type != DeviceType.CPU)
+                    model.to(device);
+
+                return model;
+            }
+
+            /// <summary>
+            /// EfficientNet B0 model architecture from the
+            /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper.
+            /// </summary>
+            /// <param name="num_classes">The number of output classes.</param>
+            /// <param name="dropout">The dropout ratio.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            /// <remarks>
+            /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+            /// using the exportsd.py script, then loading into the .NET instance:
+            /// <code>
+            /// from torchvision import models
+            /// import exportsd
+            ///
+            /// model = models.efficientnet_b0(weights='DEFAULT')
+            /// f = open("model_weights.dat", "wb")
+            /// exportsd.save_state_dict(model.state_dict(), f)
+            /// f.close()
+            /// </code>
+            /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+            ///
+            /// In order for the weights to be loaded, the number of classes has to be the same as
+            /// in the pre-trained model, which is 1000.
+            ///
+            /// It is also possible to skip loading the last linear layer and use it for transfer-learning
+            /// with a different number of output classes. To do so, pass skipfc=true.
+            ///
+            /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB
+            /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded
+            /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
+            /// </remarks>
+            public static Modules.EfficientNet efficientnet_b0(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null)
+            {
+                var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b0", width_mult: 1.0, depth_mult: 1.0);
+                return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device);
+            }
+
+            /// <summary>
+            /// EfficientNet B1 model architecture from the
+            /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper.
+            /// </summary>
+            /// <param name="num_classes">The number of output classes.</param>
+            /// <param name="dropout">The dropout ratio.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            /// <remarks>
+            /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+            /// using the exportsd.py script, then loading into the .NET instance:
+            /// <code>
+            /// from torchvision import models
+            /// import exportsd
+            ///
+            /// model = models.efficientnet_b1(weights='DEFAULT')
+            /// f = open("model_weights.dat", "wb")
+            /// exportsd.save_state_dict(model.state_dict(), f)
+            /// f.close()
+            /// </code>
+            /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+            /// </remarks>
+            public static Modules.EfficientNet efficientnet_b1(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null)
+            {
+                var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b1", width_mult: 1.0, depth_mult: 1.1);
+                return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device);
+            }
+
+            /// <summary>
+            /// EfficientNet B2 model architecture from the
+            /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper.
+            /// </summary>
+            /// <param name="num_classes">The number of output classes.</param>
+            /// <param name="dropout">The dropout ratio.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            /// <remarks>
+            /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+            /// using the exportsd.py script, then loading into the .NET instance:
+            /// <code>
+            /// from torchvision import models
+            /// import exportsd
+            ///
+            /// model = models.efficientnet_b2(weights='DEFAULT')
+            /// f = open("model_weights.dat", "wb")
+            /// exportsd.save_state_dict(model.state_dict(), f)
+            /// f.close()
+            /// </code>
+            /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+            /// </remarks>
+            public static Modules.EfficientNet efficientnet_b2(int num_classes = 1000, float dropout = 0.3f, string? weights_file = null, bool skipfc = true, Device? device = null)
+            {
+                var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b2", width_mult: 1.1, depth_mult: 1.2);
+                return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device);
+            }
+
+            /// <summary>
+            /// EfficientNet B3 model architecture from the
+            /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper.
+            /// </summary>
+            /// <param name="num_classes">The number of output classes.</param>
+            /// <param name="dropout">The dropout ratio.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            /// <remarks>
+            /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+            /// using the exportsd.py script, then loading into the .NET instance:
+            /// <code>
+            /// from torchvision import models
+            /// import exportsd
+            ///
+            /// model = models.efficientnet_b3(weights='DEFAULT')
+            /// f = open("model_weights.dat", "wb")
+            /// exportsd.save_state_dict(model.state_dict(), f)
+            /// f.close()
+            /// </code>
+            /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+            /// </remarks>
+            public static Modules.EfficientNet efficientnet_b3(int num_classes = 1000, float dropout = 0.3f, string? weights_file = null, bool skipfc = true, Device? device = null)
+            {
+                var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b3", width_mult: 1.2, depth_mult: 1.4);
+                return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device);
+            }
+
+ /// <summary>
+ /// EfficientNet B4 model architecture from the
+ /// <a href="https://arxiv.org/abs/1905.11946">EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks</a> paper.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ /// <remarks>
+ /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+ /// using the exportsd.py script, then loading into the .NET instance:
+ ///
+ /// from torchvision import models
+ /// import exportsd
+ ///
+ /// model = models.efficientnet_b4(weights='DEFAULT')
+ /// f = open("model_weights.dat", "wb")
+ /// exportsd.save_state_dict(model.state_dict(), f)
+ /// f.close()
+ ///
+ /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+ /// </remarks>
+ public static Modules.EfficientNet efficientnet_b4(int num_classes = 1000, float dropout = 0.4f, string? weights_file = null, bool skipfc = true, Device? device = null)
+ {
+ var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b4", width_mult: 1.4, depth_mult: 1.8);
+ return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device);
+ }
+
+ /// <summary>
+ /// EfficientNet B5 model architecture from the
+ /// <a href="https://arxiv.org/abs/1905.11946">EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks</a> paper.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ /// <remarks>
+ /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+ /// using the exportsd.py script, then loading into the .NET instance:
+ ///
+ /// from torchvision import models
+ /// import exportsd
+ ///
+ /// model = models.efficientnet_b5(weights='DEFAULT')
+ /// f = open("model_weights.dat", "wb")
+ /// exportsd.save_state_dict(model.state_dict(), f)
+ /// f.close()
+ ///
+ /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+ /// </remarks>
+ public static Modules.EfficientNet efficientnet_b5(int num_classes = 1000, float dropout = 0.4f, string? weights_file = null, bool skipfc = true, Device? device = null)
+ {
+ var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b5", width_mult: 1.6, depth_mult: 2.2);
+ Func<long, Module<Tensor, Tensor>> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001, momentum: 0.01);
+ return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device);
+ }
+
+ /// <summary>
+ /// EfficientNet B6 model architecture from the
+ /// <a href="https://arxiv.org/abs/1905.11946">EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks</a> paper.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ /// <remarks>
+ /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+ /// using the exportsd.py script, then loading into the .NET instance:
+ ///
+ /// from torchvision import models
+ /// import exportsd
+ ///
+ /// model = models.efficientnet_b6(weights='DEFAULT')
+ /// f = open("model_weights.dat", "wb")
+ /// exportsd.save_state_dict(model.state_dict(), f)
+ /// f.close()
+ ///
+ /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+ /// </remarks>
+ public static Modules.EfficientNet efficientnet_b6(int num_classes = 1000, float dropout = 0.5f, string? weights_file = null, bool skipfc = true, Device? device = null)
+ {
+ var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b6", width_mult: 1.8, depth_mult: 2.6);
+ Func<long, Module<Tensor, Tensor>> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001, momentum: 0.01);
+ return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device);
+ }
+
+ /// <summary>
+ /// EfficientNet B7 model architecture from the
+ /// <a href="https://arxiv.org/abs/1905.11946">EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks</a> paper.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ /// <remarks>
+ /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+ /// using the exportsd.py script, then loading into the .NET instance:
+ ///
+ /// from torchvision import models
+ /// import exportsd
+ ///
+ /// model = models.efficientnet_b7(weights='DEFAULT')
+ /// f = open("model_weights.dat", "wb")
+ /// exportsd.save_state_dict(model.state_dict(), f)
+ /// f.close()
+ ///
+ /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+ /// </remarks>
+ public static Modules.EfficientNet efficientnet_b7(int num_classes = 1000, float dropout = 0.5f, string? weights_file = null, bool skipfc = true, Device? device = null)
+ {
+ var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b7", width_mult: 2.0, depth_mult: 3.1);
+ Func<long, Module<Tensor, Tensor>> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001, momentum: 0.01);
+ return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device);
+ }
+
+ /// <summary>
+ /// Constructs an EfficientNetV2-S architecture from
+ /// <a href="https://arxiv.org/abs/2104.00298">EfficientNetV2: Smaller Models and Faster Training</a>.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ /// <remarks>
+ /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+ /// using the exportsd.py script, then loading into the .NET instance:
+ ///
+ /// from torchvision import models
+ /// import exportsd
+ ///
+ /// model = models.efficientnet_v2_s(weights='DEFAULT')
+ /// f = open("model_weights.dat", "wb")
+ /// exportsd.save_state_dict(model.state_dict(), f)
+ /// f.close()
+ ///
+ /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+ /// </remarks>
+ public static Modules.EfficientNet efficientnet_v2_s(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null)
+ {
+ var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_v2_s");
+ Func<long, Module<Tensor, Tensor>> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001);
+ return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device);
+ }
+
+ /// <summary>
+ /// Constructs an EfficientNetV2-M architecture from
+ /// <a href="https://arxiv.org/abs/2104.00298">EfficientNetV2: Smaller Models and Faster Training</a>.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ /// <remarks>
+ /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+ /// using the exportsd.py script, then loading into the .NET instance:
+ ///
+ /// from torchvision import models
+ /// import exportsd
+ ///
+ /// model = models.efficientnet_v2_m(weights='DEFAULT')
+ /// f = open("model_weights.dat", "wb")
+ /// exportsd.save_state_dict(model.state_dict(), f)
+ /// f.close()
+ ///
+ /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+ /// </remarks>
+ public static Modules.EfficientNet efficientnet_v2_m(int num_classes = 1000, float dropout = 0.3f, string? weights_file = null, bool skipfc = true, Device? device = null)
+ {
+ var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_v2_m");
+ Func<long, Module<Tensor, Tensor>> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001);
+ return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device);
+ }
+
+ /// <summary>
+ /// Constructs an EfficientNetV2-L architecture from
+ /// <a href="https://arxiv.org/abs/2104.00298">EfficientNetV2: Smaller Models and Faster Training</a>.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ /// <remarks>
+ /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+ /// using the exportsd.py script, then loading into the .NET instance:
+ ///
+ /// from torchvision import models
+ /// import exportsd
+ ///
+ /// model = models.efficientnet_v2_l(weights='DEFAULT')
+ /// f = open("model_weights.dat", "wb")
+ /// exportsd.save_state_dict(model.state_dict(), f)
+ /// f.close()
+ ///
+ /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+ /// </remarks>
+ public static Modules.EfficientNet efficientnet_v2_l(int num_classes = 1000, float dropout = 0.4f, string? weights_file = null, bool skipfc = true, Device? device = null)
+ {
+ var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_v2_l");
+ Func<long, Module<Tensor, Tensor>> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001);
+ return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device);
+ }
+ }
+ }
+}
diff --git a/src/TorchVision/models/MNASNet.cs b/src/TorchVision/models/MNASNet.cs
new file mode 100644
index 000000000..7210f3268
--- /dev/null
+++ b/src/TorchVision/models/MNASNet.cs
@@ -0,0 +1,299 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python version of torchvision,
+// largely located in the files found in this folder:
+//
+// https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/pytorch/vision/blob/main/LICENSE
+//
+
+using System;
+using System.Collections.Generic;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+
+#nullable enable
+namespace TorchSharp
+{
+ public static partial class torchvision
+ {
+ public static partial class models
+ {
+ /// <summary>
+ /// MNASNet with depth multiplier of 0.5 from
+ /// <a href="https://arxiv.org/abs/1807.11626">"MnasNet: Platform-Aware Neural Architecture Search for Mobile"</a>.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ /// <remarks>
+ /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+ /// using the exportsd.py script, then loading into the .NET instance:
+ ///
+ /// from torchvision import models
+ /// import exportsd
+ ///
+ /// model = models.mnasnet0_5(pretrained=True)
+ /// f = open("model_weights.dat", "wb")
+ /// exportsd.save_state_dict(model.state_dict(), f)
+ /// f.close()
+ ///
+ /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+ ///
+ /// In order for the weights to be loaded, the number of classes has to be the same as
+ /// in the pre-trained model, which is 1000.
+ ///
+ /// It is also possible to skip loading the last linear layer and use it for transfer-learning
+ /// with a different number of output classes. To do so, pass skipfc=true.
+ ///
+ /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB
+ /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded
+ /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
+ /// </remarks>
+ public static Modules.MNASNet mnasnet0_5(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null)
+ {
+ return new Modules.MNASNet(0.5, num_classes, dropout, weights_file, skipfc, device);
+ }
+
+ /// <summary>
+ /// MNASNet with depth multiplier of 0.75 from
+ /// <a href="https://arxiv.org/abs/1807.11626">"MnasNet: Platform-Aware Neural Architecture Search for Mobile"</a>.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ public static Modules.MNASNet mnasnet0_75(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null)
+ {
+ return new Modules.MNASNet(0.75, num_classes, dropout, weights_file, skipfc, device);
+ }
+
+ /// <summary>
+ /// MNASNet with depth multiplier of 1.0 from
+ /// <a href="https://arxiv.org/abs/1807.11626">"MnasNet: Platform-Aware Neural Architecture Search for Mobile"</a>.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ public static Modules.MNASNet mnasnet1_0(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null)
+ {
+ return new Modules.MNASNet(1.0, num_classes, dropout, weights_file, skipfc, device);
+ }
+
+ /// <summary>
+ /// MNASNet with depth multiplier of 1.3 from
+ /// <a href="https://arxiv.org/abs/1807.11626">"MnasNet: Platform-Aware Neural Architecture Search for Mobile"</a>.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ public static Modules.MNASNet mnasnet1_3(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null)
+ {
+ return new Modules.MNASNet(1.3, num_classes, dropout, weights_file, skipfc, device);
+ }
+ }
+ }
+
+ namespace Modules
+ {
+ // Based on https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py
+ // License: https://github.com/pytorch/vision/blob/main/LICENSE
+
+ /// <summary>
+ /// MNASNet, as described in https://arxiv.org/abs/1807.11626.
+ /// This implements the B1 variant of the model.
+ /// </summary>
+ public class MNASNet : Module<Tensor, Tensor>
+ {
+ // Paper suggests 0.9997 momentum, for TensorFlow. Equivalent PyTorch momentum is 1.0 - tensorflow.
+ private const double _BN_MOMENTUM = 1.0 - 0.9997;
+
+ private class _InvertedResidual : Module<Tensor, Tensor>
+ {
+ private readonly bool apply_residual;
+ private readonly Module<Tensor, Tensor> layers;
+
+ public _InvertedResidual(string name, long in_ch, long out_ch, long kernel_size, long stride, long expansion_factor, double bn_momentum)
+ : base(name)
+ {
+ if (stride != 1 && stride != 2)
+ throw new ArgumentOutOfRangeException(nameof(stride), $"stride should be 1 or 2 instead of {stride}");
+ if (kernel_size != 3 && kernel_size != 5)
+ throw new ArgumentOutOfRangeException(nameof(kernel_size), $"kernel_size should be 3 or 5 instead of {kernel_size}");
+
+ var mid_ch = in_ch * expansion_factor;
+ apply_residual = in_ch == out_ch && stride == 1;
+ layers = Sequential(
+ // Pointwise
+ Conv2d(in_ch, mid_ch, 1, bias: false),
+ BatchNorm2d(mid_ch, momentum: bn_momentum),
+ ReLU(inplace: true),
+ // Depthwise
+ Conv2d(mid_ch, mid_ch, kernel_size, padding: kernel_size / 2, stride: stride, groups: mid_ch, bias: false),
+ BatchNorm2d(mid_ch, momentum: bn_momentum),
+ ReLU(inplace: true),
+ // Linear pointwise. Note that there's no activation.
+ Conv2d(mid_ch, out_ch, 1, bias: false),
+ BatchNorm2d(out_ch, momentum: bn_momentum)
+ );
+ RegisterComponents();
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ if (disposing) {
+ layers.Dispose();
+ }
+ base.Dispose(disposing);
+ }
+
+ public override Tensor forward(Tensor input)
+ {
+ if (apply_residual) {
+ return layers.call(input) + input;
+ } else {
+ return layers.call(input);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Creates a stack of inverted residuals.
+ /// </summary>
+ private static Module<Tensor, Tensor> _stack(long in_ch, long out_ch, long kernel_size, long stride, long exp_factor, int repeats, double bn_momentum)
+ {
+ if (repeats < 1)
+ throw new ArgumentOutOfRangeException(nameof(repeats), $"repeats should be >= 1, instead got {repeats}");
+
+ var modules = new List<Module<Tensor, Tensor>>();
+ // First one has no skip, because feature map size changes.
+ modules.Add(new _InvertedResidual("_InvertedResidual", in_ch, out_ch, kernel_size, stride, exp_factor, bn_momentum));
+ for (int i = 1; i < repeats; i++) {
+ modules.Add(new _InvertedResidual("_InvertedResidual", out_ch, out_ch, kernel_size, 1, exp_factor, bn_momentum));
+ }
+ return Sequential(modules);
+ }
+
+ /// <summary>
+ /// Asymmetric rounding to make val divisible by divisor.
+ /// With default bias, will round up, unless the number is no more than 10% greater
+ /// than the smaller divisible value, i.e. (83, 8) -> 80, but (84, 8) -> 88.
+ /// </summary>
+ private static int _round_to_multiple_of(double val, int divisor, double round_up_bias = 0.9)
+ {
+ if (round_up_bias <= 0.0 || round_up_bias >= 1.0)
+ throw new ArgumentOutOfRangeException(nameof(round_up_bias), $"round_up_bias should be greater than 0.0 and smaller than 1.0 instead of {round_up_bias}");
+ var new_val = Math.Max(divisor, (int)(val + divisor / 2) / divisor * divisor);
+ return new_val >= round_up_bias * val ? new_val : new_val + divisor;
+ }
+
+ /// <summary>
+ /// Scales tensor depths as in reference MobileNet code, prefers rounding up rather than down.
+ /// </summary>
+ private static int[] _get_depths(double alpha)
+ {
+ var depths = new int[] { 32, 16, 24, 40, 80, 96, 192, 320 };
+ var result = new int[depths.Length];
+ for (int i = 0; i < depths.Length; i++) {
+ result[i] = _round_to_multiple_of(depths[i] * alpha, 8);
+ }
+ return result;
+ }
+
+ private readonly Module<Tensor, Tensor> layers;
+ private readonly Module<Tensor, Tensor> classifier;
+
+ protected override void Dispose(bool disposing)
+ {
+ if (disposing) {
+ layers.Dispose();
+ classifier.Dispose();
+ }
+ base.Dispose(disposing);
+ }
+
+ public MNASNet(double alpha, int num_classes = 1000, float dropout = 0.2f,
+ string? weights_file = null, bool skipfc = true, Device? device = null)
+ : base(nameof(MNASNet))
+ {
+ if (alpha <= 0.0)
+ throw new ArgumentOutOfRangeException(nameof(alpha), $"alpha should be greater than 0.0 instead of {alpha}");
+
+ var depths = _get_depths(alpha);
+ var layerList = new List<Module<Tensor, Tensor>> {
+ // First layer: regular conv.
+ Conv2d(3, depths[0], 3, padding: 1, stride: 2, bias: false),
+ BatchNorm2d(depths[0], momentum: _BN_MOMENTUM),
+ ReLU(inplace: true),
+ // Depthwise separable, no skip.
+ Conv2d(depths[0], depths[0], 3, padding: 1, stride: 1, groups: depths[0], bias: false),
+ BatchNorm2d(depths[0], momentum: _BN_MOMENTUM),
+ ReLU(inplace: true),
+ Conv2d(depths[0], depths[1], 1, padding: 0L, stride: 1, bias: false),
+ BatchNorm2d(depths[1], momentum: _BN_MOMENTUM),
+ // MNASNet blocks: stacks of inverted residuals.
+ _stack(depths[1], depths[2], 3, 2, 3, 3, _BN_MOMENTUM),
+ _stack(depths[2], depths[3], 5, 2, 3, 3, _BN_MOMENTUM),
+ _stack(depths[3], depths[4], 5, 2, 6, 3, _BN_MOMENTUM),
+ _stack(depths[4], depths[5], 3, 1, 6, 2, _BN_MOMENTUM),
+ _stack(depths[5], depths[6], 5, 2, 6, 4, _BN_MOMENTUM),
+ _stack(depths[6], depths[7], 3, 1, 6, 1, _BN_MOMENTUM),
+ // Final mapping to classifier input.
+ Conv2d(depths[7], 1280, 1, padding: 0L, stride: 1, bias: false),
+ BatchNorm2d(1280, momentum: _BN_MOMENTUM),
+ ReLU(inplace: true),
+ };
+ layers = Sequential(layerList);
+ classifier = Sequential(
+ Dropout(p: dropout, inplace: true),
+ Linear(1280, num_classes)
+ );
+
+ RegisterComponents();
+
+ // Weight initialization
+ foreach (var (_, m) in named_modules()) {
+ if (m is Modules.Conv2d conv) {
+ init.kaiming_normal_(conv.weight, mode: init.FanInOut.FanOut);
+ if (conv.bias is not null)
+ init.zeros_(conv.bias);
+ } else if (m is Modules.BatchNorm2d norm) {
+ init.ones_(norm.weight);
+ init.zeros_(norm.bias);
+ } else if (m is Modules.Linear linear) {
+ init.kaiming_uniform_(linear.weight, mode: init.FanInOut.FanOut, nonlinearity: init.NonlinearityType.Sigmoid);
+ init.zeros_(linear.bias);
+ }
+ }
+
+ if (!string.IsNullOrEmpty(weights_file)) {
+ this.load(weights_file!, skip: skipfc ? new[] { "classifier.1.weight", "classifier.1.bias" } : null);
+ }
+
+ if (device != null && device.type != DeviceType.CPU)
+ this.to(device);
+ }
+
+ public override Tensor forward(Tensor x)
+ {
+ using (var _ = NewDisposeScope()) {
+ x = layers.call(x);
+ // Equivalent to global avgpool and removing H and W dimensions.
+ x = x.mean(new long[] { 2, 3 });
+ return classifier.call(x).MoveToOuterDisposeScope();
+ }
+ }
+ }
+ }
+}
diff --git a/src/TorchVision/models/ShuffleNetV2.cs b/src/TorchVision/models/ShuffleNetV2.cs
new file mode 100644
index 000000000..3c7b5348e
--- /dev/null
+++ b/src/TorchVision/models/ShuffleNetV2.cs
@@ -0,0 +1,316 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python version of torchvision,
+// largely located in the files found in this folder:
+//
+// https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/pytorch/vision/blob/main/LICENSE
+//
+
+using System;
+using System.Collections.Generic;
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+
+#nullable enable
+namespace TorchSharp
+{
+ public static partial class torchvision
+ {
+ public static partial class models
+ {
+ /// <summary>
+ /// ShuffleNet V2 with 0.5x output channels, as described in
+ /// <a href="https://arxiv.org/abs/1807.11164">"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design"</a>.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ /// <remarks>
+ /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+ /// using the exportsd.py script, then loading into the .NET instance:
+ ///
+ /// from torchvision import models
+ /// import exportsd
+ ///
+ /// model = models.shufflenet_v2_x0_5(pretrained=True)
+ /// f = open("model_weights.dat", "wb")
+ /// exportsd.save_state_dict(model.state_dict(), f)
+ /// f.close()
+ ///
+ /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+ ///
+ /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB
+ /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded
+ /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
+ /// </remarks>
+ public static Modules.ShuffleNetV2 shufflenet_v2_x0_5(
+ int num_classes = 1000,
+ string? weights_file = null,
+ bool skipfc = true,
+ Device? device = null)
+ {
+ return new Modules.ShuffleNetV2(
+ new int[] { 4, 8, 4 },
+ new int[] { 24, 48, 96, 192, 1024 },
+ num_classes, weights_file, skipfc, device);
+ }
+
+ /// <summary>
+ /// ShuffleNet V2 with 1.0x output channels, as described in
+ /// <a href="https://arxiv.org/abs/1807.11164">"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design"</a>.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ public static Modules.ShuffleNetV2 shufflenet_v2_x1_0(
+ int num_classes = 1000,
+ string? weights_file = null,
+ bool skipfc = true,
+ Device? device = null)
+ {
+ return new Modules.ShuffleNetV2(
+ new int[] { 4, 8, 4 },
+ new int[] { 24, 116, 232, 464, 1024 },
+ num_classes, weights_file, skipfc, device);
+ }
+
+ /// <summary>
+ /// ShuffleNet V2 with 1.5x output channels, as described in
+ /// <a href="https://arxiv.org/abs/1807.11164">"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design"</a>.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ public static Modules.ShuffleNetV2 shufflenet_v2_x1_5(
+ int num_classes = 1000,
+ string? weights_file = null,
+ bool skipfc = true,
+ Device? device = null)
+ {
+ return new Modules.ShuffleNetV2(
+ new int[] { 4, 8, 4 },
+ new int[] { 24, 176, 352, 704, 1024 },
+ num_classes, weights_file, skipfc, device);
+ }
+
+ /// <summary>
+ /// ShuffleNet V2 with 2.0x output channels, as described in
+ /// <a href="https://arxiv.org/abs/1807.11164">"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design"</a>.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ public static Modules.ShuffleNetV2 shufflenet_v2_x2_0(
+ int num_classes = 1000,
+ string? weights_file = null,
+ bool skipfc = true,
+ Device? device = null)
+ {
+ return new Modules.ShuffleNetV2(
+ new int[] { 4, 8, 4 },
+ new int[] { 24, 244, 488, 976, 2048 },
+ num_classes, weights_file, skipfc, device);
+ }
+ }
+ }
+
+ namespace Modules
+ {
+ // Based on https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py
+ // License: https://github.com/pytorch/vision/blob/main/LICENSE
+
+ public class ShuffleNetV2 : Module<Tensor, Tensor>
+ {
+ private static Tensor channel_shuffle(Tensor x, int groups)
+ {
+ var batchsize = x.shape[0];
+ var num_channels = x.shape[1];
+ var height = x.shape[2];
+ var width = x.shape[3];
+ var channels_per_group = num_channels / groups;
+
+ x = x.view(batchsize, groups, channels_per_group, height, width);
+ x = x.transpose(1, 2).contiguous();
+ x = x.view(batchsize, num_channels, height, width);
+ return x;
+ }
+
+ private static Module<Tensor, Tensor> depthwise_conv(
+ long i, long o, long kernel_size, long stride = 1, long padding = 0, bool bias = false)
+ {
+ return Conv2d(i, o, kernel_size: kernel_size, stride: stride, padding: padding, bias: bias, groups: i);
+ }
+
+ private class InvertedResidual : Module<Tensor, Tensor>
+ {
+ private readonly Module<Tensor, Tensor> branch1;
+ private readonly Module<Tensor, Tensor> branch2;
+ private readonly int _stride;
+
+ public InvertedResidual(string name, long inp, long oup, int stride) : base(name)
+ {
+ if (stride < 1 || stride > 3)
+ throw new ArgumentException("illegal stride value", nameof(stride));
+
+ _stride = stride;
+ var branch_features = oup / 2;
+
+ if (stride > 1) {
+ branch1 = Sequential(
+ depthwise_conv(inp, inp, kernel_size: 3, stride: stride, padding: 1),
+ BatchNorm2d(inp),
+ Conv2d(inp, branch_features, kernel_size: 1, stride: 1, padding: 0L, bias: false),
+ BatchNorm2d(branch_features),
+ ReLU(inplace: true)
+ );
+ } else {
+ branch1 = Sequential();
+ }
+
+ branch2 = Sequential(
+ Conv2d(stride > 1 ? inp : branch_features, branch_features, kernel_size: 1, stride: 1, padding: 0L, bias: false),
+ BatchNorm2d(branch_features),
+ ReLU(inplace: true),
+ depthwise_conv(branch_features, branch_features, kernel_size: 3, stride: stride, padding: 1),
+ BatchNorm2d(branch_features),
+ Conv2d(branch_features, branch_features, kernel_size: 1, stride: 1, padding: 0L, bias: false),
+ BatchNorm2d(branch_features),
+ ReLU(inplace: true)
+ );
+
+ RegisterComponents();
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ if (disposing) {
+ branch1.Dispose();
+ branch2.Dispose();
+ }
+ base.Dispose(disposing);
+ }
+
+ public override Tensor forward(Tensor x)
+ {
+ Tensor @out;
+ if (_stride == 1) {
+ var chunks = x.chunk(2, dim: 1);
+ @out = torch.cat(new[] { chunks[0], branch2.call(chunks[1]) }, 1);
+ } else {
+ @out = torch.cat(new[] { branch1.call(x), branch2.call(x) }, 1);
+ }
+ @out = channel_shuffle(@out, 2);
+ return @out;
+ }
+ }
+
+ private readonly Module<Tensor, Tensor> conv1;
+ private readonly Module<Tensor, Tensor> maxpool;
+ private readonly Module<Tensor, Tensor> stage2;
+ private readonly Module<Tensor, Tensor> stage3;
+ private readonly Module<Tensor, Tensor> stage4;
+ private readonly Module<Tensor, Tensor> conv5;
+ private readonly Module<Tensor, Tensor> fc;
+
+ protected override void Dispose(bool disposing)
+ {
+ if (disposing) {
+ conv1.Dispose(); maxpool.Dispose();
+ stage2.Dispose(); stage3.Dispose(); stage4.Dispose();
+ conv5.Dispose(); fc.Dispose();
+ }
+ base.Dispose(disposing);
+ }
+
+ private static Module<Tensor, Tensor> MakeStage(long input_channels, long output_channels, int repeats)
+ {
+ var modules = new List<Module<Tensor, Tensor>>();
+ modules.Add(new InvertedResidual("InvertedResidual", input_channels, output_channels, 2));
+ for (int i = 0; i < repeats - 1; i++) {
+ modules.Add(new InvertedResidual("InvertedResidual", output_channels, output_channels, 1));
+ }
+ return Sequential(modules.ToArray());
+ }
+
+ /// <summary>
+ /// ShuffleNet V2 main class.
+ /// </summary>
+ /// <param name="stages_repeats">Number of repeated blocks in each stage.</param>
+ /// <param name="stages_out_channels">Output channels for each stage.</param>
+ /// <param name="num_classes">Number of output classes.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last linear layer will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ public ShuffleNetV2(
+ int[] stages_repeats,
+ int[] stages_out_channels,
+ int num_classes = 1000,
+ string? weights_file = null,
+ bool skipfc = true,
+ Device? device = null) : base(nameof(ShuffleNetV2))
+ {
+ if (stages_repeats.Length != 3)
+ throw new ArgumentException("expected stages_repeats to have 3 elements", nameof(stages_repeats));
+ if (stages_out_channels.Length != 5)
+ throw new ArgumentException("expected stages_out_channels to have 5 elements", nameof(stages_out_channels));
+
+ long input_channels = 3;
+ long output_channels = stages_out_channels[0];
+
+ conv1 = Sequential(
+ Conv2d(input_channels, output_channels, kernel_size: 3, stride: 2, padding: 1, bias: false),
+ BatchNorm2d(output_channels),
+ ReLU(inplace: true)
+ );
+ input_channels = output_channels;
+
+ maxpool = MaxPool2d(kernel_size: 3, stride: 2, padding: 1);
+
+ stage2 = MakeStage(input_channels, stages_out_channels[1], stages_repeats[0]);
+ stage3 = MakeStage(stages_out_channels[1], stages_out_channels[2], stages_repeats[1]);
+ stage4 = MakeStage(stages_out_channels[2], stages_out_channels[3], stages_repeats[2]);
+
+ output_channels = stages_out_channels[4];
+ conv5 = Sequential(
+ Conv2d(stages_out_channels[3], output_channels, kernel_size: 1, stride: 1, padding: 0L, bias: false),
+ BatchNorm2d(output_channels),
+ ReLU(inplace: true)
+ );
+
+ fc = Linear(output_channels, num_classes);
+
+ RegisterComponents();
+
+ if (!string.IsNullOrEmpty(weights_file)) {
+ this.load(weights_file!, skip: skipfc ? new[] { "fc.weight", "fc.bias" } : null);
+ }
+
+ if (device != null && device.type != DeviceType.CPU)
+ this.to(device);
+ }
+
+ public override Tensor forward(Tensor x)
+ {
+ using (var _ = NewDisposeScope()) {
+ x = conv1.call(x);
+ x = maxpool.call(x);
+ x = stage2.call(x);
+ x = stage3.call(x);
+ x = stage4.call(x);
+ x = conv5.call(x);
+ x = x.mean(new long[] { 2, 3 }); // global pool
+ x = fc.call(x);
+ return x.MoveToOuterDisposeScope();
+ }
+ }
+ }
+ }
+}
diff --git a/src/TorchVision/models/SqueezeNet.cs b/src/TorchVision/models/SqueezeNet.cs
new file mode 100644
index 000000000..34df94020
--- /dev/null
+++ b/src/TorchVision/models/SqueezeNet.cs
@@ -0,0 +1,257 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python version of torchvision,
+// largely located in the files found in this folder:
+//
+// https://github.com/pytorch/vision/blob/main/torchvision/models/squeezenet.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/pytorch/vision/blob/main/LICENSE
+//
+
+using System;
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+
+#nullable enable
+namespace TorchSharp
+{
+ public static partial class torchvision
+ {
+ public static partial class models
+ {
+ /// <summary>
+ /// SqueezeNet 1.0 model from
+ /// "SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and less than 0.5MB model size".
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last convolutional layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ /// <remarks>
+ /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+ /// using the exportsd.py script, then loading into the .NET instance:
+ ///
+ /// from torchvision import models
+ /// import exportsd
+ ///
+ /// model = models.squeezenet1_0(pretrained=True)
+ /// f = open("model_weights.dat", "wb")
+ /// exportsd.save_state_dict(model.state_dict(), f)
+ /// f.close()
+ ///
+ /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+ ///
+ /// In order for the weights to be loaded, the number of classes has to be the same as
+ /// in the pre-trained model, which is 1000.
+ ///
+ /// It is also possible to skip loading the last classifier layer and use it for transfer-learning
+ /// with a different number of output classes. To do so, pass skipfc=true.
+ ///
+ /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB
+ /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded
+ /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
+ /// </remarks>
+ public static Modules.SqueezeNet squeezenet1_0(
+     int num_classes = 1000,
+     float dropout = 0.5f,
+     string? weights_file = null,
+     bool skipfc = true,
+     Device? device = null)
+     // Delegate to the shared implementation; "1_0" selects the original architecture.
+     => new Modules.SqueezeNet("1_0", num_classes, dropout, weights_file, skipfc, device);
+
+ /// <summary>
+ /// SqueezeNet 1.1 model from the official SqueezeNet repo.
+ /// SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters than SqueezeNet 1.0.
+ /// </summary>
+ /// <param name="num_classes">The number of output classes.</param>
+ /// <param name="dropout">The dropout ratio.</param>
+ /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+ /// <param name="skipfc">If true, the last convolutional layer of the classifier will not be loaded from the weights file.</param>
+ /// <param name="device">The device to locate the model on.</param>
+ /// <remarks>
+ /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+ /// using the exportsd.py script, then loading into the .NET instance:
+ ///
+ /// from torchvision import models
+ /// import exportsd
+ ///
+ /// model = models.squeezenet1_1(pretrained=True)
+ /// f = open("model_weights.dat", "wb")
+ /// exportsd.save_state_dict(model.state_dict(), f)
+ /// f.close()
+ ///
+ /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+ ///
+ /// In order for the weights to be loaded, the number of classes has to be the same as
+ /// in the pre-trained model, which is 1000.
+ ///
+ /// It is also possible to skip loading the last classifier layer and use it for transfer-learning
+ /// with a different number of output classes. To do so, pass skipfc=true.
+ ///
+ /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB
+ /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded
+ /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
+ /// </remarks>
+ public static Modules.SqueezeNet squeezenet1_1(
+     int num_classes = 1000,
+     float dropout = 0.5f,
+     string? weights_file = null,
+     bool skipfc = true,
+     Device? device = null)
+     // Delegate to the shared implementation; "1_1" selects the lighter architecture.
+     => new Modules.SqueezeNet("1_1", num_classes, dropout, weights_file, skipfc, device);
+ }
+ }
+
+ namespace Modules
+ {
+ // Based on https://github.com/pytorch/vision/blob/main/torchvision/models/squeezenet.py
+ // License: https://github.com/pytorch/vision/blob/main/LICENSE
+
+ /// <summary>
+ /// SqueezeNet image-classification model ("1_0" or "1_1" architecture variant),
+ /// translated from torchvision's squeezenet.py. Maps an (N, 3, H, W) image batch
+ /// to (N, num_classes) logits.
+ /// </summary>
+ public class SqueezeNet : Module
+ {
+ // The SqueezeNet "Fire" building block: a 1x1 "squeeze" convolution reduces the
+ // channel count, then parallel 1x1 and 3x3 "expand" convolutions run on the
+ // squeezed tensor and their outputs are concatenated along the channel dim.
+ private class Fire : Module
+ {
+ private readonly Module squeeze;
+ private readonly Module squeeze_activation;
+ private readonly Module expand1x1;
+ private readonly Module expand1x1_activation;
+ private readonly Module expand3x3;
+ private readonly Module expand3x3_activation;
+
+ // inplanes: input channels; squeeze_planes: channels after the 1x1 squeeze;
+ // expand1x1_planes/expand3x3_planes: channels of the two parallel expand paths
+ // (the module's output has expand1x1_planes + expand3x3_planes channels).
+ public Fire(string name, int inplanes, int squeeze_planes, int expand1x1_planes, int expand3x3_planes)
+ : base(name)
+ {
+ squeeze = Conv2d(inplanes, squeeze_planes, kernel_size: 1);
+ squeeze_activation = ReLU(inplace: true);
+ expand1x1 = Conv2d(squeeze_planes, expand1x1_planes, kernel_size: 1);
+ expand1x1_activation = ReLU(inplace: true);
+ // padding: 1 keeps the 3x3 path's spatial size equal to the 1x1 path's,
+ // so the two branch outputs can be concatenated in forward().
+ expand3x3 = Conv2d(squeeze_planes, expand3x3_planes, kernel_size: 3, padding: 1);
+ expand3x3_activation = ReLU(inplace: true);
+ RegisterComponents();
+ }
+
+ protected override void Dispose(bool disposing)
+ {
+ if (disposing) {
+ squeeze.Dispose();
+ squeeze_activation.Dispose();
+ expand1x1.Dispose();
+ expand1x1_activation.Dispose();
+ expand3x3.Dispose();
+ expand3x3_activation.Dispose();
+ }
+ base.Dispose(disposing);
+ }
+
+ public override Tensor forward(Tensor x)
+ {
+ // Squeeze first, then concatenate the two expand branches along dim 1 (channels).
+ x = squeeze_activation.call(squeeze.call(x));
+ return torch.cat(new[] {
+ expand1x1_activation.call(expand1x1.call(x)),
+ expand3x3_activation.call(expand3x3.call(x))
+ }, 1);
+ }
+ }
+
+ // Convolutional trunk and classification head; both are Sequential containers.
+ private readonly Module features;
+ private readonly Module classifier;
+
+ protected override void Dispose(bool disposing)
+ {
+ if (disposing) {
+ features.Dispose();
+ classifier.Dispose();
+ }
+ base.Dispose(disposing);
+ }
+
+ // version: "1_0" or "1_1" (any other value throws ArgumentException).
+ // weights_file: optional path to a saved state-dict; when given, weights are
+ // loaded from it instead of being randomly initialized. skipfc: when loading
+ // weights, skip the final classifier conv so a different num_classes can be used.
+ public SqueezeNet(string version, int num_classes = 1000, float dropout = 0.5f,
+ string? weights_file = null, bool skipfc = true, Device? device = null)
+ : base(nameof(SqueezeNet))
+ {
+ Module final_conv;
+
+ if (version == "1_0") {
+ features = Sequential(
+ Conv2d(3, 96, kernel_size: 7, stride: 2),
+ ReLU(inplace: true),
+ MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+ new Fire("Fire", 96, 16, 64, 64),
+ new Fire("Fire", 128, 16, 64, 64),
+ new Fire("Fire", 128, 32, 128, 128),
+ MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+ new Fire("Fire", 256, 32, 128, 128),
+ new Fire("Fire", 256, 48, 192, 192),
+ new Fire("Fire", 384, 48, 192, 192),
+ new Fire("Fire", 384, 64, 256, 256),
+ MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+ new Fire("Fire", 512, 64, 256, 256)
+ );
+ } else if (version == "1_1") {
+ // 1.1 uses a smaller 3x3 stem and pools earlier than 1.0.
+ features = Sequential(
+ Conv2d(3, 64, kernel_size: 3, stride: 2),
+ ReLU(inplace: true),
+ MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+ new Fire("Fire", 64, 16, 64, 64),
+ new Fire("Fire", 128, 16, 64, 64),
+ MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+ new Fire("Fire", 128, 32, 128, 128),
+ new Fire("Fire", 256, 32, 128, 128),
+ MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+ new Fire("Fire", 256, 48, 192, 192),
+ new Fire("Fire", 384, 48, 192, 192),
+ new Fire("Fire", 384, 64, 256, 256),
+ new Fire("Fire", 512, 64, 256, 256)
+ );
+ } else {
+ throw new ArgumentException($"Unsupported SqueezeNet version {version}: 1_0 or 1_1 expected");
+ }
+
+ // The classifier ends with a 1x1 convolution (not a Linear layer) followed by
+ // global average pooling; final_conv is kept in a local so the initialization
+ // loop below can single it out by reference.
+ final_conv = Conv2d(512, num_classes, kernel_size: 1);
+ classifier = Sequential(
+ Dropout(p: dropout),
+ final_conv,
+ ReLU(inplace: true),
+ AdaptiveAvgPool2d(new long[] { 1, 1 })
+ );
+
+ RegisterComponents();
+
+ if (string.IsNullOrEmpty(weights_file)) {
+ // No weights file: initialize as the Python reference does — the final conv
+ // gets N(0, 0.01) weights, every other conv uses Kaiming-uniform, and all
+ // conv biases are zeroed.
+ foreach (var (_, m) in named_modules()) {
+ if (m is Modules.Conv2d conv) {
+ if (object.ReferenceEquals(m, final_conv)) {
+ nn.init.normal_(conv.weight, mean: 0.0, std: 0.01);
+ } else {
+ nn.init.kaiming_uniform_(conv.weight);
+ }
+ if (conv.bias is not null)
+ nn.init.constant_(conv.bias, 0);
+ }
+ }
+ } else {
+ // "classifier.1" is final_conv (index 1 in the classifier Sequential);
+ // skipping its weights allows transfer learning with a different num_classes.
+ this.load(weights_file!, skip: skipfc ? new[] { "classifier.1.weight", "classifier.1.bias" } : null);
+ }
+
+ if (device != null && device.type != DeviceType.CPU)
+ this.to(device);
+ }
+
+ public override Tensor forward(Tensor x)
+ {
+ using (var _ = NewDisposeScope()) {
+ x = features.call(x);
+ x = classifier.call(x);
+ // The classifier's global pool leaves (N, num_classes, 1, 1); flatten to
+ // (N, num_classes). Intermediates are released by the dispose scope.
+ return torch.flatten(x, 1).MoveToOuterDisposeScope();
+ }
+ }
+ }
+ }
+}
diff --git a/test/TorchSharpTest/TestTorchVision.cs b/test/TorchSharpTest/TestTorchVision.cs
index c8f1bc341..e534ef36a 100644
--- a/test/TorchSharpTest/TestTorchVision.cs
+++ b/test/TorchSharpTest/TestTorchVision.cs
@@ -799,6 +799,203 @@ public void TestMobileNetV3()
}
}
+ // Smoke-tests both SqueezeNet variants. Both share the same parameter layout:
+ // 52 state-dict entries and top-level children named "features" and "classifier".
+ [Fact]
+ public void TestSqueezeNet()
+ {
+ {
+ // SqueezeNet 1.0
+ using var model = squeezenet1_0();
+ var sd = model.state_dict();
+ Assert.Equal(52, sd.Count);
+ var names = model.named_children().Select(nm => nm.name).ToArray();
+ Assert.Multiple(
+ () => Assert.Equal("features", names[0]),
+ () => Assert.Equal("classifier", names[1])
+ );
+
+ using var input = torch.randn(2, 3, 224, 224);
+ using var output = model.call(input);
+
+ // A (2, 3, 224, 224) batch should yield (2, 1000) logits.
+ Assert.Equal(new long[] { 2, 1000 }, output.shape);
+ }
+ {
+ // SqueezeNet 1.1
+ using var model = squeezenet1_1();
+ var sd = model.state_dict();
+ Assert.Equal(52, sd.Count);
+ var names = model.named_children().Select(nm => nm.name).ToArray();
+ Assert.Multiple(
+ () => Assert.Equal("features", names[0]),
+ () => Assert.Equal("classifier", names[1])
+ );
+
+ using var input = torch.randn(2, 3, 224, 224);
+ using var output = model.call(input);
+
+ Assert.Equal(new long[] { 2, 1000 }, output.shape);
+ }
+ }
+
+ [Fact]
+ public void TestDenseNet121()
+ {
+     // DenseNet-121: verify parameter count, top-level layout, and logits shape.
+     using var model = densenet121();
+
+     Assert.Equal(727, model.state_dict().Count);
+
+     var childNames = model.named_children().Select(nm => nm.name).ToArray();
+     Assert.Multiple(
+         () => Assert.Equal("features", childNames[0]),
+         () => Assert.Equal("classifier", childNames[1])
+     );
+
+     using var input = torch.randn(2, 3, 224, 224);
+     using var output = model.call(input);
+
+     Assert.Equal(new long[] { 2, 1000 }, output.shape);
+ }
+
+ // DenseNet-161: checks top-level layout and logits shape. Skipped in CI for
+ // time/memory reasons; run locally to exercise it.
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestDenseNet161()
+ {
+ using var model = densenet161();
+ var names = model.named_children().Select(nm => nm.name).ToArray();
+ Assert.Multiple(
+ () => Assert.Equal("features", names[0]),
+ () => Assert.Equal("classifier", names[1])
+ );
+
+ using var input = torch.randn(2, 3, 224, 224);
+ using var output = model.call(input);
+
+ Assert.Equal(new long[] { 2, 1000 }, output.shape);
+ }
+
+ // DenseNet-169: shape-only smoke test; skipped in CI for time/memory reasons.
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestDenseNet169()
+ {
+ using var model = densenet169();
+ using var input = torch.randn(2, 3, 224, 224);
+ using var output = model.call(input);
+ Assert.Equal(new long[] { 2, 1000 }, output.shape);
+ }
+
+ // DenseNet-201: shape-only smoke test; skipped in CI for time/memory reasons.
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestDenseNet201()
+ {
+ using var model = densenet201();
+ using var input = torch.randn(2, 3, 224, 224);
+ using var output = model.call(input);
+ Assert.Equal(new long[] { 2, 1000 }, output.shape);
+ }
+
+ [Fact]
+ public void TestShuffleNetV2()
+ {
+     // Full check on the x1_0 variant: child-module layout plus output shape.
+     {
+         using var net = shufflenet_v2_x1_0();
+         var childNames = net.named_children().Select(nm => nm.name).ToArray();
+         Assert.Multiple(
+             () => Assert.Equal("conv1", childNames[0]),
+             () => Assert.Equal("maxpool", childNames[1]),
+             () => Assert.Equal("stage2", childNames[2]),
+             () => Assert.Equal("stage3", childNames[3]),
+             () => Assert.Equal("stage4", childNames[4]),
+             () => Assert.Equal("conv5", childNames[5]),
+             () => Assert.Equal("fc", childNames[6])
+         );
+
+         using var input = torch.randn(2, 3, 224, 224);
+         using var output = net.call(input);
+
+         Assert.Equal(new long[] { 2, 1000 }, output.shape);
+     }
+
+     // Lighter check on the x0_5 variant: output shape only.
+     {
+         using var net = shufflenet_v2_x0_5();
+         using var input = torch.randn(2, 3, 224, 224);
+         using var output = net.call(input);
+         Assert.Equal(new long[] { 2, 1000 }, output.shape);
+     }
+ }
+
+ [Fact]
+ public void TestEfficientNetB0()
+ {
+     // EfficientNet-B0: verify parameter count, top-level layout, and logits shape.
+     using var model = efficientnet_b0();
+
+     Assert.Equal(360, model.state_dict().Count);
+
+     var childNames = model.named_children().Select(nm => nm.name).ToArray();
+     Assert.Multiple(
+         () => Assert.Equal("features", childNames[0]),
+         () => Assert.Equal("avgpool", childNames[1]),
+         () => Assert.Equal("classifier", childNames[2])
+     );
+
+     using var input = torch.randn(2, 3, 224, 224);
+     using var output = model.call(input);
+
+     Assert.Equal(new long[] { 2, 1000 }, output.shape);
+ }
+
+ // EfficientNetV2-S: checks state-dict size, top-level layout, and logits shape.
+ // Skipped in CI for time/memory reasons; run locally to exercise it.
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestEfficientNetV2S()
+ {
+ using var model = efficientnet_v2_s();
+ var sd = model.state_dict();
+ Assert.Equal(782, sd.Count);
+ var names = model.named_children().Select(nm => nm.name).ToArray();
+ Assert.Multiple(
+ () => Assert.Equal("features", names[0]),
+ () => Assert.Equal("avgpool", names[1]),
+ () => Assert.Equal("classifier", names[2])
+ );
+
+ using var input = torch.randn(2, 3, 224, 224);
+ using var output = model.call(input);
+
+ Assert.Equal(new long[] { 2, 1000 }, output.shape);
+ }
+
+ // Construction-only smoke tests for the remaining EfficientNet variants: each
+ // just builds (and disposes) the model. All are skipped in CI for time/memory
+ // reasons but can be run locally to verify the architectures still construct.
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestEfficientNetB1() { using var model = efficientnet_b1(); }
+
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestEfficientNetB2() { using var model = efficientnet_b2(); }
+
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestEfficientNetB3() { using var model = efficientnet_b3(); }
+
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestEfficientNetB4() { using var model = efficientnet_b4(); }
+
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestEfficientNetB5() { using var model = efficientnet_b5(); }
+
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestEfficientNetB6() { using var model = efficientnet_b6(); }
+
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestEfficientNetB7() { using var model = efficientnet_b7(); }
+
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestEfficientNetV2M() { using var model = efficientnet_v2_m(); }
+
+ [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+ public void TestEfficientNetV2L() { using var model = efficientnet_v2_l(); }
+
+ [Fact]
+ public void TestMNASNet()
+ {
+     // MNASNet 1.0: verify top-level layout and logits shape.
+     using var model = mnasnet1_0();
+
+     // The original computed the state-dict but never asserted on it; at least
+     // verify it is populated. (The exact entry count was never pinned down here,
+     // unlike the other model tests.)
+     var sd = model.state_dict();
+     Assert.NotEmpty(sd);
+
+     var names = model.named_children().Select(nm => nm.name).ToArray();
+     Assert.Multiple(
+         () => Assert.Equal("layers", names[0]),
+         () => Assert.Equal("classifier", names[1])
+     );
+
+     using var input = torch.randn(2, 3, 224, 224);
+     using var output = model.call(input);
+
+     Assert.Equal(new long[] { 2, 1000 }, output.shape);
+ }
+
[Fact]
public void TestReadingAndWritingImages()
{