diff --git a/src/TorchVision/models/DenseNet.cs b/src/TorchVision/models/DenseNet.cs
new file mode 100644
index 000000000..a636b62cd
--- /dev/null
+++ b/src/TorchVision/models/DenseNet.cs
@@ -0,0 +1,367 @@
// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.

// A number of implementation details in this file have been translated from the Python version of torchvision,
// largely located in the files found in this folder:
//
// https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py
//
// The origin has the following copyright notice and license:
//
// https://github.com/pytorch/vision/blob/main/LICENSE
//

using System;
using System.Collections.Generic;
using static TorchSharp.torch;
using static TorchSharp.torch.nn;

#nullable enable
namespace TorchSharp
{
    public static partial class torchvision
    {
        public static partial class models
        {
            /// <summary>
            /// DenseNet-121 model from "Densely Connected Convolutional Networks".
            /// </summary>
            /// <param name="num_classes">The number of output classes.</param>
            /// <param name="growth_rate">How many filters to add each layer.</param>
            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers (i.e. bn_size * k features in the bottleneck layer).</param>
            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
            /// <param name="device">The device to locate the model on.</param>
            /// <remarks>
            /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
            /// using the exportsd.py script, then loading into the .NET instance:
            ///
            /// <code>
            /// from torchvision import models
            /// import exportsd
            ///
            /// model = models.densenet121(pretrained=True)
            /// f = open("model_weights.dat", "wb")
            /// exportsd.save_state_dict(model.state_dict(), f)
            /// f.close()
            /// </code>
            ///
            /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
            ///
            /// In order for the weights to be loaded, the number of classes has to be the same as
            /// in the pre-trained model, which is 1000.
            ///
            /// It is also possible to skip loading the last linear layer and use it for transfer-learning
            /// with a different number of output classes. To do so, pass skipfc=true.
            ///
            /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB
            /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded
            /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
            /// </remarks>
            public static Modules.DenseNet densenet121(
                int num_classes = 1000,
                int growth_rate = 32,
                int bn_size = 4,
                float drop_rate = 0,
                string? weights_file = null,
                bool skipfc = true,
                Device? device = null)
            {
                return new Modules.DenseNet(growth_rate, new int[] { 6, 12, 24, 16 }, 64, bn_size, drop_rate,
                    num_classes, weights_file, skipfc, device);
            }

            /// <summary>
            /// DenseNet-161 model from "Densely Connected Convolutional Networks".
            /// </summary>
            /// <param name="num_classes">The number of output classes.</param>
            /// <param name="growth_rate">How many filters to add each layer.</param>
            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers.</param>
            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
            /// <param name="device">The device to locate the model on.</param>
            public static Modules.DenseNet densenet161(
                int num_classes = 1000,
                int growth_rate = 48,
                int bn_size = 4,
                float drop_rate = 0,
                string? weights_file = null,
                bool skipfc = true,
                Device? device = null)
            {
                return new Modules.DenseNet(growth_rate, new int[] { 6, 12, 36, 24 }, 96, bn_size, drop_rate,
                    num_classes, weights_file, skipfc, device);
            }

            /// <summary>
            /// DenseNet-169 model from "Densely Connected Convolutional Networks".
            /// </summary>
            /// <param name="num_classes">The number of output classes.</param>
            /// <param name="growth_rate">How many filters to add each layer.</param>
            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers.</param>
            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
            /// <param name="device">The device to locate the model on.</param>
            public static Modules.DenseNet densenet169(
                int num_classes = 1000,
                int growth_rate = 32,
                int bn_size = 4,
                float drop_rate = 0,
                string? weights_file = null,
                bool skipfc = true,
                Device? device = null)
            {
                return new Modules.DenseNet(growth_rate, new int[] { 6, 12, 32, 32 }, 64, bn_size, drop_rate,
                    num_classes, weights_file, skipfc, device);
            }

            /// <summary>
            /// DenseNet-201 model from "Densely Connected Convolutional Networks".
            /// </summary>
            /// <param name="num_classes">The number of output classes.</param>
            /// <param name="growth_rate">How many filters to add each layer.</param>
            /// <param name="bn_size">Multiplicative factor for number of bottleneck layers.</param>
            /// <param name="drop_rate">Dropout rate after each dense layer.</param>
            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
            /// <param name="skipfc">If true, the last linear layer of the classifier will not be loaded from the weights file.</param>
            /// <param name="device">The device to locate the model on.</param>
            public static Modules.DenseNet densenet201(
                int num_classes = 1000,
                int growth_rate = 32,
                int bn_size = 4,
                float drop_rate = 0,
                string? weights_file = null,
                Device? device = null)
            {
                return new Modules.DenseNet(growth_rate, new int[] { 6, 12, 48, 32 }, 64, bn_size, drop_rate,
                    num_classes, weights_file, skipfc, device);
            }
        }
    }

    namespace Modules
    {
        // Based on https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py
        // License: https://github.com/pytorch/vision/blob/main/LICENSE

        public class DenseNet : Module<Tensor, Tensor>
        {
            /// <summary>
            /// A single dense layer (BN-ReLU-Conv1x1-BN-ReLU-Conv3x3) as described in the paper.
            /// </summary>
            private class DenseLayer : Module<Tensor, Tensor>
            {
                private readonly Module<Tensor, Tensor> norm1;
                private readonly Module<Tensor, Tensor> relu1;
                private readonly Module<Tensor, Tensor> conv1;
                private readonly Module<Tensor, Tensor> norm2;
                private readonly Module<Tensor, Tensor> relu2;
                private readonly Module<Tensor, Tensor> conv2;
                private readonly float drop_rate;

                public DenseLayer(string name, int num_input_features, int growth_rate, int bn_size, float drop_rate)
                    : base(name)
                {
                    norm1 = BatchNorm2d(num_input_features);
                    relu1 = ReLU(inplace: true);
                    conv1 = Conv2d(num_input_features, bn_size * growth_rate, kernel_size: 1, stride: 1, bias: false);
                    norm2 = BatchNorm2d(bn_size * growth_rate);
                    relu2 = ReLU(inplace: true);
                    conv2 = Conv2d(bn_size * growth_rate, growth_rate, kernel_size: 3, stride: 1, padding: 1, bias: false);
                    this.drop_rate = drop_rate;
                    RegisterComponents();
                }

                protected override void Dispose(bool disposing)
                {
                    if (disposing) {
                        norm1.Dispose(); relu1.Dispose(); conv1.Dispose();
                        norm2.Dispose(); relu2.Dispose(); conv2.Dispose();
                    }
                    base.Dispose(disposing);
                }

                public override Tensor forward(Tensor input)
                {
                    // Bottleneck (1x1) followed by spatial (3x3) convolution, each preceded by BN-ReLU.
                    var bottleneck_output = conv1.call(relu1.call(norm1.call(input)));
                    var new_features = conv2.call(relu2.call(norm2.call(bottleneck_output)));
                    if (drop_rate > 0 && training)
                        new_features = nn.functional.dropout(new_features, drop_rate, training);
                    return new_features;
                }
            }

            /// <summary>
            /// A dense block consisting of multiple dense layers with progressive feature concatenation.
+ /// + private class DenseBlock : Module + { + private readonly Module[] denselayers; + + public DenseBlock(string name, int num_layers, int num_input_features, int bn_size, int growth_rate, float drop_rate) + : base(name) + { + denselayers = new Module[num_layers]; + for (int i = 0; i < num_layers; i++) { + var layer = new DenseLayer($"denselayer{i + 1}", + num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate); + denselayers[i] = layer; + // Use register_module to ensure correct named hierarchy for state_dict compatibility + register_module($"denselayer{i + 1}", layer); + } + } + + protected override void Dispose(bool disposing) + { + if (disposing) { + foreach (var layer in denselayers) + layer.Dispose(); + } + base.Dispose(disposing); + } + + public override Tensor forward(Tensor init_features) + { + var features = new List { init_features }; + foreach (var layer in denselayers) { + var concat_features = torch.cat(features.ToArray(), 1); + var new_features = layer.call(concat_features); + features.Add(new_features); + } + return torch.cat(features.ToArray(), 1); + } + } + + /// + /// A transition layer (BN-ReLU-Conv1x1-AvgPool) that reduces feature map size. 
+ /// + private class Transition : Module + { + private readonly Module norm; + private readonly Module relu; + private readonly Module conv; + private readonly Module pool; + + public Transition(string name, int num_input_features, int num_output_features) : base(name) + { + norm = BatchNorm2d(num_input_features); + relu = ReLU(inplace: true); + conv = Conv2d(num_input_features, num_output_features, kernel_size: 1, stride: 1, bias: false); + pool = AvgPool2d(kernel_size: 2, stride: 2); + RegisterComponents(); + } + + protected override void Dispose(bool disposing) + { + if (disposing) { + norm.Dispose(); relu.Dispose(); conv.Dispose(); pool.Dispose(); + } + base.Dispose(disposing); + } + + public override Tensor forward(Tensor x) + { + return pool.call(conv.call(relu.call(norm.call(x)))); + } + } + + private readonly Module features; + private readonly Module classifier; + + protected override void Dispose(bool disposing) + { + if (disposing) { + features.Dispose(); + classifier.Dispose(); + } + base.Dispose(disposing); + } + + /// + /// DenseNet model class. + /// + /// How many filters to add each layer. + /// Number of layers in each dense block. + /// Number of filters in the first convolution layer. + /// Multiplicative factor for number of bottleneck layers. + /// Dropout rate after each dense layer. + /// Number of output classes. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer will not be loaded from the weights file. + /// The device to locate the model on. + public DenseNet( + int growth_rate = 32, + int[]? block_config = null, + int num_init_features = 64, + int bn_size = 4, + float drop_rate = 0, + int num_classes = 1000, + string? weights_file = null, + bool skipfc = true, + Device? 
device = null) : base(nameof(DenseNet)) + { + if (block_config == null) + block_config = new int[] { 6, 12, 24, 16 }; + + // Build the features Sequential with named children + var f = Sequential(); + f.append("conv0", Conv2d(3, num_init_features, kernel_size: 7, stride: 2, padding: 3, bias: false)); + f.append("norm0", BatchNorm2d(num_init_features)); + f.append("relu0", ReLU(inplace: true)); + f.append("pool0", MaxPool2d(kernel_size: 3, stride: 2, padding: 1)); + + int num_features = num_init_features; + for (int i = 0; i < block_config.Length; i++) { + var block = new DenseBlock("DenseBlock", + block_config[i], num_features, bn_size, growth_rate, drop_rate); + f.append($"denseblock{i + 1}", block); + num_features = num_features + block_config[i] * growth_rate; + if (i != block_config.Length - 1) { + var trans = new Transition("Transition", + num_features, num_features / 2); + f.append($"transition{i + 1}", trans); + num_features = num_features / 2; + } + } + + f.append("norm5", BatchNorm2d(num_features)); + features = f; + + classifier = Linear(num_features, num_classes); + + RegisterComponents(); + + // Weight initialization + if (string.IsNullOrEmpty(weights_file)) { + foreach (var (_, m) in named_modules()) { + if (m is Modules.Conv2d conv) { + nn.init.kaiming_normal_(conv.weight); + } else if (m is Modules.BatchNorm2d bn) { + nn.init.constant_(bn.weight, 1); + nn.init.constant_(bn.bias, 0); + } else if (m is Modules.Linear linear) { + nn.init.constant_(linear.bias, 0); + } + } + } else { + this.load(weights_file!, skip: skipfc ? 
new[] { "classifier.weight", "classifier.bias" } : null); + } + + if (device != null && device.type != DeviceType.CPU) + this.to(device); + } + + public override Tensor forward(Tensor x) + { + using (var _ = NewDisposeScope()) { + x = features.call(x); + x = nn.functional.relu(x); + x = nn.functional.adaptive_avg_pool2d(x, new long[] { 1, 1 }); + x = torch.flatten(x, 1); + return classifier.call(x).MoveToOuterDisposeScope(); + } + } + } + } +} diff --git a/src/TorchVision/models/EfficientNet.cs b/src/TorchVision/models/EfficientNet.cs new file mode 100644 index 000000000..ded461949 --- /dev/null +++ b/src/TorchVision/models/EfficientNet.cs @@ -0,0 +1,819 @@ +// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. + +// A number of implementation details in this file have been translated from the Python version of torchvision, +// largely located in the files found in this folder: +// +// https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py +// +// The origin has the following copyright notice and license: +// +// https://github.com/pytorch/vision/blob/main/LICENSE +// + +using System; +using System.Collections.Generic; +using static TorchSharp.torch; +using static TorchSharp.torch.nn; +using static TorchSharp.torchvision.models._utils; +using static TorchSharp.torchvision.ops; +using TorchSharp.Modules; + +#nullable enable +namespace TorchSharp +{ + namespace Modules + { + public class EfficientNet : nn.Module + { + internal enum BlockType { MBConv, FusedMBConv } + + /// + /// Stores information listed at Tables 1 and 4 of the EfficientNet papers. 
+ /// + internal class _MBConvConfig + { + public double expand_ratio; + public long kernel; + public long stride; + public long input_channels; + public long out_channels; + public long num_layers; + public BlockType block_type; + + public _MBConvConfig( + double expand_ratio, long kernel, long stride, + long input_channels, long out_channels, long num_layers, + BlockType block_type) + { + this.expand_ratio = expand_ratio; + this.kernel = kernel; + this.stride = stride; + this.input_channels = input_channels; + this.out_channels = out_channels; + this.num_layers = num_layers; + this.block_type = block_type; + } + + public static long adjust_channels(long channels, double width_mult, long? min_value = null) + { + return _make_divisible(channels * width_mult, 8, min_value); + } + + public _MBConvConfig ShallowCopy() + { + return (_MBConvConfig)this.MemberwiseClone(); + } + } + + /// + /// Config for MBConv blocks (EfficientNet B0-B7). + /// Applies width and depth multipliers for compound scaling. + /// + internal class MBConvConfig : _MBConvConfig + { + public MBConvConfig( + double expand_ratio, long kernel, long stride, + long input_channels, long out_channels, long num_layers, + double width_mult = 1.0, double depth_mult = 1.0) + : base(expand_ratio, kernel, stride, + adjust_channels(input_channels, width_mult), + adjust_channels(out_channels, width_mult), + adjust_depth(num_layers, depth_mult), + BlockType.MBConv) + { + } + + public static long adjust_depth(long num_layers, double depth_mult) + { + return (long)Math.Ceiling(num_layers * depth_mult); + } + } + + /// + /// Config for FusedMBConv blocks (EfficientNet V2). 
+ /// + internal class FusedMBConvConfig : _MBConvConfig + { + public FusedMBConvConfig( + double expand_ratio, long kernel, long stride, + long input_channels, long out_channels, long num_layers) + : base(expand_ratio, kernel, stride, + input_channels, out_channels, num_layers, + BlockType.FusedMBConv) + { + } + } + + /// + /// MBConv block: Mobile Inverted Bottleneck Conv with Squeeze-and-Excitation. + /// + private class MBConv : nn.Module + { + private readonly nn.Module block; + private readonly torchvision.StochasticDepth stochastic_depth; + private readonly bool use_res_connect; + + protected override void Dispose(bool disposing) + { + if (disposing) { + block.Dispose(); + stochastic_depth.Dispose(); + } + base.Dispose(disposing); + } + + public MBConv( + string name, + _MBConvConfig cnf, + double stochastic_depth_prob, + Func> norm_layer) : base(name) + { + if (!(1 <= cnf.stride && cnf.stride <= 2)) + throw new ArgumentException("illegal stride value"); + + use_res_connect = cnf.stride == 1 && cnf.input_channels == cnf.out_channels; + + var layers = new List>(); + Func> activation_layer = (inplace) => nn.SiLU(inplace); + + // expand + var expanded_channels = _MBConvConfig.adjust_channels(cnf.input_channels, cnf.expand_ratio); + if (expanded_channels != cnf.input_channels) { + layers.Add(Conv2dNormActivation( + cnf.input_channels, expanded_channels, + kernel_size: 1, + norm_layer: norm_layer, + activation_layer: activation_layer)); + } + + // depthwise + layers.Add(Conv2dNormActivation( + expanded_channels, expanded_channels, + kernel_size: cnf.kernel, + stride: cnf.stride, + groups: expanded_channels, + norm_layer: norm_layer, + activation_layer: activation_layer)); + + // squeeze and excitation + var squeeze_channels = Math.Max(1, cnf.input_channels / 4); + layers.Add( + torchvision.ops.SqueezeExcitation( + expanded_channels, + squeeze_channels, + activation: () => nn.SiLU(inplace: true))); + + // project + layers.Add(Conv2dNormActivation( + 
expanded_channels, cnf.out_channels, + kernel_size: 1, + norm_layer: norm_layer, + activation_layer: null)); + + block = nn.Sequential(layers); + stochastic_depth = torchvision.ops.StochasticDepth(stochastic_depth_prob, torchvision.StochasticDepth.Mode.Row); + + RegisterComponents(); + } + + public override Tensor forward(Tensor input) + { + var result = block.call(input); + if (use_res_connect) { + result = stochastic_depth.call(result); + result += input; + } + return result; + } + } + + /// + /// FusedMBConv block: Fused Mobile Inverted Bottleneck Conv (no depthwise or SE). + /// + private class FusedMBConv : nn.Module + { + private readonly nn.Module block; + private readonly torchvision.StochasticDepth stochastic_depth; + private readonly bool use_res_connect; + + protected override void Dispose(bool disposing) + { + if (disposing) { + block.Dispose(); + stochastic_depth.Dispose(); + } + base.Dispose(disposing); + } + + public FusedMBConv( + string name, + _MBConvConfig cnf, + double stochastic_depth_prob, + Func> norm_layer) : base(name) + { + if (!(1 <= cnf.stride && cnf.stride <= 2)) + throw new ArgumentException("illegal stride value"); + + use_res_connect = cnf.stride == 1 && cnf.input_channels == cnf.out_channels; + + var layers = new List>(); + Func> activation_layer = (inplace) => nn.SiLU(inplace); + + var expanded_channels = _MBConvConfig.adjust_channels(cnf.input_channels, cnf.expand_ratio); + if (expanded_channels != cnf.input_channels) { + // fused expand + layers.Add(Conv2dNormActivation( + cnf.input_channels, expanded_channels, + kernel_size: cnf.kernel, + stride: cnf.stride, + norm_layer: norm_layer, + activation_layer: activation_layer)); + + // project + layers.Add(Conv2dNormActivation( + expanded_channels, cnf.out_channels, + kernel_size: 1, + norm_layer: norm_layer, + activation_layer: null)); + } else { + layers.Add(Conv2dNormActivation( + cnf.input_channels, cnf.out_channels, + kernel_size: cnf.kernel, + stride: cnf.stride, + norm_layer: 
norm_layer, + activation_layer: activation_layer)); + } + + block = nn.Sequential(layers); + stochastic_depth = torchvision.ops.StochasticDepth(stochastic_depth_prob, torchvision.StochasticDepth.Mode.Row); + + RegisterComponents(); + } + + public override Tensor forward(Tensor input) + { + var result = block.call(input); + if (use_res_connect) { + result = stochastic_depth.call(result); + result += input; + } + return result; + } + } + + private readonly nn.Module features; + private readonly nn.Module avgpool; + private readonly nn.Module classifier; + + protected override void Dispose(bool disposing) + { + if (disposing) { + features.Dispose(); + avgpool.Dispose(); + classifier.Dispose(); + } + base.Dispose(disposing); + } + + /// + /// EfficientNet V1 and V2 main class + /// + /// + /// Network structure + /// The dropout probability + /// The stochastic depth probability + /// Number of classes + /// Module specifying the normalization layer to use + /// The number of channels on the penultimate layer + internal EfficientNet( + string name, + _MBConvConfig[] inverted_residual_setting, + double dropout, + double stochastic_depth_prob = 0.2, + long num_classes = 1000, + Func>? norm_layer = null, + long? 
last_channel = null) : base(name) + { + if (inverted_residual_setting == null || inverted_residual_setting.Length == 0) + throw new ArgumentException("The inverted_residual_setting should not be empty"); + + if (norm_layer == null) + norm_layer = (features) => nn.BatchNorm2d(features); + + var layers = new List>(); + + // building first layer + var firstconv_output_channels = inverted_residual_setting[0].input_channels; + layers.Add(Conv2dNormActivation( + 3, firstconv_output_channels, + kernel_size: 3, stride: 2, + norm_layer: norm_layer, + activation_layer: (inplace) => nn.SiLU(inplace))); + + // building inverted residual blocks + long total_stage_blocks = 0; + foreach (var cnf in inverted_residual_setting) + total_stage_blocks += cnf.num_layers; + + long stage_block_id = 0; + foreach (var cnf in inverted_residual_setting) { + var stage = new List>(); + for (int i = 0; i < cnf.num_layers; i++) { + var block_cnf = cnf.ShallowCopy(); + + // overwrite info if not the first conv in the stage + if (stage.Count > 0) { + block_cnf.input_channels = block_cnf.out_channels; + block_cnf.stride = 1; + } + + // adjust stochastic depth probability based on the depth of the stage block + var sd_prob = stochastic_depth_prob * (double)stage_block_id / total_stage_blocks; + + if (block_cnf.block_type == BlockType.FusedMBConv) { + stage.Add(new FusedMBConv("FusedMBConv", block_cnf, sd_prob, norm_layer)); + } else { + stage.Add(new MBConv("MBConv", block_cnf, sd_prob, norm_layer)); + } + stage_block_id++; + } + layers.Add(nn.Sequential(stage)); + } + + // building last several layers + var lastconv_input_channels = inverted_residual_setting[inverted_residual_setting.Length - 1].out_channels; + var lastconv_output_channels = last_channel.HasValue ? 
last_channel.Value : 4 * lastconv_input_channels; + layers.Add(Conv2dNormActivation( + lastconv_input_channels, lastconv_output_channels, + kernel_size: 1, + norm_layer: norm_layer, + activation_layer: (inplace) => nn.SiLU(inplace))); + + features = nn.Sequential(layers); + avgpool = nn.AdaptiveAvgPool2d(1); + classifier = nn.Sequential( + nn.Dropout(p: dropout, inplace: true), + nn.Linear(lastconv_output_channels, num_classes)); + + RegisterComponents(); + + foreach (var (_, m) in this.named_modules()) { + if (m is Modules.Conv2d) { + var conv = (Modules.Conv2d)m; + nn.init.kaiming_normal_(conv.weight, mode: nn.init.FanInOut.FanOut); + if (conv.bias is not null) { + nn.init.zeros_(conv.bias); + } + } else if (m is Modules.BatchNorm2d) { + var norm = (Modules.BatchNorm2d)m; + nn.init.ones_(norm.weight); + nn.init.zeros_(norm.bias); + } else if (m is Modules.GroupNorm) { + var norm = (Modules.GroupNorm)m; + nn.init.ones_(norm.weight); + nn.init.zeros_(norm.bias); + } else if (m is Modules.Linear) { + var linear = (Modules.Linear)m; + var init_range = 1.0 / Math.Sqrt(linear.weight.shape[0]); + nn.init.uniform_(linear.weight, -init_range, init_range); + nn.init.zeros_(linear.bias); + } + } + } + + public override Tensor forward(Tensor x) + { + using (var _ = NewDisposeScope()) { + x = features.call(x); + x = avgpool.call(x); + x = torch.flatten(x, 1); + x = classifier.call(x); + return x.MoveToOuterDisposeScope(); + } + } + } + } + + public static partial class torchvision + { + public static partial class models + { + private static (EfficientNet._MBConvConfig[], long?) _efficientnet_conf(string arch, double width_mult = 1.0, double depth_mult = 1.0) + { + EfficientNet._MBConvConfig[] inverted_residual_setting; + long? 
last_channel; + + if (arch.StartsWith("efficientnet_b")) { + EfficientNet._MBConvConfig bneck_conf( + double expand_ratio, long kernel, long stride, + long input_channels, long out_channels, long num_layers) => + new EfficientNet.MBConvConfig(expand_ratio, kernel, stride, input_channels, out_channels, num_layers, width_mult, depth_mult); + + inverted_residual_setting = new EfficientNet._MBConvConfig[] { + bneck_conf(1, 3, 1, 32, 16, 1), + bneck_conf(6, 3, 2, 16, 24, 2), + bneck_conf(6, 5, 2, 24, 40, 2), + bneck_conf(6, 3, 2, 40, 80, 3), + bneck_conf(6, 5, 1, 80, 112, 3), + bneck_conf(6, 5, 2, 112, 192, 4), + bneck_conf(6, 3, 1, 192, 320, 1), + }; + last_channel = null; + } else if (arch.StartsWith("efficientnet_v2_s")) { + inverted_residual_setting = new EfficientNet._MBConvConfig[] { + new EfficientNet.FusedMBConvConfig(1, 3, 1, 24, 24, 2), + new EfficientNet.FusedMBConvConfig(4, 3, 2, 24, 48, 4), + new EfficientNet.FusedMBConvConfig(4, 3, 2, 48, 64, 4), + new EfficientNet.MBConvConfig(4, 3, 2, 64, 128, 6), + new EfficientNet.MBConvConfig(6, 3, 1, 128, 160, 9), + new EfficientNet.MBConvConfig(6, 3, 2, 160, 256, 15), + }; + last_channel = 1280; + } else if (arch.StartsWith("efficientnet_v2_m")) { + inverted_residual_setting = new EfficientNet._MBConvConfig[] { + new EfficientNet.FusedMBConvConfig(1, 3, 1, 24, 24, 3), + new EfficientNet.FusedMBConvConfig(4, 3, 2, 24, 48, 5), + new EfficientNet.FusedMBConvConfig(4, 3, 2, 48, 80, 5), + new EfficientNet.MBConvConfig(4, 3, 2, 80, 160, 7), + new EfficientNet.MBConvConfig(6, 3, 1, 160, 176, 14), + new EfficientNet.MBConvConfig(6, 3, 2, 176, 304, 18), + new EfficientNet.MBConvConfig(6, 3, 1, 304, 512, 5), + }; + last_channel = 1280; + } else if (arch.StartsWith("efficientnet_v2_l")) { + inverted_residual_setting = new EfficientNet._MBConvConfig[] { + new EfficientNet.FusedMBConvConfig(1, 3, 1, 32, 32, 4), + new EfficientNet.FusedMBConvConfig(4, 3, 2, 32, 64, 7), + new EfficientNet.FusedMBConvConfig(4, 3, 2, 64, 96, 7), + 
new EfficientNet.MBConvConfig(4, 3, 2, 96, 192, 10), + new EfficientNet.MBConvConfig(6, 3, 1, 192, 224, 19), + new EfficientNet.MBConvConfig(6, 3, 2, 224, 384, 25), + new EfficientNet.MBConvConfig(6, 3, 1, 384, 640, 7), + }; + last_channel = 1280; + } else { + throw new ArgumentException($"Unsupported model type {arch}"); + } + + return (inverted_residual_setting, last_channel); + } + + private static Modules.EfficientNet _efficientnet( + EfficientNet._MBConvConfig[] inverted_residual_setting, + double dropout, + long? last_channel, + long num_classes = 1000, + Func>? norm_layer = null, + string? weights_file = null, + bool skipfc = true, + Device? device = null) + { + var model = new EfficientNet("EfficientNet", inverted_residual_setting, dropout, num_classes: num_classes, norm_layer: norm_layer, last_channel: last_channel); + + if (!string.IsNullOrEmpty(weights_file)) { + model.load(weights_file!, skip: skipfc ? new[] { "classifier.1.weight", "classifier.1.bias" } : null); + } + + if (device != null && device.type != DeviceType.CPU) + model.to(device); + + return model; + } + + /// + /// EfficientNet B0 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b0(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + /// In order for the weights to be loaded, the number of classes has to be the same as + /// in the pre-trained model, which is 1000. + /// + /// It is also possible to skip loading the last linear layer and use it for transfer-learning + /// with a different number of output classes. To do so, pass skipfc=true. + /// + /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB + /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded + /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225]. + /// + public static Modules.EfficientNet efficientnet_b0(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b0", width_mult: 1.0, depth_mult: 1.0); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B1 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. 
+ /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b1(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b1(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b1", width_mult: 1.0, depth_mult: 1.1); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B2 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b2(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b2(int num_classes = 1000, float dropout = 0.3f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b2", width_mult: 1.1, depth_mult: 1.2); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B3 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b3(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b3(int num_classes = 1000, float dropout = 0.3f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b3", width_mult: 1.2, depth_mult: 1.4); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B4 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b4(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b4(int num_classes = 1000, float dropout = 0.4f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b4", width_mult: 1.4, depth_mult: 1.8); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B5 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b5(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b5(int num_classes = 1000, float dropout = 0.4f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b5", width_mult: 1.6, depth_mult: 2.2); + Func> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001, momentum: 0.01); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B6 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b6(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b6(int num_classes = 1000, float dropout = 0.5f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b6", width_mult: 1.8, depth_mult: 2.6); + Func> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001, momentum: 0.01); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// EfficientNet B7 model architecture from the + /// EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks paper. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_b7(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_b7(int num_classes = 1000, float dropout = 0.5f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_b7", width_mult: 2.0, depth_mult: 3.1); + Func> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001, momentum: 0.01); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// Constructs an EfficientNetV2-S architecture from + /// EfficientNetV2: Smaller Models and Faster Training. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_v2_s(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_v2_s(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_v2_s"); + Func> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// Constructs an EfficientNetV2-M architecture from + /// EfficientNetV2: Smaller Models and Faster Training. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_v2_m(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_v2_m(int num_classes = 1000, float dropout = 0.3f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_v2_m"); + Func> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device); + } + + /// + /// Constructs an EfficientNetV2-L architecture from + /// EfficientNetV2: Smaller Models and Faster Training. + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. 
+ /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.efficientnet_v2_l(weights='DEFAULT') + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + public static Modules.EfficientNet efficientnet_v2_l(int num_classes = 1000, float dropout = 0.4f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + var (inverted_residual_setting, last_channel) = _efficientnet_conf("efficientnet_v2_l"); + Func> norm_layer = (features) => nn.BatchNorm2d(features, eps: 0.001); + return _efficientnet(inverted_residual_setting, dropout, last_channel, num_classes, norm_layer: norm_layer, weights_file: weights_file, skipfc: skipfc, device: device); + } + } + } +} diff --git a/src/TorchVision/models/MNASNet.cs b/src/TorchVision/models/MNASNet.cs new file mode 100644 index 000000000..7210f3268 --- /dev/null +++ b/src/TorchVision/models/MNASNet.cs @@ -0,0 +1,299 @@ +// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. 
+ +// A number of implementation details in this file have been translated from the Python version of torchvision, +// largely located in the files found in this folder: +// +// https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py +// +// The origin has the following copyright notice and license: +// +// https://github.com/pytorch/vision/blob/main/LICENSE +// + +using System; +using System.Collections.Generic; + +using static TorchSharp.torch; +using static TorchSharp.torch.nn; + +#nullable enable +namespace TorchSharp +{ + public static partial class torchvision + { + public static partial class models + { + /// + /// MNASNet with depth multiplier of 0.5 from + /// "MnasNet: Platform-Aware Neural Architecture Search for Mobile". + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.mnasnet0_5(pretrained=True) + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + /// In order for the weights to be loaded, the number of classes has to be the same as + /// in the pre-trained model, which is 1000. + /// + /// It is also possible to skip loading the last linear layer and use it for transfer-learning + /// with a different number of output classes. To do so, pass skipfc=true. + /// + /// All pre-trained models expect input images normalized in the same way, i.e. 
mini-batches of 3-channel RGB + /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded + /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225]. + /// + public static Modules.MNASNet mnasnet0_5(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + return new Modules.MNASNet(0.5, num_classes, dropout, weights_file, skipfc, device); + } + + /// + /// MNASNet with depth multiplier of 0.75 from + /// "MnasNet: Platform-Aware Neural Architecture Search for Mobile". + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + public static Modules.MNASNet mnasnet0_75(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + return new Modules.MNASNet(0.75, num_classes, dropout, weights_file, skipfc, device); + } + + /// + /// MNASNet with depth multiplier of 1.0 from + /// "MnasNet: Platform-Aware Neural Architecture Search for Mobile". + /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + public static Modules.MNASNet mnasnet1_0(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + return new Modules.MNASNet(1.0, num_classes, dropout, weights_file, skipfc, device); + } + + /// + /// MNASNet with depth multiplier of 1.3 from + /// "MnasNet: Platform-Aware Neural Architecture Search for Mobile". 
+ /// + /// The number of output classes. + /// The dropout ratio. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + public static Modules.MNASNet mnasnet1_3(int num_classes = 1000, float dropout = 0.2f, string? weights_file = null, bool skipfc = true, Device? device = null) + { + return new Modules.MNASNet(1.3, num_classes, dropout, weights_file, skipfc, device); + } + } + } + + namespace Modules + { + // Based on https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py + // License: https://github.com/pytorch/vision/blob/main/LICENSE + + /// + /// MNASNet, as described in https://arxiv.org/abs/1807.11626. + /// This implements the B1 variant of the model. + /// + public class MNASNet : Module + { + // Paper suggests 0.9997 momentum, for TensorFlow. Equivalent PyTorch momentum is 1.0 - tensorflow. + private const double _BN_MOMENTUM = 1.0 - 0.9997; + + private class _InvertedResidual : Module + { + private readonly bool apply_residual; + private readonly Module layers; + + public _InvertedResidual(string name, long in_ch, long out_ch, long kernel_size, long stride, long expansion_factor, double bn_momentum) + : base(name) + { + if (stride != 1 && stride != 2) + throw new ArgumentOutOfRangeException($"stride should be 1 or 2 instead of {stride}"); + if (kernel_size != 3 && kernel_size != 5) + throw new ArgumentOutOfRangeException($"kernel_size should be 3 or 5 instead of {kernel_size}"); + + var mid_ch = in_ch * expansion_factor; + apply_residual = in_ch == out_ch && stride == 1; + layers = Sequential( + // Pointwise + Conv2d(in_ch, mid_ch, 1, bias: false), + BatchNorm2d(mid_ch, momentum: bn_momentum), + ReLU(inplace: true), + // Depthwise + Conv2d(mid_ch, mid_ch, kernel_size, padding: kernel_size / 2, stride: stride, groups: mid_ch, bias: false), + BatchNorm2d(mid_ch, momentum: 
bn_momentum), + ReLU(inplace: true), + // Linear pointwise. Note that there's no activation. + Conv2d(mid_ch, out_ch, 1, bias: false), + BatchNorm2d(out_ch, momentum: bn_momentum) + ); + RegisterComponents(); + } + + protected override void Dispose(bool disposing) + { + if (disposing) { + layers.Dispose(); + } + base.Dispose(disposing); + } + + public override Tensor forward(Tensor input) + { + if (apply_residual) { + return layers.call(input) + input; + } else { + return layers.call(input); + } + } + } + + /// + /// Creates a stack of inverted residuals. + /// + private static Module _stack(long in_ch, long out_ch, long kernel_size, long stride, long exp_factor, int repeats, double bn_momentum) + { + if (repeats < 1) + throw new ArgumentOutOfRangeException($"repeats should be >= 1, instead got {repeats}"); + + var modules = new List>(); + // First one has no skip, because feature map size changes. + modules.Add(new _InvertedResidual("_InvertedResidual", in_ch, out_ch, kernel_size, stride, exp_factor, bn_momentum)); + for (int i = 1; i < repeats; i++) { + modules.Add(new _InvertedResidual("_InvertedResidual", out_ch, out_ch, kernel_size, 1, exp_factor, bn_momentum)); + } + return Sequential(modules); + } + + /// + /// Asymmetric rounding to make val divisible by divisor. + /// With default bias, will round up, unless the number is no more than 10% greater + /// than the smaller divisible value, i.e. (83, 8) -> 80, but (84, 8) -> 88. + /// + private static int _round_to_multiple_of(double val, int divisor, double round_up_bias = 0.9) + { + if (round_up_bias <= 0.0 || round_up_bias >= 1.0) + throw new ArgumentOutOfRangeException($"round_up_bias should be greater than 0.0 and smaller than 1.0 instead of {round_up_bias}"); + var new_val = Math.Max(divisor, (int)(val + divisor / 2) / divisor * divisor); + return new_val >= round_up_bias * val ? 
new_val : new_val + divisor; + } + + /// + /// Scales tensor depths as in reference MobileNet code, prefers rounding up rather than down. + /// + private static int[] _get_depths(double alpha) + { + var depths = new int[] { 32, 16, 24, 40, 80, 96, 192, 320 }; + var result = new int[depths.Length]; + for (int i = 0; i < depths.Length; i++) { + result[i] = _round_to_multiple_of(depths[i] * alpha, 8); + } + return result; + } + + private readonly Module layers; + private readonly Module classifier; + + protected override void Dispose(bool disposing) + { + if (disposing) { + layers.Dispose(); + classifier.Dispose(); + } + base.Dispose(disposing); + } + + public MNASNet(double alpha, int num_classes = 1000, float dropout = 0.2f, + string? weights_file = null, bool skipfc = true, Device? device = null) + : base(nameof(MNASNet)) + { + if (alpha <= 0.0) + throw new ArgumentOutOfRangeException($"alpha should be greater than 0.0 instead of {alpha}"); + + var depths = _get_depths(alpha); + var layerList = new List> { + // First layer: regular conv. + Conv2d(3, depths[0], 3, padding: 1, stride: 2, bias: false), + BatchNorm2d(depths[0], momentum: _BN_MOMENTUM), + ReLU(inplace: true), + // Depthwise separable, no skip. + Conv2d(depths[0], depths[0], 3, padding: 1, stride: 1, groups: depths[0], bias: false), + BatchNorm2d(depths[0], momentum: _BN_MOMENTUM), + ReLU(inplace: true), + Conv2d(depths[0], depths[1], 1, padding: 0L, stride: 1, bias: false), + BatchNorm2d(depths[1], momentum: _BN_MOMENTUM), + // MNASNet blocks: stacks of inverted residuals. + _stack(depths[1], depths[2], 3, 2, 3, 3, _BN_MOMENTUM), + _stack(depths[2], depths[3], 5, 2, 3, 3, _BN_MOMENTUM), + _stack(depths[3], depths[4], 5, 2, 6, 3, _BN_MOMENTUM), + _stack(depths[4], depths[5], 3, 1, 6, 2, _BN_MOMENTUM), + _stack(depths[5], depths[6], 5, 2, 6, 4, _BN_MOMENTUM), + _stack(depths[6], depths[7], 3, 1, 6, 1, _BN_MOMENTUM), + // Final mapping to classifier input. 
+ Conv2d(depths[7], 1280, 1, padding: 0L, stride: 1, bias: false), + BatchNorm2d(1280, momentum: _BN_MOMENTUM), + ReLU(inplace: true), + }; + layers = Sequential(layerList); + classifier = Sequential( + Dropout(p: dropout, inplace: true), + Linear(1280, num_classes) + ); + + RegisterComponents(); + + // Weight initialization + foreach (var (_, m) in named_modules()) { + if (m is Modules.Conv2d conv) { + init.kaiming_normal_(conv.weight, mode: init.FanInOut.FanOut); + if (conv.bias is not null) + init.zeros_(conv.bias); + } else if (m is Modules.BatchNorm2d norm) { + init.ones_(norm.weight); + init.zeros_(norm.bias); + } else if (m is Modules.Linear linear) { + init.kaiming_uniform_(linear.weight, mode: init.FanInOut.FanOut, nonlinearity: init.NonlinearityType.Sigmoid); + init.zeros_(linear.bias); + } + } + + if (!string.IsNullOrEmpty(weights_file)) { + this.load(weights_file!, skip: skipfc ? new[] { "classifier.1.weight", "classifier.1.bias" } : null); + } + + if (device != null && device.type != DeviceType.CPU) + this.to(device); + } + + public override Tensor forward(Tensor x) + { + using (var _ = NewDisposeScope()) { + x = layers.call(x); + // Equivalent to global avgpool and removing H and W dimensions. + x = x.mean(new long[] { 2, 3 }); + return classifier.call(x).MoveToOuterDisposeScope(); + } + } + } + } +} diff --git a/src/TorchVision/models/ShuffleNetV2.cs b/src/TorchVision/models/ShuffleNetV2.cs new file mode 100644 index 000000000..3c7b5348e --- /dev/null +++ b/src/TorchVision/models/ShuffleNetV2.cs @@ -0,0 +1,316 @@ +// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. 
+ +// A number of implementation details in this file have been translated from the Python version of torchvision, +// largely located in the files found in this folder: +// +// https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py +// +// The origin has the following copyright notice and license: +// +// https://github.com/pytorch/vision/blob/main/LICENSE +// + +using System; +using System.Collections.Generic; +using static TorchSharp.torch; +using static TorchSharp.torch.nn; + +#nullable enable +namespace TorchSharp +{ + public static partial class torchvision + { + public static partial class models + { + /// + /// ShuffleNet V2 with 0.5x output channels, as described in + /// "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design". + /// + /// The number of output classes. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + /// + /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict + /// using the exportsd.py script, then loading into the .NET instance: + /// + /// from torchvision import models + /// import exportsd + /// + /// model = models.shufflenet_v2_x0_5(pretrained=True) + /// f = open("model_weights.dat", "wb") + /// exportsd.save_state_dict(model.state_dict(), f) + /// f.close() + /// + /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md + /// + /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB + /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded + /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225]. + /// + public static Modules.ShuffleNetV2 shufflenet_v2_x0_5( + int num_classes = 1000, + string? 
weights_file = null, + bool skipfc = true, + Device? device = null) + { + return new Modules.ShuffleNetV2( + new int[] { 4, 8, 4 }, + new int[] { 24, 48, 96, 192, 1024 }, + num_classes, weights_file, skipfc, device); + } + + /// + /// ShuffleNet V2 with 1.0x output channels, as described in + /// "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design". + /// + /// The number of output classes. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + public static Modules.ShuffleNetV2 shufflenet_v2_x1_0( + int num_classes = 1000, + string? weights_file = null, + bool skipfc = true, + Device? device = null) + { + return new Modules.ShuffleNetV2( + new int[] { 4, 8, 4 }, + new int[] { 24, 116, 232, 464, 1024 }, + num_classes, weights_file, skipfc, device); + } + + /// + /// ShuffleNet V2 with 1.5x output channels, as described in + /// "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design". + /// + /// The number of output classes. + /// The location of a file containing pre-trained weights for the model. + /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + public static Modules.ShuffleNetV2 shufflenet_v2_x1_5( + int num_classes = 1000, + string? weights_file = null, + bool skipfc = true, + Device? device = null) + { + return new Modules.ShuffleNetV2( + new int[] { 4, 8, 4 }, + new int[] { 24, 176, 352, 704, 1024 }, + num_classes, weights_file, skipfc, device); + } + + /// + /// ShuffleNet V2 with 2.0x output channels, as described in + /// "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design". + /// + /// The number of output classes. + /// The location of a file containing pre-trained weights for the model. 
+ /// If true, the last linear layer of the classifier will not be loaded from the weights file. + /// The device to locate the model on. + public static Modules.ShuffleNetV2 shufflenet_v2_x2_0( + int num_classes = 1000, + string? weights_file = null, + bool skipfc = true, + Device? device = null) + { + return new Modules.ShuffleNetV2( + new int[] { 4, 8, 4 }, + new int[] { 24, 244, 488, 976, 2048 }, + num_classes, weights_file, skipfc, device); + } + } + } + + namespace Modules + { + // Based on https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py + // License: https://github.com/pytorch/vision/blob/main/LICENSE + + public class ShuffleNetV2 : Module + { + private static Tensor channel_shuffle(Tensor x, int groups) + { + var batchsize = x.shape[0]; + var num_channels = x.shape[1]; + var height = x.shape[2]; + var width = x.shape[3]; + var channels_per_group = num_channels / groups; + + x = x.view(batchsize, groups, channels_per_group, height, width); + x = x.transpose(1, 2).contiguous(); + x = x.view(batchsize, num_channels, height, width); + return x; + } + + private static Module depthwise_conv( + long i, long o, long kernel_size, long stride = 1, long padding = 0, bool bias = false) + { + return Conv2d(i, o, kernel_size: kernel_size, stride: stride, padding: padding, bias: bias, groups: i); + } + + private class InvertedResidual : Module + { + private readonly Module branch1; + private readonly Module branch2; + private readonly int _stride; + + public InvertedResidual(string name, long inp, long oup, int stride) : base(name) + { + if (stride < 1 || stride > 3) + throw new ArgumentException("illegal stride value", nameof(stride)); + + _stride = stride; + var branch_features = oup / 2; + + if (stride > 1) { + branch1 = Sequential( + depthwise_conv(inp, inp, kernel_size: 3, stride: stride, padding: 1), + BatchNorm2d(inp), + Conv2d(inp, branch_features, kernel_size: 1, stride: 1, padding: 0L, bias: false), + BatchNorm2d(branch_features), 
+                    ReLU(inplace: true)
+                );
+            } else {
+                branch1 = Sequential();
+            }
+
+            branch2 = Sequential(
+                Conv2d(stride > 1 ? inp : branch_features, branch_features, kernel_size: 1, stride: 1, padding: 0L, bias: false),
+                BatchNorm2d(branch_features),
+                ReLU(inplace: true),
+                depthwise_conv(branch_features, branch_features, kernel_size: 3, stride: stride, padding: 1),
+                BatchNorm2d(branch_features),
+                Conv2d(branch_features, branch_features, kernel_size: 1, stride: 1, padding: 0L, bias: false),
+                BatchNorm2d(branch_features),
+                ReLU(inplace: true)
+            );
+
+            RegisterComponents();
+        }
+
+        protected override void Dispose(bool disposing)
+        {
+            if (disposing) {
+                branch1.Dispose();
+                branch2.Dispose();
+            }
+            base.Dispose(disposing);
+        }
+
+        public override Tensor forward(Tensor x)
+        {
+            Tensor @out;
+            if (_stride == 1) {
+                // Stride-1 unit: split channels in half, transform one half, keep the other.
+                var chunks = x.chunk(2, dim: 1);
+                @out = torch.cat(new[] { chunks[0], branch2.call(chunks[1]) }, 1);
+            } else {
+                // Downsampling unit: both branches see the full input.
+                @out = torch.cat(new[] { branch1.call(x), branch2.call(x) }, 1);
+            }
+            @out = channel_shuffle(@out, 2);
+            return @out;
+        }
+    }
+
+    private readonly Module<Tensor, Tensor> conv1;
+    private readonly Module<Tensor, Tensor> maxpool;
+    private readonly Module<Tensor, Tensor> stage2;
+    private readonly Module<Tensor, Tensor> stage3;
+    private readonly Module<Tensor, Tensor> stage4;
+    private readonly Module<Tensor, Tensor> conv5;
+    private readonly Module<Tensor, Tensor> fc;
+
+    protected override void Dispose(bool disposing)
+    {
+        if (disposing) {
+            conv1.Dispose(); maxpool.Dispose();
+            stage2.Dispose(); stage3.Dispose(); stage4.Dispose();
+            conv5.Dispose(); fc.Dispose();
+        }
+        base.Dispose(disposing);
+    }
+
+    // Builds one ShuffleNetV2 stage: a stride-2 downsampling unit followed by (repeats - 1) stride-1 units.
+    private static Module<Tensor, Tensor> MakeStage(long input_channels, long output_channels, int repeats)
+    {
+        var modules = new List<Module<Tensor, Tensor>>();
+        modules.Add(new InvertedResidual("InvertedResidual", input_channels, output_channels, 2));
+        for (int i = 0; i < repeats - 1; i++) {
+            modules.Add(new InvertedResidual("InvertedResidual", output_channels, output_channels, 1));
+        }
+        return Sequential(modules.ToArray());
+    }
+
+    /// <summary>
+    /// ShuffleNet V2 main class.
+    /// </summary>
+    /// <param name="stages_repeats">Number of repeated blocks in each stage.</param>
+    /// <param name="stages_out_channels">Output channels for each stage.</param>
+    /// <param name="num_classes">Number of output classes.</param>
+    /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+    /// <param name="skipfc">If true, the last linear layer will not be loaded from the weights file.</param>
+    /// <param name="device">The device to locate the model on.</param>
+    public ShuffleNetV2(
+        int[] stages_repeats,
+        int[] stages_out_channels,
+        int num_classes = 1000,
+        string? weights_file = null,
+        bool skipfc = true,
+        Device? device = null) : base(nameof(ShuffleNetV2))
+    {
+        if (stages_repeats.Length != 3)
+            throw new ArgumentException("expected stages_repeats to have 3 elements");
+        if (stages_out_channels.Length != 5)
+            throw new ArgumentException("expected stages_out_channels to have 5 elements");
+
+        long input_channels = 3;
+        long output_channels = stages_out_channels[0];
+
+        conv1 = Sequential(
+            Conv2d(input_channels, output_channels, kernel_size: 3, stride: 2, padding: 1, bias: false),
+            BatchNorm2d(output_channels),
+            ReLU(inplace: true)
+        );
+        input_channels = output_channels;
+
+        maxpool = MaxPool2d(kernel_size: 3, stride: 2, padding: 1);
+
+        stage2 = MakeStage(input_channels, stages_out_channels[1], stages_repeats[0]);
+        stage3 = MakeStage(stages_out_channels[1], stages_out_channels[2], stages_repeats[1]);
+        stage4 = MakeStage(stages_out_channels[2], stages_out_channels[3], stages_repeats[2]);
+
+        output_channels = stages_out_channels[4];
+        conv5 = Sequential(
+            Conv2d(stages_out_channels[3], output_channels, kernel_size: 1, stride: 1, padding: 0L, bias: false),
+            BatchNorm2d(output_channels),
+            ReLU(inplace: true)
+        );
+
+        fc = Linear(output_channels, num_classes);
+
+        RegisterComponents();
+
+        if (!string.IsNullOrEmpty(weights_file)) {
+            this.load(weights_file!, skip: skipfc ? new[] { "fc.weight", "fc.bias" } : null);
+        }
+
+        if (device != null && device.type != DeviceType.CPU)
+            this.to(device);
+    }
+
+    public override Tensor forward(Tensor x)
+    {
+        using (var _ = NewDisposeScope()) {
+            x = conv1.call(x);
+            x = maxpool.call(x);
+            x = stage2.call(x);
+            x = stage3.call(x);
+            x = stage4.call(x);
+            x = conv5.call(x);
+            x = x.mean(new long[] { 2, 3 }); // global pool
+            x = fc.call(x);
+            return x.MoveToOuterDisposeScope();
+        }
+    }
+}
+}
+}
diff --git a/src/TorchVision/models/SqueezeNet.cs b/src/TorchVision/models/SqueezeNet.cs
new file mode 100644
index 000000000..34df94020
--- /dev/null
+++ b/src/TorchVision/models/SqueezeNet.cs
@@ -0,0 +1,257 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+
+// A number of implementation details in this file have been translated from the Python version of torchvision,
+// largely located in the files found in this folder:
+//
+// https://github.com/pytorch/vision/blob/main/torchvision/models/squeezenet.py
+//
+// The origin has the following copyright notice and license:
+//
+// https://github.com/pytorch/vision/blob/main/LICENSE
+//
+
+using System;
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+
+#nullable enable
+namespace TorchSharp
+{
+    public static partial class torchvision
+    {
+        public static partial class models
+        {
+            /// <summary>
+            /// SqueezeNet 1.0 model from
+            /// "SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and less than 0.5MB model size".
+            /// </summary>
+            /// <param name="num_classes">The number of output classes.</param>
+            /// <param name="dropout">The dropout ratio.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last convolutional layer of the classifier will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            /// <remarks>
+            /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+            /// using the exportsd.py script, then loading into the .NET instance:
+            ///
+            /// from torchvision import models
+            /// import exportsd
+            ///
+            /// model = models.squeezenet1_0(pretrained=True)
+            /// f = open("model_weights.dat", "wb")
+            /// exportsd.save_state_dict(model.state_dict(), f)
+            /// f.close()
+            ///
+            /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+            ///
+            /// In order for the weights to be loaded, the number of classes has to be the same as
+            /// in the pre-trained model, which is 1000.
+            ///
+            /// It is also possible to skip loading the last classifier layer and use it for transfer-learning
+            /// with a different number of output classes. To do so, pass skipfc=true.
+            ///
+            /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB
+            /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded
+            /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
+            /// </remarks>
+            public static Modules.SqueezeNet squeezenet1_0(
+                int num_classes = 1000,
+                float dropout = 0.5f,
+                string? weights_file = null,
+                bool skipfc = true,
+                Device? device = null)
+            {
+                return new Modules.SqueezeNet("1_0", num_classes, dropout, weights_file, skipfc, device);
+            }
+
+            /// <summary>
+            /// SqueezeNet 1.1 model from the official SqueezeNet repo.
+            /// SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters than SqueezeNet 1.0.
+            /// </summary>
+            /// <param name="num_classes">The number of output classes.</param>
+            /// <param name="dropout">The dropout ratio.</param>
+            /// <param name="weights_file">The location of a file containing pre-trained weights for the model.</param>
+            /// <param name="skipfc">If true, the last convolutional layer of the classifier will not be loaded from the weights file.</param>
+            /// <param name="device">The device to locate the model on.</param>
+            /// <remarks>
+            /// Pre-trained weights may be retrieved by using Pytorch and saving the model state-dict
+            /// using the exportsd.py script, then loading into the .NET instance:
+            ///
+            /// from torchvision import models
+            /// import exportsd
+            ///
+            /// model = models.squeezenet1_1(pretrained=True)
+            /// f = open("model_weights.dat", "wb")
+            /// exportsd.save_state_dict(model.state_dict(), f)
+            /// f.close()
+            ///
+            /// See also: https://github.com/dotnet/TorchSharp/blob/main/docfx/articles/saveload.md
+            ///
+            /// In order for the weights to be loaded, the number of classes has to be the same as
+            /// in the pre-trained model, which is 1000.
+            ///
+            /// It is also possible to skip loading the last classifier layer and use it for transfer-learning
+            /// with a different number of output classes. To do so, pass skipfc=true.
+            ///
+            /// All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB
+            /// images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded
+            /// in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].
+            /// </remarks>
+            public static Modules.SqueezeNet squeezenet1_1(
+                int num_classes = 1000,
+                float dropout = 0.5f,
+                string? weights_file = null,
+                bool skipfc = true,
+                Device? device = null)
+            {
+                return new Modules.SqueezeNet("1_1", num_classes, dropout, weights_file, skipfc, device);
+            }
+        }
+    }
+
+    namespace Modules
+    {
+        // Based on https://github.com/pytorch/vision/blob/main/torchvision/models/squeezenet.py
+        // License: https://github.com/pytorch/vision/blob/main/LICENSE
+
+        public class SqueezeNet : Module<Tensor, Tensor>
+        {
+            // Fire module: a 1x1 "squeeze" convolution feeding parallel 1x1 and 3x3 "expand" convolutions,
+            // whose outputs are concatenated along the channel dimension.
+            private class Fire : Module<Tensor, Tensor>
+            {
+                private readonly Module<Tensor, Tensor> squeeze;
+                private readonly Module<Tensor, Tensor> squeeze_activation;
+                private readonly Module<Tensor, Tensor> expand1x1;
+                private readonly Module<Tensor, Tensor> expand1x1_activation;
+                private readonly Module<Tensor, Tensor> expand3x3;
+                private readonly Module<Tensor, Tensor> expand3x3_activation;
+
+                public Fire(string name, int inplanes, int squeeze_planes, int expand1x1_planes, int expand3x3_planes)
+                    : base(name)
+                {
+                    squeeze = Conv2d(inplanes, squeeze_planes, kernel_size: 1);
+                    squeeze_activation = ReLU(inplace: true);
+                    expand1x1 = Conv2d(squeeze_planes, expand1x1_planes, kernel_size: 1);
+                    expand1x1_activation = ReLU(inplace: true);
+                    expand3x3 = Conv2d(squeeze_planes, expand3x3_planes, kernel_size: 3, padding: 1);
+                    expand3x3_activation = ReLU(inplace: true);
+                    RegisterComponents();
+                }
+
+                protected override void Dispose(bool disposing)
+                {
+                    if (disposing) {
+                        squeeze.Dispose();
+                        squeeze_activation.Dispose();
+                        expand1x1.Dispose();
+                        expand1x1_activation.Dispose();
+                        expand3x3.Dispose();
+                        expand3x3_activation.Dispose();
+                    }
+                    base.Dispose(disposing);
+                }
+
+                public override Tensor forward(Tensor x)
+                {
+                    x = squeeze_activation.call(squeeze.call(x));
+                    return torch.cat(new[] {
+                        expand1x1_activation.call(expand1x1.call(x)),
+                        expand3x3_activation.call(expand3x3.call(x))
+                    }, 1);
+                }
+            }
+
+            private readonly Module<Tensor, Tensor> features;
+            private readonly Module<Tensor, Tensor> classifier;
+
+            protected override void Dispose(bool disposing)
+            {
+                if (disposing) {
+                    features.Dispose();
+                    classifier.Dispose();
+                }
+                base.Dispose(disposing);
+            }
+
+            public SqueezeNet(string version, int num_classes = 1000, float dropout = 0.5f,
+                string? weights_file = null, bool skipfc = true, Device? device = null)
+                : base(nameof(SqueezeNet))
+            {
+                Module<Tensor, Tensor> final_conv;
+
+                if (version == "1_0") {
+                    features = Sequential(
+                        Conv2d(3, 96, kernel_size: 7, stride: 2),
+                        ReLU(inplace: true),
+                        MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+                        new Fire("Fire", 96, 16, 64, 64),
+                        new Fire("Fire", 128, 16, 64, 64),
+                        new Fire("Fire", 128, 32, 128, 128),
+                        MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+                        new Fire("Fire", 256, 32, 128, 128),
+                        new Fire("Fire", 256, 48, 192, 192),
+                        new Fire("Fire", 384, 48, 192, 192),
+                        new Fire("Fire", 384, 64, 256, 256),
+                        MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+                        new Fire("Fire", 512, 64, 256, 256)
+                    );
+                } else if (version == "1_1") {
+                    features = Sequential(
+                        Conv2d(3, 64, kernel_size: 3, stride: 2),
+                        ReLU(inplace: true),
+                        MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+                        new Fire("Fire", 64, 16, 64, 64),
+                        new Fire("Fire", 128, 16, 64, 64),
+                        MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+                        new Fire("Fire", 128, 32, 128, 128),
+                        new Fire("Fire", 256, 32, 128, 128),
+                        MaxPool2d(kernel_size: 3, stride: 2, ceil_mode: true),
+                        new Fire("Fire", 256, 48, 192, 192),
+                        new Fire("Fire", 384, 48, 192, 192),
+                        new Fire("Fire", 384, 64, 256, 256),
+                        new Fire("Fire", 512, 64, 256, 256)
+                    );
+                } else {
+                    throw new ArgumentException($"Unsupported SqueezeNet version {version}: 1_0 or 1_1 expected");
+                }
+
+                final_conv = Conv2d(512, num_classes, kernel_size: 1);
+                classifier = Sequential(
+                    Dropout(p: dropout),
+                    final_conv,
+                    ReLU(inplace: true),
+                    AdaptiveAvgPool2d(new long[] { 1, 1 })
+                );
+
+                RegisterComponents();
+
+                if (string.IsNullOrEmpty(weights_file)) {
+                    // No weights file: initialize convolutions the same way torchvision does.
+                    foreach (var (_, m) in named_modules()) {
+                        if (m is Modules.Conv2d conv) {
+                            if (object.ReferenceEquals(m, final_conv)) {
+                                nn.init.normal_(conv.weight, mean: 0.0, std: 0.01);
+                            } else {
+                                nn.init.kaiming_uniform_(conv.weight);
+                            }
+                            if (conv.bias is not null)
+                                nn.init.constant_(conv.bias, 0);
+                        }
+                    }
+                } else {
+                    this.load(weights_file!, skip: skipfc ? new[] { "classifier.1.weight", "classifier.1.bias" } : null);
+                }
+
+                if (device != null && device.type != DeviceType.CPU)
+                    this.to(device);
+            }
+
+            public override Tensor forward(Tensor x)
+            {
+                using (var _ = NewDisposeScope()) {
+                    x = features.call(x);
+                    x = classifier.call(x);
+                    return torch.flatten(x, 1).MoveToOuterDisposeScope();
+                }
+            }
+        }
+    }
+}
diff --git a/test/TorchSharpTest/TestTorchVision.cs b/test/TorchSharpTest/TestTorchVision.cs
index c8f1bc341..e534ef36a 100644
--- a/test/TorchSharpTest/TestTorchVision.cs
+++ b/test/TorchSharpTest/TestTorchVision.cs
@@ -799,6 +799,203 @@ public void TestMobileNetV3()
             }
         }
 
+        [Fact]
+        public void TestSqueezeNet()
+        {
+            {
+                using var model = squeezenet1_0();
+                var sd = model.state_dict();
+                Assert.Equal(52, sd.Count);
+                var names = model.named_children().Select(nm => nm.name).ToArray();
+                Assert.Multiple(
+                    () => Assert.Equal("features", names[0]),
+                    () => Assert.Equal("classifier", names[1])
+                );
+
+                using var input = torch.randn(2, 3, 224, 224);
+                using var output = model.call(input);
+
+                Assert.Equal(new long[] { 2, 1000 }, output.shape);
+            }
+            {
+                using var model = squeezenet1_1();
+                var sd = model.state_dict();
+                Assert.Equal(52, sd.Count);
+                var names = model.named_children().Select(nm => nm.name).ToArray();
+                Assert.Multiple(
+                    () => Assert.Equal("features", names[0]),
+                    () => Assert.Equal("classifier", names[1])
+                );
+
+                using var input = torch.randn(2, 3, 224, 224);
+                using var output = model.call(input);
+
+                Assert.Equal(new long[] { 2, 1000 }, output.shape);
+            }
+        }
+
+        [Fact]
+        public void TestDenseNet121()
+        {
+            using var model = densenet121();
+            var sd = model.state_dict();
+            Assert.Equal(727, sd.Count);
+            var names = model.named_children().Select(nm => nm.name).ToArray();
+            Assert.Multiple(
+                () => Assert.Equal("features", names[0]),
+                () => Assert.Equal("classifier", names[1])
+            );
+
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestDenseNet161()
+        {
+            using var model = densenet161();
+            var names = model.named_children().Select(nm => nm.name).ToArray();
+            Assert.Multiple(
+                () => Assert.Equal("features", names[0]),
+                () => Assert.Equal("classifier", names[1])
+            );
+
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestDenseNet169()
+        {
+            using var model = densenet169();
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestDenseNet201()
+        {
+            using var model = densenet201();
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
+        [Fact]
+        public void TestShuffleNetV2()
+        {
+            using (var model = shufflenet_v2_x1_0()) {
+                var names = model.named_children().Select(nm => nm.name).ToArray();
+                Assert.Multiple(
+                    () => Assert.Equal("conv1", names[0]),
+                    () => Assert.Equal("maxpool", names[1]),
+                    () => Assert.Equal("stage2", names[2]),
+                    () => Assert.Equal("stage3", names[3]),
+                    () => Assert.Equal("stage4", names[4]),
+                    () => Assert.Equal("conv5", names[5]),
+                    () => Assert.Equal("fc", names[6])
+                );
+
+                using var input = torch.randn(2, 3, 224, 224);
+                using var output = model.call(input);
+
+                Assert.Equal(new long[] { 2, 1000 }, output.shape);
+            }
+
+            using (var model = shufflenet_v2_x0_5()) {
+                using var input = torch.randn(2, 3, 224, 224);
+                using var output = model.call(input);
+                Assert.Equal(new long[] { 2, 1000 }, output.shape);
+            }
+        }
+
+        [Fact]
+        public void TestEfficientNetB0()
+        {
+            using var model = efficientnet_b0();
+            var sd = model.state_dict();
+            Assert.Equal(360, sd.Count);
+            var names = model.named_children().Select(nm => nm.name).ToArray();
+            Assert.Multiple(
+                () => Assert.Equal("features", names[0]),
+                () => Assert.Equal("avgpool", names[1]),
+                () => Assert.Equal("classifier", names[2])
+            );
+
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetV2S()
+        {
+            using var model = efficientnet_v2_s();
+            var sd = model.state_dict();
+            Assert.Equal(782, sd.Count);
+            var names = model.named_children().Select(nm => nm.name).ToArray();
+            Assert.Multiple(
+                () => Assert.Equal("features", names[0]),
+                () => Assert.Equal("avgpool", names[1]),
+                () => Assert.Equal("classifier", names[2])
+            );
+
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB1() { using var model = efficientnet_b1(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB2() { using var model = efficientnet_b2(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB3() { using var model = efficientnet_b3(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB4() { using var model = efficientnet_b4(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB5() { using var model = efficientnet_b5(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB6() { using var model = efficientnet_b6(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetB7() { using var model = efficientnet_b7(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetV2M() { using var model = efficientnet_v2_m(); }
+
+        [Fact(Skip = "The test takes too long to run and causes trouble in CI/CD, since it uses a lot of memory.")]
+        public void TestEfficientNetV2L() { using var model = efficientnet_v2_l(); }
+
+        [Fact]
+        public void TestMNASNet()
+        {
+            using var model = mnasnet1_0();
+            var sd = model.state_dict();
+            var names = model.named_children().Select(nm => nm.name).ToArray();
+            Assert.Multiple(
+                () => Assert.Equal("layers", names[0]),
+                () => Assert.Equal("classifier", names[1])
+            );
+
+            using var input = torch.randn(2, 3, 224, 224);
+            using var output = model.call(input);
+
+            Assert.Equal(new long[] { 2, 1000 }, output.shape);
+        }
+
         [Fact]
         public void TestReadingAndWritingImages()
         {