Merged · Changes from 2 commits
1 change: 0 additions & 1 deletion NuGet.config
@@ -5,7 +5,6 @@
</solution>
<packageSources>
<clear />
<add key="darc-pub-dotnet-maintenance-packages-ab95a1f1" value="https://dnceng.pkgs.visualstudio.com/public/_packaging/darc-pub-dotnet-maintenance-packages-ab95a1f1/nuget/v3/index.json" />
<add key="dotnet-public" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-public/nuget/v3/index.json" />
<add key="dotnet-tools" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-tools/nuget/v3/index.json" />
<add key="dotnet-libraries" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-libraries/nuget/v3/index.json" />
5 changes: 1 addition & 4 deletions src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj
@@ -6,6 +6,7 @@
<IsPackable>true</IsPackable>
<PackageDescription>Microsoft.ML.Tokenizers contains the implementation of the tokenization used in the NLP transforms.</PackageDescription>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<NoWarn>$(NoWarn);MSML_NoInstanceInitializers</NoWarn>
</PropertyGroup>

<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
@@ -16,10 +17,6 @@
<Compile Remove="Utils/Helpers.netstandard.cs" />
</ItemGroup>

<ItemGroup>
<PackageReference Include="Google.Protobuf" />
</ItemGroup>

<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
<PackageReference Include="Microsoft.Bcl.HashCode" />
<PackageReference Include="Microsoft.Bcl.Memory" />
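With the Google.Protobuf package reference gone, the SentencePiece model file, which is a serialized protobuf message, now has to be parsed by hand (see the SentencepieceModel.cs rewrite below). As a rough sketch of the lowest-level routine such a reader needs, and not the PR's actual code, here is a base-128 varint decoder, the encoding protobuf uses for field headers and integer scalars:

using System.IO;

internal static class VarintSketch
{
    // Protobuf varints pack 7 payload bits per byte, least significant
    // group first; a set high bit means "more bytes follow".
    public static ulong ReadVarint(Stream stream)
    {
        ulong value = 0;
        int shift = 0;
        while (true)
        {
            int b = stream.ReadByte();
            if (b < 0)
                throw new EndOfStreamException("Truncated varint.");
            value |= (ulong)(b & 0x7F) << shift;
            if ((b & 0x80) == 0)
                return value;
            shift += 7;
            if (shift >= 64)
                throw new InvalidDataException("Varint too long.");
        }
    }
}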
7 changes: 2 additions & 5 deletions src/Microsoft.ML.Tokenizers/Model/BertOptions.cs
@@ -1,4 +1,4 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

@@ -9,7 +9,6 @@ namespace Microsoft.ML.Tokenizers
/// </summary>
public sealed class BertOptions : WordPieceOptions
{
#pragma warning disable MSML_NoInstanceInitializers
/// <summary>
/// Gets or sets a value indicating whether to lowercase the input before tokenization.
/// </summary>
@@ -66,7 +65,5 @@ public sealed class BertOptions : WordPieceOptions
/// Gets or sets a value indicating whether to remove non-spacing marks.
/// </summary>
public bool RemoveNonSpacingMarks { get; set; }

#pragma warning restore MSML_NoInstanceInitializers
}
}
6 changes: 2 additions & 4 deletions src/Microsoft.ML.Tokenizers/Model/WordPieceOptions.cs
@@ -1,4 +1,4 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

@@ -11,7 +11,6 @@ namespace Microsoft.ML.Tokenizers
/// </summary>
public class WordPieceOptions
{
#pragma warning disable MSML_NoInstanceInitializers
internal const int DefaultMaxInputCharsPerWord = 100;
internal const string DefaultContinuingSubwordPrefix = "##";

@@ -44,6 +43,5 @@ public class WordPieceOptions
/// Gets or sets the maximum number of characters to consider for a single word.
/// </summary>
public int MaxInputCharsPerWord { get; set; } = DefaultMaxInputCharsPerWord;
#pragma warning restore MSML_NoInstanceInitializers
}
}
5,029 changes: 374 additions & 4,655 deletions src/Microsoft.ML.Tokenizers/SentencepieceModel.cs

Large diffs are not rendered by default.
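The 4,655 lines deleted here are the Google.Protobuf-generated bindings for the SentencePiece model proto; the 374 lines that replace them amount to a compact hand-written reader. Since the diff is not rendered, the skeleton below illustrates what any such reader must do: decode each field header, dispatch on the wire type, and check that declared lengths actually fit in the stream. The shape is an assumption for illustration, not the PR's implementation, and payload decoding is elided:

using System.IO;

internal static class ModelProtoSketch
{
    // Walks one serialized message on a seekable stream, validating
    // structure; a real reader would decode payloads into model fields.
    public static void Skim(Stream s)
    {
        while (s.Position < s.Length)
        {
            ulong tag = ReadVarint(s);
            int fieldNumber = (int)(tag >> 3);
            switch ((int)(tag & 0x7)) // wire type
            {
                case 0: ReadVarint(s); break;                 // varint scalar
                case 1: s.Seek(8, SeekOrigin.Current); break; // fixed64
                case 5: s.Seek(4, SeekOrigin.Current); break; // fixed32, e.g. piece scores
                case 2: // length-delimited: string, bytes, or nested message
                    ulong length = ReadVarint(s);
                    if (length > (ulong)(s.Length - s.Position))
                        throw new InvalidDataException($"Field {fieldNumber} declares {length} bytes but fewer remain.");
                    s.Seek((long)length, SeekOrigin.Current);
                    break;
                default:
                    throw new InvalidDataException($"Unexpected wire type on field {fieldNumber}.");
            }
        }
    }

    // Same routine as the varint sketch under the csproj change above.
    private static ulong ReadVarint(Stream s)
    {
        ulong value = 0;
        for (int shift = 0; ; shift += 7)
        {
            int b = s.ReadByte();
            if (b < 0) throw new EndOfStreamException("Truncated varint.");
            value |= (ulong)(b & 0x7F) << shift;
            if ((b & 0x80) == 0) return value;
        }
    }
}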

6 changes: 6 additions & 0 deletions test/Microsoft.ML.Tokenizers.Tests/LlamaTests.cs
@@ -913,5 +913,11 @@ public void TestPhi3TokenizerIdEncoding(string text, string decodedWithNoSpecial
Assert.Equal(textWithSpecialTokens.Length, charactersWritten);
Assert.Equal(textWithSpecialTokens, destinationBuffer.AsSpan(0, charactersWritten).ToString());
}

[Fact]
public void CreateWithNullStreamThrows()
{
Assert.ThrowsAny<ArgumentException>(() => LlamaTokenizer.Create(null!));
}
}
}
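The new test pins down the null-argument guard on the stream overload. For context, a minimal happy-path sketch of the call it protects, reusing the model path the existing tests load; this is illustrative rather than part of the PR:

using System.IO;
using Microsoft.ML.Tokenizers;

// Create a tokenizer from a SentencePiece model on disk.
using Stream model = File.OpenRead(Path.Combine("Llama", "tokenizer.model"));
LlamaTokenizer tokenizer = LlamaTokenizer.Create(model);

// A null stream fails fast with an ArgumentNullException, which derives
// from ArgumentException; xUnit's Assert.Throws<T> requires the exact
// exception type, while ThrowsAny<T> also accepts subtypes, hence the
// ThrowsAny<ArgumentException> in the test above.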
51 changes: 51 additions & 0 deletions test/Microsoft.ML.Tokenizers.Tests/SentencePieceTests.cs
@@ -0,0 +1,51 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Tokenizers;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Xunit;

namespace Microsoft.ML.Tokenizers.Tests
{
public class SentencePieceTests
{
[Fact]
public void CreateWithNullStreamThrows()
{
Assert.ThrowsAny<ArgumentException>(() => SentencePieceTokenizer.Create(null!));
}

[Fact]
public void CreateWithEmptyStreamThrows()
{
using MemoryStream empty = new MemoryStream(Array.Empty<byte>());
Assert.ThrowsAny<ArgumentException>(() => SentencePieceTokenizer.Create(empty));
}

[Fact]
public void CreateWithTruncatedStreamThrows()
{
// A protobuf field header declaring a length-delimited payload longer than the remaining bytes.
byte[] truncated = new byte[] { 0x0A, 0xFF, 0x01 }; // field 1, declared length 255, but no data bytes follow
using MemoryStream ms = new MemoryStream(truncated);
Assert.ThrowsAny<Exception>(() => SentencePieceTokenizer.Create(ms));
}

[Fact]
public void CreateBpeViaSentencePieceTokenizer()
{
// Verify that the generic SentencePieceTokenizer.Create() factory method
// works for BPE models (not just LlamaTokenizer.Create()).
using Stream stream = File.OpenRead(Path.Combine(@"Llama", "tokenizer.model"));
SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(stream);

IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens("Hello", out _);
Assert.True(tokens.Count > 0);
Assert.Equal("Hello", tokenizer.Decode(tokens.Select(t => t.Id)));
}
}
}
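To make the truncated-stream fixture concrete: the three bytes decode to a field header plus a declared length the stream cannot satisfy. A worked decode, following the same varint rules sketched earlier:

byte[] truncated = { 0x0A, 0xFF, 0x01 };

// Byte 0 is the field header: 0x0A = (1 << 3) | 2, i.e. field
// number 1 with wire type 2 (length-delimited).
int fieldNumber = truncated[0] >> 3;   // 1
int wireType = truncated[0] & 0x7;     // 2

// Bytes 1 and 2 encode the payload length as a varint: 0xFF has its
// continuation bit set and contributes 0x7F = 127; 0x01 terminates the
// varint and contributes 1 << 7 = 128, for a total of 255.
int declaredLength = (truncated[1] & 0x7F) | ((truncated[2] & 0x7F) << 7);   // 255

// 255 payload bytes are promised but none remain, so a reader that
// validates lengths rejects the stream, which is exactly what
// CreateWithTruncatedStreamThrows asserts.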