Merged · Changes from 2 commits
1 change: 0 additions & 1 deletion NuGet.config
@@ -5,7 +5,6 @@
</solution>
<packageSources>
<clear />
<add key="darc-pub-dotnet-maintenance-packages-ab95a1f1" value="https://dnceng.pkgs.visualstudio.com/public/_packaging/darc-pub-dotnet-maintenance-packages-ab95a1f1/nuget/v3/index.json" />
<add key="dotnet-public" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-public/nuget/v3/index.json" />
<add key="dotnet-tools" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-tools/nuget/v3/index.json" />
<add key="dotnet-libraries" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-libraries/nuget/v3/index.json" />
5 changes: 1 addition & 4 deletions src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj
@@ -6,6 +6,7 @@
<IsPackable>true</IsPackable>
<PackageDescription>Microsoft.ML.Tokenizers contains the implementation of the tokenization used in the NLP transforms.</PackageDescription>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<NoWarn>$(NoWarn);MSML_NoInstanceInitializers</NoWarn>
</PropertyGroup>

<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
@@ -16,10 +17,6 @@
<Compile Remove="Utils/Helpers.netstandard.cs" />
</ItemGroup>

<ItemGroup>
<PackageReference Include="Google.Protobuf" />
</ItemGroup>

<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
<PackageReference Include="Microsoft.Bcl.HashCode" />
<PackageReference Include="Microsoft.Bcl.Memory" />
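With the Google.Protobuf package reference gone, the SentencePiece model file, which is a serialized protobuf message, now has to be parsed by hand (see the SentencepieceModel.cs rewrite below). As a rough sketch of the lowest-level routine such a reader needs, and not the PR's actual code, here is a base-128 varint decoder, the encoding protobuf uses for field headers and integer scalars:

using System.IO;

internal static class VarintSketch
{
    // Protobuf varints pack 7 payload bits per byte, least significant
    // group first; a set high bit means "more bytes follow".
    public static ulong ReadVarint(Stream stream)
    {
        ulong value = 0;
        int shift = 0;
        while (true)
        {
            int b = stream.ReadByte();
            if (b < 0)
                throw new EndOfStreamException("Truncated varint.");
            value |= (ulong)(b & 0x7F) << shift;
            if ((b & 0x80) == 0)
                return value;
            shift += 7;
            if (shift >= 64)
                throw new InvalidDataException("Varint too long.");
        }
    }
}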
7 changes: 2 additions & 5 deletions src/Microsoft.ML.Tokenizers/Model/BertOptions.cs
@@ -1,4 +1,4 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

@@ -9,7 +9,6 @@ namespace Microsoft.ML.Tokenizers
/// </summary>
public sealed class BertOptions : WordPieceOptions
{
#pragma warning disable MSML_NoInstanceInitializers
/// <summary>
/// Gets or sets a value indicating whether to lowercase the input before tokenization.
/// </summary>
@@ -66,7 +65,5 @@ public sealed class BertOptions : WordPieceOptions
/// Gets or sets a value indicating whether to remove non-spacing marks.
/// </summary>
public bool RemoveNonSpacingMarks { get; set; }

#pragma warning restore MSML_NoInstanceInitializers
}
}
6 changes: 2 additions & 4 deletions src/Microsoft.ML.Tokenizers/Model/WordPieceOptions.cs
@@ -1,4 +1,4 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

@@ -11,7 +11,6 @@ namespace Microsoft.ML.Tokenizers
/// </summary>
public class WordPieceOptions
{
#pragma warning disable MSML_NoInstanceInitializers
internal const int DefaultMaxInputCharsPerWord = 100;
internal const string DefaultContinuingSubwordPrefix = "##";

@@ -44,6 +43,5 @@ public class WordPieceOptions
/// Gets or sets the maximum number of characters to consider for a single word.
/// </summary>
public int MaxInputCharsPerWord { get; set; } = DefaultMaxInputCharsPerWord;
#pragma warning restore MSML_NoInstanceInitializers
}
}
5,029 changes: 374 additions & 4,655 deletions src/Microsoft.ML.Tokenizers/SentencepieceModel.cs

Large diffs are not rendered by default.
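The 4,655 lines deleted here are the Google.Protobuf-generated bindings for the SentencePiece model proto; the 374 lines that replace them amount to a compact hand-written reader. Since the diff is not rendered, the skeleton below illustrates what any such reader must do: decode each field header, dispatch on the wire type, and check that declared lengths actually fit in the stream. The shape is an assumption for illustration, not the PR's implementation, and payload decoding is elided:

using System.IO;

internal static class ModelProtoSketch
{
    // Walks one serialized message on a seekable stream, validating
    // structure; a real reader would decode payloads into model fields.
    public static void Skim(Stream s)
    {
        while (s.Position < s.Length)
        {
            ulong tag = ReadVarint(s);
            int fieldNumber = (int)(tag >> 3);
            switch ((int)(tag & 0x7)) // wire type
            {
                case 0: ReadVarint(s); break;                 // varint scalar
                case 1: s.Seek(8, SeekOrigin.Current); break; // fixed64
                case 5: s.Seek(4, SeekOrigin.Current); break; // fixed32, e.g. piece scores
                case 2: // length-delimited: string, bytes, or nested message
                    ulong length = ReadVarint(s);
                    if (length > (ulong)(s.Length - s.Position))
                        throw new InvalidDataException($"Field {fieldNumber} declares {length} bytes but fewer remain.");
                    s.Seek((long)length, SeekOrigin.Current);
                    break;
                default:
                    throw new InvalidDataException($"Unexpected wire type on field {fieldNumber}.");
            }
        }
    }

    // Same routine as the varint sketch under the csproj change above.
    private static ulong ReadVarint(Stream s)
    {
        ulong value = 0;
        for (int shift = 0; ; shift += 7)
        {
            int b = s.ReadByte();
            if (b < 0) throw new EndOfStreamException("Truncated varint.");
            value |= (ulong)(b & 0x7F) << shift;
            if ((b & 0x80) == 0) return value;
        }
    }
}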

6 changes: 6 additions & 0 deletions test/Microsoft.ML.Tokenizers.Tests/LlamaTests.cs
@@ -913,5 +913,11 @@ public void TestPhi3TokenizerIdEncoding(string text, string decodedWithNoSpecial
Assert.Equal(textWithSpecialTokens.Length, charactersWritten);
Assert.Equal(textWithSpecialTokens, destinationBuffer.AsSpan(0, charactersWritten).ToString());
}

[Fact]
public void CreateWithNullStreamThrows()
{
Assert.ThrowsAny<ArgumentException>(() => LlamaTokenizer.Create(null!));
}
}
}
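The new test pins down the null-argument guard on the stream overload. For context, a minimal happy-path sketch of the call it protects, reusing the model path the existing tests load; this is illustrative rather than part of the PR:

using System.IO;
using Microsoft.ML.Tokenizers;

// Create a tokenizer from a SentencePiece model on disk.
using Stream model = File.OpenRead(Path.Combine("Llama", "tokenizer.model"));
LlamaTokenizer tokenizer = LlamaTokenizer.Create(model);

// A null stream fails fast with an ArgumentNullException, which derives
// from ArgumentException; xUnit's Assert.Throws<T> requires the exact
// exception type, while ThrowsAny<T> also accepts subtypes, hence the
// ThrowsAny<ArgumentException> in the test above.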
51 changes: 51 additions & 0 deletions test/Microsoft.ML.Tokenizers.Tests/SentencePieceTests.cs
@@ -0,0 +1,51 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Tokenizers;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Xunit;

namespace Microsoft.ML.Tokenizers.Tests
{
public class SentencePieceTests
{
[Fact]
public void CreateWithNullStreamThrows()
{
Assert.ThrowsAny<ArgumentException>(() => SentencePieceTokenizer.Create(null!));
}

[Fact]
public void CreateWithEmptyStreamThrows()
{
using MemoryStream empty = new MemoryStream(Array.Empty<byte>());
Assert.ThrowsAny<ArgumentException>(() => SentencePieceTokenizer.Create(empty));
}

[Fact]
public void CreateWithTruncatedStreamThrows()
{
// A protobuf field header declaring a length-delimited payload longer than the remaining bytes.
byte[] truncated = new byte[] { 0x0A, 0xFF, 0x01 }; // field 1, declared length 255, but no data bytes follow
using MemoryStream ms = new MemoryStream(truncated);
Assert.ThrowsAny<Exception>(() => SentencePieceTokenizer.Create(ms));
}

[Fact]
public void CreateBpeViaSentencePieceTokenizer()
{
// Verify that the generic SentencePieceTokenizer.Create() factory method
// works for BPE models (not just LlamaTokenizer.Create()).
using Stream stream = File.OpenRead(Path.Combine(@"Llama", "tokenizer.model"));
SentencePieceTokenizer tokenizer = SentencePieceTokenizer.Create(stream);

IReadOnlyList<EncodedToken> tokens = tokenizer.EncodeToTokens("Hello", out _);
Assert.True(tokens.Count > 0);
Assert.Equal("Hello", tokenizer.Decode(tokens.Select(t => t.Id)));
}
}
}
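To make the truncated-stream fixture concrete: the three bytes decode to a field header plus a declared length the stream cannot satisfy. A worked decode, following the same varint rules sketched earlier:

byte[] truncated = { 0x0A, 0xFF, 0x01 };

// Byte 0 is the field header: 0x0A = (1 << 3) | 2, i.e. field
// number 1 with wire type 2 (length-delimited).
int fieldNumber = truncated[0] >> 3;   // 1
int wireType = truncated[0] & 0x7;     // 2

// Bytes 1 and 2 encode the payload length as a varint: 0xFF has its
// continuation bit set and contributes 0x7F = 127; 0x01 terminates the
// varint and contributes 1 << 7 = 128, for a total of 255.
int declaredLength = (truncated[1] & 0x7F) | ((truncated[2] & 0x7F) << 7);   // 255

// 255 payload bytes are promised but none remain, so a reader that
// validates lengths rejects the stream, which is exactly what
// CreateWithTruncatedStreamThrows asserts.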