Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions LLama.Unittest/NativeAbiTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System.Runtime.InteropServices;
using LLama.Native;
using System.Net.Mime;
Comment thread
martindevans marked this conversation as resolved.
using System.Runtime.InteropServices;

namespace LLama.Unittest
{
Expand Down Expand Up @@ -42,8 +43,10 @@ public void ContextParamsSizeMatchesNative()
(sizeof(uint), 4), // n_batch
(sizeof(uint), 4), // n_ubatch
(sizeof(uint), 4), // n_seq_max
(sizeof(uint), 4), // n_rs_seq
(sizeof(int), 4), // n_threads
(sizeof(int), 4), // n_threads_batch
(sizeof(LLamaContextType), 4), // ctx_type
(sizeof(int), 4), // rope_scaling_type
(sizeof(int), 4), // pooling_type
(sizeof(int), 4), // attention_type
Expand Down Expand Up @@ -80,9 +83,14 @@ public void ContextParamsSizeMatchesNative()
public void ModelParamsBoolBlockMatchesNative()
{
var pointerSize = IntPtr.Size;
var kvOffset = Marshal.OffsetOf<LLamaModelParams>("kv_overrides").ToInt32();

// Get the offset of the field immediately before the first boolean field
var kvOffset = Marshal.OffsetOf<LLamaModelParams>(nameof(LLamaModelParams.kv_overrides)).ToInt32();

// Get the offset of the first boolean field
var vocabOffset = Marshal.OffsetOf<LLamaModelParams>("_vocab_only").ToInt32();

// Check first boolean field is one ptr-size after the other
Assert.Equal(kvOffset + pointerSize, vocabOffset);
Assert.Equal(vocabOffset + 1, Marshal.OffsetOf<LLamaModelParams>("_use_mmap").ToInt32());
Assert.Equal(vocabOffset + 2, Marshal.OffsetOf<LLamaModelParams>("_use_direct_io").ToInt32());
Expand Down
5 changes: 5 additions & 0 deletions LLama.Web/Common/ModelOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ public class ModelOptions
/// <inheritdoc />
public uint? ContextSize { get; set; }

LLamaContextType IContextParams.ContextType => LLamaContextType.Default;

Comment thread
martindevans marked this conversation as resolved.
/// <inheritdoc />
public int MainGpu { get; set; } = 0;

Expand All @@ -35,6 +37,9 @@ public class ModelOptions
/// <inheritdoc />
public uint SeqMax { get; set; }

/// <inheritdoc />
public uint RecurrentRollbackSnapshots { get; set; } = 0;

/// <inheritdoc />
public bool Embeddings { get; set; }

Expand Down
10 changes: 10 additions & 0 deletions LLama/Abstractions/IContextParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ public interface IContextParams
/// </summary>
uint? ContextSize { get; }

/// <summary>
/// The type of context
/// </summary>
LLamaContextType ContextType { get; }

Comment thread
martindevans marked this conversation as resolved.
/// <summary>
/// maximum batch size that can be submitted at once (must be >=32 to use BLAS) (n_batch)
/// </summary>
Expand All @@ -28,6 +33,11 @@ public interface IContextParams
/// </summary>
uint SeqMax { get; }

/// <summary>
/// The number of recurrent-state snapshots per sequence for rollback, 0 = no rollback (n_rs_seq)
/// </summary>
uint RecurrentRollbackSnapshots { get; }

/// <summary>
/// If true, extract embeddings (together with logits).
/// </summary>
Expand Down
6 changes: 6 additions & 0 deletions LLama/Common/ModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ public record ModelParams
/// <inheritdoc />
public uint? ContextSize { get; set; }

/// <inheritdoc />
public LLamaContextType ContextType { get; set; } = LLamaContextType.Default;

/// <inheritdoc />
public int MainGpu { get; set; } = 0;

Expand All @@ -31,6 +34,9 @@ public record ModelParams
/// <inheritdoc />
public uint SeqMax { get; set; } = 1;

/// <inheritdoc />
public uint RecurrentRollbackSnapshots { get; set; } = 0;

/// <inheritdoc />
public bool UseMemorymap { get; set; } = true;

Expand Down
2 changes: 2 additions & 0 deletions LLama/Extensions/IContextParamsExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
result = LLamaContextParams.Default();

result.n_ctx = @params.ContextSize ?? 0;
result.context_type = @params.ContextType;
result.n_batch = @params.BatchSize;
result.n_ubatch = @params.UBatchSize;
result.n_seq_max = @params.SeqMax;
result.n_rs_seq = @params.RecurrentRollbackSnapshots;
result.embeddings = @params.Embeddings;
result.rope_freq_base = @params.RopeFrequencyBase ?? 0;
result.rope_freq_scale = @params.RopeFrequencyScale ?? 0;
Expand Down
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
</ItemGroup>

<PropertyGroup>
<BinaryReleaseId>3f7c29d318e317b6</BinaryReleaseId>
<BinaryReleaseId>c0c7e147e7efa6c58587</BinaryReleaseId>
</PropertyGroup>

<PropertyGroup>
Expand Down
10 changes: 10 additions & 0 deletions LLama/Native/LLamaContextParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ public struct LLamaContextParams
/// </summary>
public uint n_seq_max;

/// <summary>
/// number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
/// </summary>
public uint n_rs_seq;

/// <summary>
/// number of threads to use for generation
/// </summary>
Expand All @@ -50,6 +55,11 @@ public struct LLamaContextParams
/// </summary>
public int n_threads_batch;

/// <summary>
/// The type of context (e.g. MTP)
/// </summary>
public LLamaContextType context_type;

/// <summary>
/// RoPE scaling type, from `enum llama_rope_scaling_type`
/// </summary>
Expand Down
18 changes: 18 additions & 0 deletions LLama/Native/LLamaContextType.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
namespace LLama.Native;

/// <summary>
/// The type of a llama context (e.g. default, or multi token prediction)
Comment thread
martindevans marked this conversation as resolved.
/// </summary>
/// <remarks>llama_context_type</remarks>
public enum LLamaContextType
{
/// <summary>
/// Default context type
/// </summary>
Default = 0,

/// <summary>
/// Multi token prediction context
/// </summary>
Mtp = 1,
}
114 changes: 57 additions & 57 deletions LLama/Native/LLamaModelParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -71,35 +71,35 @@ public bool vocab_only
}
private sbyte _vocab_only;

/// <summary>
/// use mmap if possible
/// </summary>
public bool use_mmap
{
readonly get => Convert.ToBoolean(_use_mmap);
set => _use_mmap = Convert.ToSByte(value);
}
private sbyte _use_mmap;

/// <summary>
/// use direct io, takes precedence over use_mmap when supported
/// </summary>
public bool use_direct_io
{
readonly get => Convert.ToBoolean(_use_direct_io);
set => _use_direct_io = Convert.ToSByte(value);
}
private sbyte _use_direct_io;

/// <summary>
/// force system to keep model in RAM
/// </summary>
public bool use_mlock
{
readonly get => Convert.ToBoolean(_use_mlock);
set => _use_mlock = Convert.ToSByte(value);
}
private sbyte _use_mlock;
/// <summary>
/// use mmap if possible
/// </summary>
public bool use_mmap
{
readonly get => Convert.ToBoolean(_use_mmap);
set => _use_mmap = Convert.ToSByte(value);
}
private sbyte _use_mmap;
/// <summary>
/// use direct io, takes precedence over use_mmap when supported
/// </summary>
public bool use_direct_io
{
readonly get => Convert.ToBoolean(_use_direct_io);
set => _use_direct_io = Convert.ToSByte(value);
}
private sbyte _use_direct_io;
/// <summary>
/// force system to keep model in RAM
/// </summary>
public bool use_mlock
{
readonly get => Convert.ToBoolean(_use_mlock);
set => _use_mlock = Convert.ToSByte(value);
}
private sbyte _use_mlock;

/// <summary>
/// validate model tensor data
Expand All @@ -112,34 +112,34 @@ public bool check_tensors
private sbyte _check_tensors;

/// <summary>
/// use extra buffer types (used for weight repacking)
/// </summary>
public bool use_extra_bufts
{
readonly get => Convert.ToBoolean(_use_extra_bufts);
set => _use_extra_bufts = Convert.ToSByte(value);
}
private sbyte _use_extra_bufts;

/// <summary>
/// bypass host buffer allowing extra buffers to be used
/// </summary>
public bool no_host
{
readonly get => Convert.ToBoolean(_no_host);
set => _no_host = Convert.ToSByte(value);
}
private sbyte _no_host;

/// <summary>
/// only load metadata and simulate memory allocations
/// </summary>
public bool no_alloc
{
readonly get => Convert.ToBoolean(_no_alloc);
set => _no_alloc = Convert.ToSByte(value);
}
private sbyte _no_alloc;
/// use extra buffer types (used for weight repacking)
/// </summary>
public bool use_extra_bufts
{
readonly get => Convert.ToBoolean(_use_extra_bufts);
set => _use_extra_bufts = Convert.ToSByte(value);
}
private sbyte _use_extra_bufts;
/// <summary>
/// bypass host buffer allowing extra buffers to be used
/// </summary>
public bool no_host
{
readonly get => Convert.ToBoolean(_no_host);
set => _no_host = Convert.ToSByte(value);
}
private sbyte _no_host;
/// <summary>
/// only load metadata and simulate memory allocations
/// </summary>
public bool no_alloc
{
readonly get => Convert.ToBoolean(_no_alloc);
set => _no_alloc = Convert.ToSByte(value);
}
private sbyte _no_alloc;
/// <summary>
/// Create a LLamaModelParams with default values
/// </summary>
Expand Down
19 changes: 0 additions & 19 deletions LLama/Native/LLamaParamsFitStatus.cs

This file was deleted.

22 changes: 20 additions & 2 deletions LLama/Native/NativeApi.Mtmd.cs
Original file line number Diff line number Diff line change
Expand Up @@ -204,16 +204,20 @@ internal struct mtmd_decoder_pos

[FieldOffset(8)]
uint y;

[FieldOffset(12)]
uint z;
};

/// <summary>
/// get position for decoder attention, to be used by M-RoPE models
/// </summary>
/// <param name="image_tokens"></param>
/// <param name="pos_0">pos_0 is the absolute position of the first token</param>
/// <param name="i">i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1</param>
/// <returns>return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position)</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)]
internal static extern mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(IntPtr image_tokens, nuint i);
internal static extern mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(IntPtr image_tokens, LLamaPos pos_0, nuint i);

// tokenize ----------------------------------------------------------

Expand Down Expand Up @@ -312,7 +316,11 @@ internal static unsafe IntPtr mtmd_helper_bitmap_init_from_file(SafeMtmdModelHan
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_image_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)]
// helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE
// out_pos must have length == mtmd_helper_get_n_tokens(image)
internal static extern void mtmd_helper_image_get_decoder_pos(IntPtr /* mtmd_image_tokens* */ image, IntPtr /* mtmd_decoder_pos* */ out_pos);
internal static extern void mtmd_helper_image_get_decoder_pos(
IntPtr /* mtmd_image_tokens* */ image,
LLamaPos pos_0,
IntPtr /* mtmd_decoder_pos* */ out_pos
);

[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_eval_chunks", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_helper_eval_chunks(
Expand Down Expand Up @@ -346,4 +354,14 @@ internal static extern int mtmd_helper_decode_image_chunk(
int seq_id,
int n_batch,
ref int new_n_past);

/*
* // EXPERIMENTAL API to get mmproj's capabilities without initializing the full context
// This is only intended to be used by llama-server, breaking changes are expected
struct mtmd_caps {
bool inp_vision;
bool inp_audio;
};
MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
*/
}
28 changes: 0 additions & 28 deletions LLama/Native/NativeApi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -464,34 +464,6 @@ public static string llama_split_prefix(string splitPath, int splitNo, int split
[DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern IntPtr ggml_backend_buft_name(IntPtr buft);

Comment thread
martindevans marked this conversation as resolved.
/// <summary>
/// Fits mparams and cparams to free device memory (assumes system memory is unlimited)
/// - returns true if the parameters could be successfully modified to fit device memory
/// - this function is NOT thread safe because it modifies the global llama logger state
/// - only parameters that have the same value as in llama_default_model_params are modified
/// with the exception of the context size which is modified if and only if equal to 0
/// </summary>
/// <param name="path"></param>
/// <param name="mparams"></param>
/// <param name="cparams"></param>
/// <param name="tensor_split">Writable buffer for tensor split, needs at least llama_max_devices elements</param>
/// <param name="tensor_buft_overrides">Writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements</param>
/// <param name="margins">Margins of memory to leave per device in bytes</param>
/// <param name="n_ctx_min">Minimum context size to set when trying to reduce memory use</param>
/// <param name="log_level">Minimum log level to print during fitting, lower levels go to debug log</param>
/// <returns></returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern unsafe LLamaParamsFitStatus llama_params_fit(
string path,
ref LLamaModelParams mparams,
ref LLamaContextParams cparams,
float* tensor_split,
LLamaModelTensorBufferOverride* tensor_buft_overrides,
nint* margins,
uint n_ctx_min,
int /* GGML_LOG_LEVEL */ log_level
);

[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern long llama_time_us();

Expand Down
Loading
Loading