SciSharp · martindevans · May 23, 2026 · May 24, 2026 · May 24, 2026 · May 24, 2026
diff --git a/LLama.Unittest/NativeAbiTests.cs b/LLama.Unittest/NativeAbiTests.cs
@@ -1,5 +1,6 @@
-using System.Runtime.InteropServices;
 using LLama.Native;
+using System.Net.Mime;
+using System.Runtime.InteropServices;
 
 namespace LLama.Unittest
 {
@@ -42,8 +43,10 @@ public void ContextParamsSizeMatchesNative()
                 (sizeof(uint), 4), // n_batch
                 (sizeof(uint), 4), // n_ubatch
                 (sizeof(uint), 4), // n_seq_max
+                (sizeof(uint), 4), // n_rs_seq
                 (sizeof(int), 4),  // n_threads
                 (sizeof(int), 4),  // n_threads_batch
+                (sizeof(LLamaContextType), 4), // ctx_type
                 (sizeof(int), 4),  // rope_scaling_type
                 (sizeof(int), 4),  // pooling_type
                 (sizeof(int), 4),  // attention_type
@@ -80,9 +83,14 @@ public void ContextParamsSizeMatchesNative()
         public void ModelParamsBoolBlockMatchesNative()
         {
             var pointerSize = IntPtr.Size;
-            var kvOffset = Marshal.OffsetOf<LLamaModelParams>("kv_overrides").ToInt32();
+
+            // Get the field immediately before the first boolean field
+            var kvOffset = Marshal.OffsetOf<LLamaModelParams>(nameof(LLamaModelParams.kv_overrides)).ToInt32();
+
+            // Get the first boolean field
             var vocabOffset = Marshal.OffsetOf<LLamaModelParams>("_vocab_only").ToInt32();
 
+            // Check the first boolean field starts exactly one pointer-size after kv_overrides
             Assert.Equal(kvOffset + pointerSize, vocabOffset);
             Assert.Equal(vocabOffset + 1, Marshal.OffsetOf<LLamaModelParams>("_use_mmap").ToInt32());
             Assert.Equal(vocabOffset + 2, Marshal.OffsetOf<LLamaModelParams>("_use_direct_io").ToInt32());

diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs
@@ -20,6 +20,8 @@ public class ModelOptions
         /// <inheritdoc />
         public uint? ContextSize { get; set; }
 
+        LLamaContextType IContextParams.ContextType => LLamaContextType.Default;
+
         /// <inheritdoc />
         public int MainGpu { get; set; } = 0;
 
@@ -35,6 +37,9 @@ public class ModelOptions
         /// <inheritdoc />
         public uint SeqMax { get; set; }
 
+        /// <inheritdoc />
+        public uint RecurrentRollbackSnapshots { get; set; } = 0;
+
         /// <inheritdoc />
         public bool Embeddings { get; set; }
 

diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs
@@ -13,6 +13,11 @@ public interface IContextParams
     /// </summary>
     uint? ContextSize { get; }
 
+    /// <summary>
+    /// The type of context, e.g. default or multi token prediction (MTP)
+    /// </summary>
+    LLamaContextType ContextType { get; }
+
     /// <summary>
     /// maximum batch size that can be submitted at once (must be >=32 to use BLAS) (n_batch)
     /// </summary>
@@ -28,6 +33,11 @@ public interface IContextParams
     /// </summary>
     uint SeqMax { get; }
 
+    /// <summary>
+    /// The number of recurrent-state snapshots kept per sequence for rollback, 0 = no rollback (n_rs_seq)
+    /// </summary>
+    uint RecurrentRollbackSnapshots { get; }
+
     /// <summary>
     /// If true, extract embeddings (together with logits).
     /// </summary>

diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
@@ -16,6 +16,9 @@ public record ModelParams
         /// <inheritdoc />
         public uint? ContextSize { get; set; }
 
+        /// <inheritdoc />
+        public LLamaContextType ContextType { get; set; } = LLamaContextType.Default;
+
         /// <inheritdoc />
         public int MainGpu { get; set; } = 0;
 
@@ -31,6 +34,9 @@ public record ModelParams
         /// <inheritdoc />
         public uint SeqMax { get; set; } = 1;
 
+        /// <inheritdoc />
+        public uint RecurrentRollbackSnapshots { get; set; } = 0;
+
         /// <inheritdoc />
         public bool UseMemorymap { get; set; } = true;
 

diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs
@@ -23,9 +23,11 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
             result = LLamaContextParams.Default();
 
             result.n_ctx = @params.ContextSize ?? 0;
+            result.context_type = @params.ContextType;
             result.n_batch = @params.BatchSize;
             result.n_ubatch = @params.UBatchSize;
             result.n_seq_max = @params.SeqMax;
+            result.n_rs_seq = @params.RecurrentRollbackSnapshots;
             result.embeddings = @params.Embeddings;
             result.rope_freq_base = @params.RopeFrequencyBase ?? 0;
             result.rope_freq_scale = @params.RopeFrequencyScale ?? 0;

diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj
@@ -59,7 +59,7 @@
   </ItemGroup>
 
   <PropertyGroup>
-    <BinaryReleaseId>3f7c29d318e317b6</BinaryReleaseId>
+    <BinaryReleaseId>c0c7e147e7efa6c58587</BinaryReleaseId>
   </PropertyGroup>
 
   <PropertyGroup>

diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
@@ -40,6 +40,11 @@ public struct LLamaContextParams
         /// </summary>
         public uint n_seq_max;
 
+        /// <summary>
+        /// number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
+        /// </summary>
+        public uint n_rs_seq;
+
         /// <summary>
         /// number of threads to use for generation
         /// </summary>
@@ -50,6 +55,11 @@ public struct LLamaContextParams
         /// </summary>
         public int n_threads_batch;
 
+        /// <summary>
+        /// Set the type of context (e.g. MTP)
+        /// </summary>
+        public LLamaContextType context_type;
+
         /// <summary>
         /// RoPE scaling type, from `enum llama_rope_scaling_type` 
         /// </summary>

diff --git a/LLama/Native/LLamaContextType.cs b/LLama/Native/LLamaContextType.cs
@@ -0,0 +1,18 @@
+namespace LLama.Native;
+
+/// <summary>
+/// The type of a llama context, stored in <c>LLamaContextParams.context_type</c>.
+/// </summary>
+/// <remarks>llama_context_type</remarks>
+public enum LLamaContextType
+{
+    /// <summary>
+    /// Default context type
+    /// </summary>
+    Default = 0,
+
+    /// <summary>
+    /// Multi-token prediction (MTP) context
+    /// </summary>
+    Mtp = 1,
+}
diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs
@@ -71,35 +71,35 @@ public bool vocab_only
         }
         private sbyte _vocab_only;
 
-        /// <summary>
-        /// use mmap if possible
-        /// </summary>
-        public bool use_mmap
-        {
-            readonly get => Convert.ToBoolean(_use_mmap);
-            set => _use_mmap = Convert.ToSByte(value);
-        }
-        private sbyte _use_mmap;
-
-        /// <summary>
-        /// use direct io, takes precedence over use_mmap when supported
-        /// </summary>
-        public bool use_direct_io
-        {
-            readonly get => Convert.ToBoolean(_use_direct_io);
-            set => _use_direct_io = Convert.ToSByte(value);
-        }
-        private sbyte _use_direct_io;
-
-        /// <summary>
-        /// force system to keep model in RAM
-        /// </summary>
-        public bool use_mlock
-        {
-            readonly get => Convert.ToBoolean(_use_mlock);
-            set => _use_mlock = Convert.ToSByte(value);
-        }
-        private sbyte _use_mlock;
+        /// <summary>
+        /// use mmap if possible
+        /// </summary>
+        public bool use_mmap
+        {
+            readonly get => Convert.ToBoolean(_use_mmap);
+            set => _use_mmap = Convert.ToSByte(value);
+        }
+        private sbyte _use_mmap;
+
+        /// <summary>
+        /// use direct io, takes precedence over use_mmap when supported
+        /// </summary>
+        public bool use_direct_io
+        {
+            readonly get => Convert.ToBoolean(_use_direct_io);
+            set => _use_direct_io = Convert.ToSByte(value);
+        }
+        private sbyte _use_direct_io;
+
+        /// <summary>
+        /// force system to keep model in RAM
+        /// </summary>
+        public bool use_mlock
+        {
+            readonly get => Convert.ToBoolean(_use_mlock);
+            set => _use_mlock = Convert.ToSByte(value);
+        }
+        private sbyte _use_mlock;
 
         /// <summary>
         /// validate model tensor data
@@ -112,34 +112,34 @@ public bool check_tensors
         private sbyte _check_tensors;
 
         /// <summary>
-        /// use extra buffer types (used for weight repacking) 
-        /// </summary>
-        public bool use_extra_bufts
-        {
-            readonly get => Convert.ToBoolean(_use_extra_bufts);
-            set => _use_extra_bufts = Convert.ToSByte(value);
-        }
-        private sbyte _use_extra_bufts;
-
-        /// <summary>
-        /// bypass host buffer allowing extra buffers to be used
-        /// </summary>
-        public bool no_host
-        {
-            readonly get => Convert.ToBoolean(_no_host);
-            set => _no_host = Convert.ToSByte(value);
-        }
-        private sbyte _no_host;
-
-        /// <summary>
-        /// only load metadata and simulate memory allocations
-        /// </summary>
-        public bool no_alloc
-        {
-            readonly get => Convert.ToBoolean(_no_alloc);
-            set => _no_alloc = Convert.ToSByte(value);
-        }
-        private sbyte _no_alloc;
+        /// use extra buffer types (used for weight repacking) 
+        /// </summary>
+        public bool use_extra_bufts
+        {
+            readonly get => Convert.ToBoolean(_use_extra_bufts);
+            set => _use_extra_bufts = Convert.ToSByte(value);
+        }
+        private sbyte _use_extra_bufts;
+
+        /// <summary>
+        /// bypass host buffer allowing extra buffers to be used
+        /// </summary>
+        public bool no_host
+        {
+            readonly get => Convert.ToBoolean(_no_host);
+            set => _no_host = Convert.ToSByte(value);
+        }
+        private sbyte _no_host;
+
+        /// <summary>
+        /// only load metadata and simulate memory allocations
+        /// </summary>
+        public bool no_alloc
+        {
+            readonly get => Convert.ToBoolean(_no_alloc);
+            set => _no_alloc = Convert.ToSByte(value);
+        }
+        private sbyte _no_alloc;
         /// <summary>
         /// Create a LLamaModelParams with default values
         /// </summary>

diff --git a/LLama/Native/LLamaParamsFitStatus.cs b/LLama/Native/LLamaParamsFitStatus.cs
diff --git a/LLama/Native/NativeApi.Mtmd.cs b/LLama/Native/NativeApi.Mtmd.cs
@@ -204,16 +204,20 @@ internal struct mtmd_decoder_pos
 
         [FieldOffset(8)]
         uint y;
+
+        [FieldOffset(12)]
+        uint z;
     };
 
     /// <summary>
     /// get position for decoder attention, to be used by M-RoPE models
     /// </summary>
     /// <param name="image_tokens"></param>
+    /// <param name="pos_0">pos_0 is the absolute position of the first token</param>
     /// <param name="i">i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1</param>
     /// <returns>return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position)</returns>
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)]
-    internal static extern mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(IntPtr image_tokens, nuint i);
+    internal static extern mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(IntPtr image_tokens, LLamaPos pos_0, nuint i);
 
     // tokenize ----------------------------------------------------------
 
@@ -312,7 +316,11 @@ internal static unsafe IntPtr mtmd_helper_bitmap_init_from_file(SafeMtmdModelHan
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_image_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)]
     // helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE
     // out_pos must have length == mtmd_helper_get_n_tokens(image)
-    internal static extern void mtmd_helper_image_get_decoder_pos(IntPtr /* mtmd_image_tokens* */ image, IntPtr /* mtmd_decoder_pos* */ out_pos);
+    internal static extern void mtmd_helper_image_get_decoder_pos(
+        IntPtr /* mtmd_image_tokens* */ image,
+        LLamaPos pos_0,
+        IntPtr /* mtmd_decoder_pos* */ out_pos
+    );
 
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_eval_chunks", CallingConvention = CallingConvention.Cdecl)]
     internal static extern int mtmd_helper_eval_chunks(
@@ -346,4 +354,14 @@ internal static extern int mtmd_helper_decode_image_chunk(
         int seq_id,
         int n_batch,
         ref int new_n_past);
+
+    /*
+     * // EXPERIMENTAL API to get mmproj's capabilities without initializing the full context
+       // This is only intended to be used by llama-server, breaking changes are expected
+       struct mtmd_caps {
+           bool inp_vision;
+           bool inp_audio;
+       };
+       MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
+     */
 }
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
@@ -464,34 +464,6 @@ public static string llama_split_prefix(string splitPath, int splitNo, int split
         [DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern IntPtr ggml_backend_buft_name(IntPtr buft);
 
-        /// <summary>
-        /// Fits mparams and cparams to free device memory (assumes system memory is unlimited)
-        ///   - returns true if the parameters could be successfully modified to fit device memory
-        ///   - this function is NOT thread safe because it modifies the global llama logger state
-        ///   - only parameters that have the same value as in llama_default_model_params are modified
-        ///     with the exception of the context size which is modified if and only if equal to 0
-        /// </summary>
-        /// <param name="path"></param>
-        /// <param name="mparams"></param>
-        /// <param name="cparams"></param>
-        /// <param name="tensor_split">Writable buffer for tensor split, needs at least llama_max_devices elements</param>
-        /// <param name="tensor_buft_overrides">Writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements</param>
-        /// <param name="margins">Margins of memory to leave per device in bytes</param>
-        /// <param name="n_ctx_min">Minimum context size to set when trying to reduce memory use</param>
-        /// <param name="log_level">Minimum log level to print during fitting, lower levels go to debug log</param>
-        /// <returns></returns>
-        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern unsafe LLamaParamsFitStatus llama_params_fit(
-            string path,
-            ref LLamaModelParams mparams,
-            ref LLamaContextParams cparams,
-            float* tensor_split,
-            LLamaModelTensorBufferOverride* tensor_buft_overrides,
-            nint* margins,
-            uint n_ctx_min,
-            int /* GGML_LOG_LEVEL */ log_level
-        );
-
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern long llama_time_us();