@@ -5,7 +5,7 @@
 <meta name="color-scheme" content="light dark"> <link rel="index" title="Index" href="../../genindex.html"> <link rel="search" title="Search" href="../../search.html">
 
 <!-- Generated with Sphinx 9.1.0 and Furo 2025.12.19 -->
-<title>transformer.config - transformer 0.3.0 documentation</title>
+<title>transformer.config - transformer 0.4.0 documentation</title>
 <link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=d111a655" />
 <link rel="stylesheet" type="text/css" href="../../_static/styles/furo.css?v=7bdb33bb" />
 <link rel="stylesheet" type="text/css" href="../../_static/styles/furo-extensions.css?v=8dab3a3b" />
@@ -160,7 +160,7 @@
 </label>
 </div>
 <div class="header-center">
-<a href="../../index.html"><div class="brand">transformer 0.3.0 documentation</div></a>
+<a href="../../index.html"><div class="brand">transformer 0.4.0 documentation</div></a>
 </div>
 <div class="header-right">
 <div class="theme-toggle-container theme-toggle-header">
@@ -181,28 +181,33 @@
 
 <div class="sidebar-sticky"><a class="sidebar-brand" href="../../index.html">
 
-<span class="sidebar-brand-text">transformer 0.3.0 documentation</span>
+<span class="sidebar-brand-text">transformer 0.4.0 documentation</span>
 
 </a><form class="sidebar-search-container" method="get" action="../../search.html" role="search">
 <input class="sidebar-search" placeholder="Search" name="q" aria-label="Search">
 <input type="hidden" name="check_keywords" value="yes">
 <input type="hidden" name="area" value="default">
 </form>
 <div id="searchbox"></div><div class="sidebar-scroll"><div class="sidebar-tree">
-<ul>
+<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
+<ul>
 <li class="toctree-l1"><a class="reference internal" href="../../installation.html">Installation</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../quickstart.html">Quick Start</a></li>
 </ul>
+<p class="caption" role="heading"><span class="caption-text">Guide</span></p>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="../../guide.html">Transformer: A PyTorch SOTA Transformer Implementation</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../guide.html#configuration">Configuration</a></li>
 </ul>
+<p class="caption" role="heading"><span class="caption-text">API Reference</span></p>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="../../api.html">API Reference</a></li>
 </ul>
+<p class="caption" role="heading"><span class="caption-text">Usage Examples</span></p>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="../../examples.html">Usage Examples</a></li>
 </ul>
+<p class="caption" role="heading"><span class="caption-text">Project Info</span></p>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="../../contributing.html">Contributing</a></li>
 </ul>
@@ -278,7 +283,7 @@ <h1>Source code for transformer.config</h1><div class="highlight"><pre>
 <span class="sd"> - If ``str``, one of ``rms_norm`` or ``layer_norm``.</span>
 <span class="sd"> - If ``Type[nn.Module]`` then will be instantiated inside the model.</span>
 <span class="sd"> Should have the same API as a torch Normalization Layer.</span>
-<span class="sd"> - If ``List[Union[Type[nn.Module], str]]`` and len(ffn_class) == n_layers</span>
+<span class="sd"> - If ``List[Union[Type[nn.Module], str]]`` and len(norm_class) == n_layers</span>
 <span class="sd"> then will be instantiated inside the model for the corresponding layers.</span>
 <span class="sd"> :type norm_class: Union[List[Union[Type[nn.Module], str]], Type[nn.Module], str]</span>
 
@@ -297,9 +302,9 @@ <h1>Source code for transformer.config</h1><div class="highlight"><pre>
 <span class="sd"> - If ``Type[nn.Module]`` then will be instantiated inside the model.</span>
 <span class="sd"> Should have the same API as ``transformer.attn.MHA``.</span>
 <span class="sd"> Default ``MHA``</span>
-<span class="sd"> - If ``List[Union[Type[nn.Module], str]]`` and len(ffn_class) == n_layers</span>
+<span class="sd"> - If ``List[Union[Type[nn.Module], str]]`` and len(attn_class) == n_layers</span>
 <span class="sd"> then will be instantiated inside the model for the corresponding layers.</span>
-<span class="sd"> Default ``SwiGLU`` for every layer.</span>
+<span class="sd"> Default ``MHA`` for every layer.</span>
 <span class="sd"> :type attn_class: Union[List[Union[Type[nn.Module], str]], Type[nn.Module], str]</span>
 
 <span class="sd"> :param block_class: Transformer Block class for every layer. Default: ``None``</span>
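
Note on the two fixes above: both docstrings had copy-pasted ``len(ffn_class)`` from the ffn_class parameter, but the length check applies to the parameter actually being documented. A minimal sketch of the per-layer list form the corrected text describes, assuming the config class is named TransformerConfig (the class name is not visible in these hunks, only the module path transformer.config) and that the string aliases quoted in the docstrings resolve as documented:

    from transformer.config import TransformerConfig  # class name assumed from the module path

    # One entry per layer; the docstring requires len(list) == n_layers.
    config = TransformerConfig(
        n_layers=4,
        norm_class=["rms_norm", "rms_norm", "layer_norm", "layer_norm"],
        attn_class=["MHA", "MHA", "GQA", "GQA"],
    )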
@@ -329,11 +334,9 @@ <h1>Source code for transformer.config</h1><div class="highlight"><pre>
 <span class="sd"> :type seq_len: int</span>
 
 <span class="sd"> :param pos_encoding: Positional Encoding for attention.</span>
-<span class="sd"> - If ``List[Union[Type[nn.Module], str]]`` and len(ffn_class) == n_layers</span>
-<span class="sd"> then will be instantiated inside the model for the corresponding layers.</span>
-<span class="sd"> Default ``SwiGLU`` for every layer.</span>
 <span class="sd"> - If ``str`` one of ``RoPE``, ``AliBI``, ``PartialRoPE``. Default: ``RoPE``</span>
 <span class="sd"> Note: Is recommended to change the default to ``PartialRoPE`` which is used in SOTA models like Qwen3-Next-80B-A3B</span>
+<span class="sd"> - If ``List[str]`` and len(pos_encoding) == n_layers, applies different positional encodings per layer.</span>
 <span class="sd"> :type pos_encoding: Union[List[str], str]</span>
 
 <span class="sd"> :param rope_base: Base for the Exponential Frequency Calculation in RoPE. Default: ``10000.0``</span>
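
The pos_encoding hunk drops three more lines copy-pasted from ffn_class (a SwiGLU default makes no sense for positional encodings) and documents the ``List[str]`` form instead. A sketch under the same naming assumptions as above, using the aliases the docstring spells out:

    # One encoding for all layers, or one per layer (len(pos_encoding) == n_layers).
    config = TransformerConfig(n_layers=2, pos_encoding="PartialRoPE")
    config = TransformerConfig(n_layers=2, pos_encoding=["RoPE", "AliBI"])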
@@ -342,6 +345,12 @@ <h1>Source code for transformer.config</h1><div class="highlight"><pre>
 <span class="sd"> :param max_seq_len: Maximum sequence length for positional embeddings.</span>
 <span class="sd"> :type max_seq_len: int</span>
 
+<span class="sd"> :param use_cache: Whether to use KV cache during generation. Default: ``True``</span>
+<span class="sd"> :type use_cache: bool, optional</span>
+
+<span class="sd"> :param is_decoder: Whether this is a decoder model. Default: ``True``</span>
+<span class="sd"> :type is_decoder: bool, optional</span>
+
 <span class="sd"> :param kwargs: Additional keyword arguments passed to `PretrainedConfig`</span>
 <span class="sd"> :type kwargs: dict, optional</span>
 
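
use_cache and is_decoder are new plain-boolean fields that default to ``True``, so existing configs keep decoder-with-KV-cache behavior unchanged. A sketch of opting out for an encoder-style setup (class name still assumed):

    encoder_cfg = TransformerConfig(
        is_decoder=False,  # bidirectional / encoder-style model
        use_cache=False,   # KV caching only helps autoregressive decoding
    )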
@@ -371,17 +380,19 @@ <h1>Source code for transformer.config</h1><div class="highlight"><pre>
 <span class="n">attn_dropout</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="mf">0.0</span><span class="p">,</span>
 <span class="n">tied_weights</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
 <span class="n">seq_len</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1024</span><span class="p">,</span>
-<span class="n">pos_encoding</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"RoPE"</span><span class="p">,</span>
+<span class="n">pos_encoding</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"RoPE"</span><span class="p">,</span>
 <span class="n">rope_base</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">10000.0</span><span class="p">,</span>
 <span class="n">max_seq_len</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">4096</span><span class="p">,</span>
+<span class="n">use_cache</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
+<span class="n">is_decoder</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
 <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">,</span>
 <span class="p">):</span>
 <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
 
 <span class="bp">self</span><span class="o">.</span><span class="n">n_layer</span> <span class="o">=</span> <span class="n">n_layers</span>
 <span class="bp">self</span><span class="o">.</span><span class="n">d_model</span> <span class="o">=</span> <span class="n">d_model</span>
 <span class="bp">self</span><span class="o">.</span><span class="n">n_heads</span> <span class="o">=</span> <span class="n">n_heads</span>
-<span class="bp">self</span><span class="o">.</span><span class="n">n_kv_heads</span> <span class="o">=</span> <span class="n">n_kv_heads</span> <span class="k">if</span> <span class="n">attn_class</span> <span class="o">==</span> <span class="s2">"GQA"</span> <span class="k">else</span> <span class="n">n_heads</span>
+<span class="bp">self</span><span class="o">.</span><span class="n">n_kv_heads</span> <span class="o">=</span> <span class="n">n_kv_heads</span> <span class="k">if</span> <span class="n">n_kv_heads</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">n_heads</span>
 <span class="bp">self</span><span class="o">.</span><span class="n">vocab_size</span> <span class="o">=</span> <span class="n">vocab_size</span>
 
 <span class="bp">self</span><span class="o">.</span><span class="n">attn_class</span> <span class="o">=</span> <span class="n">attn_class</span>
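
The n_kv_heads change above is behavioral, not cosmetic: the old guard ``attn_class == "GQA"`` silently discarded an explicitly passed n_kv_heads whenever attn_class was a class object or a per-layer list, while the new guard keeps any explicit value and falls back to n_heads (plain multi-head attention) otherwise. A sketch of the new behavior, same naming assumptions:

    cfg = TransformerConfig(n_heads=8)                # n_kv_heads falls back to 8
    cfg = TransformerConfig(n_heads=8, n_kv_heads=2)  # kept, regardless of attn_class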
@@ -405,7 +416,10 @@ <h1>Source code for transformer.config</h1><div class="highlight"><pre>
 <span class="bp">self</span><span class="o">.</span><span class="n">seq_len</span> <span class="o">=</span> <span class="n">seq_len</span>
 <span class="bp">self</span><span class="o">.</span><span class="n">pos_encoding</span> <span class="o">=</span> <span class="n">pos_encoding</span>
 <span class="bp">self</span><span class="o">.</span><span class="n">rope_base</span> <span class="o">=</span> <span class="n">rope_base</span>
-<span class="bp">self</span><span class="o">.</span><span class="n">max_seq_len</span> <span class="o">=</span> <span class="n">max_seq_len</span></div>
+<span class="bp">self</span><span class="o">.</span><span class="n">max_seq_len</span> <span class="o">=</span> <span class="n">max_seq_len</span>
+
+<span class="bp">self</span><span class="o">.</span><span class="n">use_cache</span> <span class="o">=</span> <span class="n">use_cache</span>
+<span class="bp">self</span><span class="o">.</span><span class="n">is_decoder</span> <span class="o">=</span> <span class="n">is_decoder</span></div>
 </div>
 
 </pre></div>
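
Since **kwargs is forwarded to PretrainedConfig, the new use_cache/is_decoder fields should survive the standard Hugging Face serialization round-trip; a hedged sketch, assuming the class serializes like any PretrainedConfig subclass:

    cfg = TransformerConfig(n_layers=12, is_decoder=True)
    cfg.save_pretrained("./ckpt")                       # writes config.json
    restored = TransformerConfig.from_pretrained("./ckpt")
    assert restored.use_cache and restored.is_decoder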
@@ -440,7 +454,7 @@ <h1>Source code for transformer.config</h1><div class="highlight"><pre>
 
 </aside>
 </div>
-</div><script src="../../_static/documentation_options.js?v=e259d695"></script>
+</div><script src="../../_static/documentation_options.js?v=6c02275b"></script>
 <script src="../../_static/doctools.js?v=fd6eb6e6"></script>
 <script src="../../_static/sphinx_highlight.js?v=6ffebe34"></script>
 <script src="../../_static/scripts/furo.js?v=46bd48cc"></script>