Construct Recurrent-Depth Transformers with OpenMythos for MLA, GQA, Sparse MoE, and Loop-Scaled Reasoning

By admin2010

May 22, 2026

48

def build_model(attn_type: str = "mla", max_loop_iters: int = 8) -> tuple:
    """Build a small OpenMythos model and return it with its config.

    Two attention variants are supported:

    * MLA — Multi-Latent Attention (compressed KV cache, DeepSeek-V2 style)
    * GQA — Grouped-Query Attention (fewer KV heads than Q heads)

    Args:
        attn_type: ``"gqa"`` selects the GQA configuration; any other value
            takes the MLA branch. The value is forwarded unchanged to
            ``MythosConfig``.
        max_loop_iters: Forwarded to ``MythosConfig`` as ``max_loop_iters``.

    Returns:
        A ``(model, cfg)`` tuple: the ``OpenMythos`` instance after ``.to(system)``
        and the ``MythosConfig`` it was built from.
    """
    # Settings shared by both attention variants.
    shared_kwargs = {
        "vocab_size": 64,
        "dim": 128,
        "n_heads": 4,
        "max_seq_len": 32,
        "max_loop_iters": max_loop_iters,
        "prelude_layers": 1,
        "coda_layers": 1,
        "n_experts": 4,
        "n_shared_experts": 1,
        "n_experts_per_tok": 2,
        "expert_dim": 64,
        "lora_rank": 8,
        "attn_type": attn_type,
    }

    # Variant-specific settings: GQA only reduces the KV head count, while the
    # MLA branch additionally sets the low-rank and per-head dimensions.
    if attn_type == "gqa":
        variant_kwargs = {"n_kv_heads": 2}
    else:
        variant_kwargs = {
            "n_kv_heads": 4,
            "kv_lora_rank": 32,
            "q_lora_rank": 32,
            "qk_rope_head_dim": 16,
            "qk_nope_head_dim": 16,
            "v_head_dim": 16,
        }

    config = MythosConfig(**shared_kwargs, **variant_kwargs)
    # NOTE(review): `system` is defined outside this block; presumably it holds
    # the torch device to run on — confirm against the setup code.
    model = OpenMythos(config).to(system)
    return model, config
# Build one model (and its config) per attention variant.
model_mla, cfg_mla = build_model(attn_type="mla")
model_gqa, cfg_gqa = build_model(attn_type="gqa")
def n_params(m):
    """Return the total element count over everything in ``m.parameters()``.

    ``m`` must expose ``parameters()`` yielding objects that provide ``numel()``.
    """
    total = 0
    for param in m.parameters():
        total += param.numel()
    return total
# Report the parameter count of each variant, right-aligned with thousands
# separators. The leading "\n" separates this report from earlier output
# (the original text had lost the backslash and printed a literal "n").
print(f"\n[MLA] params: {n_params(model_mla):>10,}")
print(f"[GQA] params: {n_params(model_gqa):>10,}")
def spectral_radius(mannequin):
    """Return the spectral radius of the model's recurrent injection matrix A.

    A is read from ``mannequin.recurrent.injection.get_A()``, detached from the
    autograd graph and moved to the CPU before any computation.

    Args:
        mannequin: The model object (parameter name kept as-is so existing
            callers are unaffected). It must expose
            ``recurrent.injection.get_A()`` returning a tensor.

    Returns:
        A Python float. For a 1-D ``A`` this is the largest absolute entry
        (i.e. the vector is treated as the diagonal of A); otherwise it is the
        largest eigenvalue magnitude of ``A``.
    """
    A = mannequin.recurrent.injection.get_A().detach().cpu()
    if A.dim() == 1:
        # Eigenvalues of a diagonal matrix are its diagonal entries.
        return A.abs().max().item()
    # eigvals returns complex values; abs() gives their magnitudes.
    return torch.linalg.eigvals(A.float()).abs().max().item()
# Print the spectral radius of each model's injection matrix; the message
# notes that the value is expected to be below 1.
print(f"\nρ(A) MLA: {spectral_radius(model_mla):.4f}   (have to be < 1)")
print(f"ρ(A) GQA: {spectral_radius(model_gqa):.4f}   (have to be < 1)")

# Random token ids in [0, vocab_size) with size (2, 16). torch.randint takes
# the target device via the `device=` keyword (the original `system=` keyword
# is not accepted and raises TypeError).
# NOTE(review): `system` is defined outside this block; presumably the torch
# device — confirm against the setup code.
ids = torch.randint(0, cfg_mla.vocab_size, (2, 16), device=system)

# Run a forward pass and a short generation with gradient tracking disabled.
with torch.no_grad():
    logits = model_mla(ids, n_loops=4)
    gen = model_mla.generate(ids, max_new_tokens=4, n_loops=8)

# Tensors expose `.shape` (there is no `.form` attribute).
print(f"\nForward logits shape:  {tuple(logits.shape)}")
print(f"Generation shape:      {tuple(gen.shape)}")

Construct Recurrent-Depth Transformers with OpenMythos for MLA, GQA, Sparse MoE, and Loop-Scaled Reasoning

SQL vs Pandas vs AI Agents: Which Solves Analytics Problems Best?

The Download: your stake in OpenAI, and the Treasury’s AI warning

Tencent Releases Hy3: An Open 295B Mixture-of-Experts (MoE) Model with 21B Active Parameters and 256K Context

LEAVE A REPLY Cancel reply

Most Popular

SN64 is available for trading!

Securitize (SECZ), BlackRock’s tokenization partner, slides 40% after SPAC debut

How To Trade ‘End Of Day’ Price Action Strategies at New York Close » Learn To Trade The Market

SQL vs Pandas vs AI Agents: Which Solves Analytics Problems Best?

Recent Comments

ABOUT US

POPULAR POSTS

SN64 is available for trading!

Securitize (SECZ), BlackRock’s tokenization partner, slides 40% after SPAC debut

How To Trade ‘End Of Day’ Price Action Strategies at New York Close » Learn To Trade The Market

POPULAR CATEGORY