def build_model(attn_type: str = "mla", max_loop_iters: int = 8) -> tuple:
    """Build a small OpenMythos model and return it with its config.

    Two attention variants are supported:
      MLA — Multi-Latent Attention (compressed KV cache, DeepSeek-V2 style)
      GQA — Grouped-Query Attention (fewer KV heads than Q heads)

    Args:
        attn_type: ``"gqa"`` selects the GQA settings; every other value
            falls through to the MLA settings.
        max_loop_iters: Forwarded unchanged to ``MythosConfig``.

    Returns:
        A ``(model, config)`` pair. The model has been moved with
        ``.to(system)``; ``system`` is defined outside this block
        (presumably the torch device — confirm against its definition).
    """
    # Settings shared by both attention variants.
    settings = {
        "vocab_size": 64,
        "dim": 128,
        "n_heads": 4,
        "max_seq_len": 32,
        "max_loop_iters": max_loop_iters,
        "prelude_layers": 1,
        "coda_layers": 1,
        "n_experts": 4,
        "n_shared_experts": 1,
        "n_experts_per_tok": 2,
        "expert_dim": 64,
        "lora_rank": 8,
        "attn_type": attn_type,
    }

    if attn_type == "gqa":
        # GQA: half as many KV heads as query heads.
        settings["n_kv_heads"] = 2
    else:
        # MLA: full KV head count plus the low-rank / per-head dimensions.
        settings.update(
            n_kv_heads=4,
            kv_lora_rank=32,
            q_lora_rank=32,
            qk_rope_head_dim=16,
            qk_nope_head_dim=16,
            v_head_dim=16,
        )

    config = MythosConfig(**settings)
    net = OpenMythos(config).to(system)
    return net, config
# Instantiate one model per attention variant so the two can be compared below.
model_mla, cfg_mla = build_model(attn_type="mla")
model_gqa, cfg_gqa = build_model(attn_type="gqa")
def n_params(m):
    """Return the total element count over everything ``m.parameters()`` yields."""
    total = 0
    for param in m.parameters():
        total += param.numel()
    return total
# Report each variant's parameter count, right-aligned in a 10-character
# field with thousands separators. The leading "\n" separates this section
# from earlier output (the backslash had been lost, printing a stray "n").
print(f"\n[MLA] params: {n_params(model_mla):>10,}")
print(f"[GQA] params: {n_params(model_gqa):>10,}")
def spectral_radius(mannequin):
    """Return the spectral radius of the model's recurrent injection matrix.

    Args:
        mannequin: The model; it must expose
            ``recurrent.injection.get_A()`` returning a tensor.

    Returns:
        The largest absolute value among the entries of ``A`` when ``A`` is
        1-D (the vector is treated as a diagonal), otherwise the largest
        eigenvalue magnitude of ``A``, as a Python float.
    """
    # Detach and move to CPU so the computation never touches autograd
    # or the accelerator.
    A = mannequin.recurrent.injection.get_A().detach().cpu()
    if A.dim() == 1:
        # For a diagonal matrix the eigenvalues are the diagonal entries.
        return A.abs().max().item()
    # eigvals returns complex values; .abs() yields their magnitudes.
    return torch.linalg.eigvals(A.float()).abs().max().item()
# Print ρ(A) for both variants to four decimals. The leading "\n" starts a
# new output section (the backslash had been lost, printing a stray "n").
print(f"\nρ(A) MLA: {spectral_radius(model_mla):.4f} (have to be < 1)")
print(f"ρ(A) GQA: {spectral_radius(model_gqa):.4f} (have to be < 1)")
# Smoke test on the MLA model: one forward pass and one short generation
# over a random batch of 2 sequences with 16 token ids each.
# torch.randint takes the target as `device=`; `system` is the name this
# file uses for that target (defined outside this block).
ids = torch.randint(0, cfg_mla.vocab_size, (2, 16), device=system)
with torch.no_grad():  # inference only, no gradients needed
    logits = model_mla(ids, n_loops=4)
    gen = model_mla.generate(ids, max_new_tokens=4, n_loops=8)
# Tensors expose their dimensions via `.shape`.
print(f"\nForward logits shape: {tuple(logits.shape)}")
print(f"Generation shape: {tuple(gen.shape)}")
# Construct Recurrent-Depth Transformers with OpenMythos for MLA, GQA, Sparse MoE, and Loop-Scaled Reasoning
# RELATED ARTICLES
