jaepil committed on
Commit
14dbc69
·
verified ·
1 Parent(s): 071829a

Upload configuration_cognica_poe.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. configuration_cognica_poe.py +87 -0
configuration_cognica_poe.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Cognica-PoE configuration class (HF transformers PretrainedConfig subclass).
2
+
3
+ Mirrors the `GPTConfig` dataclass inside the nanochat GPT implementation while
4
+ exposing the canonical HF field names so the model loads via
5
+ `AutoModelForCausalLM.from_pretrained(..., trust_remote_code=True)`.
6
+ """
7
+
8
from typing import Optional

from transformers import PretrainedConfig
9
+
10
+
11
class CognicaPoEConfig(PretrainedConfig):
    """Configuration for Cognica-PoE models (``model_type = "cognica_poe"``).

    Every constructor argument is stored as an attribute of the same name,
    with two exceptions:

    * ``tie_word_embeddings`` is forwarded to ``PretrainedConfig.__init__``
      together with any extra ``**kwargs``.
    * ``stage_training`` is shallow-copied (``None`` becomes ``{}``) so the
      config never aliases a dict owned by the caller.

    Args:
        hidden_size, intermediate_size, num_hidden_layers,
        num_attention_heads, num_key_value_heads, head_dim,
        max_position_embeddings, vocab_size, hidden_act, rms_norm_eps,
        rope_theta, use_cache: architecture hyperparameters exposed under
            the canonical HF field names.
        padded_vocab_size: stored as-is; presumably the embedding-table size
            after padding -- confirm against the modeling code.
        window_pattern: stored as-is; presumably a per-layer attention-window
            pattern string -- confirm against the modeling code.
        tie_word_embeddings: passed through to the base class.
        poe_mode, poe_every, poe_alpha, poe_head_count: PoE-specific
            metadata (training-time, no effect at inference).
        base_model_name_or_path: HF repo id of the parent model, or ``None``
            when this config describes the base model itself.
        new_layers: number of layers this stage adds on top of its parent.
        frozen_layers: index boundary at which the parent's layers end.
        dual_head: whether this stage carries an additive ``lm_head_stage``.
        stage_depth: depth of this stage in the cascade (0 = base).
        stage_training: optional dict of hyperparameters used to train this
            stage.
        **kwargs: forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "cognica_poe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        hidden_size: int = 1536,
        intermediate_size: int = 6144,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 12,
        num_key_value_heads: int = 12,
        head_dim: int = 128,
        max_position_embeddings: int = 2048,
        vocab_size: int = 32768,
        padded_vocab_size: int = 32768,
        hidden_act: str = "relu_squared",
        rms_norm_eps: float = 1e-6,
        rope_theta: float = 100000.0,
        tie_word_embeddings: bool = False,
        window_pattern: str = "SSSL",
        use_cache: bool = True,
        poe_mode: str = "flat",
        poe_every: int = 6,
        poe_alpha: float = 0.0,
        poe_head_count: int = 4,
        base_model_name_or_path: Optional[str] = None,
        new_layers: int = 0,
        frozen_layers: int = 0,
        dual_head: bool = False,
        stage_depth: int = 0,
        stage_training: Optional[dict] = None,
        **kwargs,
    ):
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.max_position_embeddings = max_position_embeddings
        self.vocab_size = vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_act = hidden_act
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.window_pattern = window_pattern
        self.use_cache = use_cache

        # PoE-specific metadata (training-time, no effect at inference)
        self.poe_mode = poe_mode
        self.poe_every = poe_every
        self.poe_alpha = poe_alpha
        self.poe_head_count = poe_head_count

        # Stage extension metadata (paper Section 8.8 Elastic Depth + 6.5 Dual-Head).
        #   base_model_name_or_path: HF repo id of the parent model (another stage, or the base).
        #     - None: this IS the base model (leaf of the cascade).
        #     - str:  this is a stage repo; `from_pretrained` will cascade-load the parent first.
        #   new_layers: number of layers this stage adds on top of its parent.
        #     - 0 at base leaf.
        #   frozen_layers: index boundary at which the parent's layers end.
        #     - At a stage with N_parent + N_new layers, frozen_layers = N_parent.
        #   dual_head: if True, this stage carries an additive specialist `lm_head_stage`
        #     that is summed with the frozen base `lm_head` at the final projection.
        #   stage_depth: how deep this stage is in the cascade (0 = base, 1 = first SFT, 2 = stacked, ...).
        #   stage_training: optional dict of hyperparameters used to train this stage.
        self.base_model_name_or_path = base_model_name_or_path
        self.new_layers = new_layers
        self.frozen_layers = frozen_layers
        self.dual_head = dual_head
        self.stage_depth = stage_depth
        # Shallow copy: later mutation of the caller's dict must not silently
        # change this config (and vice versa).
        self.stage_training = dict(stage_training) if stage_training is not None else {}

        # Model-specific attributes are set before delegating to the base
        # class, which receives only tie_word_embeddings and the extra kwargs.
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )