cdpearlman commited on
Commit
f478beb
·
1 Parent(s): 7fa8fb4

New models chosen and rag docs updated

Browse files
components/model_selector.py CHANGED
@@ -10,22 +10,22 @@ from dash import html, dcc
10
 
11
  # Available models organized by family
12
  AVAILABLE_MODELS = [
13
- # LLaMA-like models (Qwen)
14
- {"label": "Qwen2.5-0.5B", "value": "Qwen/Qwen2.5-0.5B"},
15
- # {"label": "Qwen2.5-1.5B", "value": "Qwen/Qwen2.5-1.5B"},
16
-
17
- # GPT-2 family
18
- {"label": "GPT-2 (124M)", "value": "gpt2"}
19
- # {"label": "GPT-2 Medium (355M)", "value": "gpt2-medium"},
20
- # {"label": "GPT-2 Large (774M)", "value": "gpt2-large"},
21
-
22
- # # OPT family
23
- # {"label": "OPT-125M", "value": "facebook/opt-125m"},
24
- # {"label": "OPT-350M", "value": "facebook/opt-350m"},
25
-
26
- # # GPT-NeoX family (Pythia)
27
- # {"label": "Pythia-70M", "value": "EleutherAI/pythia-70m"},
28
- # {"label": "Pythia-160M", "value": "EleutherAI/pythia-160m"},
29
  ]
30
 
31
  def create_model_selector():
 
10
 
11
  # Available models organized by family
12
  AVAILABLE_MODELS = [
13
+ # GPT-2 family (OpenAI) — absolute positional encoding, LayerNorm, GELU
14
+ {"label": "GPT-2 (124M)", "value": "gpt2"},
15
+ {"label": "GPT-2 Medium (355M)", "value": "gpt2-medium"},
16
+
17
+ # GPT-Neo (EleutherAI) — absolute PE, LayerNorm, GELU
18
+ {"label": "GPT-Neo 125M", "value": "EleutherAI/gpt-neo-125M"},
19
+
20
+ # Pythia (EleutherAI) — rotary PE, LayerNorm, GELU, parallel attn+MLP
21
+ {"label": "Pythia-160M", "value": "EleutherAI/pythia-160m"},
22
+ {"label": "Pythia-410M", "value": "EleutherAI/pythia-410m"},
23
+
24
+ # OPT (Meta) — absolute PE, LayerNorm, ReLU activation
25
+ {"label": "OPT-125M", "value": "facebook/opt-125m"},
26
+
27
+ # Qwen2.5 (Alibaba) — rotary PE, RMSNorm, SiLU activation
28
+ {"label": "Qwen2.5-0.5B (494M)", "value": "Qwen/Qwen2.5-0.5B"},
29
  ]
30
 
31
  def create_model_selector():
rag_docs/README.md CHANGED
@@ -37,8 +37,10 @@ This folder contains documents used by the AI chatbot for Retrieval-Augmented Ge
37
 
38
  ### Category 3: Model-Specific Documentation
39
  - `gpt2_overview.md` - GPT-2 architecture, why it's a good starter, variants
40
- - `llama_overview.md` - LLaMA/Qwen/Mistral architecture, RoPE, GQA differences
41
- - `opt_overview.md` - OPT architecture, comparison with GPT-2
 
 
42
 
43
  ### Category 4: Guided Experiments (Step-by-Step)
44
  - `experiment_first_analysis.md` - Your first analysis with GPT-2
 
37
 
38
  ### Category 3: Model-Specific Documentation
39
  - `gpt2_overview.md` - GPT-2 architecture, why it's a good starter, variants
40
+ - `gpt_neo_overview.md` - GPT-Neo architecture, local attention, comparison with GPT-2
41
+ - `pythia_overview.md` - Pythia architecture, RoPE, parallel attn+MLP, interpretability focus
42
+ - `opt_overview.md` - OPT architecture, ReLU activation, comparison with GPT-2
43
+ - `qwen_overview.md` - Qwen2.5 (LLaMA-like) architecture, RMSNorm, SiLU, GQA
44
 
45
  ### Category 4: Guided Experiments (Step-by-Step)
46
  - `experiment_first_analysis.md` - Your first analysis with GPT-2
rag_docs/embeddings_cache.json DELETED
The diff for this file is too large to render. See raw diff
 
rag_docs/gpt2_overview.md CHANGED
@@ -39,14 +39,14 @@ When analyzing GPT-2, you'll typically see:
39
 
40
  ## GPT-2 Variants
41
 
42
- The dashboard supports all GPT-2 sizes, though only the small variant is in the default dropdown:
43
 
44
  - **GPT-2 Small** (124M params, 12 layers) -- in dropdown as "GPT-2 (124M)"
45
- - **GPT-2 Medium** (355M params, 24 layers) -- enter `gpt2-medium` in the dropdown
46
  - **GPT-2 Large** (774M params, 36 layers) -- enter `gpt2-large`
47
  - **GPT-2 XL** (1.5B params, 48 layers) -- enter `gpt2-xl`
48
 
49
- Larger variants have more layers and heads but use more memory and are slower.
50
 
51
  ## HuggingFace Model IDs
52
 
 
39
 
40
  ## GPT-2 Variants
41
 
42
+ The dashboard includes GPT-2 Small and Medium in the dropdown. Larger variants can be loaded by typing the model ID:
43
 
44
  - **GPT-2 Small** (124M params, 12 layers) -- in dropdown as "GPT-2 (124M)"
45
+ - **GPT-2 Medium** (355M params, 24 layers) -- in dropdown as "GPT-2 Medium (355M)"
46
  - **GPT-2 Large** (774M params, 36 layers) -- enter `gpt2-large`
47
  - **GPT-2 XL** (1.5B params, 48 layers) -- enter `gpt2-xl`
48
 
49
+ Comparing GPT-2 Small and Medium is a great way to see how attention heads specialize as models scale up within the same architecture.
50
 
51
  ## HuggingFace Model IDs
52
 
rag_docs/gpt_neo_overview.md ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPT-Neo Overview
2
+
3
+ ## What Is GPT-Neo?
4
+
5
+ GPT-Neo is a family of open-source language models created by EleutherAI in 2021. It was one of the first serious open-source alternatives to OpenAI's GPT-3, designed as an open replication of the GPT architecture. GPT-Neo is the predecessor to EleutherAI's later Pythia suite.
6
+
7
+ ## Architecture Details
8
+
9
+ | Property | GPT-Neo 125M |
10
+ |----------|-------------|
11
+ | Parameters | 85M (actual) |
12
+ | Layers | 12 |
13
+ | Attention Heads | 12 per layer |
14
+ | Hidden Dimension | 768 |
15
+ | Vocabulary Size | 50,257 |
16
+ | Positional Encoding | Learned absolute |
17
+ | Normalization | LayerNorm |
18
+ | Activation Function | GELU |
19
+
20
+ ## Key Architectural Feature: Local Attention
21
+
22
+ GPT-Neo's most distinctive feature is its **alternating local and global attention** pattern:
23
+ - **Even-indexed layers (0, 2, 4, …)**: Use standard global attention (each token can attend to all previous tokens)
24
+ - **Odd-indexed layers (1, 3, 5, …)**: Use local attention with a window of 256 tokens (each token only attends to nearby tokens)
25
+
26
+ This alternating pattern is a significant architectural difference from GPT-2 (which uses global attention in every layer) and creates interesting attention visualization patterns in the dashboard.
27
+
28
+ ## Why GPT-Neo Is Useful for Learning
29
+
30
+ - **Same size as GPT-2**: At 125M parameters with 12 layers and 12 heads, it's directly comparable to GPT-2
31
+ - **Same positional encoding**: Uses learned absolute positions like GPT-2, so attention pattern differences are due to architecture, not PE
32
+ - **Different attention pattern**: The local attention in alternating layers creates visually distinct attention maps — great for understanding how attention scope affects behavior
33
+ - **Bridge to Pythia**: GPT-Neo → GPT-NeoX → Pythia is an evolutionary chain. Comparing GPT-Neo (absolute PE) with Pythia (rotary PE) shows the effect of positional encoding
34
+
35
+ ## Comparing GPT-Neo and GPT-2
36
+
37
+ | Feature | GPT-2 (124M) | GPT-Neo 125M |
38
+ |---------|-------------|--------------|
39
+ | Layers × Heads | 12 × 12 | 12 × 12 |
40
+ | Hidden Dim | 768 | 768 |
41
+ | Attention | All global | Alternating local/global |
42
+ | PE Type | Learned absolute | Learned absolute |
43
+ | Training Data | WebText | The Pile |
44
+ | Creator | OpenAI | EleutherAI |
45
+
46
+ ## HuggingFace Model IDs
47
+
48
+ - `EleutherAI/gpt-neo-125M` (in dropdown)
49
+ - `EleutherAI/gpt-neo-1.3B`, `EleutherAI/gpt-neo-2.7B` (larger, enter manually)
rag_docs/llama_overview.md DELETED
@@ -1,46 +0,0 @@
1
- # LLaMA Overview
2
-
3
- ## What Is LLaMA?
4
-
5
- LLaMA (Large Language Model Meta AI) is a family of open-weight language models developed by Meta. First released in 2023, LLaMA models introduced several architectural improvements over GPT-2 and became the foundation for many other models (Mistral, Qwen, etc.). In the dashboard, models labeled "LLaMA-like" share this architecture.
6
-
7
- ## Architectural Differences from GPT-2
8
-
9
- LLaMA models use several key innovations:
10
-
11
- ### RoPE (Rotary Position Embeddings)
12
- Instead of GPT-2's learned absolute position embeddings, LLaMA uses **Rotary Position Embeddings (RoPE)**. RoPE encodes position information by rotating the query and key vectors in attention. This means:
13
- - The model can generalize better to different sequence lengths
14
- - Position information is baked into the attention computation itself
15
- - Attention patterns may look different from GPT-2 because of how positions are encoded
16
-
17
- ### RMSNorm Instead of LayerNorm
18
- LLaMA uses **RMSNorm** (Root Mean Square Normalization) instead of the standard LayerNorm used in GPT-2. RMSNorm is simpler and slightly faster -- it only normalizes the magnitude of the vectors without centering them first.
19
-
20
- ### SiLU Activation
21
- Where GPT-2 uses GELU activation in the MLP, LLaMA uses **SiLU** (Sigmoid Linear Unit, also called "Swish"). This is a smooth activation function that tends to produce slightly different MLP behavior.
22
-
23
- ### Grouped-Query Attention (GQA)
24
- Larger LLaMA variants use **Grouped-Query Attention**, where multiple query heads share the same key and value heads. This reduces memory usage and speeds up inference without significantly hurting quality. This means the number of key/value heads may be smaller than the number of query heads.
25
-
26
- ## Models Using LLaMA Architecture
27
-
28
- The dashboard's "llama_like" family includes:
29
- - **Meta LLaMA**: LLaMA 2 (7B, 13B, 70B), LLaMA 3 (1B, 3B, 8B, 70B)
30
- - **Qwen**: Qwen2, Qwen2.5 (0.5B to 72B) -- available in the dashboard dropdown as "Qwen2.5-0.5B"
31
- - **Mistral**: Mistral-7B, Mixtral-8x7B
32
-
33
- ## What to Expect in the Dashboard
34
-
35
- When using a LLaMA-like model (such as Qwen2.5-0.5B):
36
-
37
- - **More layers and heads**: Even the small Qwen2.5-0.5B has 24 layers and 14 heads, compared to GPT-2's 12 layers and 12 heads
38
- - **Different attention patterns**: RoPE-based attention may show different positional patterns compared to GPT-2
39
- - **Different tokenizer**: LLaMA-family models use a different BPE vocabulary, so the same text may tokenize differently
40
- - **Comparing with GPT-2**: Running the same prompt on both GPT-2 and a LLaMA-like model is a great way to see how architecture affects predictions
41
-
42
- ## HuggingFace Model IDs
43
-
44
- - `Qwen/Qwen2.5-0.5B` (in default dropdown)
45
- - `meta-llama/Llama-3.2-1B`, `meta-llama/Llama-3.1-8B`
46
- - `mistralai/Mistral-7B-v0.3`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
rag_docs/model_selector_guide.md CHANGED
@@ -2,39 +2,43 @@
2
 
3
  ## How to Choose a Model
4
 
5
- The dashboard supports several transformer model families. You can select a model from the dropdown menu in the generator section at the top of the page.
6
 
7
  ### Available Models
8
 
9
- Currently, the dashboard offers:
 
 
 
 
 
 
 
 
10
 
11
- - **GPT-2 (124M)**: OpenAI's GPT-2 small model. 12 layers, 12 attention heads, 768-dimensional embeddings. This is the best model to start with -- it's small, fast, and well-studied.
12
- - **Qwen2.5-0.5B**: A LLaMA-like model from Alibaba's Qwen family. 24 layers, 14 attention heads, 896-dimensional embeddings. Slightly larger and uses different architectural features (RoPE, SiLU activation).
13
 
14
- You can also enter a custom **HuggingFace model ID** in the dropdown (type it in). The dashboard supports GPT-2, LLaMA, OPT, GPT-NeoX, BLOOM, Falcon, and MPT model families.
 
 
 
 
 
15
 
16
  ### What Happens When You Load a Model
17
 
18
  1. The model is downloaded from HuggingFace (this may take a moment the first time)
19
  2. The dashboard **auto-detects** the model's architecture family
20
  3. Internal hooks are automatically configured to capture attention patterns, MLP activations, and other data
21
- 4. The layer and head dropdowns in the sidebar and ablation panel are populated based on the model's structure
22
-
23
- ### Auto-Detection
24
-
25
- The dashboard has a registry that maps model names to their architecture family. When it recognizes a model, it automatically configures:
26
- - Which internal modules to hook for attention capture
27
- - Which normalization parameters to track
28
- - The correct patterns for extracting layer outputs
29
-
30
- If you enter an unknown model, the sidebar's configuration dropdowns may need manual adjustment.
31
 
32
  ### Tips for Choosing
33
 
34
  - **Start with GPT-2**: It's small, fast, and the most widely studied. Most educational resources reference GPT-2.
35
- - **Try Qwen2.5-0.5B for comparison**: It uses a different architecture (LLaMA-style). Comparing results between GPT-2 and Qwen can highlight how architectural differences affect attention patterns.
36
- - **Larger models are slower**: Models with more parameters take longer to load and analyze. Stick to small models for interactive exploration.
37
- - **Memory matters**: Larger models require more RAM. If the dashboard becomes unresponsive, try a smaller model.
 
38
 
39
  ### Generation Settings
40
 
 
2
 
3
  ## How to Choose a Model
4
 
5
+ The dashboard supports seven transformer models from five architecture families. Select a model from the dropdown menu in the generator section at the top of the page.
6
 
7
  ### Available Models
8
 
9
+ | Model | Family | Params | Layers × Heads | Key Feature |
10
+ |-------|--------|--------|----------------|-------------|
11
+ | **GPT-2 (124M)** | GPT-2 | 85M | 12 × 12 | The MI classic — start here |
12
+ | **GPT-2 Medium (355M)** | GPT-2 | 302M | 24 × 16 | Scale comparison within GPT-2 |
13
+ | **GPT-Neo 125M** | GPT-Neo | 85M | 12 × 12 | Local attention in alternating layers |
14
+ | **Pythia-160M** | Pythia | 85M | 12 × 12 | Rotary PE, parallel attn+MLP |
15
+ | **Pythia-410M** | Pythia | 302M | 24 × 16 | Larger Pythia for scale comparison |
16
+ | **OPT-125M** | OPT | 85M | 12 × 12 | ReLU activation (unique contrast) |
17
+ | **Qwen2.5-0.5B (494M)** | Qwen2 | 391M | 24 × 14 | Modern: RMSNorm, SiLU, rotary PE |
18
 
19
+ ### Architecture Comparisons
 
20
 
21
+ These models were chosen to highlight specific architectural differences:
22
+
23
+ - **Positional encoding**: GPT-2, GPT-Neo, and OPT use absolute positions. Pythia and Qwen use rotary (RoPE). Comparing the same prompt across both types shows how PE affects attention patterns.
24
+ - **Activation function**: Most models use GELU, but OPT uses ReLU and Qwen uses SiLU. This affects MLP behavior.
25
+ - **Normalization**: GPT-2, Neo, Pythia, and OPT use LayerNorm. Qwen uses RMSNorm.
26
+ - **Attention scope**: GPT-Neo alternates between local (256-token window) and global attention, unlike all other models.
27
 
28
  ### What Happens When You Load a Model
29
 
30
  1. The model is downloaded from HuggingFace (this may take a moment the first time)
31
  2. The dashboard **auto-detects** the model's architecture family
32
  3. Internal hooks are automatically configured to capture attention patterns, MLP activations, and other data
33
+ 4. The layer and head dropdowns are populated based on the model's structure
 
 
 
 
 
 
 
 
 
34
 
35
  ### Tips for Choosing
36
 
37
  - **Start with GPT-2**: It's small, fast, and the most widely studied. Most educational resources reference GPT-2.
38
+ - **Compare same-size models**: GPT-2, GPT-Neo 125M, Pythia-160M, and OPT-125M all have 12 layers and 12 heads — differences in their attention patterns come from architecture, not scale.
39
+ - **Compare scale**: GPT-2 vs GPT-2 Medium (or Pythia-160M vs Pythia-410M) shows how more layers and heads change behavior.
40
+ - **Try Qwen2.5 for a modern perspective**: It uses an entirely different design philosophy from GPT-2.
41
+ - **Memory matters**: All dropdown models are small enough for interactive exploration. Larger models can be entered manually but may be slow.
42
 
43
  ### Generation Settings
44
 
rag_docs/opt_overview.md CHANGED
@@ -18,7 +18,8 @@ OPT's architecture is close to GPT-2 but has some differences:
18
 
19
  ### Key Differences from GPT-2
20
 
21
- - **Learned positional embeddings**: Like GPT-2, OPT uses learned absolute position embeddings (unlike LLaMA's RoPE)
 
22
  - **LayerNorm placement**: OPT uses pre-norm LayerNorm (applied before each sublayer), which is slightly different from GPT-2's original arrangement
23
  - **Larger variants available**: OPT scales up to 175 billion parameters, though only smaller variants are practical for interactive use
24
 
@@ -36,13 +37,9 @@ When using OPT models:
36
  - **OPT-125M is very similar to GPT-2**: Same number of layers (12), heads (12), and hidden dimension (768). You'll see similar attention patterns and predictions.
37
  - **Different module paths**: The dashboard auto-detects OPT's internal structure (e.g., `model.decoder.layers.N.self_attn`), so hooking works automatically.
38
  - **Tokenization**: OPT's tokenizer is very similar to GPT-2's, so the same text usually produces similar (but not identical) token sequences.
39
- - **Good for comparison**: Running the same prompt on GPT-2 and OPT-125M can show how similar architectures with different training data produce different predictions.
40
 
41
  ## HuggingFace Model IDs
42
 
43
- - `facebook/opt-125m`
44
- - `facebook/opt-350m`
45
- - `facebook/opt-1.3b`
46
- - `facebook/opt-2.7b`
47
-
48
- Note: OPT models are not in the default dropdown but can be loaded by typing the model ID directly.
 
18
 
19
  ### Key Differences from GPT-2
20
 
21
+ - **ReLU activation**: OPT uses **ReLU** instead of GPT-2's GELU. This is the only model in the dashboard with ReLU, making it useful for comparing how activation functions affect MLP behavior.
22
+ - **Learned positional embeddings**: Like GPT-2, OPT uses learned absolute position embeddings (unlike Pythia's or Qwen's RoPE)
23
  - **LayerNorm placement**: OPT uses pre-norm LayerNorm (applied before each sublayer), which is slightly different from GPT-2's original arrangement
24
  - **Larger variants available**: OPT scales up to 175 billion parameters, though only smaller variants are practical for interactive use
25
 
 
37
  - **OPT-125M is very similar to GPT-2**: Same number of layers (12), heads (12), and hidden dimension (768). You'll see similar attention patterns and predictions.
38
  - **Different module paths**: The dashboard auto-detects OPT's internal structure (e.g., `model.decoder.layers.N.self_attn`), so hooking works automatically.
39
  - **Tokenization**: OPT's tokenizer is very similar to GPT-2's, so the same text usually produces similar (but not identical) token sequences.
40
+ - **Good for comparison**: Running the same prompt on GPT-2 and OPT-125M can show how similar architectures with different training data and activation functions produce different predictions.
41
 
42
  ## HuggingFace Model IDs
43
 
44
+ - `facebook/opt-125m` (in dropdown)
45
+ - `facebook/opt-350m`, `facebook/opt-1.3b` (larger, enter manually)
 
 
 
 
rag_docs/pythia_overview.md ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pythia Overview
2
+
3
+ ## What Is Pythia?
4
+
5
+ Pythia is a suite of language models created by EleutherAI specifically to support interpretability research. Released in 2023, the Pythia models were trained on the Pile dataset (with a parallel "-deduped" suite trained on a deduplicated version), with fully public training data and checkpoints, making them uniquely transparent.
6
+
7
+ ## Architecture Details
8
+
9
+ | Property | Pythia-160M | Pythia-410M |
10
+ |----------|-------------|-------------|
11
+ | Parameters | 85M (actual) | 302M (actual) |
12
+ | Layers | 12 | 24 |
13
+ | Attention Heads | 12 per layer | 16 per layer |
14
+ | Hidden Dimension | 768 | 1024 |
15
+ | Vocabulary Size | 50,304 | 50,304 |
16
+ | Positional Encoding | Rotary (RoPE) | Rotary (RoPE) |
17
+ | Normalization | LayerNorm | LayerNorm |
18
+ | Activation Function | GELU | GELU |
19
+ | Parallel Attn+MLP | Yes | Yes |
20
+
21
+ ## Key Architectural Features
22
+
23
+ ### Rotary Position Embeddings (RoPE)
24
+ Unlike GPT-2's learned absolute positions, Pythia uses **Rotary Position Embeddings**. RoPE encodes position by rotating the query and key vectors, which means:
25
+ - Position information is baked into the attention computation
26
+ - The model can potentially generalize to longer sequences
27
+ - Attention patterns may look different from GPT-2
28
+
29
+ ### Parallel Attention + MLP
30
+ Pythia computes attention and MLP in **parallel** within each layer (rather than sequentially). This is a design choice that slightly changes how information flows through the network.
31
+
32
+ ### GPT-NeoX Architecture
33
+ Pythia uses the GPT-NeoX architecture (`GPTNeoXForCausalLM`), which is EleutherAI's evolution of GPT-Neo. The module paths use `gpt_neox.layers.{N}.attention` instead of GPT-2's `transformer.h.{N}.attn`.
34
+
35
+ ## Why Pythia Matters for Interpretability
36
+
37
+ - **Open training data**: The Pile dataset is fully public, so you can trace what the model learned
38
+ - **Training checkpoints**: Hundreds of checkpoints are available, letting researchers study how features develop during training
39
+ - **Designed for research**: EleutherAI specifically built Pythia to be a good subject for mechanistic interpretability
40
+ - **Comparison with GPT-Neo**: Same creator but different architecture (rotary vs absolute PE), enabling clean ablation studies of positional encoding
41
+
42
+ ## Comparing Pythia and GPT-2
43
+
44
+ Running the same prompt on both is highly educational:
45
+ - **Same size, different PE**: Pythia-160M and GPT-2 both have 12 layers and 12 heads, but use different positional encodings
46
+ - **Parallel vs sequential**: Pythia processes attention and MLP in parallel; GPT-2 does them sequentially
47
+ - **Different training data**: GPT-2 was trained on WebText; Pythia on The Pile
48
+
49
+ ## HuggingFace Model IDs
50
+
51
+ - `EleutherAI/pythia-160m` (in dropdown)
52
+ - `EleutherAI/pythia-410m` (in dropdown)
53
+ - `EleutherAI/pythia-1b`, `EleutherAI/pythia-1.4b` (larger, enter manually)
rag_docs/qwen_overview.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Qwen2.5 Overview
2
+
3
+ ## What Is Qwen2.5?
4
+
5
+ Qwen2.5 is a family of language models developed by Alibaba's Qwen team, released in 2024. It uses a modern LLaMA-like architecture with several improvements over older models like GPT-2. The dashboard includes **Qwen2.5-0.5B** as a representative of this modern architecture.
6
+
7
+ ## Architecture Details
8
+
9
+ | Property | Qwen2.5-0.5B |
10
+ |----------|-------------|
11
+ | Parameters | 494M |
12
+ | Layers | 24 |
13
+ | Attention Heads | 14 per layer |
14
+ | Hidden Dimension | 896 |
15
+ | Vocabulary Size | 151,936 |
16
+ | Positional Encoding | Rotary (RoPE) |
17
+ | Normalization | RMSNorm |
18
+ | Activation Function | SiLU |
19
+
20
+ ## Key Architectural Differences from GPT-2
21
+
22
+ ### RoPE (Rotary Position Embeddings)
23
+ Instead of GPT-2's learned absolute position embeddings, Qwen uses **Rotary Position Embeddings (RoPE)**. RoPE encodes position by rotating query and key vectors in attention. This means:
24
+ - Better generalization to different sequence lengths
25
+ - Position information baked into the attention computation
26
+ - Attention patterns may look different from GPT-2
27
+
28
+ ### RMSNorm Instead of LayerNorm
29
+ Qwen uses **RMSNorm** (Root Mean Square Normalization) instead of standard LayerNorm. RMSNorm only normalizes vector magnitude without centering, making it simpler and slightly faster.
30
+
31
+ ### SiLU Activation
32
+ Where GPT-2 uses GELU in the MLP, Qwen uses **SiLU** (Sigmoid Linear Unit, also called "Swish"), a smooth activation function common in modern architectures.
33
+
34
+ ### Grouped-Query Attention (GQA)
35
+ Qwen uses **Grouped-Query Attention**, where multiple query heads share key and value heads. This reduces memory usage while maintaining quality. In the attention visualization, you may notice this affects how attention patterns are structured.
36
+
37
+ ## Why Include Qwen2.5?
38
+
39
+ - **Modern architecture**: Represents the latest generation of LLM design (post-LLaMA innovations)
40
+ - **Contrast with GPT-2**: Every major architectural choice is different — PE type, normalization, activation, attention variant
41
+ - **Good size**: 494M parameters with 24 layers provides enough depth for interesting head specialization while staying fast
42
+ - **Open access**: No gated access or license agreement required (unlike LLaMA or Gemma)
43
+
44
+ ## What to Expect in the Dashboard
45
+
46
+ When using Qwen2.5-0.5B:
47
+ - **More layers**: 24 layers vs GPT-2's 12 means more attention patterns to explore
48
+ - **Different tokenizer**: Qwen has a much larger vocabulary (152K vs 50K), so the same text tokenizes differently
49
+ - **Different attention patterns**: RoPE-based attention creates different positional patterns from GPT-2
50
+ - **Comparing with GPT-2**: Running the same prompt on both is the best way to see how architecture affects predictions and attention
51
+
52
+ ## HuggingFace Model IDs
53
+
54
+ - `Qwen/Qwen2.5-0.5B` (in dropdown)
55
+ - `Qwen/Qwen2.5-1.5B`, `Qwen/Qwen2.5-3B` (larger, enter manually)
rag_docs/recommended_starting_points.md CHANGED
@@ -55,10 +55,14 @@ If you're new to the dashboard, follow this path:
55
  5. **Experiment: Comparing Heads** -- Systematically compare head categories
56
  6. **Experiment: Beam Search** -- Explore alternative generation paths
57
 
58
- ## After the Basics
59
 
60
- Once you've completed the guided experiments:
61
- - **Compare models**: Run the same prompt on GPT-2 and Qwen2.5-0.5B to see architectural differences
 
 
 
 
 
62
  - **Try longer prompts**: See how attention patterns change with more context
63
  - **Combine techniques**: Use attribution to find important tokens, then ablate heads to find the components that process those tokens
64
- - **Explore edge cases**: Try prompts in other languages, code snippets, or mathematical expressions
 
55
  5. **Experiment: Comparing Heads** -- Systematically compare head categories
56
  6. **Experiment: Beam Search** -- Explore alternative generation paths
57
 
58
+ ## After the Basics: Cross-Model Comparisons
59
 
60
+ Once you've completed the guided experiments, try comparing models to see how architecture affects behavior:
61
+
62
+ - **GPT-2 vs GPT-Neo 125M**: Same size and PE type, but GPT-Neo alternates local/global attention — see how attention scope matters
63
+ - **GPT-2 vs Pythia-160M**: Same size but different positional encoding (absolute vs rotary) — see how RoPE changes attention patterns
64
+ - **GPT-2 vs OPT-125M**: Same size but OPT uses ReLU instead of GELU — compare MLP behavior
65
+ - **GPT-2 vs GPT-2 Medium**: Same architecture at different scales — see how head specialization changes with more layers
66
+ - **Pythia-160M vs Qwen2.5-0.5B**: Both use rotary PE but different normalization (LayerNorm vs RMSNorm) and activation (GELU vs SiLU)
67
  - **Try longer prompts**: See how attention patterns change with more context
68
  - **Combine techniques**: Use attribution to find important tokens, then ablate heads to find the components that process those tokens
 
scripts/analyze_heads.py CHANGED
@@ -48,6 +48,7 @@ HF_TO_TL_NAME = {
48
  "openai-community/gpt2-large": "gpt2-large",
49
  "gpt2-xl": "gpt2-xl",
50
  "openai-community/gpt2-xl": "gpt2-xl",
 
51
  "EleutherAI/pythia-70m": "pythia-70m",
52
  "EleutherAI/pythia-160m": "pythia-160m",
53
  "EleutherAI/pythia-410m": "pythia-410m",
@@ -56,6 +57,7 @@ HF_TO_TL_NAME = {
56
  "facebook/opt-125m": "opt-125m",
57
  "facebook/opt-350m": "opt-350m",
58
  "facebook/opt-1.3b": "opt-1.3b",
 
59
  }
60
 
61
  # Default models to analyze
@@ -64,10 +66,11 @@ DEFAULT_MODELS = ["gpt2"]
64
  ALL_PRIORITY_MODELS = [
65
  "gpt2",
66
  "gpt2-medium",
67
- "EleutherAI/pythia-70m",
68
  "EleutherAI/pythia-160m",
69
  "EleutherAI/pythia-410m",
70
  "facebook/opt-125m",
 
71
  ]
72
 
73
  # ============================================================================
 
48
  "openai-community/gpt2-large": "gpt2-large",
49
  "gpt2-xl": "gpt2-xl",
50
  "openai-community/gpt2-xl": "gpt2-xl",
51
+ "EleutherAI/gpt-neo-125M": "gpt-neo-125M",
52
  "EleutherAI/pythia-70m": "pythia-70m",
53
  "EleutherAI/pythia-160m": "pythia-160m",
54
  "EleutherAI/pythia-410m": "pythia-410m",
 
57
  "facebook/opt-125m": "opt-125m",
58
  "facebook/opt-350m": "opt-350m",
59
  "facebook/opt-1.3b": "opt-1.3b",
60
+ "Qwen/Qwen2.5-0.5B": "qwen2.5-0.5b",
61
  }
62
 
63
  # Default models to analyze
 
66
  ALL_PRIORITY_MODELS = [
67
  "gpt2",
68
  "gpt2-medium",
69
+ "EleutherAI/gpt-neo-125M",
70
  "EleutherAI/pythia-160m",
71
  "EleutherAI/pythia-410m",
72
  "facebook/opt-125m",
73
+ "Qwen/Qwen2.5-0.5B",
74
  ]
75
 
76
  # ============================================================================
utils/head_categories.json CHANGED
The diff for this file is too large to render. See raw diff
 
utils/model_config.py CHANGED
@@ -34,6 +34,18 @@ MODEL_FAMILIES: Dict[str, Dict[str, Any]] = {
34
  "norm_type": "layernorm",
35
  },
36
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  # OPT
38
  "opt": {
39
  "description": "OPT architecture",
@@ -143,7 +155,12 @@ MODEL_TO_FAMILY: Dict[str, str] = {
143
  "facebook/opt-13b": "opt",
144
  "facebook/opt-30b": "opt",
145
 
146
- # GPT-NeoX models
 
 
 
 
 
147
  "EleutherAI/gpt-neox-20b": "gpt_neox",
148
  "EleutherAI/pythia-70m": "gpt_neox",
149
  "EleutherAI/pythia-160m": "gpt_neox",
 
34
  "norm_type": "layernorm",
35
  },
36
 
37
+ # GPT-Neo (EleutherAI) — similar to GPT-2 but with local attention
38
+ "gpt_neo": {
39
+ "description": "GPT-Neo architecture (EleutherAI)",
40
+ "templates": {
41
+ "attention_pattern": "transformer.h.{N}.attn.attention",
42
+ "mlp_pattern": "transformer.h.{N}.mlp",
43
+ "block_pattern": "transformer.h.{N}",
44
+ },
45
+ "norm_parameter": "transformer.ln_f.weight",
46
+ "norm_type": "layernorm",
47
+ },
48
+
49
  # OPT
50
  "opt": {
51
  "description": "OPT architecture",
 
155
  "facebook/opt-13b": "opt",
156
  "facebook/opt-30b": "opt",
157
 
158
+ # GPT-Neo models (EleutherAI)
159
+ "EleutherAI/gpt-neo-125M": "gpt_neo",
160
+ "EleutherAI/gpt-neo-1.3B": "gpt_neo",
161
+ "EleutherAI/gpt-neo-2.7B": "gpt_neo",
162
+
163
+ # GPT-NeoX / Pythia models (EleutherAI)
164
  "EleutherAI/gpt-neox-20b": "gpt_neox",
165
  "EleutherAI/pythia-70m": "gpt_neox",
166
  "EleutherAI/pythia-160m": "gpt_neox",