"""Configuration: emotions, topics, prompts, and model settings from Anthropic's appendix.""" import os BASE_DIR = os.path.dirname(os.path.abspath(__file__)) ROOT_DIR = os.path.dirname(BASE_DIR) DATA_DIR = os.path.join(BASE_DIR, "data") RESULTS_DIR = os.path.join(BASE_DIR, "results") # --- Parse Anthropic's appendix files --- def _load_lines(filename): path = os.path.join(ROOT_DIR, filename) with open(path, "r", encoding="utf-8") as f: return f.read().strip() # 171 emotions from Anthropic's paper appendix _raw_emotions = _load_lines("anthropic_emotions.txt") EMOTIONS = [e.strip() for e in _raw_emotions.split(",") if e.strip()] # 100 topics from Anthropic's paper appendix _raw_topics = _load_lines("anthropic_topics.txt") TOPICS = [t.strip() for t in _raw_topics.split("\n") if t.strip()] # Story generation prompt (emotion word must NOT appear in stories) STORY_PROMPT = _load_lines("anthropic_prompt.txt") # Neutral dialogue prompt (from anthropic_all.txt lines 253-355) NEUTRAL_PROMPT = """Write {n_stories} different dialogues based on the following topic. Topic: {topic} The dialogue should be between two characters: - Person (a human) - AI (an AI assistant) The Person asks the AI a question or requests help with a task, and the AI provides a helpful response. The first speaker turn should always be from Person. Format the dialogues like so: [optional system instructions] Person: [line] AI: [line] Person: [line] AI: [line] [continue for 2-6 exchanges] [dialogue 2] etc. IMPORTANT: Always put a blank line before each speaker turn. Each turn should start with "Person:" or "AI:" on its own line after a blank line. Generate a diverse mix of dialogue types across the {n_stories} examples: - Some, but not all should include a system prompt at the start. These should come before the first Person turn. No tag like "System:" is needed, just put the instructions at the top. You can use "you" or "The assistant" to refer to the AI in the system prompt. - Some should be about code or programming tasks - Some should be factual questions (science, history, math, geography) - Some should be work-related tasks (writing, analysis, summarization) - Some should be practical how-to questions - Some should be creative but neutral tasks (brainstorming names, generating lists) - If it's natural to do so given the topic, it's ok for the dialogue to be a single back and forth (Person asks a question, AI answers), but at least some should have multiple exchanges. CRITICAL REQUIREMENT: These dialogues must be completely neutral and emotionless. - NO emotional content whatsoever - not explicit, not implied, not subtle - The Person should not express any feelings (no frustration, excitement, gratitude, worry, etc.) - The AI should not express any feelings (no enthusiasm, concern, satisfaction, etc.) - The system prompt, if present, should not mention emotions at all, nor contain any emotionally charged language - Avoid emotionally-charged topics entirely - Use matter-of-fact, neutral language throughout - No pleasantries (avoid "I'd be happy to help", "Great question!", etc.) - Focus purely on information exchange and task completion""" # --- Model configs --- MODELS = { "e4b": { "model_id": "google/gemma-4-E4B-it", "quantization": None, "num_layers": 42, "hidden_dim": 2560, }, "31b": { "model_id": "google/gemma-4-31B-it", "quantization": "4bit", "num_layers": 60, "hidden_dim": 5376, }, } # --- Extraction settings --- START_TOKEN = 50 DENOISING_VARIANCE_THRESHOLD = 0.5 N_STORIES_PER_PROMPT = 12 N_NEUTRAL_PER_TOPIC = 12 def get_extraction_layers(model_key): """Return list of layers to extract from.""" cfg = MODELS[model_key] n = cfg["num_layers"] target = int(n * 2 / 3) # Every 5th layer + the 2/3 depth point layers = list(range(5, n, 5)) if target not in layers: layers.append(target) layers.sort() return layers def get_results_dir(model_key): name = f"gemma4-{model_key}" return os.path.join(RESULTS_DIR, name)