"""
benchmark_field_tests.py  v2
============================
Four suites targeting what a Hebbian attractor field uniquely enables.
NOW INCLUDES FieldOnly ablation and proper statistical testing.

Suite 7  — Associative inference   (multi-hop reasoning across separate facts)
Suite 8  — Pattern extraction      (implicit rule from N examples)
Suite 9  — Degraded cue            (paraphrase robustness — THE KEY RESULT)
Suite 10 — Knowledge update        (contradiction/supersession handling)

v2 CHANGES:
  NEW MODEL: FieldOnly — Hebbian field + soft prefix, no episodic store.
    This is the critical ablation for Suite 9.  If FieldOnly beats RAGBaseline
    on Suite 9, the field's distributed attractor priming is a real effect.
    If not, the episodic store is sufficient and the field is noise.

  STATISTICS: 20 trials default (was 3), bootstrap 95% CI, Mann-Whitney U test,
    rank-biserial correlation r as effect size.  3 trials with binary outcomes
    (std=0.462) is statistically meaningless.  Reviewers will reject on this.

  INTERPRETATIONS printed inline: every suite summary explains what each
    comparison result means for the paper claim.

Run:
  python benchmark_field_tests.py                    # all suites, 20 trials
  python benchmark_field_tests.py --trials 30        # paper-quality stats
  python benchmark_field_tests.py --no-context       # skip ContextBaseline
  python benchmark_field_tests.py --suite 9          # Suite 9 only
  python benchmark_field_tests.py --suite 9 --trials 30 --no-context
"""

import argparse, os, sys, time, statistics, traceback, json
from dataclasses import dataclass
from pathlib import Path

import torch
import numpy as np

# Set before `app` is imported below (note: `torch` is already imported above).
# NOTE(review): presumably read by transformers at import time — confirm.
os.environ["TRANSFORMERS_DISABLE_FLASH_ATTN"] = "1"

# SciPy is optional.  Without it StatEngine.mann_whitney returns
# (None, None, None) and the significance-test sections are skipped.
try:
    from scipy import stats as scipy_stats
    HAS_SCIPY = True
except ImportError:
    HAS_SCIPY = False
    print("[warn] scipy not found. Mann-Whitney disabled. pip install scipy")

# The model classes under test come from the project-local `app` module.
# If that import fails the message is printed and the process exits with code 1.
try:
    from app import HybridLLM, RAGBaseline, ContextBaseline, FieldOnly
except ImportError as e:
    print(f"[error] {e}"); sys.exit(1)

# Back-fill HybridLLM.reset_world when the imported class does not define one.
# Only HybridLLM is patched; the suites call reset_world() on every model, so
# the other model classes have to provide their own.
if not hasattr(HybridLLM, "reset_world"):
    def _rw(self):
        """Clear the model's world state, episodic store (if any) and call counter."""
        world = self.world

        # Core field state: reset in place, then restart the step counter and
        # blank every memory slot.
        world.S.zero_()
        world.strength.zero_()
        world.step_count = 0
        world.memories = [""] * world.n

        # Optional attributes — only touched when the object actually has them.
        if hasattr(world, "thresholds"):
            world.thresholds.fill_(0.5)
        if hasattr(world, "protected"):
            world.protected.fill_(False)
        if hasattr(self, "episodes"):
            self.episodes.clear()

        self.call_count = 0

    HybridLLM.reset_world = _rw

# Default number of trials per (test, model) pair.
# NOTE: this value is captured as the default of the `trials` parameter of
# run_suite7/8/9 when those functions are defined, so rebinding TRIALS later
# does not change their defaults — pass trials= explicitly instead.
TRIALS = 20  # default; override with --trials


# =============================================================================
#  STATISTICAL ENGINE (same as benchmark.py — copy kept for standalone use)
# =============================================================================

class StatEngine:
    """
    Bootstrap CI + Mann-Whitney U for benchmark score lists.
    See benchmark.py StatEngine docstring for full rationale.

    Used purely as a namespace: every method is a classmethod or staticmethod.
    """
    N_BOOT = 2000   # number of bootstrap resamples per interval
    CI     = 0.95   # two-sided confidence level of the interval

    @classmethod
    def bootstrap_ci(cls, scores):
        """
        Return ``(mean, lo, hi)`` as floats: the sample mean and a percentile
        bootstrap confidence interval for the mean.

        With fewer than two scores no interval can be estimated, so the single
        value (or 0.0 for an empty input) is returned for all three fields.
        The resampling RNG is seeded with a fixed value, so the same scores
        always give the same interval.
        """
        n = len(scores)
        if n < 2:
            # Convert so callers always get floats, as in the main path.
            v = float(scores[0]) if n else 0.0
            return v, v, v
        arr = np.array(scores, dtype=float)
        rng = np.random.default_rng(seed=42)
        boot_means = [
            np.mean(rng.choice(arr, len(arr), replace=True))
            for _ in range(cls.N_BOOT)
        ]
        lo = np.percentile(boot_means, (1-cls.CI)/2*100)
        hi = np.percentile(boot_means, (1+cls.CI)/2*100)
        return float(np.mean(arr)), float(lo), float(hi)

    @classmethod
    def mann_whitney(cls, a, b):
        """
        Two-sided Mann-Whitney U test of ``a`` against ``b``.

        Returns ``(U, p, r)`` where ``r = 1 - 2U / (len(a) * len(b))`` is the
        rank-biserial effect size, or ``(None, None, None)`` when SciPy is
        missing, either sample has fewer than 3 values, or SciPy raises.
        """
        if not HAS_SCIPY or len(a) < 3 or len(b) < 3:
            return None, None, None
        try:
            u, p = scipy_stats.mannwhitneyu(a, b, alternative="two-sided")
            # NOTE(review): the sign of r depends on which sample's U SciPy
            # returns.  Recent SciPy documents it as U for the first sample,
            # under which r is negative when `a` tends to exceed `b`.  Every
            # caller in this file uses only |r|.
            r = 1.0 - (2.0 * u) / (len(a) * len(b))
            return float(u), float(p), float(r)
        except Exception:
            # Deliberate best-effort: a failed test is reported as "n/a"
            # rather than aborting the whole benchmark run.
            return None, None, None

    @staticmethod
    def sig_label(p):
        """Map a p-value to a star label; ``None`` (no test) maps to "n/a"."""
        if p is None: return "n/a"
        if p < 0.001: return "***"
        if p < 0.01:  return "**"
        if p < 0.05:  return "*"
        return "ns"

    @staticmethod
    def effect_label(r):
        """Map |r| to trivial/small/medium/large; ``None`` maps to ""."""
        if r is None: return ""
        a = abs(r)
        if a >= 0.5: return "large"
        if a >= 0.3: return "medium"
        if a >= 0.1: return "small"
        return "trivial"

    @classmethod
    def compare_line(cls, name_a, name_b, scores_a, scores_b, suffix=""):
        """
        Format one printable comparison line for two score lists.

        The line shows the difference of means (a - b) with a direction
        marker, the significance label and the effect size, followed by
        ``suffix``.  When the Mann-Whitney test could not be run the effect
        size is printed as ``r=n/a`` instead of a misleading ``r=0.00``.
        """
        mean_a, _, _ = cls.bootstrap_ci(scores_a)
        mean_b, _, _ = cls.bootstrap_ci(scores_b)
        _, p, r = cls.mann_whitney(scores_a, scores_b)
        d = mean_a - mean_b
        sig = cls.sig_label(p)
        if r is None:
            effect = "r=n/a"
        else:
            effect = f"r={abs(r):.2f}({cls.effect_label(r)})"
        # A zero difference is neither an increase nor a decrease.
        if d > 0:
            direction = "↑"
        elif d < 0:
            direction = "↓"
        else:
            direction = "="
        return (f"    {name_a:16s} vs {name_b:16s}  "
                f"Δ={d:+.4f}{direction}  {sig:4s}  {effect}{suffix}")


# =============================================================================
#  HELPERS
# =============================================================================

def kw(response, required, any_of=None, penalty=None):
    """
    Keyword score in [0, 1] for a model response.

    Matching is case-insensitive substring containment.  The base score is
    the fraction of `required` keywords found (0.0 when `required` is empty).
    A hit on any `any_of` keyword adds 0.2 (capped at 1.0); afterwards a hit
    on any `penalty` keyword subtracts 0.4 (floored at 0.0).  The result is
    rounded to 4 decimals.
    """
    haystack = response.lower()

    def present(keyword):
        return keyword.lower() in haystack

    matched = len([k for k in required if present(k)])
    if required:
        score = matched / len(required)
    else:
        score = 0.0

    if any_of and any(map(present, any_of)):
        score = min(1.0, score + 0.2)
    if penalty and any(map(present, penalty)):
        score = max(0.0, score - 0.4)
    return round(score, 4)

def extract(text):
    """
    Return the stripped text after the last "assistant\\n" marker, or the
    whole text stripped when the marker does not occur.
    """
    _, marker, reply = text.rpartition("assistant\n")
    if marker:
        return reply.strip()
    return text.strip()

def run(model, text, max_tokens=120):
    """
    Generate a reply from `model` and time the call.

    Calls ``model.generate(text, max_new_tokens=max_tokens)`` and passes the
    raw output through ``extract``.

    Returns ``(answer, seconds)``; seconds is rounded to 3 decimals on both
    the success and the error path.  Any exception raised by ``generate`` is
    caught on purpose so a single failed generation does not abort a whole
    benchmark run: the answer is then the string ``"[ERR:<message>]"``, which
    callers score like any other answer.
    """
    t0 = time.perf_counter()
    try:
        raw = model.generate(text, max_new_tokens=max_tokens)
    except Exception as e:
        return f"[ERR:{e}]", round(time.perf_counter() - t0, 3)
    return extract(raw), round(time.perf_counter() - t0, 3)

def bar(s, w=14):
    """
    Render score `s` as a text bar exactly `w` characters wide.

    `s` is expected in [0, 1]; the filled length is ``int(s * w)``, clamped to
    ``0..w`` so an out-of-range score cannot produce a bar that is wider than
    `w` and breaks the column layout.
    """
    f = max(0, min(w, int(s * w)))
    return "█" * f + "░" * (w - f)

def smry(label, suite, model_names, ablation_key=None):
    """
    Print a suite summary: per-model mean, standard deviation and bootstrap CI,
    followed (when SciPy is available) by significance-test lines.

    label:        heading prefix, e.g. "Suite 7".
    suite:        dict of model name -> list of scores; models with no scores
                  are left out of the summary.
    model_names:  names to report, in print order.
    ablation_key: accepted but not used; the FieldOnly-vs-RAGBaseline line is
                  printed whenever both models have scores.
    """
    print(f"\n  {label} summary:")

    # Collect the non-empty score lists, printing one stats line per model.
    all_scores = {}
    for model_name in model_names:
        samples = suite.get(model_name, [])
        if not samples:
            continue
        mean, lo, hi = StatEngine.bootstrap_ci(samples)
        # stdev needs at least two data points.
        std = 0.0 if len(samples) <= 1 else statistics.stdev(samples)
        all_scores[model_name] = samples
        print(f"    [{model_name:16s}] mean={mean:.4f} ± {std:.4f}  "
              f"95%CI[{lo:.4f},{hi:.4f}]")

    # Statistical comparisons need both data and SciPy.
    if not (all_scores and HAS_SCIPY):
        return

    print(f"\n  {label} significance tests:")

    # HybridLLM against every other model that produced scores.
    if "HybridLLM" in all_scores:
        hybrid_scores = all_scores["HybridLLM"]
        for other in model_names:
            if other == "HybridLLM" or other not in all_scores:
                continue
            print(StatEngine.compare_line("HybridLLM", other,
                                          hybrid_scores, all_scores[other]))

    # The ablation comparison: field alone against retrieval alone.
    if "FieldOnly" in all_scores and "RAGBaseline" in all_scores:
        note = "  ← ABLATION KEY: field vs store"
        print(StatEngine.compare_line("FieldOnly", "RAGBaseline",
                                      all_scores["FieldOnly"],
                                      all_scores["RAGBaseline"],
                                      suffix=note))


# =============================================================================
#  SUITE 7 — ASSOCIATIVE INFERENCE
# =============================================================================

# Test cases for run_suite7.  Keys of each dict:
#   "facts"    — statements taught one at a time via model.teach() after a reset
#   "fillers"  — unrelated prompts run (30-token cap) between teaching and asking
#   "question" — the query; answering it needs the facts combined
#   "required" — keywords scored by kw(): fraction found, case-insensitive substring
#   "any_of"   — bonus keywords; a hit on any one adds 0.2 (capped at 1.0)
#   "note"     — label printed as the test heading
ASSOC_TESTS = [
    {
        "facts":   [
            "Dr. Elena Vasquez leads project HELIOS.",
            "Project HELIOS is classified at clearance level CRIMSON.",
        ],
        "fillers": ["What is REST?", "Explain DNS.", "What is a semaphore?"],
        "question":"What clearance level does Dr. Elena Vasquez work at?",
        "required":["crimson"], "any_of":["elena","helios"],
        "note": "Person→project→clearance (2-hop)",
    },
    {
        "facts":   [
            "The ORION-DB cluster has a replication lag budget of 50 milliseconds.",
            "The SLA for read operations requires responses under 20ms.",
        ],
        "fillers": ["What is a bloom filter?","Explain eventual consistency."],
        "question":"Given the replication lag and the read SLA, is there a potential SLA violation? Explain.",
        "required":["50","20"], "any_of":["violation","exceed","lag"],
        "note": "DB lag + SLA → violation (2-hop numeric)",
    },
    {
        "facts":   [
            "Engineer Riku Tanaka owns the payment gateway service.",
            "The payment gateway had a P1 incident on 2025-03-14.",
            "P1 incidents require a post-mortem within 48 hours.",
        ],
        "fillers": ["What is CAP theorem?","What is a saga pattern?","Explain WAL.","What is DHT?"],
        "question":"Who must write the post-mortem for the March 14 incident, and what is the time constraint?",
        "required":["riku","48"], "any_of":["tanaka","payment","post-mortem"],
        "note": "Person→service→incident→SLA (3-hop)",
    },
    {
        "facts":   [
            "The cache layer sits between the API gateway and the database.",
            "Cache TTL is set to 300 seconds.",
            "User profile data changes on average every 120 seconds.",
        ],
        "fillers": ["What is backpressure?","Describe circuit breakers."],
        "question":"Given the cache TTL and how often user profiles change, will users sometimes see stale data? Why?",
        "required":["300","120"], "any_of":["stale","outdated","ttl","cache"],
        "note": "Cache TTL vs update rate → staleness (2-hop)",
    },
    {
        "facts":   [
            "Service auth-api is a dependency of service orders-api.",
            "Service orders-api is a dependency of service checkout-frontend.",
        ],
        "fillers": ["Explain Kubernetes pod scheduling.","What is a service mesh?"],
        "question":"If auth-api goes down, which services are affected? List them.",
        "required":["orders-api","checkout-frontend"], "any_of":["auth","downstream","cascade"],
        "note": "Dependency chain → blast radius (2-hop transitive)",
    },
    {
        "facts":   [
            "The EU data residency policy requires all PII to be stored in eu-west-1.",
            "Customer email addresses are classified as PII.",
            "The current email storage bucket is in us-east-1.",
        ],
        "fillers": ["What is GDPR?","Explain data sovereignty.","What is a VPC?"],
        "question":"Is the current email storage setup compliant with the EU data residency policy? Why or why not?",
        # NOTE(review): kw() matches substrings, so "compliant" is also hit by
        # "non-compliant"; the any_of bonus does not distinguish the two answers.
        "required":["eu-west-1","us-east-1"], "any_of":["non-compliant","violation","pii","compliant"],
        "note": "Policy + classification + location → compliance gap (3-hop GDPR)",
    },
]

def run_suite7(models, trials=TRIALS):
    """
    Suite 7: teach each test's facts separately, run the filler prompts, then
    ask a question that needs the facts combined.

    For every (test, model) pair this runs `trials` trials, each starting from
    model.reset_world().  One result line is printed per pair and a suite
    summary (over all individual trial scores) is printed via smry().

    models: dict of display name -> model object.
    Returns a dict of model name -> list holding one mean score per test.
    """
    banner = "═"*64
    print("\n" + banner)
    print("  SUITE 7 — ASSOCIATIVE INFERENCE")
    print("  Multi-hop reasoning across separately taught facts.")
    print("  RAG injects all retrieved facts; field binds co-activated regions.")
    print("  FieldOnly: tests whether field alone can bridge the hops.")
    print(banner)

    model_names = list(models.keys())
    suite = {}        # per-test means, returned to the caller
    all_scores = {}   # every single trial score, fed to the summary
    for n in model_names:
        suite[n] = []
        all_scores[n] = []

    for case in ASSOC_TESTS:
        print(f"\n  [{case['note']}]")
        for name, model in models.items():
            trial_scores = []
            for _trial in range(trials):
                model.reset_world()
                for fact in case["facts"]:
                    model.teach(fact)
                # Filler answers are discarded; they only sit between
                # teaching and the real question.
                for filler in case["fillers"]:
                    run(model, filler, max_tokens=30)
                answer, _elapsed = run(model, case["question"], max_tokens=150)
                trial_scores.append(
                    kw(answer, case["required"], case.get("any_of", [])))
            all_scores[name] += trial_scores

            mean, lo, hi = StatEngine.bootstrap_ci(trial_scores)
            suite[name].append(mean)
            # Pass/fail marks for the first five trials only.
            first5 = []
            for s in trial_scores[:5]:
                first5.append('✓' if s >= 0.5 else '✗')
            print(f"    [{name:16s}] {bar(mean)} {mean:.3f} 95%CI[{lo:.3f},{hi:.3f}]  "
                  f"first5={first5}")

    smry("Suite 7", all_scores, model_names)
    return suite


# =============================================================================
#  SUITE 8 — PATTERN EXTRACTION
# =============================================================================

# Test cases for run_suite8.  Keys of each dict:
#   "examples" — statements taught one at a time via model.teach() after a reset
#   "question" — a new case the model has to answer from the examples
#   "required" — keywords scored by kw(): fraction found, case-insensitive substring
#   "any_of"   — bonus keywords; a hit on any one adds 0.2 (capped at 1.0)
#   "note"     — label printed as the test heading
PATTERN_TESTS = [
    {
        "examples": [
            "Service auth-service: tier=critical, on-call=24/7.",
            "Service payment-service: tier=critical, on-call=24/7.",
            "Service logging-service: tier=standard, on-call=business-hours.",
            "Service analytics-service: tier=standard, on-call=business-hours.",
            "Service cache-service: tier=critical, on-call=24/7.",
            "Service monitoring-service: tier=standard, on-call=business-hours.",
        ],
        "question":"A new service notification-service has tier=critical. What on-call rotation should it have?",
        "required":["24/7"], "any_of":["critical","always"],
        "note": "tier→rotation (6 examples)",
    },
    {
        "examples": [
            "Error E-401: severity=warn, auto-retry=yes, max-retries=3.",
            "Error E-403: severity=warn, auto-retry=yes, max-retries=3.",
            "Error E-500: severity=critical, auto-retry=no, escalate=true.",
            "Error E-503: severity=critical, auto-retry=no, escalate=true.",
            "Error E-404: severity=warn, auto-retry=yes, max-retries=3.",
        ],
        "question":"A new error E-502 has severity=critical. Should it auto-retry? Answer yes or no and explain.",
        # NOTE(review): kw() matches substrings, so required=["no"] is also
        # satisfied by words such as "not" or "know" — a "yes" answer can score 1.0.
        "required":["no"], "any_of":["escalate","critical","should not"],
        "note": "severity→retry (5 examples)",
    },
    {
        "examples": [
            "Region eu-west-1: residency=EU, encryption=AES-256, audit-log=enabled.",
            "Region eu-central-1: residency=EU, encryption=AES-256, audit-log=enabled.",
            "Region us-east-1: residency=US, encryption=AES-128, audit-log=disabled.",
            "Region us-west-2: residency=US, encryption=AES-128, audit-log=disabled.",
            "Region ap-southeast-1: residency=APAC, encryption=AES-128, audit-log=disabled.",
        ],
        "question":"A new region eu-north-1 is added with residency=EU. What encryption and audit-log settings should it have?",
        "required":["aes-256","enabled"], "any_of":["eu","256","audit"],
        "note": "region residency→config (5 examples, GDPR-flavour)",
    },
    {
        "examples": [
            "Incident INC-001: priority=P1, SLA=1 hour, escalate-to=CTO.",
            "Incident INC-002: priority=P2, SLA=4 hours, escalate-to=VP-Engineering.",
            "Incident INC-003: priority=P1, SLA=1 hour, escalate-to=CTO.",
            "Incident INC-004: priority=P3, SLA=24 hours, escalate-to=Team-Lead.",
            "Incident INC-005: priority=P2, SLA=4 hours, escalate-to=VP-Engineering.",
            "Incident INC-006: priority=P3, SLA=24 hours, escalate-to=Team-Lead.",
        ],
        "question":"A new incident INC-007 comes in with priority=P1. What is the SLA and who should it escalate to?",
        "required":["1 hour","cto"], "any_of":["p1","escalate","sla"],
        "note": "priority→SLA+escalation (6 examples, incident management)",
    },
    {
        "examples": [
            "Country DE: GDPR=yes, right-to-erasure=30-days, DPA=BfDI.",
            "Country FR: GDPR=yes, right-to-erasure=30-days, DPA=CNIL.",
            "Country US: GDPR=no, right-to-erasure=varies, DPA=FTC.",
            "Country AT: GDPR=yes, right-to-erasure=30-days, DPA=DSB.",
            "Country JP: GDPR=no, right-to-erasure=varies, DPA=PPC.",
        ],
        "question":"Country NL is added with GDPR=yes. What right-to-erasure deadline and which regulatory body type applies?",
        "required":["30"], "any_of":["30-days","dpa","gdpr","erasure"],
        "note": "GDPR membership→deadline+regulator (5 examples, bofrost-relevant)",
    },
]

def run_suite8(models, trials=TRIALS):
    """
    Suite 8: teach the examples of an implicit rule, then ask about a new case.

    For every (test, model) pair this runs `trials` trials, each starting from
    model.reset_world().  One result line is printed per pair and a suite
    summary (over all individual trial scores) is printed via smry().

    models: dict of display name -> model object.
    Returns a dict of model name -> list holding one mean score per test.
    """
    banner = "═"*64
    print("\n" + banner)
    print("  SUITE 8 — PATTERN EXTRACTION")
    print("  N examples of an implicit rule; model must generalise to new case.")
    print("  FieldOnly: tests whether repeated Hebbian co-activation builds")
    print("  a stable enough attractor to prime the correct rule.")
    print(banner)

    model_names = list(models.keys())
    suite = {}        # per-test means, returned to the caller
    all_scores = {}   # every single trial score, fed to the summary
    for n in model_names:
        suite[n] = []
        all_scores[n] = []

    for case in PATTERN_TESTS:
        examples = case["examples"]
        print(f"\n  [{case['note']}]  ({len(examples)} examples)")
        for name, model in models.items():
            trial_scores = []
            for _trial in range(trials):
                model.reset_world()
                for example in examples:
                    model.teach(example)
                answer, _elapsed = run(model, case["question"], max_tokens=100)
                trial_scores.append(
                    kw(answer, case["required"], case.get("any_of", [])))
            all_scores[name] += trial_scores

            mean, lo, hi = StatEngine.bootstrap_ci(trial_scores)
            suite[name].append(mean)
            # Pass/fail marks for the first five trials only.
            first5 = []
            for s in trial_scores[:5]:
                first5.append('✓' if s >= 0.5 else '✗')
            print(f"    [{name:16s}] {bar(mean)} {mean:.3f} 95%CI[{lo:.3f},{hi:.3f}]  "
                  f"first5={first5}")

    smry("Suite 8", all_scores, model_names)
    return suite


# =============================================================================
#  SUITE 9 — DEGRADED CUE COMPLETION  (24 tests, 5 levels)
#
#  THE PRIMARY RESULT FOR THE PAPER.
#
#  WHY THIS SUITE IS THE KEY TEST:
#    Standard RAG retrieves by cosine similarity, which rewards surface token overlap.
#    When the query is paraphrased (different words, same meaning), cosine
#    similarity drops and retrieval fails — even though the LLM "knows" the
#    fact.  Hybrid's Hebbian field, if it works, should prime the LLM toward
#    the correct attractor region regardless of surface form.
#
#  THE ABLATION QUESTION (FieldOnly vs RAGBaseline):
#    FieldOnly has no episodic store.  It can ONLY succeed via the soft prefix
#    generated from the Hebbian attractor state.  If FieldOnly > RAGBaseline
#    on degraded queries, this is direct evidence that attractor priming
#    provides paraphrase robustness INDEPENDENT of retrieval.
#
#  Level 1: synonym swap        — expected: all models handle fine
#  Level 2: structural rewrite  — some divergence expected
#  Level 3: semantic drift      — RAG may start failing
#  Level 4: domain shift        — RAG likely fails; field may help
#  Level 5: adversarial         — hard for all
# =============================================================================

# Test cases for run_suite9.  Keys of each dict:
#   "level"    — difficulty 1-5; run_suite9 only pre-allocates buckets for 1..5
#   "note"     — label printed as the test heading
#   "teach"    — the single fact taught (after a reset) before each query
#   "exact"    — query phrased like the taught fact
#   "degraded" — paraphrased / shifted / adversarial form of the query
#   "required" — keywords scored by kw(); the SAME list scores both the exact
#                and the degraded answer
#   "any_of"   — optional bonus keywords (+0.2), also shared by both queries
# Levels 1, 3, 4 and 5 reuse the same taught facts (level 5 uses four of the
# five); level 2 uses its own set.
DEGRADED_TESTS = [
    # ── Level 1: Synonym swap ─────────────────────────────────────────────────
    {"level":1,"note":"L1 – synonym: master→primary encryption key",
     "teach":"The master encryption key ID is ENC-KEY-2025-ALPHA-7742.",
     "exact":"What is the master encryption key ID?",
     "degraded":"What is the primary encryption key identifier?",
     "required":["enc-key-2025-alpha-7742"]},
    {"level":1,"note":"L1 – synonym: on-call contact→duty engineer",
     "teach":"The on-call contact for the payments cluster is Sofia Berglund.",
     "exact":"Who is the on-call contact for the payments cluster?",
     "degraded":"Who is the duty engineer for the payments cluster?",
     "required":["sofia","berglund"]},
    {"level":1,"note":"L1 – synonym: deployment window→release schedule",
     "teach":"The NEXUS-7 deployment window is every Tuesday 22:00–02:00 UTC.",
     "exact":"When is the NEXUS-7 deployment window?",
     "degraded":"What is the NEXUS-7 release schedule?",
     "required":["tuesday","22"]},
    {"level":1,"note":"L1 – synonym: fallback→secondary server",
     "teach":"The fallback database server hostname is db-fallback-prod-03.internal.",
     "exact":"What is the fallback database server hostname?",
     "degraded":"What is the secondary database server hostname?",
     "required":["db-fallback-prod-03"]},
    {"level":1,"note":"L1 – synonym: rate limit→request cap",
     "teach":"The API rate limit is 500 requests per minute per client.",
     "exact":"What is the API rate limit?",
     "degraded":"What is the API request cap per minute?",
     "required":["500"]},

    # ── Level 2: Structural rewrite ───────────────────────────────────────────
    # NOTE(review): in this test the exact query asks for the person while the
    # degraded query asks for the project, yet both are scored against
    # required=["nexus-7"].  A terse correct exact answer ("Dr. Amara Singh")
    # may therefore earn only the any_of bonus.
    {"level":2,"note":"L2 – rewrite: passive voice",
     "teach":"Dr. Amara Singh leads project NEXUS-7.",
     "exact":"Who leads project NEXUS-7?",
     "degraded":"Which project is led by Dr. Amara Singh?",
     "required":["nexus-7"],"any_of":["amara","singh"]},
    # NOTE(review): the degraded query mentions an "inference backend" that
    # the taught fact never refers to.
    {"level":2,"note":"L2 – rewrite: negation framing",
     "teach":"Server beta handles the load balancer at IP 10.0.0.43.",
     "exact":"What IP does server beta use?",
     "degraded":"Which server does NOT have the inference backend, and what is its IP?",
     "required":["10.0.0.43"],"any_of":["beta","load"]},
    # NOTE(review): here the exact query is the factual one and the degraded
    # query is the yes/no one; a bare "yes" does not contain "3600".
    {"level":2,"note":"L2 – rewrite: question type change (yes/no→factual)",
     "teach":"Tokens expire after exactly 3600 seconds.",
     "exact":"How long until tokens expire?",
     "degraded":"Do tokens expire after one hour?",
     "required":["3600"],"any_of":["yes","hour","one hour"]},
    {"level":2,"note":"L2 – rewrite: time phrasing",
     "teach":"The nightly batch runs at 02:30 UTC.",
     "exact":"When does the nightly batch run?",
     "degraded":"At what time in the early morning does the batch job execute?",
     "required":["02:30"],"any_of":["2:30","utc","batch"]},
    {"level":2,"note":"L2 – rewrite: plural/generalise",
     "teach":"PostgreSQL is the primary database for NEXUS-7 on port 5437.",
     "exact":"What database does NEXUS-7 use?",
     "degraded":"List the data store technologies used by the NEXUS-7 project.",
     "required":["postgresql"],"any_of":["5437","postgres"]},

    # ── Level 3: Semantic drift ───────────────────────────────────────────────
    {"level":3,"note":"L3 – drift: 'backup server' for 'fallback DB'",
     "teach":"The fallback database server hostname is db-fallback-prod-03.internal.",
     "exact":"What is the fallback database server hostname?",
     "degraded":"Our main DB is down. Where is the backup server?",
     "required":["db-fallback-prod-03"]},
    {"level":3,"note":"L3 – drift: 'payment system goes down' for 'payments cluster on-call'",
     "teach":"The on-call contact for the payments cluster is Sofia Berglund.",
     "exact":"Who is the on-call contact for the payments cluster?",
     "degraded":"If the payment system goes down at 3am, who do we call first?",
     "required":["sofia","berglund"]},
    {"level":3,"note":"L3 – drift: 'push code' for 'deployment window'",
     "teach":"The NEXUS-7 deployment window is every Tuesday 22:00–02:00 UTC.",
     "exact":"When is the NEXUS-7 deployment window?",
     "degraded":"What time can we push changes to the NEXUS project on a weekly basis?",
     "required":["tuesday","22"]},
    {"level":3,"note":"L3 – drift: 'access credential' for 'encryption key ID'",
     "teach":"The master encryption key ID is ENC-KEY-2025-ALPHA-7742.",
     "exact":"What is the master encryption key ID?",
     "degraded":"What is the identifier of our main access credential for encryption?",
     "required":["enc-key-2025-alpha-7742"]},
    {"level":3,"note":"L3 – drift: 'throttle setting' for 'rate limit'",
     "teach":"The API rate limit is 500 requests per minute per client.",
     "exact":"What is the API rate limit?",
     "degraded":"How aggressively are clients throttled on our API?",
     "required":["500"],"any_of":["minute","per client","request"]},

    # ── Level 4: Domain shift ─────────────────────────────────────────────────
    {"level":4,"note":"L4 – domain: infra jargon→plain English",
     "teach":"The fallback database server hostname is db-fallback-prod-03.internal.",
     "exact":"What is the fallback database server hostname?",
     "degraded":"We cannot reach our data storage. What is the address of the reserve system?",
     "required":["db-fallback-prod-03"]},
    {"level":4,"note":"L4 – domain: devops→management language",
     "teach":"The NEXUS-7 deployment window is every Tuesday 22:00–02:00 UTC.",
     "exact":"When is the NEXUS-7 deployment window?",
     "degraded":"During which approved maintenance period may the NEXUS-7 product be updated?",
     "required":["tuesday"],"any_of":["22","utc","maintenance"]},
    {"level":4,"note":"L4 – domain: technical→business",
     "teach":"The API rate limit is 500 requests per minute per client.",
     "exact":"What is the API rate limit?",
     "degraded":"How many times per minute can a single customer call our service before being blocked?",
     "required":["500"],"any_of":["minute","customer","client"]},
    {"level":4,"note":"L4 – domain: security term→plain",
     "teach":"The master encryption key ID is ENC-KEY-2025-ALPHA-7742.",
     "exact":"What is the master encryption key ID?",
     "degraded":"What label is used to identify the secret that protects our data at rest?",
     "required":["enc-key-2025-alpha-7742"]},
    {"level":4,"note":"L4 – domain: incident→plain",
     "teach":"The on-call contact for the payments cluster is Sofia Berglund.",
     "exact":"Who is the on-call contact for the payments cluster?",
     "degraded":"Money transactions are failing. Which person at the company should I wake up?",
     "required":["sofia","berglund"]},

    # ── Level 5: Adversarial framing ─────────────────────────────────────────
    {"level":5,"note":"L5 – adversarial: asking what it is NOT",
     "teach":"The fallback database server hostname is db-fallback-prod-03.internal.",
     "exact":"What is the fallback database server hostname?",
     "degraded":"Our main database is db-primary-prod-01. What is the OTHER production database hostname?",
     "required":["db-fallback-prod-03"]},
    {"level":5,"note":"L5 – adversarial: confirm-or-deny phrasing",
     "teach":"The master encryption key ID is ENC-KEY-2025-ALPHA-7742.",
     "exact":"What is the master encryption key ID?",
     "degraded":"Someone told me the encryption key ID is ENC-KEY-2024-BETA-0001. Is that correct? What is the right one?",
     "required":["enc-key-2025-alpha-7742"],"any_of":["no","incorrect","wrong","correct"]},
    {"level":5,"note":"L5 – adversarial: obfuscated entity",
     "teach":"The on-call contact for the payments cluster is Sofia Berglund.",
     "exact":"Who is the on-call contact for the payments cluster?",
     "degraded":"If the revenue-critical transaction processing infrastructure becomes unavailable, who is accountable for the immediate response?",
     "required":["sofia","berglund"]},
    {"level":5,"note":"L5 – adversarial: time pressure + noise",
     "teach":"The NEXUS-7 deployment window is every Tuesday 22:00–02:00 UTC.",
     "exact":"When is the NEXUS-7 deployment window?",
     "degraded":"URGENT: product manager is asking when we are allowed to do the next release for that project — the one that starts with N. What is the exact time slot?",
     "required":["tuesday"],"any_of":["22","utc","02"]},
]


def run_suite9(models, trials=TRIALS):
    """
    Runs each test twice: once with exact query, once with degraded.
    Reports per-level aggregated delta (exact - degraded).
    With v2 stats: bootstrap CI on all per-level scores, Mann-Whitney comparison.

    THE ABLATION RESULT TO WATCH:
      FieldOnly degraded score vs RAGBaseline degraded score.
      If FieldOnly > RAGBaseline on degraded queries: attractor priming works.

    models: dict of display name -> model object.  Every model is driven
      through reset_world(), teach() and (via run()) generate().
    trials: repetitions per (test, model, query form).

    The model is reset and re-taught before the exact query and again before
    the degraded query, so the two answers never share state.
    """
    print("\n" + "═"*64)
    print("  SUITE 9 — DEGRADED CUE COMPLETION  (24 tests, 5 levels)")
    print("  Lower delta = better paraphrase robustness.")
    print()
    print("  ABLATION FOCUS: Compare FieldOnly vs RAGBaseline on degraded scores.")
    print("  FieldOnly can ONLY use soft-prefix; RAGBaseline can ONLY use episodic store.")
    print("  Whichever is higher on degraded queries has the better mechanism for")
    print("  paraphrase robustness.")
    print("═"*64)

    model_names = list(models.keys())
    # Per-level per-model: list of all trial scores across tests in that level
    exact_by_level    = {n:{1:[],2:[],3:[],4:[],5:[]} for n in model_names}
    degraded_by_level = {n:{1:[],2:[],3:[],4:[],5:[]} for n in model_names}

    for test in DEGRADED_TESTS:
        lv = test["level"]
        print(f"\n  [{test['note']}]")
        # Queries are truncated to 62 characters for display only.
        print(f"    exact:    '{test['exact'][:62]}'")
        print(f"    degraded: '{test['degraded'][:62]}'")

        for name, model in models.items():
            ex_scores, deg_scores = [], []
            for _ in range(trials):
                # Exact query
                model.reset_world(); model.teach(test["teach"])
                ans, _ = run(model, test["exact"], max_tokens=80)
                ex_scores.append(kw(ans, test["required"], test.get("any_of")))
                # Degraded query
                model.reset_world(); model.teach(test["teach"])
                ans, _ = run(model, test["degraded"], max_tokens=80)
                deg_scores.append(kw(ans, test["required"], test.get("any_of")))

            ex_m,  ex_lo,  ex_hi  = StatEngine.bootstrap_ci(ex_scores)
            deg_m, deg_lo, deg_hi = StatEngine.bootstrap_ci(deg_scores)
            delta = round(ex_m - deg_m, 3)
            exact_by_level[name][lv].extend(ex_scores)
            degraded_by_level[name][lv].extend(deg_scores)
            # A drop of at most 0.1 from exact to degraded counts as robust.
            tag = "GOOD" if delta <= 0.1 else "degrades"
            print(f"    [{name:16s}] exact={ex_m:.3f}[{ex_lo:.3f},{ex_hi:.3f}]  "
                  f"degraded={deg_m:.3f}[{deg_lo:.3f},{deg_hi:.3f}]  "
                  f"Δ={delta:+.3f}  ← {tag}")

    # Per-level summary table
    print("\n" + "═"*64)
    print("  SUITE 9 — PER-LEVEL SUMMARY  (mean delta, lower is better)")
    print("═"*64)
    col = 16
    print(f"  {'Level / description':<30}" + "".join(f"{n:>{col}}" for n in model_names))
    print("  " + "─"*(30 + col*len(model_names)))
    level_desc = {
        1:"L1 synonym swap    ",
        2:"L2 structural rw   ",
        3:"L3 semantic drift  ",
        4:"L4 domain shift    ",
        5:"L5 adversarial     ",
    }
    for lv in range(1,6):
        row = f"  {level_desc[lv]:<30}"
        for name in model_names:
            ex  = exact_by_level[name][lv]
            deg = degraded_by_level[name][lv]
            if ex and deg:
                ex_m, _, _   = StatEngine.bootstrap_ci(ex)
                deg_m, _, _  = StatEngine.bootstrap_ci(deg)
                delta = round(ex_m - deg_m, 3)
                row += f"{delta:>+{col}.3f}"
            else:
                row += f"{'N/A':>{col}}"
        print(row)

    print("  " + "─"*(30 + col*len(model_names)))
    row = f"  {'OVERALL mean delta':<30}"
    for name in model_names:
        all_ex  = [s for lv in range(1,6) for s in exact_by_level[name][lv]]
        all_deg = [s for lv in range(1,6) for s in degraded_by_level[name][lv]]
        if all_ex and all_deg:
            ex_m, _, _   = StatEngine.bootstrap_ci(all_ex)
            deg_m, _, _  = StatEngine.bootstrap_ci(all_deg)
            delta = round(ex_m - deg_m, 3)
            row += f"{delta:>+{col}.3f}"
        else:
            row += f"{'N/A':>{col}}"
    print(row)

    # Absolute degraded score table
    # NOTE(review): the heading says "with CI" but only the mean is printed.
    print(f"\n  {'Degraded score (abs, with CI)':<30}" + "".join(f"{n:>{col}}" for n in model_names))
    print("  " + "─"*(30 + col*len(model_names)))
    for lv in range(1,6):
        row = f"  {level_desc[lv]:<30}"
        for name in model_names:
            deg = degraded_by_level[name][lv]
            if deg:
                m, _, _ = StatEngine.bootstrap_ci(deg)
                row += f"{m:>{col}.3f}"
            else:
                row += f"{'N/A':>{col}}"
        print(row)

    # THE CRITICAL ABLATION COMPARISON
    print("\n" + "═"*64)
    print("  SUITE 9 — ABLATION SIGNIFICANCE TESTS")
    print("  Comparing degraded-query performance (the hard case)")
    print("═"*64)
    # Pool the degraded scores of all five levels per model.
    all_degraded = {
        name: [s for lv in range(1,6) for s in degraded_by_level[name][lv]]
        for name in model_names
    }
    if HAS_SCIPY:
        if "HybridLLM" in all_degraded:
            for name in model_names:
                if name == "HybridLLM": continue
                print(StatEngine.compare_line(
                    "HybridLLM", name,
                    all_degraded["HybridLLM"], all_degraded.get(name,[])))
        if "FieldOnly" in all_degraded and "RAGBaseline" in all_degraded:
            print()
            print("  KEY ABLATION: FieldOnly vs RAGBaseline on degraded queries")
            _, p, r = StatEngine.mann_whitney(
                all_degraded["FieldOnly"], all_degraded["RAGBaseline"])
            fo_m, fo_lo, fo_hi = StatEngine.bootstrap_ci(all_degraded["FieldOnly"])
            rag_m, rag_lo, rag_hi = StatEngine.bootstrap_ci(all_degraded["RAGBaseline"])
            d = fo_m - rag_m
            sig = StatEngine.sig_label(p)
            eff = StatEngine.effect_label(r)
            print(f"    FieldOnly   degraded: {fo_m:.4f} 95%CI[{fo_lo:.4f},{fo_hi:.4f}]")
            print(f"    RAGBaseline degraded: {rag_m:.4f} 95%CI[{rag_lo:.4f},{rag_hi:.4f}]")
            # mann_whitney returns (None, None, None) when the test cannot be
            # run (too few samples, or SciPy raised); formatting None with
            # ':.4f' would raise TypeError, so render it as "n/a".
            p_txt = "n/a" if p is None else f"{p:.4f}"
            r_txt = "n/a" if r is None else f"{abs(r):.3f}"
            print(f"    Δ = {d:+.4f}  p={p_txt} {sig}  r={r_txt} ({eff})")
            print()
            # Test against None explicitly: `p or 1` would turn a p-value of
            # exactly 0.0 (the strongest possible result) into 1.
            significant = p is not None and p < 0.05
            if p is None:
                print("    INTERPRETATION: significance test unavailable; no claim can be made.")
            elif d > 0 and significant:
                print("    INTERPRETATION: Field adds genuine paraphrase robustness.")
                print("    The Hebbian attractor mechanism contributes INDEPENDENTLY")
                print("    of the episodic store.  This is your publishable claim.")
            else:
                print("    INTERPRETATION: Episodic store sufficient for paraphrase robustness.")
                print("    The field does not add significant value on degraded queries.")
                print("    Reframe paper: the store is the mechanism, the field adds association.")
    else:
        print("  [scipy not installed — install with: pip install scipy]")

    return {"exact": exact_by_level, "degraded": degraded_by_level}


# =============================================================================
#  SUITE 10 — KNOWLEDGE UPDATE / CONTRADICTION
# =============================================================================

# Knowledge-update scenarios consumed by run_suite10.  Each entry carries:
#   old       - the fact taught first (later superseded)
#   update    - the fact taught second, which should win
#   fillers   - unrelated prompts issued between the two teach calls
#   question  - the query asked after the update
#   required  - keywords passed to the scorer as the expected answer terms
#   penalty   - keywords passed to the scorer as penalised (stale) terms
#   note      - short label printed as the test heading
UPDATE_TESTS = [
    {
        "old": "Server alpha IP address is 10.0.0.42.",
        "update": "Server alpha has been migrated. New IP: 10.0.1.99.",
        "fillers": [
            "What is DNS?",
            "Explain load balancing.",
            "What is BGP?",
        ],
        "question": "What is the current IP address of server alpha?",
        "required": ["10.0.1.99"],
        "penalty": ["10.0.0.42"],
        "note": "IP migration",
    },
    {
        "old": "The public API rate limit is 100 requests per minute.",
        "update": "Rate limit updated to 500 requests per minute effective immediately.",
        "fillers": [
            "What is OAuth?",
            "Explain JWT tokens.",
        ],
        "question": "What is the current rate limit for the public API?",
        "required": ["500"],
        "penalty": ["100"],
        "note": "Config value update",
    },
    {
        "old": "Marcus Reyes is the project lead for NEXUS-7.",
        "update": "Dr. Amara Singh has replaced Marcus Reyes as project lead for NEXUS-7.",
        "fillers": [
            "What is agile?",
            "Describe sprint planning.",
            "What is a retrospective?",
        ],
        "question": "Who is the current project lead for NEXUS-7?",
        "required": ["amara", "singh"],
        "penalty": ["marcus reyes"],
        "note": "Personnel replacement (both names in context)",
    },
    {
        "old": "The primary datacentre is located in Frankfurt.",
        "update": "Primary datacentre has moved to Amsterdam following the EU expansion.",
        "fillers": [
            "What is BGP?",
            "Explain anycast routing.",
            "What is a CDN?",
            "Describe edge computing.",
        ],
        "question": "Where is the primary datacentre now located?",
        "required": ["amsterdam"],
        "penalty": ["frankfurt"],
        "note": "Location change",
    },
    {
        "old": "Encryption algorithm in use is AES-128.",
        "update": "Security upgrade complete: encryption now uses AES-256 across all services.",
        "fillers": [
            "What is TLS?",
            "Explain certificate pinning.",
            "What is HSTS?",
        ],
        "question": "What encryption algorithm is currently in use?",
        "required": ["aes-256"],
        "penalty": ["aes-128"],
        "note": "Security setting upgrade",
    },
]

def run_suite10(models, trials=TRIALS):
    """Run Suite 10 (knowledge update / contradiction) for every model.

    For each entry of UPDATE_TESTS and each model, repeats ``trials`` times:
    reset the model, teach the old fact, issue the filler prompts, teach the
    update, ask the question, and score the answer with ``kw`` using the
    entry's required and penalty keywords.

    Args:
        models: mapping of model name -> model object exposing
            ``reset_world()`` and ``teach(text)`` and usable with ``run``.
        trials: number of repetitions per (test, model) pair.

    Returns:
        dict mapping model name -> list with one bootstrap mean per
        UPDATE_TESTS entry, in order.
    """
    rule = "═"*64
    banner = (
        "\n" + rule,
        "  SUITE 10 — KNOWLEDGE UPDATE / CONTRADICTION",
        "  Old fact → update fact. Must answer with NEW value.",
        "  Penalty applied if old (wrong) answer appears.",
        "",
        "  NOTE from v3 results: ALL systems scored poorly here (0.50-0.67).",
        "  ContextBaseline won (!). This is a known weakness of supersession.",
        "  Watch whether FieldOnly behaves differently — no supersession logic.",
        rule,
    )
    for line in banner:
        print(line)

    model_names = list(models)
    per_test_means = {}   # returned: one mean per test, per model
    pooled_scores = {}    # every individual trial score, fed to the summary
    for model_name in model_names:
        per_test_means[model_name] = []
        pooled_scores[model_name] = []

    for case in UPDATE_TESTS:
        print(f"\n  [{case['note']}]")
        for name, model in models.items():
            trial_scores = []
            for _ in range(trials):
                # Fresh state each trial so earlier trials cannot leak in.
                model.reset_world()
                model.teach(case["old"])
                for filler in case["fillers"]:
                    run(model, filler, max_tokens=30)
                model.teach(case["update"])
                answer, _unused = run(model, case["question"], max_tokens=80)
                score = kw(answer, case["required"], penalty=case.get("penalty",[]))
                trial_scores.append(score)
                pooled_scores[name].append(score)

            mean, lo, hi = StatEngine.bootstrap_ci(trial_scores)
            per_test_means[name].append(mean)
            # Pass/fail glyphs for the first five trials only (threshold 0.5).
            sym_list = []
            for value in trial_scores[:5]:
                sym_list.append('✓' if value>=0.5 else '✗')
            print(f"    [{name:16s}] {bar(mean)} {mean:.3f} 95%CI[{lo:.3f},{hi:.3f}]  "
                  f"first5={sym_list}")

    smry("Suite 10", pooled_scores, model_names)
    return per_test_means


# =============================================================================
#  GRAPHS (updated for 4 models + CI)
# =============================================================================

def _ci_error_bars(mean, lo, hi):
    """Convert a (mean, lo, hi) interval into non-negative error-bar lengths.

    matplotlib rejects negative ``yerr`` entries, and a bound can land a hair
    on the wrong side of the mean (rounding, or a mean computed separately
    from the interval), so both lengths are clamped at zero.
    """
    return max(mean - lo, 0.0), max(hi - mean, 0.0)


def _mean_with_ci(scores):
    """Return (mean, ci_lo, ci_hi) for *scores*, or (0, 0, 0) when empty."""
    if not scores:
        return (0, 0, 0)
    return (statistics.mean(scores), *StatEngine.bootstrap_ci(scores)[1:])


def generate_graphs(s7, s8, s9, s10, models, output_dir="./benchmark_plots"):
    """Render the Suite 9 and overview figures as PNG files.

    Args:
        s7, s8, s10: suite results as ``{model_name: [scores]}``, or a falsy
            value when the suite was not run.
        s9: Suite 9 result ``{"exact": ..., "degraded": ...}`` where each value
            is ``{model_name: {level: [scores]}}`` for levels 1-5, or falsy.
        models: mapping whose keys are the model names to plot (in order).
        output_dir: directory for the PNG files; created if missing.

    Returns:
        List of saved file paths (strings).  Empty when matplotlib is not
        installed, when there are no models, or when no suite data was given.
    """
    try:
        import matplotlib
        # Select the non-interactive backend BEFORE pyplot is imported so the
        # choice is in effect from pyplot's first use (headless runs).
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
        import numpy as np
        from pathlib import Path
    except ImportError:
        print("[graphs] pip install matplotlib"); return []

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    saved = []
    model_names = list(models.keys())
    if not model_names:
        # Nothing to plot; also avoids dividing by zero for the bar width.
        return saved

    COLORS  = {
        "HybridLLM":       "#3B8BD4",
        "RAGBaseline":     "#1D9E75",
        "FieldOnly":       "#9B59B6",   # purple — ablation highlight
        "ContextBaseline": "#888780",
    }
    MARKERS = {
        "HybridLLM":       "o",
        "RAGBaseline":     "s",
        "FieldOnly":       "D",
        "ContextBaseline": "^",
    }

    plt.rcParams.update({"font.size":11,"axes.spines.top":False,
                          "axes.spines.right":False,"axes.grid":True,
                          "grid.alpha":0.3,"figure.dpi":150})

    # Grouped-bar geometry shared by both figures: the group occupies 0.65 of
    # each x slot and bars are centred around the tick.
    bw = 0.65 / len(model_names)

    def _offset(i):
        return (i - len(model_names)/2 + 0.5) * bw

    # ── Fig 5: Suite 9 — degradation by level with CI ─────────────────────────
    if s9:
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        levels = [1,2,3,4,5]
        desc   = ["L1\nSynonym","L2\nRewrite","L3\nSemantic\ndrift",
                  "L4\nDomain\nshift","L5\nAdversarial"]
        x = np.arange(len(levels))

        # Left: absolute degraded score with CI
        ax = axes[0]
        for i, name in enumerate(model_names):
            vals, err_lo, err_hi = [], [], []
            for lv in levels:
                deg = s9["degraded"][name][lv]
                if deg:
                    m, lo, hi = StatEngine.bootstrap_ci(deg)
                else:
                    m, lo, hi = 0, 0, 0
                e_lo, e_hi = _ci_error_bars(m, lo, hi)
                vals.append(m); err_lo.append(e_lo); err_hi.append(e_hi)
            bars = ax.bar(x+_offset(i), vals, bw, color=COLORS.get(name,"#555"),
                          yerr=[err_lo,err_hi], capsize=3,
                          label=name, alpha=0.88, zorder=3,
                          error_kw={"elinewidth":1.2,"ecolor":"#333"})
            for bar_, v in zip(bars, vals):
                if v > 0.05:
                    ax.text(bar_.get_x()+bar_.get_width()/2, bar_.get_height()+0.04,
                            f"{v:.2f}", ha="center", va="bottom", fontsize=7)
        ax.set_xticks(x); ax.set_xticklabels(desc, fontsize=9)
        ax.set_ylabel("Score on degraded query"); ax.set_ylim(0,1.35)
        ax.set_title("Degraded cue score (mean ± 95% CI)\nhigher is better", fontsize=11, fontweight="bold")
        ax.legend(fontsize=8, framealpha=0.9)

        # Right: delta chart
        ax = axes[1]
        for i, name in enumerate(model_names):
            deltas = []
            for lv in levels:
                ex  = s9["exact"][name][lv]
                deg = s9["degraded"][name][lv]
                if ex and deg:
                    ex_m, _, _ = StatEngine.bootstrap_ci(ex)
                    deg_m, _, _ = StatEngine.bootstrap_ci(deg)
                    deltas.append(round(ex_m-deg_m,3))
                else:
                    deltas.append(0.0)
            bars = ax.bar(x+_offset(i), deltas, bw, color=COLORS.get(name,"#555"),
                          label=name, alpha=0.88, zorder=3)
            for bar_, v in zip(bars, deltas):
                # Put the label outside the bar on the side it extends to, so
                # negative deltas are not printed on top of their own bar.
                if v < 0:
                    y_txt, v_align = bar_.get_height()-0.01, "top"
                else:
                    y_txt, v_align = bar_.get_height()+0.01, "bottom"
                ax.text(bar_.get_x()+bar_.get_width()/2, y_txt,
                        f"{v:+.2f}", ha="center", va=v_align, fontsize=7)
        ax.set_xticks(x); ax.set_xticklabels(desc, fontsize=9)
        ax.set_ylabel("Score drop  exact − degraded"); ax.set_ylim(-0.1, 1.0)
        ax.axhline(0, color="#aaa", linewidth=0.8, linestyle=":")
        ax.set_title("Degradation delta (lower is better)", fontsize=11, fontweight="bold")
        ax.legend(fontsize=8, framealpha=0.9)

        # Annotate FieldOnly vs RAGBaseline significance on the LEFT panel
        # (the test is computed on the absolute degraded scores shown there).
        if "FieldOnly" in model_names and "RAGBaseline" in model_names:
            fo_all = [s for lv in range(1,6) for s in s9["degraded"]["FieldOnly"][lv]]
            rag_all = [s for lv in range(1,6) for s in s9["degraded"]["RAGBaseline"][lv]]
            if fo_all and rag_all:
                _, p, r = StatEngine.mann_whitney(fo_all, rag_all)
                sig = StatEngine.sig_label(p)
                eff = StatEngine.effect_label(r)
                axes[0].text(0.98, 0.02,
                    f"FieldOnly vs RAG: {sig} (r={abs(r or 0):.2f}, {eff})",
                    transform=axes[0].transAxes, ha="right", va="bottom",
                    fontsize=8, color=COLORS.get("FieldOnly","#9B59B6"),
                    bbox=dict(boxstyle="round,pad=0.3", fc="white", alpha=0.8))

        fig.suptitle(
            "Figure 5  —  Suite 9: Paraphrase Robustness (24 tests, 5 levels)\n"
            "FieldOnly (purple) isolates the field's contribution independently of the episodic store",
            fontsize=12, fontweight="bold", y=1.02)
        fig.tight_layout()
        path = f"{output_dir}/fig5_suite9_paraphrase.png"
        fig.savefig(path, bbox_inches="tight"); plt.close(fig)
        saved.append(path); print(f"  [Graph] {path}")

    # ── Fig 6: All four suites side-by-side ───────────────────────────────────
    suite_data = {}
    if s7:
        suite_data["Associative\nInference"] = {n: _mean_with_ci(v) for n, v in s7.items()}
    if s8:
        suite_data["Pattern\nExtraction"] = {n: _mean_with_ci(v) for n, v in s8.items()}
    if s9:
        s9_summary = {}
        for n in model_names:
            # Pool the degraded scores of all five levels for this model.
            deg = [s for lv in range(1,6) for s in s9["degraded"][n][lv]]
            s9_summary[n] = tuple(StatEngine.bootstrap_ci(deg)) if deg else (0, 0, 0)
        suite_data["Degraded Cue\n(degraded abs)"] = s9_summary
    if s10:
        suite_data["Knowledge\nUpdate"] = {n: _mean_with_ci(v) for n, v in s10.items()}

    if suite_data:
        fig, ax = plt.subplots(figsize=(13,5.5))
        labels = list(suite_data.keys())
        x = np.arange(len(labels))
        for i, name in enumerate(model_names):
            means, err_lo, err_hi = [], [], []
            for lbl in labels:
                m, lo, hi = suite_data[lbl].get(name,(0,0,0))
                e_lo, e_hi = _ci_error_bars(m, lo, hi)
                means.append(m); err_lo.append(e_lo); err_hi.append(e_hi)
            bars = ax.bar(x+_offset(i), means, bw, color=COLORS.get(name,"#555"),
                          yerr=[err_lo,err_hi], capsize=3,
                          label=name, alpha=0.88, zorder=3,
                          error_kw={"elinewidth":1.2,"ecolor":"#333"})
            for bar_, m in zip(bars, means):
                if m > 0.05:
                    ax.text(bar_.get_x()+bar_.get_width()/2, bar_.get_height()+0.04,
                            f"{m:.2f}", ha="center", va="bottom", fontsize=8)
        ax.set_xticks(x); ax.set_xticklabels(labels, fontsize=10)
        ax.set_ylabel("Mean score  (0 – 1)"); ax.set_ylim(0,1.4)
        ax.set_title(
            "Figure 6  —  Field Test Report: Suites 7–10 (mean ± 95% bootstrap CI)\n"
            "FieldOnly (purple) is the new ablation revealing field vs store contribution",
            fontsize=12, fontweight="bold")
        ax.legend(framealpha=0.9, fontsize=10)
        fig.tight_layout()
        path = f"{output_dir}/fig6_field_overview.png"
        fig.savefig(path, bbox_inches="tight"); plt.close(fig)
        saved.append(path); print(f"  [Graph] {path}")

    return saved


# =============================================================================
#  MAIN
# =============================================================================

def run_field_suites(models, trials=TRIALS, suites=None, graphs_dir="./benchmark_plots"):
    """Run the selected field suites, print the summary table, and plot.

    Args:
        models: mapping of model name -> model object, passed to each suite.
        trials: trials per test, forwarded to every suite runner.
        suites: iterable of suite ids ("7", "8", "9", "10"); a falsy value
            (None or empty) selects all four.
        graphs_dir: output directory handed to ``generate_graphs``.

    Returns:
        dict with keys "s7", "s8", "s9", "s10" holding each suite's result,
        or None for suites that were not run.
    """
    selected = suites or ["7","8","9","10"]

    # Suites always execute in numeric order, regardless of the order given.
    s7  = run_suite7(models,  trials) if "7"  in selected else None
    s8  = run_suite8(models,  trials) if "8"  in selected else None
    s9  = run_suite9(models,  trials) if "9"  in selected else None
    s10 = run_suite10(models, trials) if "10" in selected else None

    model_names = list(models)
    col = 18
    divider = "  " + "─"*(30 + col*len(model_names))

    print("\n" + "═"*70)
    print("  FIELD TEST REPORT — Suites 7–10  (v2: FieldOnly ablation + proper stats)")
    print("═"*70)
    header_cells = "".join(f"{n:>{col}}" for n in model_names)
    print(f"  {'Suite':<30}" + header_cells)
    print(divider)

    def _cell(scores):
        # One right-aligned table cell: bootstrap mean, or N/A without data.
        if not scores:
            return f"{'N/A':>{col}}"
        mean, _lo, _hi = StatEngine.bootstrap_ci(scores)
        return f"{mean:>{col}.4f}"

    def _row(label, data):
        # Suites that were not run (None) produce no row at all.
        if data is None:
            return
        per_model = data if isinstance(data, dict) else {}
        cells = [_cell(per_model.get(n, [])) for n in model_names]
        print(f"  {label:<30}" + "".join(cells))

    _row("Associative inference", s7)
    _row("Pattern extraction",    s8)
    if s9:
        # Suite 9 is reported as the degraded scores pooled over all levels.
        pooled_degraded = {}
        for n in model_names:
            pooled_degraded[n] = [s for lv in range(1,6) for s in s9["degraded"][n][lv]]
        _row("Degraded cue (paraphrase)", pooled_degraded)
    _row("Knowledge update", s10)

    print(divider)

    saved = generate_graphs(s7, s8, s9, s10, models, graphs_dir)
    if saved:
        print(f"\n  {len(saved)} graphs saved: {', '.join(saved)}")

    return {"s7":s7,"s8":s8,"s9":s9,"s10":s10}


if __name__ == "__main__":
    # Command-line entry point: parse options, build the models that share
    # HybridLLM's tokenizer and backbone, then run the selected suites.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--trials",
        type=int,
        default=20,
        help="Trials per test (default 20; use 30 for paper submission)",
    )
    parser.add_argument("--graphs", type=str, default="./benchmark_plots")
    parser.add_argument("--no-context", action="store_true")
    parser.add_argument("--no-field-only", action="store_true")
    parser.add_argument("--suite", nargs="*", choices=["7","8","9","10"])
    args = parser.parse_args()

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    print(f"\n{'═'*64}")
    print(f"  Field Benchmark Suites 7–10  v2  |  device={DEVICE}  trials={args.trials}")
    if args.trials < 10:
        print(f"  [!] {args.trials} trials is very low — CI will be wide and unreliable")
    print(f"{'═'*64}")

    print("\n  Loading HybridLLM...")
    hybrid = HybridLLM()

    # HybridLLM and RAGBaseline are always benchmarked; the other two models
    # are opt-out via --no-field-only / --no-context.
    models = {"HybridLLM": hybrid}
    models["RAGBaseline"] = RAGBaseline(hybrid.tokenizer, hybrid.model)

    if not args.no_field_only:
        print("  Initialising FieldOnly (ablation)...")
        models["FieldOnly"] = FieldOnly(hybrid.tokenizer, hybrid.model)
        print("  FieldOnly ready.")

    if not args.no_context:
        models["ContextBaseline"] = ContextBaseline(hybrid.tokenizer, hybrid.model)

    run_field_suites(
        models,
        trials=args.trials,
        suites=args.suite,
        graphs_dir=args.graphs,
    )