""" benchmark_field_tests.py v2 ============================ Four suites targeting what a Hebbian attractor field uniquely enables. NOW INCLUDES FieldOnly ablation and proper statistical testing. Suite 7 — Associative inference (multi-hop reasoning across separate facts) Suite 8 — Pattern extraction (implicit rule from N examples) Suite 9 — Degraded cue (paraphrase robustness — THE KEY RESULT) Suite 10 — Knowledge update (contradiction/supersession handling) v2 CHANGES: NEW MODEL: FieldOnly — Hebbian field + soft prefix, no episodic store. This is the critical ablation for Suite 9. If FieldOnly beats RAGBaseline on Suite 9, the field's distributed attractor priming is a real effect. If not, the episodic store is sufficient and the field is noise. STATISTICS: 20 trials default (was 3), bootstrap 95% CI, Mann-Whitney U test, rank-biserial correlation r as effect size. 3 trials with binary outcomes (std=0.462) is statistically meaningless. Reviewers will reject on this. INTERPRETATIONS printed inline: every suite summary explains what each comparison result means for the paper claim. Run: python benchmark_field_tests.py # all suites, 20 trials python benchmark_field_tests.py --trials 30 # paper-quality stats python benchmark_field_tests.py --no-context # skip ContextBaseline python benchmark_field_tests.py --suite 9 # Suite 9 only python benchmark_field_tests.py --suite 9 --trials 30 --no-context """ import argparse, os, sys, time, statistics, traceback, json from dataclasses import dataclass from pathlib import Path import torch import numpy as np os.environ["TRANSFORMERS_DISABLE_FLASH_ATTN"] = "1" try: from scipy import stats as scipy_stats HAS_SCIPY = True except ImportError: HAS_SCIPY = False print("[warn] scipy not found. Mann-Whitney disabled. pip install scipy") try: from app import HybridLLM, RAGBaseline, ContextBaseline, FieldOnly except ImportError as e: print(f"[error] {e}"); sys.exit(1) if not hasattr(HybridLLM, "reset_world"): def _rw(self): self.world.S.zero_(); self.world.strength.zero_() self.world.step_count = 0; self.world.memories = [""] * self.world.n if hasattr(self.world,"thresholds"): self.world.thresholds.fill_(0.5) if hasattr(self.world,"protected"): self.world.protected.fill_(False) if hasattr(self,"episodes"): self.episodes.clear() self.call_count = 0 HybridLLM.reset_world = _rw TRIALS = 20 # default; override with --trials # ============================================================================= # STATISTICAL ENGINE (same as benchmark.py — copy kept for standalone use) # ============================================================================= class StatEngine: """ Bootstrap CI + Mann-Whitney U for benchmark score lists. See benchmark.py StatEngine docstring for full rationale. """ N_BOOT = 2000 CI = 0.95 @classmethod def bootstrap_ci(cls, scores): if len(scores) < 2: v = scores[0] if scores else 0.0 return v, v, v arr = np.array(scores, dtype=float) rng = np.random.default_rng(seed=42) boot_means = [ np.mean(rng.choice(arr, len(arr), replace=True)) for _ in range(cls.N_BOOT) ] lo = np.percentile(boot_means, (1-cls.CI)/2*100) hi = np.percentile(boot_means, (1+cls.CI)/2*100) return float(np.mean(arr)), float(lo), float(hi) @classmethod def mann_whitney(cls, a, b): if not HAS_SCIPY or len(a) < 3 or len(b) < 3: return None, None, None try: u, p = scipy_stats.mannwhitneyu(a, b, alternative="two-sided") r = 1.0 - (2.0 * u) / (len(a) * len(b)) return float(u), float(p), float(r) except Exception: return None, None, None @staticmethod def sig_label(p): if p is None: return "n/a" if p < 0.001: return "***" if p < 0.01: return "**" if p < 0.05: return "*" return "ns" @staticmethod def effect_label(r): if r is None: return "" a = abs(r) if a >= 0.5: return "large" if a >= 0.3: return "medium" if a >= 0.1: return "small" return "trivial" @classmethod def compare_line(cls, name_a, name_b, scores_a, scores_b, suffix=""): mean_a, lo_a, hi_a = cls.bootstrap_ci(scores_a) mean_b, lo_b, hi_b = cls.bootstrap_ci(scores_b) u, p, r = cls.mann_whitney(scores_a, scores_b) d = mean_a - mean_b sig = cls.sig_label(p) eff = cls.effect_label(r) r_val = abs(r or 0) direction = "↑" if d > 0 else "↓" return (f" {name_a:16s} vs {name_b:16s} " f"Δ={d:+.4f}{direction} {sig:4s} r={r_val:.2f}({eff}){suffix}") # ============================================================================= # HELPERS # ============================================================================= def kw(response, required, any_of=None, penalty=None): r = response.lower() hits = sum(1 for k in required if k.lower() in r) score = hits / len(required) if required else 0.0 if any_of and any(k.lower() in r for k in any_of): score = min(1.0, score + 0.2) if penalty and any(bad.lower() in r for bad in penalty): score = max(0.0, score - 0.4) return round(score, 4) def extract(text): m = "assistant\n"; i = text.rfind(m) return text[i+len(m):].strip() if i != -1 else text.strip() def run(model, text, max_tokens=120): t0 = time.perf_counter() try: raw = model.generate(text, max_new_tokens=max_tokens) except Exception as e: return f"[ERR:{e}]", time.perf_counter()-t0 return extract(raw), round(time.perf_counter()-t0, 3) def bar(s, w=14): f = int(s*w); return "█"*f + "░"*(w-f) def smry(label, suite, model_names, ablation_key=None): """ Prints suite summary with bootstrap CI. ablation_key: if set, prints special comparison between FieldOnly and RAGBaseline. """ print(f"\n {label} summary:") all_scores = {} for n in model_names: sc = suite.get(n, []) if sc: mean, lo, hi = StatEngine.bootstrap_ci(sc) std = statistics.stdev(sc) if len(sc) > 1 else 0.0 all_scores[n] = sc print(f" [{n:16s}] mean={mean:.4f} ± {std:.4f} " f"95%CI[{lo:.4f},{hi:.4f}]") # Statistical comparisons if all_scores and HAS_SCIPY: print(f"\n {label} significance tests:") if "HybridLLM" in all_scores: for name in model_names: if name == "HybridLLM": continue if name not in all_scores: continue print(StatEngine.compare_line("HybridLLM", name, all_scores["HybridLLM"], all_scores[name])) if "FieldOnly" in all_scores and "RAGBaseline" in all_scores: note = " ← ABLATION KEY: field vs store" print(StatEngine.compare_line("FieldOnly", "RAGBaseline", all_scores["FieldOnly"], all_scores["RAGBaseline"], suffix=note)) # ============================================================================= # SUITE 7 — ASSOCIATIVE INFERENCE # ============================================================================= ASSOC_TESTS = [ { "facts": [ "Dr. Elena Vasquez leads project HELIOS.", "Project HELIOS is classified at clearance level CRIMSON.", ], "fillers": ["What is REST?", "Explain DNS.", "What is a semaphore?"], "question":"What clearance level does Dr. Elena Vasquez work at?", "required":["crimson"], "any_of":["elena","helios"], "note": "Person→project→clearance (2-hop)", }, { "facts": [ "The ORION-DB cluster has a replication lag budget of 50 milliseconds.", "The SLA for read operations requires responses under 20ms.", ], "fillers": ["What is a bloom filter?","Explain eventual consistency."], "question":"Given the replication lag and the read SLA, is there a potential SLA violation? Explain.", "required":["50","20"], "any_of":["violation","exceed","lag"], "note": "DB lag + SLA → violation (2-hop numeric)", }, { "facts": [ "Engineer Riku Tanaka owns the payment gateway service.", "The payment gateway had a P1 incident on 2025-03-14.", "P1 incidents require a post-mortem within 48 hours.", ], "fillers": ["What is CAP theorem?","What is a saga pattern?","Explain WAL.","What is DHT?"], "question":"Who must write the post-mortem for the March 14 incident, and what is the time constraint?", "required":["riku","48"], "any_of":["tanaka","payment","post-mortem"], "note": "Person→service→incident→SLA (3-hop)", }, { "facts": [ "The cache layer sits between the API gateway and the database.", "Cache TTL is set to 300 seconds.", "User profile data changes on average every 120 seconds.", ], "fillers": ["What is backpressure?","Describe circuit breakers."], "question":"Given the cache TTL and how often user profiles change, will users sometimes see stale data? Why?", "required":["300","120"], "any_of":["stale","outdated","ttl","cache"], "note": "Cache TTL vs update rate → staleness (2-hop)", }, { "facts": [ "Service auth-api is a dependency of service orders-api.", "Service orders-api is a dependency of service checkout-frontend.", ], "fillers": ["Explain Kubernetes pod scheduling.","What is a service mesh?"], "question":"If auth-api goes down, which services are affected? List them.", "required":["orders-api","checkout-frontend"], "any_of":["auth","downstream","cascade"], "note": "Dependency chain → blast radius (2-hop transitive)", }, { "facts": [ "The EU data residency policy requires all PII to be stored in eu-west-1.", "Customer email addresses are classified as PII.", "The current email storage bucket is in us-east-1.", ], "fillers": ["What is GDPR?","Explain data sovereignty.","What is a VPC?"], "question":"Is the current email storage setup compliant with the EU data residency policy? Why or why not?", "required":["eu-west-1","us-east-1"], "any_of":["non-compliant","violation","pii","compliant"], "note": "Policy + classification + location → compliance gap (3-hop GDPR)", }, ] def run_suite7(models, trials=TRIALS): print("\n" + "═"*64) print(" SUITE 7 — ASSOCIATIVE INFERENCE") print(" Multi-hop reasoning across separately taught facts.") print(" RAG injects all retrieved facts; field binds co-activated regions.") print(" FieldOnly: tests whether field alone can bridge the hops.") print("═"*64) model_names = list(models.keys()) suite = {n:[] for n in model_names} all_scores = {n:[] for n in model_names} for test in ASSOC_TESTS: print(f"\n [{test['note']}]") for name, model in models.items(): scores = [] for _ in range(trials): model.reset_world() for f in test["facts"]: model.teach(f) for fl in test["fillers"]: run(model, fl, max_tokens=30) ans, _ = run(model, test["question"], max_tokens=150) sc = kw(ans, test["required"], test.get("any_of",[])) scores.append(sc) all_scores[name].append(sc) mean, lo, hi = StatEngine.bootstrap_ci(scores) suite[name].append(mean) sym_list = ['✓' if x>=0.5 else '✗' for x in scores[:5]] # show first 5 print(f" [{name:16s}] {bar(mean)} {mean:.3f} 95%CI[{lo:.3f},{hi:.3f}] " f"first5={sym_list}") smry("Suite 7", all_scores, model_names) return suite # ============================================================================= # SUITE 8 — PATTERN EXTRACTION # ============================================================================= PATTERN_TESTS = [ { "examples": [ "Service auth-service: tier=critical, on-call=24/7.", "Service payment-service: tier=critical, on-call=24/7.", "Service logging-service: tier=standard, on-call=business-hours.", "Service analytics-service: tier=standard, on-call=business-hours.", "Service cache-service: tier=critical, on-call=24/7.", "Service monitoring-service: tier=standard, on-call=business-hours.", ], "question":"A new service notification-service has tier=critical. What on-call rotation should it have?", "required":["24/7"], "any_of":["critical","always"], "note": "tier→rotation (6 examples)", }, { "examples": [ "Error E-401: severity=warn, auto-retry=yes, max-retries=3.", "Error E-403: severity=warn, auto-retry=yes, max-retries=3.", "Error E-500: severity=critical, auto-retry=no, escalate=true.", "Error E-503: severity=critical, auto-retry=no, escalate=true.", "Error E-404: severity=warn, auto-retry=yes, max-retries=3.", ], "question":"A new error E-502 has severity=critical. Should it auto-retry? Answer yes or no and explain.", "required":["no"], "any_of":["escalate","critical","should not"], "note": "severity→retry (5 examples)", }, { "examples": [ "Region eu-west-1: residency=EU, encryption=AES-256, audit-log=enabled.", "Region eu-central-1: residency=EU, encryption=AES-256, audit-log=enabled.", "Region us-east-1: residency=US, encryption=AES-128, audit-log=disabled.", "Region us-west-2: residency=US, encryption=AES-128, audit-log=disabled.", "Region ap-southeast-1: residency=APAC, encryption=AES-128, audit-log=disabled.", ], "question":"A new region eu-north-1 is added with residency=EU. What encryption and audit-log settings should it have?", "required":["aes-256","enabled"], "any_of":["eu","256","audit"], "note": "region residency→config (5 examples, GDPR-flavour)", }, { "examples": [ "Incident INC-001: priority=P1, SLA=1 hour, escalate-to=CTO.", "Incident INC-002: priority=P2, SLA=4 hours, escalate-to=VP-Engineering.", "Incident INC-003: priority=P1, SLA=1 hour, escalate-to=CTO.", "Incident INC-004: priority=P3, SLA=24 hours, escalate-to=Team-Lead.", "Incident INC-005: priority=P2, SLA=4 hours, escalate-to=VP-Engineering.", "Incident INC-006: priority=P3, SLA=24 hours, escalate-to=Team-Lead.", ], "question":"A new incident INC-007 comes in with priority=P1. What is the SLA and who should it escalate to?", "required":["1 hour","cto"], "any_of":["p1","escalate","sla"], "note": "priority→SLA+escalation (6 examples, incident management)", }, { "examples": [ "Country DE: GDPR=yes, right-to-erasure=30-days, DPA=BfDI.", "Country FR: GDPR=yes, right-to-erasure=30-days, DPA=CNIL.", "Country US: GDPR=no, right-to-erasure=varies, DPA=FTC.", "Country AT: GDPR=yes, right-to-erasure=30-days, DPA=DSB.", "Country JP: GDPR=no, right-to-erasure=varies, DPA=PPC.", ], "question":"Country NL is added with GDPR=yes. What right-to-erasure deadline and which regulatory body type applies?", "required":["30"], "any_of":["30-days","dpa","gdpr","erasure"], "note": "GDPR membership→deadline+regulator (5 examples, bofrost-relevant)", }, ] def run_suite8(models, trials=TRIALS): print("\n" + "═"*64) print(" SUITE 8 — PATTERN EXTRACTION") print(" N examples of an implicit rule; model must generalise to new case.") print(" FieldOnly: tests whether repeated Hebbian co-activation builds") print(" a stable enough attractor to prime the correct rule.") print("═"*64) model_names = list(models.keys()) suite = {n:[] for n in model_names} all_scores = {n:[] for n in model_names} for test in PATTERN_TESTS: print(f"\n [{test['note']}] ({len(test['examples'])} examples)") for name, model in models.items(): scores = [] for _ in range(trials): model.reset_world() for ex in test["examples"]: model.teach(ex) ans, _ = run(model, test["question"], max_tokens=100) sc = kw(ans, test["required"], test.get("any_of",[])) scores.append(sc) all_scores[name].append(sc) mean, lo, hi = StatEngine.bootstrap_ci(scores) suite[name].append(mean) sym_list = ['✓' if x>=0.5 else '✗' for x in scores[:5]] print(f" [{name:16s}] {bar(mean)} {mean:.3f} 95%CI[{lo:.3f},{hi:.3f}] " f"first5={sym_list}") smry("Suite 8", all_scores, model_names) return suite # ============================================================================= # SUITE 9 — DEGRADED CUE COMPLETION (24 tests, 5 levels) # # THE PRIMARY RESULT FOR THE PAPER. # # WHY THIS SUITE IS THE KEY TEST: # Standard RAG retrieves by cosine similarity of exact token overlaps. # When the query is paraphrased (different words, same meaning), cosine # similarity drops and retrieval fails — even though the LLM "knows" the # fact. Hybrid's Hebbian field, if it works, should prime the LLM toward # the correct attractor region regardless of surface form. # # THE ABLATION QUESTION (FieldOnly vs RAGBaseline): # FieldOnly has no episodic store. It can ONLY succeed via the soft prefix # generated from the Hebbian attractor state. If FieldOnly > RAGBaseline # on degraded queries, this is direct evidence that attractor priming # provides paraphrase robustness INDEPENDENT of retrieval. # # Level 1: synonym swap — expected: all models handle fine # Level 2: structural rewrite — some divergence expected # Level 3: semantic drift — RAG may start failing # Level 4: domain shift — RAG likely fails; field may help # Level 5: adversarial — hard for all # ============================================================================= DEGRADED_TESTS = [ # ── Level 1: Synonym swap ───────────────────────────────────────────────── {"level":1,"note":"L1 – synonym: master→primary encryption key", "teach":"The master encryption key ID is ENC-KEY-2025-ALPHA-7742.", "exact":"What is the master encryption key ID?", "degraded":"What is the primary encryption key identifier?", "required":["enc-key-2025-alpha-7742"]}, {"level":1,"note":"L1 – synonym: on-call contact→duty engineer", "teach":"The on-call contact for the payments cluster is Sofia Berglund.", "exact":"Who is the on-call contact for the payments cluster?", "degraded":"Who is the duty engineer for the payments cluster?", "required":["sofia","berglund"]}, {"level":1,"note":"L1 – synonym: deployment window→release schedule", "teach":"The NEXUS-7 deployment window is every Tuesday 22:00–02:00 UTC.", "exact":"When is the NEXUS-7 deployment window?", "degraded":"What is the NEXUS-7 release schedule?", "required":["tuesday","22"]}, {"level":1,"note":"L1 – synonym: fallback→secondary server", "teach":"The fallback database server hostname is db-fallback-prod-03.internal.", "exact":"What is the fallback database server hostname?", "degraded":"What is the secondary database server hostname?", "required":["db-fallback-prod-03"]}, {"level":1,"note":"L1 – synonym: rate limit→request cap", "teach":"The API rate limit is 500 requests per minute per client.", "exact":"What is the API rate limit?", "degraded":"What is the API request cap per minute?", "required":["500"]}, # ── Level 2: Structural rewrite ─────────────────────────────────────────── {"level":2,"note":"L2 – rewrite: passive voice", "teach":"Dr. Amara Singh leads project NEXUS-7.", "exact":"Who leads project NEXUS-7?", "degraded":"Which project is led by Dr. Amara Singh?", "required":["nexus-7"],"any_of":["amara","singh"]}, {"level":2,"note":"L2 – rewrite: negation framing", "teach":"Server beta handles the load balancer at IP 10.0.0.43.", "exact":"What IP does server beta use?", "degraded":"Which server does NOT have the inference backend, and what is its IP?", "required":["10.0.0.43"],"any_of":["beta","load"]}, {"level":2,"note":"L2 – rewrite: question type change (yes/no→factual)", "teach":"Tokens expire after exactly 3600 seconds.", "exact":"How long until tokens expire?", "degraded":"Do tokens expire after one hour?", "required":["3600"],"any_of":["yes","hour","one hour"]}, {"level":2,"note":"L2 – rewrite: time phrasing", "teach":"The nightly batch runs at 02:30 UTC.", "exact":"When does the nightly batch run?", "degraded":"At what time in the early morning does the batch job execute?", "required":["02:30"],"any_of":["2:30","utc","batch"]}, {"level":2,"note":"L2 – rewrite: plural/generalise", "teach":"PostgreSQL is the primary database for NEXUS-7 on port 5437.", "exact":"What database does NEXUS-7 use?", "degraded":"List the data store technologies used by the NEXUS-7 project.", "required":["postgresql"],"any_of":["5437","postgres"]}, # ── Level 3: Semantic drift ─────────────────────────────────────────────── {"level":3,"note":"L3 – drift: 'backup server' for 'fallback DB'", "teach":"The fallback database server hostname is db-fallback-prod-03.internal.", "exact":"What is the fallback database server hostname?", "degraded":"Our main DB is down. Where is the backup server?", "required":["db-fallback-prod-03"]}, {"level":3,"note":"L3 – drift: 'payment system goes down' for 'payments cluster on-call'", "teach":"The on-call contact for the payments cluster is Sofia Berglund.", "exact":"Who is the on-call contact for the payments cluster?", "degraded":"If the payment system goes down at 3am, who do we call first?", "required":["sofia","berglund"]}, {"level":3,"note":"L3 – drift: 'push code' for 'deployment window'", "teach":"The NEXUS-7 deployment window is every Tuesday 22:00–02:00 UTC.", "exact":"When is the NEXUS-7 deployment window?", "degraded":"What time can we push changes to the NEXUS project on a weekly basis?", "required":["tuesday","22"]}, {"level":3,"note":"L3 – drift: 'access credential' for 'encryption key ID'", "teach":"The master encryption key ID is ENC-KEY-2025-ALPHA-7742.", "exact":"What is the master encryption key ID?", "degraded":"What is the identifier of our main access credential for encryption?", "required":["enc-key-2025-alpha-7742"]}, {"level":3,"note":"L3 – drift: 'throttle setting' for 'rate limit'", "teach":"The API rate limit is 500 requests per minute per client.", "exact":"What is the API rate limit?", "degraded":"How aggressively are clients throttled on our API?", "required":["500"],"any_of":["minute","per client","request"]}, # ── Level 4: Domain shift ───────────────────────────────────────────────── {"level":4,"note":"L4 – domain: infra jargon→plain English", "teach":"The fallback database server hostname is db-fallback-prod-03.internal.", "exact":"What is the fallback database server hostname?", "degraded":"We cannot reach our data storage. What is the address of the reserve system?", "required":["db-fallback-prod-03"]}, {"level":4,"note":"L4 – domain: devops→management language", "teach":"The NEXUS-7 deployment window is every Tuesday 22:00–02:00 UTC.", "exact":"When is the NEXUS-7 deployment window?", "degraded":"During which approved maintenance period may the NEXUS-7 product be updated?", "required":["tuesday"],"any_of":["22","utc","maintenance"]}, {"level":4,"note":"L4 – domain: technical→business", "teach":"The API rate limit is 500 requests per minute per client.", "exact":"What is the API rate limit?", "degraded":"How many times per minute can a single customer call our service before being blocked?", "required":["500"],"any_of":["minute","customer","client"]}, {"level":4,"note":"L4 – domain: security term→plain", "teach":"The master encryption key ID is ENC-KEY-2025-ALPHA-7742.", "exact":"What is the master encryption key ID?", "degraded":"What label is used to identify the secret that protects our data at rest?", "required":["enc-key-2025-alpha-7742"]}, {"level":4,"note":"L4 – domain: incident→plain", "teach":"The on-call contact for the payments cluster is Sofia Berglund.", "exact":"Who is the on-call contact for the payments cluster?", "degraded":"Money transactions are failing. Which person at the company should I wake up?", "required":["sofia","berglund"]}, # ── Level 5: Adversarial framing ───────────────────────────────────────── {"level":5,"note":"L5 – adversarial: asking what it is NOT", "teach":"The fallback database server hostname is db-fallback-prod-03.internal.", "exact":"What is the fallback database server hostname?", "degraded":"Our main database is db-primary-prod-01. What is the OTHER production database hostname?", "required":["db-fallback-prod-03"]}, {"level":5,"note":"L5 – adversarial: confirm-or-deny phrasing", "teach":"The master encryption key ID is ENC-KEY-2025-ALPHA-7742.", "exact":"What is the master encryption key ID?", "degraded":"Someone told me the encryption key ID is ENC-KEY-2024-BETA-0001. Is that correct? What is the right one?", "required":["enc-key-2025-alpha-7742"],"any_of":["no","incorrect","wrong","correct"]}, {"level":5,"note":"L5 – adversarial: obfuscated entity", "teach":"The on-call contact for the payments cluster is Sofia Berglund.", "exact":"Who is the on-call contact for the payments cluster?", "degraded":"If the revenue-critical transaction processing infrastructure becomes unavailable, who is accountable for the immediate response?", "required":["sofia","berglund"]}, {"level":5,"note":"L5 – adversarial: time pressure + noise", "teach":"The NEXUS-7 deployment window is every Tuesday 22:00–02:00 UTC.", "exact":"When is the NEXUS-7 deployment window?", "degraded":"URGENT: product manager is asking when we are allowed to do the next release for that project — the one that starts with N. What is the exact time slot?", "required":["tuesday"],"any_of":["22","utc","02"]}, ] def run_suite9(models, trials=TRIALS): """ Runs each test twice: once with exact query, once with degraded. Reports per-level aggregated delta (exact - degraded). With v2 stats: bootstrap CI on all per-level scores, Mann-Whitney comparison. THE ABLATION RESULT TO WATCH: FieldOnly degraded score vs RAGBaseline degraded score. If FieldOnly > RAGBaseline on degraded queries: attractor priming works. """ print("\n" + "═"*64) print(" SUITE 9 — DEGRADED CUE COMPLETION (24 tests, 5 levels)") print(" Lower delta = better paraphrase robustness.") print() print(" ABLATION FOCUS: Compare FieldOnly vs RAGBaseline on degraded scores.") print(" FieldOnly can ONLY use soft-prefix; RAGBaseline can ONLY use episodic store.") print(" Whichever is higher on degraded queries has the better mechanism for") print(" paraphrase robustness.") print("═"*64) model_names = list(models.keys()) # Per-level per-model: list of all trial scores across tests in that level exact_by_level = {n:{1:[],2:[],3:[],4:[],5:[]} for n in model_names} degraded_by_level = {n:{1:[],2:[],3:[],4:[],5:[]} for n in model_names} for test in DEGRADED_TESTS: lv = test["level"] print(f"\n [{test['note']}]") print(f" exact: '{test['exact'][:62]}'") print(f" degraded: '{test['degraded'][:62]}'") for name, model in models.items(): ex_scores, deg_scores = [], [] for _ in range(trials): # Exact query model.reset_world(); model.teach(test["teach"]) ans, _ = run(model, test["exact"], max_tokens=80) ex_scores.append(kw(ans, test["required"], test.get("any_of"))) # Degraded query model.reset_world(); model.teach(test["teach"]) ans, _ = run(model, test["degraded"], max_tokens=80) deg_scores.append(kw(ans, test["required"], test.get("any_of"))) ex_m, ex_lo, ex_hi = StatEngine.bootstrap_ci(ex_scores) deg_m, deg_lo, deg_hi = StatEngine.bootstrap_ci(deg_scores) delta = round(ex_m - deg_m, 3) exact_by_level[name][lv].extend(ex_scores) degraded_by_level[name][lv].extend(deg_scores) tag = "GOOD" if delta <= 0.1 else "degrades" print(f" [{name:16s}] exact={ex_m:.3f}[{ex_lo:.3f},{ex_hi:.3f}] " f"degraded={deg_m:.3f}[{deg_lo:.3f},{deg_hi:.3f}] " f"Δ={delta:+.3f} ← {tag}") # Per-level summary table print("\n" + "═"*64) print(" SUITE 9 — PER-LEVEL SUMMARY (mean delta, lower is better)") print("═"*64) col = 16 print(f" {'Level / description':<30}" + "".join(f"{n:>{col}}" for n in model_names)) print(" " + "─"*(30 + col*len(model_names))) level_desc = { 1:"L1 synonym swap ", 2:"L2 structural rw ", 3:"L3 semantic drift ", 4:"L4 domain shift ", 5:"L5 adversarial ", } for lv in range(1,6): row = f" {level_desc[lv]:<30}" for name in model_names: ex = exact_by_level[name][lv] deg = degraded_by_level[name][lv] if ex and deg: ex_m, _, _ = StatEngine.bootstrap_ci(ex) deg_m, _, _ = StatEngine.bootstrap_ci(deg) delta = round(ex_m - deg_m, 3) row += f"{delta:>+{col}.3f}" else: row += f"{'N/A':>{col}}" print(row) print(" " + "─"*(30 + col*len(model_names))) row = f" {'OVERALL mean delta':<30}" for name in model_names: all_ex = [s for lv in range(1,6) for s in exact_by_level[name][lv]] all_deg = [s for lv in range(1,6) for s in degraded_by_level[name][lv]] if all_ex and all_deg: ex_m, _, _ = StatEngine.bootstrap_ci(all_ex) deg_m, _, _ = StatEngine.bootstrap_ci(all_deg) delta = round(ex_m - deg_m, 3) row += f"{delta:>+{col}.3f}" else: row += f"{'N/A':>{col}}" print(row) # Absolute degraded score table print(f"\n {'Degraded score (abs, with CI)':<30}" + "".join(f"{n:>{col}}" for n in model_names)) print(" " + "─"*(30 + col*len(model_names))) for lv in range(1,6): row = f" {level_desc[lv]:<30}" for name in model_names: deg = degraded_by_level[name][lv] if deg: m, lo, hi = StatEngine.bootstrap_ci(deg) row += f"{m:>{col}.3f}" else: row += f"{'N/A':>{col}}" print(row) # THE CRITICAL ABLATION COMPARISON print("\n" + "═"*64) print(" SUITE 9 — ABLATION SIGNIFICANCE TESTS") print(" Comparing degraded-query performance (the hard case)") print("═"*64) all_degraded = { name: [s for lv in range(1,6) for s in degraded_by_level[name][lv]] for name in model_names } if HAS_SCIPY: if "HybridLLM" in all_degraded: for name in model_names: if name == "HybridLLM": continue print(StatEngine.compare_line( "HybridLLM", name, all_degraded["HybridLLM"], all_degraded.get(name,[]))) if "FieldOnly" in all_degraded and "RAGBaseline" in all_degraded: print() print(" KEY ABLATION: FieldOnly vs RAGBaseline on degraded queries") u, p, r = StatEngine.mann_whitney( all_degraded["FieldOnly"], all_degraded["RAGBaseline"]) fo_m, fo_lo, fo_hi = StatEngine.bootstrap_ci(all_degraded["FieldOnly"]) rag_m, rag_lo, rag_hi = StatEngine.bootstrap_ci(all_degraded["RAGBaseline"]) d = fo_m - rag_m sig = StatEngine.sig_label(p) eff = StatEngine.effect_label(r) print(f" FieldOnly degraded: {fo_m:.4f} 95%CI[{fo_lo:.4f},{fo_hi:.4f}]") print(f" RAGBaseline degraded: {rag_m:.4f} 95%CI[{rag_lo:.4f},{rag_hi:.4f}]") print(f" Δ = {d:+.4f} p={p:.4f} {sig} r={abs(r or 0):.3f} ({eff})") print() if d > 0 and (p or 1) < 0.05: print(" INTERPRETATION: Field adds genuine paraphrase robustness.") print(" The Hebbian attractor mechanism contributes INDEPENDENTLY") print(" of the episodic store. This is your publishable claim.") elif d <= 0 or (p or 1) >= 0.05: print(" INTERPRETATION: Episodic store sufficient for paraphrase robustness.") print(" The field does not add significant value on degraded queries.") print(" Reframe paper: the store is the mechanism, the field adds association.") else: print(" [scipy not installed — install with: pip install scipy]") return {"exact": exact_by_level, "degraded": degraded_by_level} # ============================================================================= # SUITE 10 — KNOWLEDGE UPDATE / CONTRADICTION # ============================================================================= UPDATE_TESTS = [ { "old": "Server alpha IP address is 10.0.0.42.", "update": "Server alpha has been migrated. New IP: 10.0.1.99.", "fillers":["What is DNS?","Explain load balancing.","What is BGP?"], "question":"What is the current IP address of server alpha?", "required":["10.0.1.99"], "penalty":["10.0.0.42"], "note": "IP migration", }, { "old": "The public API rate limit is 100 requests per minute.", "update": "Rate limit updated to 500 requests per minute effective immediately.", "fillers":["What is OAuth?","Explain JWT tokens."], "question":"What is the current rate limit for the public API?", "required":["500"], "penalty":["100"], "note": "Config value update", }, { "old": "Marcus Reyes is the project lead for NEXUS-7.", "update": "Dr. Amara Singh has replaced Marcus Reyes as project lead for NEXUS-7.", "fillers":["What is agile?","Describe sprint planning.","What is a retrospective?"], "question":"Who is the current project lead for NEXUS-7?", "required":["amara","singh"], "penalty":["marcus reyes"], "note": "Personnel replacement (both names in context)", }, { "old": "The primary datacentre is located in Frankfurt.", "update": "Primary datacentre has moved to Amsterdam following the EU expansion.", "fillers":["What is BGP?","Explain anycast routing.","What is a CDN?","Describe edge computing."], "question":"Where is the primary datacentre now located?", "required":["amsterdam"], "penalty":["frankfurt"], "note": "Location change", }, { "old": "Encryption algorithm in use is AES-128.", "update": "Security upgrade complete: encryption now uses AES-256 across all services.", "fillers":["What is TLS?","Explain certificate pinning.","What is HSTS?"], "question":"What encryption algorithm is currently in use?", "required":["aes-256"], "penalty":["aes-128"], "note": "Security setting upgrade", }, ] def run_suite10(models, trials=TRIALS): print("\n" + "═"*64) print(" SUITE 10 — KNOWLEDGE UPDATE / CONTRADICTION") print(" Old fact → update fact. Must answer with NEW value.") print(" Penalty applied if old (wrong) answer appears.") print() print(" NOTE from v3 results: ALL systems scored poorly here (0.50-0.67).") print(" ContextBaseline won (!). This is a known weakness of supersession.") print(" Watch whether FieldOnly behaves differently — no supersession logic.") print("═"*64) model_names = list(models.keys()) suite = {n:[] for n in model_names} all_scores = {n:[] for n in model_names} for test in UPDATE_TESTS: print(f"\n [{test['note']}]") for name, model in models.items(): scores = [] for _ in range(trials): model.reset_world() model.teach(test["old"]) for fl in test["fillers"]: run(model, fl, max_tokens=30) model.teach(test["update"]) ans, _ = run(model, test["question"], max_tokens=80) sc = kw(ans, test["required"], penalty=test.get("penalty",[])) scores.append(sc) all_scores[name].append(sc) mean, lo, hi = StatEngine.bootstrap_ci(scores) suite[name].append(mean) sym_list = ['✓' if x>=0.5 else '✗' for x in scores[:5]] print(f" [{name:16s}] {bar(mean)} {mean:.3f} 95%CI[{lo:.3f},{hi:.3f}] " f"first5={sym_list}") smry("Suite 10", all_scores, model_names) return suite # ============================================================================= # GRAPHS (updated for 4 models + CI) # ============================================================================= def generate_graphs(s7, s8, s9, s10, models, output_dir="./benchmark_plots"): try: import matplotlib, matplotlib.pyplot as plt import numpy as np from pathlib import Path matplotlib.use("Agg") except ImportError: print("[graphs] pip install matplotlib"); return [] Path(output_dir).mkdir(parents=True, exist_ok=True) saved = [] model_names = list(models.keys()) COLORS = { "HybridLLM": "#3B8BD4", "RAGBaseline": "#1D9E75", "FieldOnly": "#9B59B6", # purple — ablation highlight "ContextBaseline": "#888780", } MARKERS = { "HybridLLM": "o", "RAGBaseline": "s", "FieldOnly": "D", "ContextBaseline": "^", } plt.rcParams.update({"font.size":11,"axes.spines.top":False, "axes.spines.right":False,"axes.grid":True, "grid.alpha":0.3,"figure.dpi":150}) # ── Fig 5: Suite 9 — degradation by level with CI ───────────────────────── if s9: fig, axes = plt.subplots(1, 2, figsize=(14, 5)) levels = [1,2,3,4,5] desc = ["L1\nSynonym","L2\nRewrite","L3\nSemantic\ndrift", "L4\nDomain\nshift","L5\nAdversarial"] x = np.arange(len(levels)) bw = 0.65 / len(model_names) # Left: absolute degraded score with CI ax = axes[0] for i, name in enumerate(model_names): vals, err_lo, err_hi = [], [], [] for lv in levels: deg = s9["degraded"][name][lv] if deg: m, lo, hi = StatEngine.bootstrap_ci(deg) vals.append(m); err_lo.append(m-lo); err_hi.append(hi-m) else: vals.append(0); err_lo.append(0); err_hi.append(0) offset = (i - len(model_names)/2 + 0.5) * bw bars = ax.bar(x+offset, vals, bw, color=COLORS.get(name,"#555"), yerr=[err_lo,err_hi], capsize=3, label=name, alpha=0.88, zorder=3, error_kw={"elinewidth":1.2,"ecolor":"#333"}) for bar_, v in zip(bars, vals): if v > 0.05: ax.text(bar_.get_x()+bar_.get_width()/2, bar_.get_height()+0.04, f"{v:.2f}", ha="center", va="bottom", fontsize=7) ax.set_xticks(x); ax.set_xticklabels(desc, fontsize=9) ax.set_ylabel("Score on degraded query"); ax.set_ylim(0,1.35) ax.set_title("Degraded cue score (mean ± 95% CI)\nhigher is better", fontsize=11, fontweight="bold") ax.legend(fontsize=8, framealpha=0.9) # Right: delta chart ax = axes[1] for i, name in enumerate(model_names): deltas = [] for lv in levels: ex = s9["exact"][name][lv] deg = s9["degraded"][name][lv] if ex and deg: ex_m, _, _ = StatEngine.bootstrap_ci(ex) deg_m, _, _ = StatEngine.bootstrap_ci(deg) deltas.append(round(ex_m-deg_m,3)) else: deltas.append(0.0) offset = (i - len(model_names)/2 + 0.5) * bw bars = ax.bar(x+offset, deltas, bw, color=COLORS.get(name,"#555"), label=name, alpha=0.88, zorder=3) for bar_, v in zip(bars, deltas): ax.text(bar_.get_x()+bar_.get_width()/2, bar_.get_height()+0.01, f"{v:+.2f}", ha="center", va="bottom", fontsize=7) ax.set_xticks(x); ax.set_xticklabels(desc, fontsize=9) ax.set_ylabel("Score drop exact − degraded"); ax.set_ylim(-0.1, 1.0) ax.axhline(0, color="#aaa", linewidth=0.8, linestyle=":") ax.set_title("Degradation delta (lower is better)", fontsize=11, fontweight="bold") ax.legend(fontsize=8, framealpha=0.9) # Annotate FieldOnly vs RAGBaseline on right panel if "FieldOnly" in model_names and "RAGBaseline" in model_names: fo_all = [s for lv in range(1,6) for s in s9["degraded"]["FieldOnly"][lv]] rag_all = [s for lv in range(1,6) for s in s9["degraded"]["RAGBaseline"][lv]] if fo_all and rag_all: u, p, r = StatEngine.mann_whitney(fo_all, rag_all) sig = StatEngine.sig_label(p) eff = StatEngine.effect_label(r) axes[0].text(0.98, 0.02, f"FieldOnly vs RAG: {sig} (r={abs(r or 0):.2f}, {eff})", transform=axes[0].transAxes, ha="right", va="bottom", fontsize=8, color=COLORS.get("FieldOnly","#9B59B6"), bbox=dict(boxstyle="round,pad=0.3", fc="white", alpha=0.8)) fig.suptitle( "Figure 5 — Suite 9: Paraphrase Robustness (24 tests, 5 levels)\n" "FieldOnly (purple) isolates the field's contribution independently of the episodic store", fontsize=12, fontweight="bold", y=1.02) fig.tight_layout() path = f"{output_dir}/fig5_suite9_paraphrase.png" fig.savefig(path, bbox_inches="tight"); plt.close(fig) saved.append(path); print(f" [Graph] {path}") # ── Fig 6: All four suites side-by-side ─────────────────────────────────── suite_data = {} if s7: suite_data["Associative\nInference"] = {n: (statistics.mean(v), *StatEngine.bootstrap_ci(v)[1:]) if v else (0,0,0) for n,v in s7.items()} if s8: suite_data["Pattern\nExtraction"] = {n: (statistics.mean(v), *StatEngine.bootstrap_ci(v)[1:]) if v else (0,0,0) for n,v in s8.items()} if s9: suite_data["Degraded Cue\n(degraded abs)"] = {} for n in model_names: deg = [s for lv in range(1,6) for s in s9["degraded"][n][lv]] if deg: m, lo, hi = StatEngine.bootstrap_ci(deg) suite_data["Degraded Cue\n(degraded abs)"][n] = (m, lo, hi) else: suite_data["Degraded Cue\n(degraded abs)"][n] = (0, 0, 0) if s10: suite_data["Knowledge\nUpdate"] = {n: (statistics.mean(v), *StatEngine.bootstrap_ci(v)[1:]) if v else (0,0,0) for n,v in s10.items()} if suite_data: fig, ax = plt.subplots(figsize=(13,5.5)) labels = list(suite_data.keys()) x = np.arange(len(labels)) bw = 0.65/len(model_names) for i, name in enumerate(model_names): means, err_lo, err_hi = [], [], [] for lbl in labels: tup = suite_data[lbl].get(name,(0,0,0)) m, lo, hi = tup means.append(m); err_lo.append(m-lo); err_hi.append(hi-m) offset = (i - len(model_names)/2 + 0.5) * bw bars = ax.bar(x+offset, means, bw, color=COLORS.get(name,"#555"), yerr=[err_lo,err_hi], capsize=3, label=name, alpha=0.88, zorder=3, error_kw={"elinewidth":1.2,"ecolor":"#333"}) for bar_, m in zip(bars, means): if m > 0.05: ax.text(bar_.get_x()+bar_.get_width()/2, bar_.get_height()+0.04, f"{m:.2f}", ha="center", va="bottom", fontsize=8) ax.set_xticks(x); ax.set_xticklabels(labels, fontsize=10) ax.set_ylabel("Mean score (0 – 1)"); ax.set_ylim(0,1.4) ax.set_title( "Figure 6 — Field Test Report: Suites 7–10 (mean ± 95% bootstrap CI)\n" "FieldOnly (purple) is the new ablation revealing field vs store contribution", fontsize=12, fontweight="bold") ax.legend(framealpha=0.9, fontsize=10) fig.tight_layout() path = f"{output_dir}/fig6_field_overview.png" fig.savefig(path, bbox_inches="tight"); plt.close(fig) saved.append(path); print(f" [Graph] {path}") return saved # ============================================================================= # MAIN # ============================================================================= def run_field_suites(models, trials=TRIALS, suites=None, graphs_dir="./benchmark_plots"): do = suites or ["7","8","9","10"] s7 = s8 = s9 = s10 = None if "7" in do: s7 = run_suite7(models, trials) if "8" in do: s8 = run_suite8(models, trials) if "9" in do: s9 = run_suite9(models, trials) if "10" in do: s10 = run_suite10(models, trials) model_names = list(models.keys()) print("\n" + "═"*70) print(" FIELD TEST REPORT — Suites 7–10 (v2: FieldOnly ablation + proper stats)") print("═"*70) col = 18 print(f" {'Suite':<30}" + "".join(f"{n:>{col}}" for n in model_names)) print(" " + "─"*(30 + col*len(model_names))) def _row(label, data): if data is None: return r = f" {label:<30}" for n in model_names: sc = data.get(n,[]) if isinstance(data,dict) else [] if sc: m, lo, hi = StatEngine.bootstrap_ci(sc) r += f"{m:>{col}.4f}" else: r += f"{'N/A':>{col}}" print(r) _row("Associative inference", s7) _row("Pattern extraction", s8) if s9: all_deg = {n: [s for lv in range(1,6) for s in s9["degraded"][n][lv]] for n in model_names} _row("Degraded cue (paraphrase)", all_deg) _row("Knowledge update", s10) print(" " + "─"*(30 + col*len(model_names))) saved = generate_graphs(s7, s8, s9, s10, models, graphs_dir) if saved: print(f"\n {len(saved)} graphs saved: {', '.join(saved)}") return {"s7":s7,"s8":s8,"s9":s9,"s10":s10} if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--trials", type=int, default=20, help="Trials per test (default 20; use 30 for paper submission)") parser.add_argument("--graphs", type=str, default="./benchmark_plots") parser.add_argument("--no-context", action="store_true") parser.add_argument("--no-field-only", action="store_true") parser.add_argument("--suite", nargs="*", choices=["7","8","9","10"]) args = parser.parse_args() DEVICE = "cuda" if torch.cuda.is_available() else "cpu" print(f"\n{'═'*64}") print(f" Field Benchmark Suites 7–10 v2 | device={DEVICE} trials={args.trials}") if args.trials < 10: print(f" [!] {args.trials} trials is very low — CI will be wide and unreliable") print(f"{'═'*64}") print("\n Loading HybridLLM..."); hybrid = HybridLLM() models = { "HybridLLM": hybrid, "RAGBaseline": RAGBaseline(hybrid.tokenizer, hybrid.model), } if not args.no_field_only: print(" Initialising FieldOnly (ablation)...") models["FieldOnly"] = FieldOnly(hybrid.tokenizer, hybrid.model) print(" FieldOnly ready.") if not args.no_context: models["ContextBaseline"] = ContextBaseline(hybrid.tokenizer, hybrid.model) run_field_suites(models, trials=args.trials, suites=args.suite, graphs_dir=args.graphs)