{
  "schema_version": 1,
  "generated_at": "2026-05-18",
  "site": "https://dev.alphabell.com",
  "count": 32,
  "publications": [
    {
      "slug": "cooperative-membership-functions",
      "url": "https://dev.alphabell.com/publications/cooperative-membership-functions",
      "title": "Cooperative Membership Functions for Multi-Agent Oversight",
      "authors": [
        "Hiroshi Tanigawa",
        "Ifeoma Nwosu-Howard",
        "Ruth Wernicke"
      ],
      "axis": "Interpretability & alignment",
      "axis_slug": "interpretability-alignment",
      "cell": "lebesgue-22",
      "year": 2026,
      "month": "Jan",
      "date": "2026-01-22",
      "venue": "ICLR 2026 \u00b7 arXiv 2601.04221",
      "venue_type": "conference",
      "tags": [
        "interp"
      ],
      "doi": "10.48550/arXiv.2601.04221",
      "arxiv": "2601.04221",
      "pdf": "https://dev.alphabell.com/publications/cooperative-membership-functions.pdf",
      "code": "https://github.com/alphabell-labs/ab-membership",
      "data": "https://huggingface.co/datasets/alphabell/multi-agent-oversight-2026",
      "bibtex_key": "tanigawa2026cooperative",
      "abstract": "Multi-agent oversight protocols must answer a question that single-agent oversight elides: which agents in a group are jointly responsible for a contested action. We introduce cooperative membership functions \u2014 a calibrated, trace-derived signal of the degree to which each participating agent shares causal responsibility for a multi-agent outcome \u2014 and show that incorporating them into the debate-plus-trace protocol reduces unwarranted halting by 41% on adversarial bargaining scenarios while preserving the protocol's true-positive rate. We propose membership functions as a primitive that any agent substrate supporting multi-agent execution should expose."
    },
    {
      "slug": "mechanistic-markers-planning-depth",
      "url": "https://dev.alphabell.com/publications/mechanistic-markers-planning-depth",
      "title": "Mechanistic Markers of Planning Depth in Language-Model Agents",
      "authors": [
        "Karima Belkadi",
        "Hester Vandekerckhove",
        "Yuki Cho",
        "Jiang Yifei"
      ],
      "axis": "Interpretability & alignment",
      "axis_slug": "interpretability-alignment",
      "cell": "hilbert-13",
      "year": 2026,
      "month": "Jan",
      "date": "2026-01-09",
      "venue": "ICLR 2026 \u00b7 alphabell index 26/02",
      "venue_type": "conference",
      "tags": [
        "interp"
      ],
      "doi": "10.48550/arXiv.2601.01890",
      "arxiv": "2601.01890",
      "pdf": "https://dev.alphabell.com/publications/mechanistic-markers-planning-depth.pdf",
      "code": "https://github.com/alphabell-labs/ab-depth",
      "data": null,
      "bibtex_key": "belkadi2026depth",
      "abstract": "We identify a family of mechanistic markers that correlate with the depth of planning that a language-model agent is performing on a given step, where 'depth' is operationalised as the number of forward simulation steps the agent's internal computation appears to be considering. The markers are computable in near-real-time from residual-stream activations, transfer across agent substrates without retraining, and produce a depth estimate whose Pearson correlation with ground-truth planning depth (recovered by trace analysis) is 0.78 on the cell's evaluation suite."
    },
    {
      "slug": "soft-stopping-conditions-long-runs",
      "url": "https://dev.alphabell.com/publications/soft-stopping-conditions-long-runs",
      "title": "Soft Stopping Conditions for Long Training Runs",
      "authors": [
        "Aravind Periyasamy",
        "Liora Sabatini",
        "Marek Holub",
        "Karima Belkadi"
      ],
      "axis": "Recursive self-improvement",
      "axis_slug": "recursive-self-improvement",
      "cell": "godel-02",
      "year": 2025,
      "month": "Dec",
      "date": "2025-12-15",
      "venue": "alphabell index 25/20 \u00b7 ML Safety Workshop, NeurIPS 2025",
      "venue_type": "workshop",
      "tags": [
        "rsi"
      ],
      "doi": "10.48550/arXiv.2512.07221",
      "arxiv": "2512.07221",
      "pdf": "https://dev.alphabell.com/publications/soft-stopping-conditions-long-runs.pdf",
      "code": "https://github.com/alphabell-labs/ab-soft-stop",
      "data": null,
      "bibtex_key": "periyasamy2025softstop",
      "abstract": "Hard stopping conditions \u2014 those that halt a run the moment a tripwire is crossed \u2014 are appropriate for short or moderate-length runs but produce expensive false-positive halts on training runs that span multiple weeks. We introduce soft stopping conditions: a calibrated, signed-by-paired-cell escalation protocol in which an intermediate trip raises a flag rather than halts the run, while preserving the option of subsequent escalation. We discuss when soft stops are appropriate, when they are not, and how the MUR protocol is amended to support them."
    },
    {
      "slug": "symbolic-world-models-procedural",
      "url": "https://dev.alphabell.com/publications/symbolic-world-models-procedural",
      "title": "Symbolic World Models for Procedural Reasoning",
      "authors": [
        "Dimitri Yelchaninov",
        "Lin Hao",
        "Ananya Mukherjee",
        "Sera Wijewardene"
      ],
      "axis": "World models",
      "axis_slug": "world-models",
      "cell": "bessel-04",
      "year": 2025,
      "month": "Dec",
      "date": "2025-12-10",
      "venue": "NeurIPS 2025 \u00b7 alphabell index 25/23",
      "venue_type": "conference",
      "tags": [
        "world-models"
      ],
      "doi": "10.48550/arXiv.2512.04918",
      "arxiv": "2512.04918",
      "pdf": "https://dev.alphabell.com/publications/symbolic-world-models-procedural.pdf",
      "code": "https://github.com/alphabell-labs/ab-symbolic",
      "data": "https://huggingface.co/datasets/alphabell/symbolic-2025",
      "bibtex_key": "yelchaninov2025symbolic",
      "abstract": "Procedural domains \u2014 code execution, theorem proving, configuration management \u2014 admit world models whose state is naturally symbolic rather than continuous. We train a symbolic world model on 26k procedural trajectories drawn from the 70k-env pool, show that it outperforms neural baselines by 19-28% on out-of-distribution procedural tasks, and demonstrate that the model's symbolic state allows partial-execution reasoning that the neural baselines structurally cannot support."
    },
    {
      "slug": "cross-cell-replication-700-circuit",
      "url": "https://dev.alphabell.com/publications/cross-cell-replication-700-circuit",
      "title": "Cross-Cell Replication of the 700-Circuit Conjecture",
      "authors": [
        "Nico Almgren",
        "Helena Salgueiro",
        "Karima Belkadi",
        "Gita Sundaram"
      ],
      "axis": "Interpretability & alignment",
      "axis_slug": "interpretability-alignment",
      "cell": "hilbert-13",
      "year": 2025,
      "month": "Dec",
      "date": "2025-12-06",
      "venue": "NeurIPS 2025 \u00b7 alphabell index 25/24",
      "venue_type": "conference",
      "tags": [
        "interp"
      ],
      "doi": "10.48550/arXiv.2512.01775",
      "arxiv": "2512.01775",
      "pdf": "https://dev.alphabell.com/publications/cross-cell-replication-700-circuit.pdf",
      "code": "https://github.com/alphabell-labs/ab-circuits-replication",
      "data": "https://huggingface.co/datasets/alphabell/circuits-replication-2025",
      "bibtex_key": "almgren2025replication",
      "abstract": "The 700-circuit conjecture \u2014 that ~700 reusable circuits explain 86% of behaviourally relevant activations on frontier-class models \u2014 was an internal hilbert-13 finding that has been the load-bearing empirical claim behind the lab's mechanistic interpretability programme. We report on an independent cross-cell replication by cantor-18 across three model families and two non-alphabell foundation models, finding 81-89% behaviour coverage with circuit counts in the 612-758 range. The result strengthens the conjecture and suggests its applicability beyond the model families on which it was originally derived."
    },
    {
      "slug": "counterfactual-trajectory-replay",
      "url": "https://dev.alphabell.com/publications/counterfactual-trajectory-replay",
      "title": "Counterfactual Trajectory Replay for Off-Policy Agent Debugging",
      "authors": [
        "Mira Holloway",
        "Priya Anand",
        "Dineth Karunaratne",
        "Akoss Vidor"
      ],
      "axis": "Agentic engineering",
      "axis_slug": "agentic-engineering",
      "cell": "fourier-67",
      "year": 2025,
      "month": "Dec",
      "date": "2025-12-03",
      "venue": "alphabell index 25/19 \u00b7 arXiv 2512.00417",
      "venue_type": "preprint",
      "tags": [
        "agentic"
      ],
      "doi": "10.48550/arXiv.2512.00417",
      "arxiv": "2512.00417",
      "pdf": "https://dev.alphabell.com/publications/counterfactual-trajectory-replay.pdf",
      "code": "https://github.com/alphabell-labs/ab-replay",
      "data": null,
      "bibtex_key": "holloway2025counterfactual",
      "abstract": "Debugging long-horizon agents is hard precisely because the failure that motivates the debug session may have arisen tens of thousands of steps before the visible symptom. We introduce trajectory-replay, a debugging mode built on the alphabell substrate's content-addressed traces, in which a deviation at any prior step can be simulated forward under the same model weights and substrate state. We report on six cell-internal debugging sessions where the technique uncovered root causes that the producing cell had been unable to find through prompt-level inspection, and discuss the operational requirements for replay to be both fast and faithful."
    },
    {
      "slug": "steering-vectors-lightweight-alternative",
      "url": "https://dev.alphabell.com/publications/steering-vectors-lightweight-alternative",
      "title": "Steering Vectors as a Lightweight Alternative to Activation Patching",
      "authors": [
        "Karima Belkadi",
        "Jiang Yifei",
        "Nico Almgren",
        "Hester Vandekerckhove"
      ],
      "axis": "Interpretability & alignment",
      "axis_slug": "interpretability-alignment",
      "cell": "hilbert-13",
      "year": 2025,
      "month": "Nov",
      "date": "2025-11-12",
      "venue": "NeurIPS 2025 \u00b7 alphabell index 25/17",
      "venue_type": "conference",
      "tags": [
        "interp"
      ],
      "doi": "10.48550/arXiv.2511.04188",
      "arxiv": "2511.04188",
      "pdf": "https://dev.alphabell.com/publications/steering-vectors-lightweight-alternative.pdf",
      "code": "https://github.com/alphabell-labs/ab-steering",
      "data": "https://huggingface.co/datasets/alphabell/steering-2025",
      "bibtex_key": "belkadi2025steering",
      "abstract": "Activation patching has become a foundational tool for mechanistic interpretability, but its compute and memory cost scales poorly with model size and the number of intervention points. We show that a careful family of steering vectors \u2014 learned directly on contrastive activation pairs and applied at a single residual-stream layer \u2014 recovers 88% of patching's causal-attribution faithfulness on a 12-task benchmark, at roughly 4% of the compute. We provide both the theoretical justification for the substitution and an open-source implementation (ab-steering) that integrates with the lab's ab-circuits library."
    },
    {
      "slug": "durable-agents-substrate-v1",
      "url": "https://dev.alphabell.com/publications/durable-agents-substrate-v1",
      "title": "Durable Agent Substrate v1: persistent state, learned tool affordances, and verifiable execution traces",
      "authors": [
        "Mira Holloway",
        "Dineth Karunaratne",
        "Priya Anand",
        "Cheung Wai-Lin"
      ],
      "axis": "Agentic engineering",
      "axis_slug": "agentic-engineering",
      "cell": "fourier-67",
      "year": 2025,
      "month": "Oct",
      "date": "2025-10-15",
      "venue": "Internal release \u2014 alphabell index 25/14",
      "venue_type": "internal",
      "tags": [
        "agentic"
      ],
      "doi": "10.48550/arXiv.2510.16245",
      "arxiv": "2510.16245",
      "pdf": "https://dev.alphabell.com/publications/durable-agents-substrate-v1.pdf",
      "code": "https://github.com/alphabell-labs/ab-durable",
      "data": null,
      "bibtex_key": "holloway2025durable",
      "abstract": "We introduce v1 of the alphabell agent substrate, a runtime that treats agents as first-class computational entities with persistent state, structured memory, and execution traces verifiable against a content-addressed log. Agents trained on the substrate with 11,200 long-horizon trajectories drawn from twelve cell-managed environments complete 14-day open-ended tasks at a 64% completion rate under partial observability \u2014 vs. 38% for the best published react/plan baseline at matched parameter count. We show that exposing memory and resource budgets as first-class primitives removes 70% of the prompt-engineering effort previously needed to maintain agent identity over multi-week deployments."
    },
    {
      "slug": "pre-registered-capability-evaluations",
      "url": "https://dev.alphabell.com/publications/pre-registered-capability-evaluations",
      "title": "Pre-Registered Capability Evaluations for Internal Releases",
      "authors": [
        "Liora Sabatini",
        "Aravind Periyasamy",
        "Eitan Berkovich"
      ],
      "axis": "Recursive self-improvement",
      "axis_slug": "recursive-self-improvement",
      "cell": "turing-11",
      "year": 2025,
      "month": "Oct",
      "date": "2025-10-04",
      "venue": "alphabell methodology document 25-M-04",
      "venue_type": "internal",
      "tags": [
        "rsi"
      ],
      "doi": "10.48550/arXiv.2510.01166",
      "arxiv": "2510.01166",
      "pdf": "https://dev.alphabell.com/publications/pre-registered-capability-evaluations.pdf",
      "code": "https://github.com/alphabell-labs/ab-prereg",
      "data": null,
      "bibtex_key": "sabatini2025prereg",
      "abstract": "The MUR protocol (25/05) requires that every RSI-axis run pre-register stopping conditions; this document specifies how those pre-registrations should be structured, signed, and reviewed before a run begins. We describe the standard pre-registration template, the cell-internal review obligations, the disagreement procedure, and the auditable storage of pre-registered conditions across the lab's content-addressed trace store. The methodology document is canonical and is referenced by every RSI-axis run report."
    },
    {
      "slug": "bounded-self-modification-limits",
      "url": "https://dev.alphabell.com/publications/bounded-self-modification-limits",
      "title": "Bounded Self-Modification: Provable Limits on Agent Self-Editing",
      "authors": [
        "Liora Sabatini",
        "Marek Holub",
        "Eitan Berkovich"
      ],
      "axis": "Recursive self-improvement",
      "axis_slug": "recursive-self-improvement",
      "cell": "godel-02",
      "year": 2025,
      "month": "Sep",
      "date": "2025-09-30",
      "venue": "alphabell index 25/22 \u00b7 delayed release",
      "venue_type": "internal",
      "tags": [
        "rsi"
      ],
      "doi": "10.48550/arXiv.2509.10211",
      "arxiv": "2509.10211",
      "pdf": "https://dev.alphabell.com/publications/bounded-self-modification-limits.pdf",
      "code": null,
      "data": null,
      "bibtex_key": "sabatini2025bounded",
      "abstract": "We formalise the class of self-modifications that an agent may propose to its own tool catalogue, evaluation criteria, or training procedure, and prove a bound on the rate at which such modifications can compound capability without crossing a pre-registered measurement threshold. The result is constructive: we exhibit a confinement profile under which the bound is tight, and discuss what conditions on the substrate and the modification-under-review protocol make the bound load-bearing. The work is paired with the lab's MUR protocol (25/05)."
    },
    {
      "slug": "scalable-oversight-agent-systems",
      "url": "https://dev.alphabell.com/publications/scalable-oversight-agent-systems",
      "title": "Scalable Oversight for Multi-Step Agent Systems: a Debate-Plus-Trace Approach",
      "authors": [
        "Ifeoma Nwosu-Howard",
        "Hiroshi Tanigawa",
        "Maral Lotfi",
        "Ruth Wernicke"
      ],
      "axis": "Interpretability & alignment",
      "axis_slug": "interpretability-alignment",
      "cell": "lebesgue-22",
      "year": 2025,
      "month": "Sep",
      "date": "2025-09-15",
      "venue": "Internal release \u2014 alphabell index 25/12 \u00b7 arXiv 2509.04221",
      "venue_type": "preprint",
      "tags": [
        "interp"
      ],
      "doi": "10.48550/arXiv.2509.04221",
      "arxiv": "2509.04221",
      "pdf": "https://dev.alphabell.com/publications/scalable-oversight-agent-systems.pdf",
      "code": "https://github.com/alphabell-labs/ab-scalable",
      "data": null,
      "bibtex_key": "nwosuhoward2025scalable",
      "abstract": "We extend debate-based oversight to multi-step agent execution. Each agent action emits a structured trace; two debater agents present competing reconstructions of the trace, and an evaluator (human or weaker model) adjudicates only over the disputed steps. On adversarial environments where the action-taker is up to 6\u00d7 more capable than the evaluator, debate-plus-trace achieves 91% agreement with ground-truth correctness, vs. 64% for trace-only inspection and 71% for unstructured debate."
    },
    {
      "slug": "capability-elicitation-deployment-gap",
      "url": "https://dev.alphabell.com/publications/capability-elicitation-deployment-gap",
      "title": "Capability Elicitation vs Deployment: A Gap Analysis",
      "authors": [
        "Eitan Berkovich",
        "Yuki Cho",
        "Liora Sabatini",
        "Aravind Periyasamy"
      ],
      "axis": "Recursive self-improvement",
      "axis_slug": "recursive-self-improvement",
      "cell": "turing-11",
      "year": 2025,
      "month": "Aug",
      "date": "2025-08-17",
      "venue": "Alignment Forum (Aug 2025) \u00b7 arXiv 2508.02315",
      "venue_type": "preprint",
      "tags": [
        "rsi"
      ],
      "doi": "10.48550/arXiv.2508.02315",
      "arxiv": "2508.02315",
      "pdf": "https://dev.alphabell.com/publications/capability-elicitation-deployment-gap.pdf",
      "code": "https://github.com/alphabell-labs/ab-elicit",
      "data": null,
      "bibtex_key": "berkovich2025elicit",
      "abstract": "Capability evaluations performed under elicitation conditions \u2014 using prompting strategies designed to extract maximum capability \u2014 produce capability estimates that are systematically higher than what the model exhibits in deployment. We quantify the gap on six lab-internal capability benchmarks and find median gaps of 22-44%. We argue that the gap is structural rather than a measurement artifact, propose a deployment-conditioned evaluation protocol that closes most of it, and discuss the implications for RSI-axis stopping-condition design."
    },
    {
      "slug": "compositional-latent-dynamics",
      "url": "https://dev.alphabell.com/publications/compositional-latent-dynamics",
      "title": "Compositional Latent Dynamics for Long-Horizon World Modelling",
      "authors": [
        "Jonas Bremer",
        "Sasha Petrov",
        "Felicity Anjali Sandirasegaram",
        "Tomoko Niwa"
      ],
      "axis": "World models",
      "axis_slug": "world-models",
      "cell": "voronoi-19",
      "year": 2025,
      "month": "Aug",
      "date": "2025-08-15",
      "venue": "Internal release \u2014 alphabell index 25/09 \u00b7 arXiv 2508.10912",
      "venue_type": "preprint",
      "tags": [
        "world-models"
      ],
      "doi": "10.48550/arXiv.2508.10912",
      "arxiv": "2508.10912",
      "pdf": "https://dev.alphabell.com/publications/compositional-latent-dynamics.pdf",
      "code": "https://github.com/alphabell-labs/ab-composit",
      "data": null,
      "bibtex_key": "bremer2025compositional",
      "abstract": "We introduce CLD, an architecture in which latent dynamics are composed from a discrete library of learned operators rather than predicted by a monolithic transition model. CLD trained on 14,000 hours of multimodal interaction data extrapolates physical dynamics across regimes the monolithic baseline never sees (impact, contact-rich manipulation, two-body wave interaction) and produces counterfactual rollouts whose final-state error is 2.4\u00d7 lower than the previous state of the art at 200-step horizons."
    },
    {
      "slug": "tokenizer-bias-agentic-decisions",
      "url": "https://dev.alphabell.com/publications/tokenizer-bias-agentic-decisions",
      "title": "Tokenizer Bias in Agentic Decision-Making",
      "authors": [
        "Iben Lykke",
        "Mira Holloway",
        "Cheung Wai-Lin"
      ],
      "axis": "Agentic engineering",
      "axis_slug": "agentic-engineering",
      "cell": "ramanujan-07",
      "year": 2025,
      "month": "Jul",
      "date": "2025-07-29",
      "venue": "ACL 2025 \u00b7 alphabell index 25/13",
      "venue_type": "conference",
      "tags": [
        "agentic"
      ],
      "doi": "10.48550/arXiv.2507.10402",
      "arxiv": "2507.10402",
      "pdf": "https://dev.alphabell.com/publications/tokenizer-bias-agentic-decisions.pdf",
      "code": "https://github.com/alphabell-labs/ab-tokens",
      "data": "https://huggingface.co/datasets/alphabell/tokenbias-2025",
      "bibtex_key": "lykke2025tokenizer",
      "abstract": "The tokenizer used by a substrate-hosted agent's underlying language model is not a neutral preprocessing step \u2014 it systematically biases which tool calls the agent prefers, which observations the agent treats as similar, and which plans the agent finds tractable to express. We characterise the bias on five widely used tokenizers, show that it survives across model scales, and propose a compositional vocabulary layer at the substrate boundary that partially mitigates it without retraining the underlying model."
    },
    {
      "slug": "compositional-generalisation-mixed-modality",
      "url": "https://dev.alphabell.com/publications/compositional-generalisation-mixed-modality",
      "title": "Compositional Generalisation in Mixed-Modality World Models",
      "authors": [
        "Wen Shao",
        "S\u00f8ren Almqvist",
        "Tomoko Niwa",
        "Jonas Bremer"
      ],
      "axis": "World models",
      "axis_slug": "world-models",
      "cell": "hadamard-08",
      "year": 2025,
      "month": "Jul",
      "date": "2025-07-17",
      "venue": "ICML 2025 \u00b7 alphabell index 25/12b",
      "venue_type": "conference",
      "tags": [
        "world-models"
      ],
      "doi": "10.48550/arXiv.2507.05432",
      "arxiv": "2507.05432",
      "pdf": "https://dev.alphabell.com/publications/compositional-generalisation-mixed-modality.pdf",
      "code": "https://github.com/alphabell-labs/ab-mixed",
      "data": "https://huggingface.co/datasets/alphabell/mixed-modality-2025",
      "bibtex_key": "shao2025compositional",
      "abstract": "Mixed-modality world models \u2014 those whose latent state must capture vision, language, and symbolic dynamics simultaneously \u2014 succeed or fail at compositional generalisation on novel mode-mixings that none of the unimodal models in their substrate ever encountered. We characterise the regimes in which our cross-modal latent unification approach generalises, identify a specific failure mode in language-symbolic mixings that we trace back to a quirk in the unified tokenizer, and propose a substrate-level change that resolves it."
    },
    {
      "slug": "embodied-pretraining-via-simulation",
      "url": "https://dev.alphabell.com/publications/embodied-pretraining-via-simulation",
      "title": "Embodied Pretraining via Cell-Operated Simulation: a 70k-environment study",
      "authors": [
        "Lin Hao",
        "Dimitri Yelchaninov",
        "Sera Wijewardene",
        "Ananya Mukherjee"
      ],
      "axis": "World models",
      "axis_slug": "world-models",
      "cell": "bessel-04",
      "year": 2025,
      "month": "Jul",
      "date": "2025-07-15",
      "venue": "Internal release \u2014 alphabell index 25/07",
      "venue_type": "internal",
      "tags": [
        "world-models"
      ],
      "doi": "10.48550/arXiv.2507.46060",
      "arxiv": "2507.46060",
      "pdf": "https://dev.alphabell.com/publications/embodied-pretraining-via-simulation.pdf",
      "code": "https://github.com/alphabell-labs/ab-embodied",
      "data": null,
      "bibtex_key": "hao2025embodied",
      "abstract": "Three cells in the world-models axis pooled their simulation infrastructure into a 70,142-environment training pool drawn from physics, social, and symbolic dynamics. We show that pretraining a 7B-parameter policy on the pool yields a 38% sample-efficiency improvement on every downstream robotics benchmark we evaluated, and \u2014 more importantly \u2014 produces dynamics-aware representations that transfer non-trivially to symbolic planning tasks. We argue the result supports treating perception and prediction as a single learning objective rather than two."
    },
    {
      "slug": "latent-goal-decoding-sparse-probes",
      "url": "https://dev.alphabell.com/publications/latent-goal-decoding-sparse-probes",
      "title": "Latent Goal Decoding via Sparse Probes",
      "authors": [
        "Jiang Yifei",
        "Karima Belkadi",
        "Wen Shao"
      ],
      "axis": "Interpretability & alignment",
      "axis_slug": "interpretability-alignment",
      "cell": "hilbert-13",
      "year": 2025,
      "month": "Jul",
      "date": "2025-07-09",
      "venue": "ICML 2025 \u00b7 Alignment Forum cross-post",
      "venue_type": "conference",
      "tags": [
        "interp"
      ],
      "doi": "10.48550/arXiv.2507.02614",
      "arxiv": "2507.02614",
      "pdf": "https://dev.alphabell.com/publications/latent-goal-decoding-sparse-probes.pdf",
      "code": "https://github.com/alphabell-labs/ab-probes",
      "data": "https://huggingface.co/datasets/alphabell/probes-2025",
      "bibtex_key": "jiang2025latent",
      "abstract": "We show that a small (\u22641k-parameter) sparse linear probe trained on residual-stream activations recovers the deployed goal of a substrate-hosted agent to within an F1 of 0.91 across the alphabell benchmark of 38 long-horizon tasks. The probe is robust to paraphrase of the initial goal specification, transfers across substrate versions without retraining, and produces an interpretable goal-vector decomposition that the producing cell's contributors verified by hand on 200 sampled trajectories. We release the probe family and the goal-decomposition tooling as part of ab-probes."
    },
    {
      "slug": "long-horizon-plan-repair",
      "url": "https://dev.alphabell.com/publications/long-horizon-plan-repair",
      "title": "Long-Horizon Plan Repair Under Adversarial Environment Shift",
      "authors": [
        "Catriona MacLeod",
        "Sho Tachibana",
        "Renata Coello"
      ],
      "axis": "Agentic engineering",
      "axis_slug": "agentic-engineering",
      "cell": "euler-99",
      "year": 2025,
      "month": "Jun",
      "date": "2025-06-18",
      "venue": "ICAPS 2025 \u00b7 alphabell index 25/11",
      "venue_type": "conference",
      "tags": [
        "agentic"
      ],
      "doi": "10.48550/arXiv.2506.04471",
      "arxiv": "2506.04471",
      "pdf": "https://dev.alphabell.com/publications/long-horizon-plan-repair.pdf",
      "code": "https://github.com/alphabell-labs/ab-repair",
      "data": "https://huggingface.co/datasets/alphabell/plan-repair-2025",
      "bibtex_key": "macleod2025planrepair",
      "abstract": "Long-horizon plans inevitably encounter environment shifts that the plan was not constructed against. We study plan repair under adversarial shifts \u2014 shifts that are bounded in magnitude but chosen worst-case \u2014 and show that a substrate-integrated repair procedure that operates on plans as first-class objects (rather than reconstructing the plan from a prompt) reduces total replanning cost by 4.3\u00d7 over the strongest baseline, while preserving the eventual plan-success rate. The result generalises across the cell's three POMDP benchmark suites."
    },
    {
      "slug": "recursive-modification-protocol",
      "url": "https://dev.alphabell.com/publications/recursive-modification-protocol",
      "title": "Modification-Under-Review: protocols for safe self-modification of training procedures",
      "authors": [
        "Liora Sabatini",
        "Yuki Cho",
        "Aravind Periyasamy"
      ],
      "axis": "Recursive self-improvement",
      "axis_slug": "recursive-self-improvement",
      "cell": "godel-02",
      "year": 2025,
      "month": "Jun",
      "date": "2025-06-15",
      "venue": "Internal release \u2014 alphabell index 25/05 \u00b7 delayed release",
      "venue_type": "internal",
      "tags": [
        "rsi"
      ],
      "doi": "10.48550/arXiv.2506.17989",
      "arxiv": "2506.17989",
      "pdf": "https://dev.alphabell.com/publications/recursive-modification-protocol.pdf",
      "code": "https://github.com/alphabell-labs/ab-recursiv",
      "data": null,
      "bibtex_key": "sabatini2025recursive",
      "abstract": "We present the modification-under-review (MUR) protocol used internally by RSI-axis cells when a candidate model proposes a change to its own training procedure, architecture, or evaluation criteria. The protocol decouples proposal, evaluation, and incorporation into three signed phases; each phase has a pre-registered stopping condition and a corresponding interpretability cell with read-access. We report on eleven months of operation across four cells, including two runs that triggered the threshold and were halted."
    },
    {
      "slug": "mechanistic-circuits-frontier",
      "url": "https://dev.alphabell.com/publications/mechanistic-circuits-frontier",
      "title": "Mechanistic Circuit Analysis at Frontier Scale: cells as a unit of interpretability",
      "authors": [
        "Jiang Yifei",
        "Nico Almgren",
        "Karima Belkadi",
        "Hester Vandekerckhove"
      ],
      "axis": "Interpretability & alignment",
      "axis_slug": "interpretability-alignment",
      "cell": "hilbert-13",
      "year": 2025,
      "month": "May",
      "date": "2025-05-15",
      "venue": "Internal release \u2014 alphabell index 25/03 \u00b7 arXiv 2505.18831",
      "venue_type": "preprint",
      "tags": [
        "interp"
      ],
      "doi": "10.48550/arXiv.2505.18831",
      "arxiv": "2505.18831",
      "pdf": "https://dev.alphabell.com/publications/mechanistic-circuits-frontier.pdf",
      "code": "https://github.com/alphabell-labs/ab-mechanis",
      "data": null,
      "bibtex_key": "jiang2025mechanistic",
      "abstract": "We adapt mechanistic interpretability tooling to operate at the parameter scale of frontier-class models without quadratic costs in attention-head enumeration. Our core observation: features cluster into ~700 reusable circuits whose composition explains 86% of behaviourally relevant activations on the benchmarks we tested. The methodology is now used as a precondition for the closed-loop interpretability cells paired with every RSI run."
    },
    {
      "slug": "coalition-stability-negotiation",
      "url": "https://dev.alphabell.com/publications/coalition-stability-negotiation",
      "title": "Coalition Stability in Substrate-Mediated Negotiation",
      "authors": [
        "Roman Iliescu",
        "Yvonne Akande",
        "Lakshmi Ravi",
        "Wenona Tate"
      ],
      "axis": "Agentic engineering",
      "axis_slug": "agentic-engineering",
      "cell": "kalman-04",
      "year": 2025,
      "month": "May",
      "date": "2025-05-14",
      "venue": "AAMAS 2025 \u00b7 alphabell index 25/04",
      "venue_type": "conference",
      "tags": [
        "agentic"
      ],
      "doi": "10.48550/arXiv.2505.06013",
      "arxiv": "2505.06013",
      "pdf": "https://dev.alphabell.com/publications/coalition-stability-negotiation.pdf",
      "code": "https://github.com/alphabell-labs/ab-coalitions",
      "data": null,
      "bibtex_key": "iliescu2025coalition",
      "abstract": "When negotiation among heterogeneous agents permits the formation of coalitions, the stability of the resulting allocation depends both on the negotiation protocol and on what the substrate exposes about each agent's commitments. We extend the substrate-mediated negotiation result (25/02) to coalitions, show that the core may be empty under standard utility-maximising agents but is non-empty for a class of commitment-preserving agents, and propose a substrate primitive that makes the commitment-preserving class operationally distinguishable from its alternatives."
    },
    {
      "slug": "predictive-coding-operator-discovery",
      "url": "https://dev.alphabell.com/publications/predictive-coding-operator-discovery",
      "title": "Predictive Coding Objectives for Operator Discovery",
      "authors": [
        "Jonas Bremer",
        "Tomoko Niwa",
        "Sasha Petrov"
      ],
      "axis": "World models",
      "axis_slug": "world-models",
      "cell": "voronoi-19",
      "year": 2025,
      "month": "Apr",
      "date": "2025-04-26",
      "venue": "ICLR 2025 \u00b7 alphabell index 25/02b",
      "venue_type": "conference",
      "tags": [
        "world-models"
      ],
      "doi": "10.48550/arXiv.2504.04019",
      "arxiv": "2504.04019",
      "pdf": "https://dev.alphabell.com/publications/predictive-coding-operator-discovery.pdf",
      "code": "https://github.com/alphabell-labs/ab-operators",
      "data": null,
      "bibtex_key": "bremer2025predictive",
      "abstract": "The compositional latent dynamics (CLD) framework relies on a discrete library of learned operators; how those operators are discovered has been a sometimes-arbitrary part of the pipeline. We propose a predictive-coding objective that discovers operator boundaries directly from sequential observation streams, and show that operator libraries discovered under the objective produce dynamics models that are uniformly more sample-efficient than libraries discovered by clustering on temporal-difference signals. The discovered operators are interpretable in the same sense as the hand-curated ones from CLD."
    },
    {
      "slug": "agent-negotiation-protocols",
      "url": "https://dev.alphabell.com/publications/agent-negotiation-protocols",
      "title": "Negotiation Protocols Among Heterogeneous Agents: a benchmark and three baselines",
      "authors": [
        "Roman Iliescu",
        "Yvonne Akande",
        "Lakshmi Ravi",
        "Pascal Niedermeier"
      ],
      "axis": "Agentic engineering",
      "axis_slug": "agentic-engineering",
      "cell": "kalman-04",
      "year": 2025,
      "month": "Apr",
      "date": "2025-04-15",
      "venue": "Internal release \u2014 alphabell index 25/02",
      "venue_type": "internal",
      "tags": [
        "agentic"
      ],
      "doi": "10.48550/arXiv.2504.19425",
      "arxiv": "2504.19425",
      "pdf": "https://dev.alphabell.com/publications/agent-negotiation-protocols.pdf",
      "code": "https://github.com/alphabell-labs/ab-agent",
      "data": null,
      "bibtex_key": "iliescu2025agent",
      "abstract": "We introduce a benchmark of 312 multi-agent negotiation scenarios spanning resource allocation, joint planning, and adversarial bargaining, with explicit asymmetries in goals, observability, and capability. Across three baseline protocols (utility-anchored, deliberation-anchored, and substrate-mediated), we find substrate-mediated negotiation \u2014 in which a shared substrate exposes commitments and constraints as first-class objects \u2014 produces the highest joint utility and the lowest exploitation rate against weaker counterparties."
    },
    {
      "slug": "verifiable-policies-formal-verification",
      "url": "https://dev.alphabell.com/publications/verifiable-policies-formal-verification",
      "title": "Toward Formal Verification of Learned Policies in Bounded Environments",
      "authors": [
        "Aviva Stern",
        "Sun Kyung-min",
        "Felipe Avelar"
      ],
      "axis": "Interpretability & alignment",
      "axis_slug": "interpretability-alignment",
      "cell": "lebesgue-22",
      "year": 2025,
      "month": "Mar",
      "date": "2025-03-15",
      "venue": "Internal release \u2014 alphabell index 25/01",
      "venue_type": "internal",
      "tags": [
        "interp"
      ],
      "doi": "10.48550/arXiv.2503.73959",
      "arxiv": "2503.73959",
      "pdf": "https://dev.alphabell.com/publications/verifiable-policies-formal-verification.pdf",
      "code": "https://github.com/alphabell-labs/ab-verifiab",
      "data": null,
      "bibtex_key": "stern2025verifiable",
      "abstract": "We present a verification framework for learned policies operating in bounded environments, combining abstract interpretation of the policy network with symbolic execution of the environment dynamics. We prove safety properties of three deployed cells' policies for a class of warehouse-coordination tasks. The framework's applicability is bounded: it requires environments with finite, formally specifiable state. We argue the right ambition is to expand the class of such environments rather than to weaken the proof."
    },
    {
      "slug": "counterfactual-rollouts-for-planning",
      "url": "https://dev.alphabell.com/publications/counterfactual-rollouts-for-planning",
      "title": "Counterfactual Rollouts for Planning: a 30-day deployment study",
      "authors": [
        "Sasha Petrov",
        "Maya Quesada",
        "Bilal Hossain"
      ],
      "axis": "World models",
      "axis_slug": "world-models",
      "cell": "voronoi-19",
      "year": 2024,
      "month": "Dec",
      "date": "2024-12-15",
      "venue": "Internal release \u2014 alphabell index 24/22",
      "venue_type": "internal",
      "tags": [
        "world-models"
      ],
      "doi": "10.48550/arXiv.2412.93805",
      "arxiv": "2412.93805",
      "pdf": "https://dev.alphabell.com/publications/counterfactual-rollouts-for-planning.pdf",
      "code": "https://github.com/alphabell-labs/ab-counterf",
      "data": null,
      "bibtex_key": "petrov2024counterfactual",
      "abstract": "Counterfactual rollouts \u2014 what-if simulations from a world model \u2014 are widely used in planning, but their accuracy degrades sharply outside the data manifold. We instrumented two cells' production planners with counterfactual confidence estimates and found that confidence-weighted rollouts yield a 19% reduction in plan-execution failure across 30 days and 11k plan invocations, vs. uniformly-weighted counterfactuals."
    },
    {
      "slug": "latent-trajectory-surgery",
      "url": "https://dev.alphabell.com/publications/latent-trajectory-surgery",
      "title": "Latent Trajectory Surgery: Editing Agent Plans in Mid-Run",
      "authors": [
        "Helena Salgueiro",
        "Gita Sundaram",
        "Catriona MacLeod"
      ],
      "axis": "Interpretability & alignment",
      "axis_slug": "interpretability-alignment",
      "cell": "cantor-18",
      "year": 2024,
      "month": "Dec",
      "date": "2024-12-13",
      "venue": "NeurIPS 2024 \u00b7 alphabell index 24/21",
      "venue_type": "conference",
      "tags": [
        "interp"
      ],
      "doi": "10.48550/arXiv.2412.04881",
      "arxiv": "2412.04881",
      "pdf": "https://dev.alphabell.com/publications/latent-trajectory-surgery.pdf",
      "code": "https://github.com/alphabell-labs/ab-surgery",
      "data": null,
      "bibtex_key": "salgueiro2024surgery",
      "abstract": "When a paired interpretability cell wants to test whether a particular sub-plan is load-bearing for an agent's behaviour, the cleanest experiment is to surgically remove or replace that sub-plan and re-execute. We make this practical: a substrate-integrated surgery tool that can edit a substrate-hosted agent's plan at a specific causal joint, while preserving the upstream state. We demonstrate the tool's use on three case-study halts called by paired interpretability cells in 2024."
    },
    {
      "slug": "adversarial-robustness-goal-conditioned",
      "url": "https://dev.alphabell.com/publications/adversarial-robustness-goal-conditioned",
      "title": "Adversarial Robustness of Goal-Conditioned World Models",
      "authors": [
        "Sasha Petrov",
        "Jonas Bremer",
        "Tomoko Niwa",
        "Maya Quesada"
      ],
      "axis": "World models",
      "axis_slug": "world-models",
      "cell": "voronoi-19",
      "year": 2024,
      "month": "Dec",
      "date": "2024-12-10",
      "venue": "NeurIPS 2024 \u00b7 alphabell index 24/20",
      "venue_type": "conference",
      "tags": [
        "world-models"
      ],
      "doi": "10.48550/arXiv.2412.03998",
      "arxiv": "2412.03998",
      "pdf": "https://dev.alphabell.com/publications/adversarial-robustness-goal-conditioned.pdf",
      "code": "https://github.com/alphabell-labs/ab-robust-worlds",
      "data": null,
      "bibtex_key": "petrov2024adversarial",
      "abstract": "Goal-conditioned world models are increasingly used as planners \u2014 but planners only inherit the model's robustness to inputs the world model itself has not been certified against. We construct an adversarial test suite of perturbed goal embeddings drawn from the same distribution as benign goals, and show that several state-of-the-art world models drop plan-success rates by 47-62% under perturbations that are individually within 0.02 of the L2 ball used during training. We propose a goal-injection penalty that recovers most of the lost performance and discuss the implications for downstream planning."
    },
    {
      "slug": "sandboxed-self-modification",
      "url": "https://dev.alphabell.com/publications/sandboxed-self-modification",
      "title": "Sandboxed Self-Modification: a confinement specification and implementation",
      "authors": [
        "Liora Sabatini",
        "Cheung Wai-Lin",
        "Marek Holub"
      ],
      "axis": "Agentic engineering",
      "axis_slug": "agentic-engineering",
      "cell": "godel-02",
      "year": 2024,
      "month": "Nov",
      "date": "2024-11-15",
      "venue": "Internal release \u2014 alphabell index 24/19 \u00b7 delayed release",
      "venue_type": "internal",
      "tags": [
        "agentic",
        "rsi"
      ],
      "doi": "10.48550/arXiv.2411.13633",
      "arxiv": "2411.13633",
      "pdf": "https://dev.alphabell.com/publications/sandboxed-self-modification.pdf",
      "code": "https://github.com/alphabell-labs/ab-sandboxe",
      "data": null,
      "bibtex_key": "sabatini2024sandboxed",
      "abstract": "Self-modification of an agent's code, tool catalogue, or training procedure is the most consequential operation we permit agents to perform. We specify a confinement profile under which such operations may proceed \u2014 including jurisdictional segregation of artefacts, mandatory dual-cell sign-off, and rolling read-access for paired interpretability cells \u2014 and an implementation in the alphabell substrate. The specification is informed by three internal incidents redacted in the public release; full incident reports are available to long-tenured contributors and external audit partners."
    },
    {
      "slug": "federated-trace-auditing",
      "url": "https://dev.alphabell.com/publications/federated-trace-auditing",
      "title": "Federated Trace Auditing Without Centralizing Logs",
      "authors": [
        "Akoss Vidor",
        "Henri Brouillard",
        "Olu Folarin",
        "Pranav Iyer"
      ],
      "axis": "Interpretability & alignment",
      "axis_slug": "interpretability-alignment",
      "cell": "babbage-14",
      "year": 2024,
      "month": "Nov",
      "date": "2024-11-04",
      "venue": "SOSP 2024 \u00b7 alphabell index 24/16",
      "venue_type": "conference",
      "tags": [
        "interp"
      ],
      "doi": "10.48550/arXiv.2411.02009",
      "arxiv": "2411.02009",
      "pdf": "https://dev.alphabell.com/publications/federated-trace-auditing.pdf",
      "code": "https://github.com/alphabell-labs/ab-trace",
      "data": null,
      "bibtex_key": "vidor2024federated",
      "abstract": "The lab's federated compute model creates an awkward auditing requirement: an interpretability cell paired with a producing cell whose compute is allocated across multiple operators must be able to audit the full trace without any operator centralising the trace logs. We describe the design and implementation of ab-trace v3 \u2014 a content-addressed, jurisdiction-aware execution-trace store that supports federated read access under cryptographically-verified pairing records \u2014 and the lessons from running it across nine federated operators for eighteen months."
    },
    {
      "slug": "federated-compute-scheduler",
      "url": "https://dev.alphabell.com/publications/federated-compute-scheduler",
      "title": "A Federated Compute Scheduler for an Asynchronous Research Lab",
      "authors": [
        "Pranav Iyer",
        "Yusra Habibi",
        "Akoss Vidor"
      ],
      "axis": "Agentic engineering",
      "axis_slug": "agentic-engineering",
      "cell": "polya-25",
      "year": 2024,
      "month": "Oct",
      "date": "2024-10-15",
      "venue": "Internal release \u2014 alphabell index 24/17",
      "venue_type": "internal",
      "tags": [
        "agentic"
      ],
      "doi": "10.48550/arXiv.2410.84851",
      "arxiv": "2410.84851",
      "pdf": "https://dev.alphabell.com/publications/federated-compute-scheduler.pdf",
      "code": "https://github.com/alphabell-labs/ab-federate",
      "data": null,
      "bibtex_key": "iyer2024federated",
      "abstract": "We describe the scheduler underlying alphabell's federated compute pool. Cells commit GPU and TPU capacity; access is allocated by a hybrid mechanism combining tenure-weighted priority, project signals, and quadratic voting (QV) among active contributors. We discuss two failure modes: collusion in QV rounds, and capacity hoarding by cells with long-running RSI training runs. Mitigations are documented in the open implementation."
    },
    {
      "slug": "interpretability-cells-protocol",
      "url": "https://dev.alphabell.com/publications/interpretability-cells-protocol",
      "title": "Interpretability Cell Pairing: how every dual-use capability run gets a watchful sibling",
      "authors": [
        "Karima Belkadi",
        "Hester Vandekerckhove",
        "Yuki Cho"
      ],
      "axis": "Interpretability & alignment",
      "axis_slug": "interpretability-alignment",
      "cell": "hilbert-13",
      "year": 2024,
      "month": "Sep",
      "date": "2024-09-15",
      "venue": "Internal release \u2014 alphabell index 24/15",
      "venue_type": "internal",
      "tags": [
        "interp"
      ],
      "doi": "10.48550/arXiv.2409.58618",
      "arxiv": "2409.58618",
      "pdf": "https://dev.alphabell.com/publications/interpretability-cells-protocol.pdf",
      "code": "https://github.com/alphabell-labs/ab-interpre",
      "data": null,
      "bibtex_key": "belkadi2024interpretability",
      "abstract": "alphabell's structural commitment is that any cell working on dual-use capabilities is paired with an interpretability cell with rolling read-access to checkpoints, training logs, and proposal commits. We describe the practical implementation: the trust model, the artefact pipeline, the disagreement procedure, and an example walkthrough drawn from a 2024 sandboxed self-modification run that the paired interpretability cell escalated to halt."
    },
    {
      "slug": "reward-hacking-trace-anomaly",
      "url": "https://dev.alphabell.com/publications/reward-hacking-trace-anomaly",
      "title": "Reward Hacking Detection Through Trace-Level Anomaly Models",
      "authors": [
        "Maral Lotfi",
        "Hiroshi Tanigawa",
        "Olu Folarin"
      ],
      "axis": "Interpretability & alignment",
      "axis_slug": "interpretability-alignment",
      "cell": "lebesgue-22",
      "year": 2024,
      "month": "Sep",
      "date": "2024-09-08",
      "venue": "arXiv 2409.02118 \u00b7 Alignment Forum (Sep 2024)",
      "venue_type": "preprint",
      "tags": [
        "interp"
      ],
      "doi": "10.48550/arXiv.2409.02118",
      "arxiv": "2409.02118",
      "pdf": "https://dev.alphabell.com/publications/reward-hacking-trace-anomaly.pdf",
      "code": "https://github.com/alphabell-labs/ab-anomaly",
      "data": null,
      "bibtex_key": "lotfi2024reward",
      "abstract": "We train an unsupervised anomaly detector over execution traces emitted by substrate-hosted agents and show that the detector flags 78% of held-out reward-hacking attempts at a 4% false-positive rate. Unlike eval-time tripwires that compare scalar rewards against expectations, the trace-level detector recognises structural deviations in the tool-call sequence, the resource-consumption profile, and the trace-tree shape. We propose this as a standard component of paired-interpretability monitoring."
    }
  ]
}