Surrogate Gradient Function Comparison¶

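Train the same LIF-based architecture on SHD with six different surrogate gradients and compare validation accuracy. All variants share the same architecture, initialization seed, optimizer, and data pipeline; the learning rate is tuned per surrogate (see the training cell below), so the surrogate and its learning rate are the only changing factors.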

Required extras:

pip install "spyx[loaders]"

Expected result: this page is rendered with a reduced 15-epoch budget to keep the six back-to-back trainings tractable; at 15 epochs the smooth surrogates (arctan, superspike, tanh, boxcar, triangular) cluster around 26-32% test accuracy and are still climbing, while the hard straight-through estimator (STE) stalls near chance (~5%, on 20 classes). A full ~50-epoch run pushes the smooth surrogates well past ~70%; the persistent takeaway is that a smooth surrogate is what makes deep backpropagation through time (BPTT) trainable at all, whereas the non-smooth STE struggles. Exact ranking among the smooth surrogates is dataset- and seed-dependent.

In [1]:

Copied!





import os

os.environ.setdefault("XLA_PYTHON_CLIENT_MEM_FRACTION", ".80")

import jax
import jax.numpy as jnp
import optax
from flax import nnx
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from tqdm import trange
import matplotlib.pyplot as plt

import spyx
import spyx.nn as snn
import os

os.environ.setdefault("XLA_PYTHON_CLIENT_MEM_FRACTION", ".80")

import jax
import jax.numpy as jnp
import optax
from flax import nnx
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from tqdm import trange
import matplotlib.pyplot as plt

import spyx
import spyx.nn as snn

/home/kade/Code/spyx/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Data loading¶

In [2]:

Copied!





# Loader configuration. SAMPLE_T is also the number of time steps kept after
# bit-unpacking in `_unpack`, and CHANNELS is the input width of the first
# Linear layer built by `make_snn`.
BATCH = 256
SAMPLE_T = 128
CHANNELS = 256

shd_dl = spyx.data.SHD_loader(batch_size=BATCH, sample_T=SAMPLE_T, channels=CHANNELS)
# NOTE(review): presumably a random shift of up to 16 positions along axis 2
# (the channel axis of a (batch, time, channels) input) — confirm against spyx.
augment = spyx.data.shift_augment(max_shift=16, axes=(2,))
# Used as the output width of the network in `make_snn`.
N_CLASSES = int(shd_dl.act_shape[0])

# Prestage the whole dataset onto the accelerator once, then iterate over the
# prestaged batches each epoch. This is dramatically faster than re-decoding
# SHD through the streaming grain pipeline every epoch, and the whole dataset
# fits comfortably in memory. `prestage` returns per-batch arrays shaped
# (num_batches, batch, packed_time, channels); the time axis is bit-packed,
# which is why `_unpack` calls `jnp.unpackbits` on it.
train_obs, train_labels = shd_dl.prestage("train")
test_obs, test_labels = shd_dl.prestage("test")
print("prestaged train:", train_obs.shape, " test:", test_obs.shape)
# Loader configuration. SAMPLE_T is also the number of time steps kept after
# bit-unpacking in `_unpack`, and CHANNELS is the input width of the first
# Linear layer built by `make_snn`.
BATCH = 256
SAMPLE_T = 128
CHANNELS = 256

shd_dl = spyx.data.SHD_loader(batch_size=BATCH, sample_T=SAMPLE_T, channels=CHANNELS)
# NOTE(review): presumably a random shift of up to 16 positions along axis 2
# (the channel axis of a (batch, time, channels) input) — confirm against spyx.
augment = spyx.data.shift_augment(max_shift=16, axes=(2,))
# Used as the output width of the network in `make_snn`.
N_CLASSES = int(shd_dl.act_shape[0])

# Prestage the whole dataset onto the accelerator once, then iterate over the
# prestaged batches each epoch. This is dramatically faster than re-decoding
# SHD through the streaming grain pipeline every epoch, and the whole dataset
# fits comfortably in memory. `prestage` returns per-batch arrays shaped
# (num_batches, batch, packed_time, channels); the time axis is bit-packed,
# which is why `_unpack` calls `jnp.unpackbits` on it.
train_obs, train_labels = shd_dl.prestage("train")
test_obs, test_labels = shd_dl.prestage("test")
print("prestaged train:", train_obs.shape, " test:", test_obs.shape)

prestaged train: (31, 256, 16, 256)  test: (8, 256, 16, 256)

SNN factory parameterised by surrogate¶

All variants use the same Linear -> LIF -> Linear -> LIF -> Linear -> LI architecture; only the surrogate-gradient activation changes.

In [3]:

Copied!





# Width of both hidden spiking layers.
HIDDEN = 128


def make_snn(activation, *, seed=0):
    """Build the Linear -> LIF -> Linear -> LIF -> Linear -> LI network.

    Args:
        activation: Surrogate-gradient activation handed to both LIF layers.
        seed: Seed for the ``nnx.Rngs`` stream shared by every layer.

    Returns:
        An ``snn.Sequential`` mapping CHANNELS inputs to N_CLASSES outputs.
    """
    rngs = nnx.Rngs(seed)

    def dense(n_in, n_out):
        # Bias-free projection drawing its initial weights from the shared stream.
        return nnx.Linear(n_in, n_out, use_bias=False, rngs=rngs)

    def spiking():
        # Hidden LIF layer using the surrogate under test.
        return snn.LIF((HIDDEN,), activation=activation, rngs=rngs)

    # Layers are created in network order so the rng stream is consumed in
    # the same sequence on every call.
    layers = [
        dense(CHANNELS, HIDDEN),
        spiking(),
        dense(HIDDEN, HIDDEN),
        spiking(),
        dense(HIDDEN, N_CLASSES),
        snn.LI((N_CLASSES,), rngs=rngs),
    ]
    return snn.Sequential(*layers)
# Width of both hidden spiking layers.
HIDDEN = 128


def make_snn(activation, *, seed=0):
    """Build the Linear -> LIF -> Linear -> LIF -> Linear -> LI network.

    Args:
        activation: Surrogate-gradient activation handed to both LIF layers.
        seed: Seed for the ``nnx.Rngs`` stream shared by every layer.

    Returns:
        An ``snn.Sequential`` mapping CHANNELS inputs to N_CLASSES outputs.
    """
    rngs = nnx.Rngs(seed)

    def dense(n_in, n_out):
        # Bias-free projection drawing its initial weights from the shared stream.
        return nnx.Linear(n_in, n_out, use_bias=False, rngs=rngs)

    def spiking():
        # Hidden LIF layer using the surrogate under test.
        return snn.LIF((HIDDEN,), activation=activation, rngs=rngs)

    # Layers are created in network order so the rng stream is consumed in
    # the same sequence on every call.
    layers = [
        dense(CHANNELS, HIDDEN),
        spiking(),
        dense(HIDDEN, HIDDEN),
        spiking(),
        dense(HIDDEN, N_CLASSES),
        snn.LI((N_CLASSES,), rngs=rngs),
    ]
    return snn.Sequential(*layers)

Reusable training loop¶

In [4]:

Copied!





# Loss and accuracy callables, built once and shared by every training and
# evaluation step below. Both are called as f(traces, targets); `Acc` returns
# an (accuracy, predictions) pair.
# NOTE(review): the names suggest both integrate the output traces over time
# before scoring — confirm against spyx.fn.
Loss = spyx.fn.integral_crossentropy()
Acc = spyx.fn.integral_accuracy()


def _unpack(batch_obs):
    """Expand one bit-packed batch into a float32 spike tensor.

    Axis 1 of ``batch_obs`` is bit-unpacked, the result is cut to the first
    SAMPLE_T entries along that axis, and the values are cast to float32.
    """
    bits = jnp.unpackbits(jnp.asarray(batch_obs), axis=1)
    # Unpacking yields a multiple of 8 steps; keep only the configured window.
    return bits[:, :SAMPLE_T, :].astype(jnp.float32)


def _forward(model, x_BTC):
    """Run ``model`` on a batch-major input and return batch-major traces.

    The first two axes of ``x_BTC`` are swapped before calling ``snn.run`` and
    swapped back on its first output; the second output of ``snn.run`` is
    discarded.
    """
    swap_batch_time = (1, 0, 2)
    x_TBC = jnp.transpose(x_BTC, swap_batch_time)
    traces_TBC, _unused = snn.run(model, x_TBC)
    return jnp.transpose(traces_TBC, swap_batch_time)


def run_experiment(activation, name, *, epochs=30, lr=1e-4, seed=0):
    """Train one SNN variant and record per-epoch metrics.

    A fresh model is built with ``make_snn(activation, seed=seed)`` and
    optimised with ``optax.lion`` chained after ``optax.centralize``. Each
    epoch visits the prestaged training batches in a newly shuffled order,
    applying ``augment`` with a fresh PRNG key per batch, and then evaluates
    the model on every prestaged test batch.

    Args:
        activation: Surrogate-gradient activation forwarded to ``make_snn``.
        name: Label shown on the tqdm progress bar.
        epochs: Number of training epochs.
        lr: Learning rate passed to ``optax.lion``.
        seed: Seed for model construction and for the shuffle/augmentation
            key stream.

    Returns:
        ``(history, model)``. ``history`` has shape ``(epochs, 3)`` with
        columns (mean train loss, mean test accuracy, mean test loss), each an
        unweighted mean over batches; it is ``(0, 3)`` when ``epochs`` is 0.
        ``model`` is the trained network.
    """
    model = make_snn(activation, seed=seed)
    optimizer = nnx.Optimizer(model, optax.chain(optax.centralize(), optax.lion(lr)), wrt=nnx.Param)
    rng = jax.random.PRNGKey(seed)
    n_train = train_obs.shape[0]
    n_test = test_obs.shape[0]

    @nnx.jit
    def train_step(model, optimizer, events, targets):
        """Apply one optimizer update on a batch and return its loss."""
        def loss_fn(m):
            traces = _forward(m, events)
            return Loss(traces, targets)
        loss, grads = nnx.value_and_grad(loss_fn)(model)
        optimizer.update(model, grads)
        return loss

    @nnx.jit
    def eval_step(model, events, targets):
        """Return (accuracy, loss) for one batch without updating the model."""
        traces = _forward(model, events)
        acc, _preds = Acc(traces, targets)
        loss = Loss(traces, targets)
        return acc, loss

    history = []
    for _ in trange(epochs, desc=name):
        # New batch order every epoch.
        rng, perm_key = jax.random.split(rng)
        order = jax.random.permutation(perm_key, n_train)
        train_losses = []
        for bi in order:
            # Independent augmentation key per batch.
            rng, k = jax.random.split(rng)
            events = augment(_unpack(train_obs[bi]), k)
            targets = train_labels[bi]
            train_losses.append(train_step(model, optimizer, events, targets))

        # Evaluation uses the un-augmented test batches in their stored order.
        accs, losses = [], []
        for bi in range(n_test):
            events = _unpack(test_obs[bi])
            targets = test_labels[bi]
            a, l = eval_step(model, events, targets)
            accs.append(a)
            losses.append(l)
        history.append((
            float(jnp.mean(jnp.stack(train_losses))),
            float(jnp.mean(jnp.stack(accs))),
            float(jnp.mean(jnp.stack(losses))),
        ))
    # Force a 2-D (epochs, 3) result so column indexing such as `hist[:, 1]`
    # still works when no epochs were run; a no-op for non-empty histories.
    return jnp.array(history).reshape(-1, 3), model
# Loss and accuracy callables, built once and shared by every training and
# evaluation step below. Both are called as f(traces, targets); `Acc` returns
# an (accuracy, predictions) pair.
# NOTE(review): the names suggest both integrate the output traces over time
# before scoring — confirm against spyx.fn.
Loss = spyx.fn.integral_crossentropy()
Acc = spyx.fn.integral_accuracy()


def _unpack(batch_obs):
    """Expand one bit-packed batch into a float32 spike tensor.

    Axis 1 of ``batch_obs`` is bit-unpacked, the result is cut to the first
    SAMPLE_T entries along that axis, and the values are cast to float32.
    """
    bits = jnp.unpackbits(jnp.asarray(batch_obs), axis=1)
    # Unpacking yields a multiple of 8 steps; keep only the configured window.
    return bits[:, :SAMPLE_T, :].astype(jnp.float32)


def _forward(model, x_BTC):
    """Run ``model`` on a batch-major input and return batch-major traces.

    The first two axes of ``x_BTC`` are swapped before calling ``snn.run`` and
    swapped back on its first output; the second output of ``snn.run`` is
    discarded.
    """
    swap_batch_time = (1, 0, 2)
    x_TBC = jnp.transpose(x_BTC, swap_batch_time)
    traces_TBC, _unused = snn.run(model, x_TBC)
    return jnp.transpose(traces_TBC, swap_batch_time)


def run_experiment(activation, name, *, epochs=30, lr=1e-4, seed=0):
    """Train one SNN variant and record per-epoch metrics.

    A fresh model is built with ``make_snn(activation, seed=seed)`` and
    optimised with ``optax.lion`` chained after ``optax.centralize``. Each
    epoch visits the prestaged training batches in a newly shuffled order,
    applying ``augment`` with a fresh PRNG key per batch, and then evaluates
    the model on every prestaged test batch.

    Args:
        activation: Surrogate-gradient activation forwarded to ``make_snn``.
        name: Label shown on the tqdm progress bar.
        epochs: Number of training epochs.
        lr: Learning rate passed to ``optax.lion``.
        seed: Seed for model construction and for the shuffle/augmentation
            key stream.

    Returns:
        ``(history, model)``. ``history`` has shape ``(epochs, 3)`` with
        columns (mean train loss, mean test accuracy, mean test loss), each an
        unweighted mean over batches; it is ``(0, 3)`` when ``epochs`` is 0.
        ``model`` is the trained network.
    """
    model = make_snn(activation, seed=seed)
    optimizer = nnx.Optimizer(model, optax.chain(optax.centralize(), optax.lion(lr)), wrt=nnx.Param)
    rng = jax.random.PRNGKey(seed)
    n_train = train_obs.shape[0]
    n_test = test_obs.shape[0]

    @nnx.jit
    def train_step(model, optimizer, events, targets):
        """Apply one optimizer update on a batch and return its loss."""
        def loss_fn(m):
            traces = _forward(m, events)
            return Loss(traces, targets)
        loss, grads = nnx.value_and_grad(loss_fn)(model)
        optimizer.update(model, grads)
        return loss

    @nnx.jit
    def eval_step(model, events, targets):
        """Return (accuracy, loss) for one batch without updating the model."""
        traces = _forward(model, events)
        acc, _preds = Acc(traces, targets)
        loss = Loss(traces, targets)
        return acc, loss

    history = []
    for _ in trange(epochs, desc=name):
        # New batch order every epoch.
        rng, perm_key = jax.random.split(rng)
        order = jax.random.permutation(perm_key, n_train)
        train_losses = []
        for bi in order:
            # Independent augmentation key per batch.
            rng, k = jax.random.split(rng)
            events = augment(_unpack(train_obs[bi]), k)
            targets = train_labels[bi]
            train_losses.append(train_step(model, optimizer, events, targets))

        # Evaluation uses the un-augmented test batches in their stored order.
        accs, losses = [], []
        for bi in range(n_test):
            events = _unpack(test_obs[bi])
            targets = test_labels[bi]
            a, l = eval_step(model, events, targets)
            accs.append(a)
            losses.append(l)
        history.append((
            float(jnp.mean(jnp.stack(train_losses))),
            float(jnp.mean(jnp.stack(accs))),
            float(jnp.mean(jnp.stack(losses))),
        ))
    # Force a 2-D (epochs, 3) result so column indexing such as `hist[:, 1]`
    # still works when no epochs were run; a no-op for non-empty histories.
    return jnp.array(history).reshape(-1, 3), model

Train all six variants¶

Per-surrogate learning-rate hints follow the original notebook's tuned values; the original used 500 epochs. EPOCHS is reduced to 15 for this rendered page: raise it for a full run, or lower it further while iterating.

In [5]:

Copied!





EPOCHS = 15  # rendered with a reduced budget; see the note above.

# One independent run per surrogate. Every call leaves `seed` at its default,
# so the runs differ only in the activation and its learning rate. Each call
# returns (history, trained model); both are used by the cells that follow.
arctan_hist, arctan_model = run_experiment(spyx.axn.arctan(), "arctan", epochs=EPOCHS, lr=1e-4)
superspike_hist, superspike_model = run_experiment(spyx.axn.superspike(), "superspike", epochs=EPOCHS, lr=1e-4)
tanh_hist, tanh_model = run_experiment(spyx.axn.tanh(), "tanh", epochs=EPOCHS, lr=1.5e-4)
boxcar_hist, boxcar_model = run_experiment(spyx.axn.boxcar(), "boxcar", epochs=EPOCHS, lr=2e-4)
triangular_hist, triangular_model = run_experiment(spyx.axn.triangular(), "triangular", epochs=EPOCHS, lr=1e-4)
# NOTE(review): `spyx.axn.custom()` is called with its defaults and labelled
# "STE" — confirm that its default backward pass is the straight-through estimator.
ste_hist, ste_model = run_experiment(spyx.axn.custom(), "STE", epochs=EPOCHS, lr=7e-5)
EPOCHS = 15  # rendered with a reduced budget; see the note above.

# One independent run per surrogate. Every call leaves `seed` at its default,
# so the runs differ only in the activation and its learning rate. Each call
# returns (history, trained model); both are used by the cells that follow.
arctan_hist, arctan_model = run_experiment(spyx.axn.arctan(), "arctan", epochs=EPOCHS, lr=1e-4)
superspike_hist, superspike_model = run_experiment(spyx.axn.superspike(), "superspike", epochs=EPOCHS, lr=1e-4)
tanh_hist, tanh_model = run_experiment(spyx.axn.tanh(), "tanh", epochs=EPOCHS, lr=1.5e-4)
boxcar_hist, boxcar_model = run_experiment(spyx.axn.boxcar(), "boxcar", epochs=EPOCHS, lr=2e-4)
triangular_hist, triangular_model = run_experiment(spyx.axn.triangular(), "triangular", epochs=EPOCHS, lr=1e-4)
# NOTE(review): `spyx.axn.custom()` is called with its defaults and labelled
# "STE" — confirm that its default backward pass is the straight-through estimator.
ste_hist, ste_model = run_experiment(spyx.axn.custom(), "STE", epochs=EPOCHS, lr=7e-5)

arctan:   0%|          | 0/15 [00:00<?, ?it/s]

arctan:   7%|▋         | 1/15 [00:03<00:53,  3.80s/it]

arctan:  13%|█▎        | 2/15 [00:06<00:41,  3.18s/it]

arctan:  20%|██        | 3/15 [00:09<00:36,  3.04s/it]

arctan:  27%|██▋       | 4/15 [00:12<00:32,  3.00s/it]

arctan:  33%|███▎      | 5/15 [00:15<00:29,  2.98s/it]

arctan:  40%|████      | 6/15 [00:18<00:26,  2.96s/it]

arctan:  47%|████▋     | 7/15 [00:21<00:23,  2.90s/it]

arctan:  53%|█████▎    | 8/15 [00:23<00:19,  2.81s/it]

arctan:  60%|██████    | 9/15 [00:26<00:17,  2.86s/it]

arctan:  67%|██████▋   | 10/15 [00:29<00:14,  2.92s/it]

arctan:  73%|███████▎  | 11/15 [00:32<00:11,  2.97s/it]

arctan:  80%|████████  | 12/15 [00:35<00:08,  2.97s/it]

arctan:  87%|████████▋ | 13/15 [00:38<00:05,  2.85s/it]

arctan:  93%|█████████▎| 14/15 [00:41<00:02,  2.84s/it]

arctan: 100%|██████████| 15/15 [00:43<00:00,  2.83s/it]

arctan: 100%|██████████| 15/15 [00:43<00:00,  2.93s/it]

superspike:   0%|          | 0/15 [00:00<?, ?it/s]

superspike:   7%|▋         | 1/15 [00:03<00:47,  3.36s/it]

superspike:  13%|█▎        | 2/15 [00:06<00:39,  3.01s/it]

superspike:  20%|██        | 3/15 [00:09<00:36,  3.04s/it]

superspike:  27%|██▋       | 4/15 [00:11<00:31,  2.88s/it]

superspike:  33%|███▎      | 5/15 [00:14<00:29,  2.90s/it]

superspike:  40%|████      | 6/15 [00:17<00:26,  2.91s/it]

superspike:  47%|████▋     | 7/15 [00:20<00:23,  2.95s/it]

superspike:  53%|█████▎    | 8/15 [00:23<00:20,  2.93s/it]

superspike:  60%|██████    | 9/15 [00:26<00:17,  2.92s/it]

superspike:  67%|██████▋   | 10/15 [00:29<00:14,  2.95s/it]

superspike:  73%|███████▎  | 11/15 [00:32<00:11,  2.99s/it]

superspike:  80%|████████  | 12/15 [00:35<00:09,  3.00s/it]

superspike:  87%|████████▋ | 13/15 [00:38<00:05,  2.96s/it]

superspike:  93%|█████████▎| 14/15 [00:41<00:02,  2.84s/it]

superspike: 100%|██████████| 15/15 [00:43<00:00,  2.85s/it]

superspike: 100%|██████████| 15/15 [00:43<00:00,  2.93s/it]

tanh:   0%|          | 0/15 [00:00<?, ?it/s]

tanh:   7%|▋         | 1/15 [00:03<00:48,  3.45s/it]

tanh:  13%|█▎        | 2/15 [00:06<00:40,  3.12s/it]

tanh:  20%|██        | 3/15 [00:09<00:35,  2.96s/it]

tanh:  27%|██▋       | 4/15 [00:11<00:31,  2.90s/it]

tanh:  33%|███▎      | 5/15 [00:14<00:27,  2.78s/it]

tanh:  40%|████      | 6/15 [00:17<00:25,  2.80s/it]

tanh:  47%|████▋     | 7/15 [00:20<00:22,  2.81s/it]

tanh:  53%|█████▎    | 8/15 [00:23<00:19,  2.83s/it]

tanh:  60%|██████    | 9/15 [00:25<00:17,  2.87s/it]

tanh:  67%|██████▋   | 10/15 [00:28<00:14,  2.89s/it]

tanh:  73%|███████▎  | 11/15 [00:31<00:11,  2.91s/it]

tanh:  80%|████████  | 12/15 [00:34<00:08,  2.94s/it]

tanh:  87%|████████▋ | 13/15 [00:37<00:05,  2.97s/it]

tanh:  93%|█████████▎| 14/15 [00:40<00:02,  2.94s/it]

tanh: 100%|██████████| 15/15 [00:43<00:00,  2.93s/it]

tanh: 100%|██████████| 15/15 [00:43<00:00,  2.91s/it]

boxcar:   0%|          | 0/15 [00:00<?, ?it/s]

boxcar:   7%|▋         | 1/15 [00:03<00:49,  3.50s/it]

boxcar:  13%|█▎        | 2/15 [00:06<00:39,  3.02s/it]

boxcar:  20%|██        | 3/15 [00:09<00:35,  2.98s/it]

boxcar:  27%|██▋       | 4/15 [00:11<00:32,  2.93s/it]

boxcar:  33%|███▎      | 5/15 [00:14<00:28,  2.90s/it]

boxcar:  40%|████      | 6/15 [00:17<00:26,  2.89s/it]

boxcar:  47%|████▋     | 7/15 [00:20<00:23,  2.92s/it]

boxcar:  53%|█████▎    | 8/15 [00:23<00:20,  2.89s/it]

boxcar:  60%|██████    | 9/15 [00:26<00:16,  2.82s/it]

boxcar:  67%|██████▋   | 10/15 [00:28<00:13,  2.72s/it]

boxcar:  73%|███████▎  | 11/15 [00:31<00:10,  2.74s/it]

boxcar:  80%|████████  | 12/15 [00:34<00:08,  2.79s/it]

boxcar:  87%|████████▋ | 13/15 [00:37<00:05,  2.84s/it]

boxcar:  93%|█████████▎| 14/15 [00:40<00:02,  2.88s/it]

boxcar: 100%|██████████| 15/15 [00:42<00:00,  2.79s/it]

boxcar: 100%|██████████| 15/15 [00:42<00:00,  2.86s/it]

triangular:   0%|          | 0/15 [00:00<?, ?it/s]

triangular:   7%|▋         | 1/15 [00:03<00:47,  3.42s/it]

triangular:  13%|█▎        | 2/15 [00:06<00:40,  3.11s/it]

triangular:  20%|██        | 3/15 [00:09<00:35,  2.96s/it]

triangular:  27%|██▋       | 4/15 [00:11<00:32,  2.93s/it]

triangular:  33%|███▎      | 5/15 [00:14<00:28,  2.89s/it]

triangular:  40%|████      | 6/15 [00:17<00:24,  2.77s/it]

triangular:  47%|████▋     | 7/15 [00:20<00:21,  2.75s/it]

triangular:  53%|█████▎    | 8/15 [00:22<00:19,  2.80s/it]

triangular:  60%|██████    | 9/15 [00:25<00:16,  2.76s/it]

triangular:  67%|██████▋   | 10/15 [00:28<00:13,  2.80s/it]

triangular:  73%|███████▎  | 11/15 [00:31<00:11,  2.83s/it]

triangular:  80%|████████  | 12/15 [00:34<00:08,  2.84s/it]

triangular:  87%|████████▋ | 13/15 [00:37<00:05,  2.85s/it]

triangular:  93%|█████████▎| 14/15 [00:39<00:02,  2.84s/it]

triangular: 100%|██████████| 15/15 [00:42<00:00,  2.77s/it]

triangular: 100%|██████████| 15/15 [00:42<00:00,  2.84s/it]

STE:   0%|          | 0/15 [00:00<?, ?it/s]

STE:   7%|▋         | 1/15 [00:03<00:49,  3.53s/it]

STE:  13%|█▎        | 2/15 [00:06<00:41,  3.17s/it]

STE:  20%|██        | 3/15 [00:09<00:36,  3.02s/it]

STE:  27%|██▋       | 4/15 [00:12<00:32,  2.98s/it]

STE:  33%|███▎      | 5/15 [00:15<00:29,  2.96s/it]

STE:  40%|████      | 6/15 [00:18<00:26,  2.95s/it]

STE:  47%|████▋     | 7/15 [00:21<00:23,  2.96s/it]

STE:  53%|█████▎    | 8/15 [00:23<00:20,  2.95s/it]

STE:  60%|██████    | 9/15 [00:26<00:17,  2.95s/it]

STE:  67%|██████▋   | 10/15 [00:29<00:14,  2.92s/it]

STE:  73%|███████▎  | 11/15 [00:32<00:11,  2.89s/it]

STE:  80%|████████  | 12/15 [00:35<00:08,  2.91s/it]

STE:  87%|████████▋ | 13/15 [00:38<00:05,  2.95s/it]

STE:  93%|█████████▎| 14/15 [00:41<00:02,  2.93s/it]

STE: 100%|██████████| 15/15 [00:44<00:00,  2.92s/it]

STE: 100%|██████████| 15/15 [00:44<00:00,  2.96s/it]

In [6]:

Copied!





# Per-epoch metric histories keyed by legend label. Column 1 of each history
# is the mean accuracy that `run_experiment` records after every epoch.
histories = {
    "Arctan": arctan_hist,
    "SuperSpike": superspike_hist,
    "Tanh": tanh_hist,
    "Boxcar": boxcar_hist,
    "Triangular": triangular_hist,
    "STE": ste_hist,
}
for name, hist in histories.items():
    plt.plot(hist[:, 1], label=f"{name} val acc")
plt.title("Surrogate gradient validation accuracy comparison")
plt.xlabel("epoch")
# Label the y-axis so the plotted quantity is readable without the title.
plt.ylabel("accuracy")
plt.legend()
plt.show()
# Per-epoch metric histories keyed by legend label. Column 1 of each history
# is the mean accuracy that `run_experiment` records after every epoch.
histories = {
    "Arctan": arctan_hist,
    "SuperSpike": superspike_hist,
    "Tanh": tanh_hist,
    "Boxcar": boxcar_hist,
    "Triangular": triangular_hist,
    "STE": ste_hist,
}
for name, hist in histories.items():
    plt.plot(hist[:, 1], label=f"{name} val acc")
plt.title("Surrogate gradient validation accuracy comparison")
plt.xlabel("epoch")
# Label the y-axis so the plotted quantity is readable without the title.
plt.ylabel("accuracy")
plt.legend()
plt.show()

No description has been provided for this image

Final test evaluation¶

Walk the test set for every variant and print summary metrics, then plot a confusion matrix for one of them.

In [7]:

Copied!





@nnx.jit
def test_step(model, events, targets):
    """Score one batch and return ``(accuracy, loss, predictions)``.

    The model is run once through ``_forward``; the resulting traces feed both
    the shared ``Loss`` and ``Acc`` callables.
    """
    output_traces = _forward(model, events)
    batch_loss = Loss(output_traces, targets)
    batch_acc, batch_preds = Acc(output_traces, targets)
    return batch_acc, batch_loss, batch_preds


def test(model):
    """Evaluate ``model`` on every prestaged test batch.

    Returns:
        ``(accuracy, loss, predictions, targets)``: accuracy and loss are
        Python floats holding the unweighted mean over batches; predictions
        and targets are the per-batch arrays concatenated in batch order.
    """
    batch_ids = range(test_obs.shape[0])
    label_batches = [test_labels[i] for i in batch_ids]
    # One (acc, loss, preds) triple per batch, in stored batch order.
    results = [
        test_step(model, _unpack(test_obs[i]), label_batches[i])
        for i in batch_ids
    ]
    mean_acc = jnp.mean(jnp.stack([r[0] for r in results]))
    mean_loss = jnp.mean(jnp.stack([r[1] for r in results]))
    predictions = jnp.concatenate([r[2] for r in results])
    labels = jnp.concatenate(label_batches)
    return float(mean_acc), float(mean_loss), predictions, labels


# Trained models keyed by display name.
models = {
    "Arctan": arctan_model,
    "SuperSpike": superspike_model,
    "Tanh": tanh_model,
    "Boxcar": boxcar_model,
    "Triangular": triangular_model,
    "STE": ste_model,
}
# Print the mean test accuracy and loss of each variant; the predictions and
# targets returned by `test` are not needed for the summary.
for name, model in models.items():
    acc, loss, _, _ = test(model)
    print(f"{name:10s} acc={acc:.4f}  loss={loss:.4f}")

# Confusion matrix for the boxcar variant.
# This runs a second test pass for the boxcar model to get its predictions.
_, _, preds, tgts = test(boxcar_model)
# scikit-learn's argument order is (y_true, y_pred).
cm = confusion_matrix(tgts, preds)
ConfusionMatrixDisplay(cm).plot()
plt.title("Boxcar LIF test confusion matrix")
plt.show()
@nnx.jit
def test_step(model, events, targets):
    """Score one batch and return ``(accuracy, loss, predictions)``.

    The model is run once through ``_forward``; the resulting traces feed both
    the shared ``Loss`` and ``Acc`` callables.
    """
    output_traces = _forward(model, events)
    batch_loss = Loss(output_traces, targets)
    batch_acc, batch_preds = Acc(output_traces, targets)
    return batch_acc, batch_loss, batch_preds


def test(model):
    """Evaluate ``model`` on every prestaged test batch.

    Returns:
        ``(accuracy, loss, predictions, targets)``: accuracy and loss are
        Python floats holding the unweighted mean over batches; predictions
        and targets are the per-batch arrays concatenated in batch order.
    """
    batch_ids = range(test_obs.shape[0])
    label_batches = [test_labels[i] for i in batch_ids]
    # One (acc, loss, preds) triple per batch, in stored batch order.
    results = [
        test_step(model, _unpack(test_obs[i]), label_batches[i])
        for i in batch_ids
    ]
    mean_acc = jnp.mean(jnp.stack([r[0] for r in results]))
    mean_loss = jnp.mean(jnp.stack([r[1] for r in results]))
    predictions = jnp.concatenate([r[2] for r in results])
    labels = jnp.concatenate(label_batches)
    return float(mean_acc), float(mean_loss), predictions, labels


# Trained models keyed by display name.
models = {
    "Arctan": arctan_model,
    "SuperSpike": superspike_model,
    "Tanh": tanh_model,
    "Boxcar": boxcar_model,
    "Triangular": triangular_model,
    "STE": ste_model,
}
# Print the mean test accuracy and loss of each variant; the predictions and
# targets returned by `test` are not needed for the summary.
for name, model in models.items():
    acc, loss, _, _ = test(model)
    print(f"{name:10s} acc={acc:.4f}  loss={loss:.4f}")

# Confusion matrix for the boxcar variant.
# This runs a second test pass for the boxcar model to get its predictions.
_, _, preds, tgts = test(boxcar_model)
# scikit-learn's argument order is (y_true, y_pred).
cm = confusion_matrix(tgts, preds)
ConfusionMatrixDisplay(cm).plot()
plt.title("Boxcar LIF test confusion matrix")
plt.show()

Arctan     acc=0.2993  loss=2.9443

SuperSpike acc=0.2607  loss=2.7760

Tanh       acc=0.2842  loss=3.2504

Boxcar     acc=0.3169  loss=3.0820

Triangular acc=0.3096  loss=2.8798

STE        acc=0.0474  loss=2.9957