# Source code for ddmtolab.Algorithms.MTSO.MFEA_SSG

"""
Multifactorial Evolutionary Algorithm with Single-Step Generative Model (MFEA-SSG)

This module implements MFEA-SSG for expensive multi-task optimization using a diffusion-based
generative model with knowledge distillation for single-step inference.

References
----------
    [1] R. Wang, X. Feng, H. Yu, Y. Tan, and E. M. K. Lai, "Meta-Learning Inspired Single-Step Generative Model for Expensive Multitask Optimization Problems," IEEE Transactions on Evolutionary Computation, 2025.

Notes
-----
Author: Jiangtao Shen
Email: j.shen5@exeter.ac.uk
Date: 2025.12.01
Version: 1.0
"""
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from ddmtolab.Methods.Algo_Methods.algo_utils import *
from ddmtolab.Algorithms.MTSO.MFEA import mfea_selection


# ============================================================================
# Neural Network Components
# ============================================================================

class SinusoidalTimeEmbedding(nn.Module):
    """
    Sinusoidal positional embedding for diffusion timestep.

    Maps a 1-D tensor of timesteps of length B to a (B, dim) tensor whose first
    ``dim // 2`` columns are sines and next ``dim // 2`` columns are cosines of
    the timestep scaled by geometrically spaced frequencies.

    Parameters
    ----------
    dim : int
        Width of the produced embedding.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, t):
        """
        Embed timesteps.

        Parameters
        ----------
        t : torch.Tensor
            1-D tensor of timesteps (any numeric dtype; cast to float).

        Returns
        -------
        torch.Tensor
            Float tensor of shape (len(t), dim) on the same device as ``t``.
        """
        half = self.dim // 2
        # Guard the denominator: with dim == 2 (half == 1) the unguarded
        # ``half - 1`` is zero, which turns the scale into inf and the whole
        # embedding into NaN.
        scale = float(np.log(10000)) / max(half - 1, 1)
        freqs = torch.exp(torch.arange(half, device=t.device, dtype=torch.float32) * -scale)
        angles = t.float().unsqueeze(1) * freqs.unsqueeze(0)
        emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
        # For odd dim the sin/cos halves only cover dim - 1 columns; zero-pad
        # the last column so the output width always equals ``dim``.
        if self.dim % 2 == 1:
            emb = nn.functional.pad(emb, (0, 1))
        return emb


class ResBlock(nn.Module):
    """
    Two-convolution residual block conditioned on a time embedding.

    The time embedding is projected to ``out_ch`` channels and added to the
    feature map between the two convolutions. The shortcut branch is a 1x1
    convolution when the channel count changes and an identity otherwise.
    """

    def __init__(self, in_ch, out_ch, time_dim):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(out_ch)
        self.bn2 = nn.BatchNorm2d(out_ch)
        self.time_mlp = nn.Linear(time_dim, out_ch)
        if in_ch == out_ch:
            self.residual = nn.Identity()
        else:
            self.residual = nn.Conv2d(in_ch, out_ch, kernel_size=1)
        self.act = nn.SiLU()

    def forward(self, x, t_emb):
        """Return act(bn2(conv2(act(bn1(conv1(x))) + time_bias)) + shortcut(x))."""
        shortcut = self.residual(x)
        hidden = self.act(self.bn1(self.conv1(x)))
        # Broadcast the per-channel time bias over both spatial axes.
        time_bias = self.act(self.time_mlp(t_emb)).unsqueeze(-1).unsqueeze(-1)
        hidden = hidden + time_bias
        hidden = self.bn2(self.conv2(hidden))
        return self.act(hidden + shortcut)


class SelfAttention(nn.Module):
    """
    Single-head self-attention over the spatial positions of a feature map.

    The input is group-normalised, projected to queries/keys/values with 1x1
    convolutions, attended over the flattened spatial positions and added back
    to the input through a 1x1 output projection.
    """

    def __init__(self, channels):
        super().__init__()
        self.norm = nn.GroupNorm(1, channels)
        self.q = nn.Conv2d(channels, channels, kernel_size=1)
        self.k = nn.Conv2d(channels, channels, kernel_size=1)
        self.v = nn.Conv2d(channels, channels, kernel_size=1)
        self.out = nn.Conv2d(channels, channels, kernel_size=1)

    def forward(self, x):
        """Return ``x`` plus the projected attention output (same shape as ``x``)."""
        batch, channels, height, width = x.shape
        normed = self.norm(x)
        # Flatten the spatial axes: each projection becomes (batch, channels, H*W).
        query, key, value = (
            proj(normed).view(batch, channels, -1) for proj in (self.q, self.k, self.v)
        )
        scores = torch.bmm(query.transpose(1, 2), key) / (channels ** 0.5)
        weights = torch.softmax(scores, dim=-1)
        attended = torch.bmm(value, weights.transpose(1, 2))
        return x + self.out(attended.view(batch, channels, height, width))


class TeacherUNet(nn.Module):
    """
    Teacher U-Net that predicts diffusion noise (10 ResBlocks, 4 attention blocks).

    Layout: two encoder stages, a bridge and two decoder stages. Each decoder
    stage upsamples, crops to the matching encoder feature map and concatenates
    it as a skip connection.
    """

    def __init__(self, in_ch=1, base_ch=64, time_dim=128):
        super().__init__()
        narrow, mid, wide = base_ch, base_ch * 2, base_ch * 4

        self.time_embed = nn.Sequential(
            SinusoidalTimeEmbedding(time_dim),
            nn.Linear(time_dim, time_dim),
            nn.SiLU(),
        )
        self.conv_in = nn.Conv2d(in_ch, narrow, kernel_size=3, padding=1)

        # Encoder stage 1 (narrow channels), followed by a stride-2 downsample.
        self.enc1_res1 = ResBlock(narrow, narrow, time_dim)
        self.enc1_res2 = ResBlock(narrow, narrow, time_dim)
        self.enc1_attn = SelfAttention(narrow)
        self.down1 = nn.Conv2d(narrow, narrow, kernel_size=3, stride=2, padding=1)

        # Encoder stage 2 (mid channels), followed by a stride-2 downsample.
        self.enc2_res1 = ResBlock(narrow, mid, time_dim)
        self.enc2_res2 = ResBlock(mid, mid, time_dim)
        self.enc2_attn = SelfAttention(mid)
        self.down2 = nn.Conv2d(mid, mid, kernel_size=3, stride=2, padding=1)

        # Bridge (wide channels): ResBlock -> attention -> ResBlock.
        self.bridge_res1 = ResBlock(mid, wide, time_dim)
        self.bridge_attn = SelfAttention(wide)
        self.bridge_res2 = ResBlock(wide, wide, time_dim)

        # Decoder stage 1: its first ResBlock sees upsampled + skip channels.
        self.up1 = nn.ConvTranspose2d(wide, mid, kernel_size=4, stride=2, padding=1)
        self.dec1_res1 = ResBlock(wide, mid, time_dim)
        self.dec1_res2 = ResBlock(mid, mid, time_dim)
        self.dec1_attn = SelfAttention(mid)

        # Decoder stage 2 (no attention).
        self.up2 = nn.ConvTranspose2d(mid, narrow, kernel_size=4, stride=2, padding=1)
        self.dec2_res1 = ResBlock(mid, narrow, time_dim)
        self.dec2_res2 = ResBlock(narrow, narrow, time_dim)
        self.conv_out = nn.Conv2d(narrow, in_ch, kernel_size=1)

    def forward(self, x, t):
        """Run the U-Net on ``x`` conditioned on timesteps ``t``; output has ``in_ch`` channels."""
        t_emb = self.time_embed(t)
        feat = self.conv_in(x)

        # Encoder stage 1
        feat = self.enc1_res1(feat, t_emb)
        feat = self.enc1_res2(feat, t_emb)
        skip1 = self.enc1_attn(feat)
        feat = self.down1(skip1)

        # Encoder stage 2
        feat = self.enc2_res1(feat, t_emb)
        feat = self.enc2_res2(feat, t_emb)
        skip2 = self.enc2_attn(feat)
        feat = self.down2(skip2)

        # Bridge
        feat = self.bridge_res1(feat, t_emb)
        feat = self.bridge_attn(feat)
        feat = self.bridge_res2(feat, t_emb)

        # Decoder stage 1: crop the upsampled map to the encoder-2 size before concatenating.
        feat = self.up1(feat)
        rows, cols = skip2.shape[2:]
        feat = torch.cat([feat[..., :rows, :cols], skip2], dim=1)
        feat = self.dec1_res1(feat, t_emb)
        feat = self.dec1_res2(feat, t_emb)
        feat = self.dec1_attn(feat)

        # Decoder stage 2: crop to the encoder-1 size before concatenating.
        feat = self.up2(feat)
        rows, cols = skip1.shape[2:]
        feat = torch.cat([feat[..., :rows, :cols], skip1], dim=1)
        feat = self.dec2_res1(feat, t_emb)
        feat = self.dec2_res2(feat, t_emb)
        return self.conv_out(feat)


class StudentUNet(nn.Module):
    """
    Small attention-free U-Net used as the single-step student.

    One encoder ResBlock, one downsample, a bridge ResBlock, one upsample and
    one decoder ResBlock fed with the encoder skip connection.
    """

    def __init__(self, in_ch=1, base_ch=64, time_dim=128):
        super().__init__()
        doubled = base_ch * 2

        self.time_embed = nn.Sequential(
            SinusoidalTimeEmbedding(time_dim),
            nn.Linear(time_dim, time_dim),
            nn.SiLU(),
        )
        self.conv_in = nn.Conv2d(in_ch, base_ch, kernel_size=3, padding=1)
        self.enc_res = ResBlock(base_ch, base_ch, time_dim)
        self.down = nn.Conv2d(base_ch, base_ch, kernel_size=3, stride=2, padding=1)
        self.bridge = ResBlock(base_ch, doubled, time_dim)
        self.up = nn.ConvTranspose2d(doubled, base_ch, kernel_size=4, stride=2, padding=1)
        # The decoder block receives upsampled + skip channels concatenated.
        self.dec_res = ResBlock(doubled, base_ch, time_dim)
        self.conv_out = nn.Conv2d(base_ch, in_ch, kernel_size=1)

    def forward(self, x, t):
        """Run the network on ``x`` conditioned on timesteps ``t``; output has ``in_ch`` channels."""
        t_emb = self.time_embed(t)
        skip = self.enc_res(self.conv_in(x), t_emb)
        feat = self.bridge(self.down(skip), t_emb)
        feat = self.up(feat)
        # Crop the upsampled map back to the skip connection's spatial size.
        rows, cols = skip.shape[2:]
        feat = torch.cat([feat[..., :rows, :cols], skip], dim=1)
        return self.conv_out(self.dec_res(feat, t_emb))


# ============================================================================
# Diffusion Utilities
# ============================================================================

def get_diffusion_schedule(n_steps=100):
    """
    Build a linear beta schedule and the derived alpha quantities.

    Returns
    -------
    tuple of np.ndarray
        ``(betas, alphas, alpha_bars)``, each of length ``n_steps``, where
        ``betas`` runs linearly from 1e-4 to 0.02, ``alphas = 1 - betas`` and
        ``alpha_bars`` is the cumulative product of ``alphas``.
    """
    beta_first, beta_last = 1e-4, 0.02
    betas = np.linspace(beta_first, beta_last, num=n_steps)
    alphas = 1.0 - betas
    return betas, alphas, np.cumprod(alphas)


def diffusion_forward(x0, t, alpha_bars, device):
    """
    Apply the forward diffusion step to clean samples.

    ``alpha_bars[t]`` is looked up per sample and broadcast over the remaining
    three axes of ``x0``.

    Returns
    -------
    tuple of torch.Tensor
        ``(x_t, noise)`` where ``noise`` is standard normal with the shape of
        ``x0`` and ``x_t = sqrt(alpha_bar) * x0 + sqrt(1 - alpha_bar) * noise``.
    """
    retained = torch.tensor(alpha_bars[t], dtype=torch.float32, device=device)
    retained = retained.view(-1, 1, 1, 1)
    noise = torch.randn_like(x0)
    signal_part = torch.sqrt(retained) * x0
    noise_part = torch.sqrt(1 - retained) * noise
    return signal_part + noise_part, noise


def generate_with_student(student, elite_data, grid_h, grid_w, grid_dim,
                          alpha_bars, device, n_samples=1, denoise_t=50):
    """
    Generate samples using single-step student model with elite-guided denoising.

    Randomly picked elite rows are dimension-shuffled, noised to timestep
    ``denoise_t`` via forward diffusion, denoised in one student call, clamped
    to [0, 1] and finally un-shuffled back to the original dimension order.

    Parameters
    ----------
    student : nn.Module
        Noise-prediction model called as ``student(x_t, t)``.
    elite_data : np.ndarray
        2-D array of elite solutions; each row must hold ``grid_h * grid_w``
        values so it can be reshaped to a grid.
    grid_h, grid_w : int
        Height and width of the grid each row is reshaped to.
    grid_dim : int
        Number of leading dimensions kept in the returned samples.
    alpha_bars : np.ndarray
        Cumulative alpha products of the diffusion schedule.
    device : torch.device
        Device used for the student forward pass.
    n_samples : int, optional
        Number of samples to generate (default: 1).
    denoise_t : int, optional
        Timestep used for noising; clamped to the valid range of
        ``alpha_bars`` (default: 50).

    Returns
    -------
    np.ndarray
        Array of shape (n_samples, grid_dim) with values in [0, 1].
    """
    # Clamp the timestep into the schedule so that short schedules
    # (len(alpha_bars) <= denoise_t) no longer raise an IndexError.
    denoise_t = int(min(max(denoise_t, 0), len(alpha_bars) - 1))

    student.eval()
    with torch.no_grad():
        indices = np.random.randint(0, len(elite_data), size=n_samples)
        x0_np = elite_data[indices].copy()
        n_cols = x0_np.shape[1]

        # Random dimension shuffling; keep every permutation for the inverse.
        perms = np.empty((n_samples, n_cols), dtype=np.intp)
        for i in range(n_samples):
            perms[i] = np.random.permutation(n_cols)
            x0_np[i] = x0_np[i][perms[i]]

        x0 = torch.tensor(x0_np, dtype=torch.float32, device=device).view(-1, 1, grid_h, grid_w)

        # Forward diffusion: add noise at timestep denoise_t
        t = torch.full((n_samples,), denoise_t, device=device, dtype=torch.long)
        alpha_bar_t = torch.tensor(alpha_bars[denoise_t], dtype=torch.float32, device=device)
        noise = torch.randn_like(x0)
        x_t = torch.sqrt(alpha_bar_t) * x0 + torch.sqrt(1 - alpha_bar_t) * noise

        # Single-step denoising
        pred_noise = student(x_t, t)
        x_denoised = (x_t - torch.sqrt(1 - alpha_bar_t) * pred_noise) / torch.sqrt(alpha_bar_t)
        x_denoised = torch.clamp(x_denoised, 0.0, 1.0)

    flat = x_denoised.cpu().numpy().reshape(n_samples, -1)

    # Undo the shuffle on the full-width rows first and truncate afterwards;
    # truncating first would index out of range whenever grid_dim is smaller
    # than the grid size.
    inverse = np.argsort(perms, axis=1)
    restored = np.take_along_axis(flat, inverse, axis=1)
    return restored[:, :grid_dim]


# ============================================================================
# Training Functions
# ============================================================================

def train_teacher(teacher, train_data, alpha_bars, n_steps, device, grid_h, grid_w,
                  epochs=5, batch_size=512, lr=5e-4):
    """
    Fit the teacher diffusion model on ``train_data`` (Algorithm 2 in paper).

    Every mini-batch row gets its own random dimension permutation, is reshaped
    to a ``grid_h x grid_w`` single-channel image, noised at a random timestep,
    and the teacher is optimised with an MSE loss against the injected noise.
    The teacher is updated in place; nothing is returned.
    """
    teacher.train()
    optimizer = optim.Adam(teacher.parameters(), lr=lr, betas=(0.9, 0.9999))
    n_rows = len(train_data)
    chunk = min(batch_size, n_rows)

    for _ in range(epochs):
        order = np.random.permutation(n_rows)
        for offset in range(0, n_rows, chunk):
            rows = train_data[order[offset:offset + chunk]].copy()

            # Independent dimension shuffle per row.
            n_cols = rows.shape[1]
            for r in range(len(rows)):
                rows[r] = rows[r][np.random.permutation(n_cols)]

            x0 = torch.tensor(rows, dtype=torch.float32, device=device).view(-1, 1, grid_h, grid_w)

            t = torch.randint(0, n_steps, (x0.shape[0],), device=device)
            # The schedule is a NumPy array, so it is indexed with a host copy of t.
            x_t, noise = diffusion_forward(x0, t.cpu().numpy(), alpha_bars, device)

            optimizer.zero_grad()
            loss = nn.functional.mse_loss(teacher(x_t, t), noise)
            loss.backward()
            optimizer.step()


def distill_student(teacher, student, train_data, alpha_bars, n_steps, device, grid_h, grid_w,
                    epochs=5, batch_size=512, lr=5e-4):
    """
    Distil the teacher's noise predictions into the student (Algorithm 3 in paper).

    The teacher is put in eval mode and queried without gradients; the student
    is trained with an MSE loss to reproduce the teacher's output on noised
    mini-batches. Unlike teacher training, rows are not dimension-shuffled here.
    The student is updated in place; nothing is returned.
    """
    teacher.eval()
    student.train()
    optimizer = optim.Adam(student.parameters(), lr=lr, betas=(0.9, 0.9999))
    n_rows = len(train_data)
    chunk = min(batch_size, n_rows)

    for _ in range(epochs):
        order = np.random.permutation(n_rows)
        for offset in range(0, n_rows, chunk):
            rows = train_data[order[offset:offset + chunk]].copy()

            x0 = torch.tensor(rows, dtype=torch.float32, device=device).view(-1, 1, grid_h, grid_w)

            t = torch.randint(0, n_steps, (x0.shape[0],), device=device)
            # Only the noised input is needed; the injected noise itself is discarded.
            x_t, _ = diffusion_forward(x0, t.cpu().numpy(), alpha_bars, device)

            with torch.no_grad():
                target = teacher(x_t, t)

            optimizer.zero_grad()
            loss = nn.functional.mse_loss(student(x_t, t), target)
            loss.backward()
            optimizer.step()


# ============================================================================
# MFEA-SSG Algorithm
# ============================================================================


# [docs]
class MFEA_SSG:
    """
    Multifactorial Evolutionary Algorithm with Single-Step Generative Model.

    Follows the MFEA architecture with a diffusion-based generative model replacing
    crossover in early generations. Knowledge distillation compresses the teacher
    model into a lightweight student for single-step inference.

    Attributes
    ----------
    algorithm_information : dict
        Dictionary containing algorithm capabilities and requirements
    """

    # Static capability descriptor; all values are strings. It is not read
    # anywhere else in this class — presumably consumed by the module-level
    # get_algorithm_information helper (TODO confirm against algo_utils).
    algorithm_information = {
        'n_tasks': '[2, K]',
        'dims': 'unequal',
        'objs': 'equal',
        'n_objs': '1',
        'cons': 'unequal',
        'n_cons': '[0, C]',
        'expensive': 'True',
        'knowledge_transfer': 'True',
        'n': 'equal',
        'max_nfes': 'equal'
    }

    @classmethod
    def get_algorithm_information(cls, print_info=True):
        """
        Forward to the module-level ``get_algorithm_information`` helper.

        Inside this method the bare name resolves to the module-level function
        (brought in by the star import), not to this classmethod.

        Parameters
        ----------
        print_info : bool, optional
            Passed through unchanged to the helper (default: True).

        Returns
        -------
        object
            Whatever the helper returns for ``(cls, print_info)``.
        """
        return get_algorithm_information(cls, print_info)


    # [docs]
    def __init__(self, problem, n=None, max_nfes=None, rmp=0.3, muc=2, mum=5,
                 max_gen=None, refine_freq=3, n_pairs_per_gen=None,
                 n_diffusion_steps=100, train_epochs=5, distill_epochs=5,
                 batch_size=512, lr=5e-4, base_ch=64,
                 save_data=True, save_path='./Data', name='MFEA-SSG', disable_tqdm=True):
        """
        Initialize MFEA-SSG algorithm.

        Parameters
        ----------
        problem : MTOP
            Multi-task optimization problem instance
        n : int, optional
            Population size per task (default: 100)
        max_nfes : int, optional
            Maximum number of function evaluations per task (default: 10000)
        rmp : float, optional
            Random mating probability (default: 0.3)
        muc : float, optional
            Distribution index for SBX crossover (default: 2)
        mum : float, optional
            Distribution index for polynomial mutation (default: 5)
        max_gen : int, optional
            Maximum generation for generative phase (default: auto)
        refine_freq : int, optional
            Refinement frequency tau for generative model (default: 3)
        n_diffusion_steps : int, optional
            Number of diffusion timesteps N (default: 100)
        train_epochs : int, optional
            Training epochs for teacher model (default: 50)
        distill_epochs : int, optional
            Knowledge distillation epochs (default: 50)
        batch_size : int, optional
            Mini-batch size for training (default: 512)
        lr : float, optional
            Learning rate for Adam optimizer (default: 5e-4)
        base_ch : int, optional
            Base channel count for U-Net models (default: 64)
        save_data : bool, optional
            Whether to save optimization data (default: True)
        save_path : str, optional
            Path to save results (default: './Data')
        name : str, optional
            Name for the experiment (default: 'MFEA-SSG')
        disable_tqdm : bool, optional
            Whether to disable progress bar (default: True)
        """
        self.problem = problem
        self.n = n if n is not None else 100
        self.max_nfes = max_nfes if max_nfes is not None else 10000
        self.rmp = rmp
        self.muc = muc
        self.mum = mum
        self.max_gen = max_gen
        self.refine_freq = refine_freq
        self.n_pairs_per_gen = n_pairs_per_gen  # None = auto (nt pairs per gen)
        self.n_diffusion_steps = n_diffusion_steps
        self.train_epochs = train_epochs
        self.distill_epochs = distill_epochs
        self.batch_size = batch_size
        self.lr = lr
        self.base_ch = base_ch
        self.save_data = save_data
        self.save_path = save_path
        self.name = name
        self.disable_tqdm = disable_tqdm


    def _prepare_model_data(self, pop_decs, pop_objs, grid_dim, top_ratio=0.5):
        """
        Prepare training data for the generative model from population.

        Collects top-performing individuals from all tasks and pads to grid_dim.
        """
        all_data = []
        for i in range(len(pop_decs)):
            task_decs = pop_decs[i]
            task_objs = pop_objs[i]
            n_total = len(task_objs)
            n_elite = max(int(n_total * top_ratio), min(n_total, 2))
            indices = np.argsort(task_objs.flatten())[:n_elite]
            elite = task_decs[indices]

            # Pad to grid_dim if needed (constant 0.5 padding for unused grid cells)
            if elite.shape[1] < grid_dim:
                pad = np.full((elite.shape[0], grid_dim - elite.shape[1]), 0.5)
                elite = np.hstack([elite, pad])
            elif elite.shape[1] > grid_dim:
                elite = elite[:, :grid_dim]

            all_data.append(elite)
        return np.vstack(all_data)


    # [docs]
    def optimize(self):
        """
        Execute the MFEA-SSG algorithm (Algorithm 1 in paper).

        The run has two phases. While ``gen <= max_gen_generative``, parent
        pairs that share a task (or pass the ``rmp`` draw) produce one offspring
        sampled from the student model and then mutated; all other pairs produce
        two offspring by crossover. The teacher/student pair is re-trained every
        ``refine_freq`` generations for as long as the student can still be
        sampled in a later generation.

        Returns
        -------
        Results
            Optimization results containing decision variables, objectives, and runtime
        """
        start_time = time.time()
        problem = self.problem
        nt = problem.n_tasks
        dims = problem.dims
        n = self.n
        max_nfes_per_task = par_list(self.max_nfes, nt)
        max_nfes = self.max_nfes * nt
        max_dim = max(dims)

        # Grid dimensions for 2D reshape: fixed grid_h=5, grid_w=ceil(max_dim/5)
        # e.g., 50D→5×10, 15D→5×3, 13D→5×3 (2 padded cells, dropped again after generation)
        grid_h = 5
        grid_w = max((max_dim + grid_h - 1) // grid_h, 1)
        grid_dim = grid_h * grid_w

        # Device setup
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Diffusion schedule (only the cumulative alpha products are used here)
        _, _, alpha_bars = get_diffusion_schedule(self.n_diffusion_steps)

        # Initialize teacher and student models
        teacher = TeacherUNet(in_ch=1, base_ch=self.base_ch, time_dim=128).to(device)
        student = StudentUNet(in_ch=1, base_ch=self.base_ch, time_dim=128).to(device)

        # ============================================================
        # Line 1: Initialize population P; gen <- 0
        # ============================================================
        decs = initialization(problem, n)
        objs, cons = evaluation(problem, decs)
        nfes = n * nt
        all_decs, all_objs, all_cons = init_history(decs, objs, cons)

        # Transform to unified search space
        pop_decs, pop_cons = space_transfer(problem=problem, decs=decs, cons=cons, type='uni', padding='mid')
        pop_objs = objs
        pop_sfs = [np.full((n, 1), fill_value=i) for i in range(nt)]

        # Train initial generative model G on the initial population
        model_data = self._prepare_model_data(pop_decs, pop_objs, grid_dim, top_ratio=1.0)
        train_teacher(teacher, model_data, alpha_bars, self.n_diffusion_steps, device,
                      grid_h, grid_w, self.train_epochs, self.batch_size, self.lr)
        distill_student(teacher, student, model_data, alpha_bars, self.n_diffusion_steps, device,
                        grid_h, grid_w, self.distill_epochs, self.batch_size, self.lr)

        # Determine pairs per generation (controls offspring count per generation)
        # Default: nt pairs → ~nt offspring per gen → many generations for model refinement
        n_pairs_per_gen = self.n_pairs_per_gen if self.n_pairs_per_gen is not None else nt

        # Estimate MaxGen: half of total generations use the generative model
        evals_per_gen = max(n_pairs_per_gen * 2, 1)  # ~2 evals per pair (1 gen, 2 GA)
        if self.max_gen is not None:
            max_gen_generative = self.max_gen
        else:
            est_total_gen = max((max_nfes - nfes) // evals_per_gen, 1)
            max_gen_generative = max(est_total_gen // 2, 1)

        gen = 0

        pbar = tqdm(total=max_nfes, initial=nfes, desc=f"{self.name}", disable=self.disable_tqdm)

        # ============================================================
        # Line 2: WHILE termination condition not met
        # ============================================================
        while nfes < max_nfes:

            # Merge populations from all tasks into single arrays
            pop_decs, pop_objs, pop_cons, pop_sfs = vstack_groups(pop_decs, pop_objs, pop_cons, pop_sfs)
            uni_dim = pop_decs.shape[1]
            n_cons_uni = pop_cons.shape[1]

            # Precompute elite data ONCE per generation for generative model
            if gen <= max_gen_generative:
                elite_data = self._prepare_model_data(
                    [pop_decs[pop_sfs.flatten() == t] for t in range(nt)],
                    [pop_objs[pop_sfs.flatten() == t] for t in range(nt)],
                    grid_dim, top_ratio=0.5)

            off_decs_list = []
            off_objs_list = []
            off_sfs_list = []

            # Line 4: FOR each pair of parents (p1, p2) selected from P
            shuffled_index = np.random.permutation(pop_decs.shape[0])
            max_pairs = min(n_pairs_per_gen, len(shuffled_index) // 2)

            for pair_idx in range(max_pairs):
                i = pair_idx * 2
                if nfes >= max_nfes:
                    break

                p1 = shuffled_index[i]
                p2 = shuffled_index[i + 1]
                sf1 = int(pop_sfs[p1].item())
                sf2 = int(pop_sfs[p2].item())

                # Line 5: IF gen <= MaxGen AND (Same task OR rand < RMP)
                if gen <= max_gen_generative and (sf1 == sf2 or np.random.rand() < self.rmp):
                    # Lines 6-9: Generate from student model, mutate, create one offspring
                    dec_gen = generate_with_student(
                        student, elite_data, grid_h, grid_w, grid_dim,
                        alpha_bars, device, n_samples=1)
                    dec_gen = dec_gen.flatten()

                    # Truncate/pad to unified space dimension
                    if len(dec_gen) >= uni_dim:
                        dec_uni = dec_gen[:uni_dim]
                    else:
                        dec_uni = np.concatenate([dec_gen, np.random.rand(uni_dim - len(dec_gen))])
                    dec_uni = np.clip(dec_uni, 0.0, 1.0)

                    # Apply mutation (Line 7)
                    dec_mut = mutation(dec_uni, mu=self.mum)

                    # Assign one parent's task factor (Line 9)
                    assigned_sf = np.random.choice([sf1, sf2])

                    # Evaluate on assigned task
                    off_dec_trimmed = dec_mut[:dims[assigned_sf]]
                    off_obj, off_con = evaluation_single(problem, off_dec_trimmed, assigned_sf)
                    nfes += 1
                    pbar.update(1)

                    off_decs_list.append(dec_mut.reshape(1, -1))
                    off_objs_list.append(off_obj)
                    off_sfs_list.append(np.array([[assigned_sf]]))

                else:
                    # Lines 11-14: GA crossover, create two offspring
                    off_dec1, off_dec2 = crossover(pop_decs[p1, :], pop_decs[p2, :], mu=self.muc)

                    # Assign task factors: the second child gets the other parent's task
                    sf_o1 = np.random.choice([sf1, sf2])
                    sf_o2 = sf1 if sf_o1 == sf2 else sf2

                    for off_dec, sf in [(off_dec1, sf_o1), (off_dec2, sf_o2)]:
                        if nfes >= max_nfes:
                            break

                        off_dec_trimmed = off_dec[:dims[sf]]
                        off_obj, off_con = evaluation_single(problem, off_dec_trimmed, sf)
                        nfes += 1
                        pbar.update(1)

                        off_decs_list.append(off_dec.reshape(1, -1))
                        off_objs_list.append(off_obj)
                        off_sfs_list.append(np.array([[sf]]))

            if len(off_decs_list) == 0:
                break

            # Stack offspring (cons use unified space dimension)
            # NOTE(review): the off_con values returned by evaluation_single are
            # discarded and every offspring enters selection with all-zero
            # constraints — confirm this is intended for constrained problems.
            off_decs = np.vstack(off_decs_list)
            off_objs = np.vstack(off_objs_list)
            off_cons = np.zeros((len(off_decs_list), n_cons_uni))
            off_sfs = np.vstack(off_sfs_list)

            # Line 17: Evaluate Offspring, update population P by selecting from P ∪ Offspring
            pop_decs, pop_objs, pop_cons, pop_sfs = vstack_groups(
                (pop_decs, off_decs), (pop_objs, off_objs), (pop_cons, off_cons), (pop_sfs, off_sfs)
            )

            pop_decs, pop_objs, pop_cons, pop_sfs = mfea_selection(
                pop_decs, pop_objs, pop_cons, pop_sfs, n, nt)

            # Transform back to native search space for history
            decs, cons = space_transfer(problem, decs=pop_decs, cons=pop_cons, type='real')
            append_history(all_decs, decs, all_objs, pop_objs, all_cons, cons)

            # Lines 18-20: Progressively refine generative model G (IF mod(gen, tau) == 0).
            # A model refined now is first sampled in generation gen + 1, and the
            # student is only sampled while gen <= max_gen_generative, so training
            # is skipped once the next generation is past the generative phase.
            if gen < max_gen_generative and gen % self.refine_freq == 0:
                model_data = self._prepare_model_data(pop_decs, pop_objs, grid_dim, top_ratio=0.5)
                train_teacher(teacher, model_data, alpha_bars, self.n_diffusion_steps, device,
                              grid_h, grid_w, self.train_epochs, self.batch_size, self.lr)
                distill_student(teacher, student, model_data, alpha_bars, self.n_diffusion_steps, device,
                                grid_h, grid_w, self.distill_epochs, self.batch_size, self.lr)

            # Line 21: gen <- gen + 1
            gen += 1

        pbar.close()
        runtime = time.time() - start_time

        results = build_save_results(all_decs=all_decs, all_objs=all_objs, runtime=runtime, max_nfes=max_nfes_per_task,
                                     all_cons=all_cons, bounds=problem.bounds, save_path=self.save_path,
                                     filename=self.name, save_data=self.save_data)

        return results