# utils.py

# --------------------------------------------------------
# Reversible Column Networks
# Copyright (c) 2022 Megvii Inc.
# Licensed under The Apache License 2.0 [see LICENSE for details]
# Written by Yuxuan Cai
# --------------------------------------------------------

import os
from typing import List
import torch
import torch.distributed as dist
from timm.utils import get_state_dict


def unitwise_norm(x, norm_type=2.0):
    """Return the ``norm_type``-norm of ``x`` computed per leading-dim slice.

    Tensors with at most one dimension are reduced to a single norm value.
    For higher-rank tensors the norm is taken over every dimension except
    dim 0, and the reduced dimensions are kept with size 1 so the result
    broadcasts against ``x``.
    """
    if x.ndim > 1:
        # Works for nn.ConvNd and nn.Linear, where the output dim comes first
        # in the kernel/weight tensor. Other weight layouts (possibly MHA)
        # might need special handling where that is not true.
        trailing_dims = tuple(range(1, x.ndim))
        return x.norm(norm_type, dim=trailing_dims, keepdim=True)
    return x.norm(norm_type)


def adaptive_clip_grad(parameters, clip_factor=0.01, eps=1e-3, norm_type=2.0):
    """Rescale gradients in place so each unit's grad norm is bounded by its weight norm.

    For every parameter that has a gradient, the unit-wise norm of the
    weights (floored at ``eps``) times ``clip_factor`` gives an upper bound.
    Units whose gradient norm is not below that bound are scaled down onto
    it; the others keep their gradient unchanged.

    Args:
        parameters: A single tensor or an iterable of tensors with ``.grad``.
        clip_factor: Ratio of allowed gradient norm to weight norm.
        eps: Lower bound applied to the weight norm before scaling.
        norm_type: Order of the norm passed to ``unitwise_norm``.
    """
    params = [parameters] if isinstance(parameters, torch.Tensor) else parameters
    for param in params:
        if param.grad is None:
            continue
        weights = param.detach()
        grads = param.grad.detach()

        # Upper bound on the gradient norm, one value per unit.
        weight_norm = unitwise_norm(weights, norm_type=norm_type)
        limit = weight_norm.clamp_(min=eps).mul_(clip_factor)

        grads_norm = unitwise_norm(grads, norm_type=norm_type)
        # The clamp keeps the ratio finite when a gradient norm is zero.
        scale = limit / grads_norm.clamp(min=1e-6)
        rescaled = grads * scale

        within_limit = grads_norm < limit
        # detach() shares storage with param.grad, so this writes in place.
        param.grad.detach().copy_(torch.where(within_limit, grads, rescaled))


def load_checkpoint(config, model, optimizer, logger, model_ema=None):
    """Restore model (and optionally EMA) weights from ``config.MODEL.RESUME``.

    The checkpoint is fetched with ``torch.hub`` when the path starts with
    ``"https"`` and with ``torch.load`` otherwise, always mapped to CPU.
    EMA weights, when stored in the checkpoint, take priority over the
    ``"model"`` entry for both ``model`` and ``model_ema.ema``. Weights are
    loaded with ``strict=False``.

    When not in eval mode and the checkpoint holds both ``"optimizer"`` and
    ``"epoch"``, ``config.TRAIN.START_EPOCH`` is set to the epoch after the
    stored one.

    Args:
        config: Config object exposing ``MODEL.RESUME``, ``MODEL_EMA``,
            ``EVAL_MODE``, ``TRAIN.START_EPOCH``, ``defrost()`` and ``freeze()``.
        model: Module receiving the weights.
        optimizer: Currently unused; the optimizer-state restore is commented
            out below.
        logger: Logger used for progress messages.
        model_ema: Optional wrapper whose ``.ema`` module also receives weights.

    Returns:
        The ``"max_accuracy"`` stored in the checkpoint when training state is
        resumed and the key exists, otherwise ``0.0``.
    """
    logger.info(
        f"==============> Resuming form {config.MODEL.RESUME}...................."
    )
    if config.MODEL.RESUME.startswith("https"):
        checkpoint = torch.hub.load_state_dict_from_url(
            config.MODEL.RESUME, map_location="cpu", check_hash=True
        )
    else:
        checkpoint = torch.load(config.MODEL.RESUME, map_location="cpu")
        logger.info("Already loaded checkpoint to memory..")

    # save_checkpoint always writes the "state_dict_ema" key and stores None
    # when no EMA model exists, so the value has to be tested, not the key.
    ema_state = checkpoint.get("state_dict_ema")
    if ema_state is not None:
        msg = model.load_state_dict(ema_state, strict=False)
    else:
        msg = model.load_state_dict(checkpoint["model"], strict=False)
    logger.info(msg)

    max_accuracy = 0.0
    if config.MODEL_EMA:
        if model_ema is None:
            logger.warning(
                "config.MODEL_EMA is set but no model_ema was passed, skipping EMA weights"
            )
        elif ema_state is not None:
            model_ema.ema.load_state_dict(ema_state, strict=False)
            logger.info("Loaded state_dict_ema")
        else:
            model_ema.ema.load_state_dict(checkpoint["model"], strict=False)
            logger.warning(
                "Failed to find state_dict_ema, starting from loaded model weights"
            )

    if not config.EVAL_MODE and "optimizer" in checkpoint and "epoch" in checkpoint:
        # NOTE(review): optimizer state is not restored; the call below was
        # disabled in the original code. Confirm this is intentional.
        # optimizer.load_state_dict(checkpoint["optimizer"])
        config.defrost()
        config.TRAIN.START_EPOCH = checkpoint["epoch"] + 1
        config.freeze()

        logger.info(
            f"=> loaded successfully '{config.MODEL.RESUME}' (epoch {checkpoint['epoch']})"
        )
        if "max_accuracy" in checkpoint:
            max_accuracy = checkpoint["max_accuracy"]
    del checkpoint
    torch.cuda.empty_cache()
    return max_accuracy


def load_checkpoint_finetune(config, model, logger, model_ema=None):
    """Initialise ``model`` (and optionally its EMA copy) from pretrained weights.

    Reads the checkpoint at ``config.MODEL.FINETUNE`` onto the CPU and loads
    its ``"model"`` entry with ``strict=False`` into ``model`` and, when
    ``model_ema`` is given, into ``model_ema.ema``. The result of each load
    is logged.

    Args:
        config: Config object exposing ``MODEL.FINETUNE`` (checkpoint path).
        model: Module receiving the pretrained weights.
        logger: Logger used for progress messages.
        model_ema: Optional wrapper whose ``.ema`` module is also initialised.
    """
    logger.info(f"==============> Finetune {config.MODEL.FINETUNE}....................")
    checkpoint = torch.load(config.MODEL.FINETUNE, map_location="cpu")

    # Only the raw "model" weights are used. Earlier experiments that were
    # commented out here preferred "state_dict_ema", dropped classifier keys,
    # or copied shared dwconv weights across levels.
    converted_weights = checkpoint["model"]

    load_result = model.load_state_dict(converted_weights, strict=False)
    logger.info(load_result)

    has_ema = model_ema is not None
    if has_ema:
        ema_result = model_ema.ema.load_state_dict(converted_weights, strict=False)
        logger.info(
            "==============> Loaded Pretraind statedict into EMA...................."
        )
        logger.info(ema_result)

    del checkpoint
    torch.cuda.empty_cache()


def save_checkpoint(
    config,
    epoch,
    model,
    epoch_accuracy,
    max_accuracy,
    optimizer,
    logger,
    model_ema=None,
):
    """Write the training state for ``epoch`` to ``config.OUTPUT``.

    The checkpoint is always written to ``ckpt_epoch_{epoch}.pth``. It is
    additionally written to ``best.pth`` when ``epoch_accuracy`` is strictly
    greater than ``max_accuracy``.

    Stored keys: ``"model"``, ``"optimizer"``, ``"max_accuracy"`` (the larger
    of ``max_accuracy`` and ``epoch_accuracy``), ``"epoch"``,
    ``"state_dict_ema"`` (``None`` when ``model_ema`` is ``None``) and
    ``"config"``.

    Args:
        config: Config object exposing ``OUTPUT``; it is stored in the checkpoint.
        epoch: Epoch index used in the file name and stored in the checkpoint.
        model: Module whose ``state_dict()`` is saved.
        epoch_accuracy: Accuracy reached in this epoch.
        max_accuracy: Best accuracy seen before this epoch.
        optimizer: Optimizer whose ``state_dict()`` is saved.
        logger: Logger used for progress messages.
        model_ema: Optional wrapper whose ``.ema`` module state is saved.
    """
    if model_ema is not None:
        logger.info("Model EMA is not None...")
        ema_state = get_state_dict(model_ema.ema)
    else:
        ema_state = None

    # The former `"input": input` entry was dropped: no variable named `input`
    # exists here, so it pickled a reference to the Python builtin function.
    save_state = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "max_accuracy": max(max_accuracy, epoch_accuracy),
        "epoch": epoch,
        "state_dict_ema": ema_state,
        "config": config,
    }

    # Create the output directory if needed so a fresh run does not fail here.
    os.makedirs(config.OUTPUT, exist_ok=True)
    save_path = os.path.join(config.OUTPUT, f"ckpt_epoch_{epoch}.pth")
    best_path = os.path.join(config.OUTPUT, "best.pth")

    logger.info(f"{save_path} saving......")
    torch.save(save_state, save_path)
    if epoch_accuracy > max_accuracy:
        torch.save(save_state, best_path)
    logger.info(f"{save_path} saved !!!")


def get_grad_norm(parameters, norm_type=2):
    """Return the total gradient norm over ``parameters`` as a Python float.

    Parameters without a gradient are ignored. For a finite ``norm_type`` the
    per-parameter norms are combined as ``(sum_i ||g_i|| ** p) ** (1 / p)``.
    For ``norm_type=inf`` the result is the largest per-parameter infinity
    norm, because the accumulate-then-root formula is not valid in that case.

    Args:
        parameters: A single tensor or an iterable of tensors with ``.grad``.
        norm_type: Order of the norm; converted to ``float``.

    Returns:
        The combined norm, or ``0.0`` when no parameter has a gradient.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    grads = [p.grad.detach() for p in parameters if p.grad is not None]
    norm_type = float(norm_type)

    if norm_type == float("inf"):
        # x ** inf followed by ** (1 / inf) would collapse to 1.0.
        return max((g.norm(norm_type).item() for g in grads), default=0.0)

    total_norm = 0
    for g in grads:
        total_norm += g.norm(norm_type).item() ** norm_type
    return total_norm ** (1.0 / norm_type)


def auto_resume_helper(output_dir, logger):
    """Return the most recently modified ``ckpt_*pth`` file in ``output_dir``.

    Entries are matched by name only: they must start with ``"ckpt_"`` and
    end with ``"pth"``. Recency is decided by file modification time.

    Args:
        output_dir: Directory to scan.
        logger: Logger used to report what was found.

    Returns:
        Path of the newest matching file joined with ``output_dir``, or
        ``None`` when no entry matches.
    """
    checkpoints = []
    for entry in os.listdir(output_dir):
        if entry.startswith("ckpt_") and entry.endswith("pth"):
            checkpoints.append(entry)
    logger.info(f"All checkpoints founded in {output_dir}: {checkpoints}")

    if not checkpoints:
        return None

    candidates = [os.path.join(output_dir, name) for name in checkpoints]
    latest_checkpoint = max(candidates, key=os.path.getmtime)
    logger.info(f"The latest checkpoint founded: {latest_checkpoint}")
    return latest_checkpoint


def reduce_tensor(tensor):
    """Return a copy of ``tensor`` averaged across all distributed processes.

    The input is cloned, summed over the default process group with
    ``all_reduce``, and divided in place by the world size. The argument
    itself is left untouched.
    """
    averaged = tensor.clone()
    dist.all_reduce(averaged, op=dist.ReduceOp.SUM)
    averaged.div_(dist.get_world_size())
    return averaged


def denormalize(
    tensor: torch.Tensor, mean: List[float], std: List[float], inplace: bool = False
) -> torch.Tensor:
    """Denormalize a float tensor image with mean and standard deviation.

    Computes ``tensor * std + mean`` per channel and clips the result to the
    range ``[0.0, 1.0]``. This transform does not support PIL Image.

    .. note::
        This transform acts out of place by default, i.e., it does not mutate the input tensor.

    See :class:`~torchvision.transforms.Normalize` for the forward transform.

    Args:
        tensor (Tensor): Float tensor image of size (C, H, W) or (B, C, H, W) to be denormalized.
        mean (sequence): Sequence of means for each channel.
        std (sequence): Sequence of standard deviations for each channel.
        inplace(bool,optional): Bool to make this operation inplace.

    Returns:
        Tensor: Denormalized Tensor image, clipped to ``[0.0, 1.0]``.

    Raises:
        TypeError: If ``tensor`` is not a ``torch.Tensor`` or not floating point.
        ValueError: If ``tensor`` has fewer than 3 dimensions, or if ``std``
            contains a zero after conversion to the tensor's dtype.
    """
    if not isinstance(tensor, torch.Tensor):
        raise TypeError(f"Input tensor should be a torch tensor. Got {type(tensor)}.")

    if not tensor.is_floating_point():
        raise TypeError(f"Input tensor should be a float tensor. Got {tensor.dtype}.")

    if tensor.ndim < 3:
        raise ValueError(
            "Expected tensor to be a tensor image of size (..., C, H, W). Got tensor.size() = "
            f"{tensor.size()}."
        )

    if not inplace:
        tensor = tensor.clone()

    dtype = tensor.dtype
    mean = torch.as_tensor(mean, dtype=dtype, device=tensor.device)
    std = torch.as_tensor(std, dtype=dtype, device=tensor.device)
    if (std == 0).any():
        # Nothing is divided here; a zero std means the forward normalization
        # was undefined, so there is no meaningful inverse to compute.
        raise ValueError(
            f"std evaluated to zero after conversion to {dtype}, "
            "so the normalization cannot be inverted."
        )
    # Reshape per-channel statistics to (C, 1, 1) so they broadcast over H and W.
    if mean.ndim == 1:
        mean = mean.view(-1, 1, 1)
    if std.ndim == 1:
        std = std.view(-1, 1, 1)
    tensor.mul_(std).add_(mean).clip_(0.0, 1.0)
    return tensor