# Source code for attribench.result._deletion_result

from typing_extensions import override
import os
import yaml
import h5py
import numpy as np
from numpy import typing as npt
from typing import List, Tuple, Optional
from attribench.data.nd_array_tree._random_access_nd_array_tree import (
    RandomAccessNDArrayTree,
)
from ._metric_result import MetricResult
import pandas as pd


def _aoc(x: np.ndarray, columns: Optional[npt.NDArray] = None):
    """
    Computes the "area over the curve" along the last axis of ``x``.

    The result is the first value of each (selected) curve minus the mean of
    the (selected) curve values, i.e. the same normalisation that ``_auc``
    applies.

    Parameters
    ----------
    x : np.ndarray
        Array whose last axis holds the curve values.
    columns : Optional[npt.NDArray]
        Indices (or boolean mask) of the last-axis entries to use.
        If None, uses all entries.

    Returns
    -------
    np.ndarray
        ``x`` reduced over its last axis.
    """
    if columns is not None:
        # Select exactly once. The selection must not be applied a second
        # time to the already-reduced array: the indices refer to the
        # original columns and would be out of range or point at the wrong
        # entries.
        x = x[..., columns]
    # Mean over the last axis, mirroring _auc (sum divided by the number of
    # values that are actually summed).
    mean_curve = np.sum(x, axis=-1) / x.shape[-1]
    return x[..., 0] - mean_curve


def _auc(x: np.ndarray, columns: Optional[npt.NDArray] = None):
    """
    Computes the "area under the curve" along the last axis of ``x`` as the
    mean of the (selected) curve values.

    Parameters
    ----------
    x : np.ndarray
        Array whose last axis holds the curve values.
    columns : Optional[npt.NDArray]
        Indices (or boolean mask) of the last-axis entries to use.
        If None, uses all entries.

    Returns
    -------
    np.ndarray
        ``x`` reduced over its last axis.
    """
    # TODO do we have to normalize by the first value? Same for AOC.
    # Should actually not make a difference.
    if columns is not None:
        x = x[..., columns]
    # Normalise by the number of values that are actually summed. Using the
    # length of `columns` instead is wrong for boolean masks (mask length is
    # not the number of selected entries) and fails for plain sequences.
    num_values = x.shape[-1]
    return np.sum(x, axis=-1) / num_values


class DeletionResult(MetricResult):
    """
    Represents results from running the Deletion metric.

    In addition to what the parent class is initialised with, the result
    keeps the deletion mode in ``self.mode`` and persists it as extra
    metadata when saved.
    """

    def __init__(
        self,
        method_names: List[str],
        maskers: List[str],
        activation_fns: List[str],
        mode: str,
        num_samples: int,
        num_steps: int,
    ):
        """
        Parameters
        ----------
        method_names : List[str]
            Names of attribution methods tested by Deletion.
        maskers : List[str]
            Names of maskers used by Deletion.
        activation_fns : List[str]
            Names of activation functions used by Deletion.
        mode : str
            Indicates if Deletion-MoRF or Deletion-LeRF was used.
            Options: "morf", "lerf"
        num_samples : int
            Number of samples on which Deletion was run.
        num_steps : int
            Number of steps used by Deletion.
        """
        levels = {
            "method": method_names,
            "masker": maskers,
            "activation_fn": activation_fns,
        }
        level_order = ["method", "masker", "activation_fn"]
        shape = [num_samples, num_steps]
        super().__init__(method_names, shape, levels, level_order)
        self.mode = mode

    @override
    def save(self, path: str, format="hdf5"):
        """
        Saves the result through the parent class and then stores the
        deletion mode as additional metadata.

        Parameters
        ----------
        path : str
            Path to the HDF5 file (``format="hdf5"``) or to the directory
            containing ``metadata.yaml`` (``format="csv"``).
        format : str, optional
            Format of the saved result.
            Options: "hdf5", "csv".
            By default "hdf5".
        """
        super().save(path, format)

        # Save additional metadata
        if format == "hdf5":
            # Append mode: the file written by the parent is kept and only
            # the attribute is added.
            with h5py.File(path, mode="a") as fp:
                fp.attrs["mode"] = self.mode
        elif format == "csv":
            # NOTE(review): metadata.yaml is expected to exist at this point,
            # presumably written by the parent's save() - confirm.
            metadata_path = os.path.join(path, "metadata.yaml")
            with open(metadata_path, "r") as fp:
                metadata = yaml.safe_load(fp)
            metadata["mode"] = self.mode
            with open(metadata_path, "w") as fp:
                yaml.dump(metadata, fp)

    @classmethod
    def _load_tree_mode(
        cls, path: str, format="hdf5"
    ) -> Tuple[RandomAccessNDArrayTree, str]:
        """Loads the tree and mode from a file or directory.

        Parameters
        ----------
        path : str
            Path to the file or directory.
        format : str, optional
            Format of the saved result.
            Options: "hdf5", "csv".
            By default "hdf5".

        Returns
        -------
        Tuple[RandomAccessNDArrayTree, str]
            The RandomAccessNDArrayTree object and the mode as a string.

        Raises
        ------
        ValueError
            If the format argument is not valid.
        """
        if format == "hdf5":
            with h5py.File(path, "r") as fp:
                tree = RandomAccessNDArrayTree.load_from_hdf(fp)
                mode = str(fp.attrs["mode"])
        elif format == "csv":
            with open(os.path.join(path, "metadata.yaml"), "r") as fp:
                metadata = yaml.safe_load(fp)
            tree = RandomAccessNDArrayTree.load_from_dir(path)
            mode = metadata["mode"]
        else:
            raise ValueError("Invalid format", format)
        return tree, mode

    @classmethod
    @override
    def _load(cls, path: str, format="hdf5") -> "DeletionResult":
        """
        Loads a DeletionResult from a file or directory.

        The constructor arguments are taken from the levels and shape of the
        loaded tree; the loaded tree is then attached to the new object.

        Parameters
        ----------
        path : str
            Path to the file or directory.
        format : str, optional
            Format of the saved result.
            Options: "hdf5", "csv".
            By default "hdf5".

        Returns
        -------
        DeletionResult
            The loaded result.

        Raises
        ------
        ValueError
            If the format argument is not valid.
        """
        tree, mode = cls._load_tree_mode(path, format)
        res = DeletionResult(
            tree.levels["method"],
            tree.levels["masker"],
            tree.levels["activation_fn"],
            mode,
            tree.shape[0],
            tree.shape[1],
        )
        res.tree = tree
        return res

    @override
    def get_df(
        self,
        masker: str,
        activation_fn: str,
        agg_fn="auc",
        methods: Optional[List[str]] = None,
        columns: Optional[npt.NDArray] = None,
    ) -> Tuple[pd.DataFrame, bool]:
        """
        Retrieves a dataframe from the result for a given masker and
        activation function. The dataframe contains a row for each sample and a
        column for each method. Each value is the AUC/AOC for the given method
        on the given sample.

        Parameters
        ----------
        masker : str
            The masker to use.
        activation_fn : str
            The activation function to use.
        agg_fn : str
            Either "auc" for AUC or "aoc" for AOC.
        methods : Optional[List[str]]
            The methods to include. If None, includes all methods.
        columns : Optional[npt.NDArray]
            The columns used in the AUC/AOC calculation.
            If None, uses all columns.

        Returns
        -------
        Tuple[pd.DataFrame, bool]
            dataframe containing results,
            and boolean indicating if higher is better.

        Raises
        ------
        KeyError
            If agg_fn is neither "auc" nor "aoc".
        """
        # Resolve the aggregation once, before touching the tree, so that an
        # unsupported agg_fn fails immediately (also when no methods are
        # requested).
        agg_fns = {"auc": _auc, "aoc": _aoc}
        aggregate = agg_fns[agg_fn]

        # Higher is better for MoRF combined with AOC and for LeRF combined
        # with AUC; every other combination yields False.
        higher_is_better = (self.mode == "morf" and agg_fn == "aoc") or (
            self.mode == "lerf" and agg_fn == "auc"
        )

        methods = methods if methods is not None else self.method_names
        df_dict = {
            method: aggregate(
                self.tree.get(
                    masker=masker, activation_fn=activation_fn, method=method
                ),
                columns,
            )
            for method in methods
        }
        return pd.DataFrame.from_dict(df_dict), higher_is_better