# Source code for instancelib.environment.base

# Copyright (C) 2021 The InstanceLib Authors. All Rights Reserved.

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

from __future__ import annotations

import random

from typing import (
    Callable,
    Generic,
    Iterable,
    Iterator,
    Mapping,
    MutableMapping,
    Optional,
    Sequence,
    Tuple,
    TypeVar,
    Any,
    Union,
)
from abc import ABC, abstractmethod

from ..utils.func import union
from ..instances.base import (
    InstanceProvider,
    Instance,
    default_instance_viewer,
)
from ..labels.base import LabelProvider, default_label_viewer

from ..typehints import KT, DT, VT, RT, LT

from ..export.pandas import to_pandas

import pandas as pd

import warnings


# Type variable for the concrete instance class handled by an Environment
# and its providers. It is bound to ``Instance`` (with arbitrary type
# parameters), so only ``Instance`` subclasses may be substituted for it.
InstanceType = TypeVar("InstanceType", bound="Instance[Any, Any, Any, Any]")


class Environment(
    MutableMapping[str, InstanceProvider[InstanceType, KT, DT, VT, RT]],
    ABC,
    Generic[InstanceType, KT, DT, VT, RT, LT],
):
    """Environments provide an interface that enable you to access all data stored in the datasets.
    If there are labels stored in the environment, you can access these as well from here.

    There are two important properties in every :class:`Environment`:

    - :meth:`dataset`: Contains all Instances of the original dataset
    - :meth:`labels`: Contains an object that allows you to access labels easily

    Besides these properties, this object also provides methods to create new
    :class:`~instancelib.InstanceProvider` objects that contain a subset of
    the set of all instances stored in this environment.

    An :class:`Environment` is also a mutable mapping from names (``str``)
    to :class:`~instancelib.InstanceProvider` objects; see
    :meth:`named_providers`.

    Examples
    --------

    Access the dataset:

    >>> dataset = env.dataset
    >>> instance = next(iter(dataset.values()))

    Access the labels:

    >>> labels = env.labels
    >>> ins_lbls = labels.get_labels(instance)

    Create a train-test split on the dataset (70 % train, 30 % test):

    >>> train, test = env.train_test_split(dataset, 0.70)
    """

    @abstractmethod
    def create_empty_provider(
        self,
    ) -> InstanceProvider[InstanceType, KT, DT, VT, RT]:
        """Use this method to create an empty `InstanceProvider`

        Returns
        -------
        InstanceProvider[InstanceType, KT, DT, VT, RT]
            The newly created provider
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def dataset(self) -> InstanceProvider[InstanceType, KT, DT, VT, RT]:
        """This property contains the `InstanceProvider` that contains
        the original dataset. This provider should include all original
        instances.

        Returns
        -------
        InstanceProvider[InstanceType, KT, DT, VT, RT]
            The dataset :class:`InstanceProvider`
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def all_instances(self) -> InstanceProvider[InstanceType, KT, DT, VT, RT]:
        """This provider should include all instances in all providers.
        If there are any synthetic datapoints constructed,
        they should be also in here.

        Returns
        -------
        InstanceProvider[InstanceType, KT, DT, VT, RT]
            The all_instances :class:`InstanceProvider`
        """
        raise NotImplementedError

    @property
    def all_datapoints(self) -> InstanceProvider[InstanceType, KT, DT, VT, RT]:
        """Deprecated alias of :meth:`all_instances`.

        This provider should include all instances in all providers.
        If there are any synthetic datapoints constructed,
        they should be also in here.

        Returns
        -------
        InstanceProvider[InstanceType, KT, DT, VT, RT]
            The all_datapoints :class:`InstanceProvider`

        Warning
        -------
        Deprecated, use the all_instances property instead!
        Accessing this property emits a :class:`DeprecationWarning`.

        """
        # stacklevel=2 attributes the warning to the code that accessed the
        # deprecated property instead of to this line inside the library.
        warnings.warn(
            "Use the `all_instances` property instead!",
            category=DeprecationWarning,
            stacklevel=2,
        )
        return self.all_instances

    @property
    @abstractmethod
    def labels(self) -> LabelProvider[KT, LT]:
        """This property contains provider that has a mapping from instances to labels and
        vice-versa.

        Returns
        -------
        LabelProvider[KT, LT]
            The label provider
        """
        raise NotImplementedError

    def add_vectors(self, keys: Sequence[KT], vectors: Sequence[VT]) -> None:
        """This method adds feature vectors or embeddings to instances
        associated with the keys in the first parameters. The sequences
        `keys` and `vectors` should have the same length.

        The work is delegated to ``bulk_add_vectors`` of the
        :meth:`all_instances` provider.

        Parameters
        ----------
        keys : Sequence[KT]
            A sequence of keys
        vectors : Sequence[VT]
            A sequence of vectors that should be associated with the instances
            of the sequence `keys`

        """
        self.all_instances.bulk_add_vectors(keys, vectors)

    def create(self, *args: Any, **kwargs: Any) -> InstanceType:
        """Create a new Instance

        All positional and keyword arguments are forwarded unchanged to the
        ``create`` method of the :meth:`all_instances` provider.

        Returns
        -------
        InstanceType
            A new instance
        """
        return self.all_instances.create(*args, **kwargs)

    @abstractmethod
    def create_bucket(
        self, keys: Iterable[KT]
    ) -> InstanceProvider[InstanceType, KT, DT, VT, RT]:
        """Create an InstanceProvider that contains certain keys found in this
        environment.

        Parameters
        ----------
        keys : Iterable[KT]
            The keys that should be included in this bucket

        Returns
        -------
        InstanceProvider[InstanceType, KT, DT, VT, RT]
            An InstanceProvider that contains the instances specified in `keys`

        """
        raise NotImplementedError

    def train_test_split(
        self,
        source: InstanceProvider[InstanceType, KT, DT, VT, RT],
        train_size: Union[float, int],
    ) -> Tuple[
        InstanceProvider[InstanceType, KT, DT, VT, RT],
        InstanceProvider[InstanceType, KT, DT, VT, RT],
    ]:
        """Divide an InstanceProvider into two different providers containing a random
        division of the input according to the parameter `train_size`.

        Parameters
        ----------
        source : InstanceProvider[InstanceType, KT, DT, VT, RT]
            The InstanceProvider that should be divided
        train_size : Union[float, int]
            The number (int) of instances that should be included in the training
            or a float (between 0 and 1) of train / test ratio. A float is
            multiplied by ``len(source)`` and rounded to the nearest integer.

        Examples
        --------
        Example usage

        >>> train_val, test = env.train_test_split(provider, 0.70)
        >>> train, val = env.train_test_split(train_val, 0.70)


        Returns
        -------
        Tuple[InstanceProvider[InstanceType, KT, DT, VT, RT], InstanceProvider[InstanceType, KT, DT, VT, RT]]
            A Tuple containing two InstanceProviders:
                - The training set (containing `train_size` documents)
                - The test set

        Raises
        ------
        ValueError
            If the resulting number of training instances is negative or
            larger than the number of unique keys in `source`.
        """
        if isinstance(train_size, float):
            n_train_docs = round(train_size * len(source))
        else:
            n_train_docs = train_size
        # De-duplicate the keys before sampling
        source_keys = list(frozenset(source.key_list))

        # random.sample would reject these sizes as well (also with a
        # ValueError); checking here gives a message in terms of `train_size`.
        n_available = len(source_keys)
        if not 0 <= n_train_docs <= n_available:
            raise ValueError(
                f"train_size resolves to {n_train_docs} instances, but it "
                f"must be between 0 and {n_available} (the size of `source`)"
            )

        # Randomly sample train keys (without replacement)
        train_keys = random.sample(source_keys, n_train_docs)
        # The remainder should be used for testing
        test_keys = frozenset(source_keys).difference(train_keys)

        train_provider = self.create_bucket(train_keys)
        test_provider = self.create_bucket(test_keys)
        return train_provider, test_provider

    def combine(
        self,
        *providers: InstanceProvider[InstanceType, KT, DT, VT, RT],
    ) -> InstanceProvider[InstanceType, KT, DT, VT, RT]:
        """Combine Providers into a single Provider

        The key lists of all supplied providers are merged with the
        ``union`` helper and a new bucket is created from the result.

        Parameters
        ----------
        providers
            The providers that should be combined into a single provider

        Returns
        -------
        InstanceProvider[InstanceType, KT, DT, VT, RT]
            The provider that contains all elements of the supplied Providers
        """
        keys = union(*(frozenset(pr.key_list) for pr in providers))
        return self.create_bucket(keys)

    def get_children(
        self, parent: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> InstanceProvider[InstanceType, KT, DT, VT, RT]:
        """Get the children that are registered to this parent

        Parameters
        ----------
        parent : Union[KT, Instance[KT, DT, VT, RT]]
            The parent from which you want to get the children from.

        Returns
        -------
        InstanceProvider[InstanceType, KT, DT, VT, RT]
            A Provider that contains all children
        """
        child_keys = self.all_instances.get_children_keys(parent)
        return self.create_bucket(child_keys)

    def get_parent(
        self, child: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> InstanceType:
        """Get the parent of a child

        Parameters
        ----------
        child : Union[KT, Instance[KT, DT, VT, RT]]
            The child instance (or its key) whose parent you want to retrieve.

        Returns
        -------
        InstanceType
            The parent of this child instance
        """
        return self.all_instances.get_parent(child)

    def discard_children(
        self, parent: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> None:
        """Discard all children from this parent

        The call is forwarded to ``discard_children`` of the
        :meth:`all_instances` provider.

        Parameters
        ----------
        parent : Union[KT, Instance[KT, DT, VT, RT]]
            The parent Instance
        """
        self.all_instances.discard_children(parent)

    def get_subset_by_labels(
        self,
        provider: InstanceProvider[InstanceType, KT, DT, VT, RT],
        *labels: LT,
        labelprovider: Optional[LabelProvider[KT, LT]] = None,
    ) -> InstanceProvider[InstanceType, KT, DT, VT, RT]:
        """Select the instances of a provider that carry any of the given labels

        For every label the keys returned by ``get_instances_by_label`` are
        collected and merged with the ``union`` helper; the result is then
        intersected with the keys obtained by iterating over `provider`.

        Parameters
        ----------
        provider : InstanceProvider[InstanceType, KT, DT, VT, RT]
            The provider to which the selection is restricted
        labels : LT
            The labels to select on
        labelprovider : Optional[LabelProvider[KT, LT]], optional
            The label provider that is queried. When ``None`` (the default),
            the :meth:`labels` property of this environment is used.

        Returns
        -------
        InstanceProvider[InstanceType, KT, DT, VT, RT]
            A new bucket (see :meth:`create_bucket`) built from the
            selected keys
        """
        l_provider = self.labels if labelprovider is None else labelprovider
        keys = union(
            *(l_provider.get_instances_by_label(label) for label in labels)
        ).intersection(provider)
        return self.create_bucket(keys)

    @property
    def named_providers(
        self,
    ) -> Mapping[str, InstanceProvider[InstanceType, KT, DT, VT, RT]]:
        """The providers that are stored under a name in this environment

        Returns
        -------
        Mapping[str, InstanceProvider[InstanceType, KT, DT, VT, RT]]
            A new ``dict`` built from the mapping interface of this
            environment (name -> provider)
        """
        return dict(self)

    @abstractmethod
    def set_named_provider(
        self, name: str, value: InstanceProvider[InstanceType, KT, DT, VT, RT]
    ):
        """Store a provider under a name.

        The exact storage semantics are defined by the implementing subclass.

        Parameters
        ----------
        name : str
            The name under which the provider should be stored
        value : InstanceProvider[InstanceType, KT, DT, VT, RT]
            The provider that should be stored
        """
        raise NotImplementedError

    @abstractmethod
    def create_named_provider(
        self, name: str, keys: Iterable[KT] = list()
    ) -> InstanceProvider[InstanceType, KT, DT, VT, RT]:
        """Create a provider for the given keys and associate it with a name.

        The exact semantics are defined by the implementing subclass.

        Parameters
        ----------
        name : str
            The name of the new provider
        keys : Iterable[KT], optional
            The keys that the new provider should contain, by default empty

        Returns
        -------
        InstanceProvider[InstanceType, KT, DT, VT, RT]
            The provider associated with `name`
        """
        raise NotImplementedError

    def __repr__(self) -> str:
        return self.__str__()

    def __str__(self) -> str:
        result = (
            f"Environment(dataset={self.dataset}, \n"
            f"   labels={self.labels}, \n"
            f"   named_providers={self.named_providers}, \n"
            f"   length={len(self.all_instances)}, \n"
            f"   typeinfo={self.all_instances.type_info}) \n"
        )
        return result

    def to_pandas(
        self,
        provider: Optional[
            InstanceProvider[InstanceType, KT, DT, VT, RT]
        ] = None,
        labels: Optional[LabelProvider[KT, LT]] = None,
        instance_viewer: Callable[
            [Instance[KT, DT, VT, RT]], Mapping[str, Any]
        ] = default_instance_viewer,
        label_viewer: Callable[
            [KT, LabelProvider[KT, LT]], Mapping[str, Any]
        ] = default_label_viewer,
        provider_hooks: Sequence[
            Callable[
                [InstanceProvider[InstanceType, KT, DT, VT, RT]],
                Mapping[KT, Mapping[str, Any]],
            ]
        ] = list(),
    ) -> pd.DataFrame:
        """Export a provider and its labels to a :class:`pandas.DataFrame`

        This is a thin wrapper around the module-level ``to_pandas``
        export function; all arguments are passed on to it.

        Parameters
        ----------
        provider : Optional[InstanceProvider[InstanceType, KT, DT, VT, RT]]
            The provider that should be exported. When ``None`` (the
            default), :meth:`dataset` is used.
        labels : Optional[LabelProvider[KT, LT]]
            The label provider that should be exported. When ``None``
            (the default), :meth:`labels` is used.
        instance_viewer : Callable[[Instance[KT, DT, VT, RT]], Mapping[str, Any]]
            Callable that maps an instance to a mapping with string keys,
            by default ``default_instance_viewer``
        label_viewer : Callable[[KT, LabelProvider[KT, LT]], Mapping[str, Any]]
            Callable that maps a key and a label provider to a mapping
            with string keys, by default ``default_label_viewer``
        provider_hooks : Sequence[Callable]
            Callables that each map a provider to a per-key mapping,
            by default empty

        Returns
        -------
        pd.DataFrame
            The result of the export function
        """
        chosen_provider = self.dataset if provider is None else provider
        chosen_labels = self.labels if labels is None else labels
        # Inside a method body the name `to_pandas` resolves to the function
        # imported at module level, not to this method (class scope is not
        # part of the lookup chain), so this is not a recursive call.
        return to_pandas(
            chosen_provider,
            chosen_labels,
            instance_viewer,
            label_viewer,
            provider_hooks,
        )


class AbstractEnvironment(
    Environment[InstanceType, KT, DT, VT, RT, LT],
    ABC,
    Generic[InstanceType, KT, DT, VT, RT, LT],
):
    """Abstract environment that exposes the :class:`Environment` interface.

    This class defines no members of its own; every property and method is
    inherited from :class:`Environment`. As with any environment, the two
    central properties are:

    - :meth:`dataset`: Contains all Instances of the original dataset
    - :meth:`labels`: Contains an object that allows you to access labels easily

    In addition, the inherited methods allow you to build new
    :class:`~instancelib.InstanceProvider` objects that hold a subset of
    the instances stored in this environment.

    Examples
    --------

    Retrieve the dataset and take one instance from it:

    >>> dataset = env.dataset
    >>> instance = next(iter(dataset.values()))

    Look up the labels of that instance:

    >>> labels = env.labels
    >>> ins_lbls = labels.get_labels(instance)

    Split the dataset into 70 % train and 30 % test:

    >>> train, test = env.train_test_split(dataset, 0.70)
    """