# Source code for instancelib.instances.base

# Copyright (C) 2021 The InstanceLib Authors. All Rights Reserved.

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass

from typing import (
    Any,
    Callable,
    FrozenSet,
    Generic,
    Iterable,
    Iterator,
    List,
    Mapping,
    MutableMapping,
    Optional,
    Sequence,
    Tuple,
    Type,
    TypeVar,
    Union,
)

from ..utils.chunks import divide_iterable_in_lists
from ..utils.func import filter_snd_none_zipped

from ..typehints import KT, DT, VT, RT

# Result type of the user-supplied functions accepted by the map helpers
# in this module (e.g. ``Instance.map_data`` and ``ROInstanceProvider.map``).
_V = TypeVar("_V")


@dataclass
class TypeInfo:
    """Summary of the runtime types of the four parts of an instance.

    Attributes
    ----------
    identifier
        The type of the identifier
    data
        The type of the raw data
    vector
        The type of the vector
    representation
        The type of the representation
    """

    identifier: Type
    data: Type
    vector: Type
    representation: Type

    @staticmethod
    def _name_of(tp: object) -> str:
        """Return ``tp.__name__``, or ``repr(tp)`` if `tp` has no such attribute."""
        try:
            return tp.__name__  # type: ignore
        except AttributeError:
            # Not every type-like object carries a ``__name__``;
            # fall back to its repr instead of failing inside ``__repr__``.
            return repr(tp)

    def __repr__(self) -> str:
        name_of = self._name_of
        return (
            "TypeInfo("
            f"identifier={name_of(self.identifier)}, "
            f"data={name_of(self.data)}, "
            f"vector={name_of(self.vector)}, "
            f"representation={name_of(self.representation)})"
        )

    def __str__(self) -> str:
        return self.__repr__()


class Instance(ABC, Generic[KT, DT, VT, RT]):
    """A base Instance Class.

    Every Instance contains 4 properties:

        - A unique identifier (`identifier`)
        - The raw data (`data`)
        - A vector representation of the data (`vector`)
        - A human readable representation (`representation`)

    The ABC Instance has four Generic types:

        - :data:`~instancelib.typehints.KT`: The type of the key
        - :data:`~instancelib.typehints.DT`: The type of the data
        - :data:`~instancelib.typehints.VT`: The type of the vector
        - :data:`~instancelib.typehints.RT`: The type of the representation

    Combining these four items in a single object enables easy transfer between
    different operations like predictions, annotation and transformation.
    """

    @property
    @abstractmethod
    def data(self) -> DT:
        """Return the raw data of this instance

        Returns
        -------
        DT
            The Raw Data
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def representation(self) -> RT:
        """Return a representation for annotation

        Returns
        -------
        RT
            A representation of the raw data
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def vector(self) -> Optional[VT]:
        """Get the vector representation of the raw data

        Returns
        -------
        Optional[VT]
            The vector, or `None` if no vector has been set
        """
        raise NotImplementedError

    @vector.setter
    def vector(self, value: Optional[VT]) -> None:  # type: ignore
        """Set the vector representation of the raw data

        Parameters
        ----------
        value : Optional[VT]
            A vector value (this may be `None`)

        Note
        ----
        It may be better to use the
        :meth:`InstanceProvider.bulk_add_vectors` method
        if you want to update the vectors of many instances
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def identifier(self) -> KT:
        """Get the identifier of the instance

        Returns
        -------
        KT
            The identifier key of the instance
        """
        raise NotImplementedError

    @identifier.setter
    def identifier(self, value: KT) -> None:
        """Set the identifier of the instance

        Parameters
        ----------
        value : KT
            The new identifier
        """
        raise NotImplementedError

    def __repr__(self) -> str:
        # Read the property and build its repr only once; subclasses decide
        # how expensive accessing ``data`` is.
        data = self.data
        data_repr = repr(data)
        # Values with a repr of at most 20 characters are interpolated as-is;
        # longer ones show the first 20 characters of the repr plus " ...'".
        data_short = data if len(data_repr) <= 20 else f"{data_repr[0:20]} ...'"
        return (
            f"Instance(identifier={self.identifier}, "
            f"data={data_short}, "
            f"has_vector={self.vector is not None})"
        )

    def __str__(self) -> str:
        return self.__repr__()

    def to_dict(self) -> Mapping[str, Any]:
        """Return the four parts of this instance as a mapping

        Returns
        -------
        Mapping[str, Any]
            A mapping with the keys ``"identifier"``, ``"data"``,
            ``"vector"`` and ``"representation"``
        """
        return {
            "identifier": self.identifier,
            "data": self.data,
            "vector": self.vector,
            "representation": self.representation,
        }

    @staticmethod
    def map_data(
        func: Callable[[DT], _V]
    ) -> Callable[[Instance[KT, DT, VT, RT]], _V]:
        """Transform function that works on raw data into a function that works on
        :class:`Instance` objects.

        Parameters
        ----------
        func : Callable[[DT], _V]
            The function that works on raw data

        Returns
        -------
        Callable[[Instance[KT, DT, VT, RT]], _V]
            The transformed function
        """

        def wrapped(instance: Instance[KT, DT, VT, RT]) -> _V:
            return func(instance.data)

        return wrapped

    @staticmethod
    def map_vector(
        func: Callable[[VT], _V]
    ) -> Callable[[Instance[KT, DT, VT, RT]], Optional[_V]]:
        """Transform function that works on vectors into a function that works on
        :class:`Instance` objects.

        Parameters
        ----------
        func : Callable[[VT], _V]
            The function that works on vectors

        Returns
        -------
        Callable[[Instance[KT, DT, VT, RT]], Optional[_V]]
            The transformed function. It returns `None` (without calling
            `func`) for instances whose vector is `None`.
        """

        def wrapped(instance: Instance[KT, DT, VT, RT]) -> Optional[_V]:
            # Read the property once so the check and the call see the same value
            vector = instance.vector
            if vector is None:
                return None
            return func(vector)

        return wrapped

    @staticmethod
    def vectorized_data_map(
        func: Callable[[Iterable[DT]], _V]
    ) -> Callable[[Iterable[Instance[KT, DT, VT, RT]]], _V]:
        """Transform function that works on sequences of raw data
        into a function that works on sequences of :class:`Instance` objects.

        Parameters
        ----------
        func : Callable[[Iterable[DT]], _V]
            The function that works on sequences of raw data

        Returns
        -------
        Callable[[Iterable[Instance[KT, DT, VT, RT]]], _V]
            The transformed function
        """

        def wrapped(instances: Iterable[Instance[KT, DT, VT, RT]]) -> _V:
            # `func` receives a one-shot generator, not a list
            data = (instance.data for instance in instances)
            return func(data)

        return wrapped

    @property
    def type_info(self) -> TypeInfo:
        """The runtime types of this instance's four parts

        Returns
        -------
        TypeInfo
            The result of calling :func:`type` on the identifier, data,
            vector and representation of this instance
        """
        return TypeInfo(
            type(self.identifier),
            type(self.data),
            type(self.vector),
            type(self.representation),
        )


# Type variable for the concrete instance class stored in a provider;
# the bound restricts it to Instance and its subclasses.
InstanceType = TypeVar("InstanceType", bound="Instance[Any, Any, Any, Any]")


class ROInstanceProvider(
    Mapping[KT, InstanceType], ABC, Generic[InstanceType, KT, DT, VT, RT]
):
    """The Base InstanceProvider class (ReadOnly).

    This class provides an abstract implementation for a dataset.
    The InstanceProvider has five Generic types:

        - :data:`InstanceType` : A subclass of :class:`Instance`
        - :data:`~instancelib.typehints.KT`: The type of the key
        - :data:`~instancelib.typehints.DT`: The type of the data
        - :data:`~instancelib.typehints.VT`: The type of the vector
        - :data:`~instancelib.typehints.RT`: The type of the representation

    Specifying these allows Python to ensure the correctness of your implementation
    and eases further integration in your application.

    Examples
    --------

    Example implementation:

    >>> class TextProvider(ROInstanceProvider[Instance[int, str, npt.NDArray[Any], str],
    ...                                       int, str, npt.NDArray[Any], str]):
    ...     # Further implementation is needed
    >>> provider = TextProvider()

    Instance access:

    >>> first_key = next(iter(provider))
    >>> first_doc = provider[first_key]

    There are a number of :func:`~abc.abstractmethod` that need to be implemented
    in your own implementation. See the source of this file to see what you need to
    implement.
    """

    @abstractmethod
    def __contains__(self, item: object) -> bool:
        """Special method that checks if something is contained in this
        provider.

        Parameters
        ----------
        item : object
            The item of which we want to know if it is contained in this
            provider

        Returns
        -------
        bool
            True if the provider contains `item`.

        Examples
        --------
        Example usage; check if the item exists and then retrieve it

        >>> doc_id = 20
        >>> if doc_id in provider:
        ...     instance = provider[doc_id]
        """
        raise NotImplementedError

    @abstractmethod
    def __iter__(self) -> Iterator[KT]:
        """Enables you to iterate over the keys of the Instances

        Yields
        ------
        :class:`KT`
            Keys included in the provider
        """
        raise NotImplementedError

    @property
    def key_list(self) -> List[KT]:
        """Return a list of all instance keys in this provider

        Returns
        -------
        List[KT]
            A list of instance keys
        """
        return list(self.keys())

    @property
    @abstractmethod
    def empty(self) -> bool:
        """Determines if the provider does not contain instances

        Returns
        -------
        bool
            True if the provider is empty
        """
        raise NotImplementedError

    @abstractmethod
    def get_all(self) -> Iterator[InstanceType]:
        """Get an iterator that iterates over all instances

        Yields
        ------
        InstanceType
            An iterator that iterates over all instances
        """
        raise NotImplementedError

    @abstractmethod
    def clear(self) -> None:
        """Removes all instances from the provider

        Warning
        -------
        Use this operation with caution! This operation is intended for
        use with providers that function as temporary user queues, not
        for large proportions of the dataset like `unlabeled` and `labeled`
        sets.
        """
        raise NotImplementedError

    def bulk_add_vectors(
        self, keys: Sequence[KT], values: Sequence[VT]
    ) -> None:
        """This methods adds vectors in `values` to the instances specified
        in `keys`.

        In some use cases, vectors are not known beforehand. This library
        provides several :term:`vectorizer` s that convert raw data points
        in feature vector form. Once these vectors are available, they can be
        added to the provider by using this method

        Parameters
        ----------
        keys
            A sequence of keys
        values
            A sequence of vectors

        Warning
        -------
        We assume that the indices and length of the parameters `keys` and `values`
        match. The two sequences are paired with :func:`zip`, so surplus
        elements of the longer sequence are silently ignored.
        """
        for key, vec in zip(keys, values):
            self[key].vector = vec

    def bulk_get_vectors(
        self, keys: Sequence[KT]
    ) -> Tuple[Sequence[KT], Sequence[VT]]:
        """Given a list of instance `keys`, return the vectors

        Parameters
        ----------
        keys : Sequence[KT]
            A list of instance keys

        Returns
        -------
        Tuple[Sequence[KT], Sequence[VT]]
            A tuple of two sequences, one with `keys` and one with `vectors`.
            The indices match, so the instance with ``keys[2]`` has as
            vector ``vectors[2]``

        Warning
        -------
        Some underlying implementations do not preserve the ordering of the parameter
        `keys`. Therefore, always use the keys variable from the returned tuple for
        the correct matching.
        """
        vector_pairs = ((key, self[key].vector) for key in keys)
        # NOTE(review): judging by its name, the helper drops the pairs whose
        # vector is None and unzips the rest -- confirm against utils.func.
        ret_keys, ret_vectors = filter_snd_none_zipped(vector_pairs)
        return ret_keys, ret_vectors  # type: ignore

    def data_chunker(
        self, batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, DT]]]:
        """Iterate over all instances data parts in
        this provider

        Parameters
        ----------
        batch_size : int
            The batch size, the generator will return lists with size `batch_size`

        Yields
        -------
        Sequence[Tuple[KT,DT]]
            A sequence of (identifier, data) tuples with length `batch_size`.
            The last list may have a shorter length.
        """
        datapoints = ((ins.identifier, ins.data) for ins in self.values())
        chunks = divide_iterable_in_lists(datapoints, batch_size)
        yield from chunks

    def data_chunker_selector(
        self, keys: Iterable[KT], batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, DT]]]:
        """Iterate over the data parts of the instances in this provider
        whose identifier occurs in `keys`

        The instances are visited in the order of ``self.values()``,
        not in the order of `keys`. Keys that do not belong to an instance
        in this provider are ignored.

        Parameters
        ----------
        keys : Iterable[KT]
            The identifiers of the instances that should be included
        batch_size : int
            The batch size, the generator will return lists with size `batch_size`

        Yields
        -------
        Sequence[Tuple[KT,DT]]
            A sequence of (identifier, data) tuples with length `batch_size`.
            The last list may have a shorter length.
        """
        keyset = frozenset(keys)
        datapoints = (
            (ins.identifier, ins.data)
            for ins in self.values()
            if ins.identifier in keyset
        )
        chunks = divide_iterable_in_lists(datapoints, batch_size)
        yield from chunks

    @property
    def with_vector(self) -> FrozenSet[KT]:
        """The keys of all instances whose vector is not `None`

        Returns
        -------
        FrozenSet[KT]
            The keys of the instances that have a vector
        """
        return frozenset((k for k, v in self.items() if v.vector is not None))

    @property
    def without_vector(self) -> FrozenSet[KT]:
        """The keys of all instances whose vector is `None`

        Returns
        -------
        FrozenSet[KT]
            The keys of the instances that do not have a vector
        """
        return frozenset((k for k, v in self.items() if v.vector is None))

    def instance_chunker_selector(
        self, keys: Iterable[KT], batch_size: int = 200
    ) -> Iterator[Sequence[InstanceType]]:
        """Iterate over the instances that belong to `keys`, in batches

        In contrast to :meth:`data_chunker_selector`, the instances are
        retrieved with ``self[key]`` in the order of `keys`, so any error
        raised by ``__getitem__`` for an unknown key is propagated.

        Parameters
        ----------
        keys : Iterable[KT]
            The identifiers of the instances that should be retrieved
        batch_size : int
            The batch size, the generator will return lists with size `batch_size`

        Yields
        -------
        Sequence[InstanceType]
            A list of instances with length `batch_size`. The last list may have
            a shorter length.
        """
        chunks = divide_iterable_in_lists(keys, batch_size)
        for chunk in chunks:
            yield [self[key] for key in chunk]

    def instance_chunker(
        self, batch_size: int = 200
    ) -> Iterator[Sequence[InstanceType]]:
        """Iterate over all instances (with or without vectors) in
        this provider

        Parameters
        ----------
        batch_size : int
            The batch size, the generator will return lists with size `batch_size`

        Yields
        -------
        Sequence[InstanceType]
            A sequence of instances with length `batch_size`. The last list may have
            a shorter length.
        """
        chunks = divide_iterable_in_lists(self.values(), batch_size)
        yield from chunks

    def vector_chunker_selector(
        self, keys: Iterable[KT], batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, VT]]]:
        """Iterate over the (key, vector) pairs of the instances whose identifier
        occurs in `keys`. Instances without a vector are skipped.

        Parameters
        ----------
        keys : Iterable[KT]
            The keys that should be chunked

        batch_size : int
            The batch size, the iterator will return lists with size `batch_size`

        Returns
        -------
        Iterator[Sequence[Tuple[KT, VT]]]
            An iterator over sequences of key vector tuples. Each sequence
            has length `batch_size`; the last one may be shorter.
        """
        # Only consider requested keys that actually exist in this provider
        included_ids = frozenset(self.key_list).intersection(keys)
        id_vecs = (
            (elem.identifier, elem.vector)
            for elem in self.values()
            if elem.vector is not None and elem.identifier in included_ids
        )
        chunks = divide_iterable_in_lists(id_vecs, batch_size)
        return chunks

    def vector_chunker(
        self, batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, VT]]]:
        """Iterate over all pairs of keys and vectors in
        this provider

        Parameters
        ----------
        batch_size : int
            The batch size, the generator will return lists with size `batch_size`

        Yields
        -------
        Sequence[Tuple[KT, VT]]
            Sequences of key vector tuples
        """
        yield from self.vector_chunker_selector(self.key_list, batch_size)

    def bulk_get_all(self) -> List[InstanceType]:
        """Returns a list of all instances in this provider.

        Returns
        -------
        List[InstanceType]
            A list of all instances in this provider

        Warning
        -------
        When using this method on very large providers with lazily loaded instances, this
        may yield Out of Memory errors, as all the data will be loaded into RAM.
        Use with caution!
        """
        return list(self.get_all())

    def map(self, func: Callable[[InstanceType], _V]) -> Iterator[_V]:
        """A higher order function that maps any function that works on
        individual :class:`Instance` objects on every contained object in
        this provider.

        Parameters
        ----------
        func : Callable[[InstanceType], _V]
            A function that works on :class:`Instance` objects of type `InstanceType`

        Yields
        -------
        _V
            The values produced by the function `func`
        """
        # The key list is built when iteration starts; every instance is
        # then looked up by key right before `func` is applied to it.
        keys = self.key_list
        for key in keys:
            instance = self[key]
            result = func(instance)
            yield result

    def data_map(self, func: Callable[[DT], _V]) -> Iterator[_V]:
        """A higher order function that maps any function that works on
        individual :class:`~instancelib.typehints.DT` objects (raw data)
        on the data of every :class:`Instance` object in this provider.

        Parameters
        ----------
        func
            The function that should be applied

        Yields
        -------
        _V
            The values produced by the function `func`
        """
        instances = self.values()
        mapped_f = Instance[KT, DT, VT, RT].map_data(func)
        results = map(mapped_f, instances)
        yield from results

    def all_data(self) -> Iterator[DT]:
        """Return all the raw data from the instances in this provider

        Yields
        ------
        DT
            Raw data
        """
        yield from (instance.data for instance in self.values())

    def vectorized_map(
        self,
        func: Callable[[Iterable[InstanceType]], _V],
        batch_size: int = 200,
    ) -> Iterator[_V]:
        """Maps a function that works on multiple instances
        onto all the instances in batches of size `batch_size`.

        Note: If you run a function that combines multiple instances into
        a single result, this may possibly lead to undesirable results if
        batches are not taken into account.

        Parameters
        ----------
        func : Callable[[Iterable[InstanceType]], _V]
            The function that should be applied
        batch_size : int, optional
            The size of the batch, by default 200

        Yields
        -------
        _V
            The result type of the function in parameter `func`
        """
        chunks = divide_iterable_in_lists(self.values(), batch_size)
        results = map(func, chunks)
        yield from results

    def vectorized_data_map(
        self, func: Callable[[Iterable[DT]], _V], batch_size: int = 200
    ) -> Iterator[_V]:
        """Maps a function that works on multiple raw data points
        onto all the instances in batches of size `batch_size`.

        Note: If you run a function that combines multiple instances into
        a single result, this may possibly lead to undesirable results if
        batches are not taken into account.

        Parameters
        ----------
        func
            The function that should be applied
        batch_size : int, optional
            The size of the batch, by default 200

        Yields
        -------
        _V
            The result type of the function in parameter `func`
        """
        chunks = divide_iterable_in_lists(self.values(), batch_size)
        mapped_f = Instance[KT, DT, VT, RT].vectorized_data_map(func)
        results = map(mapped_f, chunks)
        yield from results

    @property
    def type_info(self) -> Optional[TypeInfo]:
        """The type information of the first instance in this provider

        Returns
        -------
        Optional[TypeInfo]
            The :class:`TypeInfo` of the first instance yielded by
            ``self.values()``, or `None` if the provider has no instances.
            Other instances are not inspected.
        """
        try:
            first_item = next(iter(self.values()))
        except StopIteration:
            return None
        return first_item.type_info

    def __repr__(self) -> str:
        result = f"InstanceProvider(length={len(self)})"
        return result

    def __str__(self) -> str:
        return self.__repr__()


def default_instance_viewer(
    ins: Instance[Any, Any, Any, RT]
) -> Mapping[str, RT]:
    """Wrap the representation of an instance in a single-entry mapping

    Parameters
    ----------
    ins : Instance[Any, Any, Any, RT]
        The instance that should be viewed

    Returns
    -------
    Mapping[str, RT]
        A mapping whose only key, ``"data"``, holds ``ins.representation``
    """
    return {"data": ins.representation}


class InstanceProvider(
    MutableMapping[KT, InstanceType],
    ROInstanceProvider[InstanceType, KT, DT, VT, RT],
    ABC,
    Generic[InstanceType, KT, DT, VT, RT],
):
    """The Base InstanceProvider class.

    This class provides an abstract implementation for a dataset.
    The InstanceProvider has five Generic types:

        - :data:`InstanceType` : A subclass of :class:`Instance`
        - :data:`~instancelib.typehints.KT`: The type of the key
        - :data:`~instancelib.typehints.DT`: The type of the data
        - :data:`~instancelib.typehints.VT`: The type of the vector
        - :data:`~instancelib.typehints.RT`: The type of the representation

    Specifying these allows Python to ensure the correctness of your implementation
    and eases further integration in your application.

    Examples
    --------

    Example implementation:

    >>> class TextProvider(InstanceProvider[Instance[int, str, npt.NDArray[Any], str],
    ...                                     int, str, npt.NDArray[Any], str]):
    ...     # Further implementation is needed
    >>> provider = TextProvider()

    Instance access:

    >>> first_key = next(iter(provider))
    >>> first_doc = provider[first_key]

    Set operations:

    >>> new_instance = Instance()  # Replace with your Instance implementation
    >>> provider.add(new_instance)
    >>> provider.discard(new_instance)

    There are a number of :func:`~abc.abstractmethod` that need to be implemented
    in your own implementation. See the source of this file to see what you need to
    implement.
    """

    @abstractmethod
    def add_child(
        self,
        parent: Union[KT, Instance[KT, DT, VT, RT]],
        child: Union[KT, Instance[KT, DT, VT, RT]],
    ) -> None:
        """Register a parent child relation between two instances

        Parameters
        ----------
        parent : Union[KT, Instance[KT, DT, VT, RT]]
            The parent instance (or identifier)
        child : Union[KT, Instance[KT, DT, VT, RT]]
            The child instance (or identifier)
        """
        raise NotImplementedError

    @abstractmethod
    def get_children(
        self, parent: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> Sequence[InstanceType]:
        """Get the children that are registered to this parent

        Parameters
        ----------
        parent : Union[KT, Instance[KT, DT, VT, RT]]
            The parent (or its identifier) of which you want to get the children.

        Returns
        -------
        Sequence[InstanceType]
            A list containing the children
        """
        raise NotImplementedError

    @abstractmethod
    def discard_children(
        self, parent: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> None:
        """Discard the children that are registered to this parent

        Parameters
        ----------
        parent : Union[KT, Instance[KT, DT, VT, RT]]
            The parent (or its identifier) of which the children should be
            discarded.
        """
        raise NotImplementedError

    def get_children_keys(
        self, parent: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> Sequence[KT]:
        """Get the identifiers of the children that are registered to this parent

        Parameters
        ----------
        parent : Union[KT, Instance[KT, DT, VT, RT]]
            The parent (or its identifier) of which you want to get the children.

        Returns
        -------
        Sequence[KT]
            A list containing the identifiers of the children, in the order
            returned by :meth:`get_children`
        """
        return [ins.identifier for ins in self.get_children(parent)]

    @abstractmethod
    def get_parent(
        self, child: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> InstanceType:
        """Get the parent of a child

        Parameters
        ----------
        child : Union[KT, Instance[KT, DT, VT, RT]]
            The child instance (or identifier) of which you want to get
            the parent.

        Returns
        -------
        InstanceType
            The parent of this child instance

        Raises
        ------
        KeyError
            If there is no parent associated with this :class:`Instance`
        """
        raise NotImplementedError

    def add(self, instance: Instance[KT, DT, VT, RT]) -> None:
        """Add an instance to this provider.

        The instance is stored under ``instance.identifier`` through
        ``__setitem__``. What happens when that identifier is already present
        is therefore decided by the ``__setitem__`` of the implementation.

        Parameters
        ----------
        instance : Instance[KT, DT, VT, RT]
            The instance that should be added to the provider
        """
        self.__setitem__(instance.identifier, instance)  # type: ignore

    def add_range(self, *instances: Instance[KT, DT, VT, RT]) -> None:
        """Add multiple instances to this provider.

        Every instance is added with :meth:`add`, in the given order.

        Parameters
        ----------
        *instances : Instance[KT, DT, VT, RT]
            The instances that should be added to the provider
        """
        for instance in instances:
            self.add(instance)

    def discard(self, instance: Instance[KT, DT, VT, RT]) -> None:
        """Remove an instance from this provider. If the
        provider does not contain `instance`, nothing happens.

        Parameters
        ----------
        instance : Instance[KT, DT, VT, RT]
            The instance that should be removed from the provider
        """
        try:
            self.__delitem__(instance.identifier)
        except KeyError:
            pass  # To adhere to Set.discard(...) behavior

    def map_mutate(
        self, func: Callable[[InstanceType], Instance[KT, DT, VT, RT]]
    ) -> None:
        """Apply a function to every instance and store its result
        under the original key

        Parameters
        ----------
        func : Callable[[InstanceType], Instance[KT, DT, VT, RT]]
            A function that receives an instance and returns the instance
            that should replace it
        """
        # Iterate over a list of the keys, because the provider is
        # written to inside the loop.
        keys = self.key_list
        for key in keys:
            instance = self[key]
            upd_instance = func(instance)
            self[key] = upd_instance  # type: ignore

    @abstractmethod
    def create(self, *args: Any, **kwargs: Any) -> InstanceType:
        """Create a new instance of type :data:`InstanceType`.
        The created instance is subsequently added to the provider.

        Note: The number of arguments and keyword arguments may differ
        in actual implementation, so there are no standard arguments.

        Returns
        -------
        InstanceType
            The new instance
        """
        raise NotImplementedError


[docs]class AbstractBucketProvider(
    InstanceProvider[InstanceType, KT, DT, VT, RT],
    ABC,
    Generic[InstanceType, KT, DT, VT, RT],
):
    """This class allows the creation of subsets (`buckets`) from a provider,
    without copying data, while still preserving the :class:`InstanceProvider`
    API.

    For example, in Poolbased Active Learning, the dataset is partitioned
    in several sets; e.g., the `labeled` and `unlabeled` parts of the dataset.
    Or in traditional supervised learning, the train, test and validation sets.
    No data is copied, only a set of identifiers is kept in this provider.
    All data resides in the original provider.

    Attributes
    ----------
    dataset
        The :class:`InstanceProvider` that you want to take a subset from
    """

    dataset: InstanceProvider[InstanceType, KT, DT, VT, RT]
    """The original dataset. All data will remain there"""

    @abstractmethod
    def _add_to_bucket(self, key: KT) -> None:
        """Adds the :class:`Instance` with identifier `key` to the bucket

        Parameters
        ----------
        key : KT
            The identifier for the :class:`Instance` that should be added
        """
        raise NotImplementedError

    @abstractmethod
    def _remove_from_bucket(self, key: KT) -> None:
        """Removes the :class:`Instance` with identifier `key` from the bucket

        Parameters
        ----------
        key : KT
            The identifier for the :class:`Instance` that should be removed
        """
        raise NotImplementedError

    @abstractmethod
    def _in_bucket(self, key: KT) -> bool:
        """Returns if the :class:`Instance` with identifier `key` exists
        within this bucket

        Parameters
        ----------
        key : KT
            The identifier for the :class:`Instance` that should be added
        """
        raise NotImplementedError

    @abstractmethod
    def _clear_bucket(self) -> None:
        """Removes all elements from this bucket"""
        raise NotImplementedError

    @abstractmethod
    def _len_bucket(self) -> int:
        """Returns the number of elements in the buckets

        Returns
        -------
        int
            The size of the bucket
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def _bucket(self) -> Iterable[KT]:
        """Return an iterable of all identifiers in the bucket.

        Returns
        -------
        Iterable[KT]
            An :class:`Iterable` that contains all identifiers
            present in this bucket
        """
        raise NotImplementedError

    def __iter__(self) -> Iterator[KT]:
        yield from self._bucket

    def __getitem__(self, key: KT):
        if self._in_bucket(key):
            return self.dataset[key]
        raise KeyError(
            f"This datapoint with key {key} does not exist in this provider"
        )

    def __setitem__(self, key: KT, value: InstanceType) -> None:
        self._add_to_bucket(key)
        self.dataset[key] = value  # type: ignore

    def __delitem__(self, key: KT) -> None:
        self._remove_from_bucket(key)

    def __len__(self) -> int:
        return self._len_bucket()

    def __contains__(self, key: object) -> bool:
        return self._in_bucket(key)  # type: ignore

[docs]    def get_all(self) -> Iterator[InstanceType]:
        yield from list(self.values())

[docs]    def vector_chunker(
        self, batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, VT]]]:
        results = self.dataset.vector_chunker_selector(
            self.key_list, batch_size
        )
        return results

[docs]    def data_chunker(
        self, batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, DT]]]:
        results = self.dataset.data_chunker_selector(self.key_list, batch_size)
        return results

[docs]    def data_chunker_selector(
        self, keys: Iterable[KT], batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, DT]]]:
        keyset = frozenset(self.key_list).intersection(keys)
        results = self.dataset.data_chunker_selector(keyset, batch_size)
        return results

[docs]    def vector_chunker_selector(
        self, keys: Iterable[KT], batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, VT]]]:
        keyset = frozenset(self.key_list).intersection(keys)
        results = self.dataset.vector_chunker_selector(keyset, batch_size)
        return results

[docs]    def clear(self) -> None:
        self._clear_bucket()

    @property
    def empty(self) -> bool:
        """Tell whether this provider is empty.

        Returns
        -------
        bool
            `True` if the provider evaluates as falsy, `False` otherwise
        """
        if self:
            return False
        return True

[docs]    def add_child(
        self, parent: Union[KT, InstanceType], child: Union[KT, InstanceType]
    ) -> None:
        self.dataset.add_child(parent, child)

[docs]    def get_children_keys(
        self, parent: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> Sequence[KT]:
        return self.dataset.get_children_keys(parent)

[docs]    def get_children(
        self, parent: Union[KT, InstanceType]
    ) -> Sequence[InstanceType]:
        return self.dataset.get_children(parent)

[docs]    def get_parent(self, child: Union[KT, InstanceType]) -> InstanceType:
        return self.dataset.get_parent(child)

[docs]    def discard_children(
        self, parent: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> None:
        return self.dataset.discard_children(parent)

[docs]    def create(self, *args: Any, **kwargs: Any) -> InstanceType:
        new_instance = self.dataset.create(*args, **kwargs)
        self.add(new_instance)
        return new_instance


class SubtractionProvider(
    AbstractBucketProvider[InstanceType, KT, DT, VT, RT],
    ABC,
    Generic[InstanceType, KT, DT, VT, RT],
):
    """Abstract provider that contains every instance of `dataset`
    that is *not* present in `bucket`.

    Instead of enumerating a large subset explicitly, this class describes
    the subset as "the dataset minus a (small) bucket".
    No data is copied, and the :class:`InstanceProvider` API is preserved.

    In some underlying implementations (like a Many to Many relation in
    Django), building a set with many elements takes a lot of time.
    Subtracting a small bucket from the dataset and exposing only the
    remainder avoids that cost.

    A typical use case is the Poolbased Active Learning setting: given a
    small `labeled` set and a huge dataset, subtracting `labeled` from the
    dataset yields an InstanceProvider with all `unlabeled` examples.

    Attributes
    ----------
    dataset
        The :class:`InstanceProvider` that you want to take a subset from

    bucket
        The :class:`InstanceProvider` that you want to exclude from the dataset

    Warning
    -------
    If possible, do not use this class: a solution that is based on only :class:`InstanceProvider` objects
    and :class:`AbstractBucketProvider` will probably be faster.
    """

    bucket: InstanceProvider[InstanceType, KT, DT, VT, RT]
    """The provider that should be excluded from the original `dataset`."""

    @property
    def _bucket(self) -> Iterable[KT]:
        """Iterator over the dataset keys that are absent from `bucket`.

        Both key collections are materialized as frozensets on every
        access, so the iteration order is that of a set.
        """
        remaining = frozenset(self.dataset) - frozenset(self.bucket)
        return iter(remaining)

    def _in_bucket(self, key: KT) -> bool:
        """Return `True` if `key` is in `dataset` but not in `bucket`."""
        # Excluded keys are never members, whatever the dataset says.
        if key in self.bucket:
            return False
        return key in self.dataset

    def _add_to_bucket(self, key: KT) -> None:
        """Make `key` a member by discarding its instance from `bucket`.

        The instance is looked up in `dataset` first, so a key the dataset
        cannot resolve fails at that lookup.
        """
        self.bucket.discard(self.dataset[key])

    def _remove_from_bucket(self, key: KT) -> None:
        """Stop `key` being a member by adding its instance to `bucket`.

        The instance is looked up in `dataset` first, so a key the dataset
        cannot resolve fails at that lookup.
        """
        self.bucket.add(self.dataset[key])

    def _clear_bucket(self) -> None:
        """Do nothing; neither `dataset` nor `bucket` is modified."""
        return None

    def _len_bucket(self) -> int:
        """Count the dataset keys that are absent from `bucket`.

        Both key collections are materialized as frozensets on every call.
        """
        return len(frozenset(self.dataset) - frozenset(self.bucket))

    def create(self, *args: Any, **kwargs: Any) -> InstanceType:
        """Create a new instance through `dataset` and return it.

        All arguments are forwarded unchanged to `self.dataset.create`.
        The result is not explicitly added to this provider.
        """
        return self.dataset.create(*args, **kwargs)

    def clear(self) -> None:
        """Do nothing; neither `dataset` nor `bucket` is modified."""
        return None