# Copyright (C) 2021 The InstanceLib Authors. All Rights Reserved.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import (
Any,
Callable,
FrozenSet,
Generic,
Iterable,
Iterator,
List,
Mapping,
MutableMapping,
Optional,
Sequence,
Tuple,
Type,
TypeVar,
Union,
)
from ..utils.chunks import divide_iterable_in_lists
from ..utils.func import filter_snd_none_zipped
from ..typehints import KT, DT, VT, RT
# Generic result type of the functions that the mapping helpers in this
# module accept (e.g. ``Instance.map_data`` and ``ROInstanceProvider.map``).
_V = TypeVar("_V")
@dataclass
class TypeInfo:
    """Record of the runtime types of the four parts of an instance.

    Attributes
    ----------
    identifier
        The type of the identifier
    data
        The type of the raw data
    vector
        The type of the vector
    representation
        The type of the representation
    """

    identifier: Type
    data: Type
    vector: Type
    representation: Type

    def __repr__(self) -> str:
        """Return a compact description that shows each type by its name."""
        field_names = ("identifier", "data", "vector", "representation")
        described = ", ".join(
            f"{name}={getattr(self, name).__name__}" for name in field_names
        )
        return f"TypeInfo({described})"

    def __str__(self) -> str:
        """Return the same text as :meth:`__repr__`."""
        return self.__repr__()
class Instance(ABC, Generic[KT, DT, VT, RT]):
    """Abstract base class for a single data point.

    Every Instance exposes four properties:

    - A unique identifier (`identifier`)
    - The raw data (`data`)
    - A vector representation of the data (`vector`)
    - A human readable representation (`representation`)

    The class has four Generic types:

    - :data:`~instancelib.typehints.KT`: The type of the key
    - :data:`~instancelib.typehints.DT`: The type of the data
    - :data:`~instancelib.typehints.VT`: The type of the vector
    - :data:`~instancelib.typehints.RT`: The type of the representation

    Combining these four items in a single object enables easy transfer
    between different operations like prediction, annotation and
    transformation.
    """

    @property
    @abstractmethod
    def data(self) -> DT:
        """Return the raw data of this instance.

        Returns
        -------
        DT
            The raw data
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def representation(self) -> RT:
        """Return a representation for annotation.

        Returns
        -------
        RT
            A representation of the raw data
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def vector(self) -> Optional[VT]:
        """Get the vector representation of the raw data.

        Returns
        -------
        Optional[VT]
            The vector, or `None` when no vector is available
        """
        raise NotImplementedError

    @vector.setter
    def vector(self, value: Optional[VT]) -> None:  # type: ignore
        """Set the vector representation of the raw data.

        Parameters
        ----------
        value : Optional[VT]
            A vector value (this may be `None`)

        Note
        ----
        It may be better to use the
        :meth:`InstanceProvider.bulk_add_vectors` method
        if you want to update the vectors of many instances.
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def identifier(self) -> KT:
        """Get the identifier of the instance.

        Returns
        -------
        KT
            The identifier key of the instance
        """
        raise NotImplementedError

    @identifier.setter
    def identifier(self, value: KT) -> None:
        """Set the identifier of the instance.

        Parameters
        ----------
        value : KT
            The new identifier
        """
        raise NotImplementedError

    def __repr__(self) -> str:
        """Return a short description; long data is cut to 20 characters."""
        raw = self.data
        raw_repr = repr(raw)
        # Short data is shown as-is, longer data as a truncated repr.
        shown = raw if len(raw_repr) <= 20 else f"{raw_repr[0:20]} ...'"
        return (
            f"Instance(identifier={self.identifier}, "
            f"data={shown}, "
            f"has_vector={self.vector is not None})"
        )

    def __str__(self) -> str:
        """Return the same text as :meth:`__repr__`."""
        return self.__repr__()

    def to_dict(self) -> Mapping[str, Any]:
        """Return the four properties of this instance as a dictionary.

        Returns
        -------
        Mapping[str, Any]
            A mapping with the keys ``identifier``, ``data``, ``vector``
            and ``representation``
        """
        return {
            "identifier": self.identifier,
            "data": self.data,
            "vector": self.vector,
            "representation": self.representation,
        }

    @staticmethod
    def map_data(
        func: Callable[[DT], _V]
    ) -> Callable[[Instance[KT, DT, VT, RT]], _V]:
        """Lift a function on raw data to a function on :class:`Instance`
        objects.

        Parameters
        ----------
        func : Callable[[DT], _V]
            The function that works on raw data

        Returns
        -------
        Callable[[Instance[KT, DT, VT, RT]], _V]
            A function that applies `func` to the `data` of its argument
        """

        def apply_to_data(instance: Instance[KT, DT, VT, RT]) -> _V:
            return func(instance.data)

        return apply_to_data

    @staticmethod
    def map_vector(
        func: Callable[[VT], _V]
    ) -> Callable[[Instance[KT, DT, VT, RT]], Optional[_V]]:
        """Lift a function on vectors to a function on :class:`Instance`
        objects.

        Parameters
        ----------
        func : Callable[[VT], _V]
            The function that works on vectors

        Returns
        -------
        Callable[[Instance[KT, DT, VT, RT]], Optional[_V]]
            A function that applies `func` to the `vector` of its argument
            and returns `None` when the instance has no vector
        """

        def apply_to_vector(
            instance: Instance[KT, DT, VT, RT]
        ) -> Optional[_V]:
            vec = instance.vector
            return None if vec is None else func(vec)

        return apply_to_vector

    @staticmethod
    def vectorized_data_map(
        func: Callable[[Iterable[DT]], _V]
    ) -> Callable[[Iterable[Instance[KT, DT, VT, RT]]], _V]:
        """Lift a function on sequences of raw data to a function on
        sequences of :class:`Instance` objects.

        Parameters
        ----------
        func : Callable[[Iterable[DT]], _V]
            The function that works on sequences of raw data

        Returns
        -------
        Callable[[Iterable[Instance[KT, DT, VT, RT]]], _V]
            A function that feeds `func` a lazy stream with the `data`
            of every instance it receives
        """

        def apply_to_batch(
            instances: Iterable[Instance[KT, DT, VT, RT]]
        ) -> _V:
            return func(instance.data for instance in instances)

        return apply_to_batch

    @property
    def type_info(self) -> TypeInfo:
        """Return the runtime types of the four properties of this instance.

        Returns
        -------
        TypeInfo
            The result of calling :func:`type` on each property
        """
        return TypeInfo(
            identifier=type(self.identifier),
            data=type(self.data),
            vector=type(self.vector),
            representation=type(self.representation),
        )
# Type variable for the concrete Instance subclass that a provider stores;
# bound to Instance so that providers can rely on the Instance interface.
InstanceType = TypeVar("InstanceType", bound="Instance[Any, Any, Any, Any]")
class ROInstanceProvider(
    Mapping[KT, InstanceType], ABC, Generic[InstanceType, KT, DT, VT, RT]
):
    """The Base InstanceProvider class (ReadOnly).

    This class provides an abstract implementation for a dataset: a
    mapping from identifiers to :class:`Instance` objects.

    The provider has five Generic types:

    - :data:`InstanceType` : A subclass of :class:`Instance`
    - :data:`~instancelib.typehints.KT`: The type of the key
    - :data:`~instancelib.typehints.DT`: The type of the data
    - :data:`~instancelib.typehints.VT`: The type of the vector
    - :data:`~instancelib.typehints.RT`: The type of the representation

    Specifying these allows type checkers to verify the correctness of your
    implementation and eases further integration in your application.

    Examples
    --------
    Example implementation:

    >>> class TextProvider(ROInstanceProvider[Instance[int, str, npt.NDArray[Any], str],
    ...                                       int, str, npt.NDArray[Any], str]):
    ...     # Further implementation is needed
    >>> textprovider = TextProvider()

    Instance access:

    >>> first_key = next(iter(textprovider))
    >>> first_doc = textprovider[first_key]

    There are a number of :func:`~abc.abstractmethod` that need to be
    implemented in your own implementation. See the source of this file to
    see what you need to implement.
    """

    @abstractmethod
    def __contains__(self, item: object) -> bool:
        """Special method that checks if something is contained in this
        provider.

        Parameters
        ----------
        item : object
            The item of which we want to know if it is contained in this
            provider

        Returns
        -------
        bool
            True if the provider contains `item`.

        Examples
        --------
        Example usage; check if the item exists and then retrieve it

        >>> doc_id = 20
        >>> if doc_id in provider:
        ...     instance = provider[doc_id]
        """
        raise NotImplementedError

    @abstractmethod
    def __iter__(self) -> Iterator[KT]:
        """Iterate over the keys of the instances in this provider.

        Yields
        ------
        :class:`KT`
            Keys included in the provider
        """
        raise NotImplementedError

    @property
    def key_list(self) -> List[KT]:
        """Return a list of all instance keys in this provider.

        Returns
        -------
        List[KT]
            A list of instance keys
        """
        all_keys = self.keys()
        return list(all_keys)

    @property
    @abstractmethod
    def empty(self) -> bool:
        """Determines if the provider does not contain instances.

        Returns
        -------
        bool
            True if the provider is empty
        """
        raise NotImplementedError

    @abstractmethod
    def get_all(self) -> Iterator[InstanceType]:
        """Get an iterator that iterates over all instances.

        Yields
        ------
        InstanceType
            Every instance in this provider
        """
        raise NotImplementedError

    @abstractmethod
    def clear(self) -> None:
        """Removes all instances from the provider.

        Warning
        -------
        Use this operation with caution! This operation is intended for
        use with providers that function as temporary user queues, not
        for large proportions of the dataset like `unlabeled` and `labeled`
        sets.
        """
        raise NotImplementedError

    def bulk_add_vectors(
        self, keys: Sequence[KT], values: Sequence[VT]
    ) -> None:
        """Assign the vectors in `values` to the instances specified in
        `keys`.

        In some use cases, vectors are not known beforehand. This library
        provides several :term:`vectorizer` s that convert raw data points
        into feature vector form. Once these vectors are available, they
        can be added to the provider by using this method.

        Parameters
        ----------
        keys
            A sequence of keys
        values
            A sequence of vectors

        Warning
        -------
        We assume that the indices and length of the parameters `keys` and
        `values` match.
        """
        for identifier, vector in zip(keys, values):
            instance = self[identifier]
            instance.vector = vector

    def bulk_get_vectors(
        self, keys: Sequence[KT]
    ) -> Tuple[Sequence[KT], Sequence[VT]]:
        """Given a list of instance `keys`, return the vectors.

        Parameters
        ----------
        keys : Sequence[KT]
            A list of keys

        Returns
        -------
        Tuple[Sequence[KT], Sequence[VT]]
            A tuple of two sequences, one with `keys` and one with
            `vectors`. The indices match, so the instance with ``keys[2]``
            has as vector ``vectors[2]``

        Warning
        -------
        Some underlying implementations do not preserve the ordering of the
        parameter `keys`. Therefore, always use the keys variable from the
        returned tuple for the correct matching.
        """
        key_vector_pairs = ((key, self[key].vector) for key in keys)
        # NOTE(review): judging by its name, the helper drops the pairs whose
        # vector is None and unzips the remainder - confirm in utils.func.
        found_keys, found_vectors = filter_snd_none_zipped(key_vector_pairs)
        return found_keys, found_vectors  # type: ignore

    def data_chunker(
        self, batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, DT]]]:
        """Iterate over the data parts of all instances in this provider.

        Parameters
        ----------
        batch_size : int
            The batch size, the generator will return lists with size
            `batch_size`

        Yields
        -------
        Sequence[Tuple[KT, DT]]
            A sequence of (identifier, data) tuples with length
            `batch_size`. The last list may have a shorter length.
        """
        key_data_pairs = (
            (instance.identifier, instance.data)
            for instance in self.values()
        )
        yield from divide_iterable_in_lists(key_data_pairs, batch_size)

    def data_chunker_selector(
        self, keys: Iterable[KT], batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, DT]]]:
        """Iterate over the data parts of the instances whose identifier
        occurs in `keys`.

        Parameters
        ----------
        keys : Iterable[KT]
            The identifiers of the instances that should be included
        batch_size : int
            The batch size, passed on to the chunking helper

        Yields
        -------
        Sequence[Tuple[KT, DT]]
            Batches of (identifier, data) tuples
        """
        selected = frozenset(keys)
        key_data_pairs = (
            (instance.identifier, instance.data)
            for instance in self.values()
            if instance.identifier in selected
        )
        yield from divide_iterable_in_lists(key_data_pairs, batch_size)

    @property
    def with_vector(self) -> FrozenSet[KT]:
        """Return the keys of all instances whose vector is not `None`."""
        return frozenset(
            key
            for key, instance in self.items()
            if instance.vector is not None
        )

    @property
    def without_vector(self) -> FrozenSet[KT]:
        """Return the keys of all instances whose vector is `None`."""
        return frozenset(
            key for key, instance in self.items() if instance.vector is None
        )

    def instance_chunker_selector(
        self, keys: Iterable[KT], batch_size: int = 200
    ) -> Iterator[Sequence[InstanceType]]:
        """Iterate in batches over the instances that belong to `keys`.

        Parameters
        ----------
        keys : Iterable[KT]
            The identifiers that should be looked up in this provider
        batch_size : int
            The batch size, passed on to the chunking helper

        Yields
        -------
        Sequence[InstanceType]
            A list with the instances for one batch of keys
        """
        for key_batch in divide_iterable_in_lists(keys, batch_size):
            yield [self[identifier] for identifier in key_batch]

    def instance_chunker(
        self, batch_size: int = 200
    ) -> Iterator[Sequence[InstanceType]]:
        """Iterate over all instances (with or without vectors) in
        this provider.

        Parameters
        ----------
        batch_size : int
            The batch size, the generator will return lists with size
            `batch_size`

        Yields
        -------
        Sequence[InstanceType]
            A sequence of instances with length `batch_size`. The last list
            may have a shorter length.
        """
        yield from divide_iterable_in_lists(self.values(), batch_size)

    def vector_chunker_selector(
        self, keys: Iterable[KT], batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, VT]]]:
        """Iterate over the key vector pairs of the instances that belong
        to `keys` and have a vector.

        Parameters
        ----------
        keys : Iterable[KT]
            The keys that should be chunked
        batch_size : int
            The batch size, the iterator will return lists with size
            `batch_size`

        Returns
        -------
        Iterator[Sequence[Tuple[KT, VT]]]
            An iterator over sequences of key vector tuples. Instances
            without a vector and keys that are not in this provider are
            skipped.
        """
        # Only keys that are present in this provider can be selected.
        selected = frozenset(self.key_list).intersection(keys)
        key_vector_pairs = (
            (instance.identifier, instance.vector)
            for instance in self.values()
            if instance.vector is not None and instance.identifier in selected
        )
        return divide_iterable_in_lists(key_vector_pairs, batch_size)

    def vector_chunker(
        self, batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, VT]]]:
        """Iterate over all pairs of keys and vectors in this provider.

        Parameters
        ----------
        batch_size : int
            The batch size, the generator will return lists with size
            `batch_size`

        Yields
        -------
        Sequence[Tuple[KT, VT]]
            Sequences of key vector tuples
        """
        every_key = self.key_list
        yield from self.vector_chunker_selector(every_key, batch_size)

    def bulk_get_all(self) -> List[InstanceType]:
        """Returns a list of all instances in this provider.

        Returns
        -------
        List[InstanceType]
            A list of all instances in this provider

        Warning
        -------
        When using this method on very large providers with lazily loaded
        instances, this may yield Out of Memory errors, as all the data
        will be loaded into RAM. Use with caution!
        """
        every_instance = self.get_all()
        return list(every_instance)

    def map(self, func: Callable[[InstanceType], _V]) -> Iterator[_V]:
        """A higher order function that maps any function that works on
        individual :class:`Instance` objects on every contained object in
        this provider.

        Parameters
        ----------
        func : Callable[[InstanceType], _V]
            A function that works on :class:`Instance` objects of type
            `InstanceType`

        Yields
        -------
        _V
            The values produced by the function `func`
        """
        # The key list is taken once, before the first instance is fetched.
        for identifier in self.key_list:
            yield func(self[identifier])

    def data_map(self, func: Callable[[DT], _V]) -> Iterator[_V]:
        """A higher order function that maps any function that works on
        individual :class:`~instancelib.typehints.DT` objects
        on the data of every :class:`Instance` object in this provider.

        Parameters
        ----------
        func
            The function that should be applied

        Yields
        -------
        _V
            The values produced by the function `func`
        """
        on_instance = Instance[KT, DT, VT, RT].map_data(func)
        for instance in self.values():
            yield on_instance(instance)

    def all_data(self) -> Iterator[DT]:
        """Return all the raw data from the instances in this provider.

        Yields
        ------
        DT
            Raw data
        """
        for instance in self.values():
            yield instance.data

    def vectorized_map(
        self,
        func: Callable[[Iterable[InstanceType]], _V],
        batch_size: int = 200,
    ) -> Iterator[_V]:
        """Maps a function that works on multiple instances
        onto all the instances in batches of size `batch_size`.

        Note: If you run a function that combines multiple instances into
        a single result, this may possibly lead to undesirable results if
        batches are not taken into account.

        Parameters
        ----------
        func : Callable[[Iterable[InstanceType]], _V]
            The function that should be applied
        batch_size : int, optional
            The size of the batch, by default 200

        Yields
        -------
        _V
            The result type of the function in parameter `func`
        """
        for batch in divide_iterable_in_lists(self.values(), batch_size):
            yield func(batch)

    def vectorized_data_map(
        self, func: Callable[[Iterable[DT]], _V], batch_size: int = 200
    ) -> Iterator[_V]:
        """Maps a function that works on multiple raw data points
        onto all the instances in batches of size `batch_size`.

        Note: If you run a function that combines multiple instances into
        a single result, this may possibly lead to undesirable results if
        batches are not taken into account.

        Parameters
        ----------
        func
            The function that should be applied
        batch_size : int, optional
            The size of the batch, by default 200

        Yields
        -------
        _V
            The result type of the function in parameter `func`
        """
        on_batch = Instance[KT, DT, VT, RT].vectorized_data_map(func)
        for batch in divide_iterable_in_lists(self.values(), batch_size):
            yield on_batch(batch)

    @property
    def type_info(self) -> Optional[TypeInfo]:
        """Return the :class:`TypeInfo` of the first instance.

        Returns
        -------
        Optional[TypeInfo]
            The type information of the first value of this provider, or
            `None` if the provider yields no values
        """
        value_iterator = iter(self.values())
        try:
            first_instance = next(value_iterator)
        except StopIteration:
            return None
        else:
            return first_instance.type_info

    def __repr__(self) -> str:
        """Return a short description that mentions the number of items."""
        return f"InstanceProvider(length={len(self)})"

    def __str__(self) -> str:
        """Return the same text as :meth:`__repr__`."""
        return self.__repr__()
def default_instance_viewer(
    ins: Instance[Any, Any, Any, RT]
) -> Mapping[str, RT]:
    """Build the default view of an instance.

    Parameters
    ----------
    ins : Instance[Any, Any, Any, RT]
        The instance that should be viewed

    Returns
    -------
    Mapping[str, RT]
        A dictionary with the single key ``data`` that holds the
        `representation` of the instance
    """
    view = dict(data=ins.representation)
    return view
class InstanceProvider(
    MutableMapping[KT, InstanceType],
    ROInstanceProvider[InstanceType, KT, DT, VT, RT],
    ABC,
    Generic[InstanceType, KT, DT, VT, RT],
):
    """The Base InstanceProvider class.

    This class provides an abstract implementation for a dataset.

    The InstanceProvider has five Generic types:

    - :data:`InstanceType` : A subclass of :class:`Instance`
    - :data:`~instancelib.typehints.KT`: The type of the key
    - :data:`~instancelib.typehints.DT`: The type of the data
    - :data:`~instancelib.typehints.VT`: The type of the vector
    - :data:`~instancelib.typehints.RT`: The type of the representation

    Specifying these allows type checkers to verify the correctness of your
    implementation and eases further integration in your application.

    Examples
    --------
    Example implementation:

    >>> class TextProvider(InstanceProvider[Instance[int, str, npt.NDArray[Any], str],
    ...                                     int, str, npt.NDArray[Any], str]):
    ...     # Further implementation is needed
    >>> textprovider = TextProvider()

    Instance access:

    >>> first_key = next(iter(textprovider))
    >>> first_doc = textprovider[first_key]

    Set operations:

    >>> new_instance = textprovider.create()  # Arguments differ per implementation
    >>> textprovider.add(new_instance)
    >>> textprovider.discard(new_instance)

    There are a number of :func:`~abc.abstractmethod` that need to be
    implemented in your own implementation. See the source of this file to
    see what you need to implement.
    """

    @abstractmethod
    def add_child(
        self,
        parent: Union[KT, Instance[KT, DT, VT, RT]],
        child: Union[KT, Instance[KT, DT, VT, RT]],
    ) -> None:
        """Register a parent child relation between two instances.

        Parameters
        ----------
        parent : Union[KT, Instance[KT, DT, VT, RT]]
            The parent instance (or identifier)
        child : Union[KT, Instance[KT, DT, VT, RT]]
            The child instance (or identifier)
        """
        raise NotImplementedError

    @abstractmethod
    def get_children(
        self, parent: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> Sequence[InstanceType]:
        """Get the children that are registered to this parent.

        Parameters
        ----------
        parent : Union[KT, Instance[KT, DT, VT, RT]]
            The parent from which you want to get the children.

        Returns
        -------
        Sequence[InstanceType]
            A list containing the children
        """
        raise NotImplementedError

    @abstractmethod
    def discard_children(
        self, parent: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> None:
        """Discard the children that are registered to this parent.

        Parameters
        ----------
        parent : Union[KT, Instance[KT, DT, VT, RT]]
            The parent whose children should be discarded.
        """
        raise NotImplementedError

    def get_children_keys(
        self, parent: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> Sequence[KT]:
        """Get the identifiers of the children that are registered to this
        parent.

        Parameters
        ----------
        parent : Union[KT, Instance[KT, DT, VT, RT]]
            The parent from which you want to get the children.

        Returns
        -------
        Sequence[KT]
            A list containing the identifiers of the children
        """
        children = self.get_children(parent)
        return [child.identifier for child in children]

    @abstractmethod
    def get_parent(
        self, child: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> InstanceType:
        """Get the parent of a child.

        Parameters
        ----------
        child : Union[KT, Instance[KT, DT, VT, RT]]
            A child instance (or identifier) from which you want to get
            the parent.

        Returns
        -------
        InstanceType
            The parent of this child instance

        Raises
        ------
        KeyError
            If there is no parent associated with this :class:`Instance`
        """
        raise NotImplementedError

    def add(self, instance: Instance[KT, DT, VT, RT]) -> None:
        """Add an instance to this provider.

        The instance is stored through item assignment, with
        `instance.identifier` as the key.

        Parameters
        ----------
        instance : Instance[KT, DT, VT, RT]
            The instance that should be added to the provider
        """
        self[instance.identifier] = instance  # type: ignore

    def add_range(self, *instances: Instance[KT, DT, VT, RT]) -> None:
        """Add multiple instances to this provider.

        Every instance is passed to :meth:`add`, in the given order.

        Parameters
        ----------
        *instances : Instance[KT, DT, VT, RT]
            The instances that should be added to the provider
        """
        for new_instance in instances:
            self.add(new_instance)

    def discard(self, instance: Instance[KT, DT, VT, RT]) -> None:
        """Remove an instance from this provider. If the
        provider does not contain `instance`, nothing happens.

        Parameters
        ----------
        instance : Instance[KT, DT, VT, RT]
            The instance that should be removed from the provider
        """
        try:
            del self[instance.identifier]
        except KeyError:
            # A missing key is ignored to adhere to Set.discard(...) behavior
            pass

    def map_mutate(
        self, func: Callable[[InstanceType], Instance[KT, DT, VT, RT]]
    ) -> None:
        """Run a function on every instance and store the result under the
        key of the original instance.

        Parameters
        ----------
        func : Callable[[InstanceType], Instance[KT, DT, VT, RT]]
            A function that returns the updated instance
        """
        # The key list is taken once, so reassignments do not affect the loop.
        for identifier in self.key_list:
            self[identifier] = func(self[identifier])  # type: ignore

    @abstractmethod
    def create(self, *args: Any, **kwargs: Any) -> InstanceType:
        """Create a new instance of type :data:`InstanceType`.

        The created instance is subsequently added to the provider.

        Note: The number of arguments and keyword arguments may differ
        in actual implementation, so there are no standard arguments.

        Returns
        -------
        InstanceType
            The new instance
        """
        raise NotImplementedError
class AbstractBucketProvider(
    InstanceProvider[InstanceType, KT, DT, VT, RT],
    ABC,
    Generic[InstanceType, KT, DT, VT, RT],
):
    """This class allows the creation of subsets (`buckets`) from a provider,
    without copying data, while still preserving the :class:`InstanceProvider`
    API.

    For example, in Poolbased Active Learning, the dataset is partitioned
    in several sets; e.g., the `labeled` and `unlabeled` parts of the dataset.
    Or in traditional supervised learning, the train, test and validation
    sets.

    No data is copied, only a set of identifiers is kept in this provider.
    All data resides in the original provider.

    Attributes
    ----------
    dataset
        The :class:`InstanceProvider` that you want to take a subset from
    """

    dataset: InstanceProvider[InstanceType, KT, DT, VT, RT]
    """The original dataset. All data will remain there"""

    @abstractmethod
    def _add_to_bucket(self, key: KT) -> None:
        """Adds the :class:`Instance` with identifier `key` to the bucket.

        Parameters
        ----------
        key : KT
            The identifier for the :class:`Instance` that should be added
        """
        raise NotImplementedError

    @abstractmethod
    def _remove_from_bucket(self, key: KT) -> None:
        """Removes the :class:`Instance` with identifier `key` from the
        bucket.

        Parameters
        ----------
        key : KT
            The identifier for the :class:`Instance` that should be removed
        """
        raise NotImplementedError

    @abstractmethod
    def _in_bucket(self, key: KT) -> bool:
        """Returns if the :class:`Instance` with identifier `key` exists
        within this bucket.

        Parameters
        ----------
        key : KT
            The identifier for the :class:`Instance` that should be checked
        """
        raise NotImplementedError

    @abstractmethod
    def _clear_bucket(self) -> None:
        """Removes all elements from this bucket."""
        raise NotImplementedError

    @abstractmethod
    def _len_bucket(self) -> int:
        """Returns the number of elements in the bucket.

        Returns
        -------
        int
            The size of the bucket
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def _bucket(self) -> Iterable[KT]:
        """Return an iterable of all identifiers in the bucket.

        Returns
        -------
        Iterable[KT]
            An :class:`Iterable` that contains all identifiers
            present in this bucket
        """
        raise NotImplementedError

    def __iter__(self) -> Iterator[KT]:
        """Iterate over the identifiers in the bucket."""
        for identifier in self._bucket:
            yield identifier

    def __getitem__(self, key: KT):
        """Fetch the instance for `key` from `dataset`.

        Raises
        ------
        KeyError
            If `key` is not in the bucket
        """
        if not self._in_bucket(key):
            raise KeyError(
                f"This datapoint with key {key} does not exist in this provider"
            )
        return self.dataset[key]

    def __setitem__(self, key: KT, value: InstanceType) -> None:
        """Add `key` to the bucket and store `value` in `dataset`."""
        # The bucket is updated first, then the underlying dataset.
        self._add_to_bucket(key)
        self.dataset[key] = value  # type: ignore

    def __delitem__(self, key: KT) -> None:
        """Remove `key` from the bucket; `dataset` is left untouched."""
        self._remove_from_bucket(key)

    def __len__(self) -> int:
        """Return the number of identifiers in the bucket."""
        return self._len_bucket()

    def __contains__(self, key: object) -> bool:
        """Return whether `key` is in the bucket."""
        return self._in_bucket(key)  # type: ignore

    def get_all(self) -> Iterator[InstanceType]:
        """Iterate over all instances in the bucket.

        Yields
        ------
        InstanceType
            Every instance of this provider. The values are collected in a
            list before the first one is yielded.
        """
        snapshot = list(self.values())
        yield from snapshot

    def vector_chunker(
        self, batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, VT]]]:
        """Iterate over the key vector pairs of the bucket by delegating
        to ``dataset.vector_chunker_selector`` with the keys of the bucket.

        Parameters
        ----------
        batch_size : int
            The batch size that is passed on to `dataset`

        Returns
        -------
        Iterator[Sequence[Tuple[KT, VT]]]
            An iterator over sequences of key vector tuples
        """
        own_keys = self.key_list
        return self.dataset.vector_chunker_selector(own_keys, batch_size)

    def data_chunker(
        self, batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, DT]]]:
        """Iterate over the key data pairs of the bucket by delegating
        to ``dataset.data_chunker_selector`` with the keys of the bucket.

        Parameters
        ----------
        batch_size : int
            The batch size that is passed on to `dataset`

        Returns
        -------
        Iterator[Sequence[Tuple[KT, DT]]]
            An iterator over sequences of key data tuples
        """
        own_keys = self.key_list
        return self.dataset.data_chunker_selector(own_keys, batch_size)

    def data_chunker_selector(
        self, keys: Iterable[KT], batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, DT]]]:
        """Iterate over the key data pairs of the instances that are both
        in `keys` and in the bucket.

        Parameters
        ----------
        keys : Iterable[KT]
            The requested identifiers
        batch_size : int
            The batch size that is passed on to `dataset`

        Returns
        -------
        Iterator[Sequence[Tuple[KT, DT]]]
            An iterator over sequences of key data tuples
        """
        # Restrict the request to the keys that are part of this bucket.
        selected = frozenset(self.key_list).intersection(keys)
        return self.dataset.data_chunker_selector(selected, batch_size)

    def vector_chunker_selector(
        self, keys: Iterable[KT], batch_size: int = 200
    ) -> Iterator[Sequence[Tuple[KT, VT]]]:
        """Iterate over the key vector pairs of the instances that are both
        in `keys` and in the bucket.

        Parameters
        ----------
        keys : Iterable[KT]
            The requested identifiers
        batch_size : int
            The batch size that is passed on to `dataset`

        Returns
        -------
        Iterator[Sequence[Tuple[KT, VT]]]
            An iterator over sequences of key vector tuples
        """
        # Restrict the request to the keys that are part of this bucket.
        selected = frozenset(self.key_list).intersection(keys)
        return self.dataset.vector_chunker_selector(selected, batch_size)

    def clear(self) -> None:
        """Remove all identifiers from the bucket."""
        self._clear_bucket()

    @property
    def empty(self) -> bool:
        """Return True if the truth value of this provider is False."""
        return not bool(self)

    def add_child(
        self, parent: Union[KT, InstanceType], child: Union[KT, InstanceType]
    ) -> None:
        """Register a parent child relation in `dataset`."""
        self.dataset.add_child(parent, child)

    def get_children_keys(
        self, parent: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> Sequence[KT]:
        """Return the identifiers of the children of `parent`, as reported
        by `dataset`."""
        return self.dataset.get_children_keys(parent)

    def get_children(
        self, parent: Union[KT, InstanceType]
    ) -> Sequence[InstanceType]:
        """Return the children of `parent`, as reported by `dataset`."""
        return self.dataset.get_children(parent)

    def get_parent(self, child: Union[KT, InstanceType]) -> InstanceType:
        """Return the parent of `child`, as reported by `dataset`."""
        return self.dataset.get_parent(child)

    def discard_children(
        self, parent: Union[KT, Instance[KT, DT, VT, RT]]
    ) -> None:
        """Discard the children of `parent` in `dataset`."""
        return self.dataset.discard_children(parent)

    def create(self, *args: Any, **kwargs: Any) -> InstanceType:
        """Create a new instance in `dataset` and add it to the bucket.

        All arguments are passed on to ``dataset.create``.

        Returns
        -------
        InstanceType
            The new instance
        """
        created = self.dataset.create(*args, **kwargs)
        self.add(created)
        return created
class SubtractionProvider(
    AbstractBucketProvider[InstanceType, KT, DT, VT, RT],
    ABC,
    Generic[InstanceType, KT, DT, VT, RT],
):
    """This abstract class allows the creation of large subsets (`buckets`)
    that do not contain some elements, specified in a `bucket`.

    No data is copied, however, the :class:`InstanceProvider` API is
    preserved. In some underlying implementations (like a Many to Many
    relation in Django), the creation of a large elements set takes a lot of
    time. This class allows you to subtract a (small) bucket from the dataset
    and include only the remainder.

    This method can be used in the Poolbased Active Learning setting; suppose
    you have a small `labeled` set and a huge dataset.
    You can subtract the `labeled` from the dataset and create an
    InstanceProvider that contains all `unlabeled` examples.

    Attributes
    ----------
    dataset
        The :class:`InstanceProvider` that you want to take a subset from
    bucket
        The :class:`InstanceProvider` that you want to exclude from the
        dataset

    Warning
    -------
    If possible, do not use this class: a solution that is based on only
    :class:`InstanceProvider` objects and :class:`AbstractBucketProvider`
    will probably be faster.
    """

    bucket: InstanceProvider[InstanceType, KT, DT, VT, RT]
    """The provider that should be excluded from the original `dataset`."""

    def _remaining_keys(self) -> FrozenSet[KT]:
        """Return the keys of `dataset` that do not occur in `bucket`."""
        dataset_keys = frozenset(self.dataset)
        excluded_keys = frozenset(self.bucket)
        return dataset_keys.difference(excluded_keys)

    @property
    def _bucket(self) -> Iterable[KT]:
        """Return an iterator over the keys of `dataset` minus `bucket`."""
        return iter(self._remaining_keys())

    def _in_bucket(self, key: KT) -> bool:
        """Return True if `key` is in `dataset` but not in `bucket`."""
        return key not in self.bucket and key in self.dataset

    def _add_to_bucket(self, key: KT) -> None:
        """Include `key` in this provider by removing it from `bucket`.

        The key is deleted from `bucket` directly instead of looking up the
        instance in `dataset` first: ``__setitem__`` calls this method
        before it writes to `dataset`, so the lookup raised a
        :class:`KeyError` for instances that are new to `dataset`.

        Parameters
        ----------
        key : KT
            The identifier that should no longer be excluded
        """
        try:
            del self.bucket[key]
        except KeyError:
            # The key was not excluded, so there is nothing to undo.
            pass

    def _remove_from_bucket(self, key: KT) -> None:
        """Exclude `key` from this provider by adding its instance to
        `bucket`.

        Parameters
        ----------
        key : KT
            The identifier that should be excluded

        Raises
        ------
        KeyError
            Propagated from the lookup in `dataset` if `key` is missing
            there
        """
        self.bucket.add(self.dataset[key])

    def _clear_bucket(self) -> None:
        """Do nothing: neither `dataset` nor `bucket` is modified."""

    def _len_bucket(self) -> int:
        """Return the number of keys in `dataset` that are not in
        `bucket`."""
        return len(self._remaining_keys())

    def create(self, *args: Any, **kwargs: Any) -> InstanceType:
        """Create a new instance in `dataset` and return it.

        All arguments are passed on to ``dataset.create``. The instance is
        not added to `bucket` here.

        Returns
        -------
        InstanceType
            The new instance
        """
        return self.dataset.create(*args, **kwargs)

    def clear(self) -> None:
        """Do nothing: neither `dataset` nor `bucket` is modified."""