# Source code for instancelib.machinelearning.base

# Copyright (C) 2021 The InstanceLib Authors. All Rights Reserved.

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.


from __future__ import annotations

from abc import ABC, abstractmethod
from typing import (
    FrozenSet,
    Generic,
    Iterable,
    Iterator,
    Optional,
    Sequence,
    Tuple,
    TypeVar,
    Any,
    Union,
)

from ..labels import LabelProvider
from ..instances import Instance, InstanceProvider

from ..typehints import KT, VT, DT, RT, LT, LMT, PMT

# Type variable for the concrete instance type handled by a provider.
# It is bound to ``Instance`` (with unconstrained type arguments) and is
# declared covariant.
IT = TypeVar("IT", bound="Instance[Any,Any,Any,Any]", covariant=True)

# Input accepted by the ``predict``, ``predict_proba`` and
# ``predict_proba_raw`` methods below: either an ``InstanceProvider`` or a
# plain iterable of ``Instance`` objects.
InstanceInput = Union[
    InstanceProvider[IT, KT, DT, VT, RT], Iterable[Instance[KT, DT, VT, RT]]
]


class AbstractClassifier(ABC, Generic[IT, KT, DT, VT, RT, LT, LMT, PMT]):
    """This class provides an interface that can be used to connect your
    model to :class:`~instancelib.InstanceProvider`,
    :class:`~instancelib.LabelProvider`, and
    :class:`~instancelib.Environment` objects.

    The main methods of this class are listed below:

    - :meth:`fit_provider`: Fit a classifier on training instances
    - :meth:`predict`: Predict the class labels for (unseen) instances
    - :meth:`predict_proba`: Predict the class labels and corresponding
      probabilities
    - :meth:`predict_proba_raw`: Predict the class probabilities and return
      them in matrix form

    Examples
    --------
    Fit a classifier on train data:

    >>> model.fit_provider(train, env.labels)

    Predict the class labels for a list of instances:

    >>> model.predict([ins])
    [(20, frozenset({"Games"}))]

    Return the class labels and probabilities:

    >>> model.predict_proba(test)
    [(20, frozenset({("Games", 0.66), ("Bedrijfsnieuws", 0.22), ("Smartphones", 0.12)})), ... ]

    Return the raw prediction matrix:

    >>> preds = model.predict_proba_raw(test, batch_size=512)
    >>> next(preds)
    ([3, 4, 5, ...], array([[0.143, 0.622, 0.233],
                            [0.278, 0.546, 0.175],
                            [0.726, 0.126, 0.146],
                            ...]))
    """

    # Default value returned by the ``name`` property.
    _name = "AbstractClassifier"

    @abstractmethod
    def get_label_column_index(self, label: LT) -> int:
        """Return the column in which the labels are stored in the label
        and prediction matrices

        Parameters
        ----------
        label : LT
            The label

        Returns
        -------
        int
            The column index of the label
        """
        raise NotImplementedError

    @abstractmethod
    def set_target_labels(self, labels: Iterable[LT]) -> None:
        """Set the target labels of the classifier

        Parameters
        ----------
        labels : Iterable[LT]
            The class labels that the classifier can predict
        """
        raise NotImplementedError

    @abstractmethod
    def predict_instances(
        self,
        instances: Iterable[Instance[KT, DT, VT, RT]],
        batch_size: int = 200,
    ) -> Sequence[Tuple[KT, FrozenSet[LT]]]:
        """Predict the labels for a :term:`iterable` of instances

        Parameters
        ----------
        instances : Iterable[Instance[KT, DT, VT, RT]]
            The instances
        batch_size : int, optional
            The batch size, by default 200

        Returns
        -------
        Sequence[Tuple[KT, FrozenSet[LT]]]
            A sequence of (identifier, prediction) pairs
        """
        raise NotImplementedError

    @abstractmethod
    def predict_provider(
        self,
        provider: InstanceProvider[IT, KT, DT, VT, RT],
        batch_size: int = 200,
    ) -> Sequence[Tuple[KT, FrozenSet[LT]]]:
        """Predict the labels for all instances in an
        :class:`InstanceProvider`.

        Parameters
        ----------
        provider : InstanceProvider[IT, KT, DT, VT, RT]
            The instanceprovider
        batch_size : int, optional
            The batch size, by default 200

        Returns
        -------
        Sequence[Tuple[KT, FrozenSet[LT]]]
            A sequence of (identifier, prediction) pairs
        """
        raise NotImplementedError

    @abstractmethod
    def predict_proba_provider(
        self,
        provider: InstanceProvider[IT, KT, DT, VT, RT],
        batch_size: int = 200,
    ) -> Sequence[Tuple[KT, FrozenSet[Tuple[LT, float]]]]:
        """Predict the labels for each instance in the provider and
        return the probability for each label.

        Parameters
        ----------
        provider : InstanceProvider[IT, KT, DT, VT, RT]
            The provider
        batch_size : int, optional
            The batch size, by default 200

        Returns
        -------
        Sequence[Tuple[KT, FrozenSet[Tuple[LT, float]]]]
            A sequence of tuples consisting of:

            - The instance identifier
            - The class labels and their probabilities
        """
        raise NotImplementedError

    @abstractmethod
    def predict_proba_provider_raw(
        self,
        provider: InstanceProvider[IT, KT, DT, VT, RT],
        batch_size: int = 200,
    ) -> Iterator[Tuple[Sequence[KT], PMT]]:
        """Generator function that predicts the labels for each instance
        in the provider.

        The generator lazy evaluates the prediction function on batches of
        instances and yields class probabilities in matrix form.

        Parameters
        ----------
        provider : InstanceProvider[IT, KT, DT, VT, RT]
            The input InstanceProvider
        batch_size : int, optional
            The batch size in which instances are processed, by default 200
            This also influences the shape of the resulting probability
            matrix.

        Yields
        ------
        Tuple[Sequence[KT], PMT]
            Tuples consisting of:

            - A sequence of keys that match the rows of the probability
              matrix
            - The Probability matrix with shape ``(len(keys), n_labels)``
        """
        raise NotImplementedError

    @abstractmethod
    def predict_proba_instances(
        self,
        instances: Iterable[Instance[KT, DT, VT, RT]],
        batch_size: int = 200,
    ) -> Sequence[Tuple[KT, FrozenSet[Tuple[LT, float]]]]:
        """Predict the labels for each of the given instances and
        return the probability for each label.

        Parameters
        ----------
        instances : Iterable[Instance[KT, DT, VT, RT]]
            Input instances
        batch_size : int, optional
            The batch size, by default 200

        Returns
        -------
        Sequence[Tuple[KT, FrozenSet[Tuple[LT, float]]]]
            A sequence of tuples consisting of:

            - The instance identifier
            - The class labels and their probabilities
        """
        raise NotImplementedError

    @abstractmethod
    def predict_proba_instances_raw(
        self,
        instances: Iterable[Instance[KT, DT, VT, RT]],
        batch_size: int = 200,
    ) -> Iterator[Tuple[Sequence[KT], PMT]]:
        """Generator function that predicts the labels for each instance.

        The generator lazy evaluates the prediction function on batches of
        instances and yields class probabilities in matrix form.

        Parameters
        ----------
        instances : Iterable[Instance[KT, DT, VT, RT]]
            Input instances
        batch_size : int, optional
            The batch size in which instances are processed, by default 200
            This also influences the shape of the resulting probability
            matrix.

        Yields
        ------
        Tuple[Sequence[KT], PMT]
            Tuples consisting of:

            - A sequence of keys that match the rows of the probability
              matrix
            - The Probability matrix with shape ``(len(keys), n_labels)``
        """
        raise NotImplementedError

    @abstractmethod
    def fit_provider(
        self,
        provider: InstanceProvider[IT, KT, DT, VT, RT],
        labels: LabelProvider[KT, LT],
        batch_size: int = 200,
    ) -> None:
        """Fit the classifier with the instances found in the
        :class:`InstanceProvider` based on the labels in the
        :class:`LabelProvider`

        Parameters
        ----------
        provider : InstanceProvider[IT, KT, DT, VT, RT]
            The provider that contains the training data
        labels : LabelProvider[KT, LT]
            The provider that contains the labels of the training data
        batch_size : int, optional
            A batch size for the training process, by default 200
        """
        raise NotImplementedError

    def fit_val_provider(
        self,
        provider: InstanceProvider[IT, KT, DT, VT, RT],
        labels: LabelProvider[KT, LT],
        validation: Optional[InstanceProvider[IT, KT, DT, VT, RT]] = None,
        batch_size: int = 200,
    ) -> None:
        """Fit the classifier, optionally given a validation provider.

        This default implementation does **not** use ``validation``: it
        simply forwards ``provider``, ``labels`` and ``batch_size`` to
        :meth:`fit_provider`. Override this method in a subclass if the
        validation data should influence training.

        Parameters
        ----------
        provider : InstanceProvider[IT, KT, DT, VT, RT]
            The provider that contains the training data
        labels : LabelProvider[KT, LT]
            The provider that contains the labels of the training data
        validation : Optional[InstanceProvider[IT, KT, DT, VT, RT]], optional
            A provider with validation data, by default None.
            Ignored by this default implementation.
        batch_size : int, optional
            A batch size for the training process, by default 200
        """
        return self.fit_provider(provider, labels, batch_size=batch_size)

    @abstractmethod
    def fit_instances(
        self,
        instances: Iterable[Instance[KT, DT, VT, RT]],
        labels: Iterable[Iterable[LT]],
    ) -> None:
        """Fit the classifier with the instances and accompanied labels
        found in the arguments.

        Parameters
        ----------
        instances : Iterable[Instance[KT, DT, VT, RT]]
            The train data
        labels : Iterable[Iterable[LT]]
            The labels of the train data
        """
        raise NotImplementedError

    @property
    def name(self) -> str:
        """The name of the classifier

        Returns
        -------
        str
            A name that can be used to identify the classifier
        """
        return self._name

    @property
    @abstractmethod
    def fitted(self) -> bool:
        """Return true if the classifier has been fitted

        Returns
        -------
        bool
            True if the classifier has been fitted
        """
        # Raise like every other abstract member instead of silently
        # returning ``None`` from a member annotated as ``bool``.
        raise NotImplementedError

    def predict(
        self,
        instances: InstanceInput[IT, KT, DT, VT, RT],
        batch_size: int = 200,
    ) -> Sequence[Tuple[KT, FrozenSet[LT]]]:
        """Predict the labels on input instances.

        Dispatches to :meth:`predict_provider` when ``instances`` is an
        :class:`InstanceProvider`, and to :meth:`predict_instances`
        otherwise.

        Parameters
        ----------
        instances : InstanceInput[IT, KT, DT, VT, RT]
            An :class:`InstanceProvider` or :class:`Iterable` of
            :class:`Instance` objects.
        batch_size : int, optional
            A batch size, by default 200

        Returns
        -------
        Sequence[Tuple[KT, FrozenSet[LT]]]
            A sequence of (identifier, predicted labels) pairs
        """
        if isinstance(instances, InstanceProvider):
            typed_provider: InstanceProvider[IT, KT, DT, VT, RT] = instances  # type: ignore
            return self.predict_provider(typed_provider, batch_size)
        return self.predict_instances(instances, batch_size)

    def predict_proba(
        self,
        instances: InstanceInput[IT, KT, DT, VT, RT],
        batch_size: int = 200,
    ) -> Sequence[Tuple[KT, FrozenSet[Tuple[LT, float]]]]:
        """Predict the labels and corresponding probabilities on input
        instances.

        Dispatches to :meth:`predict_proba_provider` when ``instances`` is
        an :class:`InstanceProvider`, and to
        :meth:`predict_proba_instances` otherwise.

        Parameters
        ----------
        instances : InstanceInput[IT, KT, DT, VT, RT]
            An :class:`InstanceProvider` or :class:`Iterable` of
            :class:`Instance` objects.
        batch_size : int, optional
            A batch size, by default 200

        Returns
        -------
        Sequence[Tuple[KT, FrozenSet[Tuple[LT, float]]]]
            A sequence of tuples consisting of:

            - The instance identifier
            - The class labels and their probabilities
        """
        if isinstance(instances, InstanceProvider):
            typed_provider: InstanceProvider[IT, KT, DT, VT, RT] = instances  # type: ignore
            return self.predict_proba_provider(typed_provider, batch_size)
        return self.predict_proba_instances(instances, batch_size)

    def predict_proba_raw(
        self,
        instances: InstanceInput[IT, KT, DT, VT, RT],
        batch_size: int = 200,
    ) -> Iterator[Tuple[Sequence[KT], PMT]]:
        """Predict the class probabilities for each instance and return
        them in matrix form.

        Dispatches to :meth:`predict_proba_provider_raw` when ``instances``
        is an :class:`InstanceProvider`, and to
        :meth:`predict_proba_instances_raw` otherwise. The iterator
        produced by that method is returned unchanged.

        Parameters
        ----------
        instances : InstanceInput[IT, KT, DT, VT, RT]
            An :class:`InstanceProvider` or :class:`Iterable` of
            :class:`Instance` objects.
        batch_size : int, optional
            The batch size in which instances are processed, by default 200
            This also influences the shape of the resulting probability
            matrix.

        Returns
        -------
        Iterator[Tuple[Sequence[KT], PMT]]
            An iterator yielding tuples consisting of:

            - A sequence of keys that match the rows of the probability
              matrix
            - The Probability matrix with shape ``(len(keys), n_labels)``
        """
        if isinstance(instances, InstanceProvider):
            typed_provider: InstanceProvider[IT, KT, DT, VT, RT] = instances  # type: ignore
            return self.predict_proba_provider_raw(typed_provider, batch_size)
        return self.predict_proba_instances_raw(instances, batch_size)