Source code for instancelib.labels.encoder

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import (
    FrozenSet,
    Generic,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Sequence,
    Tuple,
    Any,
)

import numpy as np
import numpy.typing as npt
import sklearn

from ..exceptions.base import LabelEncodingException

from ..typehints import LMT, LT, LVT, PMT
from ..utils.func import invert_mapping


class LabelEncoder(ABC, Generic[LT, LVT, LMT, PMT]):
    """Abstract interface for translating labels to and from an encoded form.

    Type parameters: ``LT`` is the label type, ``LVT`` the encoding of a
    single labeling, ``LMT`` the encoding of a batch of labelings and
    ``PMT`` the type of a probability matrix.
    """

    @abstractmethod
    def initialize(self, labels: Iterable[LT]) -> None:
        """Prepare the encoder for the given collection of labels."""
        return None

    @abstractmethod
    def encode(self, labels: Iterable[LT]) -> LVT:
        """Encode one labeling (an iterable of labels) as a vector."""
        raise NotImplementedError

    def encode_safe(self, labels: Iterable[LT]) -> Optional[LVT]:
        """Encode one labeling, returning ``None`` when encoding fails.

        Only :class:`LabelEncodingException` is suppressed; every other
        exception raised by :meth:`encode` propagates to the caller.
        """
        try:
            return self.encode(labels)
        except LabelEncodingException:
            return None

    @abstractmethod
    def encode_batch(self, labelings: Iterable[Iterable[LT]]) -> LMT:
        """Encode several labelings at once as a matrix."""
        raise NotImplementedError

    @abstractmethod
    def decode_vector(self, vector: LVT) -> FrozenSet[LT]:
        """Translate one encoded vector back into a set of labels."""
        raise NotImplementedError

    @abstractmethod
    def decode_matrix(self, matrix: LMT) -> Sequence[FrozenSet[LT]]:
        """Translate an encoded matrix back into one label set per entry."""
        raise NotImplementedError

    @abstractmethod
    def decode_proba_matrix(
        self, matrix: PMT
    ) -> Sequence[FrozenSet[Tuple[LT, float]]]:
        """Translate a probability matrix into ``(label, value)`` pairs."""
        raise NotImplementedError

    @abstractmethod
    def get_label_column_index(self, label: LT) -> int:
        """Return the column index that belongs to ``label``."""
        raise NotImplementedError

    @property
    @abstractmethod
    def labels(self) -> Sequence[LT]:
        """The labels known to this encoder, as a sequence."""
        raise NotImplementedError
class DictionaryEncoder(
    LabelEncoder[LT, npt.NDArray[Any], npt.NDArray[Any], npt.NDArray[Any]],
    Generic[LT],
):
    """Label encoder driven by an explicit ``label -> integer`` mapping.

    Attributes set by the constructor and by :meth:`initialize`:

    * ``mapping`` -- the ``label -> integer`` mapping itself.
    * ``inv_mapping`` -- the result of ``invert_mapping(mapping)``; it is
      indexed by integer codes when decoding.
    * ``labelset`` -- a frozenset of the keys of ``mapping``.
    * ``_labels`` -- the labels ordered by ascending integer code.
    """

    def __init__(self, mapping: Mapping[LT, int]):
        """Create an encoder from a ``label -> integer`` mapping."""
        self._set_mapping(mapping)

    def _set_mapping(self, mapping: Mapping[LT, int]) -> None:
        """Store ``mapping`` and rebuild every attribute derived from it."""
        self.mapping = mapping
        self.inv_mapping = invert_mapping(self.mapping)
        self.labelset = frozenset(self.mapping.keys())
        # Sorting the (code, label) pairs orders the labels by their code.
        self._labels = [lab for _, lab in sorted(self.inv_mapping.items())]

    def initialize(self, labels: Iterable[LT]) -> None:
        """Replace the mapping by numbering ``labels`` in iteration order."""
        self._set_mapping({label: idx for idx, label in enumerate(labels)})

    def encode(self, labels: Iterable[LT]) -> npt.NDArray[Any]:
        """Return an array holding the integer code of every given label.

        A label that is absent from ``mapping`` makes the ``mapping[lab]``
        lookup fail (``KeyError`` for a plain ``dict``).
        """
        return np.array([self.mapping[lab] for lab in labels])  # type: ignore

    def encode_batch(
        self, labelings: Iterable[Iterable[LT]]
    ) -> npt.NDArray[Any]:
        """Encode every labeling and stack the vectors with ``np.vstack``."""
        encoded = tuple(map(self.encode, labelings))
        return np.vstack(encoded)

    def decode_vector(self, vector: npt.NDArray[Any]) -> FrozenSet[LT]:
        """Translate a vector of integer codes into the set of its labels."""
        listed: List[int] = vector.tolist()  # type: ignore
        return frozenset(self.inv_mapping[enc] for enc in listed)

    def _decode_row(self, row: Any) -> FrozenSet[LT]:
        """Decode one entry of ``matrix.tolist()`` into a label set.

        A scalar entry (from a 1-D array) yields a single-label set; a list
        entry (a row of a 2-D array) yields the set of all labels in it.
        """
        if isinstance(row, list):
            return frozenset(self.inv_mapping[enc] for enc in row)
        return frozenset([self.inv_mapping[row]])

    def decode_matrix(
        self, matrix: npt.NDArray[Any]
    ) -> Sequence[FrozenSet[LT]]:
        """Translate encoded labelings into one label set per entry.

        Both a 1-D array of codes (one code per instance) and a 2-D array
        (one row of codes per instance, the shape :meth:`encode_batch`
        produces) are accepted.  Previously a 2-D input raised ``TypeError``
        because a whole row (a list) was used as a dictionary key.
        """
        return [self._decode_row(row) for row in matrix.tolist()]

    def decode_proba_matrix(
        self, matrix: npt.NDArray[Any]
    ) -> Sequence[FrozenSet[Tuple[LT, float]]]:
        """Pair the values of every row with the labels, column by column.

        Column ``i`` of a row is paired with ``self.labels[i]``.
        """
        prob_mat: List[List[float]] = matrix.tolist()
        label_list = self.labels
        return [frozenset(zip(label_list, prob_vec)) for prob_vec in prob_mat]

    @property
    def labels(self) -> Sequence[LT]:
        """The labels ordered by ascending integer code."""
        return self._labels

    def get_label_column_index(self, label: LT) -> int:
        """Return the position of ``label`` within :attr:`labels`.

        ``ValueError`` is raised by ``list.index`` for an unknown label.
        """
        return self.labels.index(label)

    @classmethod
    def from_list(cls, labels: Iterable[LT]) -> DictionaryEncoder[LT]:
        """Build an encoder that numbers ``labels`` in iteration order."""
        return cls({lab: idx for idx, lab in enumerate(labels)})

    @classmethod
    def from_inv(cls, inv_mapping: Mapping[int, LT]) -> DictionaryEncoder[LT]:
        """Build an encoder from an ``integer -> label`` mapping."""
        return cls(invert_mapping(inv_mapping))
class IdentityEncoder(DictionaryEncoder[LT], Generic[LT]):
    """Encoder that stores labels in arrays as they are, without a lookup."""

    def encode(self, labels: Iterable[LT]) -> npt.NDArray[Any]:
        """Wrap the labeling in a one-element list and convert it to an array.

        NOTE(review): the resulting shape depends on how NumPy interprets
        ``labels`` (sequence versus opaque object) -- confirm with callers.
        """
        return np.array([labels])  # type: ignore

    def encode_batch(
        self, labelings: Iterable[Iterable[LT]]
    ) -> npt.NDArray[Any]:
        """Encode every labeling and stack the arrays with ``np.vstack``."""
        per_instance = tuple(self.encode(labeling) for labeling in labelings)
        return np.vstack(per_instance)

    def decode_vector(self, vector: npt.NDArray[Any]) -> FrozenSet[LT]:
        """Return the elements of ``vector.tolist()`` as a frozenset."""
        elements: List[LT] = vector.tolist()  # type: ignore
        return frozenset(elements)

    def decode_matrix(
        self, matrix: npt.NDArray[Any]
    ) -> Sequence[FrozenSet[LT]]:
        """Return a single-element frozenset per entry of ``matrix.tolist()``."""
        entries: List[LT] = matrix.tolist()
        return [frozenset((entry,)) for entry in entries]
class MultilabelDictionaryEncoder(DictionaryEncoder[LT], Generic[LT]):
    """Dictionary encoder that represents a labeling as an indicator vector.

    Column ``i`` of an encoded vector corresponds to ``self.labels[i]``.
    """

    def encode(self, labels: Iterable[LT]) -> npt.NDArray[Any]:
        """Return one membership flag per known label.

        The flag at position ``i`` is true when ``self.labels[i]`` occurs in
        ``labels``.  Labels that are not in ``self.labels`` are ignored.
        """
        labeling = frozenset(labels)
        return np.array([lab in labeling for lab in self.labels])  # type: ignore

    def _decode_binary(self, listed_vector: List[int]) -> Iterator[LT]:
        """Yield the label of every column whose value is greater than zero.

        Columns are resolved through ``self.labels`` -- the same ordering
        ``encode`` uses to lay the columns out -- rather than through
        ``inv_mapping``, so decoding also works when the integer codes in
        the mapping are not exactly ``0 .. n-1``.
        """
        label_list = self.labels
        for idx, included in enumerate(listed_vector):
            if included > 0:
                yield label_list[idx]

    def decode_vector(self, vector: npt.NDArray[Any]) -> FrozenSet[LT]:
        """Return the labels whose column in ``vector`` is greater than zero."""
        return frozenset(self._decode_binary(vector.tolist()))

    def decode_matrix(
        self, matrix: npt.NDArray[Any]
    ) -> Sequence[FrozenSet[LT]]:
        """Decode every row of ``matrix`` like :meth:`decode_vector`."""
        listed: List[List[int]] = matrix.tolist()
        return [frozenset(self._decode_binary(vec)) for vec in listed]
class SklearnLabelEncoder(
    LabelEncoder[LT, npt.NDArray[Any], npt.NDArray[Any], npt.NDArray[Any]],
    Generic[LT],
):
    """Label encoder that delegates to a wrapped scikit-learn transformer.

    Only the first label of a labeling is encoded.  The wrapped object must
    offer ``fit``, ``transform``, ``inverse_transform`` and ``classes_``
    (presumably a ``sklearn.preprocessing.LabelEncoder`` -- confirm against
    the callers).
    """

    def __init__(
        self, encoder: sklearn.base.TransformerMixin, labels: Iterable[LT]
    ) -> None:
        """Store the transformer and fit it when ``labels`` is non-empty."""
        self.labelset = frozenset(labels)
        self.encoder = encoder
        if self.labelset:
            self._fit_label_encoder()

    def initialize(self, labels: Iterable[LT]) -> None:
        """Replace the label set and fit the wrapped transformer on it."""
        self.labelset = frozenset(labels)
        self._fit_label_encoder()

    def _fit_label_encoder(self) -> None:
        """Fit the wrapped transformer on the current label set."""
        known_labels = list(self.labelset)
        self.encoder.fit(known_labels)  # type: ignore

    def encode(self, labels: Iterable[LT]) -> npt.NDArray[Any]:
        """Transform the first label of ``labels``.

        Raises:
            LabelEncodingException: if ``labels`` yields no element.
        """
        try:
            chosen = next(iter(labels))
        except StopIteration:
            raise LabelEncodingException(
                "This instance has no label, but one is required (binary / multiclass classification)"
            )
        return self.encoder.transform([chosen])  # type: ignore

    def encode_batch(
        self, labelings: Iterable[Iterable[LT]]
    ) -> npt.NDArray[Any]:
        """Transform the first label of every labeling in one call.

        Raises:
            LabelEncodingException: if any labeling yields no element.
        """
        first_labels: List[LT] = []
        try:
            for labeling in labelings:
                first_labels.append(next(iter(labeling)))
        except StopIteration:
            raise LabelEncodingException(
                "One of the instances has no label, but one is required (binary / multiclass classfication)"
            )
        result: npt.NDArray[Any] = self.encoder.transform(first_labels)  # type: ignore
        return result

    def decode_vector(self, vector: npt.NDArray[Any]) -> FrozenSet[LT]:
        """Inverse-transform ``vector`` and return its first label as a set."""
        decoded = self.encoder.inverse_transform(vector).tolist()  # type: ignore
        label: LT = decoded[0]
        return frozenset([label])

    def decode_matrix(
        self, matrix: npt.NDArray[Any]
    ) -> Sequence[FrozenSet[LT]]:
        """Inverse-transform ``matrix`` into one single-label set per entry."""
        decoded: Iterable[LT] = self.encoder.inverse_transform(matrix).tolist()  # type: ignore
        return [frozenset((label,)) for label in decoded]

    def get_label_column_index(self, label: LT) -> int:
        """Return the position of ``label`` within :attr:`labels`."""
        return self.labels.index(label)

    @property
    def labels(self) -> Sequence[LT]:
        """The wrapped transformer's ``classes_`` converted to a list."""
        class_list: Sequence[LT] = self.encoder.classes_.tolist()  # type: ignore
        return class_list

    def decode_proba_matrix(
        self, matrix: npt.NDArray[Any]
    ) -> Sequence[FrozenSet[Tuple[LT, float]]]:
        """Pair the values of every row with the labels, column by column.

        Column ``i`` of a row is paired with ``self.labels[i]``.
        """
        rows: List[List[float]] = matrix.tolist()
        columns = self.labels
        return [frozenset(zip(columns, row)) for row in rows]
class SklearnMultiLabelEncoder(SklearnLabelEncoder[LT], Generic[LT]):
    """Multi-label variant of :class:`SklearnLabelEncoder`.

    Whole labelings are passed to the wrapped transformer as collections of
    labels (presumably a ``sklearn.preprocessing.MultiLabelBinarizer`` --
    confirm against the callers).
    """

    def _fit_label_encoder(self) -> None:
        """Fit the wrapped transformer on one singleton set per known label.

        The labels are read from ``self.labelset``, the attribute the parent
        class assigns.  The former ``self._target_labels`` is never assigned
        and raised ``AttributeError`` as soon as the encoder was fitted.
        """
        self.encoder.fit([{label} for label in self.labelset])  # type: ignore

    def encode_batch(
        self, labelings: Iterable[Iterable[LT]]
    ) -> npt.NDArray[Any]:
        """Transform every labeling, each passed as a frozenset of labels."""
        formatted = [frozenset(labeling) for labeling in labelings]
        encoded: npt.NDArray[Any] = self.encoder.transform(formatted)  # type: ignore
        return encoded

    def encode(self, labels: Iterable[LT]) -> npt.NDArray[Any]:
        """Transform a single labeling after removing duplicate labels."""
        return self.encoder.transform([list(set(labels))])  # type: ignore

    def decode_matrix(
        self, matrix: npt.NDArray[Any]
    ) -> Sequence[FrozenSet[LT]]:
        """Inverse-transform ``matrix`` into one label set per row."""
        labelings: Iterable[Iterable[LT]] = self.encoder.inverse_transform(matrix)  # type: ignore
        return [frozenset(labeling) for labeling in labelings]

    def decode_vector(self, vector: npt.NDArray[Any]) -> FrozenSet[LT]:
        """Inverse-transform ``vector`` and return its first labeling as a set.

        The result of ``inverse_transform`` is used directly when it is a
        plain sequence and converted with ``tolist()`` only when it offers
        that method, so both list and array results are handled (matching
        what :meth:`decode_matrix` accepts).
        """
        decoded = self.encoder.inverse_transform(vector)  # type: ignore
        if hasattr(decoded, "tolist"):
            decoded = decoded.tolist()
        first_labeling: Iterable[LT] = decoded[0]
        return frozenset(first_labeling)