# Source code for instancelib.feature_extraction.base

# Copyright (C) 2021 The InstanceLib Authors. All Rights Reserved.

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Generic, Sequence, TypeVar, List, Any

from sklearn.exceptions import NotFittedError  # type: ignore
import numpy as np  # type: ignore

# Type variables used to parametrize the generic vectorizer classes below.
DT = TypeVar("DT")  # type of the raw data examples handled by a vectorizer
CT = TypeVar("CT")  # type of the context part (see SeparateContextVectorizer)
# NOTE(review): LT is not referenced in the visible code of this module;
# presumably a label type kept for importers -- confirm before removing.
LT = TypeVar("LT")


class BaseVectorizer(ABC, Generic[DT]):
    """This :class:`~abc.ABC` specifies a generic vectorizer.

    Vectorizers transform raw data examples into feature vectors.
    Given a data type `DT`, the method :meth:`~.fit` initializes or fits
    the vectorizer, the method :meth:`~.transform` transforms the data
    into vector form and :meth:`~.fit_transform` performs both steps in
    a single call.
    """

    _name = "BaseVectorizer"

    def __init__(self) -> None:
        """Initialize the vectorizer in the *not fitted* state."""
        # This base class never sets the flag to True itself.
        # NOTE(review): concrete subclasses presumably update it once
        # fitting has succeeded -- confirm against the implementations.
        self._fitted = False

    @property
    def fitted(self) -> bool:
        """Check if the vectorizer has been fitted

        Returns
        -------
        bool
            True if the vectorizer has been fitted
        """
        return self._fitted

    @abstractmethod
    def fit(self, x_data: Sequence[DT], **kwargs: Any) -> BaseVectorizer[DT]:
        """Fit the vectorizer according to the data in the given
        :class:`~collections.abc.Sequence`.

        Parameters
        ----------
        x_data : Sequence[DT]
            A Sequence of examples with type `DT`.
        **kwargs : Any
            Additional, implementation specific, keyword arguments.

        Returns
        -------
        BaseVectorizer[DT]
            A fitted vectorizer for data with type `DT`

        Examples
        --------
        Assume that `vectorizer` is an instance of a concrete subclass
        (this abstract class cannot be instantiated directly) and that
        the variable `data_list` holds a sequence of data examples

        >>> vectorizer = vectorizer.fit(data_list)
        """

    @abstractmethod
    def transform(self, x_data: Sequence[DT], **kwargs: Any) -> npt.NDArray[Any]:  # type: ignore
        """Transform a list of raw data points to a feature matrix
        according to the fitted vectorizer

        Parameters
        ----------
        x_data : Sequence[DT]
            A sequence of raw data examples with length `n_examples`
        **kwargs : Any
            Additional, implementation specific, keyword arguments.

        Returns
        -------
        npt.NDArray[Any]
            A feature matrix with shape `(n_examples, n_features)`

        Examples
        --------
        Assume the vectorizer is fitted

        >>> x_mat = vectorizer.transform(x_data)
        """

    @abstractmethod
    def fit_transform(self, x_data: Sequence[DT], **kwargs: Any) -> npt.NDArray[Any]:  # type: ignore
        """Transform a list of data to a feature matrix. The transformation
        is based on the data contained in the parameter `x_data`. Subsequent
        transformations with :meth:`~.transform()` will be based on the fit
        of the data provided in this call.

        Parameters
        ----------
        x_data : Sequence[DT]
            A sequence of raw data examples with length `n_examples`
        **kwargs : Any
            Additional, implementation specific, keyword arguments.

        Returns
        -------
        npt.NDArray[Any]
            A feature matrix with shape `(n_examples, n_features)`

        Examples
        --------
        Fitting and transforming in a single call

        >>> x_mat = vectorizer.fit_transform(x_data)
        """

    @property
    def name(self) -> str:
        """The name of this vectorizer

        Returns
        -------
        str
            The value of the class attribute ``_name``
        """
        return self._name


class SeparateContextVectorizer(ABC, Generic[DT, CT]):
    """This :class:`~abc.ABC` specifies a generic vectorizer for data types
    that consist of two parts that have to be fitted or configured according
    to different specifications. The feature vectors of the two parts
    are concatenated for each example.

    The two parts are referred to as the `data` part and the `context` part.
    This vectorizer contains two inner vectorizers, one for the data part and
    one for the context part respectively.

    Arguments
    ---------
    data_vectorizer : BaseVectorizer[DT]
            The vectorizer for the data part
    context_vectorizer : BaseVectorizer[CT]
            The vectorizer for the context part

    Examples
    --------
    Construction:

    >>> from sklearn.feature_extraction.text import TfidfVectorizer
    >>> data_vectorizer = SklearnVectorizer[str](TfidfVectorizer())
    >>> context_vectorizer = Doc2VecVectorizer[str]()
    >>> vectorizer = SeparateContextVectorizer[str, str](data_vectorizer,
    ...     context_vectorizer)

    Fitting:

    >>> x_data = ["This...", "Another text...", ..., "Last Text"]
    >>> x_context_data = ["Surrounding text", ..., "Another text"]
    >>> vectorizer = vectorizer.fit(x_data, x_context_data)

    Transforming:

    >>> x_mat = vectorizer.transform(x_data, x_context_data)
    """

    _name = "SeparateContextVectorizer"

    def __init__(
        self,
        data_vectorizer: BaseVectorizer[DT],
        context_vectorizer: BaseVectorizer[CT],
    ):
        """Store the two inner vectorizers.

        Parameters
        ----------
        data_vectorizer : BaseVectorizer[DT]
            The vectorizer for the data part
        context_vectorizer : BaseVectorizer[CT]
            The vectorizer for the context part
        """
        self.data_vectorizer = data_vectorizer
        self.context_vectorizer = context_vectorizer

    @property
    def fitted(self) -> bool:
        """Check if the vectorizer has been fitted

        Returns
        -------
        bool
            True if both the data vectorizer and the context vectorizer
            report that they have been fitted
        """
        return self.data_vectorizer.fitted and self.context_vectorizer.fitted

    def fit(
        self, x_data: Sequence[DT], context_data: Sequence[CT], **kwargs: Any
    ) -> SeparateContextVectorizer[DT, CT]:
        """Fit the vectorizer according to the data in the given
        :class:`~collections.abc.Sequence` s.

        Parameters
        ----------
        x_data : Sequence[DT]
            The data parts
        context_data : Sequence[CT]
            The context parts
        **kwargs : Any
            Keyword arguments that are forwarded unchanged to the ``fit``
            method of *both* inner vectorizers

        Returns
        -------
        SeparateContextVectorizer[DT, CT]
            A fitted vectorizer

        Examples
        --------
        Fitting this vectorizer can be performed as follows:

        >>> x_data = ["This...", "Another text...", ..., "Last Text"]
        >>> x_context_data = ["Surrounding text", ..., "Another text"]
        >>> data_vectorizer = SklearnVectorizer[str](TfidfVectorizer())
        >>> context_vectorizer = Doc2VecVectorizer[str]()
        >>> vectorizer = SeparateContextVectorizer[str, str](
        ...     data_vectorizer,
        ...     context_vectorizer)
        >>> vectorizer = vectorizer.fit(x_data, x_context_data)

        Warning
        -------
        We assume that the variables `x_data` and `context_data` are sequences
        of equal length.
        """
        self.data_vectorizer.fit(x_data, **kwargs)
        self.context_vectorizer.fit(context_data, **kwargs)
        return self

    def transform(
        self, x_data: Sequence[DT], context_data: Sequence[CT], **kwargs: Any
    ) -> npt.NDArray[Any]:  # type: ignore
        """Transform a list of raw data points to a feature matrix
        according to the fitted vectorizers

        Parameters
        ----------
        x_data : Sequence[DT]
            A sequence with data parts of the data points of length `n_docs`
        context_data : Sequence[CT]
            A sequence with context part of the data points of length `n_docs`
        **kwargs : Any
            Keyword arguments that are forwarded unchanged to the
            ``transform`` method of *both* inner vectorizers

        Returns
        -------
        npt.NDArray[Any]
            A feature matrix of concatenated vectors with shape
            `(n_docs, n_features_data + n_features_context)`

        Raises
        ------
        NotFittedError
            If the model is not fitted

        Warning
        -------
        We assume that the variables `x_data` and `context_data` are sequences
        of equal length and that the indices of the sequences correspond to the
        same data point.
        """
        if not self.fitted:
            raise NotFittedError
        data_part: npt.NDArray[Any] = self.data_vectorizer.transform(x_data, **kwargs)  # type: ignore
        context_part: npt.NDArray[Any] = self.context_vectorizer.transform(  # type: ignore
            context_data, **kwargs
        )
        # Column-wise concatenation: the data features come first, followed
        # by the context features of the same row.
        return np.concatenate((data_part, context_part), axis=1)  # type: ignore

    def fit_transform(
        self, x_data: Sequence[DT], context_data: Sequence[CT], **kwargs: Any
    ) -> npt.NDArray[Any]:  # type: ignore
        """Fit and transform a list of raw data points to a feature matrix
        according to the fitted vectorizers. Subsequent
        transformations with :meth:`~.transform()` will be based on the fit
        of the data provided in this call.

        Parameters
        ----------
        x_data : Sequence[DT]
            A sequence with data parts of the data points of length `n_docs`
        context_data : Sequence[CT]
            A sequence with context part of the data points of length `n_docs`
        **kwargs : Any
            Keyword arguments that are forwarded unchanged to both
            :meth:`~.fit` and :meth:`~.transform`

        Returns
        -------
        npt.NDArray[Any]
            A feature matrix of concatenated vectors with shape
            `(n_docs, n_features_data + n_features_context)`
        """
        # Both parts must be passed on: ``fit`` requires ``context_data`` as a
        # positional argument and the context vectorizer has to be fitted
        # before ``transform`` can succeed.
        self.fit(x_data, context_data, **kwargs)
        return self.transform(x_data, context_data, **kwargs)  # type: ignore


class StackVectorizer(BaseVectorizer[DT], Generic[DT]):
    """This class is a generic vectorizer that consists of
    several vectorizers that are fitted on the same data points.

    The feature vectors of the contained vectorizers are concatenated in the
    transform step, according to the order they are specified in the
    constructor (argument order).

    Arguments
    ---------
    vectorizer : BaseVectorizer[DT]
        At least one vectorizer is required
    *vectorizers: BaseVectorizer[DT]
        Any number of vectorizers for the same data type

    Examples
    --------
    Construction

    >>> tf_idf = SklearnVectorizer[str](TfidfVectorizer())
    >>> doc2vec = Doc2VecVectorizer[str]()
    >>> count = SklearnVectorizer[str](CountVectorizer())
    >>> vectorizer = StackVectorizer[str](tf_idf, doc2vec, count)

    Fitting

    >>> x_data = ["This...", "Another text...", ..., "Last Text"]
    >>> vectorizer = vectorizer.fit(x_data)

    Transforming

    >>> another_data = ["Another test text", ..., "Another text"]
    >>> x_mat = vectorizer.transform(another_data)
    """

    vectorizers: List[BaseVectorizer[DT]]
    """The internal vectorizers are stored in this list"""

    _name = "StackVectorizer"

    def __init__(
        self, vectorizer: BaseVectorizer[DT], *vectorizers: BaseVectorizer[DT]
    ) -> None:
        """Collect the inner vectorizers in argument order.

        Parameters
        ----------
        vectorizer : BaseVectorizer[DT]
            The first (mandatory) vectorizer
        *vectorizers : BaseVectorizer[DT]
            Any number of additional vectorizers for the same data type
        """
        super().__init__()
        self.vectorizers = [vectorizer, *vectorizers]

    def fit(self, x_data: Sequence[DT], **kwargs: Any) -> StackVectorizer[DT]:
        """Fit every inner vectorizer on the same data.

        Parameters
        ----------
        x_data : Sequence[DT]
            A Sequence of examples with type `DT`.
        **kwargs : Any
            Keyword arguments that are forwarded unchanged to the ``fit``
            method of every inner vectorizer

        Returns
        -------
        StackVectorizer[DT]
            This vectorizer
        """
        for inner in self.vectorizers:
            inner.fit(x_data, **kwargs)
        return self

    @property
    def fitted(self) -> bool:
        """Check if the vectorizer has been fitted

        Returns
        -------
        bool
            True if every inner vectorizer reports that it has been fitted
        """
        return all(inner.fitted for inner in self.vectorizers)

    def transform(self, x_data: Sequence[DT], **kwargs: Any) -> npt.NDArray[Any]:  # type: ignore
        """Transform a list of raw data points to a feature matrix by
        concatenating the outputs of all inner vectorizers.

        Parameters
        ----------
        x_data : Sequence[DT]
            A sequence of raw data examples
        **kwargs : Any
            Keyword arguments that are forwarded unchanged to the
            ``transform`` method of every inner vectorizer

        Returns
        -------
        npt.NDArray[Any]
            The outputs of the inner vectorizers, concatenated along
            axis 1 in constructor order

        Raises
        ------
        NotFittedError
            If at least one inner vectorizer is not fitted
        """
        if not self.fitted:
            raise NotFittedError
        sub_vectors = [  # type: ignore
            inner.transform(x_data, **kwargs)  # type: ignore
            for inner in self.vectorizers
        ]
        return np.concatenate(sub_vectors, axis=1)  # type: ignore

    def fit_transform(self, x_data: Sequence[DT], **kwargs: Any) -> npt.NDArray[Any]:  # type: ignore
        """Fit all inner vectorizers on `x_data` and transform that same
        data. Subsequent transformations with :meth:`~.transform()` will be
        based on the fit of the data provided in this call.

        Parameters
        ----------
        x_data : Sequence[DT]
            A sequence of raw data examples
        **kwargs : Any
            Keyword arguments that are forwarded unchanged to both
            :meth:`~.fit` and :meth:`~.transform`

        Returns
        -------
        npt.NDArray[Any]
            The concatenated feature matrix as returned by
            :meth:`~.transform`
        """
        self.fit(x_data, **kwargs)
        return self.transform(x_data, **kwargs)  # type: ignore