# Source code for instancelib.feature_extraction.doc2vec

# Copyright (C) 2021 The InstanceLib Authors. All Rights Reserved.

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

from __future__ import annotations

from os import PathLike

from typing import Sequence, Optional, Callable, List, Dict, Any
from tempfile import NamedTemporaryFile

import numpy as np
import numpy.typing as npt
from sklearn.exceptions import NotFittedError  # type: ignore

from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument  # type: ignore
from gensim.utils import save_as_line_sentence  # type: ignore


from .base import BaseVectorizer
from ..utils import SaveableInnerModel


def _discard_file(path: str) -> None:
    """Remove *path* on a best-effort basis, ignoring OS-level failures."""
    import os

    try:
        os.unlink(path)
    except OSError:
        # The file may already be gone or still be locked; a leftover temporary
        # file is preferable to raising from a finalizer.
        pass


def get_line_docs(documents: Sequence[str]) -> TaggedLineDocument:
    """Write *documents* to a temporary line-sentence file and wrap it.

    Each document becomes exactly one line of the file, so the line-number
    tags that ``TaggedLineDocument`` assigns match the positions in
    *documents*.

    Parameters
    ----------
    documents
        The raw texts; they are split on whitespace before being written.

    Returns
    -------
    TaggedLineDocument
        A corpus that streams the temporary file. The file is removed once the
        returned object is garbage collected (or at interpreter exit).
    """
    import weakref

    # Only the path is needed: gensim reopens the file by name, so the handle
    # is closed right away and ``delete=False`` keeps the file on disk.
    with NamedTemporaryFile(delete=False, mode="w+b") as handle:
        path = handle.name
    try:
        # Splitting on whitespace keeps every document on a single line: an
        # embedded newline would otherwise start a new line in the file and
        # shift the tags of all following documents. For texts without
        # newlines the resulting tokens are the same as before, because the
        # reader splits each line on whitespace as well.
        corpus = [document.split() for document in documents]
        save_as_line_sentence(corpus, path)
        tld = TaggedLineDocument(path)
    except Exception:
        _discard_file(path)
        raise
    # The corpus reads lazily from disk, so the file must outlive this call;
    # tie its removal to the lifetime of the returned object instead.
    weakref.finalize(tld, _discard_file, path)
    return tld


def split_tokenizer(text: str) -> List[str]:
    """Split *text* on single space characters.

    Only the space character acts as a separator: tabs and newlines stay
    inside tokens, and consecutive spaces produce empty-string tokens.

    Parameters
    ----------
    text
        The document to tokenize.

    Returns
    -------
    List[str]
        The space-separated pieces of *text*, in order.
    """
    return text.split(" ")


# Type of the tokenizer callables accepted by the vectorizer below: any
# callable that returns a list of string tokens (``split_tokenizer`` is one).
DocTokenizer = Callable[..., List[str]]


class Doc2VecVectorizer(BaseVectorizer[str], SaveableInnerModel):
    """Vectorizer that embeds text documents with a gensim ``Doc2Vec`` model.

    The model is trained in :meth:`fit` from the raw texts and queried in
    :meth:`transform`, where each text is first passed through ``tokenizer``
    and then through ``Doc2Vec.infer_vector``.
    """

    _name = "Doc2Vec"
    # The trained gensim model; ``None`` until ``fit`` or ``load`` has run.
    innermodel: Optional[Doc2Vec]
    # Callable that turns a document into tokens at inference time.
    tokenizer: DocTokenizer

    def __init__(
        self,
        d2v_params: Dict[str, Any],
        tokenizer: DocTokenizer = split_tokenizer,
        storage_location: "Optional[PathLike[str]]" = None,
        filename: "Optional[PathLike[str]]" = None,
    ) -> None:
        """Create an unfitted vectorizer.

        Parameters
        ----------
        d2v_params
            Keyword arguments forwarded to the ``Doc2Vec`` constructor
            when the model is fitted.
        tokenizer
            Callable used in :meth:`transform` to tokenize each document.
        storage_location
            Forwarded to ``SaveableInnerModel.__init__``.
        filename
            Forwarded to ``SaveableInnerModel.__init__``.
        """
        BaseVectorizer.__init__(self)  # type: ignore
        self.tokenizer = tokenizer  # type: ignore
        self.d2v_params = d2v_params
        self.innermodel = None
        SaveableInnerModel.__init__(
            self, self.innermodel, storage_location, filename
        )

    def fit(self, x_data: Sequence[str], **kwargs: Any) -> Doc2VecVectorizer:
        """Train a new ``Doc2Vec`` model on *x_data*.

        Parameters
        ----------
        x_data
            The training documents.
        **kwargs
            Accepted for interface compatibility; not used here.

        Returns
        -------
        Doc2VecVectorizer
            This vectorizer, now marked as fitted.
        """
        self.innermodel = Doc2Vec(
            documents=get_line_docs(x_data), **self.d2v_params
        )
        # NOTE(review): gensim 4.x appears to have dropped this memory-trimming
        # helper (see the gensim 3 -> 4 migration notes); only call it when the
        # installed version still offers it so fitting does not fail there.
        trim = getattr(
            self.innermodel, "delete_temporary_training_data", None
        )
        if callable(trim):
            trim(keep_doctags_vectors=True, keep_inference=True)
        self._fitted = True
        return self

    @SaveableInnerModel.load_model_fallback
    def transform(self, x_data: Sequence[str], **kwargs: Any) -> npt.NDArray[Any]:  # type: ignore
        """Infer one vector per document in *x_data*.

        Parameters
        ----------
        x_data
            The documents to embed.
        **kwargs
            Accepted for interface compatibility; not used here.

        Returns
        -------
        npt.NDArray[Any]
            An array built from the inferred vectors, in input order.

        Raises
        ------
        NotFittedError
            If the vectorizer is not fitted or no model is available.
        """
        if self.fitted and self.innermodel is not None:
            return np.array(  # type: ignore
                [
                    self.innermodel.infer_vector(self.tokenizer(doc))  # type: ignore
                    for doc in x_data
                ]
            )
        raise NotFittedError

    def fit_transform(self, x_data: Sequence[str], **kwargs: Any) -> npt.NDArray[Any]:  # type: ignore
        """Fit on *x_data* and return its vectors.

        Equivalent to calling :meth:`fit` followed by :meth:`transform` on the
        same documents; ``**kwargs`` is accepted but not forwarded.
        """
        self.fit(x_data)
        return self.transform(x_data)  # type: ignore

    def save(self) -> None:
        """Write the model to ``self.filepath``; do nothing when no model exists."""
        if self.innermodel is not None:
            self.innermodel.save(self.filepath)  # type: ignore
            self.saved = True

    def load(self) -> None:
        """Read the model back from ``self.filepath`` and mark this object fitted."""
        self.innermodel = Doc2Vec.load(self.filepath)  # type: ignore
        # ``save`` only ever writes a model that ``fit`` produced, so a loaded
        # model is a trained one; without this flag ``transform`` would raise
        # ``NotFittedError`` on an instance that was restored but never fitted.
        self._fitted = True