Source code for instancelib.functions.vectorize

# Copyright (C) 2021 The InstanceLib Authors. All Rights Reserved.

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
import itertools
from typing import Any, Iterator, List, Optional, Sequence, Tuple

import numpy.typing as npt

from ..environment import AbstractEnvironment
from ..feature_extraction import BaseVectorizer
from ..instances import Instance, InstanceProvider
from ..typehints.typevars import KT
from ..utils.numpy import matrix_tuple_to_vectors
from ..utils.to_key import to_key


def fit_vectorizer(
    vectorizer: BaseVectorizer[Instance[Any, Any, npt.NDArray[Any], Any]],
    provider: InstanceProvider[
        Instance[Any, Any, npt.NDArray[Any], Any],
        Any,
        Any,
        npt.NDArray[Any],
        Any,
    ],
    chunk_size: int = 200,
) -> BaseVectorizer[Instance[Any, Any, npt.NDArray[Any], Any]]:
    """Fit a vectorizer on all instances of a provider.

    The provider is read through ``provider.instance_chunker(chunk_size)``;
    the resulting chunks are flattened into a single list, which is handed
    to ``vectorizer.fit`` in one call.

    Parameters
    ----------
    vectorizer
        The vectorizer to fit.
    provider
        The provider whose instances are used for fitting.
    chunk_size
        Size argument forwarded to ``provider.instance_chunker``.

    Returns
    -------
    BaseVectorizer
        The same ``vectorizer`` object that was passed in, after ``fit``
        has been called on it.
    """
    chunks = provider.instance_chunker(chunk_size)
    # The whole provider is materialized: ``fit`` receives every instance
    # at once rather than chunk by chunk.
    all_instances = [instance for chunk in chunks for instance in chunk]
    vectorizer.fit(all_instances)
    return vectorizer
def vectorize_provider(
    vectorizer: BaseVectorizer[Instance[KT, Any, npt.NDArray[Any], Any]],
    provider: InstanceProvider[
        Instance[KT, Any, npt.NDArray[Any], Any],
        Any,
        Any,
        npt.NDArray[Any],
        Any,
    ],
    chunk_size: int = 200,
) -> Iterator[Tuple[Sequence[KT], Sequence[npt.NDArray[Any]]]]:
    """Transform the instances of a provider chunk by chunk.

    This is a generator: each chunk produced by
    ``provider.instance_chunker(chunk_size)`` is passed to
    ``vectorizer.transform``, and the resulting matrix is split by
    ``matrix_tuple_to_vectors`` together with the keys of the chunk.

    Parameters
    ----------
    vectorizer
        The vectorizer whose ``transform`` method is applied to each chunk.
    provider
        The provider that supplies the instances.
    chunk_size
        Size argument forwarded to ``provider.instance_chunker``.

    Yields
    ------
    Tuple[Sequence[KT], Sequence[npt.NDArray[Any]]]
        For every chunk, the pair of keys and vectors returned by
        ``matrix_tuple_to_vectors``.
    """
    for chunk in provider.instance_chunker(chunk_size):
        feature_matrix = vectorizer.transform(chunk)
        chunk_keys: List[KT] = [to_key(item) for item in chunk]  # type: ignore
        out_keys, out_vectors = matrix_tuple_to_vectors(
            chunk_keys, feature_matrix
        )
        yield out_keys, out_vectors
def vectorize(
    vectorizer: BaseVectorizer[Instance[Any, Any, npt.NDArray[Any], Any]],
    environment: AbstractEnvironment[
        Instance[KT, Any, npt.NDArray[Any], Any],
        KT,
        Any,
        npt.NDArray[Any],
        Any,
        Any,
    ],
    fit: bool = True,
    chunk_size: int = 200,
    fit_instances: Optional[
        InstanceProvider[
            Instance[KT, Any, npt.NDArray[Any], Any],
            KT,
            Any,
            npt.NDArray[Any],
            Any,
        ]
    ] = None,
    transform_instances: Optional[
        InstanceProvider[
            Instance[KT, Any, npt.NDArray[Any], Any],
            KT,
            Any,
            npt.NDArray[Any],
            Any,
        ]
    ] = None,
    fit_chunk_size: Optional[int] = None,
    transform_chunk_size: Optional[int] = None,
):
    """Vectorize instances and store the vectors in the environment.

    Optionally fits ``vectorizer`` first (via ``fit_vectorizer``), then
    transforms the target provider (via ``vectorize_provider``) and passes
    every resulting batch of keys and vectors to
    ``environment.add_vectors``.

    Parameters
    ----------
    vectorizer
        The vectorizer used for fitting and transforming.
    environment
        The environment that receives the vectors. Its ``all_instances``
        attribute is the fallback provider for fitting and transforming.
    fit
        If true, the vectorizer is fitted before transforming.
    chunk_size
        Chunk size used for both phases unless overridden.
    fit_instances
        Provider to fit on; ``environment.all_instances`` when ``None``.
    transform_instances
        Provider to transform; ``environment.all_instances`` when ``None``.
    fit_chunk_size
        Chunk size for fitting; falls back to ``chunk_size`` when ``None``.
    transform_chunk_size
        Chunk size for transforming; falls back to ``chunk_size`` when
        ``None``.
    """
    # Resolve the per-phase chunk sizes.
    if fit_chunk_size is None:
        fit_size = chunk_size
    else:
        fit_size = fit_chunk_size
    if transform_chunk_size is None:
        transform_size = chunk_size
    else:
        transform_size = transform_chunk_size

    # Resolve the providers. The fit source is resolved even when ``fit``
    # is false, and before the transform target.
    if fit_instances is None:
        fit_source = environment.all_instances
    else:
        fit_source = fit_instances
    if transform_instances is None:
        transform_target = environment.all_instances
    else:
        transform_target = transform_instances

    if fit:
        vectorizer = fit_vectorizer(vectorizer, fit_source, fit_size)

    # Each batch is stored as soon as it is produced.
    batches = vectorize_provider(vectorizer, transform_target, transform_size)
    for batch_keys, batch_vectors in batches:
        environment.add_vectors(batch_keys, batch_vectors)