Source code for instancelib.instances.hdf5pandas

# Copyright (C) 2021 The InstanceLib Authors. All Rights Reserved.

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

from __future__ import annotations

from os import PathLike
from typing import Any, Iterator, Optional, Sequence, Union

import numpy.typing as npt
import pandas as pd  # type: ignore

from .base import Instance
from .external import ExternalProvider
from .hdf5 import HDF5VectorInstanceProvider
from .hdf5vector import HDF5VectorStorage
from .memory import DataPoint
from .text import TextInstance


class HDF5TextInstance(
    DataPoint[Union[int, str], str, npt.NDArray[Any], str],
    TextInstance[Union[int, str], npt.NDArray[Any]],
):
    """Text instance whose token-related attributes notify an external provider.

    Assigning to ``tokenized``, ``map_to_original`` or ``split_marker``
    stores the new value and, when an external provider was supplied,
    calls ``update_external(self)`` on that provider.
    """

    def __init__(
        self,
        identifier: Union[int, str],
        data: str,
        vector: Optional[npt.NDArray[Any]],
        representation: Optional[str] = None,
        tokenized: Optional[Sequence[str]] = None,
        map_to_original: Optional[npt.NDArray[Any]] = None,
        split_marker: Optional[Any] = None,
        external: Optional[
            ExternalProvider[Any, Union[int, str], str, npt.NDArray[Any], str]
        ] = None,
    ) -> None:
        """Create the instance.

        Parameters
        ----------
        identifier
            Identifier forwarded to the parent initializer.
        data
            The raw text.
        vector
            Optional vector forwarded to the parent initializer.
        representation
            Representation of the instance; ``data`` is used when this
            is ``None``.
        tokenized
            Optional initial value of :attr:`tokenized`.
        map_to_original
            Optional initial value of :attr:`map_to_original`.
        split_marker
            Optional initial value of :attr:`split_marker`.
        external
            Optional provider that is notified whenever one of the
            properties above is assigned.
        """
        if representation is None:
            representation = data
        super().__init__(identifier, data, vector, representation)
        self._external = external
        self._split_marker = split_marker
        self._map_to_original = map_to_original
        self._tokenized = tokenized

    @property
    def map_to_original(self) -> Optional[npt.NDArray[Any]]:
        """The stored ``map_to_original`` array, or ``None``."""
        return self._map_to_original

    @map_to_original.setter
    def map_to_original(self, value: Optional[npt.NDArray[Any]]) -> None:
        self._map_to_original = value
        provider = self._external
        if provider is None:
            return
        provider.update_external(self)

    @property
    def split_marker(self) -> Optional[Any]:
        """The stored split marker, or ``None``."""
        return self._split_marker

    @split_marker.setter
    def split_marker(self, value: Any) -> None:
        self._split_marker = value
        provider = self._external
        if provider is None:
            return
        provider.update_external(self)

    @property
    def tokenized(self) -> Optional[Sequence[str]]:
        """The stored token sequence, or ``None``."""
        return self._tokenized

    @tokenized.setter
    def tokenized(self, value: Sequence[str]) -> None:
        self._tokenized = value
        provider = self._external
        if provider is None:
            return
        provider.update_external(self)
class HDF5TextProvider(
    HDF5VectorInstanceProvider[HDF5TextInstance, Union[int, str], str, str],
    ExternalProvider[
        HDF5TextInstance, Union[int, str], str, npt.NDArray[Any], str
    ],
):
    """Provider that builds :class:`HDF5TextInstance` objects from a pandas
    table stored in an HDF5 file, paired with vectors read from an
    :class:`HDF5VectorStorage`.

    The mutating operations defined here (``__setitem__``, ``__delitem__``,
    ``clear`` and ``update_external``) are no-ops.
    """

    def __init__(
        self,
        data_storage: "PathLike[str]",
        vector_storage_location: "PathLike[str]",
        hdf5_dataset: str,
        id_col: str,
        data_cols: Sequence[str],
    ) -> None:
        """Store the configuration and open the vector storage.

        Parameters
        ----------
        data_storage
            Path handed to :func:`pandas.read_hdf` to load the table.
        vector_storage_location
            Path handed to :class:`HDF5VectorStorage`.
        hdf5_dataset
            Key of the table inside the HDF5 file.
        id_col
            Name of the column that holds the instance identifiers.
        data_cols
            Columns whose values are joined with a single space to form
            the text of an instance.
        """
        # Starts empty; presumably filled by the parent provider's lookup
        # logic -- verify against HDF5VectorInstanceProvider.
        self.instance_cache = {}
        self.hdf5_dataset = hdf5_dataset
        self.id_col = id_col
        self.data_cols: Sequence[str] = data_cols
        self.data_storage = data_storage
        self.vector_storage_location = vector_storage_location
        self.vectorstorage = HDF5VectorStorage[Union[int, str], Any](
            vector_storage_location
        )

    def build_from_external(self, k: Union[int, str]) -> HDF5TextInstance:
        """Build the instance with identifier ``k`` from the table.

        Parameters
        ----------
        k
            Identifier to look up in the ``id_col`` column.

        Returns
        -------
        HDF5TextInstance
            An instance whose data and representation are the values of
            ``data_cols`` joined by a single space, and whose external
            provider is this object.

        Raises
        ------
        KeyError
            If no row of the table has identifier ``k``.
        """
        if self.vectorstorage is None:
            self.vectorstorage = self.load_vectors()
        df = self.dataframe
        matches = df[df[self.id_col] == k]  # type: ignore
        if len(matches) == 0:
            raise KeyError(k)
        # Boolean-mask selection yields a DataFrame; take the first matching
        # row so that the cells below are scalars rather than Series (joining
        # Series objects raises TypeError).
        row = matches.iloc[0]  # type: ignore
        vec = self.vectorstorage[k]
        data: str = " ".join(str(row[col]) for col in self.data_cols)  # type: ignore
        return HDF5TextInstance(
            k,
            data,
            vec,
            data,
            tokenized=None,
            map_to_original=None,
            split_marker=None,
            external=self,
        )

    def update_external(
        self, ins: Instance[Union[int, str], str, npt.NDArray[Any], str]
    ) -> None:
        """No-op: changes to ``ins`` are not written back by this provider."""
        pass

    @property
    def dataframe(self) -> pd.DataFrame:
        """The table, read from the HDF5 file on every access."""
        df: pd.DataFrame = pd.read_hdf(self.data_storage, self.hdf5_dataset)  # type: ignore
        return df

    def __iter__(self) -> Iterator[Union[int, str]]:
        """Iterate over the identifiers in the ``id_col`` column."""
        # NOTE(review): every key is coerced with int(), so a non-numeric
        # string identifier raises ValueError here -- confirm whether string
        # identifiers are meant to be supported.
        for key in self.dataframe[self.id_col]:  # type: ignore
            yield int(key)  # type: ignore

    def __setitem__(
        self,
        key: Union[int, str],
        value: Instance[Union[int, str], str, npt.NDArray[Any], str],
    ) -> None:
        """No-op: items cannot be stored through this provider."""
        pass

    def __delitem__(self, key: Union[int, str]) -> None:
        """No-op: items cannot be removed through this provider."""
        pass

    def __len__(self) -> int:
        """Return the number of rows in the table."""
        return len(self.dataframe)

    def __contains__(self, key: object) -> bool:
        """Return whether any row has ``key`` in the ``id_col`` column."""
        df = self.dataframe
        return bool((df[self.id_col] == key).any())  # type: ignore

    @property
    def empty(self) -> bool:
        """Whether the table is empty."""
        # ``not dataframe`` raises ValueError (the truth value of a DataFrame
        # is ambiguous); ``DataFrame.empty`` is the supported check.
        return bool(self.dataframe.empty)

    def get_all(self):
        """Yield every item returned by ``self.values()``."""
        yield from list(self.values())

    def clear(self) -> None:
        """No-op: this provider has nothing to clear."""
        pass