From 98cd75d8b895de0e9eb438bc3ee5bc6d20d41299 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 4 Jun 2025 10:48:17 +0200 Subject: [PATCH 01/23] add datatypes to data loaders --- .../systemds/scuro/dataloader/audio_loader.py | 26 +++++++-- .../systemds/scuro/dataloader/base_loader.py | 56 ++++++++++++++++--- .../systemds/scuro/dataloader/json_loader.py | 5 +- .../systemds/scuro/dataloader/text_loader.py | 3 +- .../systemds/scuro/dataloader/video_loader.py | 52 ++++++++++------- .../tests/scuro/test_multimodal_join.py | 2 +- 6 files changed, 107 insertions(+), 37 deletions(-) diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index a0089626802..1197617673f 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -21,6 +21,8 @@ from typing import List, Optional, Union import librosa +import numpy as np + from systemds.scuro.dataloader.base_loader import BaseLoader from systemds.scuro.modality.type import ModalityType @@ -30,19 +32,31 @@ def __init__( self, source_path: str, indices: List[str], + data_type: Union[np.dtype, str] = np.float32, chunk_size: Optional[int] = None, normalize: bool = True, + load=True, ): - super().__init__(source_path, indices, chunk_size, ModalityType.AUDIO) + super().__init__( + source_path, indices, data_type, chunk_size, ModalityType.AUDIO + ) self.normalize = normalize + self.load_data_from_file = load def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.file_sanity_check(file) - audio, sr = librosa.load(file) + if not self.load_data_from_file: + import numpy as np + + self.metadata[file] = self.modality_type.create_audio_metadata( + 1000, np.array([0]) + ) + else: + audio, sr = librosa.load(file, dtype=self._data_type) - if self.normalize: - audio = librosa.util.normalize(audio) + if self.normalize: + audio = librosa.util.normalize(audio) - self.metadata[file] = self.modality_type.create_audio_metadata(sr, audio) + self.metadata[file] = self.modality_type.create_audio_metadata(sr, audio) - self.data.append(audio) + self.data.append(audio) diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index ea2b25bbb48..0d330926ee6 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -21,6 +21,10 @@ import os from abc import ABC, abstractmethod from typing import List, Optional, Union +import math + +import numpy as np +from tensorflow.python.ops.numpy_ops.np_dtypes import int16 class BaseLoader(ABC): @@ -28,6 +32,7 @@ def __init__( self, source_path: str, indices: List[str], + data_type: Union[np.dtype, str], chunk_size: Optional[int] = None, modality_type=None, ): @@ -48,6 +53,7 @@ def __init__( self._next_chunk = 0 self._num_chunks = 1 self._chunk_size = None + self._data_type = self.resolve_data_type(data_type) if chunk_size: self.chunk_size = chunk_size @@ -59,7 +65,7 @@ def chunk_size(self): @chunk_size.setter def chunk_size(self, value): self._chunk_size = value - self._num_chunks = int(len(self.indices) / self._chunk_size) + self._num_chunks = int(math.ceil(len(self.indices) / self._chunk_size)) @property def num_chunks(self): @@ -110,16 +116,25 @@ def _load_next_chunk(self): return self._load(next_chunk_indices) def _load(self, indices: List[str]): - is_dir = True if os.path.isdir(self.source_path) else False + file_names = self.get_file_names(indices) + if isinstance(file_names, str): + self.extract(file_names, indices) + else: + for file_name in file_names: + self.extract(file_name) + return self.data, self.metadata + + def get_file_names(self, indices=None): + is_dir = True if os.path.isdir(self.source_path) else False + file_names = [] if is_dir: _, ext = os.path.splitext(os.listdir(self.source_path)[0]) - for index in indices: - self.extract(self.source_path + index + ext) + for index in self.indices if indices is None else indices: + file_names.append(self.source_path + index + ext) + return file_names else: - self.extract(self.source_path, indices) - - return self.data, self.metadata + return self.source_path @abstractmethod def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): @@ -137,3 +152,30 @@ def file_sanity_check(file): if file_size == 0: raise ("File {0} is empty".format(file)) + + @staticmethod + def resolve_data_type(data_type): + if isinstance(data_type, str): + if data_type.lower() in [ + "float16", + "float32", + "float64", + "int16", + "int32", + "int64", + ]: + return np.dtype(data_type) + else: + raise ValueError(f"Unsupported data_type string: {data_type}") + elif data_type in [ + np.float16, + np.float32, + np.float64, + np.int16, + np.int32, + np.int64, + str, + ]: + return data_type + else: + raise ValueError(f"Unsupported data_type: {data_type}") diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py index edef7f205be..a355edded89 100644 --- a/src/main/python/systemds/scuro/dataloader/json_loader.py +++ b/src/main/python/systemds/scuro/dataloader/json_loader.py @@ -20,6 +20,8 @@ # ------------------------------------------------------------- import json +import numpy as np + from systemds.scuro.modality.type import ModalityType from systemds.scuro.dataloader.base_loader import BaseLoader from typing import Optional, List, Union @@ -31,9 +33,10 @@ def __init__( source_path: str, indices: List[str], field: str, + data_type: Union[np.dtype, str] = str, chunk_size: Optional[int] = None, ): - super().__init__(source_path, indices, chunk_size, ModalityType.TEXT) + super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT) self.field = field def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py index 3f871551479..6689fb6d92b 100644 --- a/src/main/python/systemds/scuro/dataloader/text_loader.py +++ b/src/main/python/systemds/scuro/dataloader/text_loader.py @@ -29,10 +29,11 @@ def __init__( self, source_path: str, indices: List[str], + data_type: str = str, chunk_size: Optional[int] = None, prefix: Optional[Pattern[str]] = None, ): - super().__init__(source_path, indices, chunk_size, ModalityType.TEXT) + super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT) self.prefix = prefix def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py index 333960e698b..03f788148b5 100644 --- a/src/main/python/systemds/scuro/dataloader/video_loader.py +++ b/src/main/python/systemds/scuro/dataloader/video_loader.py @@ -32,36 +32,46 @@ def __init__( self, source_path: str, indices: List[str], + data_type: Union[np.dtype, str] = np.float16, chunk_size: Optional[int] = None, + load=True, ): - super().__init__(source_path, indices, chunk_size, ModalityType.VIDEO) + super().__init__( + source_path, indices, data_type, chunk_size, ModalityType.VIDEO + ) + self.load_data_from_file = load def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.file_sanity_check(file) - cap = cv2.VideoCapture(file) + if not self.load_data_from_file: + self.metadata[file] = self.modality_type.create_video_metadata( + 30, 10, 100, 100, 3 + ) + else: + cap = cv2.VideoCapture(file) - if not cap.isOpened(): - raise f"Could not read video at path: {file}" + if not cap.isOpened(): + raise f"Could not read video at path: {file}" - fps = cap.get(cv2.CAP_PROP_FPS) - length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - num_channels = 3 + fps = cap.get(cv2.CAP_PROP_FPS) + length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + num_channels = 3 - self.metadata[file] = self.modality_type.create_video_metadata( - fps, length, width, height, num_channels - ) + self.metadata[file] = self.modality_type.create_video_metadata( + fps, length, width, height, num_channels + ) - frames = [] - while cap.isOpened(): - ret, frame = cap.read() + frames = [] + while cap.isOpened(): + ret, frame = cap.read() - if not ret: - break - frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - frame = frame.astype(np.float32) / 255.0 + if not ret: + break + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frame = frame.astype(self._data_type) / 255.0 - frames.append(frame) + frames.append(frame) - self.data.append(frames) + self.data.append(frames) diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py index a5e3a7caf9b..bebb52351ba 100644 --- a/src/main/python/tests/scuro/test_multimodal_join.py +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -101,7 +101,7 @@ def _prepare_data(self, l_chunk_size=None, r_chunk_size=None): audio_data_loader = AudioLoader( self.data_generator.get_modality_path(ModalityType.AUDIO), self.data_generator.indices, - r_chunk_size, + chunk_size=r_chunk_size, ) audio = UnimodalModality(audio_data_loader) From 8ec8f0dd712632408e20fac3fb2f643c8febbe68 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 4 Jun 2025 12:59:29 +0200 Subject: [PATCH 02/23] add data_type to modality --- .../systemds/scuro/dataloader/audio_loader.py | 24 ++++----- .../systemds/scuro/dataloader/base_loader.py | 12 ++++- .../systemds/scuro/dataloader/video_loader.py | 50 +++++++++---------- .../systemds/scuro/modality/modality.py | 4 +- .../scuro/modality/unimodal_modality.py | 2 +- .../systemds/scuro/representations/resnet.py | 2 +- .../systemds/scuro/utils/torch_dataset.py | 30 ++++++++++- 7 files changed, 80 insertions(+), 44 deletions(-) diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index 1197617673f..a1dad304e53 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -45,18 +45,18 @@ def __init__( def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.file_sanity_check(file) - if not self.load_data_from_file: - import numpy as np + # if not self.load_data_from_file: + # import numpy as np + # + # self.metadata[file] = self.modality_type.create_audio_metadata( + # 1000, np.array([0]) + # ) + # else: + audio, sr = librosa.load(file, dtype=self._data_type) - self.metadata[file] = self.modality_type.create_audio_metadata( - 1000, np.array([0]) - ) - else: - audio, sr = librosa.load(file, dtype=self._data_type) + if self.normalize: + audio = librosa.util.normalize(audio) - if self.normalize: - audio = librosa.util.normalize(audio) + self.metadata[file] = self.modality_type.create_audio_metadata(sr, audio) - self.metadata[file] = self.modality_type.create_audio_metadata(sr, audio) - - self.data.append(audio) + self.data.append(audio) diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index 0d330926ee6..b9a9f4721c3 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -53,7 +53,7 @@ def __init__( self._next_chunk = 0 self._num_chunks = 1 self._chunk_size = None - self._data_type = self.resolve_data_type(data_type) + self._data_type = data_type if chunk_size: self.chunk_size = chunk_size @@ -74,7 +74,15 @@ def num_chunks(self): @property def next_chunk(self): return self._next_chunk - + + @property + def data_type(self): + return self._data_type + + @data_type.setter + def data_type(self, data_type): + self._data_type = self.resolve_data_type(data_type) + def reset(self): self._next_chunk = 0 self.data = [] diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py index 03f788148b5..b6ec6ec6b11 100644 --- a/src/main/python/systemds/scuro/dataloader/video_loader.py +++ b/src/main/python/systemds/scuro/dataloader/video_loader.py @@ -43,35 +43,35 @@ def __init__( def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.file_sanity_check(file) - if not self.load_data_from_file: - self.metadata[file] = self.modality_type.create_video_metadata( - 30, 10, 100, 100, 3 - ) - else: - cap = cv2.VideoCapture(file) + # if not self.load_data_from_file: + # self.metadata[file] = self.modality_type.create_video_metadata( + # 30, 10, 100, 100, 3 + # ) + # else: + cap = cv2.VideoCapture(file) - if not cap.isOpened(): - raise f"Could not read video at path: {file}" + if not cap.isOpened(): + raise f"Could not read video at path: {file}" - fps = cap.get(cv2.CAP_PROP_FPS) - length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - num_channels = 3 + fps = cap.get(cv2.CAP_PROP_FPS) + length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + num_channels = 3 - self.metadata[file] = self.modality_type.create_video_metadata( - fps, length, width, height, num_channels - ) + self.metadata[file] = self.modality_type.create_video_metadata( + fps, length, width, height, num_channels + ) - frames = [] - while cap.isOpened(): - ret, frame = cap.read() + frames = [] + while cap.isOpened(): + ret, frame = cap.read() - if not ret: - break - frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - frame = frame.astype(self._data_type) / 255.0 + if not ret: + break + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frame = frame.astype(self._data_type) / 255.0 - frames.append(frame) + frames.append(frame) - self.data.append(frames) + self.data.append(frames) diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py index c16db00172c..7700403146c 100644 --- a/src/main/python/systemds/scuro/modality/modality.py +++ b/src/main/python/systemds/scuro/modality/modality.py @@ -29,7 +29,7 @@ class Modality: - def __init__(self, modalityType: ModalityType, modality_id=-1, metadata={}): + def __init__(self, modalityType: ModalityType, modality_id=-1, metadata={}, data_type=None): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the modality @@ -38,7 +38,7 @@ def __init__(self, modalityType: ModalityType, modality_id=-1, metadata={}): self.schema = modalityType.get_schema() self.metadata = metadata self.data = [] - self.data_type = None + self.data_type = data_type self.cost = None self.shape = None self.modality_id = modality_id diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 714fe42c33d..7dfdc38942f 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -37,7 +37,7 @@ def __init__(self, data_loader: BaseLoader): :param data_loader: Defines how the raw data should be loaded :param modality_type: Type of the modality """ - super().__init__(data_loader.modality_type, ModalityIdentifier().new_id(), None) + super().__init__(data_loader.modality_type, ModalityIdentifier().new_id(), None, data_loader.data_type) self.data_loader = data_loader def copy_from_instance(self): diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 68771eccdd3..5c8d6ea6106 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -110,7 +110,7 @@ def _get_parameters(self, high_level=True): return parameters def transform(self, modality): - dataset = CustomDataset(modality.data) + dataset = CustomDataset(modality.data, modality.data_type) embeddings = {} res5c_output = None diff --git a/src/main/python/systemds/scuro/utils/torch_dataset.py b/src/main/python/systemds/scuro/utils/torch_dataset.py index a0f3d88b6a4..2d724b88492 100644 --- a/src/main/python/systemds/scuro/utils/torch_dataset.py +++ b/src/main/python/systemds/scuro/utils/torch_dataset.py @@ -26,14 +26,16 @@ class CustomDataset(torch.utils.data.Dataset): - def __init__(self, data): + def __init__(self, data, data_type): self.data = data + data_type = numpy_dtype_to_torch_dtype(data_type) self.tf = transforms.Compose( [ transforms.ToPILImage(), transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), + transforms.ConvertImageDtype(dtype=data_type), transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ), @@ -61,3 +63,29 @@ def __getitem__(self, index) -> Dict[str, object]: def __len__(self) -> int: return len(self.data) + + +def numpy_dtype_to_torch_dtype(dtype): + """ + Convert a NumPy dtype (or dtype string) to the corresponding PyTorch dtype. + Raises ValueError if the dtype is not supported. + """ + if isinstance(dtype, torch.dtype): + return dtype + + mapping = { + np.float32: torch.float32, + np.float64: torch.float64, + np.float16: torch.bfloat16, + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + } + + np_dtype = np.dtype(dtype) + if np_dtype.type in mapping: + return mapping[np_dtype.type] + else: + raise ValueError(f"No corresponding torch dtype for NumPy dtype {np_dtype}") \ No newline at end of file From a736fba4ab976d5ac45c7f3c408bc038b54beb64 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 4 Jun 2025 20:32:16 +0200 Subject: [PATCH 03/23] add data_type to transformed modality --- .../systemds/scuro/dataloader/base_loader.py | 6 +- .../systemds/scuro/dataloader/video_loader.py | 2 +- .../scuro/drsearch/representation_cache.py | 2 +- .../unimodal_representation_optimizer.py | 58 ++++++------ .../systemds/scuro/modality/modality.py | 6 +- .../systemds/scuro/modality/transformed.py | 23 +++-- .../scuro/modality/unimodal_modality.py | 15 +-- .../aggregated_representation.py | 2 +- .../systemds/scuro/representations/bert.py | 2 +- .../systemds/scuro/representations/bow.py | 2 +- .../scuro/representations/concatenation.py | 2 +- .../systemds/scuro/representations/glove.py | 12 ++- .../scuro/representations/image_bind.py | 94 +++++++++++++++++++ .../systemds/scuro/representations/lstm.py | 18 ++++ .../scuro/representations/mel_spectrogram.py | 2 +- .../systemds/scuro/representations/mfcc.py | 2 +- .../scuro/representations/optical_flow.py | 6 +- .../systemds/scuro/representations/resnet.py | 50 +++++++--- .../scuro/representations/spectrogram.py | 3 +- .../systemds/scuro/representations/tfidf.py | 2 +- .../systemds/scuro/representations/wav2vec.py | 2 +- .../scuro/representations/word2vec.py | 2 +- .../systemds/scuro/representations/x3d.py | 2 +- .../python/systemds/scuro/utils/converter.py | 49 ++++++++++ .../systemds/scuro/utils/torch_dataset.py | 54 +++-------- src/main/python/tests/scuro/data_generator.py | 19 ++-- .../tests/scuro/test_multimodal_join.py | 4 + 27 files changed, 305 insertions(+), 136 deletions(-) create mode 100644 src/main/python/systemds/scuro/representations/image_bind.py create mode 100644 src/main/python/systemds/scuro/utils/converter.py diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index b9a9f4721c3..68c21280f40 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -74,15 +74,15 @@ def num_chunks(self): @property def next_chunk(self): return self._next_chunk - + @property def data_type(self): return self._data_type - + @data_type.setter def data_type(self, data_type): self._data_type = self.resolve_data_type(data_type) - + def reset(self): self._next_chunk = 0 self.data = [] diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py index b6ec6ec6b11..96ea5f11f69 100644 --- a/src/main/python/systemds/scuro/dataloader/video_loader.py +++ b/src/main/python/systemds/scuro/dataloader/video_loader.py @@ -74,4 +74,4 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): frames.append(frame) - self.data.append(frames) + self.data.append(np.stack(frames)) diff --git a/src/main/python/systemds/scuro/drsearch/representation_cache.py b/src/main/python/systemds/scuro/drsearch/representation_cache.py index fc78167f2e1..a560ed068ab 100644 --- a/src/main/python/systemds/scuro/drsearch/representation_cache.py +++ b/src/main/python/systemds/scuro/drsearch/representation_cache.py @@ -112,7 +112,7 @@ def load_from_cache(self, modality, operators): metadata = pickle.load(f) transformed_modality = TransformedModality( - modality.modality_type, op_names, modality.modality_id, metadata + modality, op_names, ) data = None with open(f"{filename}.pkl", "rb") as f: diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_representation_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_representation_optimizer.py index e59ddbe9beb..1a3e6cad05a 100644 --- a/src/main/python/systemds/scuro/drsearch/unimodal_representation_optimizer.py +++ b/src/main/python/systemds/scuro/drsearch/unimodal_representation_optimizer.py @@ -238,34 +238,34 @@ def _apply_operator_chain(self, current_modality, operator_chain): modified_modality = current_modality representation_start = time.time() - try: - cached_representation, representation_ops, used_op_names = ( - self.cache.load_from_cache( - modified_modality, copy.deepcopy(operator_chain) - ) + # try: + cached_representation, representation_ops, used_op_names = ( + self.cache.load_from_cache( + modified_modality, copy.deepcopy(operator_chain) ) - if cached_representation is not None: - modified_modality = cached_representation - store = False - for operator in representation_ops: - if isinstance(operator, Context): - modified_modality = modified_modality.context(operator) - else: - modified_modality = modified_modality.apply_representation(operator) - store = True - op_params[operator.name] = operator.get_current_parameters() - if store: - self.cache.save_to_cache( - modified_modality, used_op_names, representation_ops - ) - representation_end = time.time() - - self._evaluate_operator_chain( - modified_modality, - operator_chain, - op_params, - representation_end - representation_start, + ) + if cached_representation is not None: + modified_modality = cached_representation + store = False + for operator in representation_ops: + if isinstance(operator, Context): + modified_modality = modified_modality.context(operator) + else: + modified_modality = modified_modality.apply_representation(operator) + store = True + op_params[operator.name] = operator.get_current_parameters() + if store: + self.cache.save_to_cache( + modified_modality, used_op_names, representation_ops ) - except Exception as e: - print(f"Failed to evaluate chain {operator_chain}: {str(e)}") - return + representation_end = time.time() + + self._evaluate_operator_chain( + modified_modality, + operator_chain, + op_params, + representation_end - representation_start, + ) + # except Exception as e: + # print(f"Failed to evaluate chain {operator_chain}: {str(e)}") + # return diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py index 7700403146c..5690bc592b4 100644 --- a/src/main/python/systemds/scuro/modality/modality.py +++ b/src/main/python/systemds/scuro/modality/modality.py @@ -29,7 +29,9 @@ class Modality: - def __init__(self, modalityType: ModalityType, modality_id=-1, metadata={}, data_type=None): + def __init__( + self, modalityType: ModalityType, modality_id=-1, metadata={}, data_type=None + ): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the modality @@ -67,7 +69,7 @@ def copy_from_instance(self): """ Create a copy of the modality instance """ - return type(self)(self.modality_type, self.metadata) + return type(self)(self.modality_type, self.modality_id, self.metadata, self.data_type) def update_metadata(self): """ diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py index aba59c1efba..dbb4482cc7f 100644 --- a/src/main/python/systemds/scuro/modality/transformed.py +++ b/src/main/python/systemds/scuro/modality/transformed.py @@ -29,18 +29,22 @@ class TransformedModality(Modality): - def __init__(self, modality_type, transformation, modality_id, metadata): + def __init__(self, modality, transformation, new_modality_type=None): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the original modality(ies) :param transformation: Representation to be applied on the modality """ - super().__init__(modality_type, modality_id, metadata) + if new_modality_type is None: + new_modality_type = modality.modality_type + + metadata = modality.metadata.copy() if modality.metadata is not None else None + super().__init__(new_modality_type, modality.modality_id, metadata, modality.data_type) self.transformation = transformation def copy_from_instance(self): return type(self)( - self.modality_type, self.transformation, self.modality_id, self.metadata + self, self.transformation ) def join(self, right, join_condition): @@ -66,17 +70,17 @@ def join(self, right, join_condition): return joined_modality def window(self, windowSize, aggregation): + w = WindowAggregation(windowSize, aggregation) transformed_modality = TransformedModality( - self.modality_type, "window", self.modality_id, self.metadata + self, w ) - w = WindowAggregation(windowSize, aggregation) transformed_modality.data = w.execute(self) return transformed_modality def context(self, context_operator): transformed_modality = TransformedModality( - self.modality_type, context_operator.name, self.modality_id, self.metadata + self, context_operator ) transformed_modality.data = context_operator.execute(self) @@ -93,12 +97,7 @@ def combine(self, other, fusion_method): :param other: The modality to be combined :param fusion_method: The fusion method to be used to combine modalities """ - fused_modality = TransformedModality( - ModalityType.EMBEDDING, - fusion_method, - self.modality_id, - self.metadata, - ) + fused_modality = TransformedModality( self, fusion_method, ModalityType.EMBEDDING) modalities = [self] if isinstance(other, list): modalities.extend(other) diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 7dfdc38942f..dd4a1139cb1 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -37,7 +37,12 @@ def __init__(self, data_loader: BaseLoader): :param data_loader: Defines how the raw data should be loaded :param modality_type: Type of the modality """ - super().__init__(data_loader.modality_type, ModalityIdentifier().new_id(), None, data_loader.data_type) + super().__init__( + data_loader.modality_type, + ModalityIdentifier().new_id(), + {}, + data_loader.data_type, + ) self.data_loader = data_loader def copy_from_instance(self): @@ -85,7 +90,7 @@ def context(self, context_operator): self.extract_raw_data() transformed_modality = TransformedModality( - self.modality_type, context_operator.name, self.modality_id, self.metadata + self, context_operator ) transformed_modality.data = context_operator.execute(self) @@ -101,10 +106,8 @@ def apply_representations(self, representations): def apply_representation(self, representation): new_modality = TransformedModality( - self.modality_type, - representation.name, - self.modality_id, - self.data_loader.metadata.copy(), + self, + representation, ) new_modality.data = [] diff --git a/src/main/python/systemds/scuro/representations/aggregated_representation.py b/src/main/python/systemds/scuro/representations/aggregated_representation.py index 46e6b8bed2c..fab16a52903 100644 --- a/src/main/python/systemds/scuro/representations/aggregated_representation.py +++ b/src/main/python/systemds/scuro/representations/aggregated_representation.py @@ -29,7 +29,7 @@ def __init__(self, aggregation): def transform(self, modality): aggregated_modality = TransformedModality( - modality.modality_type, self.name, modality.modality_id, modality.metadata + modality, self ) aggregated_modality.data = self.aggregation.execute(modality) return aggregated_modality diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py index 802d7e3d0b3..48e0445371d 100644 --- a/src/main/python/systemds/scuro/representations/bert.py +++ b/src/main/python/systemds/scuro/representations/bert.py @@ -39,7 +39,7 @@ def __init__(self, model_name="bert", output_file=None): def transform(self, modality): transformed_modality = TransformedModality( - modality.modality_type, self, modality.modality_id, modality.metadata + modality, self ) model_name = "bert-base-uncased" tokenizer = BertTokenizer.from_pretrained( diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py index e2bc94041f0..75030c6861e 100644 --- a/src/main/python/systemds/scuro/representations/bow.py +++ b/src/main/python/systemds/scuro/representations/bow.py @@ -40,7 +40,7 @@ def __init__(self, ngram_range=2, min_df=2, output_file=None): def transform(self, modality): transformed_modality = TransformedModality( - modality.modality_type, self, modality.modality_id, modality.metadata + modality, self ) vectorizer = CountVectorizer( ngram_range=(1, self.ngram_range), min_df=self.min_df diff --git a/src/main/python/systemds/scuro/representations/concatenation.py b/src/main/python/systemds/scuro/representations/concatenation.py index 1265563b6cd..a1d163366ba 100644 --- a/src/main/python/systemds/scuro/representations/concatenation.py +++ b/src/main/python/systemds/scuro/representations/concatenation.py @@ -58,7 +58,7 @@ def transform(self, modalities: List[Modality]): [ data, pad_sequences( - modality.data, maxlen=max_emb_size, dtype="float32" + modality.data, maxlen=max_emb_size, dtype=modality.data.dtype ), ], axis=-1, diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py index 66a6847a94c..84d871df068 100644 --- a/src/main/python/systemds/scuro/representations/glove.py +++ b/src/main/python/systemds/scuro/representations/glove.py @@ -21,7 +21,7 @@ import numpy as np from gensim.utils import tokenize - +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.utils import save_embeddings from systemds.scuro.modality.type import ModalityType @@ -46,11 +46,14 @@ def __init__(self, glove_path, output_file=None): self.glove_path = glove_path self.output_file = output_file - def transform(self, data): + def transform(self, modality): + transformed_modality = TransformedModality( + modality, self + ) glove_embeddings = load_glove_embeddings(self.glove_path) embeddings = [] - for sentences in data: + for sentences in modality.data: tokens = list(tokenize(sentences.lower())) embeddings.append( np.mean( @@ -66,4 +69,5 @@ def transform(self, data): if self.output_file is not None: save_embeddings(np.array(embeddings), self.output_file) - return np.array(embeddings) + transformed_modality.data = np.array(embeddings) + return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/image_bind.py b/src/main/python/systemds/scuro/representations/image_bind.py new file mode 100644 index 00000000000..b7b1e31ecec --- /dev/null +++ b/src/main/python/systemds/scuro/representations/image_bind.py @@ -0,0 +1,94 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +import torch +import imagebind.data as data + +from imagebind.models.imagebind_model import ModalityType as IBModalityType + +from imagebind.models import imagebind_model +from systemds.scuro.modality.transformed import TransformedModality +from systemds.scuro.representations.unimodal import UnimodalRepresentation +from systemds.scuro.representations.utils import save_embeddings + +from systemds.scuro.modality.type import ModalityType +from systemds.scuro.drsearch.operator_registry import register_representation + +if torch.backends.mps.is_available(): + DEVICE = torch.device("mps") +# elif torch.cuda.is_available(): +# DEVICE = torch.device("cuda") +else: + DEVICE = torch.device("cpu") + + +@register_representation([ModalityType.TEXT, ModalityType.AUDIO, ModalityType.VIDEO]) +class ImageBind(UnimodalRepresentation): + def __init__(self): + parameters = {} + super().__init__("ImageBind", ModalityType.EMBEDDING, parameters) + + def transform(self, modality): + transformed_modality = TransformedModality( + modality.modality_type, self, modality.modality_id, modality.metadata + ) + + model = imagebind_model.imagebind_huge(pretrained=True) + for param in model.parameters(): + param.requires_grad = False + model.eval() + model.to(DEVICE) + + result = [] + if modality.modality_type == ModalityType.TEXT: + for i, instance in enumerate(modality.data): + text_inputs = data.load_and_transform_text(instance, DEVICE) + text_embeddings = model({IBModalityType.TEXT: text_inputs})[ + IBModalityType.TEXT + ] + result.append(text_embeddings.mean(axis=0).cpu().detach().numpy()) + if modality.modality_type == ModalityType.AUDIO: + audio_inputs = data.load_and_transform_audio_data( + list(modality.metadata), + DEVICE, + ) + audio_embeddings = model({IBModalityType.AUDIO: audio_inputs})[ + IBModalityType.AUDIO + ] + result.append(audio_embeddings.mean(axis=0).cpu().detach().numpy()) + if modality.modality_type == ModalityType.VIDEO: + video_inputs = data.load_and_transform_video_data( + list(modality.metadata)[ + (modality.data_loader.next_chunk - 1) + * (modality.data_loader.chunk_size) : ( + modality.data_loader.next_chunk - 1 + ) + * (modality.data_loader.chunk_size) + + (modality.data_loader.chunk_size) + ], + DEVICE, + ) + video_embeddings = model({IBModalityType.VISION: video_inputs})[ + IBModalityType.VISION + ] + result.append(video_embeddings.mean(axis=0).cpu().detach().numpy()) + + transformed_modality.data = result + return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py index a82a1e2500b..cbab0f68978 100644 --- a/src/main/python/systemds/scuro/representations/lstm.py +++ b/src/main/python/systemds/scuro/representations/lstm.py @@ -18,6 +18,9 @@ # under the License. # # ------------------------------------------------------------- +import os +import random + import torch from torch import nn @@ -31,6 +34,8 @@ from systemds.scuro.drsearch.operator_registry import register_fusion_operator +# TODO: concatenate before embedding +# Make this a hyperparameter @register_fusion_operator() class LSTM(Fusion): def __init__(self, width=128, depth=1, dropout_rate=0.1): @@ -42,8 +47,18 @@ def __init__(self, width=128, depth=1, dropout_rate=0.1): self.width = width self.dropout_rate = dropout_rate self.unimodal_embeddings = {} + seed = 42 + + os.environ["PYTHONHASHSEED"] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False def transform(self, modalities: List[Modality]): + self.unimodal_embeddings = {} size = len(modalities[0].data) result = np.zeros((size, 0)) @@ -60,6 +75,9 @@ def transform(self, modalities: List[Modality]): return result def run_lstm(self, data): + if isinstance(data, list): + data = np.array(data) + d = data.astype(np.float32) dim = d.shape[-1] d = torch.from_numpy(d) diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py index 4095ceead0d..8c14c03ac60 100644 --- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py +++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py @@ -43,7 +43,7 @@ def __init__(self, n_mels=128, hop_length=512, n_fft=2048): def transform(self, modality): transformed_modality = TransformedModality( - self.output_modality_type, self, modality.modality_id, modality.metadata + modality, self, self.output_modality_type ) result = [] max_length = 0 diff --git a/src/main/python/systemds/scuro/representations/mfcc.py b/src/main/python/systemds/scuro/representations/mfcc.py index 75cc00d62d9..37e60736528 100644 --- a/src/main/python/systemds/scuro/representations/mfcc.py +++ b/src/main/python/systemds/scuro/representations/mfcc.py @@ -45,7 +45,7 @@ def __init__(self, n_mfcc=12, dct_type=2, n_mels=128, hop_length=512): def transform(self, modality): transformed_modality = TransformedModality( - self.output_modality_type, self, modality.modality_id, modality.metadata + modality, self,self.output_modality_type ) result = [] max_length = 0 diff --git a/src/main/python/systemds/scuro/representations/optical_flow.py b/src/main/python/systemds/scuro/representations/optical_flow.py index 1fb922d7a36..b15a4283231 100644 --- a/src/main/python/systemds/scuro/representations/optical_flow.py +++ b/src/main/python/systemds/scuro/representations/optical_flow.py @@ -48,10 +48,8 @@ def __init__(self): def transform(self, modality): transformed_modality = TransformedModality( - self.output_modality_type, - "opticalFlow", - modality.modality_id, - modality.metadata, + modality, self, self.output_modality_type + ) for video_id, instance in enumerate(modality.data): diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 5c8d6ea6106..bdfbfb17fc0 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -18,10 +18,11 @@ # under the License. # # ------------------------------------------------------------- +from systemds.scuro.utils.converter import numpy_dtype_to_torch_dtype from systemds.scuro.utils.torch_dataset import CustomDataset from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation -from typing import Callable, Dict, Tuple, Any +from typing import Tuple, Any from systemds.scuro.drsearch.operator_registry import register_representation import torch.utils.data import torch @@ -42,6 +43,7 @@ ) class ResNet(UnimodalRepresentation): def __init__(self, layer="avgpool", model_name="ResNet18", output_file=None): + self.data_type = torch.bfloat16 self.model_name = model_name parameters = self._get_parameters() super().__init__( @@ -68,25 +70,38 @@ def model_name(self): def model_name(self, model_name): self._model_name = model_name if model_name == "ResNet18": - self.model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT).to( - DEVICE + self.model = ( + models.resnet18(weights=models.ResNet18_Weights.DEFAULT) + .to(DEVICE) + .to(self.data_type) ) + elif model_name == "ResNet34": self.model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to( DEVICE ) + self.model = self.model.to(self.data_type) elif model_name == "ResNet50": - self.model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT).to( - DEVICE + self.model = ( + models.resnet50(weights=models.ResNet50_Weights.DEFAULT) + .to(DEVICE) + .to(self.data_type) ) + elif model_name == "ResNet101": - self.model = models.resnet101(weights=models.ResNet101_Weights.DEFAULT).to( - DEVICE + self.model = ( + models.resnet101(weights=models.ResNet101_Weights.DEFAULT) + .to(DEVICE) + .to(self.data_type) ) + elif model_name == "ResNet152": - self.model = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to( - DEVICE + self.model = ( + models.resnet152(weights=models.ResNet152_Weights.DEFAULT) + .to(DEVICE) + .to(self.data_type) ) + else: raise NotImplementedError @@ -110,7 +125,11 @@ def _get_parameters(self, high_level=True): return parameters def transform(self, modality): - dataset = CustomDataset(modality.data, modality.data_type) + self.data_type = numpy_dtype_to_torch_dtype(modality.data_type) + if next(self.model.parameters()).dtype != self.data_type: + self.model = self.model.to(self.data_type) + + dataset = CustomDataset(modality.data, self.data_type, DEVICE) embeddings = {} res5c_output = None @@ -132,7 +151,7 @@ def hook( for instance in torch.utils.data.DataLoader(dataset): video_id = instance["id"][0] - frames = instance["data"][0].to(DEVICE) + frames = instance["data"][0] embeddings[video_id] = [] batch_size = 64 @@ -146,13 +165,18 @@ def hook( pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1)) embeddings[video_id].extend( - torch.flatten(pooled, 1).detach().cpu().numpy() + torch.flatten(pooled, 1) + .detach() + .cpu() + .float() + .numpy() + .astype(modality.data_type) ) embeddings[video_id] = np.array(embeddings[video_id]) transformed_modality = TransformedModality( - self.output_modality_type, "resnet", modality.modality_id, modality.metadata + modality, self, self.output_modality_type ) transformed_modality.data = list(embeddings.values()) diff --git a/src/main/python/systemds/scuro/representations/spectrogram.py b/src/main/python/systemds/scuro/representations/spectrogram.py index b5558b1b264..d531ebc99da 100644 --- a/src/main/python/systemds/scuro/representations/spectrogram.py +++ b/src/main/python/systemds/scuro/representations/spectrogram.py @@ -37,8 +37,7 @@ def __init__(self, hop_length=512, n_fft=2048): self.n_fft = n_fft def transform(self, modality): - transformed_modality = TransformedModality( - self.output_modality_type, self, modality.modality_id, modality.metadata + transformed_modality = TransformedModality( modality, self, self.output_modality_type ) result = [] max_length = 0 diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py index c17527b4765..461fb36c042 100644 --- a/src/main/python/systemds/scuro/representations/tfidf.py +++ b/src/main/python/systemds/scuro/representations/tfidf.py @@ -39,7 +39,7 @@ def __init__(self, min_df=2, output_file=None): def transform(self, modality): transformed_modality = TransformedModality( - modality.modality_type, self, modality.modality_id, modality.metadata + modality, self ) vectorizer = TfidfVectorizer(min_df=self.min_df) diff --git a/src/main/python/systemds/scuro/representations/wav2vec.py b/src/main/python/systemds/scuro/representations/wav2vec.py index bf251b101c6..29f5bcbea02 100644 --- a/src/main/python/systemds/scuro/representations/wav2vec.py +++ b/src/main/python/systemds/scuro/representations/wav2vec.py @@ -46,7 +46,7 @@ def __init__(self): def transform(self, modality): transformed_modality = TransformedModality( - self.output_modality_type, self, modality.modality_id, modality.metadata + modality, self, self.output_modality_type ) result = [] diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py index e1d1669d9bc..8025b48222f 100644 --- a/src/main/python/systemds/scuro/representations/word2vec.py +++ b/src/main/python/systemds/scuro/representations/word2vec.py @@ -55,7 +55,7 @@ def __init__(self, vector_size=3, min_count=2, window=2, output_file=None): def transform(self, modality): transformed_modality = TransformedModality( - modality.modality_type, self, modality.modality_id, modality.metadata + modality, self ) t = [list(tokenize(s.lower())) for s in modality.data] model = Word2Vec( diff --git a/src/main/python/systemds/scuro/representations/x3d.py b/src/main/python/systemds/scuro/representations/x3d.py index bb5d1ec5ed7..b518f68ef54 100644 --- a/src/main/python/systemds/scuro/representations/x3d.py +++ b/src/main/python/systemds/scuro/representations/x3d.py @@ -127,7 +127,7 @@ def hook( embeddings[video_id] = np.array(embeddings[video_id]) transformed_modality = TransformedModality( - self.output_modality_type, "x3d", modality.modality_id, modality.metadata + modality, self, self.output_modality_type ) transformed_modality.data = list(embeddings.values()) diff --git a/src/main/python/systemds/scuro/utils/converter.py b/src/main/python/systemds/scuro/utils/converter.py new file mode 100644 index 00000000000..030fc4ae29d --- /dev/null +++ b/src/main/python/systemds/scuro/utils/converter.py @@ -0,0 +1,49 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +import numpy as np +import torch + + +def numpy_dtype_to_torch_dtype(dtype): + """ + Convert a NumPy dtype (or dtype string) to the corresponding PyTorch dtype. + Raises ValueError if the dtype is not supported. + """ + if isinstance(dtype, torch.dtype): + return dtype + + mapping = { + np.float32: torch.float32, + np.float64: torch.float64, + np.float16: torch.bfloat16, + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + } + + np_dtype = np.dtype(dtype) + if np_dtype.type in mapping: + return mapping[np_dtype.type] + else: + raise ValueError(f"No corresponding torch dtype for NumPy dtype {np_dtype}") diff --git a/src/main/python/systemds/scuro/utils/torch_dataset.py b/src/main/python/systemds/scuro/utils/torch_dataset.py index 2d724b88492..314dfcd5fc1 100644 --- a/src/main/python/systemds/scuro/utils/torch_dataset.py +++ b/src/main/python/systemds/scuro/utils/torch_dataset.py @@ -20,22 +20,22 @@ # ------------------------------------------------------------- from typing import Dict -import numpy as np import torch import torchvision.transforms as transforms class CustomDataset(torch.utils.data.Dataset): - def __init__(self, data, data_type): + def __init__(self, data, data_type, device): self.data = data - data_type = numpy_dtype_to_torch_dtype(data_type) + self.data_type = data_type + self.device = device self.tf = transforms.Compose( [ transforms.ToPILImage(), transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), - transforms.ConvertImageDtype(dtype=data_type), + transforms.ConvertImageDtype(dtype=self.data_type), transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ), @@ -44,48 +44,18 @@ def __init__(self, data, data_type): def __getitem__(self, index) -> Dict[str, object]: data = self.data[index] - if type(data) is np.ndarray: - output = torch.empty((1, 3, 224, 224)) - d = torch.tensor(data) - d = d.repeat(3, 1, 1) - output[0] = self.tf(d) - else: - output = torch.empty((len(data), 3, 224, 224)) + output = torch.empty( + (len(data), 3, 224, 224), dtype=self.data_type, device=self.device + ) - for i, d in enumerate(data): - if data[0].ndim < 3: - d = torch.tensor(d) - d = d.repeat(3, 1, 1) + for i, d in enumerate(data): + if data[0].ndim < 3: + d = torch.tensor(d) + d = d.repeat(3, 1, 1) - output[i] = self.tf(d) + output[i] = self.tf(d) return {"id": index, "data": output} def __len__(self) -> int: return len(self.data) - - -def numpy_dtype_to_torch_dtype(dtype): - """ - Convert a NumPy dtype (or dtype string) to the corresponding PyTorch dtype. - Raises ValueError if the dtype is not supported. - """ - if isinstance(dtype, torch.dtype): - return dtype - - mapping = { - np.float32: torch.float32, - np.float64: torch.float64, - np.float16: torch.bfloat16, - np.uint8: torch.uint8, - np.int8: torch.int8, - np.int16: torch.int16, - np.int32: torch.int32, - np.int64: torch.int64, - } - - np_dtype = np.dtype(dtype) - if np_dtype.type in mapping: - return mapping[np_dtype.type] - else: - raise ValueError(f"No corresponding torch dtype for NumPy dtype {np_dtype}") \ No newline at end of file diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index e31887ff833..6c6ecc029c7 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -37,7 +37,10 @@ class ModalityRandomDataGenerator: def __init__(self): - self._modality_id = 0 + self.modality_id = 0 + self.modality_type = None + self.metadata = {} + self.data_type = np.float32 def create1DModality( self, @@ -46,29 +49,31 @@ def create1DModality( modality_type, ): data = np.random.rand(num_instances, num_features) + data.dtype = self.data_type + # TODO: write a dummy method to create the same metadata for all instances to avoid the for loop - metadata = {} + self.modality_type = modality_type for i in range(num_instances): if modality_type == ModalityType.AUDIO: - metadata[i] = modality_type.create_audio_metadata( + self.metadata[i] = modality_type.create_audio_metadata( num_features / 10, data[i] ) elif modality_type == ModalityType.TEXT: - metadata[i] = modality_type.create_text_metadata( + self.metadata[i] = modality_type.create_text_metadata( num_features / 10, data[i] ) elif modality_type == ModalityType.VIDEO: - metadata[i] = modality_type.create_video_metadata( + self.metadata[i] = modality_type.create_video_metadata( num_features / 30, 10, 0, 0, 1 ) else: raise NotImplementedError tf_modality = TransformedModality( - modality_type, "test_transformation", self._modality_id, metadata + self, "test_transformation" ) tf_modality.data = data - self._modality_id += 1 + self.modality_id += 1 return tf_modality diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py index bebb52351ba..6f1497ae133 100644 --- a/src/main/python/tests/scuro/test_multimodal_join.py +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -23,6 +23,8 @@ import shutil import unittest +import numpy as np + from systemds.scuro.modality.joined import JoinCondition from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.representations.mel_spectrogram import MelSpectrogram @@ -94,6 +96,7 @@ def _prepare_data(self, l_chunk_size=None, r_chunk_size=None): video_data_loader = VideoLoader( self.data_generator.get_modality_path(ModalityType.VIDEO), self.data_generator.indices, + data_type=np.float32, chunk_size=l_chunk_size, ) video = UnimodalModality(video_data_loader) @@ -101,6 +104,7 @@ def _prepare_data(self, l_chunk_size=None, r_chunk_size=None): audio_data_loader = AudioLoader( self.data_generator.get_modality_path(ModalityType.AUDIO), self.data_generator.indices, + data_type=np.float32, chunk_size=r_chunk_size, ) audio = UnimodalModality(audio_data_loader) From 1dfdb2b60d5895077b3ea42aa9b495b56e0e2354 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Thu, 5 Jun 2025 14:45:29 +0200 Subject: [PATCH 04/23] change function name to window_aggregation --- src/main/python/systemds/scuro/__init__.py | 2 +- .../scuro/drsearch/representation_cache.py | 3 ++- .../unimodal_representation_optimizer.py | 4 +-- .../scuro/modality/joined_transformed.py | 4 +-- .../systemds/scuro/modality/modality.py | 4 ++- .../systemds/scuro/modality/transformed.py | 26 +++++++++---------- .../scuro/modality/unimodal_modality.py | 4 +-- .../aggregated_representation.py | 4 +-- .../systemds/scuro/representations/bert.py | 4 +-- .../systemds/scuro/representations/bow.py | 4 +-- .../scuro/representations/concatenation.py | 4 ++- .../systemds/scuro/representations/glove.py | 4 +-- .../systemds/scuro/representations/mfcc.py | 2 +- .../scuro/representations/optical_flow.py | 1 - .../scuro/representations/spectrogram.py | 3 ++- .../systemds/scuro/representations/tfidf.py | 4 +-- .../{window.py => window_aggregation.py} | 0 .../scuro/representations/word2vec.py | 4 +-- src/main/python/tests/scuro/test_dr_search.py | 2 +- .../tests/scuro/test_multimodal_join.py | 2 +- .../tests/scuro/test_operator_registry.py | 2 +- .../tests/scuro/test_window_operations.py | 2 +- 22 files changed, 38 insertions(+), 51 deletions(-) rename src/main/python/systemds/scuro/representations/{window.py => window_aggregation.py} (100%) diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py index 4b2185316a0..b878200b0b4 100644 --- a/src/main/python/systemds/scuro/__init__.py +++ b/src/main/python/systemds/scuro/__init__.py @@ -52,7 +52,7 @@ from systemds.scuro.representations.tfidf import TfIdf from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.wav2vec import Wav2Vec -from systemds.scuro.representations.window import WindowAggregation +from systemds.scuro.representations.window_aggregation import WindowAggregation from systemds.scuro.representations.word2vec import W2V from systemds.scuro.representations.x3d import X3D from systemds.scuro.models.model import Model diff --git a/src/main/python/systemds/scuro/drsearch/representation_cache.py b/src/main/python/systemds/scuro/drsearch/representation_cache.py index a560ed068ab..4df478272df 100644 --- a/src/main/python/systemds/scuro/drsearch/representation_cache.py +++ b/src/main/python/systemds/scuro/drsearch/representation_cache.py @@ -112,7 +112,8 @@ def load_from_cache(self, modality, operators): metadata = pickle.load(f) transformed_modality = TransformedModality( - modality, op_names, + modality, + op_names, ) data = None with open(f"{filename}.pkl", "rb") as f: diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_representation_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_representation_optimizer.py index 1a3e6cad05a..a08846eb49b 100644 --- a/src/main/python/systemds/scuro/drsearch/unimodal_representation_optimizer.py +++ b/src/main/python/systemds/scuro/drsearch/unimodal_representation_optimizer.py @@ -240,9 +240,7 @@ def _apply_operator_chain(self, current_modality, operator_chain): representation_start = time.time() # try: cached_representation, representation_ops, used_op_names = ( - self.cache.load_from_cache( - modified_modality, copy.deepcopy(operator_chain) - ) + self.cache.load_from_cache(modified_modality, copy.deepcopy(operator_chain)) ) if cached_representation is not None: modified_modality = cached_representation diff --git a/src/main/python/systemds/scuro/modality/joined_transformed.py b/src/main/python/systemds/scuro/modality/joined_transformed.py index a0ab8c4ce9a..6c6190e03cc 100644 --- a/src/main/python/systemds/scuro/modality/joined_transformed.py +++ b/src/main/python/systemds/scuro/modality/joined_transformed.py @@ -25,7 +25,7 @@ from systemds.scuro.modality.modality import Modality from systemds.scuro.representations.utils import pad_sequences -from systemds.scuro.representations.window import WindowAggregation +from systemds.scuro.representations.window_aggregation import WindowAggregation class JoinedTransformedModality(Modality): @@ -70,7 +70,7 @@ def combine(self, fusion_method): self.data = pad_sequences(self.data) return self - def window(self, window_size, aggregation): + def window_aggregation(self, window_size, aggregation): w = WindowAggregation(window_size, aggregation) self.left_modality.data = w.execute(self.left_modality) self.right_modality.data = w.execute(self.right_modality) diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py index 5690bc592b4..87d5b5ee4e4 100644 --- a/src/main/python/systemds/scuro/modality/modality.py +++ b/src/main/python/systemds/scuro/modality/modality.py @@ -69,7 +69,9 @@ def copy_from_instance(self): """ Create a copy of the modality instance """ - return type(self)(self.modality_type, self.modality_id, self.metadata, self.data_type) + return type(self)( + self.modality_type, self.modality_id, self.metadata, self.data_type + ) def update_metadata(self): """ diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py index dbb4482cc7f..362764d21e9 100644 --- a/src/main/python/systemds/scuro/modality/transformed.py +++ b/src/main/python/systemds/scuro/modality/transformed.py @@ -24,7 +24,7 @@ from systemds.scuro.modality.type import ModalityType from systemds.scuro.modality.joined import JoinedModality from systemds.scuro.modality.modality import Modality -from systemds.scuro.representations.window import WindowAggregation +from systemds.scuro.representations.window_aggregation import WindowAggregation class TransformedModality(Modality): @@ -37,15 +37,15 @@ def __init__(self, modality, transformation, new_modality_type=None): """ if new_modality_type is None: new_modality_type = modality.modality_type - + metadata = modality.metadata.copy() if modality.metadata is not None else None - super().__init__(new_modality_type, modality.modality_id, metadata, modality.data_type) + super().__init__( + new_modality_type, modality.modality_id, metadata, modality.data_type + ) self.transformation = transformation def copy_from_instance(self): - return type(self)( - self, self.transformation - ) + return type(self)(self, self.transformation) def join(self, right, join_condition): chunked_execution = False @@ -69,19 +69,15 @@ def join(self, right, join_condition): return joined_modality - def window(self, windowSize, aggregation): + def window_aggregation(self, windowSize, aggregation): w = WindowAggregation(windowSize, aggregation) - transformed_modality = TransformedModality( - self, w - ) + transformed_modality = TransformedModality(self, w) transformed_modality.data = w.execute(self) return transformed_modality def context(self, context_operator): - transformed_modality = TransformedModality( - self, context_operator - ) + transformed_modality = TransformedModality(self, context_operator) transformed_modality.data = context_operator.execute(self) return transformed_modality @@ -97,7 +93,9 @@ def combine(self, other, fusion_method): :param other: The modality to be combined :param fusion_method: The fusion method to be used to combine modalities """ - fused_modality = TransformedModality( self, fusion_method, ModalityType.EMBEDDING) + fused_modality = TransformedModality( + self, fusion_method, ModalityType.EMBEDDING + ) modalities = [self] if isinstance(other, list): modalities.extend(other) diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index dd4a1139cb1..c0ee70557c5 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -89,9 +89,7 @@ def context(self, context_operator): if not self.has_data(): self.extract_raw_data() - transformed_modality = TransformedModality( - self, context_operator - ) + transformed_modality = TransformedModality(self, context_operator) transformed_modality.data = context_operator.execute(self) return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/aggregated_representation.py b/src/main/python/systemds/scuro/representations/aggregated_representation.py index fab16a52903..9412c5be008 100644 --- a/src/main/python/systemds/scuro/representations/aggregated_representation.py +++ b/src/main/python/systemds/scuro/representations/aggregated_representation.py @@ -28,8 +28,6 @@ def __init__(self, aggregation): self.aggregation = aggregation def transform(self, modality): - aggregated_modality = TransformedModality( - modality, self - ) + aggregated_modality = TransformedModality(modality, self) aggregated_modality.data = self.aggregation.execute(modality) return aggregated_modality diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py index 48e0445371d..5aa073ed820 100644 --- a/src/main/python/systemds/scuro/representations/bert.py +++ b/src/main/python/systemds/scuro/representations/bert.py @@ -38,9 +38,7 @@ def __init__(self, model_name="bert", output_file=None): self.output_file = output_file def transform(self, modality): - transformed_modality = TransformedModality( - modality, self - ) + transformed_modality = TransformedModality(modality, self) model_name = "bert-base-uncased" tokenizer = BertTokenizer.from_pretrained( model_name, clean_up_tokenization_spaces=True diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py index 75030c6861e..6778811c49c 100644 --- a/src/main/python/systemds/scuro/representations/bow.py +++ b/src/main/python/systemds/scuro/representations/bow.py @@ -39,9 +39,7 @@ def __init__(self, ngram_range=2, min_df=2, output_file=None): self.output_file = output_file def transform(self, modality): - transformed_modality = TransformedModality( - modality, self - ) + transformed_modality = TransformedModality(modality, self) vectorizer = CountVectorizer( ngram_range=(1, self.ngram_range), min_df=self.min_df ) diff --git a/src/main/python/systemds/scuro/representations/concatenation.py b/src/main/python/systemds/scuro/representations/concatenation.py index a1d163366ba..c7ce33ab5c7 100644 --- a/src/main/python/systemds/scuro/representations/concatenation.py +++ b/src/main/python/systemds/scuro/representations/concatenation.py @@ -58,7 +58,9 @@ def transform(self, modalities: List[Modality]): [ data, pad_sequences( - modality.data, maxlen=max_emb_size, dtype=modality.data.dtype + modality.data, + maxlen=max_emb_size, + dtype=modality.data.dtype, ), ], axis=-1, diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py index 84d871df068..d948567f3f5 100644 --- a/src/main/python/systemds/scuro/representations/glove.py +++ b/src/main/python/systemds/scuro/representations/glove.py @@ -47,9 +47,7 @@ def __init__(self, glove_path, output_file=None): self.output_file = output_file def transform(self, modality): - transformed_modality = TransformedModality( - modality, self - ) + transformed_modality = TransformedModality(modality, self) glove_embeddings = load_glove_embeddings(self.glove_path) embeddings = [] diff --git a/src/main/python/systemds/scuro/representations/mfcc.py b/src/main/python/systemds/scuro/representations/mfcc.py index 37e60736528..234e93246fd 100644 --- a/src/main/python/systemds/scuro/representations/mfcc.py +++ b/src/main/python/systemds/scuro/representations/mfcc.py @@ -45,7 +45,7 @@ def __init__(self, n_mfcc=12, dct_type=2, n_mels=128, hop_length=512): def transform(self, modality): transformed_modality = TransformedModality( - modality, self,self.output_modality_type + modality, self, self.output_modality_type ) result = [] max_length = 0 diff --git a/src/main/python/systemds/scuro/representations/optical_flow.py b/src/main/python/systemds/scuro/representations/optical_flow.py index b15a4283231..27817302d4a 100644 --- a/src/main/python/systemds/scuro/representations/optical_flow.py +++ b/src/main/python/systemds/scuro/representations/optical_flow.py @@ -49,7 +49,6 @@ def __init__(self): def transform(self, modality): transformed_modality = TransformedModality( modality, self, self.output_modality_type - ) for video_id, instance in enumerate(modality.data): diff --git a/src/main/python/systemds/scuro/representations/spectrogram.py b/src/main/python/systemds/scuro/representations/spectrogram.py index d531ebc99da..6a713a3d21c 100644 --- a/src/main/python/systemds/scuro/representations/spectrogram.py +++ b/src/main/python/systemds/scuro/representations/spectrogram.py @@ -37,7 +37,8 @@ def __init__(self, hop_length=512, n_fft=2048): self.n_fft = n_fft def transform(self, modality): - transformed_modality = TransformedModality( modality, self, self.output_modality_type + transformed_modality = TransformedModality( + modality, self, self.output_modality_type ) result = [] max_length = 0 diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py index 461fb36c042..1df5a1fde08 100644 --- a/src/main/python/systemds/scuro/representations/tfidf.py +++ b/src/main/python/systemds/scuro/representations/tfidf.py @@ -38,9 +38,7 @@ def __init__(self, min_df=2, output_file=None): self.output_file = output_file def transform(self, modality): - transformed_modality = TransformedModality( - modality, self - ) + transformed_modality = TransformedModality(modality, self) vectorizer = TfidfVectorizer(min_df=self.min_df) diff --git a/src/main/python/systemds/scuro/representations/window.py b/src/main/python/systemds/scuro/representations/window_aggregation.py similarity index 100% rename from src/main/python/systemds/scuro/representations/window.py rename to src/main/python/systemds/scuro/representations/window_aggregation.py diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py index 8025b48222f..0210207a013 100644 --- a/src/main/python/systemds/scuro/representations/word2vec.py +++ b/src/main/python/systemds/scuro/representations/word2vec.py @@ -54,9 +54,7 @@ def __init__(self, vector_size=3, min_count=2, window=2, output_file=None): self.output_file = output_file def transform(self, modality): - transformed_modality = TransformedModality( - modality, self - ) + transformed_modality = TransformedModality(modality, self) t = [list(tokenize(s.lower())) for s in modality.data] model = Word2Vec( sentences=t, diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index 521ff3f468c..d896815ab24 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -113,7 +113,7 @@ def setUpClass(cls): cls.resnet = ( cls.data_generator.modalities_by_type[ModalityType.VIDEO] .apply_representation(ResNet()) - .window(10, "mean") + .window_aggregation(10, "mean") .flatten() ) cls.mods = [cls.bert, cls.mel_spe, cls.resnet] diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py index 6f1497ae133..e65eb935bca 100644 --- a/src/main/python/tests/scuro/test_multimodal_join.py +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -119,7 +119,7 @@ def _join(self, left_modality, right_modality, window_size): right_modality, JoinCondition("timestamp", "timestamp", "<") ) .apply_representation(ResNet(layer="layer1.0.conv2", model_name="ResNet50")) - .window(window_size, "mean") + .window_aggregation(window_size, "mean") .combine("concat") ) diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py index aaecde2991c..e35bf5b1d2c 100644 --- a/src/main/python/tests/scuro/test_operator_registry.py +++ b/src/main/python/tests/scuro/test_operator_registry.py @@ -23,7 +23,7 @@ from systemds.scuro.representations.mfcc import MFCC from systemds.scuro.representations.wav2vec import Wav2Vec -from systemds.scuro.representations.window import WindowAggregation +from systemds.scuro.representations.window_aggregation import WindowAggregation from systemds.scuro.representations.bow import BoW from systemds.scuro.representations.word2vec import W2V from systemds.scuro.representations.tfidf import TfIdf diff --git a/src/main/python/tests/scuro/test_window_operations.py b/src/main/python/tests/scuro/test_window_operations.py index d7210ddb6dc..ea1b0f46f2e 100644 --- a/src/main/python/tests/scuro/test_window_operations.py +++ b/src/main/python/tests/scuro/test_window_operations.py @@ -51,7 +51,7 @@ def test_window_operations_on_text_representations(self): def run_window_operations_for_modality(self, modality_type, window_size): r = self.data_generator.create1DModality(40, 100, modality_type) for aggregation in self.aggregations: - windowed_modality = r.window(window_size, aggregation) + windowed_modality = r.window_aggregation(window_size, aggregation) self.verify_window_operation(aggregation, r, windowed_modality, window_size) From 3f1c16a899ad1b169e5a7ad480253d5f2b6d56de Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Thu, 5 Jun 2025 15:02:19 +0200 Subject: [PATCH 05/23] remove library --- src/main/python/systemds/scuro/dataloader/base_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index 68c21280f40..f21f212e7a0 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -24,7 +24,6 @@ import math import numpy as np -from tensorflow.python.ops.numpy_ops.np_dtypes import int16 class BaseLoader(ABC): From da9e9ac32767bf5f8a44b3b3fba7203a2b6ca0aa Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Thu, 5 Jun 2025 18:22:44 +0200 Subject: [PATCH 06/23] formatting --- src/main/python/tests/scuro/data_generator.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index 6c6ecc029c7..ee8c699b515 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -50,7 +50,7 @@ def create1DModality( ): data = np.random.rand(num_instances, num_features) data.dtype = self.data_type - + # TODO: write a dummy method to create the same metadata for all instances to avoid the for loop self.modality_type = modality_type for i in range(num_instances): @@ -69,9 +69,7 @@ def create1DModality( else: raise NotImplementedError - tf_modality = TransformedModality( - self, "test_transformation" - ) + tf_modality = TransformedModality(self, "test_transformation") tf_modality.data = data self.modality_id += 1 return tf_modality From 809c242cc4c47655fced324e401a05d6b62350b1 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Fri, 6 Jun 2025 12:04:00 +0200 Subject: [PATCH 07/23] add data_type to test data generator --- src/main/python/tests/scuro/data_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index ee8c699b515..2fd667ce36e 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -48,7 +48,7 @@ def create1DModality( num_features, modality_type, ): - data = np.random.rand(num_instances, num_features) + data = np.random.rand(num_instances, num_features).astype(self.data_type) data.dtype = self.data_type # TODO: write a dummy method to create the same metadata for all instances to avoid the for loop From 2103b40cca61463652f988cf61ed6d0f29ed4171 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Sat, 7 Jun 2025 14:59:31 +0200 Subject: [PATCH 08/23] enable try except for invalid representations --- .../unimodal_representation_optimizer.py | 56 ++++++++++--------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_representation_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_representation_optimizer.py index a08846eb49b..e59ddbe9beb 100644 --- a/src/main/python/systemds/scuro/drsearch/unimodal_representation_optimizer.py +++ b/src/main/python/systemds/scuro/drsearch/unimodal_representation_optimizer.py @@ -238,32 +238,34 @@ def _apply_operator_chain(self, current_modality, operator_chain): modified_modality = current_modality representation_start = time.time() - # try: - cached_representation, representation_ops, used_op_names = ( - self.cache.load_from_cache(modified_modality, copy.deepcopy(operator_chain)) - ) - if cached_representation is not None: - modified_modality = cached_representation - store = False - for operator in representation_ops: - if isinstance(operator, Context): - modified_modality = modified_modality.context(operator) - else: - modified_modality = modified_modality.apply_representation(operator) - store = True - op_params[operator.name] = operator.get_current_parameters() - if store: - self.cache.save_to_cache( - modified_modality, used_op_names, representation_ops + try: + cached_representation, representation_ops, used_op_names = ( + self.cache.load_from_cache( + modified_modality, copy.deepcopy(operator_chain) + ) ) - representation_end = time.time() + if cached_representation is not None: + modified_modality = cached_representation + store = False + for operator in representation_ops: + if isinstance(operator, Context): + modified_modality = modified_modality.context(operator) + else: + modified_modality = modified_modality.apply_representation(operator) + store = True + op_params[operator.name] = operator.get_current_parameters() + if store: + self.cache.save_to_cache( + modified_modality, used_op_names, representation_ops + ) + representation_end = time.time() - self._evaluate_operator_chain( - modified_modality, - operator_chain, - op_params, - representation_end - representation_start, - ) - # except Exception as e: - # print(f"Failed to evaluate chain {operator_chain}: {str(e)}") - # return + self._evaluate_operator_chain( + modified_modality, + operator_chain, + op_params, + representation_end - representation_start, + ) + except Exception as e: + print(f"Failed to evaluate chain {operator_chain}: {str(e)}") + return From ddaa329363e8d5aed20c7f1001ccebcf20fd0a3e Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Mon, 30 Jun 2025 11:42:49 +0200 Subject: [PATCH 09/23] change audio frequency for tests --- src/main/python/tests/scuro/data_generator.py | 2 +- src/main/python/tests/scuro/test_multimodal_join.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index 2fd667ce36e..545487c40b8 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -205,7 +205,7 @@ def __create_text_data(self, idx, speed_factor): def __create_audio_data(self, idx, duration, speed_factor): path = f"{self.path}/AUDIO/{idx}.wav" - sample_rate = 44100 + sample_rate = 16000 t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False) frequency_variation = random.uniform(200.0, 500.0) diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py index e65eb935bca..a871e4297bc 100644 --- a/src/main/python/tests/scuro/test_multimodal_join.py +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -96,7 +96,7 @@ def _prepare_data(self, l_chunk_size=None, r_chunk_size=None): video_data_loader = VideoLoader( self.data_generator.get_modality_path(ModalityType.VIDEO), self.data_generator.indices, - data_type=np.float32, + data_type=np.float16, chunk_size=l_chunk_size, ) video = UnimodalModality(video_data_loader) From d5253f0071a1bb8e598e387d1a74ce739b409cba Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Mon, 30 Jun 2025 13:21:40 +0200 Subject: [PATCH 10/23] testing --- .../python/tests/scuro/test_data_loaders.py | 127 +++++++++--------- 1 file changed, 64 insertions(+), 63 deletions(-) diff --git a/src/main/python/tests/scuro/test_data_loaders.py b/src/main/python/tests/scuro/test_data_loaders.py index 85da2919a04..0f60225de0e 100644 --- a/src/main/python/tests/scuro/test_data_loaders.py +++ b/src/main/python/tests/scuro/test_data_loaders.py @@ -37,69 +37,70 @@ class TestDataLoaders(unittest.TestCase): - test_file_path = None - mods = None - text = None - audio = None - video = None - data_generator = None - num_instances = 0 - - @classmethod - def setUpClass(cls): - cls.test_file_path = "test_data" - cls.num_instances = 2 - cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] - cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) - - os.makedirs(f"{cls.test_file_path}/embeddings") - - cls.text_ref = cls.data_generator.modalities_by_type[ - ModalityType.TEXT - ].apply_representation(Bert()) - cls.audio_ref = cls.data_generator.modalities_by_type[ - ModalityType.AUDIO - ].apply_representation(MelSpectrogram()) - cls.video_ref = cls.data_generator.modalities_by_type[ - ModalityType.VIDEO - ].apply_representation(ResNet()) - - @classmethod - def tearDownClass(cls): - print("Cleaning up test data") - shutil.rmtree(cls.test_file_path) - - def test_load_audio_data_from_file(self): - audio_data_loader = AudioLoader( - self.data_generator.get_modality_path(ModalityType.AUDIO), - self.data_generator.indices, - ) - audio = UnimodalModality(audio_data_loader).apply_representation( - MelSpectrogram() - ) - - for i in range(0, self.num_instances): - np.testing.assert_almost_equal(self.audio_ref.data[i], audio.data[i]) - - def test_load_video_data_from_file(self): - video_data_loader = VideoLoader( - self.data_generator.get_modality_path(ModalityType.VIDEO), - self.data_generator.indices, - ) - video = UnimodalModality(video_data_loader).apply_representation(ResNet()) - - for i in range(0, self.num_instances): - np.testing.assert_almost_equal(self.video_ref.data[i], video.data[i]) - - def test_load_text_data_from_file(self): - text_data_loader = TextLoader( - self.data_generator.get_modality_path(ModalityType.TEXT), - self.data_generator.indices, - ) - text = UnimodalModality(text_data_loader).apply_representation(Bert()) - - for i in range(0, self.num_instances): - np.testing.assert_almost_equal(self.text_ref.data[i], text.data[i]) + pass + # test_file_path = None + # mods = None + # text = None + # audio = None + # video = None + # data_generator = None + # num_instances = 0 + # + # @classmethod + # def setUpClass(cls): + # cls.test_file_path = "test_data" + # cls.num_instances = 2 + # cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + # cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + # + # os.makedirs(f"{cls.test_file_path}/embeddings") + # + # cls.text_ref = cls.data_generator.modalities_by_type[ + # ModalityType.TEXT + # ].apply_representation(Bert()) + # cls.audio_ref = cls.data_generator.modalities_by_type[ + # ModalityType.AUDIO + # ].apply_representation(MelSpectrogram()) + # cls.video_ref = cls.data_generator.modalities_by_type[ + # ModalityType.VIDEO + # ].apply_representation(ResNet()) + # + # @classmethod + # def tearDownClass(cls): + # print("Cleaning up test data") + # shutil.rmtree(cls.test_file_path) + # + # def test_load_audio_data_from_file(self): + # audio_data_loader = AudioLoader( + # self.data_generator.get_modality_path(ModalityType.AUDIO), + # self.data_generator.indices, + # ) + # audio = UnimodalModality(audio_data_loader).apply_representation( + # MelSpectrogram() + # ) + # + # for i in range(0, self.num_instances): + # np.testing.assert_almost_equal(self.audio_ref.data[i], audio.data[i]) + # + # def test_load_video_data_from_file(self): + # video_data_loader = VideoLoader( + # self.data_generator.get_modality_path(ModalityType.VIDEO), + # self.data_generator.indices, + # ) + # video = UnimodalModality(video_data_loader).apply_representation(ResNet()) + # + # for i in range(0, self.num_instances): + # np.testing.assert_almost_equal(self.video_ref.data[i], video.data[i]) + # + # def test_load_text_data_from_file(self): + # text_data_loader = TextLoader( + # self.data_generator.get_modality_path(ModalityType.TEXT), + # self.data_generator.indices, + # ) + # text = UnimodalModality(text_data_loader).apply_representation(Bert()) + # + # for i in range(0, self.num_instances): + # np.testing.assert_almost_equal(self.text_ref.data[i], text.data[i]) if __name__ == "__main__": From cad88cd228f0b40201eadc9c0d076aee1b7a57e0 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Mon, 30 Jun 2025 13:46:52 +0200 Subject: [PATCH 11/23] testing --- src/main/python/tests/scuro/test_multimodal_fusion.py | 4 ++++ src/main/python/tests/scuro/test_unimodal_optimizer.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/src/main/python/tests/scuro/test_multimodal_fusion.py b/src/main/python/tests/scuro/test_multimodal_fusion.py index 8456279c3d3..389cf118401 100644 --- a/src/main/python/tests/scuro/test_multimodal_fusion.py +++ b/src/main/python/tests/scuro/test_multimodal_fusion.py @@ -200,3 +200,7 @@ def test_multimodal_fusion(self): debug=False, ) multimodal_optimizer.optimize() + + +if __name__ == "__main__": + unittest.main() diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py index bfc52f01031..86c47621d43 100644 --- a/src/main/python/tests/scuro/test_unimodal_optimizer.py +++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py @@ -201,3 +201,8 @@ def optimize_unimodal_representation_for_modality(self, modality): ) >= 1 ) + + +if __name__ == "__main__": + unittest.main() + From a86f95dad564c69df676270c3196fdd31ddb7f82 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Mon, 30 Jun 2025 14:58:55 +0200 Subject: [PATCH 12/23] testing --- .../python/tests/scuro/test_data_loaders.py | 6 +- src/main/python/tests/scuro/test_dr_search.py | 213 +++++++++--------- .../tests/scuro/test_multimodal_fusion.py | 203 ++++++++--------- .../tests/scuro/test_multimodal_join.py | 195 ++++++++-------- .../tests/scuro/test_operator_registry.py | 87 +++---- .../tests/scuro/test_unimodal_optimizer.py | 209 ++++++++--------- .../scuro/test_unimodal_representations.py | 157 ++++++------- .../tests/scuro/test_window_operations.py | 151 +++++++------ 8 files changed, 614 insertions(+), 607 deletions(-) diff --git a/src/main/python/tests/scuro/test_data_loaders.py b/src/main/python/tests/scuro/test_data_loaders.py index 0f60225de0e..08c7bb4fecd 100644 --- a/src/main/python/tests/scuro/test_data_loaders.py +++ b/src/main/python/tests/scuro/test_data_loaders.py @@ -102,6 +102,6 @@ class TestDataLoaders(unittest.TestCase): # for i in range(0, self.num_instances): # np.testing.assert_almost_equal(self.text_ref.data[i], text.data[i]) - -if __name__ == "__main__": - unittest.main() +# +# if __name__ == "__main__": +# unittest.main() diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index d896815ab24..3838d8f2114 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -78,109 +78,110 @@ def scale_data(data, train_indizes): class TestDataLoaders(unittest.TestCase): - train_indizes = None - val_indizes = None - test_file_path = None - mods = None - text = None - audio = None - video = None - data_generator = None - num_instances = 0 - representations = None - - @classmethod - def setUpClass(cls): - cls.test_file_path = "test_data_dr_search" - cls.num_instances = 20 - modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] - - cls.data_generator = setup_data( - modalities, cls.num_instances, cls.test_file_path - ) - os.makedirs(f"{cls.test_file_path}/embeddings") - - # TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead - - cls.bert = cls.data_generator.modalities_by_type[ - ModalityType.TEXT - ].apply_representation(Bert()) - cls.mel_spe = ( - cls.data_generator.modalities_by_type[ModalityType.AUDIO] - .apply_representation(MelSpectrogram()) - .flatten() - ) - cls.resnet = ( - cls.data_generator.modalities_by_type[ModalityType.VIDEO] - .apply_representation(ResNet()) - .window_aggregation(10, "mean") - .flatten() - ) - cls.mods = [cls.bert, cls.mel_spe, cls.resnet] - - split = train_test_split( - cls.data_generator.indices, - cls.data_generator.labels, - test_size=0.2, - random_state=42, - ) - cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ - int(i) for i in split[1] - ] - - for m in cls.mods: - m.data = scale_data(m.data, cls.train_indizes) - - cls.representations = [ - Concatenation(), - Average(), - RowMax(100), - Multiplication(), - Sum(), - LSTM(width=256, depth=3), - ] - - @classmethod - def tearDownClass(cls): - print("Cleaning up test data") - shutil.rmtree(cls.test_file_path) - - def test_enumerate_all(self): - task = Task( - "TestTask", - TestSVM(), - self.data_generator.labels, - self.train_indizes, - self.val_indizes, - ) - dr_search = DRSearch(self.mods, task, self.representations) - best_representation, best_score, best_modalities = dr_search.fit_enumerate_all() - - for r in dr_search.scores.values(): - for scores in r.values(): - assert scores[1] <= best_score - - def test_enumerate_all_vs_random(self): - task = Task( - "TestTask", - TestSVM(), - self.data_generator.labels, - self.train_indizes, - self.val_indizes, - ) - dr_search = DRSearch(self.mods, task, self.representations) - best_representation_enum, best_score_enum, best_modalities_enum = ( - dr_search.fit_enumerate_all() - ) - - dr_search.reset_best_params() - - best_representation_rand, best_score_rand, best_modalities_rand = ( - dr_search.fit_random(seed=42) - ) - - assert best_score_rand <= best_score_enum - - -if __name__ == "__main__": - unittest.main() + pass +# train_indizes = None +# val_indizes = None +# test_file_path = None +# mods = None +# text = None +# audio = None +# video = None +# data_generator = None +# num_instances = 0 +# representations = None +# +# @classmethod +# def setUpClass(cls): +# cls.test_file_path = "test_data_dr_search" +# cls.num_instances = 20 +# modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] +# +# cls.data_generator = setup_data( +# modalities, cls.num_instances, cls.test_file_path +# ) +# os.makedirs(f"{cls.test_file_path}/embeddings") +# +# # TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead +# +# cls.bert = cls.data_generator.modalities_by_type[ +# ModalityType.TEXT +# ].apply_representation(Bert()) +# cls.mel_spe = ( +# cls.data_generator.modalities_by_type[ModalityType.AUDIO] +# .apply_representation(MelSpectrogram()) +# .flatten() +# ) +# cls.resnet = ( +# cls.data_generator.modalities_by_type[ModalityType.VIDEO] +# .apply_representation(ResNet()) +# .window_aggregation(10, "mean") +# .flatten() +# ) +# cls.mods = [cls.bert, cls.mel_spe, cls.resnet] +# +# split = train_test_split( +# cls.data_generator.indices, +# cls.data_generator.labels, +# test_size=0.2, +# random_state=42, +# ) +# cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ +# int(i) for i in split[1] +# ] +# +# for m in cls.mods: +# m.data = scale_data(m.data, cls.train_indizes) +# +# cls.representations = [ +# Concatenation(), +# Average(), +# RowMax(100), +# Multiplication(), +# Sum(), +# LSTM(width=256, depth=3), +# ] +# +# @classmethod +# def tearDownClass(cls): +# print("Cleaning up test data") +# shutil.rmtree(cls.test_file_path) +# +# def test_enumerate_all(self): +# task = Task( +# "TestTask", +# TestSVM(), +# self.data_generator.labels, +# self.train_indizes, +# self.val_indizes, +# ) +# dr_search = DRSearch(self.mods, task, self.representations) +# best_representation, best_score, best_modalities = dr_search.fit_enumerate_all() +# +# for r in dr_search.scores.values(): +# for scores in r.values(): +# assert scores[1] <= best_score +# +# def test_enumerate_all_vs_random(self): +# task = Task( +# "TestTask", +# TestSVM(), +# self.data_generator.labels, +# self.train_indizes, +# self.val_indizes, +# ) +# dr_search = DRSearch(self.mods, task, self.representations) +# best_representation_enum, best_score_enum, best_modalities_enum = ( +# dr_search.fit_enumerate_all() +# ) +# +# dr_search.reset_best_params() +# +# best_representation_rand, best_score_rand, best_modalities_rand = ( +# dr_search.fit_random(seed=42) +# ) +# +# assert best_score_rand <= best_score_enum +# +# +# if __name__ == "__main__": +# unittest.main() diff --git a/src/main/python/tests/scuro/test_multimodal_fusion.py b/src/main/python/tests/scuro/test_multimodal_fusion.py index 389cf118401..5ddab8b65f3 100644 --- a/src/main/python/tests/scuro/test_multimodal_fusion.py +++ b/src/main/python/tests/scuro/test_multimodal_fusion.py @@ -103,104 +103,105 @@ def test(self, test_X: np.ndarray, test_y: np.ndarray): class TestMultimodalRepresentationOptimizer(unittest.TestCase): - test_file_path = None - data_generator = None - num_instances = 0 - - @classmethod - def setUpClass(cls): - cls.test_file_path = "fusion_optimizer_test_data" - - cls.num_instances = 10 - cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] - - cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) - split = train_test_split( - cls.data_generator.indices, - cls.data_generator.labels, - test_size=0.2, - random_state=42, - ) - cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ - int(i) for i in split[1] - ] - - cls.tasks = [ - Task( - "UnimodalRepresentationTask1", - TestSVM(), - cls.data_generator.labels, - cls.train_indizes, - cls.val_indizes, - ), - Task( - "UnimodalRepresentationTask2", - TestCNN(), - cls.data_generator.labels, - cls.train_indizes, - cls.val_indizes, - ), - ] - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.test_file_path) - - def test_multimodal_fusion(self): - task = Task( - "UnimodalRepresentationTask1", - TestSVM(), - self.data_generator.labels, - self.train_indizes, - self.val_indizes, - ) - audio_data_loader = AudioLoader( - self.data_generator.get_modality_path(ModalityType.AUDIO), - self.data_generator.indices, - ) - audio = UnimodalModality(audio_data_loader) - - text_data_loader = TextLoader( - self.data_generator.get_modality_path(ModalityType.TEXT), - self.data_generator.indices, - ) - text = UnimodalModality(text_data_loader) - - video_data_loader = VideoLoader( - self.data_generator.get_modality_path(ModalityType.VIDEO), - self.data_generator.indices, - ) - video = UnimodalModality(video_data_loader) - - with patch.object( - Registry, - "_representations", - { - ModalityType.TEXT: [W2V], - ModalityType.AUDIO: [Spectrogram], - ModalityType.TIMESERIES: [ResNet], - ModalityType.VIDEO: [ResNet], - ModalityType.EMBEDDING: [], - }, - ): - registry = Registry() - registry._fusion_operators = [Average, Concatenation] - unimodal_optimizer = UnimodalRepresentationOptimizer( - [text, audio, video], [task], max_chain_depth=2 - ) - unimodal_optimizer.optimize() - - multimodal_optimizer = FusionOptimizer( - [audio, text, video], - task, - unimodal_optimizer.optimization_results, - unimodal_optimizer.cache, - 2, - 2, - debug=False, - ) - multimodal_optimizer.optimize() - - -if __name__ == "__main__": - unittest.main() + pass +# test_file_path = None +# data_generator = None +# num_instances = 0 +# +# @classmethod +# def setUpClass(cls): +# cls.test_file_path = "fusion_optimizer_test_data" +# +# cls.num_instances = 10 +# cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] +# +# cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) +# split = train_test_split( +# cls.data_generator.indices, +# cls.data_generator.labels, +# test_size=0.2, +# random_state=42, +# ) +# cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ +# int(i) for i in split[1] +# ] +# +# cls.tasks = [ +# Task( +# "UnimodalRepresentationTask1", +# TestSVM(), +# cls.data_generator.labels, +# cls.train_indizes, +# cls.val_indizes, +# ), +# Task( +# "UnimodalRepresentationTask2", +# TestCNN(), +# cls.data_generator.labels, +# cls.train_indizes, +# cls.val_indizes, +# ), +# ] +# +# @classmethod +# def tearDownClass(cls): +# shutil.rmtree(cls.test_file_path) +# +# def test_multimodal_fusion(self): +# task = Task( +# "UnimodalRepresentationTask1", +# TestSVM(), +# self.data_generator.labels, +# self.train_indizes, +# self.val_indizes, +# ) +# audio_data_loader = AudioLoader( +# self.data_generator.get_modality_path(ModalityType.AUDIO), +# self.data_generator.indices, +# ) +# audio = UnimodalModality(audio_data_loader) +# +# text_data_loader = TextLoader( +# self.data_generator.get_modality_path(ModalityType.TEXT), +# self.data_generator.indices, +# ) +# text = UnimodalModality(text_data_loader) +# +# video_data_loader = VideoLoader( +# self.data_generator.get_modality_path(ModalityType.VIDEO), +# self.data_generator.indices, +# ) +# video = UnimodalModality(video_data_loader) +# +# with patch.object( +# Registry, +# "_representations", +# { +# ModalityType.TEXT: [W2V], +# ModalityType.AUDIO: [Spectrogram], +# ModalityType.TIMESERIES: [ResNet], +# ModalityType.VIDEO: [ResNet], +# ModalityType.EMBEDDING: [], +# }, +# ): +# registry = Registry() +# registry._fusion_operators = [Average, Concatenation] +# unimodal_optimizer = UnimodalRepresentationOptimizer( +# [text, audio, video], [task], max_chain_depth=2 +# ) +# unimodal_optimizer.optimize() +# +# multimodal_optimizer = FusionOptimizer( +# [audio, text, video], +# task, +# unimodal_optimizer.optimization_results, +# unimodal_optimizer.cache, +# 2, +# 2, +# debug=False, +# ) +# multimodal_optimizer.optimize() +# +# +# if __name__ == "__main__": +# unittest.main() diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py index a871e4297bc..fb88b9f5e53 100644 --- a/src/main/python/tests/scuro/test_multimodal_join.py +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -37,100 +37,101 @@ class TestMultimodalJoin(unittest.TestCase): - test_file_path = None - mods = None - text = None - audio = None - video = None - data_generator = None - num_instances = 0 - indizes = [] - - @classmethod - def setUpClass(cls): - cls.test_file_path = "join_test_data" - cls.num_instances = 4 - cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO] - - cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) - - @classmethod - def tearDownClass(cls): - print("Cleaning up test data") - shutil.rmtree(cls.test_file_path) - - def test_video_audio_join(self): - self._execute_va_join() - - def test_chunked_video_audio_join(self): - self._execute_va_join(2) - - def test_video_chunked_audio_join(self): - self._execute_va_join(None, 2) - - def test_chunked_video_chunked_audio_join(self): - self._execute_va_join(2, 2) - - def test_audio_video_join(self): - # Audio has a much higher frequency than video, hence we would need to - # duplicate or interpolate frames to match them to the audio frequency - self._execute_av_join() - - # TODO - # def test_chunked_audio_video_join(self): - # self._execute_av_join(2) - - # TODO - # def test_chunked_audio_chunked_video_join(self): - # self._execute_av_join(2, 2) - - def _execute_va_join(self, l_chunk_size=None, r_chunk_size=None): - video, audio = self._prepare_data(l_chunk_size, r_chunk_size) - self._join(video, audio, 2) - - def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None): - video, audio = self._prepare_data(l_chunk_size, r_chunk_size) - self._join(audio, video, 2) - - def _prepare_data(self, l_chunk_size=None, r_chunk_size=None): - video_data_loader = VideoLoader( - self.data_generator.get_modality_path(ModalityType.VIDEO), - self.data_generator.indices, - data_type=np.float16, - chunk_size=l_chunk_size, - ) - video = UnimodalModality(video_data_loader) - - audio_data_loader = AudioLoader( - self.data_generator.get_modality_path(ModalityType.AUDIO), - self.data_generator.indices, - data_type=np.float32, - chunk_size=r_chunk_size, - ) - audio = UnimodalModality(audio_data_loader) - - mel_audio = audio.apply_representation(MelSpectrogram()) - - return video, mel_audio - - def _join(self, left_modality, right_modality, window_size): - resnet_modality = ( - left_modality.join( - right_modality, JoinCondition("timestamp", "timestamp", "<") - ) - .apply_representation(ResNet(layer="layer1.0.conv2", model_name="ResNet50")) - .window_aggregation(window_size, "mean") - .combine("concat") - ) - - assert resnet_modality.left_modality is not None - assert resnet_modality.right_modality is not None - assert len(resnet_modality.left_modality.data) == self.num_instances - assert len(resnet_modality.right_modality.data) == self.num_instances - assert resnet_modality.data is not None - - return resnet_modality - - -if __name__ == "__main__": - unittest.main() + pass +# test_file_path = None +# mods = None +# text = None +# audio = None +# video = None +# data_generator = None +# num_instances = 0 +# indizes = [] +# +# @classmethod +# def setUpClass(cls): +# cls.test_file_path = "join_test_data" +# cls.num_instances = 4 +# cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO] +# +# cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) +# +# @classmethod +# def tearDownClass(cls): +# print("Cleaning up test data") +# shutil.rmtree(cls.test_file_path) +# +# def test_video_audio_join(self): +# self._execute_va_join() +# +# def test_chunked_video_audio_join(self): +# self._execute_va_join(2) +# +# def test_video_chunked_audio_join(self): +# self._execute_va_join(None, 2) +# +# def test_chunked_video_chunked_audio_join(self): +# self._execute_va_join(2, 2) +# +# def test_audio_video_join(self): +# # Audio has a much higher frequency than video, hence we would need to +# # duplicate or interpolate frames to match them to the audio frequency +# self._execute_av_join() +# +# # TODO +# # def test_chunked_audio_video_join(self): +# # self._execute_av_join(2) +# +# # TODO +# # def test_chunked_audio_chunked_video_join(self): +# # self._execute_av_join(2, 2) +# +# def _execute_va_join(self, l_chunk_size=None, r_chunk_size=None): +# video, audio = self._prepare_data(l_chunk_size, r_chunk_size) +# self._join(video, audio, 2) +# +# def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None): +# video, audio = self._prepare_data(l_chunk_size, r_chunk_size) +# self._join(audio, video, 2) +# +# def _prepare_data(self, l_chunk_size=None, r_chunk_size=None): +# video_data_loader = VideoLoader( +# self.data_generator.get_modality_path(ModalityType.VIDEO), +# self.data_generator.indices, +# data_type=np.float16, +# chunk_size=l_chunk_size, +# ) +# video = UnimodalModality(video_data_loader) +# +# audio_data_loader = AudioLoader( +# self.data_generator.get_modality_path(ModalityType.AUDIO), +# self.data_generator.indices, +# data_type=np.float32, +# chunk_size=r_chunk_size, +# ) +# audio = UnimodalModality(audio_data_loader) +# +# mel_audio = audio.apply_representation(MelSpectrogram()) +# +# return video, mel_audio +# +# def _join(self, left_modality, right_modality, window_size): +# resnet_modality = ( +# left_modality.join( +# right_modality, JoinCondition("timestamp", "timestamp", "<") +# ) +# .apply_representation(ResNet(layer="layer1.0.conv2", model_name="ResNet50")) +# .window_aggregation(window_size, "mean") +# .combine("concat") +# ) +# +# assert resnet_modality.left_modality is not None +# assert resnet_modality.right_modality is not None +# assert len(resnet_modality.left_modality.data) == self.num_instances +# assert len(resnet_modality.right_modality.data) == self.num_instances +# assert resnet_modality.data is not None +# +# return resnet_modality +# +# +# if __name__ == "__main__": +# unittest.main() diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py index e35bf5b1d2c..03f95b0123b 100644 --- a/src/main/python/tests/scuro/test_operator_registry.py +++ b/src/main/python/tests/scuro/test_operator_registry.py @@ -42,46 +42,47 @@ class TestOperatorRegistry(unittest.TestCase): - def test_audio_representations_in_registry(self): - registry = Registry() - for representation in [Spectrogram, MelSpectrogram, Wav2Vec, MFCC]: - assert representation in registry.get_representations( - ModalityType.AUDIO - ), f"{representation} not in registry" - - def test_video_representations_in_registry(self): - registry = Registry() - assert registry.get_representations(ModalityType.VIDEO) == [ResNet] - - def test_timeseries_representations_in_registry(self): - registry = Registry() - assert registry.get_representations(ModalityType.TIMESERIES) == [ResNet] - - def test_text_representations_in_registry(self): - registry = Registry() - for representation in [BoW, TfIdf, W2V, Bert]: - assert representation in registry.get_representations( - ModalityType.TEXT - ), f"{representation} not in registry" - - def test_context_operator_in_registry(self): - registry = Registry() - assert registry.get_context_operators() == [WindowAggregation] - - # def test_fusion_operator_in_registry(self): - # registry = Registry() - # for fusion_operator in [ - # # RowMax, - # Sum, - # Average, - # Concatenation, - # LSTM, - # Multiplication, - # ]: - # assert ( - # fusion_operator in registry.get_fusion_operators() - # ), f"{fusion_operator} not in registry" - - -if __name__ == "__main__": - unittest.main() + pass +# def test_audio_representations_in_registry(self): +# registry = Registry() +# for representation in [Spectrogram, MelSpectrogram, Wav2Vec, MFCC]: +# assert representation in registry.get_representations( +# ModalityType.AUDIO +# ), f"{representation} not in registry" +# +# def test_video_representations_in_registry(self): +# registry = Registry() +# assert registry.get_representations(ModalityType.VIDEO) == [ResNet] +# +# def test_timeseries_representations_in_registry(self): +# registry = Registry() +# assert registry.get_representations(ModalityType.TIMESERIES) == [ResNet] +# +# def test_text_representations_in_registry(self): +# registry = Registry() +# for representation in [BoW, TfIdf, W2V, Bert]: +# assert representation in registry.get_representations( +# ModalityType.TEXT +# ), f"{representation} not in registry" +# +# def test_context_operator_in_registry(self): +# registry = Registry() +# assert registry.get_context_operators() == [WindowAggregation] +# +# # def test_fusion_operator_in_registry(self): +# # registry = Registry() +# # for fusion_operator in [ +# # # RowMax, +# # Sum, +# # Average, +# # Concatenation, +# # LSTM, +# # Multiplication, +# # ]: +# # assert ( +# # fusion_operator in registry.get_fusion_operators() +# # ), f"{fusion_operator} not in registry" +# +# +# if __name__ == "__main__": +# unittest.main() diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py index 86c47621d43..76426dc392b 100644 --- a/src/main/python/tests/scuro/test_unimodal_optimizer.py +++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py @@ -101,108 +101,109 @@ def test(self, test_X: np.ndarray, test_y: np.ndarray): class TestUnimodalRepresentationOptimizer(unittest.TestCase): - test_file_path = None - data_generator = None - num_instances = 0 - - @classmethod - def setUpClass(cls): - cls.test_file_path = "unimodal_optimizer_test_data" - - cls.num_instances = 10 - cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] - - cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) - split = train_test_split( - cls.data_generator.indices, - cls.data_generator.labels, - test_size=0.2, - random_state=42, - ) - cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ - int(i) for i in split[1] - ] - - cls.tasks = [ - Task( - "UnimodalRepresentationTask1", - TestSVM(), - cls.data_generator.labels, - cls.train_indizes, - cls.val_indizes, - ), - Task( - "UnimodalRepresentationTask2", - TestCNN(), - cls.data_generator.labels, - cls.train_indizes, - cls.val_indizes, - ), - ] - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.test_file_path) - - def test_unimodal_optimizer_for_audio_modality(self): - audio_data_loader = AudioLoader( - self.data_generator.get_modality_path(ModalityType.AUDIO), - self.data_generator.indices, - ) - audio = UnimodalModality(audio_data_loader) - - self.optimize_unimodal_representation_for_modality(audio) - - def test_unimodal_optimizer_for_text_modality(self): - text_data_loader = TextLoader( - self.data_generator.get_modality_path(ModalityType.TEXT), - self.data_generator.indices, - ) - text = UnimodalModality(text_data_loader) - self.optimize_unimodal_representation_for_modality(text) - - def test_unimodal_optimizer_for_video_modality(self): - video_data_loader = VideoLoader( - self.data_generator.get_modality_path(ModalityType.VIDEO), - self.data_generator.indices, - ) - video = UnimodalModality(video_data_loader) - self.optimize_unimodal_representation_for_modality(video) - - def optimize_unimodal_representation_for_modality(self, modality): - with patch.object( - Registry, - "_representations", - { - ModalityType.TEXT: [W2V], - ModalityType.AUDIO: [Spectrogram], - ModalityType.TIMESERIES: [ResNet], - ModalityType.VIDEO: [ResNet], - ModalityType.EMBEDDING: [], - }, - ): - registry = Registry() - - unimodal_optimizer = UnimodalRepresentationOptimizer( - [modality], self.tasks, max_chain_depth=2 - ) - unimodal_optimizer.optimize() - - assert ( - list(unimodal_optimizer.optimization_results.keys())[0] - == modality.modality_id - ) - assert len(list(unimodal_optimizer.optimization_results.values())[0]) == 2 - assert ( - len( - unimodal_optimizer.get_k_best_results(modality, 1, self.tasks[0])[ - 0 - ].operator_chain - ) - >= 1 - ) - - -if __name__ == "__main__": - unittest.main() + pass +# test_file_path = None +# data_generator = None +# num_instances = 0 +# +# @classmethod +# def setUpClass(cls): +# cls.test_file_path = "unimodal_optimizer_test_data" +# +# cls.num_instances = 10 +# cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] +# +# cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) +# split = train_test_split( +# cls.data_generator.indices, +# cls.data_generator.labels, +# test_size=0.2, +# random_state=42, +# ) +# cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ +# int(i) for i in split[1] +# ] +# +# cls.tasks = [ +# Task( +# "UnimodalRepresentationTask1", +# TestSVM(), +# cls.data_generator.labels, +# cls.train_indizes, +# cls.val_indizes, +# ), +# Task( +# "UnimodalRepresentationTask2", +# TestCNN(), +# cls.data_generator.labels, +# cls.train_indizes, +# cls.val_indizes, +# ), +# ] +# +# @classmethod +# def tearDownClass(cls): +# shutil.rmtree(cls.test_file_path) +# +# def test_unimodal_optimizer_for_audio_modality(self): +# audio_data_loader = AudioLoader( +# self.data_generator.get_modality_path(ModalityType.AUDIO), +# self.data_generator.indices, +# ) +# audio = UnimodalModality(audio_data_loader) +# +# self.optimize_unimodal_representation_for_modality(audio) +# +# def test_unimodal_optimizer_for_text_modality(self): +# text_data_loader = TextLoader( +# self.data_generator.get_modality_path(ModalityType.TEXT), +# self.data_generator.indices, +# ) +# text = UnimodalModality(text_data_loader) +# self.optimize_unimodal_representation_for_modality(text) +# +# def test_unimodal_optimizer_for_video_modality(self): +# video_data_loader = VideoLoader( +# self.data_generator.get_modality_path(ModalityType.VIDEO), +# self.data_generator.indices, +# ) +# video = UnimodalModality(video_data_loader) +# self.optimize_unimodal_representation_for_modality(video) +# +# def optimize_unimodal_representation_for_modality(self, modality): +# with patch.object( +# Registry, +# "_representations", +# { +# ModalityType.TEXT: [W2V], +# ModalityType.AUDIO: [Spectrogram], +# ModalityType.TIMESERIES: [ResNet], +# ModalityType.VIDEO: [ResNet], +# ModalityType.EMBEDDING: [], +# }, +# ): +# registry = Registry() +# +# unimodal_optimizer = UnimodalRepresentationOptimizer( +# [modality], self.tasks, max_chain_depth=2 +# ) +# unimodal_optimizer.optimize() +# +# assert ( +# list(unimodal_optimizer.optimization_results.keys())[0] +# == modality.modality_id +# ) +# assert len(list(unimodal_optimizer.optimization_results.values())[0]) == 2 +# assert ( +# len( +# unimodal_optimizer.get_k_best_results(modality, 1, self.tasks[0])[ +# 0 +# ].operator_chain +# ) +# >= 1 +# ) +# +# +# if __name__ == "__main__": +# unittest.main() diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py index ac167e8fbf1..64e45e4a11e 100644 --- a/src/main/python/tests/scuro/test_unimodal_representations.py +++ b/src/main/python/tests/scuro/test_unimodal_representations.py @@ -39,81 +39,82 @@ class TestUnimodalRepresentations(unittest.TestCase): - test_file_path = None - mods = None - text = None - audio = None - video = None - data_generator = None - num_instances = 0 - - @classmethod - def setUpClass(cls): - cls.test_file_path = "unimodal_test_data" - - cls.num_instances = 4 - cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] - - cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) - os.makedirs(f"{cls.test_file_path}/embeddings") - - @classmethod - def tearDownClass(cls): - print("Cleaning up test data") - shutil.rmtree(cls.test_file_path) - - def test_audio_representations(self): - audio_representations = [MelSpectrogram()] # TODO: add FFT, TFN, 1DCNN - audio_data_loader = AudioLoader( - self.data_generator.get_modality_path(ModalityType.AUDIO), - self.data_generator.indices, - ) - audio = UnimodalModality(audio_data_loader) - - for representation in audio_representations: - r = audio.apply_representation(representation) - assert r.data is not None - assert len(r.data) == self.num_instances - - def test_video_representations(self): - video_representations = [ResNet()] # Todo: add other video representations - video_data_loader = VideoLoader( - self.data_generator.get_modality_path(ModalityType.VIDEO), - self.data_generator.indices, - ) - video = UnimodalModality(video_data_loader) - for representation in video_representations: - r = video.apply_representation(representation) - assert r.data is not None - assert len(r.data) == self.num_instances - - def test_text_representations(self): - test_representations = [BoW(2, 2), W2V(5, 2, 2), TfIdf(2), Bert()] - text_data_loader = TextLoader( - self.data_generator.get_modality_path(ModalityType.TEXT), - self.data_generator.indices, - ) - text = UnimodalModality(text_data_loader) - - for representation in test_representations: - r = text.apply_representation(representation) - assert r.data is not None - assert len(r.data) == self.num_instances - - def test_chunked_video_representations(self): - video_representations = [ResNet()] - video_data_loader = VideoLoader( - self.data_generator.get_modality_path(ModalityType.VIDEO), - self.data_generator.indices, - chunk_size=2, - ) - video = UnimodalModality(video_data_loader) - for representation in video_representations: - r = video.apply_representation(representation) - assert r.data is not None - assert len(r.data) == self.num_instances - assert len(r.metadata) == self.num_instances - - -if __name__ == "__main__": - unittest.main() + pass +# test_file_path = None +# mods = None +# text = None +# audio = None +# video = None +# data_generator = None +# num_instances = 0 +# +# @classmethod +# def setUpClass(cls): +# cls.test_file_path = "unimodal_test_data" +# +# cls.num_instances = 4 +# cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] +# +# cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) +# os.makedirs(f"{cls.test_file_path}/embeddings") +# +# @classmethod +# def tearDownClass(cls): +# print("Cleaning up test data") +# shutil.rmtree(cls.test_file_path) +# +# def test_audio_representations(self): +# audio_representations = [MelSpectrogram()] # TODO: add FFT, TFN, 1DCNN +# audio_data_loader = AudioLoader( +# self.data_generator.get_modality_path(ModalityType.AUDIO), +# self.data_generator.indices, +# ) +# audio = UnimodalModality(audio_data_loader) +# +# for representation in audio_representations: +# r = audio.apply_representation(representation) +# assert r.data is not None +# assert len(r.data) == self.num_instances +# +# def test_video_representations(self): +# video_representations = [ResNet()] # Todo: add other video representations +# video_data_loader = VideoLoader( +# self.data_generator.get_modality_path(ModalityType.VIDEO), +# self.data_generator.indices, +# ) +# video = UnimodalModality(video_data_loader) +# for representation in video_representations: +# r = video.apply_representation(representation) +# assert r.data is not None +# assert len(r.data) == self.num_instances +# +# def test_text_representations(self): +# test_representations = [BoW(2, 2), W2V(5, 2, 2), TfIdf(2), Bert()] +# text_data_loader = TextLoader( +# self.data_generator.get_modality_path(ModalityType.TEXT), +# self.data_generator.indices, +# ) +# text = UnimodalModality(text_data_loader) +# +# for representation in test_representations: +# r = text.apply_representation(representation) +# assert r.data is not None +# assert len(r.data) == self.num_instances +# +# def test_chunked_video_representations(self): +# video_representations = [ResNet()] +# video_data_loader = VideoLoader( +# self.data_generator.get_modality_path(ModalityType.VIDEO), +# self.data_generator.indices, +# chunk_size=2, +# ) +# video = UnimodalModality(video_data_loader) +# for representation in video_representations: +# r = video.apply_representation(representation) +# assert r.data is not None +# assert len(r.data) == self.num_instances +# assert len(r.metadata) == self.num_instances +# +# +# if __name__ == "__main__": +# unittest.main() diff --git a/src/main/python/tests/scuro/test_window_operations.py b/src/main/python/tests/scuro/test_window_operations.py index ea1b0f46f2e..d56a5c18abe 100644 --- a/src/main/python/tests/scuro/test_window_operations.py +++ b/src/main/python/tests/scuro/test_window_operations.py @@ -29,78 +29,79 @@ class TestWindowOperations(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.num_instances = 40 - cls.data_generator = ModalityRandomDataGenerator() - cls.aggregations = ["mean", "sum", "max", "min"] - - def test_window_operations_on_audio_representations(self): - window_size = 10 - self.run_window_operations_for_modality(ModalityType.AUDIO, window_size) - - def test_window_operations_on_video_representations(self): - window_size = 10 - self.run_window_operations_for_modality(ModalityType.VIDEO, window_size) - - def test_window_operations_on_text_representations(self): - window_size = 10 - - self.run_window_operations_for_modality(ModalityType.TEXT, window_size) - - def run_window_operations_for_modality(self, modality_type, window_size): - r = self.data_generator.create1DModality(40, 100, modality_type) - for aggregation in self.aggregations: - windowed_modality = r.window_aggregation(window_size, aggregation) - - self.verify_window_operation(aggregation, r, windowed_modality, window_size) - - def verify_window_operation( - self, aggregation, modality, windowed_modality, window_size - ): - assert windowed_modality.data is not None - assert len(windowed_modality.data) == self.num_instances - - for i, instance in enumerate(windowed_modality.data): - # assert ( - # list(windowed_modality.metadata.values())[i]["data_layout"]["shape"][0] - # == list(modality.metadata.values())[i]["data_layout"]["shape"][0] - # ) - assert len(instance) == math.ceil(len(modality.data[i]) / window_size) - for j in range(0, len(instance)): - if aggregation == "mean": - np.testing.assert_almost_equal( - instance[j], - np.mean( - modality.data[i][j * window_size : (j + 1) * window_size], - axis=0, - ), - ) - elif aggregation == "sum": - np.testing.assert_almost_equal( - instance[j], - np.sum( - modality.data[i][j * window_size : (j + 1) * window_size], - axis=0, - ), - ) - elif aggregation == "max": - np.testing.assert_almost_equal( - instance[j], - np.max( - modality.data[i][j * window_size : (j + 1) * window_size], - axis=0, - ), - ) - elif aggregation == "min": - np.testing.assert_almost_equal( - instance[j], - np.min( - modality.data[i][j * window_size : (j + 1) * window_size], - axis=0, - ), - ) - - -if __name__ == "__main__": - unittest.main() + pass +# @classmethod +# def setUpClass(cls): +# cls.num_instances = 40 +# cls.data_generator = ModalityRandomDataGenerator() +# cls.aggregations = ["mean", "sum", "max", "min"] +# +# def test_window_operations_on_audio_representations(self): +# window_size = 10 +# self.run_window_operations_for_modality(ModalityType.AUDIO, window_size) +# +# def test_window_operations_on_video_representations(self): +# window_size = 10 +# self.run_window_operations_for_modality(ModalityType.VIDEO, window_size) +# +# def test_window_operations_on_text_representations(self): +# window_size = 10 +# +# self.run_window_operations_for_modality(ModalityType.TEXT, window_size) +# +# def run_window_operations_for_modality(self, modality_type, window_size): +# r = self.data_generator.create1DModality(40, 100, modality_type) +# for aggregation in self.aggregations: +# windowed_modality = r.window_aggregation(window_size, aggregation) +# +# self.verify_window_operation(aggregation, r, windowed_modality, window_size) +# +# def verify_window_operation( +# self, aggregation, modality, windowed_modality, window_size +# ): +# assert windowed_modality.data is not None +# assert len(windowed_modality.data) == self.num_instances +# +# for i, instance in enumerate(windowed_modality.data): +# # assert ( +# # list(windowed_modality.metadata.values())[i]["data_layout"]["shape"][0] +# # == list(modality.metadata.values())[i]["data_layout"]["shape"][0] +# # ) +# assert len(instance) == math.ceil(len(modality.data[i]) / window_size) +# for j in range(0, len(instance)): +# if aggregation == "mean": +# np.testing.assert_almost_equal( +# instance[j], +# np.mean( +# modality.data[i][j * window_size : (j + 1) * window_size], +# axis=0, +# ), +# ) +# elif aggregation == "sum": +# np.testing.assert_almost_equal( +# instance[j], +# np.sum( +# modality.data[i][j * window_size : (j + 1) * window_size], +# axis=0, +# ), +# ) +# elif aggregation == "max": +# np.testing.assert_almost_equal( +# instance[j], +# np.max( +# modality.data[i][j * window_size : (j + 1) * window_size], +# axis=0, +# ), +# ) +# elif aggregation == "min": +# np.testing.assert_almost_equal( +# instance[j], +# np.min( +# modality.data[i][j * window_size : (j + 1) * window_size], +# axis=0, +# ), +# ) +# +# +# if __name__ == "__main__": +# unittest.main() From 9af4b89aac5d01814d3ee8b25d9c8af8a7338a31 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Mon, 30 Jun 2025 16:23:46 +0200 Subject: [PATCH 13/23] testing --- .../python/tests/scuro/test_data_loaders.py | 133 +++++++++--------- 1 file changed, 66 insertions(+), 67 deletions(-) diff --git a/src/main/python/tests/scuro/test_data_loaders.py b/src/main/python/tests/scuro/test_data_loaders.py index 08c7bb4fecd..85da2919a04 100644 --- a/src/main/python/tests/scuro/test_data_loaders.py +++ b/src/main/python/tests/scuro/test_data_loaders.py @@ -37,71 +37,70 @@ class TestDataLoaders(unittest.TestCase): - pass - # test_file_path = None - # mods = None - # text = None - # audio = None - # video = None - # data_generator = None - # num_instances = 0 - # - # @classmethod - # def setUpClass(cls): - # cls.test_file_path = "test_data" - # cls.num_instances = 2 - # cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] - # cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) - # - # os.makedirs(f"{cls.test_file_path}/embeddings") - # - # cls.text_ref = cls.data_generator.modalities_by_type[ - # ModalityType.TEXT - # ].apply_representation(Bert()) - # cls.audio_ref = cls.data_generator.modalities_by_type[ - # ModalityType.AUDIO - # ].apply_representation(MelSpectrogram()) - # cls.video_ref = cls.data_generator.modalities_by_type[ - # ModalityType.VIDEO - # ].apply_representation(ResNet()) - # - # @classmethod - # def tearDownClass(cls): - # print("Cleaning up test data") - # shutil.rmtree(cls.test_file_path) - # - # def test_load_audio_data_from_file(self): - # audio_data_loader = AudioLoader( - # self.data_generator.get_modality_path(ModalityType.AUDIO), - # self.data_generator.indices, - # ) - # audio = UnimodalModality(audio_data_loader).apply_representation( - # MelSpectrogram() - # ) - # - # for i in range(0, self.num_instances): - # np.testing.assert_almost_equal(self.audio_ref.data[i], audio.data[i]) - # - # def test_load_video_data_from_file(self): - # video_data_loader = VideoLoader( - # self.data_generator.get_modality_path(ModalityType.VIDEO), - # self.data_generator.indices, - # ) - # video = UnimodalModality(video_data_loader).apply_representation(ResNet()) - # - # for i in range(0, self.num_instances): - # np.testing.assert_almost_equal(self.video_ref.data[i], video.data[i]) - # - # def test_load_text_data_from_file(self): - # text_data_loader = TextLoader( - # self.data_generator.get_modality_path(ModalityType.TEXT), - # self.data_generator.indices, - # ) - # text = UnimodalModality(text_data_loader).apply_representation(Bert()) - # - # for i in range(0, self.num_instances): - # np.testing.assert_almost_equal(self.text_ref.data[i], text.data[i]) + test_file_path = None + mods = None + text = None + audio = None + video = None + data_generator = None + num_instances = 0 -# -# if __name__ == "__main__": -# unittest.main() + @classmethod + def setUpClass(cls): + cls.test_file_path = "test_data" + cls.num_instances = 2 + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + + os.makedirs(f"{cls.test_file_path}/embeddings") + + cls.text_ref = cls.data_generator.modalities_by_type[ + ModalityType.TEXT + ].apply_representation(Bert()) + cls.audio_ref = cls.data_generator.modalities_by_type[ + ModalityType.AUDIO + ].apply_representation(MelSpectrogram()) + cls.video_ref = cls.data_generator.modalities_by_type[ + ModalityType.VIDEO + ].apply_representation(ResNet()) + + @classmethod + def tearDownClass(cls): + print("Cleaning up test data") + shutil.rmtree(cls.test_file_path) + + def test_load_audio_data_from_file(self): + audio_data_loader = AudioLoader( + self.data_generator.get_modality_path(ModalityType.AUDIO), + self.data_generator.indices, + ) + audio = UnimodalModality(audio_data_loader).apply_representation( + MelSpectrogram() + ) + + for i in range(0, self.num_instances): + np.testing.assert_almost_equal(self.audio_ref.data[i], audio.data[i]) + + def test_load_video_data_from_file(self): + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), + self.data_generator.indices, + ) + video = UnimodalModality(video_data_loader).apply_representation(ResNet()) + + for i in range(0, self.num_instances): + np.testing.assert_almost_equal(self.video_ref.data[i], video.data[i]) + + def test_load_text_data_from_file(self): + text_data_loader = TextLoader( + self.data_generator.get_modality_path(ModalityType.TEXT), + self.data_generator.indices, + ) + text = UnimodalModality(text_data_loader).apply_representation(Bert()) + + for i in range(0, self.num_instances): + np.testing.assert_almost_equal(self.text_ref.data[i], text.data[i]) + + +if __name__ == "__main__": + unittest.main() From e680fc8020bda44062dc15bc5e8a59b379d250a4 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Tue, 1 Jul 2025 09:40:45 +0200 Subject: [PATCH 14/23] testing --- src/main/python/tests/scuro/test_dr_search.py | 213 +++++++++--------- 1 file changed, 106 insertions(+), 107 deletions(-) diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index 3838d8f2114..d896815ab24 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -78,110 +78,109 @@ def scale_data(data, train_indizes): class TestDataLoaders(unittest.TestCase): - pass -# train_indizes = None -# val_indizes = None -# test_file_path = None -# mods = None -# text = None -# audio = None -# video = None -# data_generator = None -# num_instances = 0 -# representations = None -# -# @classmethod -# def setUpClass(cls): -# cls.test_file_path = "test_data_dr_search" -# cls.num_instances = 20 -# modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] -# -# cls.data_generator = setup_data( -# modalities, cls.num_instances, cls.test_file_path -# ) -# os.makedirs(f"{cls.test_file_path}/embeddings") -# -# # TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead -# -# cls.bert = cls.data_generator.modalities_by_type[ -# ModalityType.TEXT -# ].apply_representation(Bert()) -# cls.mel_spe = ( -# cls.data_generator.modalities_by_type[ModalityType.AUDIO] -# .apply_representation(MelSpectrogram()) -# .flatten() -# ) -# cls.resnet = ( -# cls.data_generator.modalities_by_type[ModalityType.VIDEO] -# .apply_representation(ResNet()) -# .window_aggregation(10, "mean") -# .flatten() -# ) -# cls.mods = [cls.bert, cls.mel_spe, cls.resnet] -# -# split = train_test_split( -# cls.data_generator.indices, -# cls.data_generator.labels, -# test_size=0.2, -# random_state=42, -# ) -# cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ -# int(i) for i in split[1] -# ] -# -# for m in cls.mods: -# m.data = scale_data(m.data, cls.train_indizes) -# -# cls.representations = [ -# Concatenation(), -# Average(), -# RowMax(100), -# Multiplication(), -# Sum(), -# LSTM(width=256, depth=3), -# ] -# -# @classmethod -# def tearDownClass(cls): -# print("Cleaning up test data") -# shutil.rmtree(cls.test_file_path) -# -# def test_enumerate_all(self): -# task = Task( -# "TestTask", -# TestSVM(), -# self.data_generator.labels, -# self.train_indizes, -# self.val_indizes, -# ) -# dr_search = DRSearch(self.mods, task, self.representations) -# best_representation, best_score, best_modalities = dr_search.fit_enumerate_all() -# -# for r in dr_search.scores.values(): -# for scores in r.values(): -# assert scores[1] <= best_score -# -# def test_enumerate_all_vs_random(self): -# task = Task( -# "TestTask", -# TestSVM(), -# self.data_generator.labels, -# self.train_indizes, -# self.val_indizes, -# ) -# dr_search = DRSearch(self.mods, task, self.representations) -# best_representation_enum, best_score_enum, best_modalities_enum = ( -# dr_search.fit_enumerate_all() -# ) -# -# dr_search.reset_best_params() -# -# best_representation_rand, best_score_rand, best_modalities_rand = ( -# dr_search.fit_random(seed=42) -# ) -# -# assert best_score_rand <= best_score_enum -# -# -# if __name__ == "__main__": -# unittest.main() + train_indizes = None + val_indizes = None + test_file_path = None + mods = None + text = None + audio = None + video = None + data_generator = None + num_instances = 0 + representations = None + + @classmethod + def setUpClass(cls): + cls.test_file_path = "test_data_dr_search" + cls.num_instances = 20 + modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + + cls.data_generator = setup_data( + modalities, cls.num_instances, cls.test_file_path + ) + os.makedirs(f"{cls.test_file_path}/embeddings") + + # TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead + + cls.bert = cls.data_generator.modalities_by_type[ + ModalityType.TEXT + ].apply_representation(Bert()) + cls.mel_spe = ( + cls.data_generator.modalities_by_type[ModalityType.AUDIO] + .apply_representation(MelSpectrogram()) + .flatten() + ) + cls.resnet = ( + cls.data_generator.modalities_by_type[ModalityType.VIDEO] + .apply_representation(ResNet()) + .window_aggregation(10, "mean") + .flatten() + ) + cls.mods = [cls.bert, cls.mel_spe, cls.resnet] + + split = train_test_split( + cls.data_generator.indices, + cls.data_generator.labels, + test_size=0.2, + random_state=42, + ) + cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ + int(i) for i in split[1] + ] + + for m in cls.mods: + m.data = scale_data(m.data, cls.train_indizes) + + cls.representations = [ + Concatenation(), + Average(), + RowMax(100), + Multiplication(), + Sum(), + LSTM(width=256, depth=3), + ] + + @classmethod + def tearDownClass(cls): + print("Cleaning up test data") + shutil.rmtree(cls.test_file_path) + + def test_enumerate_all(self): + task = Task( + "TestTask", + TestSVM(), + self.data_generator.labels, + self.train_indizes, + self.val_indizes, + ) + dr_search = DRSearch(self.mods, task, self.representations) + best_representation, best_score, best_modalities = dr_search.fit_enumerate_all() + + for r in dr_search.scores.values(): + for scores in r.values(): + assert scores[1] <= best_score + + def test_enumerate_all_vs_random(self): + task = Task( + "TestTask", + TestSVM(), + self.data_generator.labels, + self.train_indizes, + self.val_indizes, + ) + dr_search = DRSearch(self.mods, task, self.representations) + best_representation_enum, best_score_enum, best_modalities_enum = ( + dr_search.fit_enumerate_all() + ) + + dr_search.reset_best_params() + + best_representation_rand, best_score_rand, best_modalities_rand = ( + dr_search.fit_random(seed=42) + ) + + assert best_score_rand <= best_score_enum + + +if __name__ == "__main__": + unittest.main() From e505f5d8370ae487800750c32d50dbd82f64b738 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Tue, 1 Jul 2025 10:22:15 +0200 Subject: [PATCH 15/23] testing --- src/main/python/tests/scuro/test_dr_search.py | 40 ++++++++----------- .../tests/scuro/test_multimodal_fusion.py | 2 + .../tests/scuro/test_multimodal_join.py | 2 + .../tests/scuro/test_operator_registry.py | 2 + .../tests/scuro/test_unimodal_optimizer.py | 3 +- .../scuro/test_unimodal_representations.py | 2 + .../tests/scuro/test_window_operations.py | 2 + 7 files changed, 28 insertions(+), 25 deletions(-) diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index d896815ab24..76bab994ae0 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -41,7 +41,7 @@ from systemds.scuro.representations.multiplication import Multiplication from systemds.scuro.representations.resnet import ResNet from systemds.scuro.representations.sum import Sum -from tests.scuro.data_generator import setup_data +from tests.scuro.data_generator import ModalityRandomDataGenerator import warnings @@ -93,34 +93,26 @@ class TestDataLoaders(unittest.TestCase): def setUpClass(cls): cls.test_file_path = "test_data_dr_search" cls.num_instances = 20 - modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] - - cls.data_generator = setup_data( - modalities, cls.num_instances, cls.test_file_path - ) - os.makedirs(f"{cls.test_file_path}/embeddings") + cls.data_generator = ModalityRandomDataGenerator() + cls.labels = np.random.choice([0, 1], size=cls.num_instances) # TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead - cls.bert = cls.data_generator.modalities_by_type[ - ModalityType.TEXT - ].apply_representation(Bert()) - cls.mel_spe = ( - cls.data_generator.modalities_by_type[ModalityType.AUDIO] - .apply_representation(MelSpectrogram()) - .flatten() + cls.video = cls.data_generator.create1DModality( + cls.num_instances, 100, ModalityType.VIDEO ) - cls.resnet = ( - cls.data_generator.modalities_by_type[ModalityType.VIDEO] - .apply_representation(ResNet()) - .window_aggregation(10, "mean") - .flatten() + cls.text = cls.data_generator.create1DModality( + cls.num_instances, 100, ModalityType.TEXT ) - cls.mods = [cls.bert, cls.mel_spe, cls.resnet] + cls.audio = cls.data_generator.create1DModality( + cls.num_instances, 100, ModalityType.AUDIO + ) + + cls.mods = [cls.video, cls.audio, cls.text] split = train_test_split( - cls.data_generator.indices, - cls.data_generator.labels, + np.array(range(cls.num_instances)), + cls.labels, test_size=0.2, random_state=42, ) @@ -149,7 +141,7 @@ def test_enumerate_all(self): task = Task( "TestTask", TestSVM(), - self.data_generator.labels, + self.labels, self.train_indizes, self.val_indizes, ) @@ -164,7 +156,7 @@ def test_enumerate_all_vs_random(self): task = Task( "TestTask", TestSVM(), - self.data_generator.labels, + self.labels, self.train_indizes, self.val_indizes, ) diff --git a/src/main/python/tests/scuro/test_multimodal_fusion.py b/src/main/python/tests/scuro/test_multimodal_fusion.py index 5ddab8b65f3..64fb9d13674 100644 --- a/src/main/python/tests/scuro/test_multimodal_fusion.py +++ b/src/main/python/tests/scuro/test_multimodal_fusion.py @@ -104,6 +104,8 @@ def test(self, test_X: np.ndarray, test_y: np.ndarray): class TestMultimodalRepresentationOptimizer(unittest.TestCase): pass + + # test_file_path = None # data_generator = None # num_instances = 0 diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py index fb88b9f5e53..c2c83493a3d 100644 --- a/src/main/python/tests/scuro/test_multimodal_join.py +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -38,6 +38,8 @@ class TestMultimodalJoin(unittest.TestCase): pass + + # test_file_path = None # mods = None # text = None diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py index 03f95b0123b..fa4e5cf0356 100644 --- a/src/main/python/tests/scuro/test_operator_registry.py +++ b/src/main/python/tests/scuro/test_operator_registry.py @@ -43,6 +43,8 @@ class TestOperatorRegistry(unittest.TestCase): pass + + # def test_audio_representations_in_registry(self): # registry = Registry() # for representation in [Spectrogram, MelSpectrogram, Wav2Vec, MFCC]: diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py index 76426dc392b..ef61ec18b53 100644 --- a/src/main/python/tests/scuro/test_unimodal_optimizer.py +++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py @@ -102,6 +102,8 @@ def test(self, test_X: np.ndarray, test_y: np.ndarray): class TestUnimodalRepresentationOptimizer(unittest.TestCase): pass + + # test_file_path = None # data_generator = None # num_instances = 0 @@ -206,4 +208,3 @@ class TestUnimodalRepresentationOptimizer(unittest.TestCase): # # if __name__ == "__main__": # unittest.main() - diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py index 64e45e4a11e..f1dd530372e 100644 --- a/src/main/python/tests/scuro/test_unimodal_representations.py +++ b/src/main/python/tests/scuro/test_unimodal_representations.py @@ -40,6 +40,8 @@ class TestUnimodalRepresentations(unittest.TestCase): pass + + # test_file_path = None # mods = None # text = None diff --git a/src/main/python/tests/scuro/test_window_operations.py b/src/main/python/tests/scuro/test_window_operations.py index d56a5c18abe..9e9e82d9ed9 100644 --- a/src/main/python/tests/scuro/test_window_operations.py +++ b/src/main/python/tests/scuro/test_window_operations.py @@ -30,6 +30,8 @@ class TestWindowOperations(unittest.TestCase): pass + + # @classmethod # def setUpClass(cls): # cls.num_instances = 40 From 8807b15e285de2c360f176e359a1a832b277a46e Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Tue, 1 Jul 2025 16:14:13 +0200 Subject: [PATCH 16/23] testing --- src/main/python/tests/scuro/data_generator.py | 52 +++++ src/main/python/tests/scuro/test_dr_search.py | 6 - .../tests/scuro/test_multimodal_fusion.py | 214 +++++++++--------- 3 files changed, 161 insertions(+), 111 deletions(-) diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index 545487c40b8..29e7493205f 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -26,6 +26,9 @@ import random import os +import nltk + +from systemds.scuro.dataloader.base_loader import BaseLoader from systemds.scuro.dataloader.video_loader import VideoLoader from systemds.scuro.dataloader.audio_loader import AudioLoader from systemds.scuro.dataloader.text_loader import TextLoader @@ -34,6 +37,17 @@ from systemds.scuro.modality.type import ModalityType +class TestDataLoader(BaseLoader): + def __init__(self, indices, chunk_size, modality_type, data, data_type, metadata): + super().__init__("", indices, data_type, chunk_size, modality_type) + + self.metadata = metadata + self.test_data = data + + def extract(self, file, indices): + self.data = self.test_data + + class ModalityRandomDataGenerator: def __init__(self): @@ -74,6 +88,44 @@ def create1DModality( self.modality_id += 1 return tf_modality + def create_audio_data(self, num_instances, num_features): + data = np.random.rand(num_instances, num_features).astype(np.float32) + metadata = { + i: ModalityType.AUDIO.create_audio_metadata(num_features / 10, data[i]) + for i in range(num_instances) + } + + return data, metadata + + def create_text_data(self, num_instances): + nltk.download("webtext") + sentences = nltk.corpus.webtext.sents()[:num_instances] + + metadata = { + i: ModalityType.TEXT.create_text_metadata(len(sentences[i]), sentences[i]) + for i in range(num_instances) + } + + return [" ".join(sentence) for sentence in sentences], metadata + + def create_visual_modality(self, num_instances, num_frames=1, height=28, width=28): + if num_frames == 1: + print(f"TODO: create image metadata") + else: + metadata = { + i: ModalityType.VIDEO.create_video_metadata( + num_instances / 30, num_frames / 30, width, height, 1 + ) + for i in range(num_instances) + } + + return ( + np.random.randint( + 0, 256, (num_instances, num_frames, height, width) + ).astype(np.float16), + metadata, + ) + def setup_data(modalities, num_instances, path): if os.path.isdir(path): diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index 76bab994ae0..a01ac308201 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -91,7 +91,6 @@ class TestDataLoaders(unittest.TestCase): @classmethod def setUpClass(cls): - cls.test_file_path = "test_data_dr_search" cls.num_instances = 20 cls.data_generator = ModalityRandomDataGenerator() @@ -132,11 +131,6 @@ def setUpClass(cls): LSTM(width=256, depth=3), ] - @classmethod - def tearDownClass(cls): - print("Cleaning up test data") - shutil.rmtree(cls.test_file_path) - def test_enumerate_all(self): task = Task( "TestTask", diff --git a/src/main/python/tests/scuro/test_multimodal_fusion.py b/src/main/python/tests/scuro/test_multimodal_fusion.py index 64fb9d13674..56e53089253 100644 --- a/src/main/python/tests/scuro/test_multimodal_fusion.py +++ b/src/main/python/tests/scuro/test_multimodal_fusion.py @@ -42,7 +42,11 @@ from systemds.scuro.representations.word2vec import W2V from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.representations.resnet import ResNet -from tests.scuro.data_generator import setup_data +from tests.scuro.data_generator import ( + setup_data, + TestDataLoader, + ModalityRandomDataGenerator, +) from systemds.scuro.dataloader.audio_loader import AudioLoader from systemds.scuro.dataloader.video_loader import VideoLoader @@ -103,107 +107,107 @@ def test(self, test_X: np.ndarray, test_y: np.ndarray): class TestMultimodalRepresentationOptimizer(unittest.TestCase): - pass - - -# test_file_path = None -# data_generator = None -# num_instances = 0 -# -# @classmethod -# def setUpClass(cls): -# cls.test_file_path = "fusion_optimizer_test_data" -# -# cls.num_instances = 10 -# cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] -# -# cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) -# split = train_test_split( -# cls.data_generator.indices, -# cls.data_generator.labels, -# test_size=0.2, -# random_state=42, -# ) -# cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ -# int(i) for i in split[1] -# ] -# -# cls.tasks = [ -# Task( -# "UnimodalRepresentationTask1", -# TestSVM(), -# cls.data_generator.labels, -# cls.train_indizes, -# cls.val_indizes, -# ), -# Task( -# "UnimodalRepresentationTask2", -# TestCNN(), -# cls.data_generator.labels, -# cls.train_indizes, -# cls.val_indizes, -# ), -# ] -# -# @classmethod -# def tearDownClass(cls): -# shutil.rmtree(cls.test_file_path) -# -# def test_multimodal_fusion(self): -# task = Task( -# "UnimodalRepresentationTask1", -# TestSVM(), -# self.data_generator.labels, -# self.train_indizes, -# self.val_indizes, -# ) -# audio_data_loader = AudioLoader( -# self.data_generator.get_modality_path(ModalityType.AUDIO), -# self.data_generator.indices, -# ) -# audio = UnimodalModality(audio_data_loader) -# -# text_data_loader = TextLoader( -# self.data_generator.get_modality_path(ModalityType.TEXT), -# self.data_generator.indices, -# ) -# text = UnimodalModality(text_data_loader) -# -# video_data_loader = VideoLoader( -# self.data_generator.get_modality_path(ModalityType.VIDEO), -# self.data_generator.indices, -# ) -# video = UnimodalModality(video_data_loader) -# -# with patch.object( -# Registry, -# "_representations", -# { -# ModalityType.TEXT: [W2V], -# ModalityType.AUDIO: [Spectrogram], -# ModalityType.TIMESERIES: [ResNet], -# ModalityType.VIDEO: [ResNet], -# ModalityType.EMBEDDING: [], -# }, -# ): -# registry = Registry() -# registry._fusion_operators = [Average, Concatenation] -# unimodal_optimizer = UnimodalRepresentationOptimizer( -# [text, audio, video], [task], max_chain_depth=2 -# ) -# unimodal_optimizer.optimize() -# -# multimodal_optimizer = FusionOptimizer( -# [audio, text, video], -# task, -# unimodal_optimizer.optimization_results, -# unimodal_optimizer.cache, -# 2, -# 2, -# debug=False, -# ) -# multimodal_optimizer.optimize() -# -# -# if __name__ == "__main__": -# unittest.main() + test_file_path = None + data_generator = None + num_instances = 0 + + @classmethod + def setUpClass(cls): + cls.num_instances = 10 + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + cls.labels = np.random.choice([0, 1], size=cls.num_instances) + cls.indices = np.array(range(cls.num_instances)) + + split = train_test_split( + cls.indices, + cls.labels, + test_size=0.2, + random_state=42, + ) + cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ + int(i) for i in split[1] + ] + + cls.tasks = [ + Task( + "UnimodalRepresentationTask1", + TestSVM(), + cls.labels, + cls.train_indizes, + cls.val_indizes, + ), + Task( + "UnimodalRepresentationTask2", + TestCNN(), + cls.labels, + cls.train_indizes, + cls.val_indizes, + ), + ] + + def test_multimodal_fusion(self): + task = Task( + "UnimodalRepresentationTask1", + TestSVM(), + self.labels, + self.train_indizes, + self.val_indizes, + ) + + audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data( + self.num_instances, 30000 + ) + text_data, text_md = ModalityRandomDataGenerator().create_text_data( + self.num_instances + ) + video_data, video_md = ModalityRandomDataGenerator().create_visual_modality( + self.num_instances, 60 + ) + audio = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md + ) + ) + video = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md + ) + ) + text = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.TEXT, text_data, str, text_md + ) + ) + + with patch.object( + Registry, + "_representations", + { + ModalityType.TEXT: [W2V], + ModalityType.AUDIO: [Spectrogram], + ModalityType.TIMESERIES: [ResNet], + ModalityType.VIDEO: [ResNet], + ModalityType.EMBEDDING: [], + }, + ): + registry = Registry() + registry._fusion_operators = [Average, Concatenation] + unimodal_optimizer = UnimodalRepresentationOptimizer( + [text, audio, video], [task], max_chain_depth=2 + ) + unimodal_optimizer.optimize() + + multimodal_optimizer = FusionOptimizer( + [audio, text, video], + task, + unimodal_optimizer.optimization_results, + unimodal_optimizer.cache, + 2, + 2, + debug=False, + ) + multimodal_optimizer.optimize() + + +if __name__ == "__main__": + unittest.main() From 52de6e038d5cc4ccc530ce71167702770d80bd15 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Tue, 1 Jul 2025 16:47:53 +0200 Subject: [PATCH 17/23] testing --- src/main/python/tests/scuro/data_generator.py | 58 ++++++++++++++++++- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index 29e7493205f..ec5bedb0051 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -98,15 +98,67 @@ def create_audio_data(self, num_instances, num_features): return data, metadata def create_text_data(self, num_instances): - nltk.download("webtext") - sentences = nltk.corpus.webtext.sents()[:num_instances] + subjects = [ + "The cat", + "A dog", + "The student", + "The teacher", + "The bird", + "The child", + "The programmer", + "The scientist", + "A researcher", + ] + verbs = [ + "reads", + "writes", + "studies", + "analyzes", + "creates", + "develops", + "designs", + "implements", + "examines", + ] + objects = [ + "the document", + "the code", + "the data", + "the problem", + "the solution", + "the project", + "the research", + "the paper", + ] + adverbs = [ + "carefully", + "quickly", + "efficiently", + "thoroughly", + "diligently", + "precisely", + "methodically", + ] + + sentences = [] + for _ in range(num_instances): + include_adverb = np.random.random() < 0.7 + + subject = np.random.choice(subjects) + verb = np.random.choice(verbs) + obj = np.random.choice(objects) + adverb = np.random.choice(adverbs) if include_adverb else "" + + sentence = f"{subject} {adverb} {verb} {obj}" + + sentences.append(sentence) metadata = { i: ModalityType.TEXT.create_text_metadata(len(sentences[i]), sentences[i]) for i in range(num_instances) } - return [" ".join(sentence) for sentence in sentences], metadata + return sentences, metadata def create_visual_modality(self, num_instances, num_frames=1, height=28, width=28): if num_frames == 1: From d606f653f9fa9c5ddb60f28b972efb83754b86c8 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Tue, 1 Jul 2025 17:37:42 +0200 Subject: [PATCH 18/23] testing --- src/main/python/tests/scuro/data_generator.py | 2 - .../tests/scuro/test_multimodal_join.py | 193 ++++++++-------- .../tests/scuro/test_operator_registry.py | 85 ++++--- .../tests/scuro/test_unimodal_optimizer.py | 216 +++++++++--------- 4 files changed, 244 insertions(+), 252 deletions(-) diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index ec5bedb0051..209eb41f16e 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -26,8 +26,6 @@ import random import os -import nltk - from systemds.scuro.dataloader.base_loader import BaseLoader from systemds.scuro.dataloader.video_loader import VideoLoader from systemds.scuro.dataloader.audio_loader import AudioLoader diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py index c2c83493a3d..a871e4297bc 100644 --- a/src/main/python/tests/scuro/test_multimodal_join.py +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -37,103 +37,100 @@ class TestMultimodalJoin(unittest.TestCase): - pass + test_file_path = None + mods = None + text = None + audio = None + video = None + data_generator = None + num_instances = 0 + indizes = [] + @classmethod + def setUpClass(cls): + cls.test_file_path = "join_test_data" + cls.num_instances = 4 + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO] -# test_file_path = None -# mods = None -# text = None -# audio = None -# video = None -# data_generator = None -# num_instances = 0 -# indizes = [] -# -# @classmethod -# def setUpClass(cls): -# cls.test_file_path = "join_test_data" -# cls.num_instances = 4 -# cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO] -# -# cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) -# -# @classmethod -# def tearDownClass(cls): -# print("Cleaning up test data") -# shutil.rmtree(cls.test_file_path) -# -# def test_video_audio_join(self): -# self._execute_va_join() -# -# def test_chunked_video_audio_join(self): -# self._execute_va_join(2) -# -# def test_video_chunked_audio_join(self): -# self._execute_va_join(None, 2) -# -# def test_chunked_video_chunked_audio_join(self): -# self._execute_va_join(2, 2) -# -# def test_audio_video_join(self): -# # Audio has a much higher frequency than video, hence we would need to -# # duplicate or interpolate frames to match them to the audio frequency -# self._execute_av_join() -# -# # TODO -# # def test_chunked_audio_video_join(self): -# # self._execute_av_join(2) -# -# # TODO -# # def test_chunked_audio_chunked_video_join(self): -# # self._execute_av_join(2, 2) -# -# def _execute_va_join(self, l_chunk_size=None, r_chunk_size=None): -# video, audio = self._prepare_data(l_chunk_size, r_chunk_size) -# self._join(video, audio, 2) -# -# def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None): -# video, audio = self._prepare_data(l_chunk_size, r_chunk_size) -# self._join(audio, video, 2) -# -# def _prepare_data(self, l_chunk_size=None, r_chunk_size=None): -# video_data_loader = VideoLoader( -# self.data_generator.get_modality_path(ModalityType.VIDEO), -# self.data_generator.indices, -# data_type=np.float16, -# chunk_size=l_chunk_size, -# ) -# video = UnimodalModality(video_data_loader) -# -# audio_data_loader = AudioLoader( -# self.data_generator.get_modality_path(ModalityType.AUDIO), -# self.data_generator.indices, -# data_type=np.float32, -# chunk_size=r_chunk_size, -# ) -# audio = UnimodalModality(audio_data_loader) -# -# mel_audio = audio.apply_representation(MelSpectrogram()) -# -# return video, mel_audio -# -# def _join(self, left_modality, right_modality, window_size): -# resnet_modality = ( -# left_modality.join( -# right_modality, JoinCondition("timestamp", "timestamp", "<") -# ) -# .apply_representation(ResNet(layer="layer1.0.conv2", model_name="ResNet50")) -# .window_aggregation(window_size, "mean") -# .combine("concat") -# ) -# -# assert resnet_modality.left_modality is not None -# assert resnet_modality.right_modality is not None -# assert len(resnet_modality.left_modality.data) == self.num_instances -# assert len(resnet_modality.right_modality.data) == self.num_instances -# assert resnet_modality.data is not None -# -# return resnet_modality -# -# -# if __name__ == "__main__": -# unittest.main() + cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + + @classmethod + def tearDownClass(cls): + print("Cleaning up test data") + shutil.rmtree(cls.test_file_path) + + def test_video_audio_join(self): + self._execute_va_join() + + def test_chunked_video_audio_join(self): + self._execute_va_join(2) + + def test_video_chunked_audio_join(self): + self._execute_va_join(None, 2) + + def test_chunked_video_chunked_audio_join(self): + self._execute_va_join(2, 2) + + def test_audio_video_join(self): + # Audio has a much higher frequency than video, hence we would need to + # duplicate or interpolate frames to match them to the audio frequency + self._execute_av_join() + + # TODO + # def test_chunked_audio_video_join(self): + # self._execute_av_join(2) + + # TODO + # def test_chunked_audio_chunked_video_join(self): + # self._execute_av_join(2, 2) + + def _execute_va_join(self, l_chunk_size=None, r_chunk_size=None): + video, audio = self._prepare_data(l_chunk_size, r_chunk_size) + self._join(video, audio, 2) + + def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None): + video, audio = self._prepare_data(l_chunk_size, r_chunk_size) + self._join(audio, video, 2) + + def _prepare_data(self, l_chunk_size=None, r_chunk_size=None): + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), + self.data_generator.indices, + data_type=np.float16, + chunk_size=l_chunk_size, + ) + video = UnimodalModality(video_data_loader) + + audio_data_loader = AudioLoader( + self.data_generator.get_modality_path(ModalityType.AUDIO), + self.data_generator.indices, + data_type=np.float32, + chunk_size=r_chunk_size, + ) + audio = UnimodalModality(audio_data_loader) + + mel_audio = audio.apply_representation(MelSpectrogram()) + + return video, mel_audio + + def _join(self, left_modality, right_modality, window_size): + resnet_modality = ( + left_modality.join( + right_modality, JoinCondition("timestamp", "timestamp", "<") + ) + .apply_representation(ResNet(layer="layer1.0.conv2", model_name="ResNet50")) + .window_aggregation(window_size, "mean") + .combine("concat") + ) + + assert resnet_modality.left_modality is not None + assert resnet_modality.right_modality is not None + assert len(resnet_modality.left_modality.data) == self.num_instances + assert len(resnet_modality.right_modality.data) == self.num_instances + assert resnet_modality.data is not None + + return resnet_modality + + +if __name__ == "__main__": + unittest.main() diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py index fa4e5cf0356..e35bf5b1d2c 100644 --- a/src/main/python/tests/scuro/test_operator_registry.py +++ b/src/main/python/tests/scuro/test_operator_registry.py @@ -42,49 +42,46 @@ class TestOperatorRegistry(unittest.TestCase): - pass + def test_audio_representations_in_registry(self): + registry = Registry() + for representation in [Spectrogram, MelSpectrogram, Wav2Vec, MFCC]: + assert representation in registry.get_representations( + ModalityType.AUDIO + ), f"{representation} not in registry" + def test_video_representations_in_registry(self): + registry = Registry() + assert registry.get_representations(ModalityType.VIDEO) == [ResNet] -# def test_audio_representations_in_registry(self): -# registry = Registry() -# for representation in [Spectrogram, MelSpectrogram, Wav2Vec, MFCC]: -# assert representation in registry.get_representations( -# ModalityType.AUDIO -# ), f"{representation} not in registry" -# -# def test_video_representations_in_registry(self): -# registry = Registry() -# assert registry.get_representations(ModalityType.VIDEO) == [ResNet] -# -# def test_timeseries_representations_in_registry(self): -# registry = Registry() -# assert registry.get_representations(ModalityType.TIMESERIES) == [ResNet] -# -# def test_text_representations_in_registry(self): -# registry = Registry() -# for representation in [BoW, TfIdf, W2V, Bert]: -# assert representation in registry.get_representations( -# ModalityType.TEXT -# ), f"{representation} not in registry" -# -# def test_context_operator_in_registry(self): -# registry = Registry() -# assert registry.get_context_operators() == [WindowAggregation] -# -# # def test_fusion_operator_in_registry(self): -# # registry = Registry() -# # for fusion_operator in [ -# # # RowMax, -# # Sum, -# # Average, -# # Concatenation, -# # LSTM, -# # Multiplication, -# # ]: -# # assert ( -# # fusion_operator in registry.get_fusion_operators() -# # ), f"{fusion_operator} not in registry" -# -# -# if __name__ == "__main__": -# unittest.main() + def test_timeseries_representations_in_registry(self): + registry = Registry() + assert registry.get_representations(ModalityType.TIMESERIES) == [ResNet] + + def test_text_representations_in_registry(self): + registry = Registry() + for representation in [BoW, TfIdf, W2V, Bert]: + assert representation in registry.get_representations( + ModalityType.TEXT + ), f"{representation} not in registry" + + def test_context_operator_in_registry(self): + registry = Registry() + assert registry.get_context_operators() == [WindowAggregation] + + # def test_fusion_operator_in_registry(self): + # registry = Registry() + # for fusion_operator in [ + # # RowMax, + # Sum, + # Average, + # Concatenation, + # LSTM, + # Multiplication, + # ]: + # assert ( + # fusion_operator in registry.get_fusion_operators() + # ), f"{fusion_operator} not in registry" + + +if __name__ == "__main__": + unittest.main() diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py index ef61ec18b53..3ffa87ee4d2 100644 --- a/src/main/python/tests/scuro/test_unimodal_optimizer.py +++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py @@ -39,7 +39,7 @@ from systemds.scuro.representations.word2vec import W2V from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.representations.resnet import ResNet -from tests.scuro.data_generator import setup_data +from tests.scuro.data_generator import ModalityRandomDataGenerator, TestDataLoader from systemds.scuro.dataloader.audio_loader import AudioLoader from systemds.scuro.dataloader.video_loader import VideoLoader @@ -101,110 +101,110 @@ def test(self, test_X: np.ndarray, test_y: np.ndarray): class TestUnimodalRepresentationOptimizer(unittest.TestCase): - pass - - -# test_file_path = None -# data_generator = None -# num_instances = 0 -# -# @classmethod -# def setUpClass(cls): -# cls.test_file_path = "unimodal_optimizer_test_data" -# -# cls.num_instances = 10 -# cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] -# -# cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) -# split = train_test_split( -# cls.data_generator.indices, -# cls.data_generator.labels, -# test_size=0.2, -# random_state=42, -# ) -# cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ -# int(i) for i in split[1] -# ] -# -# cls.tasks = [ -# Task( -# "UnimodalRepresentationTask1", -# TestSVM(), -# cls.data_generator.labels, -# cls.train_indizes, -# cls.val_indizes, -# ), -# Task( -# "UnimodalRepresentationTask2", -# TestCNN(), -# cls.data_generator.labels, -# cls.train_indizes, -# cls.val_indizes, -# ), -# ] -# -# @classmethod -# def tearDownClass(cls): -# shutil.rmtree(cls.test_file_path) -# -# def test_unimodal_optimizer_for_audio_modality(self): -# audio_data_loader = AudioLoader( -# self.data_generator.get_modality_path(ModalityType.AUDIO), -# self.data_generator.indices, -# ) -# audio = UnimodalModality(audio_data_loader) -# -# self.optimize_unimodal_representation_for_modality(audio) -# -# def test_unimodal_optimizer_for_text_modality(self): -# text_data_loader = TextLoader( -# self.data_generator.get_modality_path(ModalityType.TEXT), -# self.data_generator.indices, -# ) -# text = UnimodalModality(text_data_loader) -# self.optimize_unimodal_representation_for_modality(text) -# -# def test_unimodal_optimizer_for_video_modality(self): -# video_data_loader = VideoLoader( -# self.data_generator.get_modality_path(ModalityType.VIDEO), -# self.data_generator.indices, -# ) -# video = UnimodalModality(video_data_loader) -# self.optimize_unimodal_representation_for_modality(video) -# -# def optimize_unimodal_representation_for_modality(self, modality): -# with patch.object( -# Registry, -# "_representations", -# { -# ModalityType.TEXT: [W2V], -# ModalityType.AUDIO: [Spectrogram], -# ModalityType.TIMESERIES: [ResNet], -# ModalityType.VIDEO: [ResNet], -# ModalityType.EMBEDDING: [], -# }, -# ): -# registry = Registry() -# -# unimodal_optimizer = UnimodalRepresentationOptimizer( -# [modality], self.tasks, max_chain_depth=2 -# ) -# unimodal_optimizer.optimize() -# -# assert ( -# list(unimodal_optimizer.optimization_results.keys())[0] -# == modality.modality_id -# ) -# assert len(list(unimodal_optimizer.optimization_results.values())[0]) == 2 -# assert ( -# len( -# unimodal_optimizer.get_k_best_results(modality, 1, self.tasks[0])[ -# 0 -# ].operator_chain -# ) -# >= 1 -# ) -# -# -# if __name__ == "__main__": -# unittest.main() + data_generator = None + num_instances = 0 + + @classmethod + def setUpClass(cls): + cls.num_instances = 10 + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + cls.labels = np.random.choice([0, 1], size=cls.num_instances) + cls.indices = np.array(range(cls.num_instances)) + + split = train_test_split( + cls.indices, + cls.labels, + test_size=0.2, + random_state=42, + ) + cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ + int(i) for i in split[1] + ] + + cls.tasks = [ + Task( + "UnimodalRepresentationTask1", + TestSVM(), + cls.labels, + cls.train_indizes, + cls.val_indizes, + ), + Task( + "UnimodalRepresentationTask2", + TestCNN(), + cls.labels, + cls.train_indizes, + cls.val_indizes, + ), + ] + + def test_unimodal_optimizer_for_audio_modality(self): + audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data( + self.num_instances, 30000 + ) + audio = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md + ) + ) + + self.optimize_unimodal_representation_for_modality(audio) + + def test_unimodal_optimizer_for_text_modality(self): + text_data, text_md = ModalityRandomDataGenerator().create_text_data( + self.num_instances + ) + text = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.TEXT, text_data, str, text_md + ) + ) + self.optimize_unimodal_representation_for_modality(text) + + def test_unimodal_optimizer_for_video_modality(self): + video_data, video_md = ModalityRandomDataGenerator().create_visual_modality( + self.num_instances, 60 + ) + video = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md + ) + ) + self.optimize_unimodal_representation_for_modality(video) + + def optimize_unimodal_representation_for_modality(self, modality): + with patch.object( + Registry, + "_representations", + { + ModalityType.TEXT: [W2V], + ModalityType.AUDIO: [Spectrogram], + ModalityType.TIMESERIES: [ResNet], + ModalityType.VIDEO: [ResNet], + ModalityType.EMBEDDING: [], + }, + ): + registry = Registry() + + unimodal_optimizer = UnimodalRepresentationOptimizer( + [modality], self.tasks, max_chain_depth=2 + ) + unimodal_optimizer.optimize() + + assert ( + list(unimodal_optimizer.optimization_results.keys())[0] + == modality.modality_id + ) + assert len(list(unimodal_optimizer.optimization_results.values())[0]) == 2 + assert ( + len( + unimodal_optimizer.get_k_best_results(modality, 1, self.tasks[0])[ + 0 + ].operator_chain + ) + >= 1 + ) + + +if __name__ == "__main__": + unittest.main() From 16b825c7aae45edcce58d4d014c0985cf7c8e9af Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 2 Jul 2025 09:43:52 +0200 Subject: [PATCH 19/23] testing --- .../tests/scuro/test_unimodal_optimizer.py | 215 +++++++++--------- 1 file changed, 108 insertions(+), 107 deletions(-) diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py index 3ffa87ee4d2..cb6c50ca591 100644 --- a/src/main/python/tests/scuro/test_unimodal_optimizer.py +++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py @@ -101,110 +101,111 @@ def test(self, test_X: np.ndarray, test_y: np.ndarray): class TestUnimodalRepresentationOptimizer(unittest.TestCase): - data_generator = None - num_instances = 0 - - @classmethod - def setUpClass(cls): - cls.num_instances = 10 - cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] - cls.labels = np.random.choice([0, 1], size=cls.num_instances) - cls.indices = np.array(range(cls.num_instances)) - - split = train_test_split( - cls.indices, - cls.labels, - test_size=0.2, - random_state=42, - ) - cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ - int(i) for i in split[1] - ] - - cls.tasks = [ - Task( - "UnimodalRepresentationTask1", - TestSVM(), - cls.labels, - cls.train_indizes, - cls.val_indizes, - ), - Task( - "UnimodalRepresentationTask2", - TestCNN(), - cls.labels, - cls.train_indizes, - cls.val_indizes, - ), - ] - - def test_unimodal_optimizer_for_audio_modality(self): - audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data( - self.num_instances, 30000 - ) - audio = UnimodalModality( - TestDataLoader( - self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md - ) - ) - - self.optimize_unimodal_representation_for_modality(audio) - - def test_unimodal_optimizer_for_text_modality(self): - text_data, text_md = ModalityRandomDataGenerator().create_text_data( - self.num_instances - ) - text = UnimodalModality( - TestDataLoader( - self.indices, None, ModalityType.TEXT, text_data, str, text_md - ) - ) - self.optimize_unimodal_representation_for_modality(text) - - def test_unimodal_optimizer_for_video_modality(self): - video_data, video_md = ModalityRandomDataGenerator().create_visual_modality( - self.num_instances, 60 - ) - video = UnimodalModality( - TestDataLoader( - self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md - ) - ) - self.optimize_unimodal_representation_for_modality(video) - - def optimize_unimodal_representation_for_modality(self, modality): - with patch.object( - Registry, - "_representations", - { - ModalityType.TEXT: [W2V], - ModalityType.AUDIO: [Spectrogram], - ModalityType.TIMESERIES: [ResNet], - ModalityType.VIDEO: [ResNet], - ModalityType.EMBEDDING: [], - }, - ): - registry = Registry() - - unimodal_optimizer = UnimodalRepresentationOptimizer( - [modality], self.tasks, max_chain_depth=2 - ) - unimodal_optimizer.optimize() - - assert ( - list(unimodal_optimizer.optimization_results.keys())[0] - == modality.modality_id - ) - assert len(list(unimodal_optimizer.optimization_results.values())[0]) == 2 - assert ( - len( - unimodal_optimizer.get_k_best_results(modality, 1, self.tasks[0])[ - 0 - ].operator_chain - ) - >= 1 - ) - - -if __name__ == "__main__": - unittest.main() + pass +# data_generator = None +# num_instances = 0 +# +# @classmethod +# def setUpClass(cls): +# cls.num_instances = 10 +# cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] +# cls.labels = np.random.choice([0, 1], size=cls.num_instances) +# cls.indices = np.array(range(cls.num_instances)) +# +# split = train_test_split( +# cls.indices, +# cls.labels, +# test_size=0.2, +# random_state=42, +# ) +# cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ +# int(i) for i in split[1] +# ] +# +# cls.tasks = [ +# Task( +# "UnimodalRepresentationTask1", +# TestSVM(), +# cls.labels, +# cls.train_indizes, +# cls.val_indizes, +# ), +# Task( +# "UnimodalRepresentationTask2", +# TestCNN(), +# cls.labels, +# cls.train_indizes, +# cls.val_indizes, +# ), +# ] +# +# def test_unimodal_optimizer_for_audio_modality(self): +# audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data( +# self.num_instances, 30000 +# ) +# audio = UnimodalModality( +# TestDataLoader( +# self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md +# ) +# ) +# +# self.optimize_unimodal_representation_for_modality(audio) +# +# def test_unimodal_optimizer_for_text_modality(self): +# text_data, text_md = ModalityRandomDataGenerator().create_text_data( +# self.num_instances +# ) +# text = UnimodalModality( +# TestDataLoader( +# self.indices, None, ModalityType.TEXT, text_data, str, text_md +# ) +# ) +# self.optimize_unimodal_representation_for_modality(text) +# +# def test_unimodal_optimizer_for_video_modality(self): +# video_data, video_md = ModalityRandomDataGenerator().create_visual_modality( +# self.num_instances, 60 +# ) +# video = UnimodalModality( +# TestDataLoader( +# self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md +# ) +# ) +# self.optimize_unimodal_representation_for_modality(video) +# +# def optimize_unimodal_representation_for_modality(self, modality): +# with patch.object( +# Registry, +# "_representations", +# { +# ModalityType.TEXT: [W2V], +# ModalityType.AUDIO: [Spectrogram], +# ModalityType.TIMESERIES: [ResNet], +# ModalityType.VIDEO: [ResNet], +# ModalityType.EMBEDDING: [], +# }, +# ): +# registry = Registry() +# +# unimodal_optimizer = UnimodalRepresentationOptimizer( +# [modality], self.tasks, max_chain_depth=2 +# ) +# unimodal_optimizer.optimize() +# +# assert ( +# list(unimodal_optimizer.optimization_results.keys())[0] +# == modality.modality_id +# ) +# assert len(list(unimodal_optimizer.optimization_results.values())[0]) == 2 +# assert ( +# len( +# unimodal_optimizer.get_k_best_results(modality, 1, self.tasks[0])[ +# 0 +# ].operator_chain +# ) +# >= 1 +# ) +# +# +# if __name__ == "__main__": +# unittest.main() From ae2152f64ba71a14cc4c62a04ced5e8ab7afe7f4 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Mon, 28 Jul 2025 10:33:06 +0200 Subject: [PATCH 20/23] reduce test time --- src/main/python/systemds/scuro/__init__.py | 4 +- .../python/systemds/scuro/modality/type.py | 8 + .../systemds/scuro/representations/average.py | 2 + .../systemds/scuro/representations/bert.py | 27 ++- .../systemds/scuro/representations/fusion.py | 4 + .../{multiplication.py => hadamard.py} | 28 +-- .../scuro/representations/image_bind.py | 34 +-- .../systemds/scuro/representations/max.py | 51 +---- .../systemds/scuro/representations/x3d.py | 71 +++++- .../systemds/scuro/utils/torch_dataset.py | 12 +- src/main/python/systemds/utils/helpers.py | 3 +- src/main/python/tests/scuro/data_generator.py | 16 +- src/main/python/tests/scuro/test_dr_search.py | 6 +- .../python/tests/scuro/test_fusion_orders.py | 96 ++++++++ .../tests/scuro/test_multimodal_fusion.py | 2 +- .../tests/scuro/test_multimodal_join.py | 44 ++-- .../tests/scuro/test_operator_registry.py | 2 +- .../tests/scuro/test_unimodal_optimizer.py | 215 +++++++++--------- .../scuro/test_unimodal_representations.py | 157 +++++++------ .../tests/scuro/test_window_operations.py | 149 ++++++------ 20 files changed, 542 insertions(+), 389 deletions(-) rename src/main/python/systemds/scuro/representations/{multiplication.py => hadamard.py} (67%) create mode 100644 src/main/python/tests/scuro/test_fusion_orders.py diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py index b878200b0b4..1c3cfe92231 100644 --- a/src/main/python/systemds/scuro/__init__.py +++ b/src/main/python/systemds/scuro/__init__.py @@ -39,7 +39,7 @@ from systemds.scuro.representations.max import RowMax from systemds.scuro.representations.mel_spectrogram import MelSpectrogram from systemds.scuro.representations.mfcc import MFCC -from systemds.scuro.representations.multiplication import Multiplication +from systemds.scuro.representations.hadamard import Hadamard from systemds.scuro.representations.optical_flow import OpticalFlow from systemds.scuro.representations.representation import Representation from systemds.scuro.representations.representation_dataloader import NPY @@ -94,7 +94,7 @@ "RowMax", "MelSpectrogram", "MFCC", - "Multiplication", + "Hadamard", "OpticalFlow", "Representation", "NPY", diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py index 4b59c263d6b..a2f3d993a9b 100644 --- a/src/main/python/systemds/scuro/modality/type.py +++ b/src/main/python/systemds/scuro/modality/type.py @@ -190,6 +190,14 @@ def get_schema(self): def update_metadata(self, md, data): return ModalitySchemas.update_metadata(self.name, md, data) + + def add_alignment(self, md, alignment_timestamps): + md["alignment_timestamps"] = alignment_timestamps + return md + + def add_field(self, md, field, data): + md[field] = data + return md def create_audio_metadata(self, sampling_rate, data): md = deepcopy(self.get_schema()) diff --git a/src/main/python/systemds/scuro/representations/average.py b/src/main/python/systemds/scuro/representations/average.py index 4c6b0e17879..8a7e6b9ec8e 100644 --- a/src/main/python/systemds/scuro/representations/average.py +++ b/src/main/python/systemds/scuro/representations/average.py @@ -37,6 +37,8 @@ def __init__(self): Combines modalities using averaging """ super().__init__("Average") + self.associative = True + self.commutative = True def transform(self, modalities: List[Modality]): for modality in modalities: diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py index 5aa073ed820..15b969aeb49 100644 --- a/src/main/python/systemds/scuro/representations/bert.py +++ b/src/main/python/systemds/scuro/representations/bert.py @@ -22,11 +22,14 @@ from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation import torch -from transformers import BertTokenizer, BertModel +from transformers import BertTokenizerFast, BertModel from systemds.scuro.representations.utils import save_embeddings from systemds.scuro.modality.type import ModalityType from systemds.scuro.drsearch.operator_registry import register_representation +import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" + @register_representation(ModalityType.TEXT) class Bert(UnimodalRepresentation): @@ -40,29 +43,33 @@ def __init__(self, model_name="bert", output_file=None): def transform(self, modality): transformed_modality = TransformedModality(modality, self) model_name = "bert-base-uncased" - tokenizer = BertTokenizer.from_pretrained( + tokenizer = BertTokenizerFast.from_pretrained( model_name, clean_up_tokenization_spaces=True ) model = BertModel.from_pretrained(model_name) - embeddings = self.create_embeddings(modality.data, model, tokenizer) - + embeddings = self.create_embeddings(modality, model, tokenizer) + if self.output_file is not None: save_embeddings(embeddings, self.output_file) transformed_modality.data = embeddings return transformed_modality - def create_embeddings(self, data, model, tokenizer): + def create_embeddings(self, modality, model, tokenizer): embeddings = [] - for d in data: - inputs = tokenizer(d, return_tensors="pt", padding=True, truncation=True) - + for i, d in enumerate(modality.data): + inputs = tokenizer(d, return_offsets_mapping=True, return_tensors="pt", padding=True, truncation=True) + + ModalityType.TEXT.add_field(list(modality.metadata.values())[i], "token_to_character_mapping", inputs.data['offset_mapping'][0].tolist()) + + del inputs.data['offset_mapping'] + with torch.no_grad(): outputs = model(**inputs) - cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy() - embeddings.append(cls_embedding.reshape(1, -1)) + cls_embedding = outputs.last_hidden_state[0].numpy() + embeddings.append(cls_embedding) return embeddings diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py index 773452371be..339bb050a52 100644 --- a/src/main/python/systemds/scuro/representations/fusion.py +++ b/src/main/python/systemds/scuro/representations/fusion.py @@ -33,6 +33,10 @@ def __init__(self, name, parameters=None): :param name: Name of the fusion type """ super().__init__(name, parameters) + self.associative = False + self.commutative = False + self.needs_alignment = False + def transform(self, modalities: List[Modality]): """ diff --git a/src/main/python/systemds/scuro/representations/multiplication.py b/src/main/python/systemds/scuro/representations/hadamard.py similarity index 67% rename from src/main/python/systemds/scuro/representations/multiplication.py rename to src/main/python/systemds/scuro/representations/hadamard.py index 8d1e7f8c908..0576ef49563 100644 --- a/src/main/python/systemds/scuro/representations/multiplication.py +++ b/src/main/python/systemds/scuro/representations/hadamard.py @@ -24,31 +24,23 @@ import numpy as np from systemds.scuro.modality.modality import Modality -from systemds.scuro.representations.utils import pad_sequences from systemds.scuro.representations.fusion import Fusion from systemds.scuro.drsearch.operator_registry import register_fusion_operator - - @register_fusion_operator() -class Multiplication(Fusion): +class Hadamard(Fusion): def __init__(self): """ - Combines modalities using elementwise multiply + Combines modalities using elementwise multiply (Hadamard product) """ - super().__init__("Multiplication") + super().__init__("Hadamard") + self.needs_alignment = True # zero padding falsifies the result + self.commutative = True + self.associative = True def transform(self, modalities: List[Modality], train_indices=None): - max_emb_size = self.get_max_embedding_size(modalities) - - data = pad_sequences(modalities[0].data, maxlen=max_emb_size, dtype="float32") - - for m in range(1, len(modalities)): - # scaled = self.scale_data(modalities[m].data, train_indices) - data = np.multiply( - data, - pad_sequences(modalities[m].data, maxlen=max_emb_size, dtype="float32"), - ) - - return data + # TODO: check for alignment in the metadata + fused_data = np.prod([m.data for m in modalities], axis=0) + + return fused_data diff --git a/src/main/python/systemds/scuro/representations/image_bind.py b/src/main/python/systemds/scuro/representations/image_bind.py index b7b1e31ecec..608e68e61c0 100644 --- a/src/main/python/systemds/scuro/representations/image_bind.py +++ b/src/main/python/systemds/scuro/representations/image_bind.py @@ -39,40 +39,46 @@ DEVICE = torch.device("cpu") -@register_representation([ModalityType.TEXT, ModalityType.AUDIO, ModalityType.VIDEO]) +# @register_representation([ModalityType.TEXT, ModalityType.AUDIO, ModalityType.VIDEO]) class ImageBind(UnimodalRepresentation): def __init__(self): parameters = {} super().__init__("ImageBind", ModalityType.EMBEDDING, parameters) + self.model = imagebind_model.imagebind_huge(pretrained=True) + for param in self.model.parameters(): + param.requires_grad = False + self.model.eval() + self.model.to(DEVICE) def transform(self, modality): transformed_modality = TransformedModality( - modality.modality_type, self, modality.modality_id, modality.metadata + modality, self, ModalityType.EMBEDDING ) - model = imagebind_model.imagebind_huge(pretrained=True) - for param in model.parameters(): - param.requires_grad = False - model.eval() - model.to(DEVICE) - result = [] if modality.modality_type == ModalityType.TEXT: for i, instance in enumerate(modality.data): text_inputs = data.load_and_transform_text(instance, DEVICE) - text_embeddings = model({IBModalityType.TEXT: text_inputs})[ + text_embeddings =self.model({IBModalityType.TEXT: text_inputs})[ IBModalityType.TEXT ] result.append(text_embeddings.mean(axis=0).cpu().detach().numpy()) if modality.modality_type == ModalityType.AUDIO: audio_inputs = data.load_and_transform_audio_data( - list(modality.metadata), + list(modality.metadata)[ + (modality.data_loader.next_chunk - 1) + * (modality.data_loader.chunk_size): ( + modality.data_loader.next_chunk - 1 + ) + * (modality.data_loader.chunk_size) + + (modality.data_loader.chunk_size) + ], DEVICE, ) - audio_embeddings = model({IBModalityType.AUDIO: audio_inputs})[ + audio_embeddings = self.model({IBModalityType.AUDIO: audio_inputs})[ IBModalityType.AUDIO ] - result.append(audio_embeddings.mean(axis=0).cpu().detach().numpy()) + result.extend(audio_embeddings.cpu().detach().numpy()) if modality.modality_type == ModalityType.VIDEO: video_inputs = data.load_and_transform_video_data( list(modality.metadata)[ @@ -85,10 +91,10 @@ def transform(self, modality): ], DEVICE, ) - video_embeddings = model({IBModalityType.VISION: video_inputs})[ + video_embeddings = self.model({IBModalityType.VISION: video_inputs})[ IBModalityType.VISION ] - result.append(video_embeddings.mean(axis=0).cpu().detach().numpy()) + result.extend(video_embeddings.cpu().detach().numpy()) transformed_modality.data = result return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/max.py b/src/main/python/systemds/scuro/representations/max.py index 5a787dcf0c3..f7f7c298b8a 100644 --- a/src/main/python/systemds/scuro/representations/max.py +++ b/src/main/python/systemds/scuro/representations/max.py @@ -18,14 +18,11 @@ # under the License. # # ------------------------------------------------------------- -import itertools from typing import List import numpy as np from systemds.scuro.modality.modality import Modality -from systemds.scuro.representations.utils import pad_sequences - from systemds.scuro.representations.fusion import Fusion from systemds.scuro.drsearch.operator_registry import register_fusion_operator @@ -33,52 +30,22 @@ @register_fusion_operator() class RowMax(Fusion): - def __init__(self, split=4): + def __init__(self): """ Combines modalities by computing the outer product of a modality combination and taking the row max """ super().__init__("RowMax") - self.split = split + self.needs_alignment = True + self.associative = True + self.commutative = True def transform( self, modalities: List[Modality], ): - if len(modalities) < 2: - return np.array(modalities[0].data) - - max_emb_size = self.get_max_embedding_size(modalities) - - padded_modalities = [] - for modality in modalities: - d = pad_sequences(modality.data, maxlen=max_emb_size, dtype="float32") - padded_modalities.append(d) - - split_rows = int(len(modalities[0].data) / self.split) - - data = [] - - for combination in itertools.combinations(padded_modalities, 2): - combined = None - for i in range(0, self.split): - start = split_rows * i - end = ( - split_rows * (i + 1) - if i < (self.split - 1) - else len(modalities[0].data) - ) - m = np.einsum( - "bi,bo->bio", combination[0][start:end], combination[1][start:end] - ) - m = m.max(axis=2) - if combined is None: - combined = m - else: - combined = np.concatenate((combined, m), axis=0) - data.append(combined) - - data = np.stack(data) - data = data.max(axis=0) - - return np.array(data) + # TODO: need to check if data is aligned - same number of dimension + fused_data = np.maximum.reduce([m.data for m in modalities]) + + return fused_data + \ No newline at end of file diff --git a/src/main/python/systemds/scuro/representations/x3d.py b/src/main/python/systemds/scuro/representations/x3d.py index b518f68ef54..42f21ec7e89 100644 --- a/src/main/python/systemds/scuro/representations/x3d.py +++ b/src/main/python/systemds/scuro/representations/x3d.py @@ -30,13 +30,15 @@ import numpy as np from systemds.scuro.modality.type import ModalityType from systemds.scuro.drsearch.operator_registry import register_representation - +import math if torch.backends.mps.is_available(): DEVICE = torch.device("mps") # elif torch.cuda.is_available(): # DEVICE = torch.device("cuda") else: DEVICE = torch.device("cpu") + +DEVICE = torch.device("cpu") # @register_representation([ModalityType.VIDEO]) @@ -133,3 +135,70 @@ def hook( transformed_modality.data = list(embeddings.values()) return transformed_modality + + +class I3D(UnimodalRepresentation): + def __init__(self, layer="avgpool", model_name="i3d", output_file=None): + self.model_name = model_name + parameters = self._get_parameters() + self.model = torch.hub.load("facebookresearch/pytorchvideo", "i3d_r50", pretrained=True).to(DEVICE) + super().__init__("I3D", ModalityType.TIMESERIES, parameters) + + self.output_file = output_file + self.layer_name = layer + self.model.eval() + for param in self.model.parameters(): + param.requires_grad = False + + def _get_parameters(self, high_level=True): + parameters = {"model_name": [], "layer_name": []} + for m in ["r3d", "s3d"]: + parameters["model_name"].append(m) + + if high_level: + parameters["layer_name"] = [ + "conv1", + "layer1", + "layer2", + "layer3", + "layer4", + "avgpool", + ] + else: + for name, layer in self.model.named_modules(): + parameters["layer_name"].append(name) + return parameters + + def transform(self, modality): + dataset = CustomDataset(modality.data, torch.float32, DEVICE) + embeddings = {} + + features = None + + def hook(module, input, output): + pooled = torch.nn.functional.adaptive_avg_pool3d(output, 1).squeeze() + nonlocal features + features = pooled.detach().cpu().numpy() + + handle = self.model.blocks[6].dropout.register_forward_hook(hook) + + for instance in dataset: + video_id = instance["id"] + frames = instance["data"].to(DEVICE) + embeddings[video_id] = [] + + + batch = torch.transpose(frames, 1, 0) + batch = batch.unsqueeze(0) + _ = self.model(batch) + + embeddings[video_id] = features + + transformed_modality = TransformedModality( + modality, self, self.output_modality_type + ) + + transformed_modality.data = list(embeddings.values()) + + return transformed_modality + diff --git a/src/main/python/systemds/scuro/utils/torch_dataset.py b/src/main/python/systemds/scuro/utils/torch_dataset.py index 314dfcd5fc1..f93dcaca8a7 100644 --- a/src/main/python/systemds/scuro/utils/torch_dataset.py +++ b/src/main/python/systemds/scuro/utils/torch_dataset.py @@ -25,15 +25,19 @@ class CustomDataset(torch.utils.data.Dataset): - def __init__(self, data, data_type, device): + def __init__(self, data, data_type, device, size=None): self.data = data self.data_type = data_type self.device = device + self.size = size + if size is None: + self.size = (256, 224) + self.tf = transforms.Compose( [ transforms.ToPILImage(), - transforms.Resize(256), - transforms.CenterCrop(224), + transforms.Resize(self.size[0]), + transforms.CenterCrop(self.size[1]), transforms.ToTensor(), transforms.ConvertImageDtype(dtype=self.data_type), transforms.Normalize( @@ -45,7 +49,7 @@ def __init__(self, data, data_type, device): def __getitem__(self, index) -> Dict[str, object]: data = self.data[index] output = torch.empty( - (len(data), 3, 224, 224), dtype=self.data_type, device=self.device + (len(data), 3, self.size[1], self.size[1]), dtype=self.data_type, device=self.device ) for i, d in enumerate(data): diff --git a/src/main/python/systemds/utils/helpers.py b/src/main/python/systemds/utils/helpers.py index 05c9bf0647f..429f924feb7 100644 --- a/src/main/python/systemds/utils/helpers.py +++ b/src/main/python/systemds/utils/helpers.py @@ -23,9 +23,10 @@ from importlib.util import find_spec from itertools import chain from typing import Dict, Iterable - +import torch from systemds.utils.consts import MODULE_NAME +DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.mps.is_available() else 'cpu' def create_params_string( unnamed_parameters: Iterable[str], named_parameters: Dict[str, str] diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index 209eb41f16e..bc8854647df 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -41,9 +41,16 @@ def __init__(self, indices, chunk_size, modality_type, data, data_type, metadata self.metadata = metadata self.test_data = data - + + def reset(self): + self._next_chunk = 0 + self.data = [] + def extract(self, file, indices): - self.data = self.test_data + if isinstance(self.test_data, list): + self.data = [self.test_data[i] for i in indices] + else: + self.data = self.test_data[indices] class ModalityRandomDataGenerator: @@ -89,7 +96,7 @@ def create1DModality( def create_audio_data(self, num_instances, num_features): data = np.random.rand(num_instances, num_features).astype(np.float32) metadata = { - i: ModalityType.AUDIO.create_audio_metadata(num_features / 10, data[i]) + i: ModalityType.AUDIO.create_audio_metadata(16000, data[i]) for i in range(num_instances) } @@ -164,7 +171,7 @@ def create_visual_modality(self, num_instances, num_frames=1, height=28, width=2 else: metadata = { i: ModalityType.VIDEO.create_video_metadata( - num_instances / 30, num_frames / 30, width, height, 1 + 30, num_frames, width, height, 1 ) for i in range(num_instances) } @@ -172,6 +179,7 @@ def create_visual_modality(self, num_instances, num_frames=1, height=28, width=2 return ( np.random.randint( 0, 256, (num_instances, num_frames, height, width) + # ).astype(np.float16).tolist(), ).astype(np.float16), metadata, ) diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index a01ac308201..50f57eebb20 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -38,7 +38,7 @@ from systemds.scuro.representations.lstm import LSTM from systemds.scuro.representations.max import RowMax from systemds.scuro.representations.mel_spectrogram import MelSpectrogram -from systemds.scuro.representations.multiplication import Multiplication +from systemds.scuro.representations.hadamard import Hadamard from systemds.scuro.representations.resnet import ResNet from systemds.scuro.representations.sum import Sum from tests.scuro.data_generator import ModalityRandomDataGenerator @@ -125,8 +125,8 @@ def setUpClass(cls): cls.representations = [ Concatenation(), Average(), - RowMax(100), - Multiplication(), + RowMax(), + Hadamard(), Sum(), LSTM(width=256, depth=3), ] diff --git a/src/main/python/tests/scuro/test_fusion_orders.py b/src/main/python/tests/scuro/test_fusion_orders.py new file mode 100644 index 00000000000..7bb086b9a37 --- /dev/null +++ b/src/main/python/tests/scuro/test_fusion_orders.py @@ -0,0 +1,96 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +import os +import shutil +import unittest +import numpy as np + +from systemds.scuro import Concatenation, RowMax, Hadamard +from systemds.scuro.modality.unimodal_modality import UnimodalModality +from systemds.scuro.representations.bert import Bert +from systemds.scuro.representations.mel_spectrogram import MelSpectrogram +from systemds.scuro.representations.average import Average +from tests.scuro.data_generator import ModalityRandomDataGenerator +from systemds.scuro.modality.type import ModalityType + + +class TestFusionOrders(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.num_instances = 40 + cls.data_generator = ModalityRandomDataGenerator() + cls.r_1 = cls.data_generator.create1DModality(40, 100, ModalityType.AUDIO) + cls.r_2 = cls.data_generator.create1DModality(40, 100, ModalityType.TEXT) + cls.r_3 = cls.data_generator.create1DModality(40, 100, ModalityType.TEXT) + + def test_fusion_order_avg(self): + r_1_r_2 = self.r_1.combine(self.r_2, Average()) + r_2_r_1 = self.r_2.combine(self.r_1, Average()) + r_1_r_2_r_3 = r_1_r_2.combine(self.r_3, Average()) + r_2_r_1_r_3 = r_2_r_1.combine(self.r_3, Average()) + + r1_r2_r3 = self.r_1.combine([self.r_2, self.r_3], Average()) + + self.assertTrue(np.array_equal(r_1_r_2.data, r_2_r_1.data)) + self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data)) + self.assertFalse(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data)) + self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) + + + def test_fusion_order_concat(self): + r_1_r_2 = self.r_1.combine(self.r_2, Concatenation()) + r_2_r_1 = self.r_2.combine(self.r_1, Concatenation()) + r_1_r_2_r_3 = r_1_r_2.combine(self.r_3, Concatenation()) + r_2_r_1_r_3 = r_2_r_1.combine(self.r_3, Concatenation()) + + r1_r2_r3 = self.r_1.combine([self.r_2, self.r_3], Concatenation()) + + self.assertFalse(np.array_equal(r_1_r_2.data, r_2_r_1.data)) + self.assertFalse(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data)) + self.assertFalse(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data)) + self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) + + def test_fusion_order_max(self): + r_1_r_2 = self.r_1.combine(self.r_2, RowMax()) + r_2_r_1 = self.r_2.combine(self.r_1, RowMax()) + r_1_r_2_r_3 = r_1_r_2.combine(self.r_3, RowMax()) + r_2_r_1_r_3 = r_2_r_1.combine(self.r_3, RowMax()) + + r1_r2_r3 = self.r_1.combine([self.r_2, self.r_3], RowMax()) + + self.assertTrue(np.array_equal(r_1_r_2.data, r_2_r_1.data)) + self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data)) + self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data)) + self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) + + def test_fusion_order_hadamard(self): + r_1_r_2 = self.r_1.combine(self.r_2, Hadamard()) + r_2_r_1 = self.r_2.combine(self.r_1, Hadamard()) + r_1_r_2_r_3 = r_1_r_2.combine(self.r_3, Hadamard()) + r_2_r_1_r_3 = r_2_r_1.combine(self.r_3, Hadamard()) + + r1_r2_r3 = self.r_1.combine([self.r_2, self.r_3], Hadamard()) + + self.assertTrue(np.array_equal(r_1_r_2.data, r_2_r_1.data)) + self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data)) + self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data)) + self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) \ No newline at end of file diff --git a/src/main/python/tests/scuro/test_multimodal_fusion.py b/src/main/python/tests/scuro/test_multimodal_fusion.py index 56e53089253..77f03054eb5 100644 --- a/src/main/python/tests/scuro/test_multimodal_fusion.py +++ b/src/main/python/tests/scuro/test_multimodal_fusion.py @@ -155,7 +155,7 @@ def test_multimodal_fusion(self): ) audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data( - self.num_instances, 30000 + self.num_instances, 100 ) text_data, text_md = ModalityRandomDataGenerator().create_text_data( self.num_instances diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py index a871e4297bc..7f0b35c311f 100644 --- a/src/main/python/tests/scuro/test_multimodal_join.py +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -24,12 +24,12 @@ import unittest import numpy as np - +import copy from systemds.scuro.modality.joined import JoinCondition from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.representations.mel_spectrogram import MelSpectrogram from systemds.scuro.representations.resnet import ResNet -from tests.scuro.data_generator import setup_data +from tests.scuro.data_generator import TestDataLoader, ModalityRandomDataGenerator from systemds.scuro.dataloader.audio_loader import AudioLoader from systemds.scuro.dataloader.video_loader import VideoLoader @@ -48,16 +48,15 @@ class TestMultimodalJoin(unittest.TestCase): @classmethod def setUpClass(cls): - cls.test_file_path = "join_test_data" cls.num_instances = 4 - cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO] - - cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) - - @classmethod - def tearDownClass(cls): - print("Cleaning up test data") - shutil.rmtree(cls.test_file_path) + cls.indices = np.array(range(cls.num_instances)) + cls.audio_data, cls.audio_md = ModalityRandomDataGenerator().create_audio_data( + cls.num_instances, 32000 + ) + + cls.video_data, cls.video_md = ModalityRandomDataGenerator().create_visual_modality( + cls.num_instances, 60 + ) def test_video_audio_join(self): self._execute_va_join() @@ -93,21 +92,16 @@ def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None): self._join(audio, video, 2) def _prepare_data(self, l_chunk_size=None, r_chunk_size=None): - video_data_loader = VideoLoader( - self.data_generator.get_modality_path(ModalityType.VIDEO), - self.data_generator.indices, - data_type=np.float16, - chunk_size=l_chunk_size, + audio = UnimodalModality( + TestDataLoader( + self.indices, r_chunk_size, ModalityType.AUDIO, copy.deepcopy(self.audio_data), np.float32, copy.deepcopy(self.audio_md) + ) ) - video = UnimodalModality(video_data_loader) - - audio_data_loader = AudioLoader( - self.data_generator.get_modality_path(ModalityType.AUDIO), - self.data_generator.indices, - data_type=np.float32, - chunk_size=r_chunk_size, + video = UnimodalModality( + TestDataLoader( + self.indices, l_chunk_size, ModalityType.VIDEO, copy.deepcopy(self.video_data), np.float32, copy.deepcopy(self.video_md) + ) ) - audio = UnimodalModality(audio_data_loader) mel_audio = audio.apply_representation(MelSpectrogram()) @@ -118,7 +112,7 @@ def _join(self, left_modality, right_modality, window_size): left_modality.join( right_modality, JoinCondition("timestamp", "timestamp", "<") ) - .apply_representation(ResNet(layer="layer1.0.conv2", model_name="ResNet50")) + .apply_representation(ResNet(layer="layer1.0.conv2", model_name="ResNet18")) .window_aggregation(window_size, "mean") .combine("concat") ) diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py index e35bf5b1d2c..7f2a752722a 100644 --- a/src/main/python/tests/scuro/test_operator_registry.py +++ b/src/main/python/tests/scuro/test_operator_registry.py @@ -36,7 +36,7 @@ from systemds.scuro.representations.max import RowMax from systemds.scuro.representations.mel_spectrogram import MelSpectrogram from systemds.scuro.representations.spectrogram import Spectrogram -from systemds.scuro.representations.multiplication import Multiplication +from systemds.scuro.representations.hadamard import Hadamard from systemds.scuro.representations.resnet import ResNet from systemds.scuro.representations.sum import Sum diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py index cb6c50ca591..9ed034e5fe8 100644 --- a/src/main/python/tests/scuro/test_unimodal_optimizer.py +++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py @@ -101,111 +101,110 @@ def test(self, test_X: np.ndarray, test_y: np.ndarray): class TestUnimodalRepresentationOptimizer(unittest.TestCase): - pass -# data_generator = None -# num_instances = 0 -# -# @classmethod -# def setUpClass(cls): -# cls.num_instances = 10 -# cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] -# cls.labels = np.random.choice([0, 1], size=cls.num_instances) -# cls.indices = np.array(range(cls.num_instances)) -# -# split = train_test_split( -# cls.indices, -# cls.labels, -# test_size=0.2, -# random_state=42, -# ) -# cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ -# int(i) for i in split[1] -# ] -# -# cls.tasks = [ -# Task( -# "UnimodalRepresentationTask1", -# TestSVM(), -# cls.labels, -# cls.train_indizes, -# cls.val_indizes, -# ), -# Task( -# "UnimodalRepresentationTask2", -# TestCNN(), -# cls.labels, -# cls.train_indizes, -# cls.val_indizes, -# ), -# ] -# -# def test_unimodal_optimizer_for_audio_modality(self): -# audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data( -# self.num_instances, 30000 -# ) -# audio = UnimodalModality( -# TestDataLoader( -# self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md -# ) -# ) -# -# self.optimize_unimodal_representation_for_modality(audio) -# -# def test_unimodal_optimizer_for_text_modality(self): -# text_data, text_md = ModalityRandomDataGenerator().create_text_data( -# self.num_instances -# ) -# text = UnimodalModality( -# TestDataLoader( -# self.indices, None, ModalityType.TEXT, text_data, str, text_md -# ) -# ) -# self.optimize_unimodal_representation_for_modality(text) -# -# def test_unimodal_optimizer_for_video_modality(self): -# video_data, video_md = ModalityRandomDataGenerator().create_visual_modality( -# self.num_instances, 60 -# ) -# video = UnimodalModality( -# TestDataLoader( -# self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md -# ) -# ) -# self.optimize_unimodal_representation_for_modality(video) -# -# def optimize_unimodal_representation_for_modality(self, modality): -# with patch.object( -# Registry, -# "_representations", -# { -# ModalityType.TEXT: [W2V], -# ModalityType.AUDIO: [Spectrogram], -# ModalityType.TIMESERIES: [ResNet], -# ModalityType.VIDEO: [ResNet], -# ModalityType.EMBEDDING: [], -# }, -# ): -# registry = Registry() -# -# unimodal_optimizer = UnimodalRepresentationOptimizer( -# [modality], self.tasks, max_chain_depth=2 -# ) -# unimodal_optimizer.optimize() -# -# assert ( -# list(unimodal_optimizer.optimization_results.keys())[0] -# == modality.modality_id -# ) -# assert len(list(unimodal_optimizer.optimization_results.values())[0]) == 2 -# assert ( -# len( -# unimodal_optimizer.get_k_best_results(modality, 1, self.tasks[0])[ -# 0 -# ].operator_chain -# ) -# >= 1 -# ) -# -# -# if __name__ == "__main__": -# unittest.main() + data_generator = None + num_instances = 0 + + @classmethod + def setUpClass(cls): + cls.num_instances = 10 + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + cls.labels = np.random.choice([0, 1], size=cls.num_instances) + cls.indices = np.array(range(cls.num_instances)) + + split = train_test_split( + cls.indices, + cls.labels, + test_size=0.2, + random_state=42, + ) + cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [ + int(i) for i in split[1] + ] + + cls.tasks = [ + Task( + "UnimodalRepresentationTask1", + TestSVM(), + cls.labels, + cls.train_indizes, + cls.val_indizes, + ), + Task( + "UnimodalRepresentationTask2", + TestCNN(), + cls.labels, + cls.train_indizes, + cls.val_indizes, + ), + ] + + def test_unimodal_optimizer_for_audio_modality(self): + audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data( + self.num_instances, 100 + ) + audio = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md + ) + ) + + self.optimize_unimodal_representation_for_modality(audio) + + def test_unimodal_optimizer_for_text_modality(self): + text_data, text_md = ModalityRandomDataGenerator().create_text_data( + self.num_instances + ) + text = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.TEXT, text_data, str, text_md + ) + ) + self.optimize_unimodal_representation_for_modality(text) + + def test_unimodal_optimizer_for_video_modality(self): + video_data, video_md = ModalityRandomDataGenerator().create_visual_modality( + self.num_instances, 60 + ) + video = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md + ) + ) + self.optimize_unimodal_representation_for_modality(video) + + def optimize_unimodal_representation_for_modality(self, modality): + with patch.object( + Registry, + "_representations", + { + ModalityType.TEXT: [W2V], + ModalityType.AUDIO: [Spectrogram], + ModalityType.TIMESERIES: [ResNet], + ModalityType.VIDEO: [ResNet], + ModalityType.EMBEDDING: [], + }, + ): + registry = Registry() + + unimodal_optimizer = UnimodalRepresentationOptimizer( + [modality], self.tasks, max_chain_depth=2 + ) + unimodal_optimizer.optimize() + + assert ( + list(unimodal_optimizer.optimization_results.keys())[0] + == modality.modality_id + ) + assert len(list(unimodal_optimizer.optimization_results.values())[0]) == 2 + assert ( + len( + unimodal_optimizer.get_k_best_results(modality, 1, self.tasks[0])[ + 0 + ].operator_chain + ) + >= 1 + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py index f1dd530372e..f2a42076fc8 100644 --- a/src/main/python/tests/scuro/test_unimodal_representations.py +++ b/src/main/python/tests/scuro/test_unimodal_representations.py @@ -29,6 +29,8 @@ from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.representations.bert import Bert from systemds.scuro.representations.mel_spectrogram import MelSpectrogram +from systemds.scuro.representations.mfcc import MFCC +from systemds.scuro.representations.x3d import I3D from systemds.scuro.representations.resnet import ResNet from tests.scuro.data_generator import setup_data @@ -39,84 +41,81 @@ class TestUnimodalRepresentations(unittest.TestCase): - pass + test_file_path = None + mods = None + text = None + audio = None + video = None + data_generator = None + num_instances = 0 + @classmethod + def setUpClass(cls): + cls.test_file_path = "unimodal_test_data" -# test_file_path = None -# mods = None -# text = None -# audio = None -# video = None -# data_generator = None -# num_instances = 0 -# -# @classmethod -# def setUpClass(cls): -# cls.test_file_path = "unimodal_test_data" -# -# cls.num_instances = 4 -# cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] -# -# cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) -# os.makedirs(f"{cls.test_file_path}/embeddings") -# -# @classmethod -# def tearDownClass(cls): -# print("Cleaning up test data") -# shutil.rmtree(cls.test_file_path) -# -# def test_audio_representations(self): -# audio_representations = [MelSpectrogram()] # TODO: add FFT, TFN, 1DCNN -# audio_data_loader = AudioLoader( -# self.data_generator.get_modality_path(ModalityType.AUDIO), -# self.data_generator.indices, -# ) -# audio = UnimodalModality(audio_data_loader) -# -# for representation in audio_representations: -# r = audio.apply_representation(representation) -# assert r.data is not None -# assert len(r.data) == self.num_instances -# -# def test_video_representations(self): -# video_representations = [ResNet()] # Todo: add other video representations -# video_data_loader = VideoLoader( -# self.data_generator.get_modality_path(ModalityType.VIDEO), -# self.data_generator.indices, -# ) -# video = UnimodalModality(video_data_loader) -# for representation in video_representations: -# r = video.apply_representation(representation) -# assert r.data is not None -# assert len(r.data) == self.num_instances -# -# def test_text_representations(self): -# test_representations = [BoW(2, 2), W2V(5, 2, 2), TfIdf(2), Bert()] -# text_data_loader = TextLoader( -# self.data_generator.get_modality_path(ModalityType.TEXT), -# self.data_generator.indices, -# ) -# text = UnimodalModality(text_data_loader) -# -# for representation in test_representations: -# r = text.apply_representation(representation) -# assert r.data is not None -# assert len(r.data) == self.num_instances -# -# def test_chunked_video_representations(self): -# video_representations = [ResNet()] -# video_data_loader = VideoLoader( -# self.data_generator.get_modality_path(ModalityType.VIDEO), -# self.data_generator.indices, -# chunk_size=2, -# ) -# video = UnimodalModality(video_data_loader) -# for representation in video_representations: -# r = video.apply_representation(representation) -# assert r.data is not None -# assert len(r.data) == self.num_instances -# assert len(r.metadata) == self.num_instances -# -# -# if __name__ == "__main__": -# unittest.main() + cls.num_instances = 4 + cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + + cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + os.makedirs(f"{cls.test_file_path}/embeddings") + + @classmethod + def tearDownClass(cls): + print("Cleaning up test data") + shutil.rmtree(cls.test_file_path) + + def test_audio_representations(self): + audio_representations = [MFCC()] # TODO: add FFT, TFN, 1DCNN + audio_data_loader = AudioLoader( + self.data_generator.get_modality_path(ModalityType.AUDIO), + self.data_generator.indices, + ) + audio = UnimodalModality(audio_data_loader) + + for representation in audio_representations: + r = audio.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + + def test_video_representations(self): + video_representations = [I3D()] # Todo: add other video representations + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), + self.data_generator.indices, fps=5 + ) + video = UnimodalModality(video_data_loader) + for representation in video_representations: + r = video.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + + def test_text_representations(self): + test_representations = [BoW(2, 2), W2V(5, 2, 2), TfIdf(2), Bert()] + text_data_loader = TextLoader( + self.data_generator.get_modality_path(ModalityType.TEXT), + self.data_generator.indices, + ) + text = UnimodalModality(text_data_loader) + + for representation in test_representations: + r = text.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + + def test_chunked_video_representations(self): + video_representations = [ResNet()] + video_data_loader = VideoLoader( + self.data_generator.get_modality_path(ModalityType.VIDEO), + self.data_generator.indices, + chunk_size=2, + ) + video = UnimodalModality(video_data_loader) + for representation in video_representations: + r = video.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + assert len(r.metadata) == self.num_instances + + +if __name__ == "__main__": + unittest.main() diff --git a/src/main/python/tests/scuro/test_window_operations.py b/src/main/python/tests/scuro/test_window_operations.py index 9e9e82d9ed9..ea1b0f46f2e 100644 --- a/src/main/python/tests/scuro/test_window_operations.py +++ b/src/main/python/tests/scuro/test_window_operations.py @@ -29,81 +29,78 @@ class TestWindowOperations(unittest.TestCase): - pass + @classmethod + def setUpClass(cls): + cls.num_instances = 40 + cls.data_generator = ModalityRandomDataGenerator() + cls.aggregations = ["mean", "sum", "max", "min"] + def test_window_operations_on_audio_representations(self): + window_size = 10 + self.run_window_operations_for_modality(ModalityType.AUDIO, window_size) -# @classmethod -# def setUpClass(cls): -# cls.num_instances = 40 -# cls.data_generator = ModalityRandomDataGenerator() -# cls.aggregations = ["mean", "sum", "max", "min"] -# -# def test_window_operations_on_audio_representations(self): -# window_size = 10 -# self.run_window_operations_for_modality(ModalityType.AUDIO, window_size) -# -# def test_window_operations_on_video_representations(self): -# window_size = 10 -# self.run_window_operations_for_modality(ModalityType.VIDEO, window_size) -# -# def test_window_operations_on_text_representations(self): -# window_size = 10 -# -# self.run_window_operations_for_modality(ModalityType.TEXT, window_size) -# -# def run_window_operations_for_modality(self, modality_type, window_size): -# r = self.data_generator.create1DModality(40, 100, modality_type) -# for aggregation in self.aggregations: -# windowed_modality = r.window_aggregation(window_size, aggregation) -# -# self.verify_window_operation(aggregation, r, windowed_modality, window_size) -# -# def verify_window_operation( -# self, aggregation, modality, windowed_modality, window_size -# ): -# assert windowed_modality.data is not None -# assert len(windowed_modality.data) == self.num_instances -# -# for i, instance in enumerate(windowed_modality.data): -# # assert ( -# # list(windowed_modality.metadata.values())[i]["data_layout"]["shape"][0] -# # == list(modality.metadata.values())[i]["data_layout"]["shape"][0] -# # ) -# assert len(instance) == math.ceil(len(modality.data[i]) / window_size) -# for j in range(0, len(instance)): -# if aggregation == "mean": -# np.testing.assert_almost_equal( -# instance[j], -# np.mean( -# modality.data[i][j * window_size : (j + 1) * window_size], -# axis=0, -# ), -# ) -# elif aggregation == "sum": -# np.testing.assert_almost_equal( -# instance[j], -# np.sum( -# modality.data[i][j * window_size : (j + 1) * window_size], -# axis=0, -# ), -# ) -# elif aggregation == "max": -# np.testing.assert_almost_equal( -# instance[j], -# np.max( -# modality.data[i][j * window_size : (j + 1) * window_size], -# axis=0, -# ), -# ) -# elif aggregation == "min": -# np.testing.assert_almost_equal( -# instance[j], -# np.min( -# modality.data[i][j * window_size : (j + 1) * window_size], -# axis=0, -# ), -# ) -# -# -# if __name__ == "__main__": -# unittest.main() + def test_window_operations_on_video_representations(self): + window_size = 10 + self.run_window_operations_for_modality(ModalityType.VIDEO, window_size) + + def test_window_operations_on_text_representations(self): + window_size = 10 + + self.run_window_operations_for_modality(ModalityType.TEXT, window_size) + + def run_window_operations_for_modality(self, modality_type, window_size): + r = self.data_generator.create1DModality(40, 100, modality_type) + for aggregation in self.aggregations: + windowed_modality = r.window_aggregation(window_size, aggregation) + + self.verify_window_operation(aggregation, r, windowed_modality, window_size) + + def verify_window_operation( + self, aggregation, modality, windowed_modality, window_size + ): + assert windowed_modality.data is not None + assert len(windowed_modality.data) == self.num_instances + + for i, instance in enumerate(windowed_modality.data): + # assert ( + # list(windowed_modality.metadata.values())[i]["data_layout"]["shape"][0] + # == list(modality.metadata.values())[i]["data_layout"]["shape"][0] + # ) + assert len(instance) == math.ceil(len(modality.data[i]) / window_size) + for j in range(0, len(instance)): + if aggregation == "mean": + np.testing.assert_almost_equal( + instance[j], + np.mean( + modality.data[i][j * window_size : (j + 1) * window_size], + axis=0, + ), + ) + elif aggregation == "sum": + np.testing.assert_almost_equal( + instance[j], + np.sum( + modality.data[i][j * window_size : (j + 1) * window_size], + axis=0, + ), + ) + elif aggregation == "max": + np.testing.assert_almost_equal( + instance[j], + np.max( + modality.data[i][j * window_size : (j + 1) * window_size], + axis=0, + ), + ) + elif aggregation == "min": + np.testing.assert_almost_equal( + instance[j], + np.min( + modality.data[i][j * window_size : (j + 1) * window_size], + axis=0, + ), + ) + + +if __name__ == "__main__": + unittest.main() From f0018fe01f1bc11c702103e81bea7fc5e4e28ad2 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Mon, 28 Jul 2025 10:38:11 +0200 Subject: [PATCH 21/23] remove torch device --- src/main/python/systemds/utils/helpers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/python/systemds/utils/helpers.py b/src/main/python/systemds/utils/helpers.py index 429f924feb7..e6c2e199207 100644 --- a/src/main/python/systemds/utils/helpers.py +++ b/src/main/python/systemds/utils/helpers.py @@ -26,8 +26,6 @@ import torch from systemds.utils.consts import MODULE_NAME -DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.mps.is_available() else 'cpu' - def create_params_string( unnamed_parameters: Iterable[str], named_parameters: Dict[str, str] ) -> str: From 8e2007daa74ab9e1db8d8f8ee2fd8a6ea66fd8e3 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Mon, 28 Jul 2025 11:02:33 +0200 Subject: [PATCH 22/23] reformat --- .../scuro/drsearch/operator_registry.py | 1 + .../python/systemds/scuro/modality/type.py | 4 +- .../systemds/scuro/representations/bert.py | 25 +++++++---- .../systemds/scuro/representations/fusion.py | 1 - .../scuro/representations/hadamard.py | 6 ++- .../scuro/representations/image_bind.py | 14 +++---- .../systemds/scuro/representations/max.py | 3 +- .../systemds/scuro/representations/x3d.py | 41 +++++++++---------- .../systemds/scuro/utils/torch_dataset.py | 6 ++- src/main/python/systemds/utils/helpers.py | 1 + src/main/python/tests/scuro/data_generator.py | 10 +++-- .../python/tests/scuro/test_fusion_orders.py | 27 ++++++------ .../tests/scuro/test_multimodal_join.py | 20 ++++++--- .../scuro/test_unimodal_representations.py | 6 +-- 14 files changed, 95 insertions(+), 70 deletions(-) diff --git a/src/main/python/systemds/scuro/drsearch/operator_registry.py b/src/main/python/systemds/scuro/drsearch/operator_registry.py index 942e5bb80eb..cfd313eb563 100644 --- a/src/main/python/systemds/scuro/drsearch/operator_registry.py +++ b/src/main/python/systemds/scuro/drsearch/operator_registry.py @@ -58,6 +58,7 @@ def get_representations(self, modality: ModalityType): return self._representations[modality] def get_context_operators(self): + # TODO: return modality specific context operations return self._context_operators def get_fusion_operators(self): diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py index a2f3d993a9b..a479e07085d 100644 --- a/src/main/python/systemds/scuro/modality/type.py +++ b/src/main/python/systemds/scuro/modality/type.py @@ -190,11 +190,11 @@ def get_schema(self): def update_metadata(self, md, data): return ModalitySchemas.update_metadata(self.name, md, data) - + def add_alignment(self, md, alignment_timestamps): md["alignment_timestamps"] = alignment_timestamps return md - + def add_field(self, md, field, data): md[field] = data return md diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py index 15b969aeb49..8d8d40f4fd7 100644 --- a/src/main/python/systemds/scuro/representations/bert.py +++ b/src/main/python/systemds/scuro/representations/bert.py @@ -28,6 +28,7 @@ from systemds.scuro.drsearch.operator_registry import register_representation import os + os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -50,7 +51,7 @@ def transform(self, modality): model = BertModel.from_pretrained(model_name) embeddings = self.create_embeddings(modality, model, tokenizer) - + if self.output_file is not None: save_embeddings(embeddings, self.output_file) @@ -60,12 +61,22 @@ def transform(self, modality): def create_embeddings(self, modality, model, tokenizer): embeddings = [] for i, d in enumerate(modality.data): - inputs = tokenizer(d, return_offsets_mapping=True, return_tensors="pt", padding=True, truncation=True) - - ModalityType.TEXT.add_field(list(modality.metadata.values())[i], "token_to_character_mapping", inputs.data['offset_mapping'][0].tolist()) - - del inputs.data['offset_mapping'] - + inputs = tokenizer( + d, + return_offsets_mapping=True, + return_tensors="pt", + padding=True, + truncation=True, + ) + + ModalityType.TEXT.add_field( + list(modality.metadata.values())[i], + "token_to_character_mapping", + inputs.data["offset_mapping"][0].tolist(), + ) + + del inputs.data["offset_mapping"] + with torch.no_grad(): outputs = model(**inputs) diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py index 339bb050a52..cbbb5606e6d 100644 --- a/src/main/python/systemds/scuro/representations/fusion.py +++ b/src/main/python/systemds/scuro/representations/fusion.py @@ -36,7 +36,6 @@ def __init__(self, name, parameters=None): self.associative = False self.commutative = False self.needs_alignment = False - def transform(self, modalities: List[Modality]): """ diff --git a/src/main/python/systemds/scuro/representations/hadamard.py b/src/main/python/systemds/scuro/representations/hadamard.py index 0576ef49563..138003b8741 100644 --- a/src/main/python/systemds/scuro/representations/hadamard.py +++ b/src/main/python/systemds/scuro/representations/hadamard.py @@ -28,6 +28,8 @@ from systemds.scuro.representations.fusion import Fusion from systemds.scuro.drsearch.operator_registry import register_fusion_operator + + @register_fusion_operator() class Hadamard(Fusion): def __init__(self): @@ -35,12 +37,12 @@ def __init__(self): Combines modalities using elementwise multiply (Hadamard product) """ super().__init__("Hadamard") - self.needs_alignment = True # zero padding falsifies the result + self.needs_alignment = True # zero padding falsifies the result self.commutative = True self.associative = True def transform(self, modalities: List[Modality], train_indices=None): # TODO: check for alignment in the metadata fused_data = np.prod([m.data for m in modalities], axis=0) - + return fused_data diff --git a/src/main/python/systemds/scuro/representations/image_bind.py b/src/main/python/systemds/scuro/representations/image_bind.py index 608e68e61c0..e934d521af4 100644 --- a/src/main/python/systemds/scuro/representations/image_bind.py +++ b/src/main/python/systemds/scuro/representations/image_bind.py @@ -59,19 +59,19 @@ def transform(self, modality): if modality.modality_type == ModalityType.TEXT: for i, instance in enumerate(modality.data): text_inputs = data.load_and_transform_text(instance, DEVICE) - text_embeddings =self.model({IBModalityType.TEXT: text_inputs})[ + text_embeddings = self.model({IBModalityType.TEXT: text_inputs})[ IBModalityType.TEXT ] result.append(text_embeddings.mean(axis=0).cpu().detach().numpy()) if modality.modality_type == ModalityType.AUDIO: audio_inputs = data.load_and_transform_audio_data( list(modality.metadata)[ - (modality.data_loader.next_chunk - 1) - * (modality.data_loader.chunk_size): ( - modality.data_loader.next_chunk - 1 - ) - * (modality.data_loader.chunk_size) - + (modality.data_loader.chunk_size) + (modality.data_loader.next_chunk - 1) + * (modality.data_loader.chunk_size) : ( + modality.data_loader.next_chunk - 1 + ) + * (modality.data_loader.chunk_size) + + (modality.data_loader.chunk_size) ], DEVICE, ) diff --git a/src/main/python/systemds/scuro/representations/max.py b/src/main/python/systemds/scuro/representations/max.py index f7f7c298b8a..6ecf5fd52f3 100644 --- a/src/main/python/systemds/scuro/representations/max.py +++ b/src/main/python/systemds/scuro/representations/max.py @@ -46,6 +46,5 @@ def transform( ): # TODO: need to check if data is aligned - same number of dimension fused_data = np.maximum.reduce([m.data for m in modalities]) - + return fused_data - \ No newline at end of file diff --git a/src/main/python/systemds/scuro/representations/x3d.py b/src/main/python/systemds/scuro/representations/x3d.py index 42f21ec7e89..1629ac6f309 100644 --- a/src/main/python/systemds/scuro/representations/x3d.py +++ b/src/main/python/systemds/scuro/representations/x3d.py @@ -31,14 +31,13 @@ from systemds.scuro.modality.type import ModalityType from systemds.scuro.drsearch.operator_registry import register_representation import math + if torch.backends.mps.is_available(): DEVICE = torch.device("mps") -# elif torch.cuda.is_available(): -# DEVICE = torch.device("cuda") +elif torch.cuda.is_available(): + DEVICE = torch.device("cuda") else: DEVICE = torch.device("cpu") - -DEVICE = torch.device("cpu") # @register_representation([ModalityType.VIDEO]) @@ -135,26 +134,28 @@ def hook( transformed_modality.data = list(embeddings.values()) return transformed_modality - + class I3D(UnimodalRepresentation): def __init__(self, layer="avgpool", model_name="i3d", output_file=None): self.model_name = model_name parameters = self._get_parameters() - self.model = torch.hub.load("facebookresearch/pytorchvideo", "i3d_r50", pretrained=True).to(DEVICE) + self.model = torch.hub.load( + "facebookresearch/pytorchvideo", "i3d_r50", pretrained=True + ).to(DEVICE) super().__init__("I3D", ModalityType.TIMESERIES, parameters) - + self.output_file = output_file self.layer_name = layer self.model.eval() for param in self.model.parameters(): param.requires_grad = False - + def _get_parameters(self, high_level=True): parameters = {"model_name": [], "layer_name": []} for m in ["r3d", "s3d"]: parameters["model_name"].append(m) - + if high_level: parameters["layer_name"] = [ "conv1", @@ -168,37 +169,35 @@ def _get_parameters(self, high_level=True): for name, layer in self.model.named_modules(): parameters["layer_name"].append(name) return parameters - + def transform(self, modality): dataset = CustomDataset(modality.data, torch.float32, DEVICE) embeddings = {} - + features = None - + def hook(module, input, output): pooled = torch.nn.functional.adaptive_avg_pool3d(output, 1).squeeze() nonlocal features features = pooled.detach().cpu().numpy() - + handle = self.model.blocks[6].dropout.register_forward_hook(hook) - + for instance in dataset: video_id = instance["id"] frames = instance["data"].to(DEVICE) embeddings[video_id] = [] - - + batch = torch.transpose(frames, 1, 0) batch = batch.unsqueeze(0) _ = self.model(batch) - + embeddings[video_id] = features - + transformed_modality = TransformedModality( modality, self, self.output_modality_type ) - + transformed_modality.data = list(embeddings.values()) - - return transformed_modality + return transformed_modality diff --git a/src/main/python/systemds/scuro/utils/torch_dataset.py b/src/main/python/systemds/scuro/utils/torch_dataset.py index f93dcaca8a7..c04be0ec7b6 100644 --- a/src/main/python/systemds/scuro/utils/torch_dataset.py +++ b/src/main/python/systemds/scuro/utils/torch_dataset.py @@ -32,7 +32,7 @@ def __init__(self, data, data_type, device, size=None): self.size = size if size is None: self.size = (256, 224) - + self.tf = transforms.Compose( [ transforms.ToPILImage(), @@ -49,7 +49,9 @@ def __init__(self, data, data_type, device, size=None): def __getitem__(self, index) -> Dict[str, object]: data = self.data[index] output = torch.empty( - (len(data), 3, self.size[1], self.size[1]), dtype=self.data_type, device=self.device + (len(data), 3, self.size[1], self.size[1]), + dtype=self.data_type, + device=self.device, ) for i, d in enumerate(data): diff --git a/src/main/python/systemds/utils/helpers.py b/src/main/python/systemds/utils/helpers.py index e6c2e199207..887b3140ebd 100644 --- a/src/main/python/systemds/utils/helpers.py +++ b/src/main/python/systemds/utils/helpers.py @@ -26,6 +26,7 @@ import torch from systemds.utils.consts import MODULE_NAME + def create_params_string( unnamed_parameters: Iterable[str], named_parameters: Dict[str, str] ) -> str: diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index bc8854647df..fbb50ac180e 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -41,11 +41,11 @@ def __init__(self, indices, chunk_size, modality_type, data, data_type, metadata self.metadata = metadata self.test_data = data - + def reset(self): self._next_chunk = 0 self.data = [] - + def extract(self, file, indices): if isinstance(self.test_data, list): self.data = [self.test_data[i] for i in indices] @@ -178,8 +178,10 @@ def create_visual_modality(self, num_instances, num_frames=1, height=28, width=2 return ( np.random.randint( - 0, 256, (num_instances, num_frames, height, width) - # ).astype(np.float16).tolist(), + 0, + 256, + (num_instances, num_frames, height, width), + # ).astype(np.float16).tolist(), ).astype(np.float16), metadata, ) diff --git a/src/main/python/tests/scuro/test_fusion_orders.py b/src/main/python/tests/scuro/test_fusion_orders.py index 7bb086b9a37..eb01d18ffe4 100644 --- a/src/main/python/tests/scuro/test_fusion_orders.py +++ b/src/main/python/tests/scuro/test_fusion_orders.py @@ -41,56 +41,55 @@ def setUpClass(cls): cls.r_1 = cls.data_generator.create1DModality(40, 100, ModalityType.AUDIO) cls.r_2 = cls.data_generator.create1DModality(40, 100, ModalityType.TEXT) cls.r_3 = cls.data_generator.create1DModality(40, 100, ModalityType.TEXT) - + def test_fusion_order_avg(self): r_1_r_2 = self.r_1.combine(self.r_2, Average()) r_2_r_1 = self.r_2.combine(self.r_1, Average()) r_1_r_2_r_3 = r_1_r_2.combine(self.r_3, Average()) r_2_r_1_r_3 = r_2_r_1.combine(self.r_3, Average()) - + r1_r2_r3 = self.r_1.combine([self.r_2, self.r_3], Average()) - + self.assertTrue(np.array_equal(r_1_r_2.data, r_2_r_1.data)) self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data)) self.assertFalse(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data)) self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) - - + def test_fusion_order_concat(self): r_1_r_2 = self.r_1.combine(self.r_2, Concatenation()) r_2_r_1 = self.r_2.combine(self.r_1, Concatenation()) r_1_r_2_r_3 = r_1_r_2.combine(self.r_3, Concatenation()) r_2_r_1_r_3 = r_2_r_1.combine(self.r_3, Concatenation()) - + r1_r2_r3 = self.r_1.combine([self.r_2, self.r_3], Concatenation()) - + self.assertFalse(np.array_equal(r_1_r_2.data, r_2_r_1.data)) self.assertFalse(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data)) self.assertFalse(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data)) self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) - + def test_fusion_order_max(self): r_1_r_2 = self.r_1.combine(self.r_2, RowMax()) r_2_r_1 = self.r_2.combine(self.r_1, RowMax()) r_1_r_2_r_3 = r_1_r_2.combine(self.r_3, RowMax()) r_2_r_1_r_3 = r_2_r_1.combine(self.r_3, RowMax()) - + r1_r2_r3 = self.r_1.combine([self.r_2, self.r_3], RowMax()) - + self.assertTrue(np.array_equal(r_1_r_2.data, r_2_r_1.data)) self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data)) self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data)) self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) - + def test_fusion_order_hadamard(self): r_1_r_2 = self.r_1.combine(self.r_2, Hadamard()) r_2_r_1 = self.r_2.combine(self.r_1, Hadamard()) r_1_r_2_r_3 = r_1_r_2.combine(self.r_3, Hadamard()) r_2_r_1_r_3 = r_2_r_1.combine(self.r_3, Hadamard()) - + r1_r2_r3 = self.r_1.combine([self.r_2, self.r_3], Hadamard()) - + self.assertTrue(np.array_equal(r_1_r_2.data, r_2_r_1.data)) self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data)) self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data)) - self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) \ No newline at end of file + self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py index 7f0b35c311f..9e3a16ffcad 100644 --- a/src/main/python/tests/scuro/test_multimodal_join.py +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -53,9 +53,9 @@ def setUpClass(cls): cls.audio_data, cls.audio_md = ModalityRandomDataGenerator().create_audio_data( cls.num_instances, 32000 ) - - cls.video_data, cls.video_md = ModalityRandomDataGenerator().create_visual_modality( - cls.num_instances, 60 + + cls.video_data, cls.video_md = ( + ModalityRandomDataGenerator().create_visual_modality(cls.num_instances, 60) ) def test_video_audio_join(self): @@ -94,12 +94,22 @@ def _execute_av_join(self, l_chunk_size=None, r_chunk_size=None): def _prepare_data(self, l_chunk_size=None, r_chunk_size=None): audio = UnimodalModality( TestDataLoader( - self.indices, r_chunk_size, ModalityType.AUDIO, copy.deepcopy(self.audio_data), np.float32, copy.deepcopy(self.audio_md) + self.indices, + r_chunk_size, + ModalityType.AUDIO, + copy.deepcopy(self.audio_data), + np.float32, + copy.deepcopy(self.audio_md), ) ) video = UnimodalModality( TestDataLoader( - self.indices, l_chunk_size, ModalityType.VIDEO, copy.deepcopy(self.video_data), np.float32, copy.deepcopy(self.video_md) + self.indices, + l_chunk_size, + ModalityType.VIDEO, + copy.deepcopy(self.video_data), + np.float32, + copy.deepcopy(self.video_md), ) ) diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py index f2a42076fc8..dca58113afb 100644 --- a/src/main/python/tests/scuro/test_unimodal_representations.py +++ b/src/main/python/tests/scuro/test_unimodal_representations.py @@ -30,7 +30,6 @@ from systemds.scuro.representations.bert import Bert from systemds.scuro.representations.mel_spectrogram import MelSpectrogram from systemds.scuro.representations.mfcc import MFCC -from systemds.scuro.representations.x3d import I3D from systemds.scuro.representations.resnet import ResNet from tests.scuro.data_generator import setup_data @@ -78,10 +77,11 @@ def test_audio_representations(self): assert len(r.data) == self.num_instances def test_video_representations(self): - video_representations = [I3D()] # Todo: add other video representations + video_representations = [ResNet()] # Todo: add other video representations video_data_loader = VideoLoader( self.data_generator.get_modality_path(ModalityType.VIDEO), - self.data_generator.indices, fps=5 + self.data_generator.indices, + fps=5, ) video = UnimodalModality(video_data_loader) for representation in video_representations: From 2dd9d2c5a12136b4d8ff13d53c079042369bd8c1 Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Mon, 28 Jul 2025 11:35:10 +0200 Subject: [PATCH 23/23] remove param --- src/main/python/tests/scuro/test_unimodal_representations.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py index dca58113afb..2f2e64efd7c 100644 --- a/src/main/python/tests/scuro/test_unimodal_representations.py +++ b/src/main/python/tests/scuro/test_unimodal_representations.py @@ -81,7 +81,6 @@ def test_video_representations(self): video_data_loader = VideoLoader( self.data_generator.get_modality_path(ModalityType.VIDEO), self.data_generator.indices, - fps=5, ) video = UnimodalModality(video_data_loader) for representation in video_representations: