-
Notifications
You must be signed in to change notification settings - Fork 9
105 har missing modules #106
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
otavioon
merged 10 commits into
discovery-unicamp:main
from
gustavo-luz:105-har-missing-modules
May 19, 2026
Merged
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
43bac96
feat(data): add SeriesDataset base class for time series
gustavo-luz 0af6d91
feat(data): add HAR datasets for Rodrigues 2024 and Xu 2023
gustavo-luz c46ed1e
feat(data): add HAR DataModules (generic + Rodrigues 24, Xu 23)
gustavo-luz bbc624a
feat(ssl): add DIET and linear head
gustavo-luz 0fbf8c0
feat(nets): extend LFRHARArchitecture to be compatible with TNC(permute)
gustavo-luz 72cb741
test: cover SeriesDataset, HAR datasets/DataModules, DIET, DIETLinear
gustavo-luz 98148d8
style: apply black + isort to PR files
gustavo-luz 67c4f2b
Update documentation for use_val_with_train parameter
otavioon 5d8dd1a
Remove HarDataModule_Downstream class
otavioon 86654ff
Remove trailing blank line in har_xu_23.py
otavioon File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,132 @@ | ||
| from typing import List, Optional, Union | ||
|
|
||
| from lightning import LightningDataModule | ||
| from torch.utils.data import DataLoader | ||
|
|
||
| from minerva.data.datasets.har_rodrigues_24 import HARDatasetCPC | ||
| from minerva.utils.typing import PathLike | ||
|
|
||
|
|
||
| # Defining the data loader for the implementation | ||
| class HARDataModuleCPC(LightningDataModule): | ||
| def __init__( | ||
| self, | ||
| data_path: Union[PathLike, List[PathLike]], | ||
| input_size: int = 6, | ||
| window: int = 60, | ||
| overlap: int = 30, | ||
| batch_size: int = 64, | ||
| use_train_as_val: bool = False, | ||
| use_val_with_train: bool = True, | ||
| columns: Optional[List[str]] = None, | ||
| num_workers: int = 8, | ||
| drop_last: bool = True, | ||
| label: Optional[str] = "standard activity code", | ||
| transpose_data: bool = True, | ||
| ): | ||
| """Data module for Human Activity Recognition (HAR) using CPC. | ||
|
|
||
| This class handles the creation of training, validation, and test | ||
| dataloaders for the HAR dataset. It uses the HARDatasetCPC class to | ||
| load the data. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| data_path : Union[PathLike, List[PathLike]] | ||
| The root directory where the dataset is stored. If a list is | ||
| given, the datasets will be concatenated, in their respective | ||
| order, for each partition (train, val, test). | ||
| input_size : int, optional | ||
| The number of input features (default is 6). | ||
| window : int, optional | ||
| The size of the sliding window (default is 60). | ||
| overlap : int, optional | ||
| The overlap size for the sliding window (default is 30). | ||
| batch_size : int, optional | ||
| The batch size for the dataloaders (default is 64). | ||
| use_val_with_train : bool, optional | ||
| Whether to use the training set together with the validation set | ||
| (default is True). | ||
| label : Optional[str] | ||
| The column to be used as the label. If None, no labels will be | ||
| used. If 'return_index_as_label', the index of the data will be | ||
| used as the label. | ||
| transpose_data : bool | ||
| If True, the data will be returned as a vector of shape (C, T), | ||
| else the data will be returned as a vector of shape (T, C). | ||
| """ | ||
| super().__init__() | ||
| self.data_path = data_path | ||
| self.batch_size = batch_size | ||
| self.num_workers = num_workers | ||
| self.drop_last = drop_last | ||
| self.label = label | ||
| self.transpose_data = transpose_data | ||
|
|
||
| self.train_dataset = HARDatasetCPC( | ||
| data_path, | ||
| input_size, | ||
| window, | ||
| overlap, | ||
| phase="train", | ||
| use_train_as_val=use_train_as_val, | ||
| use_val_with_train=use_val_with_train, | ||
| columns=columns, | ||
| label=label, | ||
| transpose_data=transpose_data, | ||
| ) | ||
| self.val_dataset = HARDatasetCPC( | ||
| data_path, | ||
| input_size, | ||
| window, | ||
| overlap, | ||
| phase="val", | ||
| use_train_as_val=use_train_as_val, | ||
| use_val_with_train=use_val_with_train, | ||
| columns=columns, | ||
| label=label, | ||
| transpose_data=transpose_data, | ||
| ) | ||
| self.test_dataset = HARDatasetCPC( | ||
| data_path, | ||
| input_size, | ||
| window, | ||
| overlap, | ||
| phase="test", | ||
| use_train_as_val=use_train_as_val, | ||
| use_val_with_train=use_val_with_train, | ||
| columns=columns, | ||
| label=label, | ||
| transpose_data=transpose_data, | ||
| ) | ||
|
otavioon marked this conversation as resolved.
|
||
|
|
||
| def train_dataloader(self): | ||
| return DataLoader( | ||
| self.train_dataset, | ||
| batch_size=self.batch_size, | ||
| shuffle=True, | ||
| drop_last=self.drop_last, | ||
| num_workers=self.num_workers, | ||
| ) | ||
|
|
||
| def val_dataloader(self): | ||
| return DataLoader( | ||
| self.val_dataset, | ||
| batch_size=self.batch_size, | ||
| shuffle=False, | ||
| drop_last=self.drop_last, | ||
| num_workers=self.num_workers, | ||
| ) | ||
|
|
||
| def test_dataloader(self): | ||
| return DataLoader( | ||
| self.test_dataset, | ||
| batch_size=self.batch_size, | ||
| shuffle=False, | ||
| drop_last=self.drop_last, | ||
| num_workers=self.num_workers, | ||
| ) | ||
|
|
||
| def __repr__(self): | ||
| return ( | ||
| f"HARDataModuleCPC(batch_size={self.batch_size}, datasets={self.data_path})" | ||
| ) | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,165 @@ | ||
| import os | ||
| from pathlib import Path | ||
| from typing import List | ||
|
|
||
| import lightning as L | ||
| import numpy as np | ||
| from torch.utils.data import DataLoader | ||
|
|
||
| from minerva.data.datasets.har_xu_23 import HarDataset, TNCDataset | ||
| from minerva.utils.typing import PathLike | ||
|
|
||
|
|
||
| class HarDataModule(L.LightningDataModule): | ||
| def __init__( | ||
| self, | ||
| processed_data_dir: PathLike, | ||
| batch_size: int = 16, | ||
| mc_sample_size: int = 5, | ||
| epsilon: int = 3, | ||
| adf: bool = True, | ||
| window_size: int = 128, | ||
| use_train_as_val: bool = False, | ||
| num_workers: int = 8, | ||
| use_val_with_train: bool = False, | ||
| ): | ||
| """ | ||
| This DataModule handles the loading and preparation of data for | ||
| training, validation, and testing. The data is expected to be stored | ||
| in 3 numpy (.npy) files named `train_data.npy`, `val_data.npy`, and | ||
| `test_data.npy`. They are NumPy arrays storing the concatenated | ||
| accelerometer and gyroscope data. | ||
|
|
||
| These NumPy arrays (files) must have the shape (n_samples, | ||
| n_timesteps, n_channels) and are produced at a specific window size by | ||
| another data processing script available at | ||
| https://github.com/maxxu05/rebar/blob/main/data/process/har_processdata.py | ||
|
|
||
| The original files have exact shape of: | ||
| - `train_data.npy`: `(41, 15038, 6)` | ||
| - `val_data.npy`: `(9, 15038, 6)` | ||
| - `test_data.npy`: `(9, 15038, 6)` | ||
|
|
||
| The Python script performs a series of tasks to facilitate the | ||
| preprocessing and organization of the dataset. The raw | ||
| accelerometer and gyroscope data for each participant are processed, | ||
| filtering out sequences shorter than a set threshold. | ||
| The data is then split into training, validation, and test sets, which | ||
| are saved as NumPy arrays along with corresponding participant names. | ||
|
|
||
| For the dataloader, the loaded arrays are transposed into the shape | ||
| (n_samples, n_channels, n_timesteps) and passed to the TNCDataset. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| processed_data_dir: PathLike | ||
| Path to the directory where the processed .npy files are stored. | ||
| This directory must contain 3 files, named train_data.npy, | ||
| val_data.npy, and test_data.npy. | ||
| batch_size : int, optional | ||
| The batch size to use for the DataLoader. Defaults to 16. | ||
| mc_sample_size : int, optional | ||
| This value determines how many neighboring and non-neighboring | ||
| windows are used per data sample. Defaults to 5. | ||
| epsilon : int, optional | ||
| This parameter controls the "spread" of neighboring windows. | ||
| Defaults to 3. | ||
| adf : bool, optional | ||
| Flag indicating whether to use ADF (Augmented Dickey-Fuller) | ||
| testing for finding neighbors. Defaults to True. | ||
| window_size : int, optional | ||
| The size of the windows to be used for each sample in the TNC | ||
| dataset. Defaults to 128. | ||
| use_val_with_train : bool, optional | ||
| If True, the validation and train sets will be concatenated in | ||
| order to create a larger train set. Ignored when | ||
| `use_train_as_val` is True. Defaults to False. | ||
| """ | ||
|
otavioon marked this conversation as resolved.
|
||
| super().__init__() | ||
| self.processed_data_dir = Path(processed_data_dir) | ||
| self.batch_size = batch_size | ||
| self.mc_sample_size = mc_sample_size | ||
| self.epsilon = epsilon | ||
| self.adf = adf | ||
| self.window_size = window_size | ||
| self.num_workers = num_workers | ||
| self.use_val_with_train = use_val_with_train | ||
|
|
||
| self.har_train = np.load(self.processed_data_dir / "train_data.npy") | ||
| self.har_val = np.load(self.processed_data_dir / "val_data.npy") | ||
| self.har_test = np.load(self.processed_data_dir / "test_data.npy") | ||
|
|
||
| # Handle use_val_with_train and use_train_as_val | ||
| if use_train_as_val: | ||
| self.har_val = self.har_train | ||
| elif use_val_with_train: | ||
| self.har_train = np.concatenate([self.har_train, self.har_val], axis=0) | ||
|
|
||
| # Print dataset sizes after concatenation | ||
| # print(f"\nFinal Training Data Size: {self.har_train.shape}") | ||
| # print(f"Final Validation Data Size: {self.har_val.shape}") | ||
| # print(f"Final Test Data Size: {self.har_test.shape}") | ||
|
|
||
| def train_dataloader(self): | ||
| """ | ||
| Returns the DataLoader for the training dataset. | ||
|
|
||
| Returns | ||
| ------- | ||
| DataLoader | ||
| DataLoader for the training dataset. | ||
| """ | ||
| return DataLoader( | ||
| TNCDataset( | ||
| np.transpose(self.har_train, (0, 2, 1)), | ||
| self.mc_sample_size, | ||
| self.window_size, | ||
| self.epsilon, | ||
| self.adf, | ||
| ), | ||
| batch_size=self.batch_size, | ||
| shuffle=True, | ||
| num_workers=self.num_workers, | ||
| ) | ||
|
|
||
| def val_dataloader(self): | ||
| """ | ||
| Returns the DataLoader for the validation dataset. | ||
|
|
||
| Returns | ||
| ------- | ||
| DataLoader | ||
| DataLoader for the validation dataset. | ||
| """ | ||
| return DataLoader( | ||
| TNCDataset( | ||
| np.transpose(self.har_val, (0, 2, 1)), | ||
| self.mc_sample_size, | ||
| self.window_size, | ||
| self.epsilon, | ||
| self.adf, | ||
| ), | ||
| batch_size=self.batch_size, | ||
| shuffle=False, | ||
| num_workers=self.num_workers, | ||
| ) | ||
|
|
||
| def test_dataloader(self): | ||
| """ | ||
| Returns the DataLoader for the test dataset. | ||
|
|
||
| Returns | ||
| ------- | ||
| DataLoader | ||
| DataLoader for the test dataset. | ||
| """ | ||
| return DataLoader( | ||
| TNCDataset( | ||
| np.transpose(self.har_test, (0, 2, 1)), | ||
| self.mc_sample_size, | ||
| self.window_size, | ||
| self.epsilon, | ||
| self.adf, | ||
| ), | ||
| batch_size=self.batch_size, | ||
| shuffle=False, | ||
| num_workers=self.num_workers, | ||
| ) | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.