Skip to content
Merged
695 changes: 695 additions & 0 deletions minerva/data/data_modules/har.py

Large diffs are not rendered by default.

132 changes: 132 additions & 0 deletions minerva/data/data_modules/har_rodrigues_24.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from typing import List, Optional, Union

from lightning import LightningDataModule
from torch.utils.data import DataLoader

from minerva.data.datasets.har_rodrigues_24 import HARDatasetCPC
from minerva.utils.typing import PathLike


# Lightning data module that builds the CPC HAR datasets and their dataloaders
class HARDataModuleCPC(LightningDataModule):
    def __init__(
        self,
        data_path: Union[PathLike, List[PathLike]],
        input_size: int = 6,
        window: int = 60,
        overlap: int = 30,
        batch_size: int = 64,
        use_train_as_val: bool = False,
        use_val_with_train: bool = True,
        columns: Optional[List[str]] = None,
        num_workers: int = 8,
        drop_last: bool = True,
        label: Optional[str] = "standard activity code",
        transpose_data: bool = True,
    ):
        """Lightning data module for HAR (Human Activity Recognition) with CPC.

        The constructor eagerly builds one ``HARDatasetCPC`` per partition
        (``"train"``, ``"val"`` and ``"test"``); the ``*_dataloader`` hooks
        wrap those datasets in ``DataLoader`` objects.

        Parameters
        ----------
        data_path : Union[PathLike, List[PathLike]]
            The root directory where the dataset is stored. If a list is
            given, the datasets will be concatenated, in their respective
            order, to each partition (train, val, test).
        input_size : int, optional
            The number of input features (default is 6).
        window : int, optional
            The size of the sliding window (default is 60).
        overlap : int, optional
            The overlap size for the sliding window (default is 30).
        batch_size : int, optional
            The batch size for the dataloaders (default is 64).
        use_train_as_val : bool, optional
            Forwarded unchanged to every ``HARDatasetCPC``. Presumably makes
            the validation partition reuse the training data -- confirm
            against the dataset class (default is False).
        use_val_with_train : bool, optional
            Whether to use the training set together with the validation
            set (default is True).
        columns : Optional[List[str]], optional
            Forwarded unchanged to every ``HARDatasetCPC``. Presumably the
            data columns to read -- confirm against the dataset class
            (default is None).
        num_workers : int, optional
            Number of worker processes given to each ``DataLoader``
            (default is 8).
        drop_last : bool, optional
            Passed as ``drop_last`` to the train, validation and test
            ``DataLoader`` alike (default is True).
            NOTE(review): with the default, the last incomplete batch of the
            validation and test sets is discarded too -- confirm intended.
        label : Optional[str]
            The column to be used as the label. If None, no labels will be
            used. If 'return_index_as_label', the index of the data will be
            used as the label.
        transpose_data : bool
            If True, the data will be returned as a vector of shape (C, T),
            else the data will be returned as a vector of shape (T, C).
        """
        super().__init__()
        self.data_path = data_path
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.drop_last = drop_last
        self.label = label
        self.transpose_data = transpose_data

        # The three partitions differ only in ``phase``; every other argument
        # is shared, so it is spelled out once here.
        windowing_args = (data_path, input_size, window, overlap)
        shared_options = dict(
            use_train_as_val=use_train_as_val,
            use_val_with_train=use_val_with_train,
            columns=columns,
            label=label,
            transpose_data=transpose_data,
        )

        self.train_dataset = HARDatasetCPC(
            *windowing_args, phase="train", **shared_options
        )
        self.val_dataset = HARDatasetCPC(
            *windowing_args, phase="val", **shared_options
        )
        self.test_dataset = HARDatasetCPC(
            *windowing_args, phase="test", **shared_options
        )

    def _build_loader(self, dataset, shuffle: bool) -> DataLoader:
        """Wrap ``dataset`` in a ``DataLoader`` using this module's settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            drop_last=self.drop_last,
            num_workers=self.num_workers,
        )

    def train_dataloader(self):
        """Return the shuffled dataloader over the training dataset."""
        return self._build_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the unshuffled dataloader over the validation dataset."""
        return self._build_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Return the unshuffled dataloader over the test dataset."""
        return self._build_loader(self.test_dataset, shuffle=False)

    def __repr__(self):
        """Return a short summary with the batch size and the data path(s)."""
        summary = f"HARDataModuleCPC(batch_size={self.batch_size}, datasets={self.data_path})"
        return summary
165 changes: 165 additions & 0 deletions minerva/data/data_modules/har_xu_23.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import os
from pathlib import Path
from typing import List

import lightning as L
import numpy as np
from torch.utils.data import DataLoader

from minerva.data.datasets.har_xu_23 import HarDataset, TNCDataset
from minerva.utils.typing import PathLike


class HarDataModule(L.LightningDataModule):
    def __init__(
        self,
        processed_data_dir: PathLike,
        batch_size: int = 16,
        mc_sample_size: int = 5,
        epsilon: int = 3,
        adf: bool = True,
        window_size: int = 128,
        use_train_as_val: bool = False,
        num_workers: int = 8,
        use_val_with_train: bool = False,
    ):
        """
        This DataModule handles the loading and preparation of data for
        training, validation, and testing. The data is expected to be stored
        in 3 numpy (.npy) files named `train_data.npy`, `val_data.npy`, and
        `test_data.npy`. They are NumPy arrays storing the concatenated
        accelerometer and gyroscope data.

        These numpy arrays (files) must have the shape (n_samples,
        n_timesteps, n_channels) and are produced at a specific window size
        by another data processing script available at
        https://github.com/maxxu05/rebar/blob/main/data/process/har_processdata.py

        The original files have the exact shapes:
        - `train_data.npy`: `(41, 15038, 6)`
        - `val_data.npy`: `(9, 15038, 6)`
        - `test_data.npy`: `(9, 15038, 6)`

        That script processes the raw accelerometer and gyroscope data of
        each participant, filtering out sequences shorter than a set
        threshold. The data is then split into training, validation, and
        test sets, which are saved as NumPy arrays along with the
        corresponding participant names.

        For the dataloaders, the loaded arrays are transposed into the shape
        (n_samples, n_channels, n_timesteps) and passed to the TNCDataset.

        All three files are loaded eagerly, in the constructor.

        Parameters
        ----------
        processed_data_dir: PathLike
            Path to the directory where the processed .npy files are stored.
            This directory must contain 3 files, named train_data.npy,
            val_data.npy, and test_data.npy.
        batch_size : int, optional
            The batch size to use for the DataLoader. Defaults to 16.
        mc_sample_size : int, optional
            This value determines how many neighboring and non-neighboring
            windows are used per data sample. Defaults to 5.
        epsilon : int, optional
            This parameter controls the "spread" of neighboring windows.
            Defaults to 3.
        adf : bool, optional
            Flag indicating whether to use ADF (Augmented Dickey-Fuller)
            testing for finding neighbors. Defaults to True.
        window_size : int, optional
            The size of the windows to be used for each sample in the TNC
            dataset. Defaults to 128.
        use_train_as_val : bool, optional
            If True, the validation data is replaced by the training data.
            Takes precedence over `use_val_with_train`: when both are True,
            `use_val_with_train` is ignored. Defaults to False.
        num_workers : int, optional
            Number of worker processes given to each DataLoader.
            Defaults to 8.
        use_val_with_train : bool, optional
            If True (and `use_train_as_val` is False), the validation and
            train sets will be concatenated in order to create a larger
            train set; the validation set itself is left unchanged.
            Defaults to False.
        """
        super().__init__()
        self.processed_data_dir = Path(processed_data_dir)
        self.batch_size = batch_size
        self.mc_sample_size = mc_sample_size
        self.epsilon = epsilon
        self.adf = adf
        self.window_size = window_size
        self.num_workers = num_workers
        # Both partition flags are kept so the configuration can be inspected.
        self.use_train_as_val = use_train_as_val
        self.use_val_with_train = use_val_with_train

        self.har_train = np.load(self.processed_data_dir / "train_data.npy")
        self.har_val = np.load(self.processed_data_dir / "val_data.npy")
        self.har_test = np.load(self.processed_data_dir / "test_data.npy")

        # The two flags are mutually exclusive here: use_train_as_val wins.
        if use_train_as_val:
            self.har_val = self.har_train
        elif use_val_with_train:
            self.har_train = np.concatenate([self.har_train, self.har_val], axis=0)

    def _make_dataloader(self, data: np.ndarray, shuffle: bool) -> DataLoader:
        """
        Builds a DataLoader over a TNCDataset created from `data`.

        Parameters
        ----------
        data : np.ndarray
            Array laid out as (n_samples, n_timesteps, n_channels). Its last
            two axes are swapped before it is handed to the TNCDataset.
        shuffle : bool
            Whether the DataLoader shuffles the samples.

        Returns
        -------
        DataLoader
            DataLoader configured with this module's batch size and number
            of workers.
        """
        dataset = TNCDataset(
            np.transpose(data, (0, 2, 1)),
            self.mc_sample_size,
            self.window_size,
            self.epsilon,
            self.adf,
        )
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
        )

    def train_dataloader(self):
        """
        Returns the DataLoader for the training dataset.

        Returns
        -------
        DataLoader
            Shuffled DataLoader for the training dataset.
        """
        return self._make_dataloader(self.har_train, shuffle=True)

    def val_dataloader(self):
        """
        Returns the DataLoader for the validation dataset.

        Returns
        -------
        DataLoader
            Unshuffled DataLoader for the validation dataset.
        """
        return self._make_dataloader(self.har_val, shuffle=False)

    def test_dataloader(self):
        """
        Returns the DataLoader for the test dataset.

        Returns
        -------
        DataLoader
            Unshuffled DataLoader for the test dataset.
        """
        return self._make_dataloader(self.har_test, shuffle=False)
Loading
Loading