diff --git a/.github/unittest/linux/scripts/environment.yml b/.github/unittest/linux/scripts/environment.yml deleted file mode 100644 index 3283867e9bc..00000000000 --- a/.github/unittest/linux/scripts/environment.yml +++ /dev/null @@ -1,37 +0,0 @@ -channels: - - pytorch - - defaults -dependencies: - - pip - - protobuf - - pip: - - hypothesis - - future - - cloudpickle - - pygame - - moviepy<2.0.0 - - tqdm - - pytest - - pytest-cov - - pytest-mock - - pytest-instafail - - pytest-rerunfailures - - pytest-timeout - - pytest-asyncio - - expecttest - - pybind11[global] - - pyyaml - - scipy - - hydra-core - - tensorboard - - imageio==2.26.0 - - wandb - - dm_control - - mujoco<3.3.6 - - mlflow - - av - - coverage - - ray - - transformers - - ninja - - timm diff --git a/.github/unittest/linux/scripts/post_process.sh b/.github/unittest/linux/scripts/post_process.sh index e97bf2a7b1b..df82332cd84 100755 --- a/.github/unittest/linux/scripts/post_process.sh +++ b/.github/unittest/linux/scripts/post_process.sh @@ -1,6 +1,3 @@ #!/usr/bin/env bash set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env diff --git a/.github/unittest/linux/scripts/run_all.sh b/.github/unittest/linux/scripts/run_all.sh index 94bb8a98e09..1c152fd4ed3 100755 --- a/.github/unittest/linux/scripts/run_all.sh +++ b/.github/unittest/linux/scripts/run_all.sh @@ -6,30 +6,33 @@ set -v # =============================================================================== # # ================================ Init ========================================= # - if [[ $OSTYPE != 'darwin'* ]]; then - apt-get update && apt-get upgrade -y - apt-get install -y vim git wget cmake + # Prevent interactive prompts (notably tzdata) in CI. + export DEBIAN_FRONTEND=noninteractive + export TZ="${TZ:-Etc/UTC}" + ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime || true + echo "${TZ}" > /etc/timezone || true + + apt-get update + apt-get install -y --no-install-recommends tzdata + dpkg-reconfigure -f noninteractive tzdata || true - # Enable universe repository - # apt-get install -y software-properties-common - # add-apt-repository universe - # apt-get update + apt-get upgrade -y + apt-get install -y vim git wget cmake curl python3-dev - # apt-get install -y libsdl2-dev libsdl2-2.0-0 + # SDL2 and freetype needed for building pygame from source (Python 3.14+) + apt-get install -y libsdl2-dev libsdl2-2.0-0 libsdl2-mixer-dev libsdl2-image-dev libsdl2-ttf-dev + apt-get install -y libfreetype6-dev pkg-config - apt-get install -y libglfw3 libgl1-mesa-glx libosmesa6 libglew-dev - apt-get install -y libglvnd0 libgl1 libglx0 libegl1 libgles2 xvfb + apt-get install -y libglfw3 libosmesa6 libglew-dev + apt-get install -y libglvnd0 libgl1 libglx0 libglx-mesa0 libegl1 libgles2 xvfb if [ "${CU_VERSION:-}" == cpu ] ; then - # solves version `GLIBCXX_3.4.29' not found for tensorboard -# apt-get install -y gcc-4.9 apt-get upgrade -y libstdc++6 apt-get dist-upgrade -y else apt-get install -y g++ gcc fi - fi this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" @@ -45,38 +48,26 @@ fi # Avoid error: "fatal: unsafe repository" git config --global --add safe.directory '*' root_dir="$(git rev-parse --show-toplevel)" -conda_dir="${root_dir}/conda" -env_dir="${root_dir}/env" -lib_dir="${env_dir}/lib" +env_dir="${root_dir}/venv" cd "${root_dir}" -case "$(uname -s)" in - Darwin*) os=MacOSX;; - *) os=Linux -esac +# Install uv +curl -LsSf https://astral.sh/uv/install.sh | sh +export PATH="$HOME/.local/bin:$PATH" -# 1. 
Install conda at ./conda -if [ ! -d "${conda_dir}" ]; then - printf "* Installing conda\n" - wget -O miniconda.sh "http://repo.continuum.io/miniconda/Miniconda3-latest-${os}-x86_64.sh" - bash ./miniconda.sh -b -f -p "${conda_dir}" -fi -eval "$(${conda_dir}/bin/conda shell.bash hook)" +# Create venv with uv +printf "* Creating venv with Python ${PYTHON_VERSION}\n" +uv venv --python "${PYTHON_VERSION}" "${env_dir}" +source "${env_dir}/bin/activate" +uv_pip_install() { + uv pip install --no-progress --python "${env_dir}/bin/python" "$@" +} -# 2. Create test environment at ./env -printf "python: ${PYTHON_VERSION}\n" -if [ ! -d "${env_dir}" ]; then - printf "* Creating a test environment\n" - conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" -fi -conda activate "${env_dir}" - -# 3. Install Conda dependencies -printf "* Installing dependencies (except PyTorch)\n" -echo " - python=${PYTHON_VERSION}" >> "${this_dir}/environment.yml" -cat "${this_dir}/environment.yml" +# Verify CPython +python -c "import sys; assert sys.implementation.name == 'cpython', f'Expected CPython, got {sys.implementation.name}'" +# Set environment variables if [ "${CU_VERSION:-}" == cpu ] ; then export MUJOCO_GL=glfw else @@ -84,37 +75,98 @@ else fi export SDL_VIDEODRIVER=dummy +export PYOPENGL_PLATFORM=$MUJOCO_GL +export DISPLAY=:99 +export LAZY_LEGACY_OP=False +export RL_LOGGING_LEVEL=INFO +export TOKENIZERS_PARALLELISM=true +export MAX_IDLE_COUNT=1000 +export MKL_THREADING_LAYER=GNU +export CKPT_BACKEND=torch +export BATCHED_PIPE_TIMEOUT=60 -# legacy from bash scripts: remove? -conda env config vars set \ - MAX_IDLE_COUNT=1000 \ - MUJOCO_GL=$MUJOCO_GL PYOPENGL_PLATFORM=$MUJOCO_GL DISPLAY=:99 SDL_VIDEODRIVER=dummy LAZY_LEGACY_OP=False RL_LOGGING_LEVEL=INFO TOKENIZERS_PARALLELISM=true - -pip3 install pip --upgrade -pip install virtualenv +# ==================================================================================== # +# ================================ Install dependencies ============================== # + +printf "* Installing dependencies\n" + +# Install base dependencies +uv_pip_install \ + hypothesis \ + future \ + cloudpickle \ + pygame \ + "moviepy<2.0.0" \ + tqdm \ + pytest \ + pytest-cov \ + pytest-mock \ + pytest-instafail \ + pytest-rerunfailures \ + pytest-timeout \ + pytest-forked \ + pytest-asyncio \ + expecttest \ + "pybind11[global]>=2.13" \ + pyyaml \ + scipy \ + hydra-core \ + tensorboard \ + "imageio==2.26.0" \ + "huggingface-hub>=0.34.0,<1.0" \ + wandb \ + mlflow \ + av \ + coverage \ + transformers \ + ninja \ + timm + +# Install dm_control for Python < 3.13 +# labmaze (dm_control dependency) doesn't have Python 3.13+ wheels +if [[ "$PYTHON_VERSION" != "3.13" && "$PYTHON_VERSION" != "3.14" ]]; then + echo "installing dm_control" + uv_pip_install dm_control +fi -conda env update --file "${this_dir}/environment.yml" --prune +# Install ray for Python < 3.14 (ray doesn't support Python 3.14 yet) +if [[ "$PYTHON_VERSION" != "3.14" ]]; then + echo "installing ray" + uv_pip_install ray +fi -# Reset conda env variables -conda deactivate -conda activate "${env_dir}" +# Install mujoco for Python < 3.14 (mujoco doesn't have Python 3.14 wheels yet) +if [[ "$PYTHON_VERSION" != "3.14" ]]; then + echo "installing mujoco" + uv_pip_install "mujoco>=3.3.7" +fi +# Install gymnasium echo "installing gymnasium" -if [[ "$PYTHON_VERSION" == "3.12" ]]; then - pip3 install ale-py - pip3 install sympy - pip3 install "gymnasium[mujoco]>=1.1" mo-gymnasium[mujoco] +if [[ "$PYTHON_VERSION" == "3.14" ]]; then 
+ # Python 3.14: no mujoco wheels available, ale_py also failing + uv_pip_install "gymnasium>=1.1" +elif [[ "$PYTHON_VERSION" == "3.12" ]]; then + uv_pip_install ale-py sympy + uv_pip_install "gymnasium[mujoco]>=1.1" "mo-gymnasium[mujoco]" else - pip3 install "gymnasium[atari,mujoco]>=1.1" mo-gymnasium[mujoco] + uv_pip_install "gymnasium[atari,mujoco]>=1.1" "mo-gymnasium[mujoco]" fi -# sanity check: remove? -python -c """ +# sanity check +if [[ "$PYTHON_VERSION" != "3.13" && "$PYTHON_VERSION" != "3.14" ]]; then + python -c " import dm_control from dm_control import composer from tensorboard import * from google.protobuf import descriptor as _descriptor -""" +" +else + python -c " +from tensorboard import * +from google.protobuf import descriptor as _descriptor +" +fi # ============================================================================================ # # ================================ PyTorch & TorchRL ========================================= # @@ -122,7 +174,6 @@ from google.protobuf import descriptor as _descriptor unset PYTORCH_VERSION if [ "${CU_VERSION:-}" == cpu ] ; then - version="cpu" echo "Using cpu build" else if [[ ${#CU_VERSION} -eq 4 ]]; then @@ -131,7 +182,6 @@ else CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}" fi echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION ($CU_VERSION)" - version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")" fi # submodules @@ -140,15 +190,15 @@ git submodule sync && git submodule update --init --recursive printf "Installing PyTorch with %s\n" "${CU_VERSION}" if [[ "$TORCH_VERSION" == "nightly" ]]; then if [ "${CU_VERSION:-}" == cpu ] ; then - pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu -U + uv_pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu else - pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/$CU_VERSION -U + uv_pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/$CU_VERSION fi elif [[ "$TORCH_VERSION" == "stable" ]]; then - if [ "${CU_VERSION:-}" == cpu ] ; then - pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu -U + if [ "${CU_VERSION:-}" == cpu ] ; then + uv_pip_install torch torchvision --index-url https://download.pytorch.org/whl/cpu else - pip3 install torch torchvision --index-url https://download.pytorch.org/whl/$CU_VERSION -U + uv_pip_install torch torchvision --index-url https://download.pytorch.org/whl/$CU_VERSION fi else printf "Failed to install pytorch" @@ -158,73 +208,83 @@ fi # smoke test python -c "import functorch" -## install snapshot -#if [[ "$TORCH_VERSION" == "nightly" ]]; then -# pip3 install git+https://github.com/pytorch/torchsnapshot -#else -# pip3 install torchsnapshot -#fi +# Help CMake find pybind11 when building tensordict from source. +# pybind11 ships a CMake package; its location can be obtained via `python -m pybind11 --cmakedir`. +pybind11_DIR="$(python -m pybind11 --cmakedir)" +export pybind11_DIR # install tensordict if [[ "$RELEASE" == 0 ]]; then - pip3 install git+https://github.com/pytorch/tensordict.git + uv_pip_install --no-build-isolation git+https://github.com/pytorch/tensordict.git else - pip3 install tensordict + uv_pip_install tensordict fi printf "* Installing torchrl\n" -python -m pip install -e . --no-build-isolation - +uv_pip_install -e . 
--no-build-isolation if [ "${CU_VERSION:-}" != cpu ] ; then printf "* Installing VC1\n" - python -c """ -from torchrl.envs.transforms.vc1 import VC1Transform -VC1Transform.install_vc_models(auto_exit=True) -""" + # Install vc_models directly via uv. + # VC1Transform.install_vc_models() uses `setup.py develop` which expects `pip` + # to be present in the environment, but uv-created venvs do not necessarily + # ship with pip. + uv_pip_install "git+https://github.com/facebookresearch/eai-vc.git#subdirectory=vc_models" printf "* Upgrading timm\n" - pip3 install --upgrade "timm>=0.9.0" + uv_pip_install --upgrade "timm>=0.9.0" - python -c """ + python -c " import vc_models from vc_models.models.vit import model_utils print(model_utils) -""" +" fi # ==================================================================================== # # ================================ Run tests ========================================= # - export PYTORCH_TEST_WITH_SLOW='1' python -m torch.utils.collect_env -## Avoid error: "fatal: unsafe repository" -#git config --global --add safe.directory '*' -#root_dir="$(git rev-parse --show-toplevel)" - -# solves ImportError: /lib64/libstdc++.so.6: version `GLIBCXX_3.4.21' not found -#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$lib_dir -export MKL_THREADING_LAYER=GNU -export CKPT_BACKEND=torch -export MAX_IDLE_COUNT=100 -export BATCHED_PIPE_TIMEOUT=60 Xvfb :99 -screen 0 1024x768x24 & pytest test/smoke_test.py -v --durations 200 pytest test/smoke_test_deps.py -v --durations 200 -k 'test_gym or test_dm_control_pixels or test_dm_control or test_tb' + +# Track if any tests fail +EXIT_STATUS=0 + +python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_env.py::TestNonTensorEnv::test_parallel \ + --instafail --durations 200 -vvv \ + --capture no \ + --timeout=120 --mp_fork_if_no_cuda + +# Run distributed tests first (GPU only) to surface errors early +if [ "${CU_VERSION:-}" != cpu ] ; then + python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py \ + --instafail --durations 200 -vv --capture no \ + --timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$? +fi + +# Run remaining tests (always run even if distributed tests failed) if [ "${CU_VERSION:-}" != cpu ] ; then python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \ --instafail --durations 200 -vv --capture no --ignore test/test_rlhf.py \ + --ignore test/test_distributed.py \ --ignore test/llm \ - --timeout=120 --mp_fork_if_no_cuda + --timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$? else python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \ --instafail --durations 200 -vv --capture no --ignore test/test_rlhf.py \ --ignore test/test_distributed.py \ --ignore test/llm \ - --timeout=120 --mp_fork_if_no_cuda + --timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$? 
+fi + +# Fail the workflow if any tests failed +if [ $EXIT_STATUS -ne 0 ]; then + echo "Some tests failed with exit status $EXIT_STATUS" fi coverage combine @@ -234,3 +294,6 @@ coverage xml -i # ================================ Post-proc ========================================= # bash ${this_dir}/post_process.sh + +# Exit with failure if any tests failed +exit $EXIT_STATUS diff --git a/.github/unittest/linux_optdeps/scripts/run_all.sh b/.github/unittest/linux_optdeps/scripts/run_all.sh index c73efc50834..108f52a8527 100755 --- a/.github/unittest/linux_optdeps/scripts/run_all.sh +++ b/.github/unittest/linux_optdeps/scripts/run_all.sh @@ -9,11 +9,21 @@ set -e if [[ $OSTYPE != 'darwin'* ]]; then - apt-get update && apt-get upgrade -y + # Prevent interactive prompts (notably tzdata) in CI. + export DEBIAN_FRONTEND=noninteractive + export TZ="${TZ:-Etc/UTC}" + ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime || true + echo "${TZ}" > /etc/timezone || true + + apt-get update + apt-get install -y --no-install-recommends tzdata + dpkg-reconfigure -f noninteractive tzdata || true + + apt-get upgrade -y apt-get install -y vim git wget cmake - apt-get install -y libglfw3 libgl1-mesa-glx libosmesa6 libglew-dev - apt-get install -y libglvnd0 libgl1 libglx0 libegl1 libgles2 + apt-get install -y libglfw3 libosmesa6 libglew-dev + apt-get install -y libglvnd0 libgl1 libglx0 libglx-mesa0 libegl1 libgles2 if [ "${CU_VERSION:-}" == cpu ] ; then # solves version `GLIBCXX_3.4.29' not found for tensorboard diff --git a/.github/unittest/linux_sota/scripts/run_all.sh b/.github/unittest/linux_sota/scripts/run_all.sh index cf1a9be33a8..a81c562a446 100755 --- a/.github/unittest/linux_sota/scripts/run_all.sh +++ b/.github/unittest/linux_sota/scripts/run_all.sh @@ -7,7 +7,16 @@ set -v # ================================ Init ============================================== # -apt-get update && apt-get upgrade -y +export DEBIAN_FRONTEND=noninteractive +export TZ="${TZ:-Etc/UTC}" +ln -snf "/usr/share/zoneinfo/${TZ}" /etc/localtime || true +echo "${TZ}" > /etc/timezone || true + +apt-get update +apt-get install -y --no-install-recommends tzdata +dpkg-reconfigure -f noninteractive tzdata || true + +apt-get upgrade -y apt-get install -y vim git wget cmake apt-get install -y libglfw3 libgl1-mesa-glx libosmesa6 libglew-dev libosmesa6-dev diff --git a/.github/workflows/nightly_build.yml b/.github/workflows/nightly_build.yml index 5e6c6a3bb91..8069b6a689d 100644 --- a/.github/workflows/nightly_build.yml +++ b/.github/workflows/nightly_build.yml @@ -42,11 +42,11 @@ jobs: matrix: os: [['linux', 'ubuntu-22.04'], ['macos', 'macos-latest']] python_version: [ - ["3.9", "cp39-cp39"], ["3.10", "cp310-cp310"], ["3.11", "cp311-cp311"], ["3.12", "cp312-cp312"], ["3.13", "cp313-cp313"], + ["3.14", "cp314-cp314"], ] cuda_support: [["", "cpu", "cpu"]] steps: @@ -88,11 +88,11 @@ jobs: matrix: os: [['linux', 'ubuntu-22.04'], ['macos', 'macos-latest']] python_version: [ - ["3.9", "cp39-cp39"], ["3.10", "cp310-cp310"], ["3.11", "cp311-cp311"], ["3.12", "cp312-cp312"], ["3.13", "cp313-cp313"], + ["3.14", "cp314-cp314"], ] cuda_support: [["", "cpu", "cpu"]] steps: @@ -162,11 +162,11 @@ jobs: matrix: os: [['linux', 'ubuntu-22.04'], ['macos', 'macos-latest']] python_version: [ - ["3.9", "cp39-cp39"], ["3.10", "cp310-cp310"], ["3.11", "cp311-cp311"], ["3.12", "cp312-cp312"], ["3.13", "cp313-cp313"], + ["3.14", "cp314-cp314"], ] cuda_support: [["", "cpu", "cpu"]] steps: @@ -204,11 +204,11 @@ jobs: strategy: matrix: python_version: [ - ["3.9", "3.9"], 
["3.10", "3.10.3"], ["3.11", "3.11"], ["3.12", "3.12"], ["3.13", "3.13"], + ["3.14", "3.14"], ] steps: - name: Setup Python @@ -244,11 +244,11 @@ jobs: strategy: matrix: python_version: [ - ["3.9", "3.9"], ["3.10", "3.10.3"], ["3.11", "3.11"], ["3.12", "3.12"], ["3.13", "3.13"], + ["3.14", "3.14"], ] steps: - name: Setup Python @@ -314,11 +314,11 @@ jobs: strategy: matrix: python_version: [ - ["3.9", "3.9"], ["3.10", "3.10.3"], ["3.11", "3.11"], ["3.12", "3.12"], ["3.13", "3.13"], + ["3.14", "3.14"], ] steps: - name: Checkout torchrl diff --git a/.github/workflows/test-linux.yml b/.github/workflows/test-linux.yml index e660f29ba78..89fed0b2b18 100644 --- a/.github/workflows/test-linux.yml +++ b/.github/workflows/test-linux.yml @@ -26,13 +26,13 @@ jobs: tests-cpu: strategy: matrix: - python_version: ["3.9", "3.10", "3.11", "3.12"] + python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"] fail-fast: false uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.12xlarge repository: pytorch/rl - docker-image: "nvidia/cuda:12.2.0-devel-ubuntu22.04" + docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04" timeout: 90 script: | if [[ "${{ github.ref }}" =~ release/* ]]; then @@ -56,14 +56,14 @@ jobs: tests-gpu: strategy: matrix: - python_version: ["3.11"] - cuda_arch_version: ["12.8"] + python_version: ["3.12"] + cuda_arch_version: ["13.0"] fail-fast: false uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu repository: pytorch/rl - docker-image: "nvidia/cuda:12.4.0-devel-ubuntu22.04" + docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04" gpu-arch-type: cuda gpu-arch-version: ${{ matrix.cuda_arch_version }} timeout: 90 @@ -128,14 +128,14 @@ jobs: tests-optdeps: strategy: matrix: - python_version: ["3.11"] - cuda_arch_version: ["12.8"] + python_version: ["3.12"] + cuda_arch_version: ["13.0"] fail-fast: false uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu repository: pytorch/rl - docker-image: "nvidia/cuda:12.4.0-devel-ubuntu22.04" + docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04" gpu-arch-type: cuda gpu-arch-version: ${{ matrix.cuda_arch_version }} timeout: 90 @@ -163,14 +163,14 @@ jobs: tests-stable-gpu: strategy: matrix: - python_version: ["3.10"] # "3.9", "3.10", "3.11" - cuda_arch_version: ["11.8"] # "11.6", "11.7" + python_version: ["3.12"] # "3.9", "3.10", "3.11" + cuda_arch_version: ["13.0"] # "11.6", "11.7" fail-fast: false uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu repository: pytorch/rl - docker-image: "nvidia/cuda:12.4.0-devel-ubuntu22.04" + docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04" gpu-arch-type: cuda gpu-arch-version: ${{ matrix.cuda_arch_version }} timeout: 90 diff --git a/pyproject.toml b/pyproject.toml index 0bc91eb32e5..2bcf033f1d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,11 +16,11 @@ maintainers = [ ] keywords = ["reinforcement-learning", "pytorch", "rl", "machine-learning"] classifiers = [ - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Operating System :: OS Independent", "Development Status :: 4 - Beta", "Intended Audience :: Developers", @@ -55,6 +55,9 @@ tests = [ "pytest-rerunfailures", "pytest-error-for-skips", 
"pytest-timeout", + "pytest-forked", + "pytest-random-order", + "pytest-repeat", ] utils = [ "tensorboard", diff --git a/test/smoke_test_deps.py b/test/smoke_test_deps.py index ade1c68f4af..941107199fe 100644 --- a/test/smoke_test_deps.py +++ b/test/smoke_test_deps.py @@ -6,11 +6,16 @@ import argparse import os +import sys import tempfile import pytest +@pytest.mark.skipif( + sys.version_info >= (3, 13), + reason="dm_control not available on Python 3.13+ (labmaze lacks wheels)", +) def test_dm_control(): import dm_control # noqa: F401 import dm_env # noqa: F401 @@ -23,21 +28,29 @@ def test_dm_control(): env.reset() +@pytest.mark.skipif( + sys.version_info >= (3, 13), + reason="dm_control not available on Python 3.13+ (labmaze lacks wheels)", +) @pytest.mark.skip(reason="Not implemented yet") def test_dm_control_pixels(): - from torchrl.envs.libs.dm_control import _has_dmc, DMControlEnv # noqa + from torchrl.envs.libs.dm_control import DMControlEnv env = DMControlEnv("cheetah", "run", from_pixels=True) env.reset() +@pytest.mark.skipif( + sys.version_info >= (3, 14), + reason="gymnasium[atari] / ALE not available on Python 3.14 in CI (ale-py install failing)", +) def test_gym(): try: import gymnasium as gym except ImportError as err: ERROR = err try: - import gym # noqa: F401 + import gym as gym # noqa: F401 except ImportError as err: raise ImportError( f"gym and gymnasium load failed. Gym got error {err}." @@ -46,12 +59,30 @@ def test_gym(): from torchrl.envs.libs.gym import _has_gym, GymEnv # noqa assert _has_gym + # If gymnasium is installed without the atari extra, ALE won't be registered. + # In that case we skip rather than hard-failing the dependency smoke test. + try: + import ale_py # noqa: F401 + except Exception: # pragma: no cover + pytest.skip("ALE not available (missing ale_py); skipping Atari gym test.") if os.getenv("PYTORCH_TEST_FBCODE"): from pytorch.rl.test._utils_internal import PONG_VERSIONED else: from _utils_internal import PONG_VERSIONED - env = GymEnv(PONG_VERSIONED()) + try: + env = GymEnv(PONG_VERSIONED()) + except Exception as err: # gymnasium.error.NamespaceNotFound and similar + namespace_not_found = err.__class__.__name__ == "NamespaceNotFound" + if hasattr(gym, "error") and hasattr(gym.error, "NamespaceNotFound"): + namespace_not_found = namespace_not_found or isinstance( + err, gym.error.NamespaceNotFound + ) + if namespace_not_found: + pytest.skip( + "ALE namespace not registered (gymnasium installed without atari extra)." + ) + raise env.reset() diff --git a/test/test_distributed.py b/test/test_distributed.py index b27459868fb..7879e6d068e 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -55,6 +55,8 @@ if sys.platform.startswith("win"): pytest.skip("skipping windows tests in windows", allow_module_level=True) +pytestmark = [pytest.mark.forked] + class CountingPolicy(TensorDictModuleBase): """A policy for counting env. diff --git a/test/test_env.py b/test/test_env.py index 252d495fb98..baf270a6bbb 100644 --- a/test/test_env.py +++ b/test/test_env.py @@ -13,6 +13,7 @@ import pickle import random import re +import threading import time from collections import defaultdict from functools import partial @@ -98,6 +99,59 @@ pytest.mark.filterwarnings("ignore:unclosed file"), ] + +@pytest.fixture(autouse=False) # Turn to True to enable +def check_no_lingering_multiprocessing_resources(request): + """Fixture that checks for leftover multiprocessing resources after each test. 
+ + This helps detect test pollution where one test leaves behind resource_sharer + threads, zombie processes, or other multiprocessing state that can cause + deadlocks in subsequent tests (especially with fork start method on Linux). + + See: https://bugs.python.org/issue30289 + """ + # Record state before test + threads_before = {t.name for t in threading.enumerate()} + # Count resource_sharer threads specifically + resource_sharer_before = sum( + 1 + for t in threading.enumerate() + if "_serve" in t.name or "resource_sharer" in t.name.lower() + ) + + yield + + # Give a brief moment for cleanup + gc.collect() + time.sleep(0.05) + + # Check for new resource_sharer threads + resource_sharer_after = sum( + 1 + for t in threading.enumerate() + if "_serve" in t.name or "resource_sharer" in t.name.lower() + ) + + # Only warn (not fail) for now - this is informational to help debug + if resource_sharer_after > resource_sharer_before: + new_threads = {t.name for t in threading.enumerate()} - threads_before + resource_sharer_threads = [ + t.name + for t in threading.enumerate() + if "_serve" in t.name or "resource_sharer" in t.name.lower() + ] + import warnings + + warnings.warn( + f"Test {request.node.name} left behind {resource_sharer_after - resource_sharer_before} " + f"resource_sharer thread(s): {resource_sharer_threads}. " + f"New threads: {new_threads}. " + "This can cause deadlocks in subsequent tests with fork start method.", + UserWarning, + stacklevel=1, + ) + + gym_version = None if _has_gym: try: @@ -3827,6 +3881,7 @@ def test_serial(self, bwad, use_buffers): r = env.rollout(N, break_when_any_done=bwad) assert r.get("non_tensor").tolist() == [list(range(N))] * 2 + # @pytest.mark.forked # Run in isolated subprocess to avoid resource_sharer pollution from other tests @pytest.mark.parametrize("bwad", [True, False]) @pytest.mark.parametrize("use_buffers", [False, True]) def test_parallel(self, bwad, use_buffers, maybe_fork_ParallelEnv): diff --git a/torchrl/collectors/distributed/generic.py b/torchrl/collectors/distributed/generic.py index 5f2211ea798..e9afa88b39c 100644 --- a/torchrl/collectors/distributed/generic.py +++ b/torchrl/collectors/distributed/generic.py @@ -300,6 +300,13 @@ def _run_collector( elif instruction == b"shutdown": if verbose: torchrl_logger.debug(f"RANK {rank} -- shutting down") + # Shutdown weight sync schemes first (stops background threads) + if weight_sync_schemes is not None: + for scheme in weight_sync_schemes.values(): + try: + scheme.shutdown() + except Exception: + pass try: collector.shutdown() except Exception: @@ -1117,6 +1124,11 @@ def load_state_dict(self, state_dict: OrderedDict) -> None: raise NotImplementedError def shutdown(self, timeout: float | None = None) -> None: + # Prevent double shutdown + if getattr(self, "_shutdown", False): + return + self._shutdown = True + self._store.set("TRAINER_status", b"shutdown") for i in range(self.num_workers): rank = i + 1 @@ -1138,6 +1150,25 @@ def shutdown(self, timeout: float | None = None) -> None: self.jobs[i].result() elif self.launcher == "submitit_delayed": pass + + # Clean up weight sync schemes AFTER workers have exited + # (workers have their own scheme instances that they clean up) + if self._weight_sync_schemes is not None: + torchrl_logger.debug("shutting down weight sync schemes") + for scheme in self._weight_sync_schemes.values(): + try: + scheme.shutdown() + except Exception as e: + torchrl_logger.warning( + f"Error shutting down weight sync scheme: {e}" + ) + self._weight_sync_schemes = 
None + + # Destroy torch.distributed process group + if torch.distributed.is_initialized(): + torchrl_logger.debug("destroying process group") + torch.distributed.destroy_process_group() + torchrl_logger.debug("collector shut down") diff --git a/torchrl/collectors/distributed/ray.py b/torchrl/collectors/distributed/ray.py index 2df0bca48e1..2f8930207f6 100644 --- a/torchrl/collectors/distributed/ray.py +++ b/torchrl/collectors/distributed/ray.py @@ -1071,6 +1071,19 @@ def shutdown( timeout=timeout if timeout is not None else 5.0 ) self.stop_remote_collectors() + + # Clean up weight sync schemes AFTER workers have exited + if getattr(self, "_weight_sync_schemes", None) is not None: + torchrl_logger.debug("shutting down weight sync schemes") + for scheme in self._weight_sync_schemes.values(): + try: + scheme.shutdown() + except Exception as e: + torchrl_logger.warning( + f"Error shutting down weight sync scheme: {e}" + ) + self._weight_sync_schemes = None + if shutdown_ray: ray.shutdown() diff --git a/torchrl/collectors/distributed/rpc.py b/torchrl/collectors/distributed/rpc.py index a5d0c9a7140..9cc5bbe8076 100644 --- a/torchrl/collectors/distributed/rpc.py +++ b/torchrl/collectors/distributed/rpc.py @@ -863,6 +863,7 @@ def shutdown(self, timeout: float | None = None) -> None: return if self._shutdown: return + torchrl_logger.debug("shutting down") for future, i in self.futures: # clear the futures @@ -876,10 +877,6 @@ def shutdown(self, timeout: float | None = None) -> None: torchrl_logger.debug("rpc shutdown") rpc.shutdown(timeout=int(IDLE_TIMEOUT)) - # Destroy torch.distributed process group - if torch.distributed.is_initialized(): - torch.distributed.destroy_process_group() - if self.launcher == "mp": for job in self.jobs: job.join(int(IDLE_TIMEOUT)) @@ -890,6 +887,23 @@ def shutdown(self, timeout: float | None = None) -> None: pass else: raise NotImplementedError(f"Unknown launcher {self.launcher}") + + # Clean up weight sync schemes AFTER workers have exited + if getattr(self, "_weight_sync_schemes", None) is not None: + torchrl_logger.debug("shutting down weight sync schemes") + for scheme in self._weight_sync_schemes.values(): + try: + scheme.shutdown() + except Exception as e: + torchrl_logger.warning( + f"Error shutting down weight sync scheme: {e}" + ) + self._weight_sync_schemes = None + + # Destroy torch.distributed process group + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + self._shutdown = True diff --git a/torchrl/collectors/distributed/sync.py b/torchrl/collectors/distributed/sync.py index a63b6d33c66..33a26220451 100644 --- a/torchrl/collectors/distributed/sync.py +++ b/torchrl/collectors/distributed/sync.py @@ -686,12 +686,42 @@ def load_state_dict(self, state_dict: OrderedDict) -> None: raise NotImplementedError def shutdown(self, timeout: float | None = None) -> None: - # Clean up weight sync schemes + # Prevent double shutdown + if getattr(self, "_shutdown", False): + return + self._shutdown = True + + # Wait for workers to exit + if hasattr(self, "jobs"): + for job in self.jobs: + if self.launcher == "mp": + if hasattr(job, "is_alive") and job.is_alive(): + job.join(timeout=timeout if timeout is not None else 10) + elif self.launcher == "submitit": + try: + job.result() + except Exception: + pass + + # Clean up weight sync schemes AFTER workers have exited if self._weight_sync_schemes is not None: + torchrl_logger.debug("shutting down weight sync schemes") for scheme in self._weight_sync_schemes.values(): - scheme.shutdown() + 
try: + scheme.shutdown() + except Exception as e: + torchrl_logger.warning( + f"Error shutting down weight sync scheme: {e}" + ) self._weight_sync_schemes = None + # Destroy torch.distributed process group + if torch.distributed.is_initialized(): + torchrl_logger.debug("destroying process group") + torch.distributed.destroy_process_group() + + torchrl_logger.debug("collector shut down") + class DistributedSyncDataCollector( DistributedSyncCollector, metaclass=_LegacyCollectorMeta diff --git a/torchrl/envs/batched_envs.py b/torchrl/envs/batched_envs.py index 0ba2c019303..320d6c9d1f0 100644 --- a/torchrl/envs/batched_envs.py +++ b/torchrl/envs/batched_envs.py @@ -36,6 +36,7 @@ _make_ordinal_device, _ProcessNoWarn, logger as torchrl_logger, + timeit, VERBOSE, ) from torchrl.data.tensor_specs import Composite, NonTensor @@ -1523,6 +1524,10 @@ def look_for_cuda(tensor, has_cuda=has_cuda): channel.send(("init", None)) self.is_closed = False self.set_spec_lock_() + + # Create thread pool for efficient parallel waiting on worker events + from concurrent.futures import ThreadPoolExecutor + self._wait_executor = ThreadPoolExecutor(max_workers=self.num_workers) @_check_start def state_dict(self) -> OrderedDict: @@ -1566,7 +1571,9 @@ def _step_and_maybe_reset_no_buffers( if self.consolidate: try: td = tensordict.consolidate( - share_memory=True, inplace=True, num_threads=1 + # share_memory=False: avoid resource_sharer which causes + # progressive slowdown with fork on Linux + share_memory=False, inplace=True, num_threads=1 ) except Exception as err: raise RuntimeError(_CONSOLIDATE_ERR_CAPTURE) from err @@ -1801,114 +1808,143 @@ def select_and_transfer(x, y): return tensordict, tensordict_ + @timeit("_wait_for_workers") def _wait_for_workers(self, workers_range): - workers_range_consume = set(workers_range) + """Wait for all workers to signal completion via their events. + + Uses multithreaded event.wait() for efficient parallel blocking. + """ + from concurrent.futures import as_completed + + timeout = self.BATCHED_PIPE_TIMEOUT t0 = time.time() - while ( - len(workers_range_consume) - and (time.time() - t0) < self.BATCHED_PIPE_TIMEOUT - ): - for i in workers_range: - if i not in workers_range_consume: - continue - worker = self._workers[i] - if worker.is_alive(): - event: mp.Event = self._events[i] - if event.is_set(): - workers_range_consume.discard(i) - event.clear() - else: - continue - else: + + def wait_for_worker(i): + """Wait for a single worker's event.""" + worker = self._workers[i] + event: mp.Event = self._events[i] + + remaining = timeout - (time.time() - t0) + if remaining <= 0: + return (i, False, "timeout") + + signaled = event.wait(timeout=remaining) + + if not signaled: + if not worker.is_alive(): + return (i, False, "dead") + return (i, False, "timeout") + + event.clear() + return (i, True, None) + + # Wait for all workers in parallel using persistent thread pool + futures = {self._wait_executor.submit(wait_for_worker, i): i for i in workers_range} + + for future in as_completed(futures): + i, success, error = future.result() + if not success: + if error == "dead": try: self._shutdown_workers() finally: raise RuntimeError(f"Cannot proceed, worker {i} dead.") - # event.wait(self.BATCHED_PIPE_TIMEOUT) - if len(workers_range_consume): - raise RuntimeError( - f"Failed to run all workers within the {self.BATCHED_PIPE_TIMEOUT} sec time limit. This " - f"threshold can be increased via the BATCHED_PIPE_TIMEOUT env variable." 
- ) + else: + raise RuntimeError( + f"Failed to run all workers within the {timeout} sec time limit. This " + f"threshold can be increased via the BATCHED_PIPE_TIMEOUT env variable." + ) + @timeit("_step_no_buffers") def _step_no_buffers( self, tensordict: TensorDictBase ) -> tuple[TensorDictBase, TensorDictBase]: - partial_steps = tensordict.get("_step") - tensordict_save = tensordict - if partial_steps is not None and partial_steps.all(): - partial_steps = None - if partial_steps is not None: - partial_steps = partial_steps.view(tensordict.shape) - tensordict = tensordict[partial_steps] - workers_range = partial_steps.nonzero(as_tuple=True)[0].tolist() - else: - workers_range = range(self.num_workers) + torchrl_logger.debug( + f"Entering _step_no_buffers. Timeit state: {timeit.print()}" + ) + with timeit("_step_no_buffers prep"): + partial_steps = tensordict.get("_step") + tensordict_save = tensordict + if partial_steps is not None and partial_steps.all(): + partial_steps = None + if partial_steps is not None: + partial_steps = partial_steps.view(tensordict.shape) + tensordict = tensordict[partial_steps] + workers_range = partial_steps.nonzero(as_tuple=True)[0].tolist() + else: + workers_range = range(self.num_workers) - if self.consolidate: - try: - data = tensordict.consolidate( - share_memory=True, inplace=False, num_threads=1 + with timeit("_step_no_buffers prep"): + if self.consolidate: + try: + data = tensordict.consolidate( + # share_memory=False: avoid resource_sharer which causes + # progressive slowdown with fork on Linux + share_memory=False, inplace=False, num_threads=1 + ) + except Exception as err: + raise RuntimeError(_CONSOLIDATE_ERR_CAPTURE) from err + else: + data = tensordict + + with timeit("_step_no_buffers send"): + for i, local_data in zip(workers_range, data.unbind(0)): + env_device = ( + self.meta_data[i].device + if isinstance(self.meta_data, list) + else self.meta_data.device ) - except Exception as err: - raise RuntimeError(_CONSOLIDATE_ERR_CAPTURE) from err - else: - data = tensordict - - for i, local_data in zip(workers_range, data.unbind(0)): - env_device = ( - self.meta_data[i].device - if isinstance(self.meta_data, list) - else self.meta_data.device - ) - if data.device != env_device: - if env_device is None: - local_data.clear_device_() - else: - local_data = local_data.to(env_device) - self.parent_channels[i].send(("step", local_data)) + if data.device != env_device: + if env_device is None: + local_data.clear_device_() + else: + local_data = local_data.to(env_device) + self.parent_channels[i].send(("step", local_data)) # for i in range(data.shape[0]): # self.parent_channels[i].send(("step", (data, i))) - self._wait_for_workers(workers_range) - - out_tds = [] - for i in workers_range: - channel = self.parent_channels[i] - td = channel.recv() - out_tds.append(td) + with timeit("_step_no_buffers wait for workers"): + self._wait_for_workers(workers_range) + with timeit("_step_no_buffers recv"): + out_tds = [] + for i in workers_range: + channel = self.parent_channels[i] + td = channel.recv() + out_tds.append(td) - out = LazyStackedTensorDict.maybe_dense_stack(out_tds) - if self.device is not None and out.device != self.device: - out = out.to(self.device, non_blocking=self.non_blocking) - if partial_steps is not None: - result = out.new_zeros(tensordict_save.shape) + with timeit("_step_no_buffers post-process"): + out = LazyStackedTensorDict.maybe_dense_stack(out_tds) + if self.device is not None and out.device != self.device: + out = out.to(self.device, 
non_blocking=self.non_blocking) + if partial_steps is not None: + result = out.new_zeros(tensordict_save.shape) - def select_and_clone(x, y): - if y is not None: - if x.device != y.device: - x = x.to(y.device) - else: - x = x.clone() - return x + def select_and_clone(x, y): + if y is not None: + if x.device != y.device: + x = x.to(y.device) + else: + x = x.clone() + return x - prev = tensordict_save._fast_apply( - select_and_clone, - result, - filter_empty=True, - device=result.device, - batch_size=result.batch_size, - is_leaf=_is_leaf_nontensor, - default=None, - ) + prev = tensordict_save._fast_apply( + select_and_clone, + result, + filter_empty=True, + device=result.device, + batch_size=result.batch_size, + is_leaf=_is_leaf_nontensor, + default=None, + ) - result.update(prev) + result.update(prev) - if partial_steps.any(): - result[partial_steps] = out - return result - return out + if partial_steps.any(): + result[partial_steps] = out + return result + return out + @timeit("_step") @torch.no_grad() @_check_start def _step(self, tensordict: TensorDictBase) -> TensorDictBase: @@ -1922,41 +1958,43 @@ def _step(self, tensordict: TensorDictBase) -> TensorDictBase: # and this transform overrides an observation key (eg, CatFrames) # the shape, dtype or device may not necessarily match and writing # the value in-place will fail. - partial_steps = tensordict.get("_step") - tensordict_save = tensordict - if partial_steps is not None and partial_steps.all(): - partial_steps = None - if partial_steps is not None: - partial_steps = partial_steps.view(tensordict.shape) - workers_range = partial_steps.nonzero(as_tuple=True)[0].tolist() - shared_tensordict_parent = TensorDict.lazy_stack( - [self.shared_tensordicts[i] for i in workers_range] - ) - if self.shared_tensordict_parent.device is None: - tensordict = tensordict._fast_apply( - lambda x, y: x[partial_steps].to(y.device) - if y is not None - else x[partial_steps], - self.shared_tensordict_parent, - default=None, - device=None, - batch_size=shared_tensordict_parent.shape, + with timeit("_step prep"): + partial_steps = tensordict.get("_step") + tensordict_save = tensordict + if partial_steps is not None and partial_steps.all(): + partial_steps = None + if partial_steps is not None: + partial_steps = partial_steps.view(tensordict.shape) + workers_range = partial_steps.nonzero(as_tuple=True)[0].tolist() + shared_tensordict_parent = TensorDict.lazy_stack( + [self.shared_tensordicts[i] for i in workers_range] ) + if self.shared_tensordict_parent.device is None: + tensordict = tensordict._fast_apply( + lambda x, y: x[partial_steps].to(y.device) + if y is not None + else x[partial_steps], + self.shared_tensordict_parent, + default=None, + device=None, + batch_size=shared_tensordict_parent.shape, + ) + else: + tensordict = tensordict[partial_steps].to( + self.shared_tensordict_parent.device + ) else: - tensordict = tensordict[partial_steps].to( - self.shared_tensordict_parent.device - ) - else: - workers_range = range(self.num_workers) - shared_tensordict_parent = self.shared_tensordict_parent - - shared_tensordict_parent.update_( - tensordict, - # We also update the output keys because they can be implicitly used, eg - # during partial steps to fill in values - keys_to_update=list(self._env_input_keys), - non_blocking=self.non_blocking, - ) + workers_range = range(self.num_workers) + shared_tensordict_parent = self.shared_tensordict_parent + + with timeit("_step update"): + shared_tensordict_parent.update_( + tensordict, + # We also update the output 
keys because they can be implicitly used, eg + # during partial steps to fill in values + keys_to_update=list(self._env_input_keys), + non_blocking=self.non_blocking, + ) next_td_passthrough = tensordict.get("next", None) if next_td_passthrough is not None: # if we have input "next" data (eg, RNNs which pass the next state) @@ -2000,74 +2038,81 @@ def _step(self, tensordict: TensorDictBase) -> TensorDictBase: data = [{} for _ in range(self.num_workers)] if self._non_tensor_keys: - for i, td in zip( - workers_range, - tensordict.select(*self._non_tensor_keys, strict=False).unbind(0), - ): - data[i]["non_tensor_data"] = td + with timeit("_step non-tensor keys"): + for i, td in zip( + workers_range, + tensordict.select(*self._non_tensor_keys, strict=False).unbind(0), + ): + data[i]["non_tensor_data"] = td - self._sync_m2w() + with timeit("_step sync m2w"): + self._sync_m2w() - if self.event is not None: - self.event.record() - self.event.synchronize() - for i in workers_range: - self.parent_channels[i].send(("step", data[i])) + with timeit("_step event"): + if self.event is not None: + self.event.record() + self.event.synchronize() + with timeit("_step send"): + for i in workers_range: + self.parent_channels[i].send(("step", data[i])) - self._wait_for_workers(workers_range) + with timeit("_step wait for workers"): + self._wait_for_workers(workers_range) - if self._non_tensor_keys: - non_tensor_tds = [] - for i in workers_range: - msg, non_tensor_td = self.parent_channels[i].recv() - non_tensor_tds.append(non_tensor_td) + with timeit("_step recv"): + if self._non_tensor_keys: + non_tensor_tds = [] + for i in workers_range: + msg, non_tensor_td = self.parent_channels[i].recv() + non_tensor_tds.append(non_tensor_td) - # We must pass a clone of the tensordict, as the values of this tensordict - # will be modified in-place at further steps - next_td = shared_tensordict_parent.get("next") - device = self.device + with timeit("_step post-process"): + # We must pass a clone of the tensordict, as the values of this tensordict + # will be modified in-place at further steps + next_td = shared_tensordict_parent.get("next") + device = self.device - out = next_td.named_apply( - self.select_and_clone, - nested_keys=True, - filter_empty=True, - device=device, - ) - if self._non_tensor_keys: - out.update( - LazyStackedTensorDict(*non_tensor_tds), - keys_to_update=self._non_tensor_keys, + out = next_td.named_apply( + self.select_and_clone, + nested_keys=True, + filter_empty=True, + device=device, ) - if next_td_passthrough is not None: - out.update(next_td_passthrough) + if self._non_tensor_keys: + out.update( + LazyStackedTensorDict(*non_tensor_tds), + keys_to_update=self._non_tensor_keys, + ) + if next_td_passthrough is not None: + out.update(next_td_passthrough) - self._sync_w2m() - if partial_steps is not None: - result = out.new_zeros(tensordict_save.shape) + self._sync_w2m() + if partial_steps is not None: + result = out.new_zeros(tensordict_save.shape) - def select_and_clone(x, y): - if y is not None: - if x.device != y.device: - x = x.to(y.device) - else: - x = x.clone() - return x + def select_and_clone(x, y): + if y is not None: + if x.device != y.device: + x = x.to(y.device) + else: + x = x.clone() + return x - prev = tensordict_save._fast_apply( - select_and_clone, - result, - filter_empty=True, - device=result.device, - batch_size=result.batch_size, - is_leaf=_is_leaf_nontensor, - default=None, - ) + prev = tensordict_save._fast_apply( + select_and_clone, + result, + filter_empty=True, + 
device=result.device, + batch_size=result.batch_size, + is_leaf=_is_leaf_nontensor, + default=None, + ) - result.update(prev) - if partial_steps.any(): - result[partial_steps] = out - return result - return out + result.update(prev) + if partial_steps.any(): + result[partial_steps] = out + return result + return out def _reset_no_buffers( self, @@ -2076,11 +2121,12 @@ def _reset_no_buffers( needs_resetting, ) -> tuple[TensorDictBase, TensorDictBase]: if is_tensor_collection(tensordict): - # tensordict = tensordict.consolidate(share_memory=True, num_threads=1) if self.consolidate: try: tensordict = tensordict.consolidate( - share_memory=True, num_threads=1 + # share_memory=False: avoid resource_sharer which causes + # progressive slowdown with fork on Linux + share_memory=False, num_threads=1 ) except Exception as err: raise RuntimeError(_CONSOLIDATE_ERR_CAPTURE) from err @@ -2230,6 +2276,11 @@ def tentative_update(val, other): @_check_start def _shutdown_workers(self) -> None: try: + # Shutdown the wait executor first + if hasattr(self, "_wait_executor") and self._wait_executor is not None: + self._wait_executor.shutdown(wait=False) + self._wait_executor = None + if self.is_closed: raise RuntimeError( "calling {self.__class__.__name__}._shutdown_workers only allowed when env.is_closed = False" @@ -2652,6 +2703,7 @@ def _run_worker_pipe_direct( child_pipe.send("started") while True: + torchrl_logger.debug(f"[worker {pid}] poll, timing: {timeit.print()}") try: if child_pipe.poll(_timeout): cmd, data = child_pipe.recv() @@ -2686,35 +2738,44 @@ def _run_worker_pipe_direct( raise RuntimeError("call 'init' before resetting") # we use 'data' to pass the keys that we need to pass to reset, # because passing the entire buffer may have unwanted consequences - # data, idx, reset_kwargs = data - # data = data[idx] data, reset_kwargs = data - if data is not None: - data.unlock_() - data._fast_apply( - lambda x: x.clone() if x.device.type == "cuda" else x, out=data + + with timeit("worker reset data_prep"): + if data is not None: + data.unlock_() + data._fast_apply( + lambda x: x.clone() if x.device.type == "cuda" else x, out=data + ) + + with timeit("worker reset env.reset"): + cur_td = env.reset( + tensordict=data, + **reset_kwargs, ) - cur_td = env.reset( - tensordict=data, - **reset_kwargs, - ) - if event is not None: - event.record() - event.synchronize() - if consolidate: - try: - child_pipe.send( - cur_td.consolidate( - share_memory=True, inplace=True, num_threads=1 + + with timeit("worker reset cuda_sync"): + if event is not None: + event.record() + event.synchronize() + + with timeit("worker reset consolidate"): + if consolidate: + try: + cur_td = cur_td.consolidate( + # share_memory=False: avoid resource_sharer which causes + # progressive slowdown with fork on Linux + share_memory=False, inplace=True, num_threads=1 ) - ) - except Exception as err: - raise RuntimeError(_CONSOLIDATE_ERR_CAPTURE) from err - else: + except Exception as err: + raise RuntimeError(_CONSOLIDATE_ERR_CAPTURE) from err + + with timeit("worker reset pipe.send"): child_pipe.send(cur_td) - # Set event after successfully sending through pipe to avoid race condition - # where event is set but pipe send fails (BrokenPipeError) - mp_event.set() + + with timeit("worker reset event.set"): + # Set event after successfully sending through pipe to avoid race condition + # where event is set but pipe send fails (BrokenPipeError) + mp_event.set() del cur_td @@ -2722,23 +2783,35 @@ def _run_worker_pipe_direct( if not initialized: 
raise RuntimeError("called 'init' before step") i += 1 - # data, idx = data - # data = data[idx] - next_td = env._step(data) - if event is not None: - event.record() - event.synchronize() - if consolidate: - try: - next_td = next_td.consolidate( - share_memory=True, inplace=True, num_threads=1 - ) - except Exception as err: - raise RuntimeError(_CONSOLIDATE_ERR_CAPTURE) from err - child_pipe.send(next_td) - # Set event after successfully sending through pipe to avoid race condition - # where event is set but pipe send fails (BrokenPipeError) - mp_event.set() + + with timeit("worker step env._step"): + next_td = env._step(data) + + with timeit("worker step cuda_sync"): + if event is not None: + event.record() + event.synchronize() + + with timeit("worker step consolidate"): + if consolidate: + try: + next_td = next_td.consolidate( + # TESTING: share_memory=True to observe slowdown + share_memory=True, inplace=True, num_threads=1 + ) + except Exception as err: + raise RuntimeError(_CONSOLIDATE_ERR_CAPTURE) from err + + with timeit("worker step pipe.send"): + child_pipe.send(next_td) + + with timeit("worker step event.set"): + # Set event after successfully sending through pipe to avoid race condition + # where event is set but pipe send fails (BrokenPipeError) + mp_event.set() + + # Print worker timing after each step to observe progressive slowdown + torchrl_logger.info(f"[worker {pid}] step {i} timing: {timeit.print()}") del next_td diff --git a/torchrl/weight_update/_distributed.py b/torchrl/weight_update/_distributed.py index c9cad578c53..fd2625002c8 100644 --- a/torchrl/weight_update/_distributed.py +++ b/torchrl/weight_update/_distributed.py @@ -589,16 +589,13 @@ def _setup_connection_and_weights_on_receiver_impl( def shutdown(self) -> None: """Stop background receiver thread and clean up.""" - if self._stop_event is not None: - self._stop_event.set() - if self._background_thread is not None: - self._background_thread.join(timeout=5.0) - if self._background_thread.is_alive(): - torchrl_logger.warning( - "DistributedWeightSyncScheme: Background thread did not stop gracefully" - ) - self._background_thread = None - self._stop_event = None + # Check if already shutdown + if getattr(self, "_is_shutdown", False): + return + self._is_shutdown = True + + # Let base class handle background thread cleanup + super().shutdown() @property def model(self) -> Any | None: diff --git a/torchrl/weight_update/_shared.py b/torchrl/weight_update/_shared.py index cba335a967b..7aa15471a07 100644 --- a/torchrl/weight_update/_shared.py +++ b/torchrl/weight_update/_shared.py @@ -886,22 +886,18 @@ def __getstate__(self): def shutdown(self) -> None: """Stop the background receiver thread and clean up.""" + # Check if already shutdown + if getattr(self, "_is_shutdown", False): + return + self._is_shutdown = True + # Signal all workers to stop - if self._instruction_queues: + if getattr(self, "_instruction_queues", None): for worker_idx in self._instruction_queues: try: self._instruction_queues[worker_idx].put("stop") except Exception: pass - # Stop local background thread if running - if self._stop_event is not None: - self._stop_event.set() - if self._background_thread is not None: - self._background_thread.join(timeout=5.0) - if self._background_thread.is_alive(): - torchrl_logger.warning( - "SharedMemWeightSyncScheme: Background thread did not stop gracefully" - ) - self._background_thread = None - self._stop_event = None + # Let base class handle background thread cleanup + super().shutdown() diff --git 
a/torchrl/weight_update/weight_sync_schemes.py b/torchrl/weight_update/weight_sync_schemes.py index 75ab16563b4..b381a4db55b 100644 --- a/torchrl/weight_update/weight_sync_schemes.py +++ b/torchrl/weight_update/weight_sync_schemes.py @@ -1231,3 +1231,30 @@ def __getstate__(self): def __setstate__(self, state): """Restore the scheme from pickling.""" self.__dict__.update(state) + + def __del__(self): + """Clean up resources when the scheme is garbage collected.""" + try: + self.shutdown() + except Exception: + # Silently ignore any errors during garbage collection cleanup + pass + + def shutdown(self) -> None: + """Shutdown the scheme and release resources. + + This method stops any background threads and cleans up connections. + It is safe to call multiple times. Subclasses should override this + method to add custom cleanup logic, but should call super().shutdown() + to ensure base cleanup is performed. + """ + # Stop background receiver thread if running + if getattr(self, "_stop_event", None) is not None: + self._stop_event.set() + if getattr(self, "_background_thread", None) is not None: + try: + self._background_thread.join(timeout=5.0) + except Exception: + pass + self._background_thread = None + self._stop_event = None
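
Note on the shutdown refactor in the last three hunks: the base-class `shutdown()` added to weight_sync_schemes.py is meant to be idempotent and safe to call from `__del__`, while subclasses (as in `_distributed.py` and `_shared.py` above) guard against double shutdown, do their own cleanup, and then defer thread teardown to the base class. The snippet below is a minimal standalone sketch of that pattern only; the class names and the instruction-queue detail are illustrative placeholders, not the actual torchrl classes or API.

```python
import threading


class _BaseScheme:
    """Illustrative stand-in for the base weight-sync scheme (not the torchrl API)."""

    _stop_event: threading.Event | None = None
    _background_thread: threading.Thread | None = None

    def shutdown(self) -> None:
        # Idempotent: attributes may be missing or already cleared.
        if getattr(self, "_stop_event", None) is not None:
            self._stop_event.set()
        if getattr(self, "_background_thread", None) is not None:
            try:
                self._background_thread.join(timeout=5.0)
            except Exception:
                pass
            self._background_thread = None
        self._stop_event = None

    def __del__(self):
        # Best-effort cleanup on garbage collection; never raise from __del__.
        try:
            self.shutdown()
        except Exception:
            pass


class _SharedMemLikeScheme(_BaseScheme):
    """Subclass pattern mirrored from the diff: guard against double shutdown,
    do subclass-specific cleanup, then let the base class stop the thread."""

    def __init__(self, instruction_queues=None):
        self._instruction_queues = instruction_queues or {}

    def shutdown(self) -> None:
        if getattr(self, "_is_shutdown", False):
            return
        self._is_shutdown = True
        for queue in self._instruction_queues.values():
            try:
                queue.put("stop")  # tell each worker to exit its receive loop
            except Exception:
                pass
        super().shutdown()
```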