16 changes: 15 additions & 1 deletion dandi/cli/cmd_upload.py
@@ -9,7 +9,7 @@
instance_option,
map_to_click_exceptions,
)
from ..upload import UploadExisting, UploadValidation
from ..upload import GitAnnexMode, UploadExisting, UploadValidation


@click.command()
@@ -48,6 +48,18 @@
default="require",
show_default=True,
)
@click.option(
"--git-annex",
type=click.Choice(tuple(e.value for e in GitAnnexMode)),
default="auto",
show_default=True,
help=(
"Enable git-annex support to link local uploaded files with remote URLs. "
"'yes' - always use git-annex; "
"'no' - never use git-annex; "
"'auto' - auto-detect git-annex repository and use if found (default)."
),
)
@click.argument("paths", nargs=-1) # , type=click.Path(exists=True, dir_okay=False))
# &
# Development options: Set DANDI_DEVEL for them to become available
@@ -75,6 +87,7 @@ def upload(
dandi_instance: str,
existing: UploadExisting,
validation: UploadValidation,
git_annex: GitAnnexMode,
# Development options should come as kwargs
allow_any_path: bool = False,
upload_dandiset_metadata: bool = False,
@@ -115,4 +128,5 @@ def upload(
jobs=jobs,
jobs_per_file=jobs_per_file,
sync=sync,
git_annex=git_annex,
)
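
For context, here is a minimal sketch of passing the new option through the Python API instead of the CLI. It is illustrative only and assumes just the parameters visible in this diff (the file path is made up):

    from pathlib import Path

    from dandi.upload import GitAnnexMode, upload

    # Upload files from a dandiset directory and let the tool auto-detect
    # whether they live inside an initialized git-annex repository.
    upload(
        paths=[Path("sub-01/sub-01_ecephys.nwb")],  # hypothetical path
        git_annex=GitAnnexMode.AUTO,  # or GitAnnexMode.YES / GitAnnexMode.NO
    )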
55 changes: 55 additions & 0 deletions dandi/tests/test_upload.py
@@ -5,6 +5,7 @@
import os
from pathlib import Path
from shutil import copyfile, rmtree
import subprocess
from typing import Any
from unittest.mock import Mock
from urllib.parse import urlparse
@@ -707,3 +708,57 @@ def test_upload_rejects_dandidownload_nwb_file(new_dandiset: SampleDandiset) ->
match=f"contains {DOWNLOAD_SUFFIX} path which indicates incomplete download",
):
new_dandiset.upload(allow_any_path=True)


@pytest.mark.ai_generated
def test_git_annex_repo_ops(tmp_path: Path) -> None:
"""Test git-annex repository detection and backend operations."""
from ..upload import _get_file_annex_backend, _is_git_annex_repo, GitAnnexMode

# Check if git-annex is available
try:
subprocess.run(
["git", "annex", "version"],
capture_output=True,
check=True,
)
except (FileNotFoundError, subprocess.CalledProcessError):
pytest.skip("git-annex not available")

# Test enum values
assert GitAnnexMode.YES.value == "yes"
assert GitAnnexMode.NO.value == "no"
assert GitAnnexMode.AUTO.value == "auto"
assert str(GitAnnexMode.YES) == "yes"

# Test detection: Not a git repo
assert not _is_git_annex_repo(tmp_path)

# Create a git repo
subprocess.run(["git", "init"], cwd=tmp_path, check=True, capture_output=True)
assert not _is_git_annex_repo(tmp_path)

# Initialize git-annex
subprocess.run(
["git", "annex", "init"], cwd=tmp_path, check=True, capture_output=True
)
assert _is_git_annex_repo(tmp_path)

# Test backend detection: Create a test file
test_file = tmp_path / "test.txt"
test_file.write_text("test content")

# Not annexed yet
assert _get_file_annex_backend(test_file) is None

# Add to annex
subprocess.run(
["git", "annex", "add", "test.txt"],
cwd=tmp_path,
check=True,
capture_output=True,
)

# Should detect backend (default is SHA256E on most systems)
backend = _get_file_annex_backend(test_file)
assert backend is not None
235 changes: 235 additions & 0 deletions dandi/upload.py
@@ -6,9 +6,11 @@
from enum import Enum
from functools import reduce
import io
import os
import os.path
from pathlib import Path
import re
import subprocess
import time
from time import sleep
from typing import Any, TypedDict, cast
@@ -66,6 +68,145 @@ def _check_dandidownload_paths(dfile: DandiFile) -> None:
)


def _is_git_annex_repo(path: Path) -> bool:
"""
Check if the given path is within a git-annex repository.

Returns True if git-annex is properly initialized (has git-annex branch
and annex.uuid config).
"""
try:
# Find git repository root
result = subprocess.run(
["git", "rev-parse", "--show-toplevel"],
cwd=path if path.is_dir() else path.parent,
capture_output=True,
text=True,
check=False,
)
if result.returncode != 0:
return False

repo_root = Path(result.stdout.strip())

# Check for git-annex branch
branch_result = subprocess.run(
["git", "show-ref", "--verify", "refs/heads/git-annex"],
cwd=repo_root,
capture_output=True,
check=False,
)
if branch_result.returncode != 0:
return False

# Check for annex.uuid config
config_result = subprocess.run(
["git", "config", "annex.uuid"],
cwd=repo_root,
capture_output=True,
text=True,
check=False,
)
if config_result.returncode != 0 or not config_result.stdout.strip():
return False

return True
except Exception as e:
lgr.debug("Error checking for git-annex repository: %s", e)
return False


def _get_file_annex_backend(filepath: Path) -> str | None:
"""
Get the git-annex backend for a file if it's annexed.

Returns the backend name (e.g., 'SHA256E') or None if file is not annexed.
"""
if not filepath.exists():
return None

try:
# Check if file is a symlink (git-annex files are symlinks)
if not filepath.is_symlink():
return None

# Read the symlink target
target = os.readlink(filepath)

# Parse the backend from the annex key format
# Typical format: ../../.git/annex/objects/XX/YY/SHA256E-sNNNNN--HASH/HASH
# Verify this is actually in the annex objects directory
target_path = Path(target)
if "annex" not in target_path.parts or "objects" not in target_path.parts:
return None

# Find the part that looks like an annex key (contains backend and size)
for part in target_path.parts:
Review comment (Member): just take the filename

Reply (Contributor Author): Done in fbb8785. Now using Path(target).name to extract just the filename.

if "-s" in part and "--" in part:
# This looks like an annex key: BACKEND-sSIZE--HASH
# Extract just the filename (last part of path)
backend = Path(target).name.split("-")[0]
return backend
except Exception as e:
lgr.debug("Error checking git-annex backend for %s: %s", filepath, e)

return None
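
As an aside (not part of the diff), here is a worked example of the key parsing discussed in the review thread above, assuming a typical SHA256E symlink target with a made-up hash:

    from pathlib import Path

    # Hypothetical symlink target of an annexed file; the key filename
    # encodes BACKEND-sSIZE--HASH[.EXT].
    target = (
        "../../.git/annex/objects/Wx/1q/"
        "SHA256E-s1024--abc123.nwb/SHA256E-s1024--abc123.nwb"
    )

    # Splitting the key filename on "-" yields the backend as the first field.
    backend = Path(target).name.split("-")[0]
    assert backend == "SHA256E"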


def _register_url_with_annex(
filepath: Path, remote_url: str, expected_size: int | None = None
) -> bool:
"""
Register a remote URL with git-annex for a file.

Args:
filepath: Local file path
remote_url: Remote URL to register
expected_size: Expected file size (for validation)

Returns:
True if successful, False otherwise
"""
try:
# Use git-annex addurl with --relaxed to skip size/hash checks
# and --file to specify the target file
cmd = [
"git",
"annex",
"addurl",
"--relaxed",
"--file",
str(filepath),
remote_url,
]

lgr.debug("Registering URL with git-annex: %s -> %s", filepath, remote_url)

result = subprocess.run(
cmd,
cwd=filepath.parent,
capture_output=True,
text=True,
check=False,
)

if result.returncode == 0:
lgr.info("Successfully registered url %s to %s with git-annex", remote_url, filepath)
return True
else:
lgr.warning(
"Failed to register url %s for %s with git-annex: %s",
remote_url,
filepath,
result.stderr.strip(),
)
return False

except Exception as e:
lgr.warning("Error registering URL with git-annex for %s: %s", filepath, e)
return False
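
For reference, a small sketch of how one could verify the registration performed by _register_url_with_annex using git-annex's own reporting (the file path below is made up):

    import subprocess
    from pathlib import Path

    annexed_file = Path("dandiset/sub-01/sub-01_ecephys.nwb")  # hypothetical

    # "git annex whereis" lists known locations of the file's content; after a
    # successful addurl, the "web" special remote should appear with the
    # registered URL.
    result = subprocess.run(
        ["git", "annex", "whereis", annexed_file.name],
        cwd=annexed_file.parent,
        capture_output=True,
        text=True,
        check=False,
    )
    print(result.stdout)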


class Uploaded(TypedDict):
size: int
errors: list[str]
@@ -91,6 +232,17 @@ def __str__(self) -> str:
return self.value


class GitAnnexMode(str, Enum):
"""Mode for git-annex integration during upload"""

YES = "yes"
NO = "no"
AUTO = "auto"

def __str__(self) -> str:
return self.value


def upload(
paths: Sequence[str | Path] | None = None,
existing: UploadExisting = UploadExisting.REFRESH,
@@ -102,6 +254,7 @@ def upload(
jobs: int | None = None,
jobs_per_file: int | None = None,
sync: bool = False,
git_annex: GitAnnexMode = GitAnnexMode.AUTO,
) -> None:
if paths:
paths = [Path(p).absolute() for p in paths]
@@ -114,6 +267,16 @@
" paths. Use 'dandi download' or 'organize' first."
)

# Determine if we should use git-annex based on mode
use_git_annex = False
if git_annex == GitAnnexMode.YES:
use_git_annex = True
elif git_annex == GitAnnexMode.AUTO:
# Auto-detect git-annex repository
use_git_annex = _is_git_annex_repo(dandiset.path_obj)
if use_git_annex:
lgr.info("Auto-detected git-annex repository; enabling git-annex support")

with ExitStack() as stack:
# We need to use the client as a context manager in order to ensure the
# session gets properly closed. Otherwise, pytest sometimes complains
@@ -384,9 +547,12 @@ def process_path(dfile: DandiFile) -> Iterator[dict]:
#
yield {"status": "uploading"}
validating = False
uploaded_asset: RemoteAsset | None = None
for r in dfile.iter_upload(
remote_dandiset, metadata, jobs=jobs_per_file, replacing=extant
):
if r["status"] == "done":
uploaded_asset = r.get("asset")
r.pop("asset", None) # to keep pyout from choking
if r["status"] == "uploading":
uploaded_paths[strpath]["size"] = r.pop("current")
@@ -398,6 +564,75 @@ def process_path(dfile: DandiFile) -> Iterator[dict]:
validating = True
else:
yield r

# Handle git-annex integration after successful upload
if use_git_annex and uploaded_asset is not None:
# Check backend and warn if not SHA256E
current_backend = _get_file_annex_backend(dfile.filepath)
if current_backend and current_backend != "SHA256E":
lgr.warning(
"%s: File backend is %s, but dandiset standard is SHA256E",
strpath,
current_backend,
)

# Poll for contentUrl with timeout (may take time to appear)
# The contentUrl field contains an array of URLs where the
# asset can be accessed. For public dandisets, we prefer
# S3 URLs. For embargoed dandisets, we may need API URLs
# with authentication.
content_urls = None
max_wait = 20 # seconds
poll_interval = 2 # seconds
for attempt in range(max_wait // poll_interval):
asset_metadata = uploaded_asset.get_raw_metadata()
content_urls = asset_metadata.get("contentUrl", [])
if content_urls:
break
if attempt < (max_wait // poll_interval) - 1:
lgr.debug(
"%s: Waiting for contentUrl to be populated (attempt %d/%d)",
strpath,
attempt + 1,
max_wait // poll_interval,
)
)

Review comment (Member): it might take time to get the URL registered in the backend, so we would need to loop for e.g. up to 20 seconds, re-asking every 2 seconds, failing if none is provided after 20 seconds, and exiting the loop once the URL is successfully registered.

Note that there could be multiple URLs; review the code in https://github.com/dandi/backups2datalad for dealing with public vs private/embargoed dandisets. For embargoed dandisets we need to ensure enabling the datalad special remote and associating API-based URLs to facilitate authentication. If public -- then public S3 URLs.

Reply (Contributor Author): Done in fbb8785. Added a polling loop (20s timeout, 2s intervals) for contentUrl and implemented S3 URL preference for public dandisets. Note: embargoed dandisets with datalad special-remote authentication would need additional work; for now this handles the public case, which is the primary use case.

sleep(poll_interval)

if not content_urls:
raise RuntimeError(
f"{strpath}: No contentUrl found in asset metadata after {max_wait}s. "
"Cannot register with git-annex. Use --git-annex no to disable."
)

# Select appropriate URL based on dandiset status
# For public dandisets: prefer S3 URLs (direct access)
# For embargoed: may need to use API URLs with auth
remote_url = None
for url in content_urls:
# Prefer S3 URLs for public access
if "s3.amazonaws.com" in url.lower() or "amazonaws.com" in url.lower():
remote_url = url
break

# Fall back to first URL if no S3 URL found
if not remote_url:
remote_url = content_urls[0]

file_size = asset_metadata.get("contentSize")

if not _register_url_with_annex(
dfile.filepath, remote_url, file_size
):
raise RuntimeError(
f"{strpath}: Failed to register URL with git-annex. "
"Use --git-annex no to disable."
)
Review comment (Member): remove swallow + warn -- let it raise. If it doesn't work, the user should disable use of git-annex. We might want to hint to use --git-annex no or whatever that option ends up being called.

Reply (Contributor Author): Done in fbb8785. Removed the try/except wrapper -- errors now raise with a helpful hint to use --git-annex no.

lgr.debug(
"%s: Successfully registered remote URL with git-annex",
strpath,
)

yield {"status": "done"}

except Exception as exc:
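
Finally, a rough sketch of the URL-selection extension deferred in the review thread above. The is_embargoed flag and the "/api/" URL pattern are assumptions, not part of this PR; the actual handling would follow the approach used in backups2datalad:

    def _select_annex_url(content_urls: list[str], is_embargoed: bool) -> str:
        """Pick which contentUrl to register with git-annex (illustrative only)."""
        if is_embargoed:
            # Embargoed dandisets: prefer API URLs so git-annex can authenticate
            # via a datalad special remote instead of hitting S3 directly.
            for url in content_urls:
                if "/api/" in url:
                    return url
        else:
            # Public dandisets: prefer direct S3 URLs.
            for url in content_urls:
                if "amazonaws.com" in url.lower():
                    return url
        # Fall back to the first URL provided.
        return content_urls[0]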