diff --git a/README.md b/README.md index 6466b9e7..f4a6f835 100644 --- a/README.md +++ b/README.md @@ -146,30 +146,57 @@ DataJoint (). ### Prerequisites -- [Docker](https://docs.docker.com/get-docker/) for MySQL and MinIO services +- [Docker](https://docs.docker.com/get-docker/) (Docker daemon must be running) - Python 3.10+ -### Running Tests - -Tests are organized into `unit/` (no external services) and `integration/` (requires MySQL + MinIO): +### Quick Start ```bash -# Install dependencies +# Clone and install +git clone https://github.com/datajoint/datajoint-python.git +cd datajoint-python pip install -e ".[test]" -# Run unit tests only (fast, no Docker needed) -pytest tests/unit/ +# Run all tests (containers start automatically via testcontainers) +pytest tests/ -# Start MySQL and MinIO for integration tests -docker compose up -d db minio +# Install and run pre-commit hooks +pip install pre-commit +pre-commit install +pre-commit run --all-files +``` -# Run all tests +### Running Tests + +Tests use [testcontainers](https://testcontainers.com/) to automatically manage MySQL and MinIO containers. +**No manual `docker-compose up` required** - containers start when tests run and stop afterward. + +```bash +# Run all tests (recommended) pytest tests/ +# Run with coverage report +pytest --cov-report term-missing --cov=datajoint tests/ + # Run specific test file pytest tests/integration/test_blob.py -v -# Stop services when done +# Run only unit tests (no containers needed) +pytest tests/unit/ +``` + +### Alternative: External Containers + +For development/debugging, you may prefer persistent containers that survive test runs: + +```bash +# Start containers manually +docker compose up -d db minio + +# Run tests using external containers +DJ_USE_EXTERNAL_CONTAINERS=1 pytest tests/ + +# Stop containers when done docker compose down ``` @@ -183,24 +210,46 @@ docker compose --profile test up djtest --build ### Alternative: Using pixi -[pixi](https://pixi.sh) users can run tests with automatic service management: +[pixi](https://pixi.sh) users can run tests with: ```bash pixi install # First time setup -pixi run test # Starts services and runs tests -pixi run services-down # Stop services +pixi run test # Runs tests (testcontainers manages containers) ``` ### Pre-commit Hooks +Pre-commit hooks run automatically on `git commit` to check code quality. +**All hooks must pass before committing.** + ```bash -pre-commit install # Install hooks (first time) -pre-commit run --all-files # Run all checks +# Install hooks (first time only) +pip install pre-commit +pre-commit install + +# Run all checks manually +pre-commit run --all-files + +# Run specific hook +pre-commit run ruff --all-files +pre-commit run codespell --all-files ``` +Hooks include: +- **ruff**: Python linting and formatting +- **codespell**: Spell checking +- **YAML/JSON/TOML validation** +- **Large file detection** + +### Before Submitting a PR + +1. **Run all tests**: `pytest tests/` +2. **Run pre-commit**: `pre-commit run --all-files` +3. 
**Check coverage**: `pytest --cov-report term-missing --cov=datajoint tests/` + ### Environment Variables -Tests use these defaults (configured in `pyproject.toml`): +For external container mode (`DJ_USE_EXTERNAL_CONTAINERS=1`): | Variable | Default | Description | |----------|---------|-------------| diff --git a/docker-compose.yaml b/docker-compose.yaml index 98a16f16..2c48ffd1 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,10 +1,15 @@ # Development environment with MySQL and MinIO services # -# Quick start: -# docker compose up -d db minio # Start services -# pytest tests/ # Run tests (uses localhost defaults) +# NOTE: docker-compose is OPTIONAL for running tests. +# Tests use testcontainers to automatically manage containers. +# Just run: pytest tests/ # -# Full Docker testing: +# Use docker-compose for development/debugging when you want +# persistent containers that survive test runs: +# docker compose up -d db minio # Start services manually +# pytest tests/ # Tests will use these containers +# +# Full Docker testing (CI): # docker compose --profile test up djtest --build services: db: diff --git a/docs/mkdocs.yaml b/docs/mkdocs.yaml index 4de4f58e..03c10f69 100644 --- a/docs/mkdocs.yaml +++ b/docs/mkdocs.yaml @@ -33,7 +33,7 @@ nav: - Blobs: design/tables/blobs.md - Attachments: design/tables/attach.md - Filepaths: design/tables/filepath.md - - Custom Datatypes: design/tables/customtype.md + - Custom Codecs: design/tables/codecs.md - Dependencies: design/tables/dependencies.md - Indexes: design/tables/indexes.md - Master-Part Relationships: design/tables/master-part.md diff --git a/docs/src/design/tables/attributes.md b/docs/src/design/tables/attributes.md index 2e8105e7..39a80ff6 100644 --- a/docs/src/design/tables/attributes.md +++ b/docs/src/design/tables/attributes.md @@ -5,109 +5,150 @@ To conserve database resources, use the smallest and most restrictive datatype sufficient for your data. This also ensures that only valid data are entered into the pipeline. -## Most common datatypes - -- `tinyint`: an 8-bit integer number, ranging from -128 to 127. -- `tinyint unsigned`: an 8-bit positive integer number, ranging from 0 to 255. -- `smallint`: a 16-bit integer number, ranging from -32,768 to 32,767. -- `smallint unsigned`: a 16-bit positive integer, ranging from 0 to 65,535. -- `int`: a 32-bit integer number, ranging from -2,147,483,648 to 2,147,483,647. -- `int unsigned`: a 32-bit positive integer, ranging from 0 to 4,294,967,295. -- `enum`: one of several explicitly enumerated values specified as strings. - Use this datatype instead of text strings to avoid spelling variations and to save - storage space. - For example, the datatype for an anesthesia attribute could be - `enum("urethane", "isoflurane", "fentanyl")`. - Do not use enums in primary keys due to the difficulty of changing their definitions - consistently in multiple tables. +## Core datatypes (recommended) + +Use these portable, scientist-friendly types for cross-database compatibility. + +### Integers + +- `int8`: 8-bit signed integer (-128 to 127) +- `uint8`: 8-bit unsigned integer (0 to 255) +- `int16`: 16-bit signed integer (-32,768 to 32,767) +- `uint16`: 16-bit unsigned integer (0 to 65,535) +- `int32`: 32-bit signed integer +- `uint32`: 32-bit unsigned integer +- `int64`: 64-bit signed integer +- `uint64`: 64-bit unsigned integer +- `bool`: boolean value (True/False, stored as 0/1) + +### Floating-point + +- `float32`: 32-bit single-precision floating-point. 
Sufficient for many measurements. +- `float64`: 64-bit double-precision floating-point. + Avoid using floating-point types in primary keys due to equality comparison issues. +- `decimal(n,f)`: fixed-point number with *n* total digits and *f* fractional digits. + Use for exact decimal representation (e.g., currency, coordinates). + Safe for primary keys due to well-defined precision. + +### Strings + +- `char(n)`: fixed-length string of exactly *n* characters. +- `varchar(n)`: variable-length string up to *n* characters. +- `text`: unlimited-length text for long-form content (notes, descriptions, abstracts). +- `enum(...)`: one of several enumerated values, e.g., `enum("low", "medium", "high")`. + Do not use enums in primary keys due to difficulty changing definitions. + +**Encoding policy:** All strings use UTF-8 encoding (`utf8mb4` in MySQL, `UTF8` in PostgreSQL). +Character encoding and collation are database-level configuration, not part of type definitions. +Comparisons are case-sensitive by default. + +### Date/Time - `date`: date as `'YYYY-MM-DD'`. -- `time`: time as `'HH:MM:SS'`. -- `datetime`: Date and time to the second as `'YYYY-MM-DD HH:MM:SS'` -- `timestamp`: Date and time to the second as `'YYYY-MM-DD HH:MM:SS'`. - The default value may be set to `CURRENT_TIMESTAMP`. - Unlike `datetime`, a `timestamp` value will be adjusted to the local time zone. - -- `char(N)`: a character string up to *N* characters (but always takes the entire *N* -bytes to store). -- `varchar(N)`: a text string of arbitrary length up to *N* characters that takes -*M+1* or *M+2* bytes of storage, where *M* is the actual length of each stored string. -- `float`: a single-precision floating-point number. - Takes 4 bytes. - Single precision is sufficient for many measurements. - -- `double`: a double-precision floating-point number. - Takes 8 bytes. - Because equality comparisons are error-prone, neither `float` nor `double` should be - used in primary keys. -- `decimal(N,F)`: a fixed-point number with *N* total decimal digits and *F* -fractional digits. - This datatype is well suited to represent numbers whose magnitude is well defined - and does not warrant the use of floating-point representation or requires precise - decimal representations (e.g. dollars and cents). - Because of its well-defined precision, `decimal` values can be used in equality - comparison and be included in primary keys. - -- `longblob`: raw binary data, up to 4 -[GiB](http://en.wikipedia.org/wiki/Gibibyte) in size. - Stores and returns raw bytes without serialization. - For serialized Python objects (arrays, dicts, etc.), use `` instead. - The `longblob` and other `blob` datatypes can be configured to store data - [externally](../../sysadmin/external-store.md) by using the `blob@store` syntax. - -## Less common (but supported) datatypes - -- `decimal(N,F) unsigned`: same as `decimal`, but limited to nonnegative values. -- `mediumint` a 24-bit integer number, ranging from -8,388,608 to 8,388,607. -- `mediumint unsigned`: a 24-bit positive integer, ranging from 0 to 16,777,216. -- `mediumblob`: arbitrary numeric array, up to 16 -[MiB](http://en.wikipedia.org/wiki/Mibibyte) -- `blob`: arbitrary numeric array, up to 64 -[KiB](http://en.wikipedia.org/wiki/Kibibyte) -- `tinyblob`: arbitrary numeric array, up to 256 bytes (actually smaller due to header -info). - -## Special DataJoint-only datatypes - -These types abstract certain kinds of non-database data to facilitate use -together with DataJoint. 
- -- ``: DataJoint's native serialization format for Python objects. Supports -NumPy arrays, dicts, lists, datetime objects, and nested structures. Compatible with -MATLAB. See [custom types](customtype.md) for details. - -- `object`: managed [file and folder storage](object.md) with support for direct writes -(Zarr, HDF5) and fsspec integration. Recommended for new pipelines. - -- `attach`: a [file attachment](attach.md) similar to email attachments facillitating -sending/receiving an opaque data file to/from a DataJoint pipeline. - -- `filepath@store`: a [filepath](filepath.md) used to link non-DataJoint managed files -into a DataJoint pipeline. - -- ``: a [custom attribute type](customtype.md) that defines bidirectional -conversion between Python objects and database storage formats. Use this to store -complex data types like graphs, domain-specific objects, or custom data structures. - -## Numeric type aliases - -DataJoint provides convenient type aliases that map to standard MySQL numeric types. +- `datetime`: date and time as `'YYYY-MM-DD HH:MM:SS'`. + Use `CURRENT_TIMESTAMP` as default for auto-populated timestamps. + +**Timezone policy:** All `datetime` values should be stored as **UTC**. Timezone +conversion is a presentation concern handled by the application layer. This ensures +reproducible computations regardless of server location or timezone settings. + +### Binary + +- `bytes`: raw binary data (up to 4 GiB). Stores and returns raw bytes without + serialization. For serialized Python objects (arrays, dicts, etc.), use ``. + +### Other + +- `uuid`: 128-bit universally unique identifier. +- `json`: JSON document for structured data. + +## Native datatypes (advanced) + +Native database types are available for advanced use cases but are **not recommended** +for portable pipelines. Using native types will generate a warning. + +- `tinyint`, `smallint`, `int`, `bigint` (with optional `unsigned`) +- `float`, `double`, `real` +- `tinyblob`, `blob`, `mediumblob`, `longblob` +- `tinytext`, `mediumtext`, `longtext` (size variants) +- `time`, `timestamp`, `year` +- `mediumint`, `serial`, `int auto_increment` + +See the [storage types spec](storage-types-spec.md) for complete mappings. + +## Codec types (special datatypes) + +Codecs provide `encode()`/`decode()` semantics for complex data that doesn't +fit native database types. They are denoted with angle brackets: ``. + +### Storage mode: `@` convention + +The `@` character indicates **external storage** (object store vs database): + +- **No `@`**: Internal storage (database) - e.g., ``, `` +- **`@` present**: External storage (object store) - e.g., ``, `` +- **`@` alone**: Use default store - e.g., `` +- **`@name`**: Use named store - e.g., `` + +### Built-in codecs + +**Serialization types** - for Python objects: + +- ``: DataJoint's native serialization format for Python objects. Supports + NumPy arrays, dicts, lists, datetime objects, and nested structures. Stores in + database. Compatible with MATLAB. See [custom codecs](codecs.md) for details. + +- `` / ``: Like `` but stores externally with hash- + addressed deduplication. Use for large arrays that may be duplicated across rows. + +**File storage types** - for managed files: + +- `` / ``: Managed file and folder storage with path derived + from primary key. Supports Zarr, HDF5, and direct writes via fsspec. Returns + `ObjectRef` for lazy access. External only. See [object storage](object.md). + +- `` / ``: Hash-addressed storage for raw bytes with + MD5 deduplication. 
External only. Use via `` or `` rather than directly. + +**File attachment types** - for file transfer: + +- ``: File attachment stored in database with filename preserved. Similar + to email attachments. Good for small files (<16MB). See [attachments](attach.md). + +- `` / ``: Like `` but stores externally with + deduplication. Use for large files. + +**File reference types** - for external files: + +- ``: Reference to existing file in a configured store. No file + copying occurs. Returns `ObjectRef` for lazy access. External only. See [filepath](filepath.md). + +### User-defined codecs + +- ``: Define your own [custom codec](codecs.md) with + bidirectional conversion between Python objects and database storage. Use for + graphs, domain-specific objects, or custom data structures. + +## Core type aliases + +DataJoint provides convenient type aliases that map to standard database types. These aliases use familiar naming conventions from NumPy and other numerical computing -libraries, making table definitions more readable and explicit about data precision. - -| Alias | MySQL Type | Description | -|-------|------------|-------------| -| `bool` | `tinyint` | Boolean value (0 or 1) | -| `int8` | `tinyint` | 8-bit signed integer (-128 to 127) | -| `uint8` | `tinyint unsigned` | 8-bit unsigned integer (0 to 255) | -| `int16` | `smallint` | 16-bit signed integer (-32,768 to 32,767) | -| `uint16` | `smallint unsigned` | 16-bit unsigned integer (0 to 65,535) | -| `int32` | `int` | 32-bit signed integer | -| `uint32` | `int unsigned` | 32-bit unsigned integer | -| `int64` | `bigint` | 64-bit signed integer | -| `uint64` | `bigint unsigned` | 64-bit unsigned integer | -| `float32` | `float` | 32-bit single-precision floating point | -| `float64` | `double` | 64-bit double-precision floating point | +libraries, making table definitions more readable and portable across database backends. 
+ +| Alias | MySQL | PostgreSQL | Description | +|-------|-------|------------|-------------| +| `bool` | `TINYINT` | `BOOLEAN` | Boolean value (0 or 1) | +| `int8` | `TINYINT` | `SMALLINT` | 8-bit signed integer (-128 to 127) | +| `uint8` | `TINYINT UNSIGNED` | `SMALLINT` | 8-bit unsigned integer (0 to 255) | +| `int16` | `SMALLINT` | `SMALLINT` | 16-bit signed integer | +| `uint16` | `SMALLINT UNSIGNED` | `INTEGER` | 16-bit unsigned integer | +| `int32` | `INT` | `INTEGER` | 32-bit signed integer | +| `uint32` | `INT UNSIGNED` | `BIGINT` | 32-bit unsigned integer | +| `int64` | `BIGINT` | `BIGINT` | 64-bit signed integer | +| `uint64` | `BIGINT UNSIGNED` | `NUMERIC(20)` | 64-bit unsigned integer | +| `float32` | `FLOAT` | `REAL` | 32-bit single-precision float | +| `float64` | `DOUBLE` | `DOUBLE PRECISION` | 64-bit double-precision float | +| `bytes` | `LONGBLOB` | `BYTEA` | Raw binary data | Example usage: @@ -115,22 +156,24 @@ Example usage: @schema class Measurement(dj.Manual): definition = """ - measurement_id : int + measurement_id : int32 --- temperature : float32 # single-precision temperature reading precise_value : float64 # double-precision measurement sample_count : uint32 # unsigned 32-bit counter sensor_flags : uint8 # 8-bit status flags is_valid : bool # boolean flag + raw_data : bytes # raw binary data + processed : # serialized Python object + large_array : # external storage with deduplication """ ``` ## Datatypes not (yet) supported -- `binary` -- `text` -- `longtext` -- `bit` +- `binary(n)` / `varbinary(n)` - use `bytes` instead +- `bit(n)` - use `int` types with bitwise operations +- `set(...)` - use `json` for multiple selections For additional information about these datatypes, see http://dev.mysql.com/doc/refman/5.6/en/data-types.html diff --git a/docs/src/design/tables/codec-spec.md b/docs/src/design/tables/codec-spec.md new file mode 100644 index 00000000..a3eefa57 --- /dev/null +++ b/docs/src/design/tables/codec-spec.md @@ -0,0 +1,766 @@ +# Codec Specification + +This document specifies the DataJoint Codec API for creating custom attribute types +that extend DataJoint's native type system. + +## Overview + +Codecs define bidirectional conversion between Python objects and database storage. +They enable storing complex data types (graphs, models, custom formats) while +maintaining DataJoint's query capabilities. + +``` +┌─────────────────┐ ┌─────────────────┐ +│ Python Object │ ──── encode ────► │ Storage Type │ +│ (e.g. Graph) │ │ (e.g. 
bytes) │ +│ │ ◄─── decode ──── │ │ +└─────────────────┘ └─────────────────┘ +``` + +## Quick Start + +```python +import datajoint as dj +import networkx as nx + +class GraphCodec(dj.Codec): + """Store NetworkX graphs.""" + + name = "graph" # Use as in definitions + + def get_dtype(self, is_external: bool) -> str: + return "" # Delegate to blob for serialization + + def encode(self, graph, *, key=None, store_name=None): + return { + 'nodes': list(graph.nodes(data=True)), + 'edges': list(graph.edges(data=True)), + } + + def decode(self, stored, *, key=None): + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G + +# Use in table definition +@schema +class Connectivity(dj.Manual): + definition = ''' + conn_id : int + --- + network : + ''' +``` + +## The Codec Base Class + +All custom codecs inherit from `dj.Codec`: + +```python +class Codec(ABC): + """Base class for codec types.""" + + name: str | None = None # Required: unique identifier + + def get_dtype(self, is_external: bool) -> str: + """Return the storage dtype.""" + raise NotImplementedError + + @abstractmethod + def encode(self, value, *, key=None, store_name=None) -> Any: + """Encode Python value for storage.""" + ... + + @abstractmethod + def decode(self, stored, *, key=None) -> Any: + """Decode stored value back to Python.""" + ... + + def validate(self, value) -> None: + """Optional: validate value before encoding.""" + pass +``` + +## Required Components + +### 1. The `name` Attribute + +The `name` class attribute is a unique identifier used in table definitions with +`` syntax: + +```python +class MyCodec(dj.Codec): + name = "mycodec" # Use as in definitions +``` + +Naming conventions: +- Use lowercase with underscores: `spike_train`, `graph_embedding` +- Avoid generic names that might conflict: prefer `lab_model` over `model` +- Names must be unique across all registered codecs + +### 2. The `get_dtype()` Method + +Returns the underlying storage type. The `is_external` parameter indicates whether +the `@` modifier is present in the table definition: + +```python +def get_dtype(self, is_external: bool) -> str: + """ + Args: + is_external: True if @ modifier present (e.g., ) + + Returns: + - A core type: "bytes", "json", "varchar(N)", "int32", etc. + - Another codec: "", "", etc. + + Raises: + DataJointError: If external storage not supported but @ is present + """ +``` + +Examples: + +```python +# Simple: always store as bytes +def get_dtype(self, is_external: bool) -> str: + return "bytes" + +# Different behavior for internal/external +def get_dtype(self, is_external: bool) -> str: + return "" if is_external else "bytes" + +# External-only codec +def get_dtype(self, is_external: bool) -> str: + if not is_external: + raise DataJointError(" requires @ (external storage only)") + return "json" +``` + +### 3. The `encode()` Method + +Converts Python objects to the format expected by `get_dtype()`: + +```python +def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> Any: + """ + Args: + value: The Python object to store + key: Primary key values (for context-dependent encoding) + store_name: Target store name (for external storage) + + Returns: + Value in the format expected by get_dtype() + """ +``` + +### 4. 
The `decode()` Method + +Converts stored values back to Python objects: + +```python +def decode(self, stored: Any, *, key: dict | None = None) -> Any: + """ + Args: + stored: Data retrieved from storage + key: Primary key values (for context-dependent decoding) + + Returns: + The reconstructed Python object + """ +``` + +### 5. The `validate()` Method (Optional) + +Called automatically before `encode()` during INSERT operations: + +```python +def validate(self, value: Any) -> None: + """ + Args: + value: The value to validate + + Raises: + TypeError: If the value has an incompatible type + ValueError: If the value fails domain validation + """ + if not isinstance(value, ExpectedType): + raise TypeError(f"Expected ExpectedType, got {type(value).__name__}") +``` + +## Auto-Registration + +Codecs automatically register when their class is defined. No decorator needed: + +```python +# This codec is registered automatically when the class is defined +class MyCodec(dj.Codec): + name = "mycodec" + # ... +``` + +### Skipping Registration + +For abstract base classes that shouldn't be registered: + +```python +class BaseCodec(dj.Codec, register=False): + """Abstract base - not registered.""" + name = None # Or omit entirely + +class ConcreteCodec(BaseCodec): + name = "concrete" # This one IS registered + # ... +``` + +### Registration Timing + +Codecs are registered at class definition time. Ensure your codec classes are +imported before any table definitions that use them: + +```python +# myproject/codecs.py +class GraphCodec(dj.Codec): + name = "graph" + ... + +# myproject/tables.py +import myproject.codecs # Ensure codecs are registered + +@schema +class Networks(dj.Manual): + definition = ''' + id : int + --- + network : + ''' +``` + +## Codec Composition (Chaining) + +Codecs can delegate to other codecs by returning `` from `get_dtype()`. +This enables layered functionality: + +```python +class CompressedJsonCodec(dj.Codec): + """Compress JSON data with zlib.""" + + name = "zjson" + + def get_dtype(self, is_external: bool) -> str: + return "" # Delegate serialization to blob codec + + def encode(self, value, *, key=None, store_name=None): + import json, zlib + json_bytes = json.dumps(value).encode('utf-8') + return zlib.compress(json_bytes) + + def decode(self, stored, *, key=None): + import json, zlib + json_bytes = zlib.decompress(stored) + return json.loads(json_bytes.decode('utf-8')) +``` + +### How Chaining Works + +When DataJoint encounters ``: + +1. Calls `ZjsonCodec.get_dtype(is_external=False)` → returns `""` +2. Calls `BlobCodec.get_dtype(is_external=False)` → returns `"bytes"` +3. Final storage type is `bytes` (LONGBLOB in MySQL) + +During INSERT: +1. `ZjsonCodec.encode()` converts Python dict → compressed bytes +2. `BlobCodec.encode()` packs bytes → DJ blob format +3. Stored in database + +During FETCH: +1. Read from database +2. `BlobCodec.decode()` unpacks DJ blob → compressed bytes +3. `ZjsonCodec.decode()` decompresses → Python dict + +### Built-in Codec Chains + +DataJoint's built-in codecs form these chains: + +``` + → bytes (internal) + → json (external) + + → bytes (internal) + → json (external) + + → json (external only) + → json (external only) + → json (external only) +``` + +### Store Name Propagation + +When using external storage (`@`), the store name propagates through the chain: + +```python +# Table definition +data : + +# Resolution: +# 1. MyCodec.get_dtype(is_external=True) → "" +# 2. BlobCodec.get_dtype(is_external=True) → "" +# 3. 
HashCodec.get_dtype(is_external=True) → "json" +# 4. store_name="coldstore" passed to HashCodec.encode() +``` + +## Plugin System (Entry Points) + +Codecs can be distributed as installable packages using Python entry points. + +### Package Structure + +``` +dj-graph-codecs/ +├── pyproject.toml +└── src/ + └── dj_graph_codecs/ + ├── __init__.py + └── codecs.py +``` + +### pyproject.toml + +```toml +[project] +name = "dj-graph-codecs" +version = "1.0.0" +dependencies = ["datajoint>=2.0", "networkx"] + +[project.entry-points."datajoint.codecs"] +graph = "dj_graph_codecs.codecs:GraphCodec" +weighted_graph = "dj_graph_codecs.codecs:WeightedGraphCodec" +``` + +### Codec Implementation + +```python +# src/dj_graph_codecs/codecs.py +import datajoint as dj +import networkx as nx + +class GraphCodec(dj.Codec): + name = "graph" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, graph, *, key=None, store_name=None): + return { + 'nodes': list(graph.nodes(data=True)), + 'edges': list(graph.edges(data=True)), + } + + def decode(self, stored, *, key=None): + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G + +class WeightedGraphCodec(dj.Codec): + name = "weighted_graph" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, graph, *, key=None, store_name=None): + return { + 'nodes': list(graph.nodes(data=True)), + 'edges': [(u, v, d) for u, v, d in graph.edges(data=True)], + } + + def decode(self, stored, *, key=None): + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + for u, v, d in stored['edges']: + G.add_edge(u, v, **d) + return G +``` + +### Usage After Installation + +```bash +pip install dj-graph-codecs +``` + +```python +# Codecs are automatically discovered and available +@schema +class Networks(dj.Manual): + definition = ''' + network_id : int + --- + topology : + weights : + ''' +``` + +### Entry Point Discovery + +DataJoint loads entry points lazily when a codec is first requested: + +1. Check explicit registry (codecs defined in current process) +2. Load entry points from `datajoint.codecs` group +3. Also checks legacy `datajoint.types` group for compatibility + +## API Reference + +### Module Functions + +```python +import datajoint as dj + +# List all registered codec names +dj.list_codecs() # Returns: ['blob', 'hash', 'object', 'attach', 'filepath', ...] 
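+
+# A hedged usage sketch: check the registry before requesting a codec to avoid the
+# "Unknown codec" error ("graph" is the example codec defined in Quick Start above)
+if "graph" in dj.list_codecs():
+    graph_codec = dj.get_codec("graph")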
+ +# Get a codec instance by name +codec = dj.get_codec("blob") +codec = dj.get_codec("") # Angle brackets are optional +codec = dj.get_codec("") # Store parameter is stripped +``` + +### Internal Functions (for advanced use) + +```python +from datajoint.codecs import ( + is_codec_registered, # Check if codec exists + unregister_codec, # Remove codec (testing only) + resolve_dtype, # Resolve codec chain + parse_type_spec, # Parse "" syntax +) +``` + +## Built-in Codecs + +DataJoint provides these built-in codecs: + +| Codec | Internal | External | Description | +|-------|----------|----------|-------------| +| `` | `bytes` | `` | DataJoint serialization for Python objects | +| `` | N/A | `json` | Content-addressed storage with MD5 deduplication | +| `` | N/A | `json` | Path-addressed storage for files/folders | +| `` | `bytes` | `` | File attachments with filename preserved | +| `` | N/A | `json` | Reference to existing files in store | + +## Complete Examples + +### Example 1: Simple Serialization + +```python +import datajoint as dj +import numpy as np + +class SpikeTrainCodec(dj.Codec): + """Efficient storage for sparse spike timing data.""" + + name = "spike_train" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def validate(self, value): + if not isinstance(value, np.ndarray): + raise TypeError("Expected numpy array of spike times") + if value.ndim != 1: + raise ValueError("Spike train must be 1-dimensional") + if len(value) > 1 and not np.all(np.diff(value) >= 0): + raise ValueError("Spike times must be sorted") + + def encode(self, spike_times, *, key=None, store_name=None): + # Store as differences (smaller values, better compression) + return np.diff(spike_times, prepend=0).astype(np.float32) + + def decode(self, stored, *, key=None): + # Reconstruct original spike times + return np.cumsum(stored).astype(np.float64) +``` + +### Example 2: External Storage + +```python +import datajoint as dj +import pickle + +class ModelCodec(dj.Codec): + """Store ML models with optional external storage.""" + + name = "model" + + def get_dtype(self, is_external: bool) -> str: + # Use hash-addressed storage for large models + return "" if is_external else "" + + def encode(self, model, *, key=None, store_name=None): + return pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL) + + def decode(self, stored, *, key=None): + return pickle.loads(stored) + + def validate(self, value): + # Check that model has required interface + if not hasattr(value, 'predict'): + raise TypeError("Model must have a predict() method") +``` + +Usage: +```python +@schema +class Models(dj.Manual): + definition = ''' + model_id : int + --- + small_model : # Internal storage + large_model : # External (default store) + archive_model : # External (specific store) + ''' +``` + +### Example 3: JSON with Schema Validation + +```python +import datajoint as dj +import jsonschema + +class ConfigCodec(dj.Codec): + """Store validated JSON configuration.""" + + name = "config" + + SCHEMA = { + "type": "object", + "properties": { + "version": {"type": "integer", "minimum": 1}, + "settings": {"type": "object"}, + }, + "required": ["version", "settings"], + } + + def get_dtype(self, is_external: bool) -> str: + return "json" + + def validate(self, value): + jsonschema.validate(value, self.SCHEMA) + + def encode(self, config, *, key=None, store_name=None): + return config # JSON type handles serialization + + def decode(self, stored, *, key=None): + return stored +``` + +### Example 4: Context-Dependent Encoding + 
+```python +import datajoint as dj + +class VersionedDataCodec(dj.Codec): + """Handle different encoding versions based on primary key.""" + + name = "versioned" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, value, *, key=None, store_name=None): + version = key.get("schema_version", 1) if key else 1 + if version >= 2: + return {"v": 2, "data": self._encode_v2(value)} + return {"v": 1, "data": self._encode_v1(value)} + + def decode(self, stored, *, key=None): + version = stored.get("v", 1) + if version >= 2: + return self._decode_v2(stored["data"]) + return self._decode_v1(stored["data"]) + + def _encode_v1(self, value): + return value + + def _decode_v1(self, data): + return data + + def _encode_v2(self, value): + # New encoding format + return {"optimized": True, "payload": value} + + def _decode_v2(self, data): + return data["payload"] +``` + +### Example 5: External-Only Codec + +```python +import datajoint as dj +from pathlib import Path + +class ZarrCodec(dj.Codec): + """Store Zarr arrays in object storage.""" + + name = "zarr" + + def get_dtype(self, is_external: bool) -> str: + if not is_external: + raise dj.DataJointError(" requires @ (external storage only)") + return "" # Delegate to object storage + + def encode(self, value, *, key=None, store_name=None): + import zarr + import tempfile + + # If already a path, pass through + if isinstance(value, (str, Path)): + return str(value) + + # If zarr array, save to temp and return path + if isinstance(value, zarr.Array): + tmpdir = tempfile.mkdtemp() + path = Path(tmpdir) / "data.zarr" + zarr.save(path, value) + return str(path) + + raise TypeError(f"Expected zarr.Array or path, got {type(value)}") + + def decode(self, stored, *, key=None): + # ObjectCodec returns ObjectRef, use its fsmap for zarr + import zarr + return zarr.open(stored.fsmap, mode='r') +``` + +## Best Practices + +### 1. Choose Appropriate Storage Types + +| Data Type | Recommended `get_dtype()` | +|-----------|---------------------------| +| Python objects (dicts, arrays) | `""` | +| Large binary data | `""` (external) | +| Files/folders (Zarr, HDF5) | `""` (external) | +| Simple JSON-serializable | `"json"` | +| Short strings | `"varchar(N)"` | +| Numeric identifiers | `"int32"`, `"int64"` | + +### 2. Handle None Values + +Nullable columns may pass `None` to your codec: + +```python +def encode(self, value, *, key=None, store_name=None): + if value is None: + return None # Pass through for nullable columns + return self._actual_encode(value) + +def decode(self, stored, *, key=None): + if stored is None: + return None + return self._actual_decode(stored) +``` + +### 3. Test Round-Trips + +Always verify that `decode(encode(x)) == x`: + +```python +def test_codec_roundtrip(): + codec = MyCodec() + + test_values = [ + {"key": "value"}, + [1, 2, 3], + np.array([1.0, 2.0]), + ] + + for original in test_values: + encoded = codec.encode(original) + decoded = codec.decode(encoded) + assert decoded == original or np.array_equal(decoded, original) +``` + +### 4. Include Validation + +Catch errors early with `validate()`: + +```python +def validate(self, value): + if not isinstance(value, ExpectedType): + raise TypeError(f"Expected ExpectedType, got {type(value).__name__}") + + if not self._is_valid(value): + raise ValueError("Value fails validation constraints") +``` + +### 5. Document Expected Formats + +Include docstrings explaining input/output formats: + +```python +class MyCodec(dj.Codec): + """ + Store MyType objects. 
+ + Input format (encode): + MyType instance with attributes: x, y, z + + Storage format: + Dict with keys: 'x', 'y', 'z' + + Output format (decode): + MyType instance reconstructed from storage + """ +``` + +### 6. Consider Versioning + +If your encoding format might change: + +```python +def encode(self, value, *, key=None, store_name=None): + return { + "_version": 2, + "_data": self._encode_v2(value), + } + +def decode(self, stored, *, key=None): + version = stored.get("_version", 1) + data = stored.get("_data", stored) + + if version == 1: + return self._decode_v1(data) + return self._decode_v2(data) +``` + +## Error Handling + +### Common Errors + +| Error | Cause | Solution | +|-------|-------|----------| +| `Unknown codec: ` | Codec not registered | Import module defining codec before table definition | +| `Codec already registered` | Duplicate name | Use unique names; check for conflicts | +| ` requires @` | External-only codec used without @ | Add `@` or `@store` to attribute type | +| `Circular codec reference` | Codec chain forms a loop | Check `get_dtype()` return values | + +### Debugging + +```python +# Check what codecs are registered +print(dj.list_codecs()) + +# Inspect a codec +codec = dj.get_codec("mycodec") +print(f"Name: {codec.name}") +print(f"Internal dtype: {codec.get_dtype(is_external=False)}") +print(f"External dtype: {codec.get_dtype(is_external=True)}") + +# Resolve full chain +from datajoint.codecs import resolve_dtype +final_type, chain, store = resolve_dtype("") +print(f"Final storage type: {final_type}") +print(f"Codec chain: {[c.name for c in chain]}") +print(f"Store: {store}") +``` diff --git a/docs/src/design/tables/codecs.md b/docs/src/design/tables/codecs.md new file mode 100644 index 00000000..ccc9db1f --- /dev/null +++ b/docs/src/design/tables/codecs.md @@ -0,0 +1,553 @@ +# Custom Codecs + +In modern scientific research, data pipelines often involve complex workflows that +generate diverse data types. From high-dimensional imaging data to machine learning +models, these data types frequently exceed the basic representations supported by +traditional relational databases. For example: + ++ A lab working on neural connectivity might use graph objects to represent brain + networks. ++ Researchers processing raw imaging data might store custom objects for pre-processing + configurations. ++ Computational biologists might store fitted machine learning models or parameter + objects for downstream predictions. + +To handle these diverse needs, DataJoint provides the **Codec** system. It +enables researchers to store and retrieve complex, non-standard data types—like Python +objects or data structures—in a relational database while maintaining the +reproducibility, modularity, and query capabilities required for scientific workflows. + +## Overview + +Custom codecs define bidirectional conversion between: + +- **Python objects** (what your code works with) +- **Storage format** (what gets stored in the database) + +``` +┌─────────────────┐ encode() ┌─────────────────┐ +│ Python Object │ ───────────────► │ Storage Type │ +│ (e.g. Graph) │ │ (e.g. bytes) │ +└─────────────────┘ decode() └─────────────────┘ + ◄─────────────── +``` + +## Defining Custom Codecs + +Create a custom codec by subclassing `dj.Codec` and implementing the required +methods. 
Codecs auto-register when their class is defined: + +```python +import datajoint as dj +import networkx as nx + +class GraphCodec(dj.Codec): + """Custom codec for storing networkx graphs.""" + + # Required: unique identifier used in table definitions + name = "graph" + + def get_dtype(self, is_external: bool) -> str: + """Return the underlying storage type.""" + return "" # Delegate to blob for serialization + + def encode(self, graph, *, key=None, store_name=None): + """Convert graph to storable format (called on INSERT).""" + return { + 'nodes': list(graph.nodes(data=True)), + 'edges': list(graph.edges(data=True)), + } + + def decode(self, stored, *, key=None): + """Convert stored data back to graph (called on FETCH).""" + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G +``` + +### Required Components + +| Component | Description | +|-----------|-------------| +| `name` | Unique identifier used in table definitions with `` syntax | +| `get_dtype(is_external)` | Returns underlying storage type (e.g., `""`, `"bytes"`, `"json"`) | +| `encode(value, *, key=None, store_name=None)` | Converts Python object to storable format | +| `decode(stored, *, key=None)` | Converts stored data back to Python object | + +### Using Custom Codecs in Tables + +Once defined, use the codec in table definitions with angle brackets: + +```python +@schema +class Connectivity(dj.Manual): + definition = """ + conn_id : int + --- + conn_graph = null : # Uses the GraphCodec we defined + """ +``` + +Insert and fetch work seamlessly: + +```python +import networkx as nx + +# Insert - encode() is called automatically +g = nx.lollipop_graph(4, 2) +Connectivity.insert1({"conn_id": 1, "conn_graph": g}) + +# Fetch - decode() is called automatically +result = (Connectivity & "conn_id = 1").fetch1("conn_graph") +assert isinstance(result, nx.Graph) +``` + +## Auto-Registration + +Codecs automatically register when their class is defined. No decorator needed: + +```python +# This codec is registered automatically when the class is defined +class MyCodec(dj.Codec): + name = "mycodec" + ... +``` + +### Skipping Registration + +For abstract base classes that shouldn't be registered: + +```python +class BaseCodec(dj.Codec, register=False): + """Abstract base - not registered.""" + name = None + +class ConcreteCodec(BaseCodec): + name = "concrete" # This one IS registered + ... +``` + +### Listing Registered Codecs + +```python +# List all registered codec names +print(dj.list_codecs()) +``` + +## Validation + +Add data validation by overriding the `validate()` method. It's called automatically +before `encode()` during INSERT operations: + +```python +class PositiveArrayCodec(dj.Codec): + name = "positive_array" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def validate(self, value): + """Ensure all values are positive.""" + import numpy as np + if not isinstance(value, np.ndarray): + raise TypeError(f"Expected numpy array, got {type(value).__name__}") + if np.any(value < 0): + raise ValueError("Array must contain only positive values") + + def encode(self, array, *, key=None, store_name=None): + return array + + def decode(self, stored, *, key=None): + return stored +``` + +## The `get_dtype()` Method + +The `get_dtype()` method specifies how data is stored. 
The `is_external` parameter +indicates whether the `@` modifier is present: + +```python +def get_dtype(self, is_external: bool) -> str: + """ + Args: + is_external: True if @ modifier present (e.g., ) + + Returns: + - A core type: "bytes", "json", "varchar(N)", etc. + - Another codec: "", "", etc. + """ +``` + +### Storage Type Options + +| Return Value | Use Case | Database Type | +|--------------|----------|---------------| +| `"bytes"` | Raw binary data | LONGBLOB | +| `"json"` | JSON-serializable data | JSON | +| `"varchar(N)"` | String representations | VARCHAR(N) | +| `"int32"` | Integer identifiers | INT | +| `""` | Serialized Python objects | Depends on internal/external | +| `""` | Large objects with deduplication | JSON (external only) | +| `""` | Chain to another codec | Varies | + +### External Storage + +For large data, use external storage with the `@` modifier: + +```python +class LargeArrayCodec(dj.Codec): + name = "large_array" + + def get_dtype(self, is_external: bool) -> str: + # Use hash-addressed external storage for large data + return "" if is_external else "" + + def encode(self, array, *, key=None, store_name=None): + import pickle + return pickle.dumps(array) + + def decode(self, stored, *, key=None): + import pickle + return pickle.loads(stored) +``` + +Usage: +```python +@schema +class Data(dj.Manual): + definition = ''' + id : int + --- + small_array : # Internal (in database) + big_array : # External (default store) + archive : # External (specific store) + ''' +``` + +## Codec Chaining + +Custom codecs can build on other codecs by returning `` from `get_dtype()`: + +```python +class CompressedGraphCodec(dj.Codec): + name = "compressed_graph" + + def get_dtype(self, is_external: bool) -> str: + return "" # Chain to the GraphCodec + + def encode(self, graph, *, key=None, store_name=None): + # Compress before passing to GraphCodec + return self._compress(graph) + + def decode(self, stored, *, key=None): + # GraphCodec's decode already ran, decompress result + return self._decompress(stored) +``` + +DataJoint automatically resolves the chain to find the final storage type. + +### How Chaining Works + +When DataJoint encounters ``: + +1. `CompressedGraphCodec.get_dtype()` returns `""` +2. `GraphCodec.get_dtype()` returns `""` +3. `BlobCodec.get_dtype()` returns `"bytes"` +4. Final storage type is `bytes` (LONGBLOB in MySQL) + +During INSERT, encoders run outer → inner: +1. `CompressedGraphCodec.encode()` → compressed graph +2. `GraphCodec.encode()` → edge list dict +3. `BlobCodec.encode()` → serialized bytes + +During FETCH, decoders run inner → outer (reverse order). + +## The Key Parameter + +The `key` parameter provides access to primary key values during encode/decode +operations. This is useful when the conversion depends on record context: + +```python +class ContextAwareCodec(dj.Codec): + name = "context_aware" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, value, *, key=None, store_name=None): + if key and key.get("version") == 2: + return self._encode_v2(value) + return self._encode_v1(value) + + def decode(self, stored, *, key=None): + if key and key.get("version") == 2: + return self._decode_v2(stored) + return self._decode_v1(stored) +``` + +## Publishing Codecs as Packages + +Custom codecs can be distributed as installable packages using Python entry points. +This allows codecs to be automatically discovered when the package is installed. 
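+
+As a quick check after installing such a package, request the codec by name — a
+minimal sketch, assuming the hypothetical `dj-graph-codecs` package described in the
+sections below has been installed:
+
+```python
+import datajoint as dj
+
+# Discovered automatically from the package's "datajoint.codecs" entry points
+graph_codec = dj.get_codec("graph")
+print(graph_codec.name)  # "graph"
+```
+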
+ +### Package Structure + +``` +dj-graph-codecs/ +├── pyproject.toml +└── src/ + └── dj_graph_codecs/ + ├── __init__.py + └── codecs.py +``` + +### pyproject.toml + +```toml +[project] +name = "dj-graph-codecs" +version = "1.0.0" +dependencies = ["datajoint>=2.0", "networkx"] + +[project.entry-points."datajoint.codecs"] +graph = "dj_graph_codecs.codecs:GraphCodec" +weighted_graph = "dj_graph_codecs.codecs:WeightedGraphCodec" +``` + +### Codec Implementation + +```python +# src/dj_graph_codecs/codecs.py +import datajoint as dj +import networkx as nx + +class GraphCodec(dj.Codec): + name = "graph" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, graph, *, key=None, store_name=None): + return { + 'nodes': list(graph.nodes(data=True)), + 'edges': list(graph.edges(data=True)), + } + + def decode(self, stored, *, key=None): + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G + +class WeightedGraphCodec(dj.Codec): + name = "weighted_graph" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, graph, *, key=None, store_name=None): + return [(u, v, d) for u, v, d in graph.edges(data=True)] + + def decode(self, edges, *, key=None): + g = nx.Graph() + for u, v, d in edges: + g.add_edge(u, v, **d) + return g +``` + +### Usage After Installation + +```bash +pip install dj-graph-codecs +``` + +```python +# Codecs are automatically available after package installation +@schema +class MyTable(dj.Manual): + definition = """ + id : int + --- + network : + weighted_network : + """ +``` + +## Complete Example + +Here's a complete example demonstrating custom codecs for a neuroscience workflow: + +```python +import datajoint as dj +import numpy as np + +# Define custom codecs +class SpikeTrainCodec(dj.Codec): + """Efficient storage for sparse spike timing data.""" + name = "spike_train" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def validate(self, value): + if not isinstance(value, np.ndarray): + raise TypeError("Expected numpy array of spike times") + if value.ndim != 1: + raise ValueError("Spike train must be 1-dimensional") + if len(value) > 1 and not np.all(np.diff(value) >= 0): + raise ValueError("Spike times must be sorted") + + def encode(self, spike_times, *, key=None, store_name=None): + # Store as differences (smaller values, better compression) + return np.diff(spike_times, prepend=0).astype(np.float32) + + def decode(self, stored, *, key=None): + # Reconstruct original spike times + return np.cumsum(stored).astype(np.float64) + + +class WaveformCodec(dj.Codec): + """Storage for spike waveform templates with metadata.""" + name = "waveform" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, waveform_dict, *, key=None, store_name=None): + return { + "data": waveform_dict["data"].astype(np.float32), + "sampling_rate": waveform_dict["sampling_rate"], + "channel_ids": list(waveform_dict["channel_ids"]), + } + + def decode(self, stored, *, key=None): + return { + "data": stored["data"].astype(np.float64), + "sampling_rate": stored["sampling_rate"], + "channel_ids": np.array(stored["channel_ids"]), + } + + +# Create schema and tables +schema = dj.schema("ephys_analysis") + +@schema +class Unit(dj.Manual): + definition = """ + unit_id : int + --- + spike_times : + waveform : + quality : enum('good', 'mua', 'noise') + """ + + +# Usage +spike_times = np.array([0.1, 0.15, 0.23, 0.45, 0.67, 0.89]) +waveform = { + "data": np.random.randn(82, 
4), + "sampling_rate": 30000, + "channel_ids": [10, 11, 12, 13], +} + +Unit.insert1({ + "unit_id": 1, + "spike_times": spike_times, + "waveform": waveform, + "quality": "good", +}) + +# Fetch - automatically decoded +result = (Unit & "unit_id = 1").fetch1() +print(f"Spike times: {result['spike_times']}") +print(f"Waveform shape: {result['waveform']['data'].shape}") +``` + +## Built-in Codecs + +DataJoint includes several built-in codecs: + +### `` - DataJoint Blob Serialization + +The `` codec provides DataJoint's native binary serialization. It supports: + +- NumPy arrays (compatible with MATLAB) +- Python dicts, lists, tuples, sets +- datetime objects, Decimals, UUIDs +- Nested data structures +- Optional compression + +```python +@schema +class ProcessedData(dj.Manual): + definition = """ + data_id : int + --- + results : # Internal (serialized in database) + large_results : # External (hash-addressed storage) + """ +``` + +### `` - Content-Addressed Storage + +Stores raw bytes using MD5 content hashing with automatic deduplication. +External storage only. + +### `` - Path-Addressed Storage + +Stores files and folders at paths derived from primary keys. Ideal for +Zarr arrays, HDF5 files, and multi-file outputs. External storage only. + +### `` - File Attachments + +Stores files with filename preserved. Supports internal and external storage. + +### `` - File References + +References existing files in configured stores without copying. +External storage only. + +## Best Practices + +1. **Choose descriptive codec names**: Use lowercase with underscores (e.g., `spike_train`, `graph_embedding`) + +2. **Select appropriate storage types**: Use `` for complex objects, `json` for simple structures, `` or `` for large data + +3. **Add validation**: Use `validate()` to catch data errors early + +4. **Document your codecs**: Include docstrings explaining the expected input/output formats + +5. **Handle None values**: Your encode/decode methods may receive `None` for nullable attributes + +6. **Consider versioning**: If your encoding format might change, include version information + +7. **Test round-trips**: Ensure `decode(encode(x)) == x` for all valid inputs + +```python +def test_graph_codec_roundtrip(): + import networkx as nx + g = nx.lollipop_graph(4, 2) + codec = GraphCodec() + + encoded = codec.encode(g) + decoded = codec.decode(encoded) + + assert set(g.edges) == set(decoded.edges) +``` + +## API Reference + +```python +import datajoint as dj + +# List all registered codecs +dj.list_codecs() + +# Get a codec instance +codec = dj.get_codec("blob") +codec = dj.get_codec("") # Angle brackets optional +codec = dj.get_codec("") # Store parameter stripped +``` + +For the complete Codec API specification, see [Codec Specification](codec-spec.md). diff --git a/docs/src/design/tables/customtype.md b/docs/src/design/tables/customtype.md deleted file mode 100644 index 267e0420..00000000 --- a/docs/src/design/tables/customtype.md +++ /dev/null @@ -1,614 +0,0 @@ -# Custom Attribute Types - -In modern scientific research, data pipelines often involve complex workflows that -generate diverse data types. From high-dimensional imaging data to machine learning -models, these data types frequently exceed the basic representations supported by -traditional relational databases. For example: - -+ A lab working on neural connectivity might use graph objects to represent brain - networks. -+ Researchers processing raw imaging data might store custom objects for pre-processing - configurations. 
-+ Computational biologists might store fitted machine learning models or parameter - objects for downstream predictions. - -To handle these diverse needs, DataJoint provides the **AttributeType** system. It -enables researchers to store and retrieve complex, non-standard data types—like Python -objects or data structures—in a relational database while maintaining the -reproducibility, modularity, and query capabilities required for scientific workflows. - -## Overview - -Custom attribute types define bidirectional conversion between: - -- **Python objects** (what your code works with) -- **Storage format** (what gets stored in the database) - -``` -┌─────────────────┐ encode() ┌─────────────────┐ -│ Python Object │ ───────────────► │ Storage Type │ -│ (e.g. Graph) │ │ (e.g. blob) │ -└─────────────────┘ decode() └─────────────────┘ - ◄─────────────── -``` - -## Defining Custom Types - -Create a custom type by subclassing `dj.AttributeType` and implementing the required -methods: - -```python -import datajoint as dj -import networkx as nx - -@dj.register_type -class GraphType(dj.AttributeType): - """Custom type for storing networkx graphs.""" - - # Required: unique identifier used in table definitions - type_name = "graph" - - # Required: underlying DataJoint storage type - dtype = "longblob" - - def encode(self, graph, *, key=None): - """Convert graph to storable format (called on INSERT).""" - return list(graph.edges) - - def decode(self, edges, *, key=None): - """Convert stored data back to graph (called on FETCH).""" - return nx.Graph(edges) -``` - -### Required Components - -| Component | Description | -|-----------|-------------| -| `type_name` | Unique identifier used in table definitions with `` syntax | -| `dtype` | Underlying DataJoint type for storage (e.g., `"longblob"`, `"varchar(255)"`, `"json"`) | -| `encode(value, *, key=None)` | Converts Python object to storable format | -| `decode(stored, *, key=None)` | Converts stored data back to Python object | - -### Using Custom Types in Tables - -Once registered, use the type in table definitions with angle brackets: - -```python -@schema -class Connectivity(dj.Manual): - definition = """ - conn_id : int - --- - conn_graph = null : # Uses the GraphType we defined - """ -``` - -Insert and fetch work seamlessly: - -```python -import networkx as nx - -# Insert - encode() is called automatically -g = nx.lollipop_graph(4, 2) -Connectivity.insert1({"conn_id": 1, "conn_graph": g}) - -# Fetch - decode() is called automatically -result = (Connectivity & "conn_id = 1").fetch1("conn_graph") -assert isinstance(result, nx.Graph) -``` - -## Type Registration - -### Decorator Registration - -The simplest way to register a type is with the `@dj.register_type` decorator: - -```python -@dj.register_type -class MyType(dj.AttributeType): - type_name = "my_type" - ... -``` - -### Direct Registration - -You can also register types explicitly: - -```python -class MyType(dj.AttributeType): - type_name = "my_type" - ... - -dj.register_type(MyType) -``` - -### Listing Registered Types - -```python -# List all registered type names -print(dj.list_types()) -``` - -## Validation - -Add data validation by overriding the `validate()` method. 
It's called automatically -before `encode()` during INSERT operations: - -```python -@dj.register_type -class PositiveArrayType(dj.AttributeType): - type_name = "positive_array" - dtype = "longblob" - - def validate(self, value): - """Ensure all values are positive.""" - import numpy as np - if not isinstance(value, np.ndarray): - raise TypeError(f"Expected numpy array, got {type(value).__name__}") - if np.any(value < 0): - raise ValueError("Array must contain only positive values") - - def encode(self, array, *, key=None): - return array - - def decode(self, stored, *, key=None): - return stored -``` - -## Storage Types (dtype) - -The `dtype` property specifies how data is stored in the database: - -| dtype | Use Case | Stored Format | -|-------|----------|---------------| -| `"longblob"` | Complex Python objects, arrays | Serialized binary | -| `"blob"` | Smaller objects | Serialized binary | -| `"json"` | JSON-serializable data | JSON string | -| `"varchar(N)"` | String representations | Text | -| `"int"` | Integer identifiers | Integer | -| `"blob@store"` | Large objects in external storage | UUID reference | -| `"object"` | Files/folders in object storage | JSON metadata | -| `""` | Chain to another custom type | Varies | - -### External Storage - -For large data, use external blob storage: - -```python -@dj.register_type -class LargeArrayType(dj.AttributeType): - type_name = "large_array" - dtype = "blob@mystore" # Uses external store named "mystore" - - def encode(self, array, *, key=None): - return array - - def decode(self, stored, *, key=None): - return stored -``` - -## Type Chaining - -Custom types can build on other custom types by referencing them in `dtype`: - -```python -@dj.register_type -class CompressedGraphType(dj.AttributeType): - type_name = "compressed_graph" - dtype = "" # Chain to the GraphType - - def encode(self, graph, *, key=None): - # Compress before passing to GraphType - return self._compress(graph) - - def decode(self, stored, *, key=None): - # GraphType's decode already ran - return self._decompress(stored) -``` - -DataJoint automatically resolves the chain to find the final storage type. - -## The Key Parameter - -The `key` parameter provides access to primary key values during encode/decode -operations. This is useful when the conversion depends on record context: - -```python -@dj.register_type -class ContextAwareType(dj.AttributeType): - type_name = "context_aware" - dtype = "longblob" - - def encode(self, value, *, key=None): - if key and key.get("version") == 2: - return self._encode_v2(value) - return self._encode_v1(value) - - def decode(self, stored, *, key=None): - if key and key.get("version") == 2: - return self._decode_v2(stored) - return self._decode_v1(stored) -``` - -## Publishing Custom Types as Packages - -Custom types can be distributed as installable packages using Python entry points. -This allows types to be automatically discovered when the package is installed. 
- -### Package Structure - -``` -dj-graph-types/ -├── pyproject.toml -└── src/ - └── dj_graph_types/ - ├── __init__.py - └── types.py -``` - -### pyproject.toml - -```toml -[project] -name = "dj-graph-types" -version = "1.0.0" - -[project.entry-points."datajoint.types"] -graph = "dj_graph_types.types:GraphType" -weighted_graph = "dj_graph_types.types:WeightedGraphType" -``` - -### Type Implementation - -```python -# src/dj_graph_types/types.py -import datajoint as dj -import networkx as nx - -class GraphType(dj.AttributeType): - type_name = "graph" - dtype = "longblob" - - def encode(self, graph, *, key=None): - return list(graph.edges) - - def decode(self, edges, *, key=None): - return nx.Graph(edges) - -class WeightedGraphType(dj.AttributeType): - type_name = "weighted_graph" - dtype = "longblob" - - def encode(self, graph, *, key=None): - return [(u, v, d) for u, v, d in graph.edges(data=True)] - - def decode(self, edges, *, key=None): - g = nx.Graph() - g.add_weighted_edges_from(edges) - return g -``` - -### Usage After Installation - -```bash -pip install dj-graph-types -``` - -```python -# Types are automatically available after package installation -@schema -class MyTable(dj.Manual): - definition = """ - id : int - --- - network : - weighted_network : - """ -``` - -## Complete Example - -Here's a complete example demonstrating custom types for a neuroscience workflow: - -```python -import datajoint as dj -import numpy as np - -# Configure DataJoint -dj.config["database.host"] = "localhost" -dj.config["database.user"] = "root" -dj.config["database.password"] = "password" - -# Define custom types -@dj.register_type -class SpikeTrainType(dj.AttributeType): - """Efficient storage for sparse spike timing data.""" - type_name = "spike_train" - dtype = "longblob" - - def validate(self, value): - if not isinstance(value, np.ndarray): - raise TypeError("Expected numpy array of spike times") - if value.ndim != 1: - raise ValueError("Spike train must be 1-dimensional") - if not np.all(np.diff(value) >= 0): - raise ValueError("Spike times must be sorted") - - def encode(self, spike_times, *, key=None): - # Store as differences (smaller values, better compression) - return np.diff(spike_times, prepend=0).astype(np.float32) - - def decode(self, stored, *, key=None): - # Reconstruct original spike times - return np.cumsum(stored).astype(np.float64) - - -@dj.register_type -class WaveformType(dj.AttributeType): - """Storage for spike waveform templates with metadata.""" - type_name = "waveform" - dtype = "longblob" - - def encode(self, waveform_dict, *, key=None): - return { - "data": waveform_dict["data"].astype(np.float32), - "sampling_rate": waveform_dict["sampling_rate"], - "channel_ids": list(waveform_dict["channel_ids"]), - } - - def decode(self, stored, *, key=None): - return { - "data": stored["data"].astype(np.float64), - "sampling_rate": stored["sampling_rate"], - "channel_ids": np.array(stored["channel_ids"]), - } - - -# Create schema and tables -schema = dj.schema("ephys_analysis") - -@schema -class Unit(dj.Manual): - definition = """ - unit_id : int - --- - spike_times : - waveform : - quality : enum('good', 'mua', 'noise') - """ - - -# Usage -spike_times = np.array([0.1, 0.15, 0.23, 0.45, 0.67, 0.89]) -waveform = { - "data": np.random.randn(82, 4), - "sampling_rate": 30000, - "channel_ids": [10, 11, 12, 13], -} - -Unit.insert1({ - "unit_id": 1, - "spike_times": spike_times, - "waveform": waveform, - "quality": "good", -}) - -# Fetch - automatically decoded -result = (Unit & "unit_id = 
1").fetch1() -print(f"Spike times: {result['spike_times']}") -print(f"Waveform shape: {result['waveform']['data'].shape}") -``` - -## Migration from AttributeAdapter - -The `AttributeAdapter` class is deprecated. Migrate to `AttributeType`: - -### Before (deprecated) - -```python -class GraphAdapter(dj.AttributeAdapter): - attribute_type = "longblob" - - def put(self, obj): - return list(obj.edges) - - def get(self, value): - return nx.Graph(value) - -# Required context-based registration -graph = GraphAdapter() -schema = dj.schema("mydb", context={"graph": graph}) -``` - -### After (recommended) - -```python -@dj.register_type -class GraphType(dj.AttributeType): - type_name = "graph" - dtype = "longblob" - - def encode(self, obj, *, key=None): - return list(obj.edges) - - def decode(self, value, *, key=None): - return nx.Graph(value) - -# Global registration - no context needed -schema = dj.schema("mydb") -``` - -### Key Differences - -| Aspect | AttributeAdapter (deprecated) | AttributeType (recommended) | -|--------|-------------------------------|----------------------------| -| Methods | `put()` / `get()` | `encode()` / `decode()` | -| Storage type | `attribute_type` | `dtype` | -| Type name | Variable name in context | `type_name` property | -| Registration | Context dict per schema | Global `@register_type` decorator | -| Validation | Manual | Built-in `validate()` method | -| Distribution | Copy adapter code | Entry point packages | -| Key access | Not available | Optional `key` parameter | - -## Best Practices - -1. **Choose descriptive type names**: Use lowercase with underscores (e.g., `spike_train`, `graph_embedding`) - -2. **Select appropriate storage types**: Use `` for complex objects, `json` for simple structures, external storage for large data - -3. **Add validation**: Use `validate()` to catch data errors early - -4. **Document your types**: Include docstrings explaining the expected input/output formats - -5. **Handle None values**: Your encode/decode methods may receive `None` for nullable attributes - -6. **Consider versioning**: If your encoding format might change, include version information - -7. **Test round-trips**: Ensure `decode(encode(x)) == x` for all valid inputs - -```python -def test_graph_type_roundtrip(): - g = nx.lollipop_graph(4, 2) - t = GraphType() - - encoded = t.encode(g) - decoded = t.decode(encoded) - - assert set(g.edges) == set(decoded.edges) -``` - -## Built-in Types - -DataJoint includes a built-in type for explicit blob serialization: - -### `` - DataJoint Blob Serialization - -The `` type provides explicit control over DataJoint's native binary -serialization. It supports: - -- NumPy arrays (compatible with MATLAB) -- Python dicts, lists, tuples, sets -- datetime objects, Decimals, UUIDs -- Nested data structures -- Optional compression - -```python -@schema -class ProcessedData(dj.Manual): - definition = """ - data_id : int - --- - results : # Serialized Python objects - raw_bytes : longblob # Raw bytes (no serialization) - """ -``` - -#### When to Use `` - -- **Serialized data**: When storing Python objects (dicts, arrays, etc.) 
-
-- **New tables**: Prefer `<djblob>` for automatic serialization
-- **Migration**: Existing schemas with implicit serialization must migrate
-
-#### Raw Blob Behavior
-
-Plain `longblob` (and other blob variants) columns now store and return
-**raw bytes** without automatic serialization:
-
-```python
-@schema
-class RawData(dj.Manual):
-    definition = """
-    id : int
-    ---
-    raw_bytes : longblob     # Stores/returns raw bytes
-    serialized : <djblob>    # Stores Python objects with serialization
-    """
-
-# Raw bytes - no serialization
-RawData.insert1({"id": 1, "raw_bytes": b"raw binary data", "serialized": {"key": "value"}})
-
-row = (RawData & "id=1").fetch1()
-row["raw_bytes"]   # Returns: b"raw binary data"
-row["serialized"]  # Returns: {"key": "value"}
-```
-
-**Important**: Existing schemas that relied on implicit blob serialization
-must be migrated to `<djblob>` to preserve their behavior.
-
-## Schema Migration
-
-When upgrading existing schemas to use explicit type declarations, DataJoint
-provides migration utilities.
-
-### Analyzing Blob Columns
-
-```python
-import datajoint as dj
-
-schema = dj.schema("my_database")
-
-# Check migration status
-status = dj.migrate.check_migration_status(schema)
-print(f"Blob columns: {status['total_blob_columns']}")
-print(f"Already migrated: {status['migrated']}")
-print(f"Pending migration: {status['pending']}")
-```
-
-### Generating Migration SQL
-
-```python
-# Preview migration (dry run)
-result = dj.migrate.migrate_blob_columns(schema, dry_run=True)
-for sql in result['sql_statements']:
-    print(sql)
-```
-
-### Applying Migration
-
-```python
-# Apply migration
-result = dj.migrate.migrate_blob_columns(schema, dry_run=False)
-print(f"Migrated {result['migrated']} columns")
-```
-
-### Migration Details
-
-The migration updates MySQL column comments to include the type declaration.
-This is a **metadata-only** change - the actual blob data format is unchanged.
-
-All blob type variants are handled: `tinyblob`, `blob`, `mediumblob`, `longblob`.
-
-Before migration:
-- Column: `longblob` (or `blob`, `mediumblob`, etc.)
-- Comment: `user comment`
-- Behavior: Auto-serialization (implicit)
-
-After migration:
-- Column: `longblob` (unchanged)
-- Comment: `:<djblob>:user comment`
-- Behavior: Explicit serialization via `<djblob>`
-
-### Updating Table Definitions
-
-After database migration, update your Python table definitions for consistency:
-
-```python
-# Before
-class MyTable(dj.Manual):
-    definition = """
-    id : int
-    ---
-    data : longblob    # stored data
-    """
-
-# After
-class MyTable(dj.Manual):
-    definition = """
-    id : int
-    ---
-    data : <djblob>    # stored data
-    """
-```
-
-Both definitions work identically after migration, but using `<djblob>` makes
-the serialization explicit and documents the intended behavior.
diff --git a/docs/src/design/tables/object-type-spec.md b/docs/src/design/tables/object-type-spec.md
deleted file mode 100644
index 24fb2b4a..00000000
--- a/docs/src/design/tables/object-type-spec.md
+++ /dev/null
@@ -1,1473 +0,0 @@
-# Object Column Type Specification
-
-## Overview
-
-The `object` type introduces a new paradigm for managed file storage in DataJoint. Unlike existing `attach@store` and `filepath@store` types that reference named stores, the `object` type uses a **unified storage backend** that is tightly coupled with the schema and configured at the pipeline level.
-
-The `object` type supports both **files and folders**. Content is copied to storage at insert time, referenced via handle on fetch, and deleted when the record is deleted.
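-
-The sections below specify this behavior in detail. As a quick orientation, here is a
-minimal sketch of the full lifecycle (schema and attribute names are illustrative and a
-configured default store is assumed):
-
-```python
-import datajoint as dj
-
-schema = dj.schema("my_schema")
-
-@schema
-class Recording(dj.Manual):
-    definition = """
-    subject_id : int
-    session_id : int
-    ---
-    raw_data : object    # managed file or folder in the default store
-    """
-
-# Insert (copy mode): the local file is copied into managed storage
-Recording.insert1({"subject_id": 123, "session_id": 45,
-                   "raw_data": "/local/path/to/recording.dat"})
-
-# Fetch: returns an ObjectRef handle; content is read from the store, not copied locally
-ref = (Recording & {"subject_id": 123, "session_id": 45}).fetch1("raw_data")
-content = ref.read()
-
-# Delete: removing the record also removes the stored object
-(Recording & {"subject_id": 123, "session_id": 45}).delete()
-```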
-
-### Immutability Contract
-
-Objects stored via the `object` type are **immutable after finalization**. Users agree to:
-- **Insert (copy)**: Copy existing content to storage
-- **Insert (staged)**: Reserve path, write directly, then finalize
-- **Fetch**: Read content via handle (no modification)
-- **Delete**: Remove content when record is deleted (only way to remove)
-
-Once an object is **finalized** (either via copy-insert or staged-insert completion), users must not directly modify it in the object store.
-
-#### Two Insert Modes
-
-| Mode | Use Case | Workflow |
-|------|----------|----------|
-| **Copy** | Small files, existing data | Local file → copy to storage → insert record |
-| **Staged** | Large objects, Zarr, TileDB | Reserve path → write directly to storage → finalize record |
-
-### Augmented Schema vs External References
-
-The `object` type implements an **object-augmented schema (OAS)** — a paradigm where the object store becomes a true extension of the relational database:
-
-- **DataJoint fully controls** the object store lifecycle
-- **Only DataJoint writes** to the object store (users may have direct read access)
-- **Tight coupling** between database and object store
-- **Joint transaction management** on objects and database records
-- **Single backend per pipeline** — all managed objects live together
-
-This is fundamentally different from **external references**, where DataJoint merely points to user-managed data:
-
-| Aspect | `object` (Augmented Schema) | `filepath@store` (External Reference) |
-|--------|----------------------------|--------------------------------------|
-| **Ownership** | DataJoint owns the data | User owns the data |
-| **Writes** | Only via DataJoint | User writes directly |
-| **Deletion** | DataJoint deletes on record delete | User manages lifecycle |
-| **Multi-backend** | Single backend per pipeline | Multiple named stores |
-| **Use case** | Pipeline-generated data | Collaborator data, legacy assets |
-
-**When to use each:**
-
-- Use `object` for data that DataJoint should own and manage as part of the schema (e.g., processed results, derived datasets)
-- Use `filepath@store` for referencing externally-managed data across multiple backends (e.g., collaborator data on different cloud providers, legacy data that shouldn't be moved)
-
-## Storage Architecture
-
-### Default and Named Stores
-
-Each DataJoint pipeline has a **default storage backend** plus optional **named stores**, all configured in `datajoint.json`. DataJoint fully controls the path structure within each store.
- -```python -@schema -class Recording(dj.Manual): - definition = """ - subject_id : int - session_id : int - --- - raw_data : object # uses default store - published : object@public # uses 'public' named store - """ -``` - -**All stores follow OAS principles:** -- DataJoint owns the lifecycle (insert/delete/fetch as a unit) -- Same deterministic path structure (`project/schema/Table/objects/...`) -- Same access control alignment with database -- Each store has its own `datajoint_store.json` metadata file - -**Why support multiple stores?** -- Different access policies (private vs public buckets) -- Different storage tiers (hot vs cold storage) -- Organizational requirements (data sovereignty, compliance) - -**Why require explicit store configuration?** -- All stores must be registered for OAS semantics -- Credential management aligns with database access control (platform-managed) -- Orphan cleanup operates per-store with full knowledge of configured stores - -### Access Control Patterns - -The deterministic path structure (`project/schema/Table/objects/pk=val/...`) enables **prefix-based access control policies** on each storage backend. - -**Supported access control levels:** - -| Level | Implementation | Example Policy Prefix | -|-------|---------------|----------------------| -| Project-level | IAM/bucket policy | `my-bucket/my_project/*` | -| Schema-level | IAM/bucket policy | `my-bucket/my_project/lab_internal/*` | -| Table-level | IAM/bucket policy | `my-bucket/my_project/schema/SensitiveTable/*` | -| Row-level | Per-object ACL or signed URLs | Future enhancement | - -**Example: Private and public data in separate stores** - -``` -# Default store (private) -s3://internal-bucket/my_project/ -└── lab_schema/ - └── ProcessingResults/ - └── objects/... - -# Named 'public' store -s3://public-bucket/my_project/ -└── lab_schema/ - └── PublishedDatasets/ - └── objects/... -``` - -Alternatively, use prefix-based policies within a single bucket if preferred. - -**Row-level access control** (access to objects for specific primary key values) is not directly supported by object store policies. Future versions may address this via DataJoint-generated signed URLs that project database permissions onto object access. - -### Supported Backends - -DataJoint uses **[`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/)** to ensure compatibility across multiple storage backends: - -- **Local storage** – POSIX-compliant file systems (e.g., NFS, SMB) -- **Cloud-based object storage** – Amazon S3, Google Cloud Storage, Azure Blob, MinIO - -## Project Structure - -A DataJoint project creates a structured hierarchical storage pattern: - -``` -📁 project_name/ -├── datajoint_store.json # store metadata (not client config) -├── 📁 schema_name/ -│ ├── 📁 Table1/ -│ │ ├── data.parquet # tabular data export (future) -│ │ └── 📁 objects/ # object storage for this table -│ │ ├── pk1=val1/pk2=val2/field1_token.dat -│ │ └── pk1=val1/pk2=val2/field2_token.zarr -│ ├── 📁 Table2/ -│ │ ├── data.parquet -│ │ └── 📁 objects/ -│ │ └── ... 
-``` - -### Object Storage Keys - -When using cloud object storage: - -``` -s3://bucket/project_name/schema_name/Table1/objects/pk1=val1/field_token.dat -s3://bucket/project_name/schema_name/Table1/objects/pk1=val1/field_token.zarr -``` - -## Configuration - -### Settings Structure - -Object storage is configured in `datajoint.json` using the existing settings system: - -```json -{ - "database.host": "localhost", - "database.user": "datajoint", - - "object_storage.project_name": "my_project", - "object_storage.protocol": "s3", - "object_storage.endpoint": "s3.amazonaws.com", - "object_storage.bucket": "my-bucket", - "object_storage.location": "my_project", - "object_storage.partition_pattern": "{subject_id}/{session_id}" -} -``` - -For local filesystem storage: - -```json -{ - "object_storage.project_name": "my_project", - "object_storage.protocol": "file", - "object_storage.location": "/data/my_project", - "object_storage.partition_pattern": "{subject_id}/{session_id}" -} -``` - -### Named Stores - -Additional stores can be defined using the `object_storage.stores.` prefix: - -```json -{ - "object_storage.project_name": "my_project", - "object_storage.protocol": "s3", - "object_storage.bucket": "internal-bucket", - "object_storage.location": "my_project", - - "object_storage.stores.public.protocol": "s3", - "object_storage.stores.public.bucket": "public-bucket", - "object_storage.stores.public.location": "my_project" -} -``` - -Named stores inherit `project_name` from the default configuration but can override all other settings. Use named stores with the `object@store_name` syntax: - -```python -@schema -class Dataset(dj.Manual): - definition = """ - dataset_id : int - --- - internal_data : object # default store (internal-bucket) - published_data : object@public # public store (public-bucket) - """ -``` - -Each named store: -- Must be explicitly configured (no ad-hoc URLs) -- Has its own `datajoint_store.json` metadata file -- Follows the same OAS lifecycle semantics as the default store -- Credentials are managed at the platform level, aligned with database access control - -### Settings Schema - -| Setting | Type | Required | Description | -|---------|------|----------|-------------| -| `object_storage.project_name` | string | Yes | Unique project identifier (must match store metadata) | -| `object_storage.protocol` | string | Yes | Storage backend: `file`, `s3`, `gcs`, `azure` | -| `object_storage.location` | string | Yes | Base path or bucket prefix | -| `object_storage.bucket` | string | For cloud | Bucket name (S3, GCS, Azure) | -| `object_storage.endpoint` | string | For S3 | S3 endpoint URL | -| `object_storage.partition_pattern` | string | No | Path pattern with `{attribute}` placeholders | -| `object_storage.token_length` | int | No | Random suffix length for filenames (default: 8, range: 4-16) | -| `object_storage.access_key` | string | For cloud | Access key (can use secrets file) | -| `object_storage.secret_key` | string | For cloud | Secret key (can use secrets file) | - -### Configuration Immutability - -**CRITICAL**: Once a project has been instantiated (i.e., `datajoint_store.json` has been created and the first object stored), the following settings MUST NOT be changed: - -- `object_storage.project_name` -- `object_storage.protocol` -- `object_storage.bucket` -- `object_storage.location` -- `object_storage.partition_pattern` - -Changing these settings after objects have been stored will result in **broken references**—existing paths stored in the database will no 
longer resolve to valid storage locations. - -DataJoint validates `project_name` against `datajoint_store.json` on connect, but administrators must ensure other settings remain consistent across all clients for the lifetime of the project. - -### Environment Variables - -Settings can be overridden via environment variables: - -```bash -DJ_OBJECT_STORAGE_PROTOCOL=s3 -DJ_OBJECT_STORAGE_BUCKET=my-bucket -DJ_OBJECT_STORAGE_LOCATION=my_project -DJ_OBJECT_STORAGE_PARTITION_PATTERN="subject{subject_id}/session{session_id}" -``` - -### Secrets - -Credentials can be stored in the `.secrets/` directory: - -``` -.secrets/ -├── object_storage.access_key -└── object_storage.secret_key -``` - -### Partition Pattern - -The partition pattern is configured **per pipeline** (one per settings file). Placeholders use `{attribute_name}` syntax and are replaced with primary key values. - -```json -{ - "object_storage.partition_pattern": "subject{subject_id}/session{session_id}" -} -``` - -**Example with partitioning:** - -``` -s3://my-bucket/my_project/subject_id=123/session_id=45/schema_name/Recording/objects/raw_data_Ax7bQ2kM.dat -``` - -If no partition pattern is specified, files are organized directly under `{location}/{schema}/{Table}/objects/`. - -## Store Metadata (`datajoint_store.json`) - -Each object store contains a metadata file at its root that identifies the store and enables verification by DataJoint clients. This file is named `datajoint_store.json` to distinguish it from client configuration files (`datajoint.json`). - -### Location - -``` -{location}/datajoint_store.json -``` - -For cloud storage: -``` -s3://bucket/my_project/datajoint_store.json -``` - -### Content - -```json -{ - "project_name": "my_project", - "created": "2025-01-15T10:30:00Z", - "format_version": "1.0", - "datajoint_version": "0.15.0", - "database_host": "db.example.com", - "database_name": "my_project_db" -} -``` - -### Schema - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `project_name` | string | Yes | Unique project identifier | -| `created` | string | Yes | ISO 8601 timestamp of store creation | -| `format_version` | string | Yes | Store format version for compatibility | -| `datajoint_version` | string | Yes | DataJoint version that created the store | -| `database_host` | string | No | Database server hostname (for bidirectional mapping) | -| `database_name` | string | No | Database name on the server (for bidirectional mapping) | - -The `database_name` field exists for DBMS platforms that support multiple databases on a single server (e.g., PostgreSQL, MySQL). The object storage configuration is **shared across all schemas comprising the pipeline**—it's a pipeline-level setting, not a per-schema setting. - -The optional `database_host` and `database_name` fields enable bidirectional mapping between object stores and databases: - -- **Forward**: Client settings → object store location -- **Reverse**: Object store metadata → originating database - -This is informational only—not enforced at runtime. Administrators can alternatively ensure unique `project_name` values across their namespace, and managed platforms may handle this mapping externally. - -### Store Initialization - -The store metadata file is created when the first `object` attribute is used: - -``` -┌─────────────────────────────────────────────────────────┐ -│ 1. Client attempts first file operation │ -├─────────────────────────────────────────────────────────┤ -│ 2. 
Check if datajoint_store.json exists │ -│ ├─ If exists: verify project_name matches │ -│ └─ If not: create with current project_name │ -├─────────────────────────────────────────────────────────┤ -│ 3. On mismatch: raise DataJointError │ -└─────────────────────────────────────────────────────────┘ -``` - -### Client Verification - -DataJoint performs a basic verification on connect to ensure store-database cohesion: - -1. **On connect**: Client reads `datajoint_store.json` from store -2. **Verify**: `project_name` in client settings matches store metadata -3. **On mismatch**: Raise `DataJointError` with descriptive message - -```python -# Example error -DataJointError: Object store project name mismatch. - Client configured: "project_a" - Store metadata: "project_b" - Ensure all clients use the same object_storage.project_name setting. -``` - -### Administrative Responsibility - -A 1:1 correspondence is assumed between: -- Database location + `project_name` in client settings -- Object store + `project_name` in store metadata - -DataJoint performs basic verification but does **not** enforce this mapping. Administrators are responsible for ensuring correct configuration across all clients. - -## Syntax - -```python -@schema -class Recording(dj.Manual): - definition = """ - subject_id : int - session_id : int - --- - raw_data : object # uses default store - processed : object # another object attribute (default store) - published : object@public # uses named 'public' store - """ -``` - -- `object` — uses the default storage backend -- `object@store_name` — uses a named store (must be configured in settings) - -## Database Storage - -The `object` type is stored as a `JSON` column in MySQL containing: - -**File in default store:** -```json -{ - "store": null, - "url": "s3://my-bucket/my_project/my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_Ax7bQ2kM.dat", - "path": "my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_Ax7bQ2kM.dat", - "size": 12345, - "hash": null, - "ext": ".dat", - "is_dir": false, - "timestamp": "2025-01-15T10:30:00Z", - "mime_type": "application/octet-stream" -} -``` - -**File in named store:** -```json -{ - "store": "public", - "url": "s3://public-bucket/my_project/my_schema/Dataset/objects/dataset_id=1/published_data_Bx8cD3kM.dat", - "path": "my_schema/Dataset/objects/dataset_id=1/published_data_Bx8cD3kM.dat", - "size": 12345, - "hash": "sha256:abcdef1234...", - "ext": ".dat", - "is_dir": false, - "timestamp": "2025-01-15T10:30:00Z", - "mime_type": "application/octet-stream" -} -``` - -**Folder example:** -```json -{ - "store": null, - "url": "s3://my-bucket/my_project/my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_pL9nR4wE", - "path": "my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_pL9nR4wE", - "size": 567890, - "hash": null, - "ext": null, - "is_dir": true, - "timestamp": "2025-01-15T10:30:00Z", - "item_count": 42 -} -``` - -**Zarr example (large dataset, metadata fields omitted for performance):** -```json -{ - "store": null, - "url": "s3://my-bucket/my_project/my_schema/Recording/objects/subject_id=123/session_id=45/neural_data_kM3nP2qR.zarr", - "path": "my_schema/Recording/objects/subject_id=123/session_id=45/neural_data_kM3nP2qR.zarr", - "size": null, - "hash": null, - "ext": ".zarr", - "is_dir": true, - "timestamp": "2025-01-15T10:30:00Z" -} -``` - -### JSON Schema - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `store` | string/null | Yes | Store name 
(e.g., `"public"`), or `null` for default store | -| `url` | string | Yes | Full URL including protocol and bucket (e.g., `s3://bucket/path`) | -| `path` | string | Yes | Relative path within store (excludes protocol/bucket, includes token) | -| `size` | integer/null | No | Total size in bytes (sum for folders), or null if not computed. See [Performance Considerations](#performance-considerations). | -| `hash` | string/null | Yes | Content hash with algorithm prefix, or null (default) | -| `ext` | string/null | Yes | File extension as tooling hint (e.g., `.dat`, `.zarr`) or null. See [Extension Field](#extension-field). | -| `is_dir` | boolean | Yes | True if stored content is a directory/key-prefix (e.g., Zarr store) | -| `timestamp` | string | Yes | ISO 8601 upload timestamp | -| `mime_type` | string | No | MIME type (files only, auto-detected from extension) | -| `item_count` | integer | No | Number of files (folders only), or null if not computed. See [Performance Considerations](#performance-considerations). | - -**Why both `url` and `path`?** -- `url`: Self-describing, enables cross-validation, robust to config changes -- `path`: Enables store name re-derivation at migration time, consistent structure across stores -- At migration, the store name can be derived by matching `url` against configured stores - -### Extension Field - -The `ext` field is a **tooling hint** that preserves the original file extension or provides a conventional suffix for directory-based formats. It is: - -- **Not a content-type declaration**: Unlike `mime_type`, it does not attempt to describe the internal content format -- **Useful for tooling**: Enables file browsers, IDEs, and other tools to display appropriate icons or suggest applications -- **Conventional for formats like Zarr**: The `.zarr` extension is recognized by the ecosystem even though a Zarr store contains mixed content (JSON metadata + binary chunks) - -For single files, `ext` is extracted from the source filename. For staged inserts (like Zarr), it can be explicitly provided. - -### Performance Considerations - -For large hierarchical data like Zarr stores, computing certain metadata can be expensive: - -- **`size`**: Requires listing all objects and summing their sizes. For stores with millions of chunks, this can take minutes or hours. -- **`item_count`**: Requires listing all objects. Same performance concern as `size`. -- **`hash`**: Requires reading all content. Explicitly not supported for staged inserts. - -**These fields are optional** and default to `null` for staged inserts. Users can explicitly request computation when needed, understanding the performance implications. - -### Content Hashing - -By default, **no content hash is computed** to avoid performance overhead for large objects. Storage backend integrity is trusted. 
- -**Explicit hash control** via insert kwarg: - -```python -# Default - no hash (fast) -Recording.insert1({..., "raw_data": "/path/to/large.dat"}) - -# Explicit hash request - user specifies algorithm -Recording.insert1({..., "raw_data": "/path/to/important.dat"}, hash="sha256") - -# Other supported algorithms -Recording.insert1({..., "raw_data": "/path/to/data.bin"}, hash="md5") -Recording.insert1({..., "raw_data": "/path/to/large.bin"}, hash="xxhash") # xxh3, faster for large files -``` - -**Design principles:** - -- **Explicit over implicit**: No automatic hashing based on file size or other heuristics -- **User controls the tradeoff**: User decides when integrity verification is worth the performance cost -- **Files only**: Hash applies to files, not folders (folders use manifests for integrity) -- **Staged inserts**: Hash is always `null` regardless of kwarg—data flows directly to storage without a local copy to hash - -Supported hash algorithms: `sha256`, `md5`, `xxhash` (xxh3, faster for large files) - -### Folder Manifests - -For folders (directories), a **manifest file** is created alongside the folder in the object store to enable integrity verification without computing content hashes: - -``` -raw_data_pL9nR4wE/ -raw_data_pL9nR4wE.manifest.json -``` - -**Manifest content:** -```json -{ - "files": [ - {"path": "file1.dat", "size": 1234}, - {"path": "subdir/file2.dat", "size": 5678}, - {"path": "subdir/file3.dat", "size": 91011} - ], - "total_size": 567890, - "item_count": 42, - "created": "2025-01-15T10:30:00Z" -} -``` - -**Design rationale:** -- Stored in object store (not database) to avoid bloating the JSON for folders with many files -- Placed alongside folder (not inside) to avoid polluting folder contents and interfering with tools like Zarr -- Enables self-contained verification without database access - -The manifest enables: -- Quick verification that all expected files exist -- Size validation without reading file contents -- Detection of missing or extra files - -### Filename Convention - -The stored filename is **always derived from the field name**: -- **Base name**: The attribute/field name (e.g., `raw_data`) -- **Extension**: Adopted from source file (copy insert) or optionally provided (staged insert) -- **Token**: Random suffix for collision avoidance - -``` -Stored filename = {field}_{token}{ext} - -Examples: - raw_data_Ax7bQ2kM.dat # file with .dat extension - raw_data_pL9nR4wE.zarr # Zarr directory with .zarr extension - raw_data_kM3nP2qR # directory without extension -``` - -This convention ensures: -- Consistent, predictable naming across all objects -- Field name visible in storage for easier debugging -- Extension preserved for MIME type detection and tooling compatibility - -## Path Generation - -Storage paths are **deterministically constructed** from record metadata, enabling bidirectional lookup between database records and stored files. - -### Path Components - -1. **Location** - from configuration (`object_storage.location`) -2. **Partition attributes** - promoted PK attributes (if `partition_pattern` configured) -3. **Schema name** - from the table's schema -4. **Table name** - the table class name -5. **Object directory** - `objects/` -6. **Primary key encoding** - remaining PK attributes and values -7. 
**Suffixed filename** - `{field}_{token}{ext}` - -### Path Template - -**Without partitioning:** -``` -{location}/{schema}/{Table}/objects/{pk_attr1}={pk_val1}/{pk_attr2}={pk_val2}/.../{field}_{token}{ext} -``` - -**With partitioning:** -``` -{location}/{partition_attr}={val}/.../schema/{Table}/objects/{remaining_pk_attrs}/.../{field}_{token}{ext} -``` - -Note: The `objects/` directory follows the table name, allowing each table folder to also contain tabular data exports (e.g., `data.parquet`) alongside the objects. - -### Partitioning - -The **partition pattern** allows promoting certain primary key attributes to the beginning of the path (after `location`). This organizes storage by high-level attributes like subject or experiment, enabling: -- Efficient data locality for related records -- Easier manual browsing of storage -- Potential for storage tiering by partition - -**Configuration:** -```json -{ - "object_storage.partition_pattern": "{subject_id}/{experiment_id}" -} -``` - -Partition attributes are extracted from the primary key and placed at the path root. Remaining PK attributes appear in their normal position. - -### Example Without Partitioning - -For a table: -```python -@schema -class Recording(dj.Manual): - definition = """ - subject_id : int - session_id : int - --- - raw_data : object - """ -``` - -Inserting `{"subject_id": 123, "session_id": 45, "raw_data": "/path/to/recording.dat"}` produces: - -``` -my_project/my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_Ax7bQ2kM.dat -``` - -Note: The filename is `raw_data` (field name) with `.dat` extension (from source file). - -### Example With Partitioning - -With `partition_pattern = "{subject_id}"`: - -``` -my_project/subject_id=123/my_schema/Recording/objects/session_id=45/raw_data_Ax7bQ2kM.dat -``` - -The `subject_id` is promoted to the path root, grouping all files for subject 123 together regardless of schema or table. - -### Deterministic Bidirectional Mapping - -The path structure (excluding the random token) is fully deterministic: -- **Record → File**: Given a record's primary key, construct the path prefix to locate its file -- **File → Record**: Parse the path to extract schema, table, field, and primary key values - -This enables: -- Finding all files for a specific record -- Identifying which record a file belongs to -- Auditing storage against database contents - -The **random token** is stored in the JSON metadata to complete the full path. 
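-
-For illustration, a hypothetical helper (not DataJoint's implementation) that builds the
-record-to-path direction under these rules, assuming no partition pattern and omitting
-the configured `location` prefix:
-
-```python
-import secrets
-
-def object_path(schema, table, key, field, ext="", token_length=8):
-    """Return {schema}/{Table}/objects/{pk}={val}/.../{field}_{token}{ext}."""
-    pk_part = "/".join(f"{attr}={val}" for attr, val in key.items())
-    # URL-safe random token, as generated by secrets.token_urlsafe()
-    token = secrets.token_urlsafe(token_length)[:token_length]
-    return f"{schema}/{table}/objects/{pk_part}/{field}_{token}{ext}"
-
-# e.g. "my_schema/Recording/objects/subject_id=123/session_id=45/raw_data_Ax7bQ2kM.dat"
-print(object_path("my_schema", "Recording",
-                  {"subject_id": 123, "session_id": 45}, "raw_data", ".dat"))
-```
-
-Parsing such a path back into schema, table, field, and primary key values gives the
-file-to-record direction described above.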
- -### Primary Key Value Encoding - -Primary key values are encoded directly in paths when they are simple, path-safe types: -- **Integers**: Used directly (`subject_id=123`) -- **Dates**: ISO format (`session_date=2025-01-15`) -- **Timestamps**: ISO format with safe separators (`created=2025-01-15T10-30-00`) -- **Simple strings**: Used directly if path-safe (`experiment=baseline`) - -**Conversion to path-safe strings** is applied only when necessary: -- Strings containing `/`, `\`, or other path-unsafe characters -- Very long strings (truncated with hash suffix) -- Binary or complex types (hashed) - -```python -# Direct encoding (no conversion needed) -subject_id=123 -session_date=2025-01-15 -trial_type=control - -# Converted encoding (path-unsafe characters) -filename=my%2Ffile.dat # "/" encoded -description=a1b2c3d4_abc123 # long string truncated + hash -``` - -### Filename Collision Avoidance - -To prevent filename collisions, each stored object receives a **random token suffix** appended to the field name: - -``` -field: raw_data, source: recording.dat -stored: raw_data_Ax7bQ2kM.dat - -field: image, source: scan.tiff -stored: image_pL9nR4wE.tiff - -field: neural_data (staged with .zarr) -stored: neural_data_kM3nP2qR.zarr -``` - -#### Token Suffix Specification - -- **Alphabet**: URL-safe and filename-safe Base64 characters: `A-Z`, `a-z`, `0-9`, `-`, `_` -- **Length**: Configurable via `object_storage.token_length` (default: 8, range: 4-16) -- **Generation**: Cryptographically random using `secrets.token_urlsafe()` - -At 8 characters with 64 possible values per character: 64^8 = 281 trillion combinations. - -#### Rationale - -- Avoids collisions without requiring existence checks -- Field name visible in storage for easier debugging/auditing -- URL-safe for web-based access to cloud storage -- Filesystem-safe across all supported platforms - -### No Deduplication - -Each insert stores a separate copy of the file, even if identical content was previously stored. This ensures: -- Clear 1:1 relationship between records and files -- Simplified delete behavior -- No reference counting complexity - -## Insert Behavior - -At insert time, the `object` attribute accepts: - -1. **Local file path** (string or `Path`): Path to an existing local file (extension extracted) -2. **Local folder path** (string or `Path`): Path to an existing local directory -3. **Remote URL** (string): URL to remote file or folder (`s3://`, `gs://`, `az://`, `http://`, `https://`) -4. 
**Tuple of (ext, stream)**: File-like object with explicit extension - -```python -# From local file path - extension (.dat) extracted from source -Recording.insert1({ - "subject_id": 123, - "session_id": 45, - "raw_data": "/local/path/to/recording.dat" -}) -# Stored as: raw_data_Ax7bQ2kM.dat - -# From local folder path - no extension -Recording.insert1({ - "subject_id": 123, - "session_id": 45, - "raw_data": "/local/path/to/data_folder/" -}) -# Stored as: raw_data_pL9nR4wE/ - -# From remote URL - copies from source to managed storage -Recording.insert1({ - "subject_id": 123, - "session_id": 45, - "raw_data": "s3://source-bucket/path/to/data.dat" -}) -# Stored as: raw_data_kM3nP2qR.dat - -# From remote Zarr store (e.g., collaborator data on GCS) -Recording.insert1({ - "subject_id": 123, - "session_id": 45, - "neural_data": "gs://collaborator-bucket/shared/experiment.zarr" -}) -# Copied to managed storage as: neural_data_pL9nR4wE.zarr - -# From stream with explicit extension -with open("/local/path/data.bin", "rb") as f: - Recording.insert1({ - "subject_id": 123, - "session_id": 45, - "raw_data": (".bin", f) - }) -# Stored as: raw_data_xY8zW3vN.bin -``` - -### Remote URL Support - -Remote URLs are detected by protocol prefix and handled via fsspec: - -| Protocol | Example | Notes | -|----------|---------|-------| -| `s3://` | `s3://bucket/path/file.dat` | AWS S3, MinIO | -| `gs://` | `gs://bucket/path/file.dat` | Google Cloud Storage | -| `az://` | `az://container/path/file.dat` | Azure Blob Storage | -| `http://` | `http://server/path/file.dat` | HTTP (read-only source) | -| `https://` | `https://server/path/file.dat` | HTTPS (read-only source) | - -**Authentication**: Remote sources may require credentials. fsspec uses standard credential discovery (environment variables, config files, IAM roles). For cross-cloud copies, ensure credentials are configured for both source and destination. - -**Performance note**: For large remote-to-remote copies, data flows through the client. This is acceptable for most use cases but may be slow for very large datasets. Future optimizations could include server-side copy for same-provider transfers. - -### Insert Processing Steps - -1. Validate input (file/folder exists, stream is readable) -2. Generate deterministic storage path with random token -3. **Copy content to storage backend** via `fsspec` -4. **If copy fails: abort insert** (no database operation attempted) -5. Compute content hash if requested (optional, default: no hash) -6. Build JSON metadata structure -7. Execute database INSERT - -### Copy-First Semantics - -The file/folder is copied to storage **before** the database insert is attempted: -- If the copy fails, the insert does not proceed -- If the copy succeeds but the database insert fails, an orphaned file may remain -- Orphaned files are acceptable due to the random token (no collision with future inserts) - -### Staged Insert (Direct Write Mode) - -For large objects like Zarr arrays, copying from local storage is inefficient. **Staged insert** allows writing directly to the destination. - -#### Why a Separate Method? - -Staged insert uses a dedicated `staged_insert1` method rather than co-opting `insert1` because: - -1. **Explicit over implicit** - Staged inserts have fundamentally different semantics (file creation happens during context, commit on exit). A separate method makes this explicit. -2. **Backward compatibility** - `insert1` returns `None` and doesn't support context manager protocol. 
Changing this could break existing code. -3. **Clear error handling** - The context manager semantics (success = commit, exception = rollback) are obvious with `staged_insert1`. -4. **Type safety** - The staged context exposes `.store()` for object fields. A dedicated method can return a properly-typed `StagedInsert` object. - -**Staged inserts are limited to `insert1`** (one row at a time). Multi-row inserts are not supported for staged operations. - -#### Basic Usage - -```python -# Stage an insert with direct object storage writes -with Recording.staged_insert1 as staged: - # Set primary key values - staged.rec['subject_id'] = 123 - staged.rec['session_id'] = 45 - - # Create object storage directly using store() - # Extension is optional - .zarr is conventional for Zarr arrays - z = zarr.open(staged.store('raw_data', '.zarr'), mode='w', shape=(10000, 10000), dtype='f4') - z[:] = compute_large_array() - - # Assign the created object to the record - staged.rec['raw_data'] = z - -# On successful exit: metadata computed, record inserted -# On exception: storage cleaned up, no record inserted -# Stored as: raw_data_Ax7bQ2kM.zarr -``` - -#### StagedInsert Interface - -```python -class StagedInsert: - """Context manager for staged insert operations.""" - - rec: dict[str, Any] # Record dict for setting attribute values - - def store(self, field: str, ext: str = "") -> fsspec.FSMap: - """ - Get an FSMap store for direct writes to an object field. - - Args: - field: Name of the object attribute - ext: Optional extension (e.g., ".zarr", ".hdf5") - - Returns: - fsspec.FSMap suitable for Zarr/xarray - """ - ... - - def open(self, field: str, ext: str = "", mode: str = "wb") -> IO: - """ - Open a file for direct writes to an object field. - - Args: - field: Name of the object attribute - ext: Optional extension (e.g., ".bin", ".dat") - mode: File mode (default: "wb") - - Returns: - File-like object for writing - """ - ... - - @property - def fs(self) -> fsspec.AbstractFileSystem: - """Return fsspec filesystem for advanced operations.""" - ... -``` - -#### Staged Insert Flow - -``` -┌─────────────────────────────────────────────────────────┐ -│ 1. Enter context: create StagedInsert with empty rec │ -├─────────────────────────────────────────────────────────┤ -│ 2. User sets primary key values in staged.rec │ -├─────────────────────────────────────────────────────────┤ -│ 3. User calls store()/open() to get storage handles │ -│ - Path reserved with random token on first call │ -│ - User writes data directly via fsspec │ -├─────────────────────────────────────────────────────────┤ -│ 4. User assigns object references to staged.rec │ -├─────────────────────────────────────────────────────────┤ -│ 5. On context exit (success): │ -│ - Build metadata (size/item_count optional, no hash) │ -│ - Execute database INSERT │ -├─────────────────────────────────────────────────────────┤ -│ 6. 
On context exit (exception): │ -│ - Delete any written data │ -│ - Re-raise exception │ -└─────────────────────────────────────────────────────────┘ -``` - -#### Zarr Example - -```python -import zarr -import numpy as np - -# Create a large Zarr array directly in object storage -with Recording.staged_insert1 as staged: - staged.rec['subject_id'] = 123 - staged.rec['session_id'] = 45 - - # Create Zarr hierarchy directly in object storage - # .zarr extension is optional but conventional - root = zarr.open(staged.store('neural_data', '.zarr'), mode='w') - root.create_dataset('timestamps', data=np.arange(1000000)) - root.create_dataset('waveforms', shape=(1000000, 82), chunks=(10000, 82)) - - # Write in chunks (streaming from acquisition) - for i, chunk in enumerate(data_stream): - root['waveforms'][i*10000:(i+1)*10000] = chunk - - # Assign to record - staged.rec['neural_data'] = root - -# Record automatically inserted with computed metadata -# Stored as: neural_data_kM3nP2qR.zarr -``` - -#### Multiple Object Fields - -```python -with Recording.staged_insert1 as staged: - staged.rec['subject_id'] = 123 - staged.rec['session_id'] = 45 - - # Write multiple object fields - extension optional - raw = zarr.open(staged.store('raw_data', '.zarr'), mode='w', shape=(1000, 1000)) - raw[:] = raw_array - - processed = zarr.open(staged.store('processed', '.zarr'), mode='w', shape=(100, 100)) - processed[:] = processed_array - - staged.rec['raw_data'] = raw - staged.rec['processed'] = processed - -# Stored as: raw_data_Ax7bQ2kM.zarr, processed_pL9nR4wE.zarr -``` - -#### Comparison: Copy vs Staged Insert - -| Aspect | Copy Insert | Staged Insert | -|--------|-------------|---------------| -| Data location | Must exist locally first | Written directly to storage | -| Efficiency | Copy overhead | No copy needed | -| Use case | Small files, existing data | Large arrays, streaming data | -| Cleanup on failure | Orphan possible | Cleaned up | -| API | `insert1({..., "field": path})` | `staged_insert1` context manager | -| Multi-row | Supported | Not supported (insert1 only) | - -## Transaction Handling - -Since storage backends don't support distributed transactions with MySQL, DataJoint uses a **copy-first** strategy. - -### Insert Transaction Flow - -``` -┌─────────────────────────────────────────────────────────┐ -│ 1. Validate input and generate storage path with token │ -├─────────────────────────────────────────────────────────┤ -│ 2. Copy file/folder to storage backend │ -│ └─ On failure: raise error, INSERT not attempted │ -├─────────────────────────────────────────────────────────┤ -│ 3. Compute hash (if requested) and build JSON metadata │ -├─────────────────────────────────────────────────────────┤ -│ 4. Execute database INSERT │ -│ └─ On failure: orphaned file remains (acceptable) │ -├─────────────────────────────────────────────────────────┤ -│ 5. Commit database transaction │ -│ └─ On failure: orphaned file remains (acceptable) │ -└─────────────────────────────────────────────────────────┘ -``` - -### Failure Scenarios - -| Scenario | Result | Orphaned File? 
| -|----------|--------|----------------| -| Copy fails | Clean failure, no INSERT | No | -| DB insert fails | Error raised | Yes (acceptable) | -| DB commit fails | Error raised | Yes (acceptable) | - -### Orphaned Files - -Orphaned files (files in storage without corresponding database records) may accumulate due to: -- Failed database inserts after successful copy -- Process crashes -- Network failures - -**This is acceptable** because: -- Random tokens prevent collisions with future inserts -- Orphaned files can be identified by comparing storage contents with database records -- A separate cleanup procedure removes orphaned files during maintenance - -### Orphan Cleanup Procedure - -Orphan cleanup is a **separate maintenance operation** provided via the `schema.object_storage` utility object. Cleanup operates **per-store**, iterating through all configured stores. - -```python -# Maintenance utility methods (not a hidden table) -schema.object_storage.find_orphaned(grace_period_minutes=30) # List orphaned files (all stores) -schema.object_storage.find_orphaned(store="public") # List orphaned files (specific store) -schema.object_storage.cleanup_orphaned(dry_run=True) # Delete orphaned files -schema.object_storage.verify_integrity() # Check all objects exist -schema.object_storage.stats() # Storage usage statistics -``` - -**Note**: `schema.object_storage` is a utility object, not a hidden table. Unlike `attach@store` which uses `~external_*` tables, the `object` type stores all metadata inline in JSON columns and has no hidden tables. - -**Efficient listing for Zarr and large stores:** - -For stores with Zarr arrays (potentially millions of chunk objects), cleanup uses **delimiter-based listing** to enumerate only root object names, not individual chunks: - -```python -# S3 API with delimiter - lists "directories" only -response = s3.list_objects_v2( - Bucket=bucket, - Prefix='project/schema/Table/objects/', - Delimiter='/' -) -# Returns: ['neural_data_kM3nP2qR.zarr/', 'raw_data_Ax7bQ2kM.dat'] -# NOT millions of individual chunk keys -``` - -Orphan deletion uses recursive delete to remove entire Zarr stores efficiently. - -**Grace period for in-flight inserts:** - -While random tokens prevent filename collisions, there's a race condition with in-flight inserts: - -1. Insert starts: file copied to storage with token `Ax7bQ2kM` -2. Orphan cleanup runs: lists storage, queries DB for references -3. File `Ax7bQ2kM` not yet in DB (INSERT not committed) -4. Cleanup identifies it as orphan and deletes it -5. Insert commits: DB now references deleted file! - -**Solution**: The `grace_period_minutes` parameter (default: 30) excludes files created within that window, assuming they are in-flight inserts. - -**Important considerations:** -- Cleanup enumerates all configured stores (default + named) -- Uses delimiter-based listing for efficiency with Zarr stores -- Grace period handles race conditions—cleanup is safe to run anytime -- `dry_run=True` previews deletions before execution -- Compares storage contents against JSON metadata in table columns - -## Fetch Behavior - -On fetch, the `object` type returns a **handle** (`ObjectRef` object) to the stored content. **The file is not copied** - all operations access the storage backend directly. 
- -```python -record = Recording.fetch1() -file_ref = record["raw_data"] - -# Access metadata (no I/O) -print(file_ref.path) # Full storage path -print(file_ref.size) # File size in bytes -print(file_ref.hash) # Content hash (if computed) or None -print(file_ref.ext) # File extension (e.g., ".dat") or None -print(file_ref.is_dir) # True if stored content is a folder - -# Read content directly from storage backend -content = file_ref.read() # Returns bytes (files only) - -# Open as fsspec file object (files only) -with file_ref.open() as f: - data = f.read() - -# List contents (folders only) -contents = file_ref.listdir() # Returns list of relative paths - -# Access specific file within folder -with file_ref.open("subdir/file.dat") as f: - data = f.read() -``` - -### No Automatic Download - -Unlike `attach@store`, the `object` type does **not** automatically download content to a local path. Users access content directly through the `ObjectRef` handle, which streams from the storage backend. - -For local copies, users explicitly download: - -```python -# Download file to local destination -local_path = file_ref.download("/local/destination/") - -# Download specific file from folder -local_path = file_ref.download("/local/destination/", "subdir/file.dat") -``` - -## Implementation Components - -### 1. Settings Extension (`settings.py`) - -New `ObjectStorageSettings` class: - -```python -class ObjectStorageSettings(BaseSettings): - """Object storage configuration for object columns.""" - - model_config = SettingsConfigDict( - env_prefix="DJ_OBJECT_STORAGE_", - extra="forbid", - validate_assignment=True, - ) - - project_name: str | None = None # Must match store metadata - protocol: Literal["object", "s3", "gcs", "azure"] | None = None - location: str | None = None - bucket: str | None = None - endpoint: str | None = None - partition_pattern: str | None = None - token_length: int = Field(default=8, ge=4, le=16) - access_key: str | None = None - secret_key: SecretStr | None = None -``` - -Add to main `Config` class: - -```python -object_storage: ObjectStorageSettings = Field(default_factory=ObjectStorageSettings) -``` - -### 2. Storage Backend (`storage.py` - new module) - -- `StorageBackend` class wrapping `fsspec` -- Methods: `upload()`, `download()`, `open()`, `exists()`, `delete()` -- Path generation with partition support - -### 3. Type Declaration (`declare.py`) - -- Add `OBJECT` pattern: `object$` -- Add to `SPECIAL_TYPES` -- Substitute to `JSON` type in database - -### 4. Schema Integration (`schemas.py`) - -- Associate storage backend with schema -- Validate storage configuration on schema creation - -### 5. Insert Processing (`table.py`) - -- New `__process_file_attribute()` method -- Path generation using primary key and partition pattern -- Upload via storage backend - -### 6. Fetch Processing (`fetch.py`) - -- New `ObjectRef` class -- Lazy loading from storage backend -- Metadata access interface - -### 7. ObjectRef Class (`objectref.py` - new module) - -```python -@dataclass -class ObjectRef: - """Handle to a file or folder stored in the pipeline's storage backend.""" - - path: str - size: int - hash: str | None # content hash (if computed) or None - ext: str | None # file extension (e.g., ".dat") or None - is_dir: bool - timestamp: datetime - mime_type: str | None # files only, derived from ext - item_count: int | None # folders only - _backend: StorageBackend # internal reference - - # fsspec access (for Zarr, xarray, etc.) 
- @property - def fs(self) -> fsspec.AbstractFileSystem: - """Return fsspec filesystem for direct access.""" - ... - - @property - def store(self) -> fsspec.FSMap: - """Return FSMap suitable for Zarr/xarray.""" - ... - - @property - def full_path(self) -> str: - """Return full URI (e.g., 's3://bucket/path').""" - ... - - # File operations - def read(self) -> bytes: ... - def open(self, subpath: str | None = None, mode: str = "rb") -> IO: ... - - # Folder operations - def listdir(self, subpath: str = "") -> list[str]: ... - def walk(self) -> Iterator[tuple[str, list[str], list[str]]]: ... - - # Common operations - def download(self, destination: Path | str, subpath: str | None = None) -> Path: ... - def exists(self, subpath: str | None = None) -> bool: ... - - # Integrity verification - def verify(self) -> bool: - """ - Verify object integrity. - - For files: checks size matches, and hash if available. - For folders: validates manifest (all files exist with correct sizes). - - Returns True if valid, raises IntegrityError with details if not. - """ - ... -``` - -#### fsspec Integration - -The `ObjectRef` provides direct fsspec access for integration with array libraries: - -```python -import zarr -import xarray as xr - -record = Recording.fetch1() -obj_ref = record["raw_data"] - -# Direct Zarr access -z = zarr.open(obj_ref.store, mode='r') -print(z.shape) - -# Direct xarray access -ds = xr.open_zarr(obj_ref.store) - -# Use fsspec filesystem directly -fs = obj_ref.fs -files = fs.ls(obj_ref.full_path) -``` - -## Dependencies - -New dependency: `fsspec` with optional backend-specific packages: - -```toml -[project.dependencies] -fsspec = ">=2023.1.0" - -[project.optional-dependencies] -s3 = ["s3fs"] -gcs = ["gcsfs"] -azure = ["adlfs"] -``` - -### Storage Access Architecture - -The `object` type separates **data declaration** (the JSON metadata stored in the database) from **storage access** (the library used to read/write objects): - -- **Data declaration**: The JSON schema (path, size, hash, etc.) is a pure data structure with no library dependencies -- **Storage access**: Currently uses `fsspec` as the default accessor, but the architecture supports alternative backends - -**Why this matters**: While `fsspec` is a mature and widely-used library, alternatives like [`obstore`](https://github.com/developmentseed/obstore) offer performance advantages for certain workloads. By keeping the data model independent of the access library, future versions can support pluggable storage accessors without schema changes. - -**Current implementation**: The `ObjectRef` class provides fsspec-based accessors (`fs`, `store` properties). 
Future versions may add: -- Pluggable accessor interface -- Alternative backends (obstore, custom implementations) -- Backend selection per-operation or per-configuration - -## Comparison with Existing Types - -| Feature | `attach@store` | `filepath@store` | `object` | -|---------|----------------|------------------|--------| -| Store config | Per-attribute | Per-attribute | Per-pipeline | -| Path control | DataJoint | User-managed | DataJoint | -| DB column | binary(16) UUID | binary(16) UUID | JSON | -| Hidden tables | Yes (external) | Yes (external) | **No** | -| Backend | File/S3 only | File/S3 only | fsspec (any) | -| Partitioning | Hash-based | User path | Configurable | -| Metadata storage | External table | External table | Inline JSON | -| Deduplication | By content | By path | None | - -### No Hidden Tables - -A key architectural difference: the `object` type does **not** use hidden external tables. - -The legacy `attach@store` and `filepath@store` types store a UUID in the table column and maintain a separate hidden `~external_*` table containing: -- File paths/keys -- Checksums -- Size information -- Reference counts - -The `object` type eliminates this complexity by storing all metadata **inline** in the JSON column. This provides: -- **Simpler schema** - no hidden tables to manage or migrate -- **Self-contained records** - all information in one place -- **Easier debugging** - metadata visible directly in queries -- **No reference counting** - each record owns its object exclusively - -### Legacy Type Deprecation - -The existing `attach@store` and `filepath@store` types will be: -- **Maintained** for backward compatibility with existing pipelines -- **Deprecated** in future releases with migration warnings -- **Eventually removed** after a transition period - -New pipelines should use the `object` type exclusively. - -## Delete Behavior - -When a record with a `object` attribute is deleted: - -1. **Database delete executes first** (within transaction) -2. **File delete is attempted** after successful DB commit -3. **File delete is best-effort** - the delete transaction succeeds even if file deletion fails - -### Delete Transaction Flow - -``` -┌─────────────────────────────────────────────────────────┐ -│ 1. Execute database DELETE │ -├─────────────────────────────────────────────────────────┤ -│ 2. Commit database transaction │ -│ └─ On failure: rollback, files unchanged │ -├─────────────────────────────────────────────────────────┤ -│ 3. Issue delete command to storage backend │ -│ └─ On failure: log warning, transaction still OK │ -└─────────────────────────────────────────────────────────┘ -``` - -### Stale Files - -If file deletion fails (network error, permissions, etc.), **stale files** may remain in storage. This is acceptable because: -- The database record is already deleted (authoritative source) -- Random tokens prevent any collision with future inserts -- Stale files can be identified and cleaned via orphan detection utilities - -### No Reference Counting - -Each record owns its file exclusively. There is no deduplication or reference counting, simplifying delete logic. - -## Migration Path - -- Existing `attach@store` and `filepath@store` remain unchanged -- `object` type is additive - new tables only -- Future: Migration utilities to convert existing external storage - -## Zarr, TileDB, and Large Hierarchical Data - -The `object` type is designed with **chunk-based formats** like Zarr and TileDB in mind. 
These formats store each chunk as a separate object, which maps naturally to object storage. - -### Staged Insert Compatibility - -**Staged inserts work with formats that support chunk-based writes:** - -| Format | Staged Insert | Why | -|--------|---------------|-----| -| **Zarr** | ✅ Yes | Each chunk is a separate object | -| **TileDB** | ✅ Yes | Fragment-based storage maps to objects | -| **HDF5** | ❌ No | Single monolithic file requires random-access seek/write | - -**HDF5 limitation**: HDF5 files have internal B-tree structures that require random-access modifications. Object storage only supports full object PUT/GET operations, not partial updates. For HDF5, use **copy insert**: - -```python -# HDF5: Write locally, then copy to object storage -import h5py -import tempfile - -with tempfile.NamedTemporaryFile(suffix='.h5', delete=False) as f: - with h5py.File(f.name, 'w') as h5: - h5.create_dataset('data', data=large_array) - Recording.insert1({..., 'data_file': f.name}) -``` - -For cloud-native workflows with large arrays, **Zarr is recommended** over HDF5. - -### Recommended Workflow (Zarr) - -For large Zarr stores, use **staged insert** to write directly to object storage: - -```python -import zarr -import numpy as np - -with Recording.staged_insert1 as staged: - staged.rec['subject_id'] = 123 - staged.rec['session_id'] = 45 - - # Write Zarr directly to object storage - store = staged.store('neural_data', '.zarr') - root = zarr.open(store, mode='w') - root.create_dataset('spikes', shape=(1000000, 384), chunks=(10000, 384), dtype='f4') - - # Stream data without local intermediate copy - for i, chunk in enumerate(acquisition_stream): - root['spikes'][i*10000:(i+1)*10000] = chunk - - staged.rec['neural_data'] = root - -# Metadata recorded, no expensive size/hash computation -``` - -### JSON Metadata for Zarr - -For Zarr stores, the recommended JSON metadata omits expensive-to-compute fields: - -```json -{ - "path": "schema/Recording/objects/subject_id=123/session_id=45/neural_data_kM3nP2qR.zarr", - "size": null, - "hash": null, - "ext": ".zarr", - "is_dir": true, - "timestamp": "2025-01-15T10:30:00Z" -} -``` - -**Field notes for Zarr:** -- **`size`**: Set to `null` - computing total size requires listing all chunks -- **`hash`**: Always `null` for staged inserts - no merkle tree support currently -- **`ext`**: Set to `.zarr` as a conventional tooling hint -- **`is_dir`**: Set to `true` - Zarr stores are key prefixes (logical directories) -- **`item_count`**: Omitted - counting chunks is expensive and rarely useful -- **`mime_type`**: Omitted - Zarr contains mixed content types - -### Reading Zarr Data - -The `ObjectRef` provides direct access compatible with Zarr and xarray: - -```python -record = Recording.fetch1() -obj_ref = record['neural_data'] - -# Direct Zarr access -z = zarr.open(obj_ref.store, mode='r') -print(z['spikes'].shape) - -# xarray integration -ds = xr.open_zarr(obj_ref.store) - -# Dask integration (lazy loading) -import dask.array as da -arr = da.from_zarr(obj_ref.store, component='spikes') -``` - -### Performance Tips - -1. **Use chunked writes**: Write data in chunks that match your Zarr chunk size -2. **Avoid metadata computation**: Let `size` and `item_count` default to `null` -3. **Use appropriate chunk sizes**: Balance between too many small files (overhead) and too few large files (memory) -4. 
**Consider compression**: Configure Zarr compression (blosc, zstd) to reduce storage costs - -## Future Extensions - -- [ ] Compression options (gzip, lz4, zstd) -- [ ] Encryption at rest -- [ ] Versioning support -- [ ] Streaming upload for large files -- [ ] Checksum verification on fetch -- [ ] Cache layer for frequently accessed files -- [ ] Parallel upload/download for large folders -- [ ] Row-level object access control via signed URLs (project DB permissions onto object access) diff --git a/docs/src/design/tables/storage-types-implementation-plan.md b/docs/src/design/tables/storage-types-implementation-plan.md deleted file mode 100644 index c15a2292..00000000 --- a/docs/src/design/tables/storage-types-implementation-plan.md +++ /dev/null @@ -1,464 +0,0 @@ -# DataJoint Storage Types Redesign - Implementation Plan - -## Executive Summary - -This plan describes the implementation of a three-layer type architecture for DataJoint, building on the existing `AttributeType` infrastructure. The key goals are: - -1. Establish a clean three-layer type hierarchy (native DB types, core DataJoint types, AttributeTypes) -2. Implement content-addressed storage with deduplication -3. Provide composable, user-friendly types (``, ``, ``) -4. Enable project-wide garbage collection -5. Maintain backward compatibility with existing schemas - ---- - -## Implementation Status - -| Phase | Status | Notes | -|-------|--------|-------| -| Phase 1: Core Type System | ✅ Complete | CORE_TYPES dict, type chain resolution | -| Phase 2: Content-Addressed Storage | ✅ Complete | Function-based, no registry table | -| Phase 2b: Path-Addressed Storage | ✅ Complete | ObjectType for files/folders | -| Phase 3: User-Defined AttributeTypes | ✅ Complete | AttachType, XAttachType, FilepathType | -| Phase 4: Insert and Fetch Integration | ✅ Complete | Type chain encoding/decoding | -| Phase 5: Garbage Collection | ✅ Complete | gc.py with scan/collect functions | -| Phase 6: Documentation and Testing | ✅ Complete | Test files for all new types | - ---- - -## Phase 1: Core Type System Foundation ✅ - -**Status**: Complete - -### Implemented in `src/datajoint/declare.py`: - -```python -CORE_TYPES = { - # Numeric types (aliased to native SQL) - "float32": (r"float32$", "float"), - "float64": (r"float64$", "double"), - "int64": (r"int64$", "bigint"), - "uint64": (r"uint64$", "bigint unsigned"), - "int32": (r"int32$", "int"), - "uint32": (r"uint32$", "int unsigned"), - "int16": (r"int16$", "smallint"), - "uint16": (r"uint16$", "smallint unsigned"), - "int8": (r"int8$", "tinyint"), - "uint8": (r"uint8$", "tinyint unsigned"), - "bool": (r"bool$", "tinyint"), - # UUID (stored as binary) - "uuid": (r"uuid$", "binary(16)"), - # JSON - "json": (r"json$", None), - # Binary (blob maps to longblob) - "blob": (r"blob$", "longblob"), - # Temporal - "date": (r"date$", None), - "datetime": (r"datetime$", None), - # String types (with parameters) - "char": (r"char\s*\(\d+\)$", None), - "varchar": (r"varchar\s*\(\d+\)$", None), - # Enumeration - "enum": (r"enum\s*\(.+\)$", None), -} -``` - -### Key changes: -- Removed `SERIALIZED_TYPES`, `BINARY_TYPES`, `EXTERNAL_TYPES` -- Core types are recorded in field comments with `:type:` syntax -- Non-standard native types pass through with warning -- `parse_type_spec()` handles `` syntax -- `resolve_dtype()` returns `(final_dtype, type_chain, store_name)` tuple - ---- - -## Phase 2: Content-Addressed Storage ✅ - -**Status**: Complete (simplified design) - -### Design Decision: Functions vs Class - -The 
original plan proposed a `ContentRegistry` class with a database table. We implemented a simpler, stateless approach using functions in `content_registry.py`: - -**Why functions instead of a registry table:** -1. **Simpler** - No additional database table to manage -2. **Decoupled** - Content storage is independent of any schema -3. **GC by scanning** - Garbage collection scans tables for references rather than maintaining reference counts -4. **Less state** - No synchronization issues between registry and actual storage - -### Implemented in `src/datajoint/content_registry.py`: - -```python -def compute_content_hash(data: bytes) -> str: - """Compute SHA256 hash of content.""" - return hashlib.sha256(data).hexdigest() - -def build_content_path(content_hash: str) -> str: - """Build path: _content/{hash[:2]}/{hash[2:4]}/{hash}""" - return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - -def put_content(data: bytes, store_name: str | None = None) -> dict[str, Any]: - """Store content with deduplication. Returns {hash, store, size}.""" - ... - -def get_content(content_hash: str, store_name: str | None = None) -> bytes: - """Retrieve content by hash with verification.""" - ... - -def content_exists(content_hash: str, store_name: str | None = None) -> bool: - """Check if content exists.""" - ... - -def delete_content(content_hash: str, store_name: str | None = None) -> bool: - """Delete content (use with caution - verify no references first).""" - ... -``` - -### Implemented AttributeTypes in `src/datajoint/attribute_type.py`: - -```python -class ContentType(AttributeType): - """Content-addressed storage. Stores bytes, returns JSON metadata.""" - type_name = "content" - dtype = "json" - - def encode(self, value: bytes, *, key=None, store_name=None) -> dict: - return put_content(value, store_name=store_name) - - def decode(self, stored: dict, *, key=None) -> bytes: - return get_content(stored["hash"], store_name=stored.get("store")) - - -class XBlobType(AttributeType): - """External serialized blob using content-addressed storage.""" - type_name = "xblob" - dtype = "" # Composition - - def encode(self, value, *, key=None, store_name=None) -> bytes: - return blob.pack(value, compress=True) - - def decode(self, stored: bytes, *, key=None) -> Any: - return blob.unpack(stored, squeeze=False) -``` - ---- - -## Phase 2b: Path-Addressed Storage (ObjectType) ✅ - -**Status**: Complete - -### Design: Path vs Content Addressing - -| Aspect | `` | `` | -|--------|-------------|------------| -| Addressing | Content-hash (SHA256) | Path (from primary key) | -| Path Format | `_content/{hash[:2]}/{hash[2:4]}/{hash}` | `{schema}/{table}/objects/{pk}/{field}_{token}.ext` | -| Deduplication | Yes (same content = same hash) | No (each row has unique path) | -| Deletion | GC when unreferenced | Deleted with row | -| Use case | Serialized blobs, attachments | Zarr, HDF5, folders | - -### Implemented in `src/datajoint/builtin_types.py`: - -```python -@register_type -class ObjectType(AttributeType): - """Path-addressed storage for files and folders.""" - type_name = "object" - dtype = "json" - - def encode(self, value, *, key=None, store_name=None) -> dict: - # value can be bytes, str path, or Path - # key contains _schema, _table, _field for path construction - path, token = build_object_path(schema, table, field, primary_key, ext) - backend.put_buffer(content, path) # or put_folder for directories - return { - "path": path, - "store": store_name, - "size": size, - "ext": ext, - "is_dir": is_dir, - 
"timestamp": timestamp.isoformat(), - } - - def decode(self, stored: dict, *, key=None) -> ObjectRef: - # Returns lazy handle for fsspec-based access - return ObjectRef.from_json(stored, backend=backend) -``` - -### ObjectRef Features: -- `ref.path` - Storage path -- `ref.read()` - Read file content -- `ref.open()` - Open as file handle -- `ref.fsmap` - For `zarr.open(ref.fsmap)` -- `ref.download(dest)` - Download to local path -- `ref.listdir()` / `ref.walk()` - For directories - -### Staged Insert for Object Types - -For large objects like Zarr arrays, `staged_insert.py` provides direct writes to storage: - -```python -with table.staged_insert1 as staged: - # 1. Set primary key first (required for path construction) - staged.rec['subject_id'] = 123 - staged.rec['session_id'] = 45 - - # 2. Get storage handle and write directly - z = zarr.open(staged.store('raw_data', '.zarr'), mode='w') - z[:] = large_array - - # 3. On exit: metadata computed, record inserted -``` - -**Flow comparison:** - -| Normal Insert | Staged Insert | -|--------------|---------------| -| `ObjectType.encode()` uploads content | Direct writes via `staged.store()` | -| Single operation | Two-phase: write then finalize | -| Good for files/folders | Ideal for Zarr, HDF5, streaming | - -Both produce the same JSON metadata format compatible with `ObjectRef.from_json()`. - -**Key methods:** -- `staged.store(field, ext)` - Returns `FSMap` for Zarr/xarray -- `staged.open(field, ext)` - Returns file handle for binary writes -- `staged.fs` - Raw fsspec filesystem access - ---- - -## Phase 3: User-Defined AttributeTypes ✅ - -**Status**: Complete - -All built-in AttributeTypes are implemented in `src/datajoint/builtin_types.py`. - -### 3.1 XBlobType ✅ -External serialized blobs using content-addressed storage. Composes with ``. - -### 3.2 AttachType ✅ - -```python -@register_type -class AttachType(AttributeType): - """Internal file attachment stored in database.""" - type_name = "attach" - dtype = "longblob" - - def encode(self, filepath, *, key=None, store_name=None) -> bytes: - # Returns: filename (UTF-8) + null byte + contents - return path.name.encode("utf-8") + b"\x00" + path.read_bytes() - - def decode(self, stored, *, key=None) -> str: - # Extracts to download_path, returns local path - ... 
-``` - -### 3.3 XAttachType ✅ - -```python -@register_type -class XAttachType(AttributeType): - """External file attachment using content-addressed storage.""" - type_name = "xattach" - dtype = "" # Composes with ContentType - # Same encode/decode as AttachType, but stored externally with dedup -``` - -### 3.4 FilepathType ✅ - -```python -@register_type -class FilepathType(AttributeType): - """Reference to existing file in configured store.""" - type_name = "filepath" - dtype = "json" - - def encode(self, relative_path: str, *, key=None, store_name=None) -> dict: - # Verifies file exists, returns metadata - return {'path': path, 'store': store_name, 'size': size, ...} - - def decode(self, stored: dict, *, key=None) -> ObjectRef: - # Returns ObjectRef for lazy access - return ObjectRef.from_json(stored, backend=backend) -``` - -### Type Comparison - -| Type | Storage | Copies File | Dedup | Returns | -|------|---------|-------------|-------|---------| -| `` | Database | Yes | No | Local path | -| `` | External | Yes | Yes | Local path | -| `` | Reference | No | N/A | ObjectRef | -| `` | External | Yes | No | ObjectRef | - ---- - -## Phase 4: Insert and Fetch Integration ✅ - -**Status**: Complete - -### Updated in `src/datajoint/table.py`: - -```python -def __make_placeholder(self, name, value, ...): - if attr.adapter: - from .attribute_type import resolve_dtype - attr.adapter.validate(value) - _, type_chain, resolved_store = resolve_dtype( - f"<{attr.adapter.type_name}>", store_name=attr.store - ) - # Apply type chain: outermost → innermost - for attr_type in type_chain: - try: - value = attr_type.encode(value, key=None, store_name=resolved_store) - except TypeError: - value = attr_type.encode(value, key=None) -``` - -### Updated in `src/datajoint/fetch.py`: - -```python -def _get(connection, attr, data, squeeze, download_path): - if attr.adapter: - from .attribute_type import resolve_dtype - final_dtype, type_chain, _ = resolve_dtype(f"<{attr.adapter.type_name}>") - - # Parse JSON if final storage is JSON - if final_dtype.lower() == "json": - data = json.loads(data) - - # Apply type chain in reverse: innermost → outermost - for attr_type in reversed(type_chain): - data = attr_type.decode(data, key=None) - - return data -``` - ---- - -## Phase 5: Garbage Collection ✅ - -**Status**: Complete - -### Implemented in `src/datajoint/gc.py`: - -```python -import datajoint as dj - -# Scan schemas and find orphaned content/objects -stats = dj.gc.scan(schema1, schema2, store_name='mystore') - -# Remove orphaned content/objects (dry_run=False to actually delete) -stats = dj.gc.collect(schema1, schema2, store_name='mystore', dry_run=True) - -# Format statistics for display -print(dj.gc.format_stats(stats)) -``` - -**Supported storage patterns:** - -1. **Content-Addressed Storage** (``, ``, ``): - - Stored at: `_content/{hash[:2]}/{hash[2:4]}/{hash}` - - Referenced by SHA256 hash in JSON metadata - -2. 
**Path-Addressed Storage** (``): - - Stored at: `{schema}/{table}/objects/{pk}/{field}_{token}/` - - Referenced by path in JSON metadata - -**Key functions:** -- `scan_references(*schemas, store_name=None)` - Scan tables for content hashes -- `scan_object_references(*schemas, store_name=None)` - Scan tables for object paths -- `list_stored_content(store_name=None)` - List all content in `_content/` directory -- `list_stored_objects(store_name=None)` - List all objects in `*/objects/` directories -- `scan(*schemas, store_name=None)` - Find orphaned content/objects without deleting -- `collect(*schemas, store_name=None, dry_run=True)` - Remove orphaned content/objects -- `delete_object(path, store_name=None)` - Delete an object directory -- `format_stats(stats)` - Human-readable statistics output - -**GC Process:** -1. Scan all tables in provided schemas for content-type and object-type attributes -2. Extract content hashes and object paths from JSON metadata columns -3. Scan storage for all stored content (`_content/`) and objects (`*/objects/`) -4. Compute orphaned = stored - referenced (for both types) -5. Optionally delete orphaned items (when `dry_run=False`) - ---- - -## Phase 6: Documentation and Testing ✅ - -**Status**: Complete - -### Test files created: -- `tests/test_content_storage.py` - Content-addressed storage functions -- `tests/test_type_composition.py` - Type chain encoding/decoding -- `tests/test_gc.py` - Garbage collection -- `tests/test_attribute_type.py` - AttributeType registry and DJBlobType (existing) - ---- - -## Critical Files Summary - -| File | Status | Changes | -|------|--------|---------| -| `src/datajoint/declare.py` | ✅ | CORE_TYPES, type parsing, SQL generation | -| `src/datajoint/heading.py` | ✅ | Simplified attribute properties | -| `src/datajoint/attribute_type.py` | ✅ | Base class, registry, type chain resolution | -| `src/datajoint/builtin_types.py` | ✅ | DJBlobType, ContentType, XBlobType, ObjectType | -| `src/datajoint/content_registry.py` | ✅ | Content storage functions (put, get, delete) | -| `src/datajoint/objectref.py` | ✅ | ObjectRef handle for lazy access | -| `src/datajoint/storage.py` | ✅ | StorageBackend, build_object_path | -| `src/datajoint/staged_insert.py` | ✅ | Staged insert for direct object storage writes | -| `src/datajoint/table.py` | ✅ | Type chain encoding on insert | -| `src/datajoint/fetch.py` | ✅ | Type chain decoding on fetch | -| `src/datajoint/blob.py` | ✅ | Removed bypass_serialization | -| `src/datajoint/gc.py` | ✅ | Garbage collection for content storage | -| `tests/test_content_storage.py` | ✅ | Tests for content_registry.py | -| `tests/test_type_composition.py` | ✅ | Tests for type chain encoding/decoding | -| `tests/test_gc.py` | ✅ | Tests for garbage collection | - ---- - -## Removed/Deprecated - -- `src/datajoint/attribute_adapter.py` - Deleted (hard deprecated) -- `bypass_serialization` flag in `blob.py` - Removed -- `database` field in Attribute - Removed (unused) -- `SERIALIZED_TYPES`, `BINARY_TYPES`, `EXTERNAL_TYPES` - Removed -- `is_attachment`, `is_filepath`, `is_object`, `is_external` flags - Removed - ---- - -## Architecture Summary - -``` -Layer 3: AttributeTypes (user-facing) - , , , , , , - ↓ encode() / ↑ decode() - -Layer 2: Core DataJoint Types - float32, int64, uuid, json, blob, varchar(n), etc. - ↓ SQL mapping - -Layer 1: Native Database Types - FLOAT, BIGINT, BINARY(16), JSON, LONGBLOB, VARCHAR(n), etc. 
-``` - -**Built-in AttributeTypes:** -``` - → longblob (internal serialized storage) - → longblob (internal file attachment) - → json (path-addressed, for Zarr/HDF5/folders) - → json (reference to existing file in store) - → json (content-addressed with deduplication) - → json (external serialized with dedup) - → json (external file attachment with dedup) -``` - -**Type Composition Example:** -``` - → json (in DB) - -Insert: Python object → blob.pack() → put_content() → JSON metadata -Fetch: JSON metadata → get_content() → blob.unpack() → Python object -``` diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/design/tables/storage-types-spec.md index 668fdfdf..f7aead7d 100644 --- a/docs/src/design/tables/storage-types-spec.md +++ b/docs/src/design/tables/storage-types-spec.md @@ -6,20 +6,20 @@ This document defines a three-layer type architecture: 1. **Native database types** - Backend-specific (`FLOAT`, `TINYINT UNSIGNED`, `LONGBLOB`). Discouraged for direct use. 2. **Core DataJoint types** - Standardized across backends, scientist-friendly (`float32`, `uint8`, `bool`, `json`). -3. **AttributeTypes** - Programmatic types with `encode()`/`decode()` semantics. Composable. +3. **Codec Types** - Programmatic types with `encode()`/`decode()` semantics. Composable. ``` ┌───────────────────────────────────────────────────────────────────┐ -│ AttributeTypes (Layer 3) │ +│ Codec Types (Layer 3) │ │ │ -│ Built-in: │ +│ Built-in: │ │ User: ... │ ├───────────────────────────────────────────────────────────────────┤ │ Core DataJoint Types (Layer 2) │ │ │ │ float32 float64 int64 uint64 int32 uint32 int16 uint16 │ -│ int8 uint8 bool uuid json blob date datetime │ -│ char(n) varchar(n) enum(...) │ +│ int8 uint8 bool uuid json bytes date datetime text │ +│ char(n) varchar(n) enum(...) decimal(n,f) │ ├───────────────────────────────────────────────────────────────────┤ │ Native Database Types (Layer 1) │ │ │ @@ -31,14 +31,15 @@ This document defines a three-layer type architecture: **Syntax distinction:** - Core types: `int32`, `float64`, `varchar(255)` - no brackets -- AttributeTypes: ``, ``, `` - angle brackets +- Codec types: ``, ``, `` - angle brackets +- The `@` character indicates external storage (object store vs database) ### OAS Storage Regions | Region | Path Pattern | Addressing | Use Case | |--------|--------------|------------|----------| | Object | `{schema}/{table}/{pk}/` | Primary key | Large objects, Zarr, HDF5 | -| Content | `_content/{hash}` | Content hash | Deduplicated blobs/files | +| Hash | `_hash/{hash}` | MD5 hash | Deduplicated blobs/files | ### External References @@ -54,70 +55,300 @@ MySQL and PostgreSQL backends. 
Users should prefer these over native database ty ### Numeric Types -| Core Type | Description | MySQL | -|-----------|-------------|-------| -| `int8` | 8-bit signed | `TINYINT` | -| `int16` | 16-bit signed | `SMALLINT` | -| `int32` | 32-bit signed | `INT` | -| `int64` | 64-bit signed | `BIGINT` | -| `uint8` | 8-bit unsigned | `TINYINT UNSIGNED` | -| `uint16` | 16-bit unsigned | `SMALLINT UNSIGNED` | -| `uint32` | 32-bit unsigned | `INT UNSIGNED` | -| `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | -| `float32` | 32-bit float | `FLOAT` | -| `float64` | 64-bit float | `DOUBLE` | +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `int8` | 8-bit signed | `TINYINT` | `SMALLINT` | +| `int16` | 16-bit signed | `SMALLINT` | `SMALLINT` | +| `int32` | 32-bit signed | `INT` | `INTEGER` | +| `int64` | 64-bit signed | `BIGINT` | `BIGINT` | +| `uint8` | 8-bit unsigned | `TINYINT UNSIGNED` | `SMALLINT` | +| `uint16` | 16-bit unsigned | `SMALLINT UNSIGNED` | `INTEGER` | +| `uint32` | 32-bit unsigned | `INT UNSIGNED` | `BIGINT` | +| `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | `NUMERIC(20)` | +| `float32` | 32-bit float | `FLOAT` | `REAL` | +| `float64` | 64-bit float | `DOUBLE` | `DOUBLE PRECISION` | +| `decimal(n,f)` | Fixed-point | `DECIMAL(n,f)` | `NUMERIC(n,f)` | ### String Types -| Core Type | Description | MySQL | -|-----------|-------------|-------| -| `char(n)` | Fixed-length | `CHAR(n)` | -| `varchar(n)` | Variable-length | `VARCHAR(n)` | +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `char(n)` | Fixed-length | `CHAR(n)` | `CHAR(n)` | +| `varchar(n)` | Variable-length | `VARCHAR(n)` | `VARCHAR(n)` | +| `text` | Unlimited text | `TEXT` | `TEXT` | + +**Encoding:** All strings use UTF-8 (`utf8mb4` in MySQL, `UTF8` in PostgreSQL). +See [Encoding and Collation Policy](#encoding-and-collation-policy) for details. ### Boolean -| Core Type | Description | MySQL | -|-----------|-------------|-------| -| `bool` | True/False | `TINYINT` | +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `bool` | True/False | `TINYINT` | `BOOLEAN` | ### Date/Time Types -| Core Type | Description | MySQL | -|-----------|-------------|-------| -| `date` | Date only | `DATE` | -| `datetime` | Date and time | `DATETIME` | +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `date` | Date only | `DATE` | `DATE` | +| `datetime` | Date and time | `DATETIME` | `TIMESTAMP` | + +**Timezone policy:** All `datetime` values should be stored as **UTC**. Timezone conversion is a +presentation concern handled by the application layer, not the database. This ensures: +- Reproducible computations regardless of server or client timezone settings +- Simple arithmetic on temporal values (no DST ambiguity) +- Portable data across systems and regions + +Use `CURRENT_TIMESTAMP` for auto-populated creation times: +``` +created_at : datetime = CURRENT_TIMESTAMP +``` ### Binary Types -The core `blob` type stores raw bytes without any serialization. Use `` AttributeType +The core `bytes` type stores raw bytes without any serialization. Use the `` codec for serialized Python objects. 
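+
+For illustration only, a minimal (hypothetical) table definition contrasting the two: `raw_payload` keeps exactly the bytes it is given, while `features` is serialized by the `<blob>` codec described under Codec Types below.
+
+```python
+class Packet(dj.Manual):
+    definition = """
+    packet_id : int32
+    ---
+    raw_payload : bytes   # raw bytes stored as-is, no serialization
+    features : <blob>     # Python object (e.g., NumPy array) serialized by the blob codec
+    """
+```
+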
-| Core Type | Description | MySQL | -|-----------|-------------|-------| -| `blob` | Raw bytes | `LONGBLOB` | +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `bytes` | Raw bytes | `LONGBLOB` | `BYTEA` | ### Other Types -| Core Type | Description | MySQL | -|-----------|-------------|-------| -| `json` | JSON document | `JSON` | -| `uuid` | UUID | `BINARY(16)` | -| `enum(...)` | Enumeration | `ENUM(...)` | +| Core Type | Description | MySQL | PostgreSQL | +|-----------|-------------|-------|------------| +| `json` | JSON document | `JSON` | `JSONB` | +| `uuid` | UUID | `BINARY(16)` | `UUID` | +| `enum(...)` | Enumeration | `ENUM(...)` | `CREATE TYPE ... AS ENUM` | ### Native Passthrough Types -Users may use native database types directly (e.g., `text`, `mediumint auto_increment`), +Users may use native database types directly (e.g., `mediumint`, `tinyblob`), but these will generate a warning about non-standard usage. Native types are not recorded in field comments and may have portability issues across database backends. -## AttributeTypes (Layer 3) +### Type Modifiers Policy + +DataJoint table definitions have their own syntax for constraints and metadata. SQL type +modifiers are **not allowed** in type specifications because they conflict with DataJoint's +declarative syntax: + +| Modifier | Status | DataJoint Alternative | +|----------|--------|----------------------| +| `NOT NULL` / `NULL` | ❌ Not allowed | Use `= NULL` for nullable; omit default for required | +| `DEFAULT value` | ❌ Not allowed | Use `= value` syntax before the type | +| `PRIMARY KEY` | ❌ Not allowed | Position above `---` line | +| `UNIQUE` | ❌ Not allowed | Use DataJoint index syntax | +| `COMMENT 'text'` | ❌ Not allowed | Use `# comment` syntax | +| `CHARACTER SET` | ❌ Not allowed | Database-level configuration | +| `COLLATE` | ❌ Not allowed | Database-level configuration | +| `AUTO_INCREMENT` | ⚠️ Discouraged | Allowed with native types only, generates warning | +| `UNSIGNED` | ✅ Allowed | Part of type semantics (use `uint*` core types) | + +**Nullability and defaults:** DataJoint handles nullability through the default value syntax. +An attribute is nullable if and only if its default is `NULL`: + +``` +# Required (NOT NULL, no default) +name : varchar(100) + +# Nullable (default is NULL) +nickname = NULL : varchar(100) + +# Required with default value +status = "active" : varchar(20) +``` + +**Auto-increment policy:** DataJoint discourages `AUTO_INCREMENT` / `SERIAL` because: +- Breaks reproducibility (IDs depend on insertion order) +- Makes pipelines non-deterministic +- Complicates data migration and replication +- Primary keys should be meaningful, not arbitrary + +If required, use native types: `int auto_increment` or `serial` (with warning). + +### Encoding and Collation Policy + +Character encoding and collation are **database-level configuration**, not part of type +definitions. This ensures consistent behavior across all tables and simplifies portability. -AttributeTypes provide `encode()`/`decode()` semantics on top of core types. 
They are +**Configuration** (in `dj.config` or `datajoint.json`): +```json +{ + "database.charset": "utf8mb4", + "database.collation": "utf8mb4_bin" +} +``` + +**Defaults:** + +| Setting | MySQL | PostgreSQL | +|---------|-------|------------| +| Charset | `utf8mb4` | `UTF8` | +| Collation | `utf8mb4_bin` | `C` | + +**Policy:** +- **UTF-8 required**: DataJoint validates charset is UTF-8 compatible at connection time +- **Case-sensitive by default**: Binary collation (`utf8mb4_bin` / `C`) ensures predictable comparisons +- **No per-column overrides**: `CHARACTER SET` and `COLLATE` are rejected in type definitions +- **Like timezone**: Encoding is infrastructure configuration, not part of the data model + +## Codec Types (Layer 3) + +Codec types provide `encode()`/`decode()` semantics on top of core types. They are composable and can be built-in or user-defined. -### `` / `` - Path-Addressed Storage +### Storage Mode: `@` Convention + +The `@` character in codec syntax indicates **external storage** (object store): + +- **No `@`**: Internal storage (database) - e.g., ``, `` +- **`@` present**: External storage (object store) - e.g., ``, `` +- **`@` alone**: Use default store - e.g., `` +- **`@name`**: Use named store - e.g., `` + +Some codecs support both modes (``, ``), others are external-only (``, ``, ``). + +### Codec Base Class + +Codecs auto-register when subclassed using Python's `__init_subclass__` mechanism. +No decorator is needed. + +```python +from abc import ABC, abstractmethod +from typing import Any + +# Global codec registry +_codec_registry: dict[str, "Codec"] = {} + + +class Codec(ABC): + """ + Base class for codec types. Subclasses auto-register by name. + + Requires Python 3.10+. + """ + name: str | None = None # Must be set by concrete subclasses + + def __init_subclass__(cls, *, register: bool = True, **kwargs): + """Auto-register concrete codecs when subclassed.""" + super().__init_subclass__(**kwargs) + + if not register: + return # Skip registration for abstract bases + + if cls.name is None: + return # Skip registration if no name (abstract) + + if cls.name in _codec_registry: + existing = _codec_registry[cls.name] + if type(existing) is not cls: + raise DataJointError( + f"Codec <{cls.name}> already registered by {type(existing).__name__}" + ) + return # Same class, idempotent + + _codec_registry[cls.name] = cls() + + def get_dtype(self, is_external: bool) -> str: + """ + Return the storage dtype for this codec. + + Args: + is_external: True if @ modifier present (external storage) + + Returns: + A core type (e.g., "bytes", "json") or another codec (e.g., "") + """ + raise NotImplementedError + + @abstractmethod + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> Any: + """Encode Python value for storage.""" + ... + + @abstractmethod + def decode(self, stored: Any, *, key: dict | None = None) -> Any: + """Decode stored value back to Python.""" + ... + + def validate(self, value: Any) -> None: + """Optional validation before encoding. Override to add constraints.""" + pass + + +def list_codecs() -> list[str]: + """Return list of registered codec names.""" + return sorted(_codec_registry.keys()) + + +def get_codec(name: str) -> Codec: + """Get codec by name. 
Raises DataJointError if not found.""" + if name not in _codec_registry: + raise DataJointError(f"Unknown codec: <{name}>") + return _codec_registry[name] +``` + +**Usage - no decorator needed:** + +```python +class GraphCodec(dj.Codec): + """Auto-registered as .""" + name = "graph" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, graph, *, key=None, store_name=None): + return {'nodes': list(graph.nodes()), 'edges': list(graph.edges())} + + def decode(self, stored, *, key=None): + import networkx as nx + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G +``` + +**Skip registration for abstract bases:** + +```python +class ExternalOnlyCodec(dj.Codec, register=False): + """Abstract base for external-only codecs. Not registered.""" + + def get_dtype(self, is_external: bool) -> str: + if not is_external: + raise DataJointError(f"<{self.name}> requires @ (external only)") + return "json" +``` + +### Codec Resolution and Chaining + +Codecs resolve to core types through chaining. The `get_dtype(is_external)` method +returns the appropriate dtype based on storage mode: + +``` +Resolution at declaration time: -**Built-in AttributeType.** OAS (Object-Augmented Schema) storage: + → get_dtype(False) → "bytes" → LONGBLOB/BYTEA + → get_dtype(True) → "" → json → JSON/JSONB + → get_dtype(True) → "" → json (store=cold) + + → get_dtype(False) → "bytes" → LONGBLOB/BYTEA + → get_dtype(True) → "" → json → JSON/JSONB + + → get_dtype(True) → "json" → JSON/JSONB + → get_dtype(False) → ERROR (external only) + + → get_dtype(True) → "json" → JSON/JSONB + → get_dtype(True) → "json" → JSON/JSONB +``` + +### `` / `` - Path-Addressed Storage + +**Built-in codec. External only.** + +OAS (Object-Augmented Schema) storage for files and folders: - Path derived from primary key: `{schema}/{table}/{pk}/{attribute}/` - One-to-one relationship with table row @@ -131,7 +362,7 @@ class Analysis(dj.Computed): definition = """ -> Recording --- - results : # default store + results : # default store archive : # specific store """ ``` @@ -139,35 +370,34 @@ class Analysis(dj.Computed): #### Implementation ```python -class ObjectType(AttributeType): - """Built-in AttributeType for path-addressed OAS storage.""" - type_name = "object" - dtype = "json" +class ObjectCodec(dj.Codec): + """Path-addressed OAS storage. External only.""" + name = "object" + + def get_dtype(self, is_external: bool) -> str: + if not is_external: + raise DataJointError(" requires @ (external storage only)") + return "json" def encode(self, value, *, key=None, store_name=None) -> dict: store = get_store(store_name or dj.config['stores']['default']) path = self._compute_path(key) # {schema}/{table}/{pk}/{attr}/ store.put(path, value) - return { - "path": path, - "store": store_name, - # Additional metadata (size, timestamps, etc.) - } + return {"path": path, "store": store_name, ...} def decode(self, stored: dict, *, key=None) -> ObjectRef: - return ObjectRef( - store=get_store(stored["store"]), - path=stored["path"] - ) + return ObjectRef(store=get_store(stored["store"]), path=stored["path"]) ``` -### `` / `` - Content-Addressed Storage +### `` / `` - Hash-Addressed Storage -**Built-in AttributeType.** Content-addressed storage with deduplication: +**Built-in codec. 
External only.** + +Hash-addressed storage with deduplication: - **Single blob only**: stores a single file or serialized object (not folders) - **Per-project scope**: content is shared across all schemas in a project (not per-schema) -- Path derived from content hash: `_content/{hash[:2]}/{hash[2:4]}/{hash}` +- Path derived from content hash: `_hash/{hash[:2]}/{hash[2:4]}/{hash}` - Many-to-one: multiple rows (even across schemas) can reference same content - Reference counted for garbage collection - Deduplication: identical content stored once across the entire project @@ -179,48 +409,44 @@ store_root/ ├── {schema}/{table}/{pk}/ # object storage (path-addressed by PK) │ └── {attribute}/ │ -└── _content/ # content storage (content-addressed) +└── _hash/ # content storage (hash-addressed) └── {hash[:2]}/{hash[2:4]}/{hash} ``` #### Implementation ```python -class ContentType(AttributeType): - """Built-in AttributeType for content-addressed storage.""" - type_name = "content" - dtype = "json" +class HashCodec(dj.Codec): + """Hash-addressed storage. External only.""" + name = "hash" + + def get_dtype(self, is_external: bool) -> str: + if not is_external: + raise DataJointError(" requires @ (external storage only)") + return "json" def encode(self, data: bytes, *, key=None, store_name=None) -> dict: """Store content, return metadata as JSON.""" - content_hash = hashlib.sha256(data).hexdigest() + hash_id = hashlib.md5(data).hexdigest() # 32-char hex store = get_store(store_name or dj.config['stores']['default']) - path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + path = f"_hash/{hash_id[:2]}/{hash_id[2:4]}/{hash_id}" if not store.exists(path): store.put(path, data) - ContentRegistry().insert1({ - 'content_hash': content_hash, - 'store': store_name, - 'size': len(data) - }, skip_duplicates=True) - - return { - "hash": content_hash, - "store": store_name, - "size": len(data) - } + + # Metadata stored in JSON column (no separate registry) + return {"hash": hash_id, "store": store_name, "size": len(data)} def decode(self, stored: dict, *, key=None) -> bytes: """Retrieve content by hash.""" store = get_store(stored["store"]) - path = f"_content/{stored['hash'][:2]}/{stored['hash'][2:4]}/{stored['hash']}" + path = f"_hash/{stored['hash'][:2]}/{stored['hash'][2:4]}/{stored['hash']}" return store.get(path) ``` #### Database Column -The `` type stores JSON metadata: +The `` type stores JSON metadata: ```sql -- content column (MySQL) @@ -233,7 +459,9 @@ features JSONB NOT NULL ### `` - Portable External Reference -**Built-in AttributeType.** Relative path references within configured stores: +**Built-in codec. External only (store required).** + +Relative path references within configured stores: - **Relative paths**: paths within a configured store (portable across environments) - **Store-aware**: resolves paths against configured store backend @@ -282,32 +510,23 @@ just use `varchar`. A string is simpler and more transparent. #### Implementation ```python -class FilepathType(AttributeType): - """Built-in AttributeType for store-relative file references.""" - type_name = "filepath" - dtype = "json" +class FilepathCodec(dj.Codec): + """Store-relative file references. 
External only.""" + name = "filepath" - def encode(self, relative_path: str, *, key=None, store_name=None, - compute_checksum: bool = False) -> dict: + def get_dtype(self, is_external: bool) -> str: + if not is_external: + raise DataJointError(" requires @store") + return "json" + + def encode(self, relative_path: str, *, key=None, store_name=None) -> dict: """Register reference to file in store.""" store = get_store(store_name) # store_name required for filepath - metadata = {'path': relative_path, 'store': store_name} - - if compute_checksum: - full_path = store.resolve(relative_path) - if store.exists(full_path): - metadata['checksum'] = compute_file_checksum(store, full_path) - metadata['size'] = store.size(full_path) - - return metadata + return {'path': relative_path, 'store': store_name} def decode(self, stored: dict, *, key=None) -> ObjectRef: """Return ObjectRef for lazy access.""" - return ObjectRef( - store=get_store(stored['store']), - path=stored['path'], - checksum=stored.get('checksum') # optional verification - ) + return ObjectRef(store=get_store(stored['store']), path=stored['path']) ``` #### Database Column @@ -346,69 +565,33 @@ column_name JSONB NOT NULL ``` The `json` database type: -- Used as dtype by built-in AttributeTypes (``, ``, ``) +- Used as dtype by built-in codecs (``, ``, ``) - Stores arbitrary JSON-serializable data - Automatically uses appropriate type for database backend - Supports JSON path queries where available -## Parameterized AttributeTypes - -AttributeTypes can be parameterized with `` syntax. The parameter specifies -which store to use: +## Built-in Codecs -```python -class AttributeType: - type_name: str # Name used in or as bare type - dtype: str # Database type or built-in AttributeType - - # When user writes type_name@param, resolved store becomes param -``` - -**Resolution examples:** -``` - → uses type → default store - → uses type → cold store - → dtype = "longblob" → database (no store) - → uses type → cold store -``` +### `` / `` - Serialized Python Objects -AttributeTypes can use other AttributeTypes as their dtype (composition): -- `` uses `` - adds djblob serialization on top of content-addressed storage -- `` uses `` - adds filename preservation on top of content-addressed storage +**Supports both internal and external storage.** -## User-Defined AttributeTypes +Serializes Python objects (NumPy arrays, dicts, lists, etc.) using DataJoint's +blob format. Compatible with MATLAB. -### `` - Internal Serialized Blob - -Serialized Python object stored in database. +- **``**: Stored in database (`bytes` → `LONGBLOB`/`BYTEA`) +- **``**: Stored externally via `` with deduplication +- **``**: Stored in specific named store ```python -@dj.register_type -class DJBlobType(AttributeType): - type_name = "djblob" - dtype = "longblob" # MySQL type - - def encode(self, value, *, key=None) -> bytes: - from . import blob - return blob.pack(value, compress=True) - - def decode(self, stored, *, key=None) -> Any: - from . import blob - return blob.unpack(stored) -``` - -### `` / `` - External Serialized Blob +class BlobCodec(dj.Codec): + """Serialized Python objects. Supports internal and external.""" + name = "blob" -Serialized Python object stored in content-addressed storage. 
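+    # get_dtype() selects the storage chain: plain <blob> packs into the
+    # database (bytes -> LONGBLOB/BYTEA), while <blob@...> chains to <hash>
+    # for deduplicated external storage.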
+ def get_dtype(self, is_external: bool) -> str: + return "" if is_external else "bytes" -```python -@dj.register_type -class XBlobType(AttributeType): - type_name = "xblob" - dtype = "content" # Core type - uses default store - # dtype = "content@store" for specific store - - def encode(self, value, *, key=None) -> bytes: + def encode(self, value, *, key=None, store_name=None) -> bytes: from . import blob return blob.pack(value, compress=True) @@ -423,55 +606,38 @@ class ProcessedData(dj.Computed): definition = """ -> RawData --- - small_result : # internal (in database) - large_result : # external (default store) - archive_result : # external (specific store) + small_result : # internal (in database) + large_result : # external (default store) + archive_result : # external (specific store) """ ``` -### `` - Internal File Attachment - -File stored in database with filename preserved. - -```python -@dj.register_type -class AttachType(AttributeType): - type_name = "attach" - dtype = "longblob" - - def encode(self, filepath, *, key=None) -> bytes: - path = Path(filepath) - return path.name.encode() + b"\0" + path.read_bytes() +### `` / `` - File Attachments - def decode(self, stored, *, key=None) -> str: - filename, contents = stored.split(b"\0", 1) - filename = filename.decode() - download_path = Path(dj.config['download_path']) / filename - download_path.parent.mkdir(parents=True, exist_ok=True) - download_path.write_bytes(contents) - return str(download_path) -``` +**Supports both internal and external storage.** -### `` / `` - External File Attachment +Stores files with filename preserved. On fetch, extracts to configured download path. -File stored in content-addressed storage with filename preserved. +- **``**: Stored in database (`bytes` → `LONGBLOB`/`BYTEA`) +- **``**: Stored externally via `` with deduplication +- **``**: Stored in specific named store ```python -@dj.register_type -class XAttachType(AttributeType): - type_name = "xattach" - dtype = "content" # Core type +class AttachCodec(dj.Codec): + """File attachment with filename. 
Supports internal and external.""" + name = "attach" + + def get_dtype(self, is_external: bool) -> str: + return "" if is_external else "bytes" - def encode(self, filepath, *, key=None) -> bytes: + def encode(self, filepath, *, key=None, store_name=None) -> bytes: path = Path(filepath) - # Include filename in stored data return path.name.encode() + b"\0" + path.read_bytes() def decode(self, stored, *, key=None) -> str: filename, contents = stored.split(b"\0", 1) filename = filename.decode() download_path = Path(dj.config['download_path']) / filename - download_path.parent.mkdir(parents=True, exist_ok=True) download_path.write_bytes(contents) return str(download_path) ``` @@ -480,93 +646,121 @@ Usage: ```python class Attachments(dj.Manual): definition = """ - attachment_id : int + attachment_id : int32 --- config : # internal (small file in DB) - data_file : # external (default store) - archive : # external (specific store) + data_file : # external (default store) + archive : # external (specific store) """ ``` -## Storage Comparison +## User-Defined Codecs -| Type | dtype | Storage Location | Dedup | Returns | -|------|-------|------------------|-------|---------| -| `` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `` | `json` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `` | `json` | `_content/{hash}` | Yes | bytes | -| `` | `json` | `_content/{hash}` | Yes | bytes | -| `` | `json` | Configured store (relative path) | No | ObjectRef | -| `` | `longblob` | Database | No | Python object | -| `` | `` | `_content/{hash}` | Yes | Python object | -| `` | `` | `_content/{hash}` | Yes | Python object | -| `` | `longblob` | Database | No | Local file path | -| `` | `` | `_content/{hash}` | Yes | Local file path | -| `` | `` | `_content/{hash}` | Yes | Local file path | - -## Reference Counting for Content Type - -The `ContentRegistry` is a **project-level** table that tracks content-addressed objects -across all schemas. This differs from the legacy `~external_*` tables which were per-schema. +Users can define custom codecs for domain-specific data: ```python -class ContentRegistry: - """ - Project-level content registry. - Stored in a designated database (e.g., `{project}_content`). - """ - definition = """ - # Content-addressed object registry (project-wide) - content_hash : char(64) # SHA256 hex - --- - store : varchar(64) # Store name - size : bigint unsigned # Size in bytes - created : timestamp DEFAULT CURRENT_TIMESTAMP - """ +class GraphCodec(dj.Codec): + """Store NetworkX graphs. Internal only (no external support).""" + name = "graph" + + def get_dtype(self, is_external: bool) -> str: + if is_external: + raise DataJointError(" does not support external storage") + return "" # Chain to blob for serialization + + def encode(self, graph, *, key=None, store_name=None): + return {'nodes': list(graph.nodes()), 'edges': list(graph.edges())} + + def decode(self, stored, *, key=None): + import networkx as nx + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G +``` + +Custom codecs can support both modes by returning different dtypes: + +```python +class ImageCodec(dj.Codec): + """Store images. 
Supports both internal and external.""" + name = "image" + + def get_dtype(self, is_external: bool) -> str: + return "" if is_external else "bytes" + + def encode(self, image, *, key=None, store_name=None) -> bytes: + # Convert PIL Image to PNG bytes + buffer = io.BytesIO() + image.save(buffer, format='PNG') + return buffer.getvalue() + + def decode(self, stored: bytes, *, key=None): + return PIL.Image.open(io.BytesIO(stored)) ``` -Garbage collection scans **all schemas** in the project: +## Storage Comparison + +| Type | get_dtype | Resolves To | Storage Location | Dedup | Returns | +|------|-----------|-------------|------------------|-------|---------| +| `` | `bytes` | `LONGBLOB`/`BYTEA` | Database | No | Python object | +| `` | `` | `json` | `_hash/{hash}` | Yes | Python object | +| `` | `` | `json` | `_hash/{hash}` | Yes | Python object | +| `` | `bytes` | `LONGBLOB`/`BYTEA` | Database | No | Local file path | +| `` | `` | `json` | `_hash/{hash}` | Yes | Local file path | +| `` | `` | `json` | `_hash/{hash}` | Yes | Local file path | +| `` | `json` | `JSON`/`JSONB` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `` | `json` | `JSON`/`JSONB` | `{schema}/{table}/{pk}/` | No | ObjectRef | +| `` | `json` | `JSON`/`JSONB` | `_hash/{hash}` | Yes | bytes | +| `` | `json` | `JSON`/`JSONB` | `_hash/{hash}` | Yes | bytes | +| `` | `json` | `JSON`/`JSONB` | Configured store | No | ObjectRef | + +## Garbage Collection for Hash Storage + +Hash metadata (hash, store, size) is stored directly in each table's JSON column - no separate +registry table is needed. Garbage collection scans all tables to find referenced hashes: ```python -def garbage_collect(project): - """Remove content not referenced by any table in any schema.""" - # Get all registered hashes - registered = set(ContentRegistry().fetch('content_hash', 'store')) +def garbage_collect(store_name): + """Remove hash-addressed data not referenced by any table.""" + # Scan store for all hash files + store = get_store(store_name) + all_hashes = set(store.list_hashes()) # from _hash/ directory - # Get all referenced hashes from ALL schemas in the project + # Scan all tables for referenced hashes referenced = set() for schema in project.schemas: for table in schema.tables: for attr in table.heading.attributes: - if attr.type in ('content', 'content@...'): - hashes = table.fetch(attr.name) - referenced.update((h, attr.store) for h in hashes) - - # Delete orphaned content - for content_hash, store in (registered - referenced): - store_backend = get_store(store) - store_backend.delete(content_path(content_hash)) - (ContentRegistry() & {'content_hash': content_hash}).delete() + if uses_hash_storage(attr): # , , + for row in table.fetch(attr.name): + if row and row.get('store') == store_name: + referenced.add(row['hash']) + + # Delete orphaned files + for hash_id in (all_hashes - referenced): + store.delete(hash_path(hash_id)) ``` -## Built-in AttributeType Comparison +## Built-in Codec Comparison -| Feature | `` | `` | `` | -|---------|------------|-------------|---------------------| -| dtype | `json` | `json` | `json` | -| Location | OAS store | OAS store | Configured store | -| Addressing | Primary key | Content hash | Relative path | -| Path control | DataJoint | DataJoint | User | -| Deduplication | No | Yes | No | -| Structure | Files, folders, Zarr | Single blob only | Any (via fsspec) | -| Access | ObjectRef (lazy) | Transparent (bytes) | ObjectRef (lazy) | -| GC | Deleted with row | Reference counted | N/A (user managed) | -| Integrity | 
DataJoint managed | DataJoint managed | User managed | +| Feature | `` | `` | `` | `` | `` | +|---------|----------|------------|-------------|--------------|---------------| +| Storage modes | Both | Both | External only | External only | External only | +| Internal dtype | `bytes` | `bytes` | N/A | N/A | N/A | +| External dtype | `` | `` | `json` | `json` | `json` | +| Addressing | Hash | Hash | Primary key | Hash | Relative path | +| Deduplication | Yes (external) | Yes (external) | No | Yes | No | +| Structure | Single blob | Single file | Files, folders | Single blob | Any | +| Returns | Python object | Local path | ObjectRef | bytes | ObjectRef | +| GC | Ref counted | Ref counted | With row | Ref counted | User managed | **When to use each:** -- **``**: Large/complex objects where DataJoint controls organization (Zarr, HDF5) -- **``**: Deduplicated serialized data or file attachments via ``, `` -- **``**: Portable references to files in configured stores +- **``**: Serialized Python objects (NumPy arrays, dicts). Use `` for large/duplicated data +- **``**: File attachments with filename preserved. Use `` for large files +- **``**: Large/complex file structures (Zarr, HDF5) where DataJoint controls organization +- **``**: Raw bytes with deduplication (typically used via `` or ``) +- **``**: Portable references to externally-managed files - **`varchar`**: Arbitrary URLs/paths where ObjectRef semantics aren't needed ## Key Design Decisions @@ -574,51 +768,88 @@ def garbage_collect(project): 1. **Three-layer architecture**: - Layer 1: Native database types (backend-specific, discouraged) - Layer 2: Core DataJoint types (standardized, scientist-friendly) - - Layer 3: AttributeTypes (encode/decode, composable) -2. **Core types are scientist-friendly**: `float32`, `uint8`, `bool` instead of `FLOAT`, `TINYINT UNSIGNED`, `TINYINT(1)` -3. **AttributeTypes use angle brackets**: ``, ``, `` - distinguishes from core types -4. **AttributeTypes are composable**: `` uses ``, which uses `json` -5. **Built-in AttributeTypes use JSON dtype**: Stores metadata (path, hash, store name, etc.) -6. **Two OAS regions**: object (PK-addressed) and content (hash-addressed) within managed stores -7. **Filepath for portability**: `` uses relative paths within stores for environment portability -8. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent -9. **Content type**: Single-blob, content-addressed, deduplicated storage -10. **Parameterized types**: `` passes store parameter -11. **Naming convention**: - - `` = internal serialized (database) - - `` = external serialized (content-addressed) - - `` = internal file (single file) - - `` = external file (single file) -12. **Transparent access**: AttributeTypes return Python objects or file paths -13. **Lazy access**: ``, ``, and `` return ObjectRef + - Layer 3: Codec types (encode/decode, composable) +2. **Core types are scientist-friendly**: `float32`, `uint8`, `bool`, `bytes` instead of `FLOAT`, `TINYINT UNSIGNED`, `LONGBLOB` +3. **Codecs use angle brackets**: ``, ``, `` - distinguishes from core types +4. **`@` indicates external storage**: No `@` = database, `@` present = object store +5. **`get_dtype(is_external)` method**: Codecs resolve dtype at declaration time based on storage mode +6. **Codecs are composable**: `` uses ``, which uses `json` +7. **Built-in external codecs use JSON dtype**: Stores metadata (path, hash, store name, etc.) +8. 
**Two OAS regions**: object (PK-addressed) and hash (hash-addressed) within managed stores +9. **Filepath for portability**: `` uses relative paths within stores for environment portability +10. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent +11. **Naming conventions**: + - `@` = external storage (object store) + - No `@` = internal storage (database) + - `@` alone = default store + - `@name` = named store +12. **Dual-mode codecs**: `` and `` support both internal and external storage +13. **External-only codecs**: ``, ``, `` require `@` +14. **Transparent access**: Codecs return Python objects or file paths +15. **Lazy access**: `` and `` return ObjectRef +16. **MD5 for content hashing**: See [Hash Algorithm Choice](#hash-algorithm-choice) below +17. **No separate registry**: Hash metadata stored in JSON columns, not a separate table +18. **Auto-registration via `__init_subclass__`**: Codecs register automatically when subclassed—no decorator needed. Use `register=False` for abstract bases. Requires Python 3.10+. + +### Hash Algorithm Choice + +Content-addressed storage uses **MD5** (128-bit, 32-char hex) rather than SHA256 (256-bit, 64-char hex). + +**Rationale:** + +1. **Practical collision resistance is sufficient**: The birthday bound for MD5 is ~2^64 operations + before 50% collision probability. No scientific project will store anywhere near 10^19 files. + For content deduplication (not cryptographic verification), MD5 provides adequate uniqueness. + +2. **Storage efficiency**: 32-char hashes vs 64-char hashes in every JSON metadata field. + With millions of records, this halves the storage overhead for hash identifiers. + +3. **Performance**: MD5 is ~2-3x faster than SHA256 for large files. While both are fast, + the difference is measurable when hashing large scientific datasets. + +4. **Legacy compatibility**: DataJoint's existing `uuid_from_buffer()` function uses MD5. + The new system changes only the storage format (hex string in JSON vs binary UUID), + not the underlying hash algorithm. This simplifies migration. + +5. **Consistency with existing codebase**: The `dj.hash` module already uses MD5 for + `key_hash()` (job reservation) and `uuid_from_buffer()` (query caching). + +**Why not SHA256?** + +SHA256 is the modern standard for content-addressable storage (Git, Docker, IPFS). However: +- These systems prioritize cryptographic security against adversarial collision attacks +- Scientific data pipelines face no adversarial threat model +- The practical benefits (storage, speed, compatibility) outweigh theoretical security gains + +**Note**: If cryptographic verification is ever needed (e.g., for compliance or reproducibility +audits), SHA256 checksums can be computed on-demand without changing the storage addressing scheme. ## Migration from Legacy Types | Legacy | New Equivalent | |--------|----------------| -| `longblob` (auto-serialized) | `` | -| `blob@store` | `` | +| `longblob` (auto-serialized) | `` | +| `blob@store` | `` | | `attach` | `` | -| `attach@store` | `` | -| `filepath@store` (copy-based) | `filepath@store` (ObjectRef-based, upgraded) | +| `attach@store` | `` | +| `filepath@store` (copy-based) | `` (ObjectRef-based) | ### Migration from Legacy `~external_*` Stores -Legacy external storage used per-schema `~external_{store}` tables. Migration to the new -per-project `ContentRegistry` requires: +Legacy external storage used per-schema `~external_{store}` tables with UUID references. 
+Migration to the new JSON-based hash storage requires: ```python def migrate_external_store(schema, store_name): """ - Migrate legacy ~external_{store} to new ContentRegistry. + Migrate legacy ~external_{store} to new HashRegistry. 1. Read all entries from ~external_{store} 2. For each entry: - Fetch content from legacy location - - Compute SHA256 hash - - Copy to _content/{hash}/ if not exists - - Update table column from UUID to hash - - Register in ContentRegistry + - Compute MD5 hash + - Copy to _hash/{hash}/ if not exists + - Update table column to new hash format 3. After all schemas migrated, drop ~external_{store} tables """ external_table = schema.external[store_name] @@ -630,22 +861,16 @@ def migrate_external_store(schema, store_name): content = external_table.get(legacy_uuid) # Compute new content hash - content_hash = hashlib.sha256(content).hexdigest() + hash_id = hashlib.md5(content).hexdigest() # Store in new location if not exists - new_path = f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" + new_path = f"_hash/{hash_id[:2]}/{hash_id[2:4]}/{hash_id}" store = get_store(store_name) if not store.exists(new_path): store.put(new_path, content) - # Register in project-wide ContentRegistry - ContentRegistry().insert1({ - 'content_hash': content_hash, - 'store': store_name, - 'size': len(content) - }, skip_duplicates=True) - - # Update referencing tables (UUID -> hash) + # Update referencing tables: convert UUID column to JSON with hash metadata + # The JSON column stores {"hash": hash_id, "store": store_name, "size": len(content)} # ... update all tables that reference this UUID ... # After migration complete for all schemas: @@ -653,13 +878,13 @@ def migrate_external_store(schema, store_name): ``` **Migration considerations:** -- Legacy UUIDs were based on content hash but stored as `binary(16)` -- New system uses `char(64)` SHA256 hex strings +- Legacy UUIDs were based on MD5 content hash stored as `binary(16)` (UUID format) +- New system uses `char(32)` MD5 hex strings stored in JSON +- The hash algorithm is unchanged (MD5), only the storage format differs - Migration can be done incrementally per schema - Backward compatibility layer can read both formats during transition ## Open Questions -1. Should `content` without `@store` use a default store, or require explicit store? -2. Should we support `` without `@store` syntax (implying default store)? -3. How long should the backward compatibility layer support legacy `~external_*` format? +1. How long should the backward compatibility layer support legacy `~external_*` format? +2. Should `` (without store name) use a default store or require explicit store name? 
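+
+### Appendix: dtype resolution example (illustrative)
+
+To make the resolution rules above concrete, here is a minimal, self-contained sketch.
+It is **not** the actual `resolve_dtype`/`get_dtype` implementation; the dtype lookup table
+below is illustrative and simply mirrors the built-in codecs described in this document
+(`<blob>`, `<attach>`, `<hash>`, `<object>`, `<filepath>`).
+
+```python
+# Illustrative only: the "@" modifier selects external storage, dual-mode codecs
+# such as <blob> and <attach> chain to <hash> when external, and external codecs
+# store JSON metadata in the database column.
+ILLUSTRATIVE_DTYPES = {
+    ("blob", False): "bytes",     # internal: serialized object in the database
+    ("blob", True): "<hash>",     # external: deduplicated, hash-addressed
+    ("attach", False): "bytes",
+    ("attach", True): "<hash>",
+    ("hash", True): "json",       # JSON metadata: {hash, store, size}
+    ("object", True): "json",
+    ("filepath", True): "json",
+}
+
+def resolve(spec: str) -> tuple[str, str | None]:
+    """Resolve a spec like '<blob@cold>' to (core dtype, store name)."""
+    store = None
+    while spec.startswith("<"):
+        name, _, store_part = spec.strip("<>").partition("@")
+        if "@" in spec:
+            store = store_part or ""  # "" means the default store
+        dtype = ILLUSTRATIVE_DTYPES[(name, "@" in spec)]
+        # Follow the chain, propagating the store: <blob@cold> -> <hash@cold> -> json
+        spec = f"<{dtype.strip('<>')}@{store}>" if dtype.startswith("<") else dtype
+    return spec, store
+
+print(resolve("<blob>"))       # ('bytes', None)  -> LONGBLOB column
+print(resolve("<blob@cold>"))  # ('json', 'cold') -> JSON metadata column
+print(resolve("<object@>"))    # ('json', '')     -> default store
+```
+
+The real resolver performs the same chain-following in `resolve_dtype`, except that each
+codec instance supplies its own `get_dtype(is_external)` instead of a static lookup table.
+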
diff --git a/pyproject.toml b/pyproject.toml index 82cad39e..154a4039 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,9 +84,9 @@ datajoint = "datajoint.cli:cli" test = [ "pytest", "pytest-cov", - "pytest-env", "requests", "graphviz", + "testcontainers[mysql,minio]>=4.0", ] [project.optional-dependencies] @@ -96,9 +96,9 @@ azure = ["adlfs>=2023.1.0"] test = [ "pytest", "pytest-cov", - "pytest-env", "requests", "s3fs>=2023.1.0", + "testcontainers[mysql,minio]>=4.0", ] dev = [ "pre-commit", @@ -158,20 +158,11 @@ skip = ".git,*.pdf,*.svg,*.csv,*.ipynb,*.drawio" # astroid -- Python library name (not "asteroid") ignore-words-list = "rever,numer,astroid" -[tool.pytest_env] -# Default environment variables for tests (D: prefix = only set if not defined) -# These defaults work for local development with `docker compose up -d db minio` -# For devcontainer/docker: override DJ_HOST=db and S3_ENDPOINT=minio:9000 -"D:DJ_HOST" = "localhost" -"D:DJ_PORT" = "3306" -"D:DJ_USER" = "root" -"D:DJ_PASS" = "password" -"D:DJ_TEST_USER" = "datajoint" -"D:DJ_TEST_PASSWORD" = "datajoint" -"D:S3_ENDPOINT" = "localhost:9000" -"D:S3_ACCESS_KEY" = "datajoint" -"D:S3_SECRET_KEY" = "datajoint" -"D:S3_BUCKET" = "datajoint.test" +[tool.pytest.ini_options] +markers = [ + "requires_mysql: marks tests as requiring MySQL database (deselect with '-m \"not requires_mysql\"')", + "requires_minio: marks tests as requiring MinIO object storage (deselect with '-m \"not requires_minio\"')", +] [tool.pixi.workspace] @@ -187,12 +178,13 @@ dev = { features = ["dev"], solve-group = "default" } test = { features = ["test"], solve-group = "default" } [tool.pixi.tasks] -# Start required services (MySQL and MinIO) +# Tests use testcontainers - no manual setup required +test = "pytest tests/" +test-cov = "pytest --cov-report term-missing --cov=datajoint tests/" +# Optional: use external containers (docker-compose) instead of testcontainers services-up = "docker compose up -d db minio" services-down = "docker compose down" -# Run tests (requires services to be running, uses localhost defaults from pytest_env) -test = { cmd = "pytest tests/", depends-on = ["services-up"] } -test-cov = { cmd = "pytest --cov-report term-missing --cov=datajoint tests/", depends-on = ["services-up"] } +test-external = { cmd = "DJ_USE_EXTERNAL_CONTAINERS=1 pytest tests/", depends-on = ["services-up"] } [tool.pixi.dependencies] python = ">=3.10,<3.14" diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index a19aae6d..684ffd08 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -45,9 +45,10 @@ "kill", "MatCell", "MatStruct", - "AttributeType", - "register_type", - "list_types", + # Codec API + "Codec", + "list_codecs", + "get_codec", "errors", "migrate", "DataJointError", @@ -61,7 +62,11 @@ from . import errors from . import migrate from .admin import kill -from .attribute_type import AttributeType, list_types, register_type +from .codecs import ( + Codec, + get_codec, + list_codecs, +) from .blob import MatCell, MatStruct from .cli import cli from .connection import Connection, conn diff --git a/src/datajoint/attribute_type.py b/src/datajoint/attribute_type.py deleted file mode 100644 index 37fae88c..00000000 --- a/src/datajoint/attribute_type.py +++ /dev/null @@ -1,497 +0,0 @@ -""" -Custom attribute type system for DataJoint. - -This module provides the AttributeType base class and registration mechanism -for creating custom data types that extend DataJoint's native type system. 
- -Custom types enable seamless integration of complex Python objects (like NumPy arrays, -graphs, or domain-specific structures) with DataJoint's relational storage. - -Example: - @dj.register_type - class GraphType(dj.AttributeType): - type_name = "graph" - dtype = "longblob" - - def encode(self, graph: nx.Graph) -> list: - return list(graph.edges) - - def decode(self, edges: list) -> nx.Graph: - return nx.Graph(edges) - - # Then use in table definitions: - class MyTable(dj.Manual): - definition = ''' - id : int - --- - data : - ''' -""" - -from __future__ import annotations - -import logging -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any - -from .errors import DataJointError - -if TYPE_CHECKING: - pass - -logger = logging.getLogger(__name__.split(".")[0]) - -# Global type registry - maps type_name to AttributeType instance -_type_registry: dict[str, AttributeType] = {} -_entry_points_loaded: bool = False - - -class AttributeType(ABC): - """ - Base class for custom DataJoint attribute types. - - Subclass this to create custom types that can be used in table definitions - with the ```` syntax. Custom types define bidirectional conversion - between Python objects and DataJoint's storage format. - - Attributes: - type_name: Unique identifier used in ```` syntax - dtype: Underlying DataJoint storage type - - Example: - @dj.register_type - class GraphType(dj.AttributeType): - type_name = "graph" - dtype = "longblob" - - def encode(self, graph): - return list(graph.edges) - - def decode(self, edges): - import networkx as nx - return nx.Graph(edges) - - The type can then be used in table definitions:: - - class Connectivity(dj.Manual): - definition = ''' - id : int - --- - graph_data : - ''' - """ - - @property - @abstractmethod - def type_name(self) -> str: - """ - Unique identifier for this type, used in table definitions as ````. - - This name must be unique across all registered types. It should be lowercase - with underscores (e.g., "graph", "zarr_array", "compressed_image"). - - Returns: - The type name string without angle brackets. - """ - ... - - @property - @abstractmethod - def dtype(self) -> str: - """ - The underlying DataJoint type used for storage. - - Can be: - - A native type: ``"longblob"``, ``"blob"``, ``"varchar(255)"``, ``"int"``, ``"json"`` - - An external type: ``"blob@store"``, ``"attach@store"`` - - The object type: ``"object"`` - - Another custom type: ``""`` (enables type chaining) - - Returns: - The storage type specification string. - """ - ... - - @abstractmethod - def encode(self, value: Any, *, key: dict | None = None) -> Any: - """ - Convert a Python object to the storable format. - - Called during INSERT operations to transform user-provided objects - into a format suitable for storage in the underlying ``dtype``. - - Args: - value: The Python object to store. - key: Primary key values as a dict. Available when the dtype uses - object storage and may be needed for path construction. - - Returns: - Value in the format expected by ``dtype``. For example: - - For ``dtype="longblob"``: any picklable Python object - - For ``dtype="object"``: path string or file-like object - - For ``dtype="varchar(N)"``: string - """ - ... - - @abstractmethod - def decode(self, stored: Any, *, key: dict | None = None) -> Any: - """ - Convert stored data back to a Python object. - - Called during FETCH operations to reconstruct the original Python - object from the stored format. - - Args: - stored: Data retrieved from storage. 
Type depends on ``dtype``: - - For ``"object"``: an ``ObjectRef`` handle - - For blob types: the unpacked Python object - - For native types: the native Python value (str, int, etc.) - key: Primary key values as a dict. - - Returns: - The reconstructed Python object. - """ - ... - - def validate(self, value: Any) -> None: - """ - Validate a value before encoding. - - Override this method to add type checking or domain constraints. - Called automatically before ``encode()`` during INSERT operations. - The default implementation accepts any value. - - Args: - value: The value to validate. - - Raises: - TypeError: If the value has an incompatible type. - ValueError: If the value fails domain validation. - """ - pass - - def default(self) -> Any: - """ - Return a default value for this type. - - Override if the type has a sensible default value. The default - implementation raises NotImplementedError, indicating no default exists. - - Returns: - The default value for this type. - - Raises: - NotImplementedError: If no default exists (the default behavior). - """ - raise NotImplementedError(f"No default value for type <{self.type_name}>") - - def __repr__(self) -> str: - return f"<{self.__class__.__name__}(type_name={self.type_name!r}, dtype={self.dtype!r})>" - - -def register_type(cls: type[AttributeType]) -> type[AttributeType]: - """ - Register a custom attribute type with DataJoint. - - Can be used as a decorator or called directly. The type becomes available - for use in table definitions with the ```` syntax. - - Args: - cls: An AttributeType subclass to register. - - Returns: - The same class, unmodified (allows use as decorator). - - Raises: - DataJointError: If a type with the same name is already registered - by a different class. - TypeError: If cls is not an AttributeType subclass. - - Example: - As a decorator:: - - @dj.register_type - class GraphType(dj.AttributeType): - type_name = "graph" - ... - - Or called directly:: - - dj.register_type(GraphType) - """ - if not isinstance(cls, type) or not issubclass(cls, AttributeType): - raise TypeError(f"register_type requires an AttributeType subclass, got {cls!r}") - - instance = cls() - name = instance.type_name - - if not isinstance(name, str) or not name: - raise DataJointError(f"type_name must be a non-empty string, got {name!r}") - - if name in _type_registry: - existing = _type_registry[name] - if type(existing) is not cls: - raise DataJointError( - f"Type <{name}> is already registered by " f"{type(existing).__module__}.{type(existing).__name__}" - ) - # Same class registered twice - idempotent, no error - return cls - - _type_registry[name] = instance - logger.debug(f"Registered attribute type <{name}> from {cls.__module__}.{cls.__name__}") - return cls - - -def parse_type_spec(spec: str) -> tuple[str, str | None]: - """ - Parse a type specification into type name and optional store parameter. - - Handles formats like: - - "" -> ("xblob", None) - - "" -> ("xblob", "cold") - - "xblob@cold" -> ("xblob", "cold") - - "xblob" -> ("xblob", None) - - Args: - spec: Type specification string, with or without angle brackets. - - Returns: - Tuple of (type_name, store_name). store_name is None if not specified. - """ - # Strip angle brackets - spec = spec.strip("<>").strip() - - if "@" in spec: - type_name, store_name = spec.split("@", 1) - return type_name.strip(), store_name.strip() - - return spec, None - - -def unregister_type(name: str) -> None: - """ - Remove a type from the registry. - - Primarily useful for testing. 
Use with caution in production code. - - Args: - name: The type_name to unregister. - - Raises: - DataJointError: If the type is not registered. - """ - name = name.strip("<>") - if name not in _type_registry: - raise DataJointError(f"Type <{name}> is not registered") - del _type_registry[name] - - -def get_type(name: str) -> AttributeType: - """ - Retrieve a registered attribute type by name. - - Looks up the type in the explicit registry first, then attempts - to load from installed packages via entry points. - - Args: - name: The type name, with or without angle brackets. - Store parameters (e.g., "") are stripped. - - Returns: - The registered AttributeType instance. - - Raises: - DataJointError: If the type is not found. - """ - # Strip angle brackets and store parameter - type_name, _ = parse_type_spec(name) - - # Check explicit registry first - if type_name in _type_registry: - return _type_registry[type_name] - - # Lazy-load entry points - _load_entry_points() - - if type_name in _type_registry: - return _type_registry[type_name] - - raise DataJointError( - f"Unknown attribute type: <{type_name}>. " - f"Ensure the type is registered via @dj.register_type or installed as a package." - ) - - -def list_types() -> list[str]: - """ - List all registered type names. - - Returns: - Sorted list of registered type names. - """ - _load_entry_points() - return sorted(_type_registry.keys()) - - -def is_type_registered(name: str) -> bool: - """ - Check if a type name is registered. - - Args: - name: The type name to check (store parameters are ignored). - - Returns: - True if the type is registered. - """ - type_name, _ = parse_type_spec(name) - if type_name in _type_registry: - return True - _load_entry_points() - return type_name in _type_registry - - -def _load_entry_points() -> None: - """ - Load attribute types from installed packages via entry points. - - Types are discovered from the ``datajoint.types`` entry point group. - Packages declare types in pyproject.toml:: - - [project.entry-points."datajoint.types"] - zarr_array = "dj_zarr:ZarrArrayType" - - This function is idempotent - entry points are only loaded once. - """ - global _entry_points_loaded - if _entry_points_loaded: - return - - _entry_points_loaded = True - - try: - from importlib.metadata import entry_points - except ImportError: - # Python < 3.10 fallback - try: - from importlib_metadata import entry_points - except ImportError: - logger.debug("importlib.metadata not available, skipping entry point discovery") - return - - try: - # Python 3.10+ / importlib_metadata 3.6+ - eps = entry_points(group="datajoint.types") - except TypeError: - # Older API - eps = entry_points().get("datajoint.types", []) - - for ep in eps: - if ep.name in _type_registry: - # Already registered explicitly, skip entry point - continue - try: - type_class = ep.load() - register_type(type_class) - logger.debug(f"Loaded attribute type <{ep.name}> from entry point {ep.value}") - except Exception as e: - logger.warning(f"Failed to load attribute type '{ep.name}' from {ep.value}: {e}") - - -def resolve_dtype( - dtype: str, seen: set[str] | None = None, store_name: str | None = None -) -> tuple[str, list[AttributeType], str | None]: - """ - Resolve a dtype string, following type chains. - - If dtype references another custom type (e.g., ""), recursively - resolves to find the ultimate storage type. Store parameters are propagated - through the chain. - - Args: - dtype: The dtype string to resolve (e.g., "", "", "longblob"). 
- seen: Set of already-seen type names (for cycle detection). - store_name: Store name from outer type specification (propagated inward). - - Returns: - Tuple of (final_storage_type, list_of_types_in_chain, resolved_store_name). - The chain is ordered from outermost to innermost type. - - Raises: - DataJointError: If a circular type reference is detected. - - Examples: - >>> resolve_dtype("") - ("json", [XBlobType, ContentType], None) - - >>> resolve_dtype("") - ("json", [XBlobType, ContentType], "cold") - - >>> resolve_dtype("longblob") - ("longblob", [], None) - """ - if seen is None: - seen = set() - - chain: list[AttributeType] = [] - - # Check if dtype is a custom type reference - if dtype.startswith("<") and dtype.endswith(">"): - type_name, dtype_store = parse_type_spec(dtype) - - # Store from this level overrides inherited store - effective_store = dtype_store if dtype_store is not None else store_name - - if type_name in seen: - raise DataJointError(f"Circular type reference detected: <{type_name}>") - - seen.add(type_name) - attr_type = get_type(type_name) - chain.append(attr_type) - - # Recursively resolve the inner dtype, propagating store - inner_dtype, inner_chain, resolved_store = resolve_dtype(attr_type.dtype, seen, effective_store) - chain.extend(inner_chain) - return inner_dtype, chain, resolved_store - - # Not a custom type - check if it has a store suffix (e.g., "blob@store") - if "@" in dtype: - base_type, dtype_store = dtype.split("@", 1) - effective_store = dtype_store if dtype_store else store_name - return base_type, chain, effective_store - - # Plain type - return as-is with propagated store - return dtype, chain, store_name - - -def get_adapter(context: dict | None, adapter_name: str) -> tuple[AttributeType, str | None]: - """ - Get an attribute type by name. - - This is a compatibility function used by heading and declare modules. - - Args: - context: Ignored (legacy parameter, kept for API compatibility). - adapter_name: The type name, with or without angle brackets. - May include store parameter (e.g., ""). - - Returns: - Tuple of (AttributeType instance, store_name or None). - - Raises: - DataJointError: If the type is not found. - """ - type_name, store_name = parse_type_spec(adapter_name) - - if is_type_registered(type_name): - return get_type(type_name), store_name - - raise DataJointError(f"Attribute type <{type_name}> is not registered. " "Use @dj.register_type to register custom types.") - - -# ============================================================================= -# Auto-register built-in types -# ============================================================================= - -# Import builtin_types module to register built-in types (DJBlobType, ContentType, etc.) -# This import has a side effect: it registers the types via @register_type decorators -from . import builtin_types as _builtin_types # noqa: F401, E402 diff --git a/src/datajoint/builtin_types.py b/src/datajoint/builtin_codecs.py similarity index 64% rename from src/datajoint/builtin_types.py rename to src/datajoint/builtin_codecs.py index 3c1654a6..a55494e8 100644 --- a/src/datajoint/builtin_types.py +++ b/src/datajoint/builtin_codecs.py @@ -1,31 +1,30 @@ """ -Built-in DataJoint attribute types. +Built-in DataJoint codecs. -This module defines the standard AttributeTypes that ship with DataJoint. -These serve as both useful built-in types and as examples for users who -want to create their own custom types. +This module defines the standard codecs that ship with DataJoint. 
+These serve as both useful built-in codecs and as examples for users who +want to create their own custom codecs. -Built-in Types: - - ````: Serialize Python objects to DataJoint's blob format (internal storage) - - ````: Content-addressed storage with SHA256 deduplication - - ````: External serialized blobs using content-addressed storage +Built-in Codecs: + - ````: Serialize Python objects (internal) or external with dedup + - ````: Hash-addressed storage with MD5 deduplication - ````: Path-addressed storage for files/folders (Zarr, HDF5) - - ````: Internal file attachment stored in database - - ````: External file attachment with deduplication + - ````: File attachment (internal) or external with dedup - ````: Reference to existing file in store -Example - Creating a Custom Type: - Here's how to define your own AttributeType, modeled after the built-in types:: +Example - Creating a Custom Codec: + Here's how to define your own codec, modeled after the built-in codecs:: import datajoint as dj import networkx as nx - @dj.register_type - class GraphType(dj.AttributeType): + class GraphCodec(dj.Codec): '''Store NetworkX graphs as edge lists.''' - type_name = "graph" # Use as in definitions - dtype = "" # Compose with djblob for serialization + name = "graph" # Use as in definitions + + def get_dtype(self, is_external: bool) -> str: + return "" # Compose with blob for serialization def encode(self, graph, *, key=None, store_name=None): # Convert graph to a serializable format @@ -59,22 +58,26 @@ class Networks(dj.Manual): from typing import Any -from .attribute_type import AttributeType, register_type +from .codecs import Codec +from .errors import DataJointError # ============================================================================= -# DJBlob Types - DataJoint's native serialization +# Blob Codec - DataJoint's native serialization # ============================================================================= -@register_type -class DJBlobType(AttributeType): +class BlobCodec(Codec): """ Serialize Python objects using DataJoint's blob format. - The ```` type handles serialization of arbitrary Python objects + The ```` codec handles serialization of arbitrary Python objects including NumPy arrays, dictionaries, lists, datetime objects, and UUIDs. - Data is stored in a MySQL ``LONGBLOB`` column. + + Supports both internal and external storage: + - ````: Stored in database (bytes → LONGBLOB) + - ````: Stored externally via ```` with deduplication + - ````: Stored in specific named store Format Features: - Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for Python-native) @@ -88,19 +91,20 @@ class ProcessedData(dj.Manual): definition = ''' data_id : int --- - results : # Serialized Python objects + small_result : # internal (in database) + large_result : # external (default store) + archive : # external (specific store) ''' # Insert any serializable object - table.insert1({'data_id': 1, 'results': {'scores': [0.9, 0.8], 'labels': ['a', 'b']}}) - - Note: - Plain ``longblob`` columns store raw bytes without serialization. - Use ```` when you need automatic serialization. 
+ table.insert1({'data_id': 1, 'small_result': {'scores': [0.9, 0.8]}}) """ - type_name = "djblob" - dtype = "longblob" + name = "blob" + + def get_dtype(self, is_external: bool) -> str: + """Return bytes for internal, for external storage.""" + return "" if is_external else "bytes" def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: """Serialize a Python object to DataJoint's blob format.""" @@ -116,22 +120,23 @@ def decode(self, stored: bytes, *, key: dict | None = None) -> Any: # ============================================================================= -# Content-Addressed Storage Types +# Hash-Addressed Storage Codec # ============================================================================= -@register_type -class ContentType(AttributeType): +class HashCodec(Codec): """ - Content-addressed storage with SHA256 deduplication. + Hash-addressed storage with MD5 deduplication. - The ```` type stores raw bytes using content-addressed storage. - Data is identified by its SHA256 hash and stored in a hierarchical directory: - ``_content/{hash[:2]}/{hash[2:4]}/{hash}`` + The ```` codec stores raw bytes using content-addressed storage. + Data is identified by its MD5 hash and stored in a hierarchical directory: + ``_hash/{hash[:2]}/{hash[2:4]}/{hash}`` The database column stores JSON metadata: ``{hash, store, size}``. Duplicate content is automatically deduplicated. + External only - requires @ modifier. + Example:: @schema @@ -139,20 +144,24 @@ class RawContent(dj.Manual): definition = ''' content_id : int --- - data : + data : ''' # Insert raw bytes table.insert1({'content_id': 1, 'data': b'raw binary content'}) Note: - This type accepts only ``bytes``. For Python objects, use ````. - A store must be specified (e.g., ````) unless a default - store is configured. + This codec accepts only ``bytes``. For Python objects, use ````. + Typically used indirectly via ```` or ```` rather than directly. """ - type_name = "content" - dtype = "json" + name = "hash" + + def get_dtype(self, is_external: bool) -> str: + """Hash storage is external only.""" + if not is_external: + raise DataJointError(" requires @ (external storage only)") + return "json" def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict: """ @@ -188,75 +197,22 @@ def decode(self, stored: dict, *, key: dict | None = None) -> bytes: def validate(self, value: Any) -> None: """Validate that value is bytes.""" if not isinstance(value, bytes): - raise TypeError(f" expects bytes, got {type(value).__name__}") - - -@register_type -class XBlobType(AttributeType): - """ - External serialized blobs with content-addressed storage. - - The ```` type combines DataJoint's blob serialization with - content-addressed storage. Objects are serialized, then stored externally - with automatic deduplication. - - This is ideal for large objects (NumPy arrays, DataFrames) that may be - duplicated across rows. 
- - Example:: - - @schema - class LargeArrays(dj.Manual): - definition = ''' - array_id : int - --- - data : - ''' - - import numpy as np - table.insert1({'array_id': 1, 'data': np.random.rand(1000, 1000)}) - - Type Composition: - ```` composes with ````:: - - Insert: object → blob.pack() → put_content() → JSON metadata - Fetch: JSON → get_content() → blob.unpack() → object - - Note: - - For internal storage, use ```` - - For raw bytes without serialization, use ```` - """ - - type_name = "xblob" - dtype = "" # Composition: uses ContentType - - def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: - """Serialize object to bytes (passed to ContentType).""" - from . import blob - - return blob.pack(value, compress=True) - - def decode(self, stored: bytes, *, key: dict | None = None) -> Any: - """Deserialize bytes back to Python object.""" - from . import blob - - return blob.unpack(stored, squeeze=False) + raise TypeError(f" expects bytes, got {type(value).__name__}") # ============================================================================= -# Path-Addressed Storage Types (OAS - Object-Augmented Schema) +# Path-Addressed Storage Codec (OAS - Object-Augmented Schema) # ============================================================================= -@register_type -class ObjectType(AttributeType): +class ObjectCodec(Codec): """ Path-addressed storage for files and folders. - The ```` type provides managed file/folder storage where the path - is derived from the primary key: ``{schema}/{table}/objects/{pk}/{field}_{token}.{ext}`` + The ```` codec provides managed file/folder storage where the path + is derived from the primary key: ``{schema}/{table}/{pk}/{field}/`` - Unlike ```` (content-addressed), each row has its own storage path, + Unlike ```` (hash-addressed), each row has its own storage path, and content is deleted when the row is deleted. This is ideal for: - Zarr arrays (hierarchical chunked data) @@ -264,6 +220,8 @@ class ObjectType(AttributeType): - Complex multi-file outputs - Any content that shouldn't be deduplicated + External only - requires @ modifier. + Example:: @schema @@ -287,26 +245,25 @@ def make(self, key): Storage Structure: Objects are stored at:: - {store_root}/{schema}/{table}/objects/{pk}/{field}_{token}.ext - - The token ensures uniqueness even if content is replaced. + {store_root}/{schema}/{table}/{pk}/{field}/ - Comparison with ````:: + Comparison with ````:: - | Aspect | | | + | Aspect | | | |----------------|-------------------|---------------------| | Addressing | Path (by PK) | Hash (by content) | | Deduplication | No | Yes | | Deletion | With row | GC when unreferenced| | Use case | Zarr, HDF5 | Blobs, attachments | - - Note: - A store must be specified (````) unless a default store - is configured. Returns ``ObjectRef`` on fetch for lazy access. 
""" - type_name = "object" - dtype = "json" + name = "object" + + def get_dtype(self, is_external: bool) -> str: + """Object storage is external only.""" + if not is_external: + raise DataJointError(" requires @ (external storage only)") + return "json" def encode( self, @@ -355,7 +312,6 @@ def encode( ext = None size = None item_count = None - source_path = None if isinstance(value, bytes): content = value @@ -371,8 +327,6 @@ def encode( elif isinstance(value, (str, Path)): source_path = Path(value) if not source_path.exists(): - from .errors import DataJointError - raise DataJointError(f"Source path not found: {source_path}") is_dir = source_path.is_dir() ext = source_path.suffix if not is_dir else None @@ -434,8 +388,8 @@ def decode(self, stored: dict, *, key: dict | None = None) -> Any: Returns: ObjectRef for accessing the stored content. """ - from .content_registry import get_store_backend from .objectref import ObjectRef + from .content_registry import get_store_backend store_name = stored.get("store") backend = get_store_backend(store_name) @@ -460,16 +414,19 @@ def validate(self, value: Any) -> None: # ============================================================================= -# File Attachment Types +# File Attachment Codecs # ============================================================================= -@register_type -class AttachType(AttributeType): +class AttachCodec(Codec): """ - Internal file attachment stored in database. + File attachment with filename preserved. + + Supports both internal and external storage: + - ````: Stored in database (bytes → LONGBLOB) + - ````: Stored externally via ```` with deduplication + - ````: Stored in specific named store - The ```` type stores a file directly in the database as a ``LONGBLOB``. The filename is preserved and the file is extracted to the configured download path on fetch. @@ -480,26 +437,27 @@ class Documents(dj.Manual): definition = ''' doc_id : int --- - report : + config : # internal (small file in DB) + dataset : # external (default store) + archive : # external (specific store) ''' # Insert a file - table.insert1({'doc_id': 1, 'report': '/path/to/report.pdf'}) + table.insert1({'doc_id': 1, 'config': '/path/to/config.json'}) # Fetch extracts to download_path and returns local path - local_path = (table & 'doc_id=1').fetch1('report') + local_path = (table & 'doc_id=1').fetch1('config') - Storage Format: + Storage Format (internal): The blob contains: ``filename\\0contents`` - Filename (UTF-8 encoded) + null byte + raw file contents - - Note: - - For large files, use ```` (external storage with deduplication) - - For files that shouldn't be copied, use ```` """ - type_name = "attach" - dtype = "longblob" + name = "attach" + + def get_dtype(self, is_external: bool) -> str: + """Return bytes for internal, for external storage.""" + return "" if is_external else "bytes" def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: """ @@ -576,138 +534,21 @@ def validate(self, value: Any) -> None: raise TypeError(f" expects a file path, got {type(value).__name__}") -@register_type -class XAttachType(AttributeType): - """ - External file attachment with content-addressed storage. - - The ```` type stores files externally using content-addressed - storage. Like ````, the filename is preserved and the file is - extracted on fetch. Unlike ````, files are stored externally - with automatic deduplication. 
- - Example:: - - @schema - class LargeDocuments(dj.Manual): - definition = ''' - doc_id : int - --- - dataset : - ''' - - # Insert a large file - table.insert1({'doc_id': 1, 'dataset': '/path/to/large_file.h5'}) - - # Fetch downloads and returns local path - local_path = (table & 'doc_id=1').fetch1('dataset') - - Type Composition: - ```` composes with ````:: - - Insert: file → read + encode filename → put_content() → JSON - Fetch: JSON → get_content() → extract → local path - - Comparison:: - - | Type | Storage | Deduplication | Best for | - |------------|----------|---------------|---------------------| - | | Database | No | Small files (<16MB) | - | | External | Yes | Large files | - """ - - type_name = "xattach" - dtype = "" # Composition: uses ContentType - - def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: - """ - Read file and encode as filename + contents. - - Args: - value: Path to file (str or Path). - key: Primary key values (unused). - store_name: Passed to ContentType for storage. - - Returns: - Bytes: filename (UTF-8) + null byte + file contents - """ - from pathlib import Path - - path = Path(value) - if not path.exists(): - raise FileNotFoundError(f"Attachment file not found: {path}") - if path.is_dir(): - raise IsADirectoryError(f" does not support directories: {path}") - - filename = path.name - contents = path.read_bytes() - return filename.encode("utf-8") + b"\x00" + contents - - def decode(self, stored: bytes, *, key: dict | None = None) -> str: - """ - Extract file to download path and return local path. - - Args: - stored: Bytes containing filename + null + contents. - key: Primary key values (unused). - - Returns: - Path to extracted file as string. - """ - from pathlib import Path - - from .settings import config - - # Split on first null byte - null_pos = stored.index(b"\x00") - filename = stored[:null_pos].decode("utf-8") - contents = stored[null_pos + 1 :] - - # Write to download path - download_path = Path(config.get("download_path", ".")) - download_path.mkdir(parents=True, exist_ok=True) - local_path = download_path / filename - - # Handle filename collision - if file exists with different content, add suffix - if local_path.exists(): - existing_contents = local_path.read_bytes() - if existing_contents != contents: - # Find unique filename - stem = local_path.stem - suffix = local_path.suffix - counter = 1 - while local_path.exists() and local_path.read_bytes() != contents: - local_path = download_path / f"{stem}_{counter}{suffix}" - counter += 1 - - # Only write if file doesn't exist or has different content - if not local_path.exists(): - local_path.write_bytes(contents) - - return str(local_path) - - def validate(self, value: Any) -> None: - """Validate that value is a valid file path.""" - from pathlib import Path - - if not isinstance(value, (str, Path)): - raise TypeError(f" expects a file path, got {type(value).__name__}") - - # ============================================================================= -# Filepath Reference Type +# Filepath Reference Codec # ============================================================================= -@register_type -class FilepathType(AttributeType): +class FilepathCodec(Codec): """ Reference to existing file in configured store. - The ```` type stores a reference to a file that already - exists in the storage backend. Unlike ```` or ````, no + The ```` codec stores a reference to a file that already + exists in the storage backend. 
Unlike ```` or ````, no file copying occurs - only the path is recorded. + External only - requires @store. + This is useful when: - Files are managed externally (e.g., by acquisition software) - Files are too large to copy @@ -739,8 +580,13 @@ class Recordings(dj.Manual): DataJoint does not manage the lifecycle of referenced files. """ - type_name = "filepath" - dtype = "json" + name = "filepath" + + def get_dtype(self, is_external: bool) -> str: + """Filepath is external only.""" + if not is_external: + raise DataJointError(" requires @store") + return "json" def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> dict: """ @@ -790,8 +636,8 @@ def decode(self, stored: dict, *, key: dict | None = None) -> Any: Returns: ObjectRef for accessing the file. """ - from .content_registry import get_store_backend from .objectref import ObjectRef + from .content_registry import get_store_backend store_name = stored.get("store") backend = get_store_backend(store_name) diff --git a/src/datajoint/codecs.py b/src/datajoint/codecs.py new file mode 100644 index 00000000..cc592bad --- /dev/null +++ b/src/datajoint/codecs.py @@ -0,0 +1,450 @@ +""" +Codec type system for DataJoint. + +This module provides the Codec base class for creating custom data types +that extend DataJoint's native type system. Codecs provide encode/decode +semantics for complex Python objects. + +Codecs auto-register when subclassed - no decorator needed (Python 3.10+). + +Example: + class GraphCodec(dj.Codec): + name = "graph" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, graph, *, key=None, store_name=None): + return {'nodes': list(graph.nodes()), 'edges': list(graph.edges())} + + def decode(self, stored, *, key=None): + import networkx as nx + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G + + # Then use in table definitions: + class MyTable(dj.Manual): + definition = ''' + id : int + --- + data : + ''' +""" + +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod +from typing import Any + +from .errors import DataJointError + +logger = logging.getLogger(__name__.split(".")[0]) + +# Global codec registry - maps name to Codec instance +_codec_registry: dict[str, Codec] = {} +_entry_points_loaded: bool = False + + +class Codec(ABC): + """ + Base class for codec types. Subclasses auto-register by name. + + Requires Python 3.10+. + + Attributes: + name: Unique identifier used in ```` syntax. Must be set by subclasses. + + Example: + class GraphCodec(dj.Codec): + name = "graph" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, graph, *, key=None, store_name=None): + return {'nodes': list(graph.nodes()), 'edges': list(graph.edges())} + + def decode(self, stored, *, key=None): + import networkx as nx + G = nx.Graph() + G.add_nodes_from(stored['nodes']) + G.add_edges_from(stored['edges']) + return G + + The codec can then be used in table definitions:: + + class Connectivity(dj.Manual): + definition = ''' + id : int + --- + graph_data : + ''' + + To skip auto-registration (for abstract base classes):: + + class ExternalOnlyCodec(dj.Codec, register=False): + '''Abstract base - not registered.''' + ... 
+ """ + + name: str | None = None # Must be set by concrete subclasses + + def __init_subclass__(cls, *, register: bool = True, **kwargs): + """Auto-register concrete codecs when subclassed.""" + super().__init_subclass__(**kwargs) + + if not register: + return # Skip registration for abstract bases + + if cls.name is None: + return # Skip registration if no name (abstract) + + if not isinstance(cls.name, str) or not cls.name: + raise DataJointError(f"Codec name must be a non-empty string, got {cls.name!r}") + + if cls.name in _codec_registry: + existing = _codec_registry[cls.name] + if type(existing) is not cls: + raise DataJointError( + f"Codec <{cls.name}> already registered by " f"{type(existing).__module__}.{type(existing).__name__}" + ) + return # Same class, idempotent + + _codec_registry[cls.name] = cls() + logger.debug(f"Registered codec <{cls.name}> from {cls.__module__}.{cls.__name__}") + + def get_dtype(self, is_external: bool) -> str: + """ + Return the storage dtype for this codec. + + Args: + is_external: True if @ modifier present (external storage) + + Returns: + A core type (e.g., "bytes", "json") or another codec (e.g., "") + + Raises: + NotImplementedError: If not overridden by subclass. + DataJointError: If external storage not supported but requested. + """ + raise NotImplementedError(f"Codec <{self.name}> must implement get_dtype()") + + @abstractmethod + def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> Any: + """ + Encode Python value for storage. + + Args: + value: The Python object to store. + key: Primary key values as a dict. May be needed for path construction. + store_name: Target store name for external storage. + + Returns: + Value in the format expected by the dtype. + """ + ... + + @abstractmethod + def decode(self, stored: Any, *, key: dict | None = None) -> Any: + """ + Decode stored value back to Python. + + Args: + stored: Data retrieved from storage. + key: Primary key values as a dict. + + Returns: + The reconstructed Python object. + """ + ... + + def validate(self, value: Any) -> None: + """ + Validate a value before encoding. + + Override this method to add type checking or domain constraints. + Called automatically before ``encode()`` during INSERT operations. + The default implementation accepts any value. + + Args: + value: The value to validate. + + Raises: + TypeError: If the value has an incompatible type. + ValueError: If the value fails domain validation. + """ + pass + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}(name={self.name!r})>" + + +def parse_type_spec(spec: str) -> tuple[str, str | None]: + """ + Parse a type specification into type name and optional store parameter. + + Handles formats like: + - "" -> ("blob", None) + - "" -> ("blob", "cold") + - "" -> ("blob", "") # default store + - "blob@cold" -> ("blob", "cold") + - "blob" -> ("blob", None) + + Args: + spec: Type specification string, with or without angle brackets. + + Returns: + Tuple of (type_name, store_name). store_name is None if not specified, + empty string if @ present without name (default store). + """ + # Strip angle brackets + spec = spec.strip("<>").strip() + + if "@" in spec: + type_name, store_name = spec.split("@", 1) + return type_name.strip(), store_name.strip() + + return spec, None + + +def unregister_codec(name: str) -> None: + """ + Remove a codec from the registry. + + Primarily useful for testing. Use with caution in production code. + + Args: + name: The codec name to unregister. 
+ + Raises: + DataJointError: If the codec is not registered. + """ + name = name.strip("<>") + if name not in _codec_registry: + raise DataJointError(f"Codec <{name}> is not registered") + del _codec_registry[name] + + +def get_codec(name: str) -> Codec: + """ + Retrieve a registered codec by name. + + Looks up the codec in the explicit registry first, then attempts + to load from installed packages via entry points. + + Args: + name: The codec name, with or without angle brackets. + Store parameters (e.g., "") are stripped. + + Returns: + The registered Codec instance. + + Raises: + DataJointError: If the codec is not found. + """ + # Strip angle brackets and store parameter + type_name, _ = parse_type_spec(name) + + # Check explicit registry first + if type_name in _codec_registry: + return _codec_registry[type_name] + + # Lazy-load entry points + _load_entry_points() + + if type_name in _codec_registry: + return _codec_registry[type_name] + + raise DataJointError( + f"Unknown codec: <{type_name}>. " f"Ensure the codec is defined (inherit from dj.Codec with name='{type_name}')." + ) + + +def list_codecs() -> list[str]: + """ + List all registered codec names. + + Returns: + Sorted list of registered codec names. + """ + _load_entry_points() + return sorted(_codec_registry.keys()) + + +def is_codec_registered(name: str) -> bool: + """ + Check if a codec name is registered. + + Args: + name: The codec name to check (store parameters are ignored). + + Returns: + True if the codec is registered. + """ + type_name, _ = parse_type_spec(name) + if type_name in _codec_registry: + return True + _load_entry_points() + return type_name in _codec_registry + + +def _load_entry_points() -> None: + """ + Load codecs from installed packages via entry points. + + Codecs are discovered from the ``datajoint.codecs`` entry point group + (also checks legacy ``datajoint.types`` for backward compatibility). + + Packages declare codecs in pyproject.toml:: + + [project.entry-points."datajoint.codecs"] + zarr_array = "dj_zarr:ZarrArrayCodec" + + This function is idempotent - entry points are only loaded once. + """ + global _entry_points_loaded + if _entry_points_loaded: + return + + _entry_points_loaded = True + + try: + from importlib.metadata import entry_points + except ImportError: + logger.debug("importlib.metadata not available, skipping entry point discovery") + return + + # Load from both new and legacy entry point groups + for group in ("datajoint.codecs", "datajoint.types"): + try: + eps = entry_points(group=group) + except TypeError: + # Older API fallback + eps = entry_points().get(group, []) + + for ep in eps: + if ep.name in _codec_registry: + # Already registered explicitly, skip entry point + continue + try: + codec_class = ep.load() + # The class should auto-register via __init_subclass__ + # But if it's an old-style class, manually register + if ep.name not in _codec_registry and hasattr(codec_class, "name"): + _codec_registry[ep.name] = codec_class() + logger.debug(f"Loaded codec <{ep.name}> from entry point {ep.value}") + except Exception as e: + logger.warning(f"Failed to load codec '{ep.name}' from {ep.value}: {e}") + + +def resolve_dtype( + dtype: str, seen: set[str] | None = None, store_name: str | None = None +) -> tuple[str, list[Codec], str | None]: + """ + Resolve a dtype string, following codec chains. + + If dtype references another codec (e.g., ""), recursively + resolves to find the ultimate storage type. Store parameters are propagated + through the chain. 
+ + Args: + dtype: The dtype string to resolve (e.g., "", "", "bytes"). + seen: Set of already-seen codec names (for cycle detection). + store_name: Store name from outer type specification (propagated inward). + + Returns: + Tuple of (final_storage_type, list_of_codecs_in_chain, resolved_store_name). + The chain is ordered from outermost to innermost codec. + + Raises: + DataJointError: If a circular type reference is detected. + + Examples: + >>> resolve_dtype("") + ("bytes", [BlobCodec], None) + + >>> resolve_dtype("") + ("", [BlobCodec], "cold") # BlobCodec.get_dtype(True) returns "" + + >>> resolve_dtype("bytes") + ("bytes", [], None) + """ + if seen is None: + seen = set() + + chain: list[Codec] = [] + + # Check if dtype is a codec reference + if dtype.startswith("<") and dtype.endswith(">"): + type_name, dtype_store = parse_type_spec(dtype) + + # Store from this level overrides inherited store + # Empty string means default store (@), None means no store specified + if dtype_store is not None: + effective_store = dtype_store + else: + effective_store = store_name + + if type_name in seen: + raise DataJointError(f"Circular codec reference detected: <{type_name}>") + + seen.add(type_name) + codec = get_codec(type_name) + chain.append(codec) + + # Determine if external based on whether @ is present + is_external = effective_store is not None + + # Get the inner dtype from the codec + inner_dtype = codec.get_dtype(is_external) + + # Recursively resolve the inner dtype, propagating store + final_dtype, inner_chain, resolved_store = resolve_dtype(inner_dtype, seen, effective_store) + chain.extend(inner_chain) + return final_dtype, chain, resolved_store + + # Not a codec - check if it has a store suffix (e.g., "blob@store") + if "@" in dtype: + base_type, dtype_store = dtype.split("@", 1) + effective_store = dtype_store if dtype_store else store_name + return base_type, chain, effective_store + + # Plain type - return as-is with propagated store + return dtype, chain, store_name + + +def lookup_codec(codec_spec: str) -> tuple[Codec, str | None]: + """ + Look up a codec from a type specification string. + + Parses a codec specification (e.g., "") and returns + the codec instance along with any store name. + + Args: + codec_spec: The codec specification, with or without angle brackets. + May include store parameter (e.g., ""). + + Returns: + Tuple of (Codec instance, store_name or None). + + Raises: + DataJointError: If the codec is not found. + """ + type_name, store_name = parse_type_spec(codec_spec) + + if is_codec_registered(type_name): + return get_codec(type_name), store_name + + raise DataJointError(f"Codec <{type_name}> is not registered. " "Define a Codec subclass with name='{type_name}'.") + + +# ============================================================================= +# Auto-register built-in codecs +# ============================================================================= + +# Import builtin_codecs module to register built-in codecs +# This import has a side effect: it registers the codecs via __init_subclass__ +from . import builtin_codecs as _builtin_codecs # noqa: F401, E402 diff --git a/src/datajoint/content_registry.py b/src/datajoint/content_registry.py index 652f35de..abed955a 100644 --- a/src/datajoint/content_registry.py +++ b/src/datajoint/content_registry.py @@ -1,9 +1,9 @@ """ Content-addressed storage registry for DataJoint. -This module provides content-addressed storage with deduplication for the -AttributeType. 
Content is identified by its SHA256 hash and stored in a hierarchical -directory structure: _content/{hash[:2]}/{hash[2:4]}/{hash} +This module provides content-addressed storage with deduplication for the +Codec. Content is identified by its MD5 hash and stored in a hierarchical +directory structure: _hash/{hash[:2]}/{hash[2:4]}/{hash} The ContentRegistry tracks stored content for garbage collection purposes. """ diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index 758c709e..8b6bfda8 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -9,7 +9,7 @@ import pyparsing as pp -from .attribute_type import get_adapter +from .codecs import lookup_codec from .condition import translate_attribute from .errors import DataJointError from .settings import config @@ -34,16 +34,20 @@ "uuid": (r"uuid$", "binary(16)"), # JSON "json": (r"json$", None), # json passes through as-is - # Binary (blob maps to longblob) - "blob": (r"blob$", "longblob"), + # Binary (bytes maps to longblob in MySQL, bytea in PostgreSQL) + "bytes": (r"bytes$", "longblob"), # Temporal "date": (r"date$", None), "datetime": (r"datetime$", None), # String types (with parameters) "char": (r"char\s*\(\d+\)$", None), "varchar": (r"varchar\s*\(\d+\)$", None), + # Unlimited text + "text": (r"text$", None), # Enumeration "enum": (r"enum\s*\(.+\)$", None), + # Fixed-point decimal + "decimal": (r"decimal\s*\(\d+\s*,\s*\d+\)$", None), } # Compile core type patterns @@ -66,14 +70,14 @@ **{name.upper(): pattern for name, (pattern, _) in CORE_TYPES.items()}, # Native SQL types (passthrough with warning for non-standard use) INTEGER=r"((tiny|small|medium|big|)int|integer)(\s*\(.+\))?(\s+unsigned)?(\s+auto_increment)?|serial$", - DECIMAL=r"(decimal|numeric)(\s*\(.+\))?(\s+unsigned)?$", + NUMERIC=r"numeric(\s*\(.+\))?(\s+unsigned)?$", # numeric is SQL alias, use decimal instead FLOAT=r"(double|float|real)(\s*\(.+\))?(\s+unsigned)?$", STRING=r"(var)?char\s*\(.+\)$", # Catches char/varchar not matched by core types TEMPORAL=r"(time|timestamp|year)(\s*\(.+\))?$", # time, timestamp, year (not date/datetime) NATIVE_BLOB=r"(tiny|small|medium|long)blob$", # Specific blob variants - TEXT=r"(tiny|small|medium|long)?text$", # Text types - # AttributeTypes use angle brackets - ADAPTED=r"<.+>$", + NATIVE_TEXT=r"(tiny|small|medium|long)text$", # Text variants (use plain 'text' instead) + # Codecs use angle brackets + CODEC=r"<.+>$", ).items() } @@ -81,7 +85,7 @@ CORE_TYPE_NAMES = {name.upper() for name in CORE_TYPES} # Special types that need comment storage (core types + adapted) -SPECIAL_TYPES = CORE_TYPE_NAMES | {"ADAPTED"} +SPECIAL_TYPES = CORE_TYPE_NAMES | {"CODEC"} # Native SQL types that pass through (with optional warning) NATIVE_TYPES = set(TYPE_PATTERN) - SPECIAL_TYPES @@ -100,23 +104,6 @@ def match_type(attribute_type): logger = logging.getLogger(__name__.split(".")[0]) -def build_foreign_key_parser_old(): - # old-style foreign key parser. Superseded by expression-based syntax. See issue #436 - # This will be deprecated in a future release. 
- left = pp.Literal("(").suppress() - right = pp.Literal(")").suppress() - attribute_name = pp.Word(pp.srange("[a-z]"), pp.srange("[a-z0-9_]")) - new_attrs = pp.Optional(left + pp.DelimitedList(attribute_name) + right).set_results_name("new_attrs") - arrow = pp.Literal("->").suppress() - lbracket = pp.Literal("[").suppress() - rbracket = pp.Literal("]").suppress() - option = pp.Word(pp.srange("[a-zA-Z]")) - options = pp.Optional(lbracket + pp.DelimitedList(option) + rbracket).set_results_name("options") - ref_table = pp.Word(pp.alphas, pp.alphanums + "._").set_results_name("ref_table") - ref_attrs = pp.Optional(left + pp.DelimitedList(attribute_name) + right).set_results_name("ref_attrs") - return new_attrs + arrow + options + ref_table + ref_attrs - - def build_foreign_key_parser(): arrow = pp.Literal("->").suppress() lbracket = pp.Literal("[").suppress() @@ -140,7 +127,6 @@ def build_attribute_parser(): return attribute_name + pp.Optional(default) + colon + data_type + comment -foreign_key_parser_old = build_foreign_key_parser_old() foreign_key_parser = build_foreign_key_parser() attribute_parser = build_attribute_parser() @@ -454,20 +440,30 @@ def substitute_special_type(match, category, foreign_key_sql, context): Substitute special types with their native SQL equivalents. Special types are: - - Core DataJoint types (float32 → float, uuid → binary(16), blob → longblob, etc.) - - ADAPTED types (AttributeTypes in angle brackets) + - Core DataJoint types (float32 → float, uuid → binary(16), bytes → longblob, etc.) + - CODEC types (Codecs in angle brackets) :param match: dict containing with keys "type" and "comment" -- will be modified in place :param category: attribute type category from TYPE_PATTERN :param foreign_key_sql: list of foreign key declarations to add to - :param context: context for looking up user-defined attribute_type adapters + :param context: context for looking up user-defined codecs (unused, kept for compatibility) """ - if category == "ADAPTED": - # AttributeType - resolve to underlying dtype - attr_type, store_name = get_adapter(context, match["type"]) + if category == "CODEC": + # Codec - resolve to underlying dtype + codec, store_name = lookup_codec(match["type"]) if store_name is not None: match["store"] = store_name - match["type"] = attr_type.dtype + # Determine if external storage is used (store_name is present, even if empty string for default) + is_external = store_name is not None + inner_dtype = codec.get_dtype(is_external=is_external) + + # If inner dtype is a codec without store, propagate the store from outer type + # e.g., returns , we need to resolve as + if inner_dtype.startswith("<") and "@" not in inner_dtype and match.get("store") is not None: + # Append store to the inner dtype + inner_dtype = inner_dtype[:-1] + "@" + match["store"] + ">" + + match["type"] = inner_dtype # Recursively resolve if dtype is also a special type category = match_type(match["type"]) if category in SPECIAL_TYPES: @@ -526,7 +522,7 @@ def compile_attribute(line, in_key, foreign_key_sql, context): category = match_type(match["type"]) if category in SPECIAL_TYPES: - # Core types and AttributeTypes are recorded in comment for reconstruction + # Core types and Codecs are recorded in comment for reconstruction match["comment"] = ":{type}:{comment}".format(**match) substitute_special_type(match, category, foreign_key_sql, context) elif category in NATIVE_TYPES: diff --git a/src/datajoint/errors.py b/src/datajoint/errors.py index 03555bf1..aadc74ca 100644 --- 
a/src/datajoint/errors.py +++ b/src/datajoint/errors.py @@ -2,8 +2,6 @@ Exception classes for the DataJoint library """ -import os - # --- Top Level --- class DataJointError(Exception): @@ -87,43 +85,3 @@ class BucketInaccessible(DataJointError): """ Error raised when a S3 bucket is inaccessible """ - - -# environment variables to control availability of experimental features - -ADAPTED_TYPE_SWITCH = "DJ_SUPPORT_ADAPTED_TYPES" -FILEPATH_FEATURE_SWITCH = "DJ_SUPPORT_FILEPATH_MANAGEMENT" - - -def _switch_adapted_types(on): - """ - Enable (on=True) or disable (on=False) support for AttributeAdapter - """ - if on: - os.environ[ADAPTED_TYPE_SWITCH] = "TRUE" - else: - del os.environ[ADAPTED_TYPE_SWITCH] - - -def _support_adapted_types(): - """ - check if support for AttributeAdapter is enabled - """ - return os.getenv(ADAPTED_TYPE_SWITCH, "FALSE").upper() == "TRUE" - - -def _switch_filepath_types(on): - """ - Enable (on=True) or disable (on=False) support for AttributeAdapter - """ - if on: - os.environ[FILEPATH_FEATURE_SWITCH] = "TRUE" - else: - del os.environ[FILEPATH_FEATURE_SWITCH] - - -def _support_filepath_types(): - """ - check if support for AttributeAdapter is enabled - """ - return os.getenv(FILEPATH_FEATURE_SWITCH, "FALSE").upper() == "TRUE" diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index bd97dfd1..575f3cbf 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -39,10 +39,10 @@ def _get(connection, attr, data, squeeze, download_path): - Native types pass through unchanged - JSON types are parsed - UUID types are converted from bytes - - Blob types return raw bytes (unless an adapter handles them) - - Adapters (AttributeTypes) handle all custom encoding/decoding via type chains + - Blob types return raw bytes (unless a codec handles them) + - Codecs handle all custom encoding/decoding via type chains - For composed types (e.g., using ), decoders are applied + For composed types (e.g., using ), decoders are applied in reverse order: innermost first, then outermost. 
:param connection: a dj.Connection object @@ -57,11 +57,17 @@ def _get(connection, attr, data, squeeze, download_path): if data is None: return None - # Get the final storage type and type chain if adapter present - if attr.adapter: - from .attribute_type import resolve_dtype + # Get the final storage type and type chain if codec present + if attr.codec: + from .codecs import resolve_dtype - final_dtype, type_chain, _ = resolve_dtype(f"<{attr.adapter.type_name}>") + # Include store if present to get correct chain for external storage + store = getattr(attr, "store", None) + if store is not None: + dtype_spec = f"<{attr.codec.name}@{store}>" + else: + dtype_spec = f"<{attr.codec.name}>" + final_dtype, type_chain, _ = resolve_dtype(dtype_spec) # First, process the final dtype (what's stored in the database) if final_dtype.lower() == "json": @@ -87,7 +93,7 @@ def _get(connection, attr, data, squeeze, download_path): return data - # No adapter - handle native types + # No codec - handle native types if attr.json: return json.loads(data) @@ -95,7 +101,7 @@ def _get(connection, attr, data, squeeze, download_path): return uuid_module.UUID(bytes=data) if attr.is_blob: - return data # raw bytes (use for automatic deserialization) + return data # raw bytes (use for automatic deserialization) # Native types - pass through unchanged return data diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py index e0b7aaaf..db327f37 100644 --- a/src/datajoint/gc.py +++ b/src/datajoint/gc.py @@ -6,10 +6,10 @@ referencing it are deleted. Supports two storage patterns: -- Content-addressed storage: , , +- Content-addressed storage: , , Stored at: _content/{hash[:2]}/{hash[2:4]}/{hash} -- Path-addressed storage: +- Path-addressed storage: Stored at: {schema}/{table}/objects/{pk}/{field}_{token}/ Usage: @@ -41,10 +41,10 @@ def _uses_content_storage(attr) -> bool: """ Check if an attribute uses content-addressed storage. - This includes types that compose with : - - directly - - (composes with ) - - (composes with ) + This includes types that chain to for external storage: + - directly + - (chains to ) + - (chains to ) Args: attr: Attribute from table heading @@ -52,12 +52,22 @@ def _uses_content_storage(attr) -> bool: Returns: True if the attribute stores content hashes """ - if not attr.adapter: + if not attr.codec: return False - # Check if this type or its composition chain uses content storage - type_name = getattr(attr.adapter, "type_name", "") - return type_name in ("content", "xblob", "xattach") + # Check if this type uses content storage + codec_name = getattr(attr.codec, "name", "") + store = getattr(attr, "store", None) + + # always uses content storage (external only) + if codec_name == "hash": + return True + + # and use content storage when external (has store) + if codec_name in ("blob", "attach") and store is not None: + return True + + return False def _uses_object_storage(attr) -> bool: @@ -70,11 +80,11 @@ def _uses_object_storage(attr) -> bool: Returns: True if the attribute stores object paths """ - if not attr.adapter: + if not attr.codec: return False - type_name = getattr(attr.adapter, "type_name", "") - return type_name == "object" + codec_name = getattr(attr.codec, "name", "") + return codec_name == "object" def _extract_content_refs(value: Any) -> list[tuple[str, str | None]]: @@ -144,7 +154,7 @@ def scan_references( Scan schemas for content references. Examines all tables in the given schemas and extracts content hashes - from columns that use content-addressed storage (, , ). 
+ from columns that use content-addressed storage (, , ). Args: *schemas: Schema instances to scan @@ -384,7 +394,7 @@ def scan( """ Scan for orphaned content and objects without deleting. - Scans both content-addressed storage (for , , ) + Scans both content-addressed storage (for , , ) and path-addressed storage (for ). Args: @@ -542,7 +552,7 @@ def format_stats(stats: dict[str, Any]) -> str: # Show content-addressed storage stats if present if "content_referenced" in stats: lines.append("") - lines.append("Content-Addressed Storage (, , ):") + lines.append("Content-Addressed Storage (, , ):") lines.append(f" Referenced: {stats['content_referenced']}") lines.append(f" Stored: {stats['content_stored']}") lines.append(f" Orphaned: {stats['content_orphaned']}") diff --git a/src/datajoint/hash.py b/src/datajoint/hash.py index f58c6573..88a737fb 100644 --- a/src/datajoint/hash.py +++ b/src/datajoint/hash.py @@ -1,7 +1,5 @@ import hashlib -import io import uuid -from pathlib import Path def key_hash(mapping): @@ -16,24 +14,14 @@ def key_hash(mapping): return hashed.hexdigest() -def uuid_from_stream(stream, *, init_string=""): +def uuid_from_buffer(buffer=b"", *, init_string=""): """ - :return: 16-byte digest of stream data - :stream: stream object or open file handle - :init_string: string to initialize the checksum + Compute MD5 hash of buffer data, returned as UUID. + + :param buffer: bytes to hash + :param init_string: string to initialize the checksum (for namespacing) + :return: UUID based on MD5 digest """ hashed = hashlib.md5(init_string.encode()) - chunk = True - chunk_size = 1 << 14 - while chunk: - chunk = stream.read(chunk_size) - hashed.update(chunk) + hashed.update(buffer) return uuid.UUID(bytes=hashed.digest()) - - -def uuid_from_buffer(buffer=b"", *, init_string=""): - return uuid_from_stream(io.BytesIO(buffer), init_string=init_string) - - -def uuid_from_file(filepath, *, init_string=""): - return uuid_from_stream(Path(filepath).open("rb"), init_string=init_string) diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 78b6af77..bc555224 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -5,8 +5,8 @@ import numpy as np -from .attribute_type import get_adapter -from .attribute_type import AttributeType +from .codecs import lookup_codec +from .codecs import Codec from .declare import ( CORE_TYPE_NAMES, SPECIAL_TYPES, @@ -15,33 +15,29 @@ from .errors import DataJointError -class _MissingType(AttributeType): - """Placeholder for missing/unregistered attribute types. Raises error on use.""" +class _MissingType(Codec, register=False): + """Placeholder for missing/unregistered codecs. Raises error on use.""" - def __init__(self, name: str): - self._name = name + def __init__(self, codec_name: str): + self._codec_name = codec_name @property - def type_name(self) -> str: - return self._name + def name(self) -> str: + return self._codec_name - @property - def dtype(self) -> str: + def get_dtype(self, is_external: bool) -> str: raise DataJointError( - f"Attribute type <{self._name}> is not registered. " - "Register it with @dj.register_type or include it in the schema context." + f"Codec <{self._codec_name}> is not registered. " f"Define a Codec subclass with name='{self._codec_name}'." ) - def encode(self, value, *, key=None): + def encode(self, value, *, key=None, store_name=None): raise DataJointError( - f"Attribute type <{self._name}> is not registered. " - "Register it with @dj.register_type or include it in the schema context." 
+ f"Codec <{self._codec_name}> is not registered. " f"Define a Codec subclass with name='{self._codec_name}'." ) def decode(self, stored, *, key=None): raise DataJointError( - f"Attribute type <{self._name}> is not registered. " - "Register it with @dj.register_type or include it in the schema context." + f"Codec <{self._codec_name}> is not registered. " f"Define a Codec subclass with name='{self._codec_name}'." ) @@ -62,7 +58,7 @@ def decode(self, stored, *, key=None): json=None, is_blob=False, is_hidden=False, - adapter=None, + codec=None, store=None, unsupported=False, attribute_expression=None, @@ -283,10 +279,10 @@ def _init_from_database(self): autoincrement=bool(re.search(r"auto_increment", attr["Extra"], flags=re.I)), numeric=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("DECIMAL", "INTEGER", "FLOAT")), string=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("ENUM", "TEMPORAL", "STRING")), - is_blob=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("BLOB", "NATIVE_BLOB")), + is_blob=any(TYPE_PATTERN[t].match(attr["type"]) for t in ("BYTES", "NATIVE_BLOB")), uuid=False, json=bool(TYPE_PATTERN["JSON"].match(attr["type"])), - adapter=None, + codec=None, store=None, attribute_expression=None, is_hidden=attr["name"].startswith("_"), @@ -311,26 +307,26 @@ def _init_from_database(self): # Store the original type name for display but keep db_type for SQL attr["original_type"] = special["type"] - # process AttributeTypes (adapted types in angle brackets) - if special and TYPE_PATTERN["ADAPTED"].match(attr["type"]): + # process Codecs (types in angle brackets) + if special and TYPE_PATTERN["CODEC"].match(attr["type"]): # Context can be None for built-in types that are globally registered - adapter_name = special["type"] + codec_spec = special["type"] try: - adapter_result = get_adapter(context, adapter_name) - # get_adapter returns (adapter, store_name) tuple - if isinstance(adapter_result, tuple): - attr["adapter"], attr["store"] = adapter_result - else: - attr["adapter"] = adapter_result + codec_instance, codec_store = lookup_codec(codec_spec) + attr["codec"] = codec_instance + if codec_store is not None: + attr["store"] = codec_store except DataJointError: - # if no adapter, then delay the error until the first invocation - attr["adapter"] = _MissingType(adapter_name) + # if no codec, then delay the error until the first invocation + attr["codec"] = _MissingType(codec_spec) else: - attr["type"] = attr["adapter"].dtype + # Determine if external storage based on store presence + is_external = attr.get("store") is not None + attr["type"] = attr["codec"].get_dtype(is_external=is_external) if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): - raise DataJointError(f"Invalid dtype '{attr['type']}' in attribute type <{adapter_name}>.") - # Update is_blob based on resolved dtype (check both BLOB and NATIVE_BLOB patterns) - attr["is_blob"] = any(TYPE_PATTERN[t].match(attr["type"]) for t in ("BLOB", "NATIVE_BLOB")) + raise DataJointError(f"Invalid dtype '{attr['type']}' in codec <{codec_spec}>.") + # Update is_blob based on resolved dtype (check both BYTES and NATIVE_BLOB patterns) + attr["is_blob"] = any(TYPE_PATTERN[t].match(attr["type"]) for t in ("BYTES", "NATIVE_BLOB")) # Handle core type aliases (uuid, float32, etc.) if special: @@ -365,7 +361,7 @@ def _init_from_database(self): # fill out dtype. 
All floats and non-nullable integers are turned into specific dtypes attr["dtype"] = object - if attr["numeric"] and not attr["adapter"]: + if attr["numeric"] and not attr["codec"]: is_integer = TYPE_PATTERN["INTEGER"].match(attr["type"]) is_float = TYPE_PATTERN["FLOAT"].match(attr["type"]) if is_integer and not attr["nullable"] or is_float: @@ -375,9 +371,9 @@ assert (t, is_unsigned) in numeric_types, "dtype not found for type %s" % t attr["dtype"] = numeric_types[(t, is_unsigned)] - if attr["adapter"]: - # restore adapted type name for display - attr["type"] = adapter_name + if attr["codec"]: + # restore codec type name for display + attr["type"] = codec_spec self._attributes = dict(((q["name"], Attribute(**q)) for q in attributes)) diff --git a/src/datajoint/jobs.py b/src/datajoint/jobs.py index 18bf5730..b542f936 100644 --- a/src/datajoint/jobs.py +++ b/src/datajoint/jobs.py @@ -26,9 +26,9 @@ def __init__(self, conn, database): key_hash :char(32) # key hash --- status :enum('reserved','error','ignore') # if tuple is missing, the job is available - key=null : <djblob> # structure containing the key + key=null : <blob> # structure containing the key error_message="" :varchar({error_message_length}) # error message returned if failed - error_stack=null : <djblob> # error stack if failed + error_stack=null : <blob> # error stack if failed user="" :varchar(255) # database user host="" :varchar(255) # system hostname pid=0 :int unsigned # system process id diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py index 696ca380..1948cbe0 100644 --- a/src/datajoint/migrate.py +++ b/src/datajoint/migrate.py @@ -2,8 +2,8 @@ Migration utilities for DataJoint schema updates. This module provides tools for migrating existing schemas to use the new -AttributeType system, particularly for upgrading blob columns to use -explicit `<djblob>` type declarations. +Codec system, particularly for upgrading blob columns to use +explicit `<blob>` type declarations. from __future__ import annotations @@ -25,11 +25,11 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: """ - Analyze a schema to find blob columns that could be migrated to <djblob>. + Analyze a schema to find blob columns that could be migrated to <blob>. This function identifies blob columns that: 1. Have a MySQL blob type (tinyblob, blob, mediumblob, longblob) - 2. Do NOT already have an adapter/type specified in their comment + 2. Do NOT already have a codec/type specified in their comment All blob size variants are included in the analysis. @@ -80,8 +80,8 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: columns = connection.query(columns_query, args=(schema.database, table_name)).fetchall() for column_name, column_type, comment in columns: - # Check if comment already has an adapter type (starts with :type:) - has_adapter = comment and comment.startswith(":") + # Check if comment already has a codec type (starts with :type:) + has_codec = comment and comment.startswith(":") results.append( { @@ -89,7 +89,7 @@ "column_name": column_name, "column_type": column_type, "current_comment": comment or "", - "needs_migration": not has_adapter, + "needs_migration": not has_codec, } ) @@ -98,19 +98,19 @@ def generate_migration_sql( schema: Schema, - target_type: str = "djblob", + target_type: str = "blob", dry_run: bool = True, ) -> list[str]: """ - Generate SQL statements to migrate blob columns to use <djblob>. 
+ Generate SQL statements to migrate blob columns to use . This generates ALTER TABLE statements that update column comments to - include the `::` prefix, marking them as using explicit + include the `::` prefix, marking them as using explicit DataJoint blob serialization. Args: schema: The DataJoint schema to migrate. - target_type: The type name to migrate to (default: "djblob"). + target_type: The type name to migrate to (default: "blob"). dry_run: If True, only return SQL without executing. Returns: @@ -156,18 +156,18 @@ def generate_migration_sql( def migrate_blob_columns( schema: Schema, - target_type: str = "djblob", + target_type: str = "blob", dry_run: bool = True, ) -> dict: """ - Migrate blob columns in a schema to use explicit type. + Migrate blob columns in a schema to use explicit type. This updates column comments in the database to include the type declaration. The data format remains unchanged. Args: schema: The DataJoint schema to migrate. - target_type: The type name to migrate to (default: "djblob"). + target_type: The type name to migrate to (default: "blob"). dry_run: If True, only preview changes without applying. Returns: @@ -188,7 +188,7 @@ def migrate_blob_columns( Warning: After migration, table definitions should be updated to use - `` instead of `longblob` for consistency. The migration + `` instead of `longblob` for consistency. The migration only updates database metadata; source code changes are manual. """ columns = analyze_blob_columns(schema) diff --git a/src/datajoint/preview.py b/src/datajoint/preview.py index 7572125e..0ef096d2 100644 --- a/src/datajoint/preview.py +++ b/src/datajoint/preview.py @@ -27,7 +27,7 @@ def _format_object_display(json_data): def preview(query_expression, limit, width): heading = query_expression.heading rel = query_expression.proj(*heading.non_blobs) - # Object fields are AttributeTypes with adapters - not specially handled in simplified model + # Object fields use codecs - not specially handled in simplified model object_fields = [] if limit is None: limit = config["display.limit"] @@ -88,7 +88,7 @@ def get_display_value(tup, f, idx): def repr_html(query_expression): heading = query_expression.heading rel = query_expression.proj(*heading.non_blobs) - # Object fields are AttributeTypes with adapters - not specially handled in simplified model + # Object fields use codecs - not specially handled in simplified model object_fields = [] info = heading.table_status tuples = rel.fetch(limit=config["display.limit"] + 1, format="array") diff --git a/src/datajoint/staged_insert.py b/src/datajoint/staged_insert.py index dbf51c6b..8f9c94d2 100644 --- a/src/datajoint/staged_insert.py +++ b/src/datajoint/staged_insert.py @@ -98,8 +98,8 @@ def _get_storage_path(self, field: str, ext: str = "") -> str: raise DataJointError(f"Attribute '{field}' not found in table heading") attr = self._table.heading[field] - # Check if this is an object AttributeType (has adapter with "object" in type_name) - if not (attr.adapter and hasattr(attr.adapter, "type_name") and "object" in attr.adapter.type_name): + # Check if this is an object Codec (has codec with "object" as name) + if not (attr.codec and attr.codec.name == "object"): raise DataJointError(f"Attribute '{field}' is not an type") # Extract primary key from rec diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 1ce7e816..23648e1d 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -732,7 +732,7 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, 
row=None): processing by mysql API. In the simplified type system: - - Adapters (AttributeTypes) handle all custom encoding via type chains + - Codecs handle all custom encoding via type chains - UUID values are converted to bytes - JSON values are serialized - Blob values pass through as bytes @@ -748,17 +748,17 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): attr = self.heading[name] # Apply adapter encoding with type chain support - if attr.adapter: - from .attribute_type import resolve_dtype + if attr.codec: + from .codecs import resolve_dtype # Skip validation and encoding for None values (nullable columns) if value is None: return name, "DEFAULT", None - attr.adapter.validate(value) + attr.codec.validate(value) # Resolve full type chain - _, type_chain, resolved_store = resolve_dtype(f"<{attr.adapter.type_name}>", store_name=attr.store) + _, type_chain, resolved_store = resolve_dtype(f"<{attr.codec.name}>", store_name=attr.store) # Apply encoders from outermost to innermost for attr_type in type_chain: @@ -790,7 +790,7 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): # Numeric - convert to string elif attr.numeric: value = str(int(value) if isinstance(value, bool) else value) - # Blob - pass through as bytes (use for automatic serialization) + # Blob - pass through as bytes (use for automatic serialization) return name, placeholder, value diff --git a/src/datajoint/version.py b/src/datajoint/version.py index 200fd9ba..4684015a 100644 --- a/src/datajoint/version.py +++ b/src/datajoint/version.py @@ -1,4 +1,4 @@ # version bump auto managed by Github Actions: # label_prs.yaml(prep), release.yaml(bump), post_release.yaml(edit) # manually set this version will be eventually overwritten by the above actions -__version__ = "2.0.0a5" +__version__ = "2.0.0a9" diff --git a/tests/conftest.py b/tests/conftest.py index d6440423..14b848d4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,16 +1,19 @@ """ Pytest configuration for DataJoint tests. -Expects MySQL and MinIO services to be running via docker-compose: - docker-compose up -d db minio - -Environment variables (with defaults from docker-compose.yaml): - DJ_HOST=db MySQL host - DJ_USER=root MySQL root user - DJ_PASS=password MySQL root password - S3_ENDPOINT=minio:9000 MinIO endpoint - S3_ACCESS_KEY=datajoint MinIO access key - S3_SECRET_KEY=datajoint MinIO secret key +Tests are organized by their dependencies: +- Unit tests: No external dependencies, run with `pytest -m "not requires_mysql"` +- Integration tests: Require MySQL/MinIO, marked with @pytest.mark.requires_mysql + +Containers are automatically started via testcontainers when needed. +Just run: pytest tests/ + +To use external containers instead (e.g., docker-compose), set: + DJ_USE_EXTERNAL_CONTAINERS=1 + DJ_HOST=localhost DJ_PORT=3306 S3_ENDPOINT=localhost:9000 pytest + +To run only unit tests (no Docker required): + pytest -m "not requires_mysql" """ import logging @@ -21,13 +24,9 @@ import certifi import pytest import urllib3 -from packaging import version import datajoint as dj -from datajoint.errors import ( - FILEPATH_FEATURE_SWITCH, - DataJointError, -) +from datajoint.errors import DataJointError from . import schema, schema_advanced, schema_external, schema_object, schema_simple from . 
import schema_uuid as schema_uuid_module @@ -36,7 +35,120 @@ logger = logging.getLogger(__name__) -# --- Database connection fixtures --- +# ============================================================================= +# Pytest Hooks +# ============================================================================= + + +def pytest_collection_modifyitems(config, items): + """Auto-mark integration tests based on their fixtures.""" + # Tests that use these fixtures require MySQL + mysql_fixtures = { + "connection_root", + "connection_root_bare", + "connection_test", + "schema_any", + "schema_any_fresh", + "schema_simp", + "schema_adv", + "schema_ext", + "schema_uuid", + "schema_type_aliases", + "schema_obj", + "db_creds_root", + "db_creds_test", + } + # Tests that use these fixtures require MinIO + minio_fixtures = { + "minio_client", + "s3fs_client", + "s3_creds", + "stores_config", + "mock_stores", + } + + for item in items: + # Get all fixtures this test uses (directly or indirectly) + try: + fixturenames = set(item.fixturenames) + except AttributeError: + continue + + # Auto-add marks based on fixture usage + if fixturenames & mysql_fixtures: + item.add_marker(pytest.mark.requires_mysql) + if fixturenames & minio_fixtures: + item.add_marker(pytest.mark.requires_minio) + + +# ============================================================================= +# Container Fixtures - Auto-start MySQL and MinIO via testcontainers +# ============================================================================= + +# Check if we should use external containers (for CI or manual docker-compose) +USE_EXTERNAL_CONTAINERS = os.environ.get("DJ_USE_EXTERNAL_CONTAINERS", "").lower() in ("1", "true", "yes") + + +@pytest.fixture(scope="session") +def mysql_container(): + """Start MySQL container for the test session (or use external).""" + if USE_EXTERNAL_CONTAINERS: + # Use external container - return None, credentials come from env + logger.info("Using external MySQL container") + yield None + return + + from testcontainers.mysql import MySqlContainer + + container = MySqlContainer( + image="mysql:8.0", + username="root", + password="password", + dbname="test", + ) + container.start() + + host = container.get_container_host_ip() + port = container.get_exposed_port(3306) + logger.info(f"MySQL container started at {host}:{port}") + + yield container + + container.stop() + logger.info("MySQL container stopped") + + +@pytest.fixture(scope="session") +def minio_container(): + """Start MinIO container for the test session (or use external).""" + if USE_EXTERNAL_CONTAINERS: + # Use external container - return None, credentials come from env + logger.info("Using external MinIO container") + yield None + return + + from testcontainers.minio import MinioContainer + + container = MinioContainer( + image="minio/minio:latest", + access_key="datajoint", + secret_key="datajoint", + ) + container.start() + + host = container.get_container_host_ip() + port = container.get_exposed_port(9000) + logger.info(f"MinIO container started at {host}:{port}") + + yield container + + container.stop() + logger.info("MinIO container stopped") + + +# ============================================================================= +# Credential Fixtures - Derived from containers or environment +# ============================================================================= @pytest.fixture(scope="session") @@ -45,45 +157,92 @@ def prefix(): @pytest.fixture(scope="session") -def db_creds_root() -> Dict: - """Root database credentials from 
environment.""" - host = os.environ.get("DJ_HOST", "db") - port = os.environ.get("DJ_PORT", "3306") - return dict( - host=f"{host}:{port}" if port else host, - user=os.environ.get("DJ_USER", "root"), - password=os.environ.get("DJ_PASS", "password"), - ) +def db_creds_root(mysql_container) -> Dict: + """Root database credentials from container or environment.""" + if mysql_container is not None: + # From testcontainer + host = mysql_container.get_container_host_ip() + port = mysql_container.get_exposed_port(3306) + return dict( + host=f"{host}:{port}", + user="root", + password="password", + ) + else: + # From environment (external container) + host = os.environ.get("DJ_HOST", "localhost") + port = os.environ.get("DJ_PORT", "3306") + return dict( + host=f"{host}:{port}" if port else host, + user=os.environ.get("DJ_USER", "root"), + password=os.environ.get("DJ_PASS", "password"), + ) @pytest.fixture(scope="session") -def db_creds_test() -> Dict: - """Test user database credentials from environment.""" - host = os.environ.get("DJ_HOST", "db") - port = os.environ.get("DJ_PORT", "3306") - return dict( - host=f"{host}:{port}" if port else host, - user=os.environ.get("DJ_TEST_USER", "datajoint"), - password=os.environ.get("DJ_TEST_PASSWORD", "datajoint"), - ) +def db_creds_test(mysql_container) -> Dict: + """Test user database credentials from container or environment.""" + if mysql_container is not None: + # From testcontainer + host = mysql_container.get_container_host_ip() + port = mysql_container.get_exposed_port(3306) + return dict( + host=f"{host}:{port}", + user="datajoint", + password="datajoint", + ) + else: + # From environment (external container) + host = os.environ.get("DJ_HOST", "localhost") + port = os.environ.get("DJ_PORT", "3306") + return dict( + host=f"{host}:{port}" if port else host, + user=os.environ.get("DJ_TEST_USER", "datajoint"), + password=os.environ.get("DJ_TEST_PASSWORD", "datajoint"), + ) @pytest.fixture(scope="session") -def s3_creds() -> Dict: - """S3/MinIO credentials from environment.""" - return dict( - endpoint=os.environ.get("S3_ENDPOINT", "minio:9000"), - access_key=os.environ.get("S3_ACCESS_KEY", "datajoint"), - secret_key=os.environ.get("S3_SECRET_KEY", "datajoint"), - bucket=os.environ.get("S3_BUCKET", "datajoint.test"), - ) +def s3_creds(minio_container) -> Dict: + """S3/MinIO credentials from container or environment.""" + if minio_container is not None: + # From testcontainer + host = minio_container.get_container_host_ip() + port = minio_container.get_exposed_port(9000) + return dict( + endpoint=f"{host}:{port}", + access_key="datajoint", + secret_key="datajoint", + bucket="datajoint.test", + ) + else: + # From environment (external container) + return dict( + endpoint=os.environ.get("S3_ENDPOINT", "localhost:9000"), + access_key=os.environ.get("S3_ACCESS_KEY", "datajoint"), + secret_key=os.environ.get("S3_SECRET_KEY", "datajoint"), + bucket=os.environ.get("S3_BUCKET", "datajoint.test"), + ) + + +# ============================================================================= +# DataJoint Configuration +# ============================================================================= -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def configure_datajoint(db_creds_root): - """Configure DataJoint to use docker-compose services.""" - host = os.environ.get("DJ_HOST", "db") - port = os.environ.get("DJ_PORT", "3306") + """Configure DataJoint to use test database. 
+ + This fixture is NOT autouse - it only runs when a test requests + a fixture that depends on it (e.g., connection_root_bare). + """ + # Parse host:port from credentials + host_port = db_creds_root["host"] + if ":" in host_port: + host, port = host_port.rsplit(":", 1) + else: + host, port = host_port, "3306" dj.config["database.host"] = host dj.config["database.port"] = int(port) @@ -92,8 +251,13 @@ def configure_datajoint(db_creds_root): logger.info(f"Configured DataJoint to use MySQL at {host}:{port}") +# ============================================================================= +# Connection Fixtures +# ============================================================================= + + @pytest.fixture(scope="session") -def connection_root_bare(db_creds_root): +def connection_root_bare(db_creds_root, configure_datajoint): """Bare root connection without user setup.""" connection = dj.Connection(**db_creds_root) yield connection @@ -104,45 +268,29 @@ def connection_root(connection_root_bare, prefix): """Root database connection with test users created.""" conn_root = connection_root_bare - # Create MySQL users - if version.parse(conn_root.query("select @@version;").fetchone()[0]) >= version.parse("8.0.0"): - conn_root.query( - """ - CREATE USER IF NOT EXISTS 'datajoint'@'%%' - IDENTIFIED BY 'datajoint'; - """ - ) - conn_root.query( - """ - CREATE USER IF NOT EXISTS 'djview'@'%%' - IDENTIFIED BY 'djview'; - """ - ) - conn_root.query( - """ - CREATE USER IF NOT EXISTS 'djssl'@'%%' - IDENTIFIED BY 'djssl' - REQUIRE SSL; - """ - ) - conn_root.query("GRANT ALL PRIVILEGES ON `djtest%%`.* TO 'datajoint'@'%%';") - conn_root.query("GRANT SELECT ON `djtest%%`.* TO 'djview'@'%%';") - conn_root.query("GRANT SELECT ON `djtest%%`.* TO 'djssl'@'%%';") - else: - conn_root.query( - """ - GRANT ALL PRIVILEGES ON `djtest%%`.* TO 'datajoint'@'%%' - IDENTIFIED BY 'datajoint'; - """ - ) - conn_root.query("GRANT SELECT ON `djtest%%`.* TO 'djview'@'%%' IDENTIFIED BY 'djview';") - conn_root.query( - """ - GRANT SELECT ON `djtest%%`.* TO 'djssl'@'%%' - IDENTIFIED BY 'djssl' - REQUIRE SSL; - """ - ) + # Create MySQL users (MySQL 8.0+ syntax - we only support 8.0+) + conn_root.query( + """ + CREATE USER IF NOT EXISTS 'datajoint'@'%%' + IDENTIFIED BY 'datajoint'; + """ + ) + conn_root.query( + """ + CREATE USER IF NOT EXISTS 'djview'@'%%' + IDENTIFIED BY 'djview'; + """ + ) + conn_root.query( + """ + CREATE USER IF NOT EXISTS 'djssl'@'%%' + IDENTIFIED BY 'djssl' + REQUIRE SSL; + """ + ) + conn_root.query("GRANT ALL PRIVILEGES ON `djtest%%`.* TO 'datajoint'@'%%';") + conn_root.query("GRANT SELECT ON `djtest%%`.* TO 'djview'@'%%';") + conn_root.query("GRANT SELECT ON `djtest%%`.* TO 'djssl'@'%%';") yield conn_root @@ -167,27 +315,19 @@ def connection_test(connection_root, prefix, db_creds_test): database = f"{prefix}%%" permission = "ALL PRIVILEGES" - if version.parse(connection_root.query("select @@version;").fetchone()[0]) >= version.parse("8.0.0"): - connection_root.query( - f""" - CREATE USER IF NOT EXISTS '{db_creds_test["user"]}'@'%%' - IDENTIFIED BY '{db_creds_test["password"]}'; - """ - ) - connection_root.query( - f""" - GRANT {permission} ON `{database}`.* - TO '{db_creds_test["user"]}'@'%%'; - """ - ) - else: - connection_root.query( - f""" - GRANT {permission} ON `{database}`.* - TO '{db_creds_test["user"]}'@'%%' - IDENTIFIED BY '{db_creds_test["password"]}'; - """ - ) + # MySQL 8.0+ syntax + connection_root.query( + f""" + CREATE USER IF NOT EXISTS '{db_creds_test["user"]}'@'%%' + IDENTIFIED BY 
'{db_creds_test["password"]}'; + """ + ) + connection_root.query( + f""" + GRANT {permission} ON `{database}`.* + TO '{db_creds_test["user"]}'@'%%'; + """ + ) connection = dj.Connection(**db_creds_test) yield connection @@ -195,7 +335,9 @@ def connection_test(connection_root, prefix, db_creds_test): connection.close() -# --- S3/MinIO fixtures --- +# ============================================================================= +# S3/MinIO Fixtures +# ============================================================================= @pytest.fixture(scope="session") @@ -312,7 +454,9 @@ def minio_client(s3_creds, s3fs_client, teardown=False): pass -# --- Utility fixtures --- +# ============================================================================= +# Utility Fixtures +# ============================================================================= @pytest.fixture(scope="session") @@ -333,14 +477,9 @@ def enable_adapted_types(): yield -@pytest.fixture -def enable_filepath_feature(monkeypatch): - monkeypatch.setenv(FILEPATH_FEATURE_SWITCH, "TRUE") - yield - monkeypatch.delenv(FILEPATH_FEATURE_SWITCH, raising=True) - - -# --- Cleanup fixtures --- +# ============================================================================= +# Cleanup Fixtures +# ============================================================================= @pytest.fixture @@ -374,7 +513,9 @@ def clean_test_tables(test, test_extra, test_no_extra): test_no_extra.delete() -# --- Schema fixtures --- +# ============================================================================= +# Schema Fixtures +# ============================================================================= @pytest.fixture(scope="module") @@ -554,7 +695,7 @@ def schema_adv(connection_test, prefix): @pytest.fixture -def schema_ext(connection_test, enable_filepath_feature, mock_stores, mock_cache, prefix): +def schema_ext(connection_test, mock_stores, mock_cache, prefix): schema = dj.Schema( prefix + "_extern", context=schema_external.LOCALS_EXTERNAL, @@ -601,7 +742,9 @@ def schema_type_aliases(connection_test, prefix): schema.drop() -# --- Table fixtures --- +# ============================================================================= +# Table Fixtures +# ============================================================================= @pytest.fixture @@ -677,7 +820,9 @@ def trash(schema_any): return schema.UberTrash() -# --- Object storage fixtures --- +# ============================================================================= +# Object Storage Fixtures +# ============================================================================= @pytest.fixture @@ -701,6 +846,7 @@ def mock_object_storage(object_storage_config): "protocol": dj.config.object_storage.protocol, "location": dj.config.object_storage.location, "token_length": dj.config.object_storage.token_length, + "stores": dict(dj.config.object_storage.stores), } # Set test values @@ -709,6 +855,12 @@ def mock_object_storage(object_storage_config): dj.config.object_storage.location = object_storage_config["location"] dj.config.object_storage.token_length = object_storage_config.get("token_length", 8) + # Configure 'local' store using same location + dj.config.object_storage.stores["local"] = { + "protocol": "file", + "location": object_storage_config["location"], + } + yield object_storage_config # Restore original values @@ -716,6 +868,8 @@ def mock_object_storage(object_storage_config): dj.config.object_storage.protocol = original["protocol"] dj.config.object_storage.location = original["location"] 
dj.config.object_storage.token_length = original["token_length"] + dj.config.object_storage.stores.clear() + dj.config.object_storage.stores.update(original["stores"]) @pytest.fixture diff --git a/tests/integration/test_autopopulate.py b/tests/integration/test_autopopulate.py index 6bde3b49..de9dc95a 100644 --- a/tests/integration/test_autopopulate.py +++ b/tests/integration/test_autopopulate.py @@ -121,7 +121,7 @@ class Image(dj.Imported): definition = """ -> ImageSource --- - image_data: + image_data: """ def make(self, key): @@ -134,7 +134,7 @@ class Crop(dj.Computed): definition = """ -> Image --- - crop_image: + crop_image: """ def make(self, key): diff --git a/tests/integration/test_blob_matlab.py b/tests/integration/test_blob_matlab.py index 8e5e9235..07f42660 100644 --- a/tests/integration/test_blob_matlab.py +++ b/tests/integration/test_blob_matlab.py @@ -11,7 +11,7 @@ class Blob(dj.Manual): id : int ----- comment : varchar(255) - blob : + blob : """ diff --git a/tests/integration/test_codec_chaining.py b/tests/integration/test_codec_chaining.py new file mode 100644 index 00000000..defbd428 --- /dev/null +++ b/tests/integration/test_codec_chaining.py @@ -0,0 +1,368 @@ +""" +Tests for codec chaining (composition). + +This tests the → json composition pattern +and similar codec chains. +""" + +from datajoint.codecs import ( + Codec, + _codec_registry, + resolve_dtype, +) + + +class TestCodecChainResolution: + """Tests for resolving codec chains.""" + + def setup_method(self): + """Clear test codecs from registry before each test.""" + for name in list(_codec_registry.keys()): + if name.startswith("test_"): + del _codec_registry[name] + + def teardown_method(self): + """Clean up test codecs after each test.""" + for name in list(_codec_registry.keys()): + if name.startswith("test_"): + del _codec_registry[name] + + def test_single_codec_chain(self): + """Test resolving a single-codec chain.""" + + class TestSingle(Codec): + name = "test_single" + + def get_dtype(self, is_external: bool) -> str: + return "varchar(100)" + + def encode(self, value, *, key=None, store_name=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "varchar(100)" + assert len(chain) == 1 + assert chain[0].name == "test_single" + assert store is None + + def test_two_codec_chain(self): + """Test resolving a two-codec chain.""" + + class TestInner(Codec): + name = "test_inner" + + def get_dtype(self, is_external: bool) -> str: + return "bytes" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + class TestOuter(Codec): + name = "test_outer" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "bytes" + assert len(chain) == 2 + assert chain[0].name == "test_outer" + assert chain[1].name == "test_inner" + + def test_three_codec_chain(self): + """Test resolving a three-codec chain.""" + + class TestBase(Codec): + name = "test_base" + + def get_dtype(self, is_external: bool) -> str: + return "json" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + class TestMiddle(Codec): + name = "test_middle" + + def get_dtype(self, 
is_external: bool) -> str: + return "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + class TestTop(Codec): + name = "test_top" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 3 + assert chain[0].name == "test_top" + assert chain[1].name == "test_middle" + assert chain[2].name == "test_base" + + +class TestCodecChainEncodeDecode: + """Tests for encode/decode through codec chains.""" + + def setup_method(self): + """Clear test codecs from registry before each test.""" + for name in list(_codec_registry.keys()): + if name.startswith("test_"): + del _codec_registry[name] + + def teardown_method(self): + """Clean up test codecs after each test.""" + for name in list(_codec_registry.keys()): + if name.startswith("test_"): + del _codec_registry[name] + + def test_encode_order(self): + """Test that encode is applied outer → inner.""" + encode_order = [] + + class TestInnerEnc(Codec): + name = "test_inner_enc" + + def get_dtype(self, is_external: bool) -> str: + return "bytes" + + def encode(self, value, *, key=None, store_name=None): + encode_order.append("inner") + return value + b"_inner" + + def decode(self, stored, *, key=None): + return stored + + class TestOuterEnc(Codec): + name = "test_outer_enc" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, value, *, key=None, store_name=None): + encode_order.append("outer") + return value + b"_outer" + + def decode(self, stored, *, key=None): + return stored + + _, chain, _ = resolve_dtype("") + + # Apply encode in order: outer first, then inner + value = b"start" + for codec in chain: + value = codec.encode(value) + + assert encode_order == ["outer", "inner"] + assert value == b"start_outer_inner" + + def test_decode_order(self): + """Test that decode is applied inner → outer (reverse of encode).""" + decode_order = [] + + class TestInnerDec(Codec): + name = "test_inner_dec" + + def get_dtype(self, is_external: bool) -> str: + return "bytes" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + decode_order.append("inner") + return stored.replace(b"_inner", b"") + + class TestOuterDec(Codec): + name = "test_outer_dec" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + decode_order.append("outer") + return stored.replace(b"_outer", b"") + + _, chain, _ = resolve_dtype("") + + # Apply decode in reverse order: inner first, then outer + value = b"start_outer_inner" + for codec in reversed(chain): + value = codec.decode(value) + + assert decode_order == ["inner", "outer"] + assert value == b"start" + + def test_roundtrip(self): + """Test encode/decode roundtrip through a codec chain.""" + + class TestInnerRt(Codec): + name = "test_inner_rt" + + def get_dtype(self, is_external: bool) -> str: + return "bytes" + + def encode(self, value, *, key=None, store_name=None): + # Compress (just add prefix for testing) + return b"COMPRESSED:" + value + + def decode(self, stored, *, key=None): + # Decompress + return stored.replace(b"COMPRESSED:", b"") + + class TestOuterRt(Codec): + name = 
"test_outer_rt" + + def get_dtype(self, is_external: bool) -> str: + return "" + + def encode(self, value, *, key=None, store_name=None): + # Serialize (just encode string for testing) + return str(value).encode("utf-8") + + def decode(self, stored, *, key=None): + # Deserialize + return stored.decode("utf-8") + + _, chain, _ = resolve_dtype("") + + # Original value + original = "test data" + + # Encode: outer → inner + encoded = original + for codec in chain: + encoded = codec.encode(encoded) + + assert encoded == b"COMPRESSED:test data" + + # Decode: inner → outer (reversed) + decoded = encoded + for codec in reversed(chain): + decoded = codec.decode(decoded) + + assert decoded == original + + +class TestBuiltinCodecChains: + """Tests for built-in codec chains.""" + + def test_blob_internal_resolves_to_bytes(self): + """Test that (internal) → bytes.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "bytes" + assert len(chain) == 1 + assert chain[0].name == "blob" + + def test_blob_external_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 2 + assert chain[0].name == "blob" + assert chain[1].name == "hash" + assert store == "store" + + def test_attach_internal_resolves_to_bytes(self): + """Test that (internal) → bytes.""" + final_dtype, chain, _ = resolve_dtype("") + + assert final_dtype == "bytes" + assert len(chain) == 1 + assert chain[0].name == "attach" + + def test_attach_external_resolves_to_json(self): + """Test that → json.""" + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 2 + assert chain[0].name == "attach" + assert chain[1].name == "hash" + assert store == "store" + + def test_hash_external_resolves_to_json(self): + """Test that → json (external only).""" + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 1 + assert chain[0].name == "hash" + assert store == "store" + + def test_object_external_resolves_to_json(self): + """Test that → json (external only).""" + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 1 + assert chain[0].name == "object" + assert store == "store" + + def test_filepath_external_resolves_to_json(self): + """Test that → json (external only).""" + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert len(chain) == 1 + assert chain[0].name == "filepath" + assert store == "store" + + +class TestStoreNameParsing: + """Tests for store name parsing in codec specs.""" + + def test_codec_with_store(self): + """Test parsing codec with store name.""" + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert store == "mystore" + + def test_codec_without_store(self): + """Test parsing codec without store name.""" + final_dtype, chain, store = resolve_dtype("") + + assert store is None + + def test_filepath_with_store(self): + """Test parsing filepath with store name.""" + final_dtype, chain, store = resolve_dtype("") + + assert final_dtype == "json" + assert store == "s3store" diff --git a/tests/integration/test_adapted_attributes.py b/tests/integration/test_codecs.py similarity index 69% rename from tests/integration/test_adapted_attributes.py rename to tests/integration/test_codecs.py index ee88c6fc..05b8aabe 100644 --- a/tests/integration/test_adapted_attributes.py +++ b/tests/integration/test_codecs.py @@ -1,7 +1,7 
@@ """ -Tests for adapted/custom attribute types. +Tests for custom codecs. -These tests verify the AttributeType system for custom data types. +These tests verify the Codec system for custom data types. """ from itertools import zip_longest @@ -11,51 +11,51 @@ import datajoint as dj -from tests import schema_adapted -from tests.schema_adapted import Connectivity, Layout +from tests import schema_codecs +from tests.schema_codecs import Connectivity, Layout @pytest.fixture def schema_name(prefix): - return prefix + "_test_custom_datatype" + return prefix + "_test_codecs" @pytest.fixture -def schema_ad( +def schema_codec( connection_test, - enable_filepath_feature, s3_creds, tmpdir, schema_name, ): - dj.config["stores"] = {"repo-s3": dict(s3_creds, protocol="s3", location="adapted/repo", stage=str(tmpdir))} - # Types are registered globally via @dj.register_type decorator in schema_adapted - context = {**schema_adapted.LOCALS_ADAPTED} + dj.config["stores"] = {"repo-s3": dict(s3_creds, protocol="s3", location="codecs/repo", stage=str(tmpdir))} + # Codecs are auto-registered via __init_subclass__ in schema_codecs + context = {**schema_codecs.LOCALS_CODECS} schema = dj.schema(schema_name, context=context, connection=connection_test) - schema(schema_adapted.Connectivity) - schema(schema_adapted.Layout) + schema(schema_codecs.Connectivity) + schema(schema_codecs.Layout) yield schema schema.drop() @pytest.fixture -def local_schema(schema_ad, schema_name): +def local_schema(schema_codec, schema_name): """Fixture for testing spawned classes""" - local_schema = dj.Schema(schema_name, connection=schema_ad.connection) + local_schema = dj.Schema(schema_name, connection=schema_codec.connection) local_schema.spawn_missing_classes() yield local_schema - # Don't drop - schema_ad fixture handles cleanup + # Don't drop - schema_codec fixture handles cleanup @pytest.fixture -def schema_virtual_module(schema_ad, schema_name): +def schema_virtual_module(schema_codec, schema_name): """Fixture for testing virtual modules""" - # Types are registered globally, no need to add_objects for adapters - schema_virtual_module = dj.VirtualModule("virtual_module", schema_name, connection=schema_ad.connection) + # Codecs are registered globally, no need to add_objects + schema_virtual_module = dj.VirtualModule("virtual_module", schema_name, connection=schema_codec.connection) return schema_virtual_module -def test_adapted_type(schema_ad): +def test_codec_graph(schema_codec): + """Test basic codec encode/decode with graph type.""" c = Connectivity() graphs = [ nx.lollipop_graph(4, 2), @@ -72,8 +72,8 @@ def test_adapted_type(schema_ad): c.delete() -def test_adapted_filepath_type(schema_ad, minio_client): - """https://github.com/datajoint/datajoint-python/issues/684""" +def test_codec_chained(schema_codec, minio_client): + """Test codec chaining (layout -> blob).""" c = Connectivity() c.delete() c.insert1((0, nx.lollipop_graph(4, 2))) @@ -89,7 +89,8 @@ def test_adapted_filepath_type(schema_ad, minio_client): c.delete() -def test_adapted_spawned(local_schema): +def test_codec_spawned(local_schema): + """Test codecs work with spawned classes.""" c = Connectivity() # a spawned class graphs = [ nx.lollipop_graph(4, 2), @@ -106,7 +107,8 @@ def test_adapted_spawned(local_schema): c.delete() -def test_adapted_virtual(schema_virtual_module): +def test_codec_virtual_module(schema_virtual_module): + """Test codecs work with virtual modules.""" c = schema_virtual_module.Connectivity() graphs = [ nx.lollipop_graph(4, 2), diff --git 
a/tests/integration/test_fetch_same.py b/tests/integration/test_fetch_same.py index ad830616..886af2b9 100644 --- a/tests/integration/test_fetch_same.py +++ b/tests/integration/test_fetch_same.py @@ -10,7 +10,7 @@ class ProjData(dj.Manual): --- resp : float sim : float - big : + big : blah : varchar(10) """ diff --git a/tests/integration/test_gc.py b/tests/integration/test_gc.py index 2c312bcc..e0c5fafc 100644 --- a/tests/integration/test_gc.py +++ b/tests/integration/test_gc.py @@ -14,41 +14,45 @@ class TestUsesContentStorage: """Tests for _uses_content_storage helper function.""" def test_returns_false_for_no_adapter(self): - """Test that False is returned when attribute has no adapter.""" + """Test that False is returned when attribute has no codec.""" attr = MagicMock() - attr.adapter = None + attr.codec = None assert gc._uses_content_storage(attr) is False - def test_returns_true_for_content_type(self): - """Test that True is returned for type.""" + def test_returns_true_for_hash_type(self): + """Test that True is returned for type.""" attr = MagicMock() - attr.adapter = MagicMock() - attr.adapter.type_name = "content" + attr.codec = MagicMock() + attr.codec.name = "hash" + attr.store = "mystore" assert gc._uses_content_storage(attr) is True - def test_returns_true_for_xblob_type(self): - """Test that True is returned for type.""" + def test_returns_true_for_blob_external(self): + """Test that True is returned for type (external).""" attr = MagicMock() - attr.adapter = MagicMock() - attr.adapter.type_name = "xblob" + attr.codec = MagicMock() + attr.codec.name = "blob" + attr.store = "mystore" assert gc._uses_content_storage(attr) is True - def test_returns_true_for_xattach_type(self): - """Test that True is returned for type.""" + def test_returns_true_for_attach_external(self): + """Test that True is returned for type (external).""" attr = MagicMock() - attr.adapter = MagicMock() - attr.adapter.type_name = "xattach" + attr.codec = MagicMock() + attr.codec.name = "attach" + attr.store = "mystore" assert gc._uses_content_storage(attr) is True - def test_returns_false_for_other_types(self): - """Test that False is returned for non-content types.""" + def test_returns_false_for_blob_internal(self): + """Test that False is returned for internal storage.""" attr = MagicMock() - attr.adapter = MagicMock() - attr.adapter.type_name = "djblob" + attr.codec = MagicMock() + attr.codec.name = "blob" + attr.store = None assert gc._uses_content_storage(attr) is False @@ -89,25 +93,25 @@ class TestUsesObjectStorage: """Tests for _uses_object_storage helper function.""" def test_returns_false_for_no_adapter(self): - """Test that False is returned when attribute has no adapter.""" + """Test that False is returned when attribute has no codec.""" attr = MagicMock() - attr.adapter = None + attr.codec = None assert gc._uses_object_storage(attr) is False def test_returns_true_for_object_type(self): """Test that True is returned for type.""" attr = MagicMock() - attr.adapter = MagicMock() - attr.adapter.type_name = "object" + attr.codec = MagicMock() + attr.codec.name = "object" assert gc._uses_object_storage(attr) is True def test_returns_false_for_other_types(self): """Test that False is returned for non-object types.""" attr = MagicMock() - attr.adapter = MagicMock() - attr.adapter.type_name = "xblob" + attr.codec = MagicMock() + attr.codec.name = "blob" assert gc._uses_object_storage(attr) is False diff --git a/tests/integration/test_relational_operand.py b/tests/integration/test_relational_operand.py 
index d6580ee8..3f15a731 100644 --- a/tests/integration/test_relational_operand.py +++ b/tests/integration/test_relational_operand.py @@ -561,30 +561,42 @@ def test_restrictions_by_top(self, schema_simp_pop): ] def test_top_restriction_with_keywords(self, schema_simp_pop): + # dj.Top only guarantees which elements are selected, not their order select = SelectPK() & dj.Top(limit=9, order_by=["select desc"]) key = KeyPK() & dj.Top(limit=9, order_by="key desc") - assert select.fetch(as_dict=True) == [ - {"id": 2, "select": 8}, - {"id": 2, "select": 6}, - {"id": 1, "select": 4}, - {"id": 2, "select": 4}, - {"id": 1, "select": 3}, - {"id": 1, "select": 2}, - {"id": 2, "select": 2}, - {"id": 1, "select": 1}, - {"id": 0, "select": 0}, - ] - assert key.fetch(as_dict=True) == [ - {"id": 2, "key": 6}, - {"id": 2, "key": 5}, - {"id": 1, "key": 5}, - {"id": 0, "key": 4}, - {"id": 1, "key": 4}, - {"id": 2, "key": 4}, - {"id": 0, "key": 3}, - {"id": 1, "key": 3}, - {"id": 2, "key": 3}, - ] + # Convert to sets of tuples for order-independent comparison + select_result = {tuple(sorted(d.items())) for d in select.fetch(as_dict=True)} + select_expected = { + tuple(sorted(d.items())) + for d in [ + {"id": 2, "select": 8}, + {"id": 2, "select": 6}, + {"id": 1, "select": 4}, + {"id": 2, "select": 4}, + {"id": 1, "select": 3}, + {"id": 1, "select": 2}, + {"id": 2, "select": 2}, + {"id": 1, "select": 1}, + {"id": 0, "select": 0}, + ] + } + assert select_result == select_expected + key_result = {tuple(sorted(d.items())) for d in key.fetch(as_dict=True)} + key_expected = { + tuple(sorted(d.items())) + for d in [ + {"id": 2, "key": 6}, + {"id": 2, "key": 5}, + {"id": 1, "key": 5}, + {"id": 0, "key": 4}, + {"id": 1, "key": 4}, + {"id": 2, "key": 4}, + {"id": 0, "key": 3}, + {"id": 1, "key": 3}, + {"id": 2, "key": 3}, + ] + } + assert key_result == key_expected def test_top_errors(self, schema_simp_pop): with pytest.raises(DataJointError) as err1: diff --git a/tests/integration/test_schema.py b/tests/integration/test_schema.py index 36a15493..d463ccf4 100644 --- a/tests/integration/test_schema.py +++ b/tests/integration/test_schema.py @@ -66,8 +66,10 @@ def test_schema_list(schema_any): assert schema_any.database in schemas -def test_drop_unauthorized(): - info_schema = dj.schema("information_schema") +@pytest.mark.requires_mysql +def test_drop_unauthorized(connection_test): + """Test that dropping information_schema raises AccessError.""" + info_schema = dj.schema("information_schema", connection=connection_test) with pytest.raises(dj.errors.AccessError): info_schema.drop() diff --git a/tests/integration/test_type_composition.py b/tests/integration/test_type_composition.py deleted file mode 100644 index 0b51b3d6..00000000 --- a/tests/integration/test_type_composition.py +++ /dev/null @@ -1,352 +0,0 @@ -""" -Tests for type composition (type chain encoding/decoding). - -This tests the → json composition pattern -and similar type chains. 
-""" - -from datajoint.attribute_type import ( - AttributeType, - _type_registry, - register_type, - resolve_dtype, -) - - -class TestTypeChainResolution: - """Tests for resolving type chains.""" - - def setup_method(self): - """Clear test types from registry before each test.""" - for name in list(_type_registry.keys()): - if name.startswith("test_"): - del _type_registry[name] - - def teardown_method(self): - """Clean up test types after each test.""" - for name in list(_type_registry.keys()): - if name.startswith("test_"): - del _type_registry[name] - - def test_single_type_chain(self): - """Test resolving a single-type chain.""" - - @register_type - class TestSingle(AttributeType): - type_name = "test_single" - dtype = "varchar(100)" - - def encode(self, value, *, key=None, store_name=None): - return str(value) - - def decode(self, stored, *, key=None): - return stored - - final_dtype, chain, store = resolve_dtype("") - - assert final_dtype == "varchar(100)" - assert len(chain) == 1 - assert chain[0].type_name == "test_single" - assert store is None - - def test_two_type_chain(self): - """Test resolving a two-type chain.""" - - @register_type - class TestInner(AttributeType): - type_name = "test_inner" - dtype = "longblob" - - def encode(self, value, *, key=None, store_name=None): - return value - - def decode(self, stored, *, key=None): - return stored - - @register_type - class TestOuter(AttributeType): - type_name = "test_outer" - dtype = "" - - def encode(self, value, *, key=None, store_name=None): - return value - - def decode(self, stored, *, key=None): - return stored - - final_dtype, chain, store = resolve_dtype("") - - assert final_dtype == "longblob" - assert len(chain) == 2 - assert chain[0].type_name == "test_outer" - assert chain[1].type_name == "test_inner" - - def test_three_type_chain(self): - """Test resolving a three-type chain.""" - - @register_type - class TestBase(AttributeType): - type_name = "test_base" - dtype = "json" - - def encode(self, value, *, key=None, store_name=None): - return value - - def decode(self, stored, *, key=None): - return stored - - @register_type - class TestMiddle(AttributeType): - type_name = "test_middle" - dtype = "" - - def encode(self, value, *, key=None, store_name=None): - return value - - def decode(self, stored, *, key=None): - return stored - - @register_type - class TestTop(AttributeType): - type_name = "test_top" - dtype = "" - - def encode(self, value, *, key=None, store_name=None): - return value - - def decode(self, stored, *, key=None): - return stored - - final_dtype, chain, store = resolve_dtype("") - - assert final_dtype == "json" - assert len(chain) == 3 - assert chain[0].type_name == "test_top" - assert chain[1].type_name == "test_middle" - assert chain[2].type_name == "test_base" - - -class TestTypeChainEncodeDecode: - """Tests for encode/decode through type chains.""" - - def setup_method(self): - """Clear test types from registry before each test.""" - for name in list(_type_registry.keys()): - if name.startswith("test_"): - del _type_registry[name] - - def teardown_method(self): - """Clean up test types after each test.""" - for name in list(_type_registry.keys()): - if name.startswith("test_"): - del _type_registry[name] - - def test_encode_order(self): - """Test that encode is applied outer → inner.""" - encode_order = [] - - @register_type - class TestInnerEnc(AttributeType): - type_name = "test_inner_enc" - dtype = "longblob" - - def encode(self, value, *, key=None, store_name=None): - 
encode_order.append("inner") - return value + b"_inner" - - def decode(self, stored, *, key=None): - return stored - - @register_type - class TestOuterEnc(AttributeType): - type_name = "test_outer_enc" - dtype = "" - - def encode(self, value, *, key=None, store_name=None): - encode_order.append("outer") - return value + b"_outer" - - def decode(self, stored, *, key=None): - return stored - - _, chain, _ = resolve_dtype("") - - # Apply encode in order: outer first, then inner - value = b"start" - for attr_type in chain: - value = attr_type.encode(value) - - assert encode_order == ["outer", "inner"] - assert value == b"start_outer_inner" - - def test_decode_order(self): - """Test that decode is applied inner → outer (reverse of encode).""" - decode_order = [] - - @register_type - class TestInnerDec(AttributeType): - type_name = "test_inner_dec" - dtype = "longblob" - - def encode(self, value, *, key=None, store_name=None): - return value - - def decode(self, stored, *, key=None): - decode_order.append("inner") - return stored.replace(b"_inner", b"") - - @register_type - class TestOuterDec(AttributeType): - type_name = "test_outer_dec" - dtype = "" - - def encode(self, value, *, key=None, store_name=None): - return value - - def decode(self, stored, *, key=None): - decode_order.append("outer") - return stored.replace(b"_outer", b"") - - _, chain, _ = resolve_dtype("") - - # Apply decode in reverse order: inner first, then outer - value = b"start_outer_inner" - for attr_type in reversed(chain): - value = attr_type.decode(value) - - assert decode_order == ["inner", "outer"] - assert value == b"start" - - def test_roundtrip(self): - """Test encode/decode roundtrip through a type chain.""" - - @register_type - class TestInnerRt(AttributeType): - type_name = "test_inner_rt" - dtype = "longblob" - - def encode(self, value, *, key=None, store_name=None): - # Compress (just add prefix for testing) - return b"COMPRESSED:" + value - - def decode(self, stored, *, key=None): - # Decompress - return stored.replace(b"COMPRESSED:", b"") - - @register_type - class TestOuterRt(AttributeType): - type_name = "test_outer_rt" - dtype = "" - - def encode(self, value, *, key=None, store_name=None): - # Serialize (just encode string for testing) - return str(value).encode("utf-8") - - def decode(self, stored, *, key=None): - # Deserialize - return stored.decode("utf-8") - - _, chain, _ = resolve_dtype("") - - # Original value - original = "test data" - - # Encode: outer → inner - encoded = original - for attr_type in chain: - encoded = attr_type.encode(encoded) - - assert encoded == b"COMPRESSED:test data" - - # Decode: inner → outer (reversed) - decoded = encoded - for attr_type in reversed(chain): - decoded = attr_type.decode(decoded) - - assert decoded == original - - -class TestBuiltinTypeComposition: - """Tests for built-in type composition.""" - - def test_xblob_resolves_to_json(self): - """Test that → json.""" - final_dtype, chain, _ = resolve_dtype("") - - assert final_dtype == "json" - assert len(chain) == 2 - assert chain[0].type_name == "xblob" - assert chain[1].type_name == "content" - - def test_xattach_resolves_to_json(self): - """Test that → json.""" - final_dtype, chain, _ = resolve_dtype("") - - assert final_dtype == "json" - assert len(chain) == 2 - assert chain[0].type_name == "xattach" - assert chain[1].type_name == "content" - - def test_djblob_resolves_to_longblob(self): - """Test that → longblob (no chain).""" - final_dtype, chain, _ = resolve_dtype("") - - assert final_dtype == "longblob" - 
assert len(chain) == 1 - assert chain[0].type_name == "djblob" - - def test_content_resolves_to_json(self): - """Test that → json.""" - final_dtype, chain, _ = resolve_dtype("") - - assert final_dtype == "json" - assert len(chain) == 1 - assert chain[0].type_name == "content" - - def test_object_resolves_to_json(self): - """Test that → json.""" - final_dtype, chain, _ = resolve_dtype("") - - assert final_dtype == "json" - assert len(chain) == 1 - assert chain[0].type_name == "object" - - def test_attach_resolves_to_longblob(self): - """Test that → longblob.""" - final_dtype, chain, _ = resolve_dtype("") - - assert final_dtype == "longblob" - assert len(chain) == 1 - assert chain[0].type_name == "attach" - - def test_filepath_resolves_to_json(self): - """Test that → json.""" - final_dtype, chain, _ = resolve_dtype("") - - assert final_dtype == "json" - assert len(chain) == 1 - assert chain[0].type_name == "filepath" - - -class TestStoreNameParsing: - """Tests for store name parsing in type specs.""" - - def test_type_with_store(self): - """Test parsing type with store name.""" - final_dtype, chain, store = resolve_dtype("") - - assert final_dtype == "json" - assert store == "mystore" - - def test_type_without_store(self): - """Test parsing type without store name.""" - final_dtype, chain, store = resolve_dtype("") - - assert store is None - - def test_filepath_with_store(self): - """Test parsing filepath with store name.""" - final_dtype, chain, store = resolve_dtype("") - - assert final_dtype == "json" - assert store == "s3store" diff --git a/tests/integration/test_update1.py b/tests/integration/test_update1.py index d09f70c4..eb525a6b 100644 --- a/tests/integration/test_update1.py +++ b/tests/integration/test_update1.py @@ -14,8 +14,8 @@ class Thing(dj.Manual): --- number=0 : int frac : float - picture = null : - params = null : + picture = null : + params = null : img_file = null: timestamp = CURRENT_TIMESTAMP : datetime """ @@ -57,7 +57,7 @@ def schema_update1(connection_test, prefix): schema.drop() -def test_update1(tmpdir, enable_filepath_feature, schema_update1, mock_stores_update): +def test_update1(tmpdir, schema_update1, mock_stores_update): """Test normal updates""" # CHECK 1 -- initial insert key = dict(thing=1) @@ -128,19 +128,19 @@ def test_update1(tmpdir, enable_filepath_feature, schema_update1, mock_stores_up assert original_file_data == final_file_data -def test_update1_nonexistent(enable_filepath_feature, schema_update1, mock_stores_update): +def test_update1_nonexistent(schema_update1, mock_stores_update): with pytest.raises(DataJointError): # updating a non-existent entry Thing.update1(dict(thing=100, frac=0.5)) -def test_update1_noprimary(enable_filepath_feature, schema_update1, mock_stores_update): +def test_update1_noprimary(schema_update1, mock_stores_update): with pytest.raises(DataJointError): # missing primary key Thing.update1(dict(number=None)) -def test_update1_misspelled_attribute(enable_filepath_feature, schema_update1, mock_stores_update): +def test_update1_misspelled_attribute(schema_update1, mock_stores_update): key = dict(thing=17) Thing.insert1(dict(key, frac=1.5)) with pytest.raises(DataJointError): diff --git a/tests/schema.py b/tests/schema.py index b4ffa7f0..99a7c457 100644 --- a/tests/schema.py +++ b/tests/schema.py @@ -200,8 +200,8 @@ class Channel(dj.Part): -> master channel :tinyint unsigned # channel number within Ephys ---- - voltage : - current = null : # optional current to test null handling + voltage : + current = null : # optional current 
to test null handling """ def _make_tuples(self, key): @@ -228,7 +228,7 @@ class Image(dj.Manual): # table for testing blob inserts id : int # image identifier --- - img : # image + img : # image """ @@ -454,7 +454,7 @@ class Longblob(dj.Manual): definition = """ id: int --- - data: + data: """ diff --git a/tests/schema_adapted.py b/tests/schema_adapted.py deleted file mode 100644 index a2b3e492..00000000 --- a/tests/schema_adapted.py +++ /dev/null @@ -1,59 +0,0 @@ -import inspect - -import networkx as nx - -import datajoint as dj - - -@dj.register_type -class GraphType(dj.AttributeType): - """Custom type for storing NetworkX graphs as edge lists.""" - - type_name = "graph" - dtype = "" # Use djblob for proper serialization - - def encode(self, obj, *, key=None): - """Convert graph object into an edge list.""" - assert isinstance(obj, nx.Graph) - return list(obj.edges) - - def decode(self, stored, *, key=None): - """Convert edge list into a graph.""" - return nx.Graph(stored) - - -@dj.register_type -class LayoutToFilepathType(dj.AttributeType): - """Custom type that saves a graph layout as serialized JSON blob.""" - - type_name = "layout_to_filepath" - dtype = "" # Use djblob for serialization - - def encode(self, layout, *, key=None): - """Serialize layout dict.""" - return layout # djblob handles serialization - - def decode(self, stored, *, key=None): - """Deserialize layout dict.""" - return stored # djblob handles deserialization - - -class Connectivity(dj.Manual): - definition = """ - connid : int - --- - conn_graph = null : - """ - - -class Layout(dj.Manual): - definition = """ - # stores graph layout - -> Connectivity - --- - layout: - """ - - -LOCALS_ADAPTED = {k: v for k, v in locals().items() if inspect.isclass(v)} -__all__ = list(LOCALS_ADAPTED) diff --git a/tests/schema_alter.py b/tests/schema_alter.py index 6f18448e..ef8b35f0 100644 --- a/tests/schema_alter.py +++ b/tests/schema_alter.py @@ -20,7 +20,7 @@ class Experiment(dj.Imported): experiment_id :smallint # experiment number for this subject --- data_path : int # some number - extra=null : # just testing + extra=null : # just testing -> [nullable] User subject_notes=null :varchar(2048) # {notes} e.g. 
purpose of experiment entry_time=CURRENT_TIMESTAMP :timestamp # automatic timestamp diff --git a/tests/schema_codecs.py b/tests/schema_codecs.py new file mode 100644 index 00000000..6a8d478d --- /dev/null +++ b/tests/schema_codecs.py @@ -0,0 +1,63 @@ +import inspect + +import networkx as nx + +import datajoint as dj + + +class GraphCodec(dj.Codec): + """Custom codec for storing NetworkX graphs as edge lists.""" + + name = "graph" + + def get_dtype(self, is_external: bool) -> str: + """Chain to blob for serialization.""" + return "" + + def encode(self, obj, *, key=None, store_name=None): + """Convert graph object into an edge list.""" + assert isinstance(obj, nx.Graph) + return list(obj.edges) + + def decode(self, stored, *, key=None): + """Convert edge list into a graph.""" + return nx.Graph(stored) + + +class LayoutCodec(dj.Codec): + """Custom codec that saves a graph layout as serialized blob.""" + + name = "layout" + + def get_dtype(self, is_external: bool) -> str: + """Chain to blob for serialization.""" + return "" + + def encode(self, layout, *, key=None, store_name=None): + """Serialize layout dict.""" + return layout # blob handles serialization + + def decode(self, stored, *, key=None): + """Deserialize layout dict.""" + return stored # blob handles deserialization + + +class Connectivity(dj.Manual): + definition = """ + connid : int + --- + conn_graph = null : + """ + + +class Layout(dj.Manual): + definition = """ + # stores graph layout + -> Connectivity + --- + layout: + """ + + +LOCALS_CODECS = {k: v for k, v in locals().items() if inspect.isclass(v)} +__all__ = list(LOCALS_CODECS) diff --git a/tests/schema_external.py b/tests/schema_external.py index 5a2db1e8..ae1803f5 100644 --- a/tests/schema_external.py +++ b/tests/schema_external.py @@ -13,7 +13,7 @@ class Simple(dj.Manual): definition = """ simple : int --- - item : + item : """ @@ -21,7 +21,7 @@ class SimpleRemote(dj.Manual): definition = """ simple : int --- - item : + item : """ @@ -36,7 +36,7 @@ class Dimension(dj.Lookup): definition = """ dim : int --- - dimensions : + dimensions : """ contents = ([0, [100, 50]], [1, [3, 4, 8, 6]]) @@ -47,8 +47,8 @@ class Image(dj.Computed): -> Seed -> Dimension ---- - img : # objects are stored as specified by dj.config['stores']['share'] - neg : # objects are stored as specified by dj.config['stores']['local'] + img : # objects are stored as specified by dj.config['stores']['share'] + neg : # objects are stored as specified by dj.config['stores']['local'] """ def make(self, key): @@ -62,7 +62,7 @@ class Attach(dj.Manual): # table for storing attachments attach : int ---- - img : # attachments are stored as specified by: dj.config['stores']['share'] + img : # attachments are stored as specified by: dj.config['stores']['share'] txt : # attachments are stored directly in the database """ diff --git a/tests/schema_object.py b/tests/schema_object.py index 7caf7e16..ef1d957d 100644 --- a/tests/schema_object.py +++ b/tests/schema_object.py @@ -13,7 +13,7 @@ class ObjectFile(dj.Manual): definition = """ file_id : int --- - data_file : # stored file + data_file : # stored file """ @@ -23,7 +23,7 @@ class ObjectFolder(dj.Manual): definition = """ folder_id : int --- - data_folder : # stored folder + data_folder : # stored folder """ @@ -33,8 +33,8 @@ class ObjectMultiple(dj.Manual): definition = """ record_id : int --- - raw_data : # raw data file - processed : # processed data file + raw_data : # raw data file + processed : # processed data file """ @@ -46,6 +46,6 @@ class 
ObjectWithOther(dj.Manual): session_id : int --- name : varchar(100) - data_file : + data_file : notes : varchar(255) """ diff --git a/tests/schema_simple.py b/tests/schema_simple.py index 0d4ebd53..3ac71469 100644 --- a/tests/schema_simple.py +++ b/tests/schema_simple.py @@ -250,7 +250,7 @@ class TTestUpdate(dj.Lookup): --- string_attr : varchar(255) num_attr=null : float - blob_attr : + blob_attr : """ contents = [ diff --git a/tests/unit/test_attribute_type.py b/tests/unit/test_attribute_type.py deleted file mode 100644 index afc6674a..00000000 --- a/tests/unit/test_attribute_type.py +++ /dev/null @@ -1,415 +0,0 @@ -""" -Tests for the new AttributeType system. -""" - -import pytest - -import datajoint as dj -from datajoint.attribute_type import ( - AttributeType, - _type_registry, - get_type, - is_type_registered, - list_types, - register_type, - resolve_dtype, - unregister_type, -) -from datajoint.errors import DataJointError - - -class TestAttributeTypeRegistry: - """Tests for the type registry functionality.""" - - def setup_method(self): - """Clear any test types from registry before each test.""" - for name in list(_type_registry.keys()): - if name.startswith("test_"): - del _type_registry[name] - - def teardown_method(self): - """Clean up test types after each test.""" - for name in list(_type_registry.keys()): - if name.startswith("test_"): - del _type_registry[name] - - def test_register_type_decorator(self): - """Test registering a type using the decorator.""" - - @register_type - class TestType(AttributeType): - type_name = "test_decorator" - dtype = "longblob" - - def encode(self, value, *, key=None): - return value - - def decode(self, stored, *, key=None): - return stored - - assert is_type_registered("test_decorator") - assert get_type("test_decorator").type_name == "test_decorator" - - def test_register_type_direct(self): - """Test registering a type by calling register_type directly.""" - - class TestType(AttributeType): - type_name = "test_direct" - dtype = "varchar(255)" - - def encode(self, value, *, key=None): - return str(value) - - def decode(self, stored, *, key=None): - return stored - - register_type(TestType) - assert is_type_registered("test_direct") - - def test_register_type_idempotent(self): - """Test that registering the same type twice is idempotent.""" - - @register_type - class TestType(AttributeType): - type_name = "test_idempotent" - dtype = "int" - - def encode(self, value, *, key=None): - return value - - def decode(self, stored, *, key=None): - return stored - - # Second registration should not raise - register_type(TestType) - assert is_type_registered("test_idempotent") - - def test_register_duplicate_name_different_class(self): - """Test that registering different classes with same name raises error.""" - - @register_type - class TestType1(AttributeType): - type_name = "test_duplicate" - dtype = "int" - - def encode(self, value, *, key=None): - return value - - def decode(self, stored, *, key=None): - return stored - - class TestType2(AttributeType): - type_name = "test_duplicate" - dtype = "varchar(100)" - - def encode(self, value, *, key=None): - return str(value) - - def decode(self, stored, *, key=None): - return stored - - with pytest.raises(DataJointError, match="already registered"): - register_type(TestType2) - - def test_unregister_type(self): - """Test unregistering a type.""" - - @register_type - class TestType(AttributeType): - type_name = "test_unregister" - dtype = "int" - - def encode(self, value, *, key=None): - return value - - 
def decode(self, stored, *, key=None): - return stored - - assert is_type_registered("test_unregister") - unregister_type("test_unregister") - assert not is_type_registered("test_unregister") - - def test_get_type_not_found(self): - """Test that getting an unregistered type raises error.""" - with pytest.raises(DataJointError, match="Unknown attribute type"): - get_type("nonexistent_type") - - def test_list_types(self): - """Test listing registered types.""" - - @register_type - class TestType(AttributeType): - type_name = "test_list" - dtype = "int" - - def encode(self, value, *, key=None): - return value - - def decode(self, stored, *, key=None): - return stored - - types = list_types() - assert "test_list" in types - assert types == sorted(types) # Should be sorted - - def test_get_type_strips_brackets(self): - """Test that get_type accepts names with or without angle brackets.""" - - @register_type - class TestType(AttributeType): - type_name = "test_brackets" - dtype = "int" - - def encode(self, value, *, key=None): - return value - - def decode(self, stored, *, key=None): - return stored - - assert get_type("test_brackets") is get_type("") - - -class TestAttributeTypeValidation: - """Tests for the validate method.""" - - def setup_method(self): - for name in list(_type_registry.keys()): - if name.startswith("test_"): - del _type_registry[name] - - def teardown_method(self): - for name in list(_type_registry.keys()): - if name.startswith("test_"): - del _type_registry[name] - - def test_validate_called_default(self): - """Test that default validate accepts any value.""" - - @register_type - class TestType(AttributeType): - type_name = "test_validate_default" - dtype = "longblob" - - def encode(self, value, *, key=None): - return value - - def decode(self, stored, *, key=None): - return stored - - t = get_type("test_validate_default") - # Default validate should not raise for any value - t.validate(None) - t.validate(42) - t.validate("string") - t.validate([1, 2, 3]) - - def test_validate_custom(self): - """Test custom validation logic.""" - - @register_type - class PositiveIntType(AttributeType): - type_name = "test_positive_int" - dtype = "int" - - def encode(self, value, *, key=None): - return value - - def decode(self, stored, *, key=None): - return stored - - def validate(self, value): - if not isinstance(value, int): - raise TypeError(f"Expected int, got {type(value).__name__}") - if value < 0: - raise ValueError("Value must be positive") - - t = get_type("test_positive_int") - t.validate(42) # Should pass - - with pytest.raises(TypeError): - t.validate("not an int") - - with pytest.raises(ValueError): - t.validate(-1) - - -class TestTypeChaining: - """Tests for type chaining (dtype referencing another custom type).""" - - def setup_method(self): - for name in list(_type_registry.keys()): - if name.startswith("test_"): - del _type_registry[name] - - def teardown_method(self): - for name in list(_type_registry.keys()): - if name.startswith("test_"): - del _type_registry[name] - - def test_resolve_native_dtype(self): - """Test resolving a native dtype.""" - final_dtype, chain, store = resolve_dtype("longblob") - assert final_dtype == "longblob" - assert chain == [] - assert store is None - - def test_resolve_custom_dtype(self): - """Test resolving a custom dtype.""" - - @register_type - class TestType(AttributeType): - type_name = "test_resolve" - dtype = "varchar(100)" - - def encode(self, value, *, key=None): - return value - - def decode(self, stored, *, key=None): - return 
stored - - final_dtype, chain, store = resolve_dtype("") - assert final_dtype == "varchar(100)" - assert len(chain) == 1 - assert chain[0].type_name == "test_resolve" - assert store is None - - def test_resolve_chained_dtype(self): - """Test resolving a chained dtype.""" - - @register_type - class InnerType(AttributeType): - type_name = "test_inner" - dtype = "longblob" - - def encode(self, value, *, key=None): - return value - - def decode(self, stored, *, key=None): - return stored - - @register_type - class OuterType(AttributeType): - type_name = "test_outer" - dtype = "" - - def encode(self, value, *, key=None): - return value - - def decode(self, stored, *, key=None): - return stored - - final_dtype, chain, store = resolve_dtype("") - assert final_dtype == "longblob" - assert len(chain) == 2 - assert chain[0].type_name == "test_outer" - assert chain[1].type_name == "test_inner" - assert store is None - - def test_circular_reference_detection(self): - """Test that circular type references are detected.""" - - @register_type - class TypeA(AttributeType): - type_name = "test_circular_a" - dtype = "" - - def encode(self, value, *, key=None): - return value - - def decode(self, stored, *, key=None): - return stored - - @register_type - class TypeB(AttributeType): - type_name = "test_circular_b" - dtype = "" - - def encode(self, value, *, key=None): - return value - - def decode(self, stored, *, key=None): - return stored - - with pytest.raises(DataJointError, match="Circular type reference"): - resolve_dtype("") - - -class TestExportsAndAPI: - """Test that the public API is properly exported.""" - - def test_exports_from_datajoint(self): - """Test that AttributeType and helpers are exported from datajoint.""" - assert hasattr(dj, "AttributeType") - assert hasattr(dj, "register_type") - assert hasattr(dj, "list_types") - - -class TestDJBlobType: - """Tests for the built-in DJBlobType.""" - - def test_djblob_is_registered(self): - """Test that djblob is automatically registered.""" - assert is_type_registered("djblob") - - def test_djblob_properties(self): - """Test DJBlobType properties.""" - blob_type = get_type("djblob") - assert blob_type.type_name == "djblob" - assert blob_type.dtype == "longblob" - - def test_djblob_encode_decode_roundtrip(self): - """Test that encode/decode is a proper roundtrip.""" - import numpy as np - - blob_type = get_type("djblob") - - # Test with various data types - test_data = [ - {"key": "value", "number": 42}, - [1, 2, 3, 4, 5], - np.array([1.0, 2.0, 3.0]), - "simple string", - (1, 2, 3), - None, - ] - - for original in test_data: - encoded = blob_type.encode(original) - assert isinstance(encoded, bytes) - decoded = blob_type.decode(encoded) - if isinstance(original, np.ndarray): - np.testing.assert_array_equal(decoded, original) - else: - assert decoded == original - - def test_djblob_encode_produces_valid_blob_format(self): - """Test that encoded data has valid blob protocol header.""" - blob_type = get_type("djblob") - encoded = blob_type.encode({"test": "data"}) - - # Should start with compression prefix or protocol header - valid_prefixes = (b"ZL123\0", b"mYm\0", b"dj0\0") - assert any(encoded.startswith(p) for p in valid_prefixes) - - def test_djblob_in_list_types(self): - """Test that djblob appears in list_types.""" - types = list_types() - assert "djblob" in types - - def test_djblob_handles_serialization(self): - """Test that DJBlobType handles serialization internally. 
- - With the new design: - - Plain longblob columns store/return raw bytes (no serialization) - - handles pack/unpack in encode/decode - """ - blob_type = get_type("djblob") - - # DJBlobType.encode() should produce packed bytes - data = {"key": "value"} - encoded = blob_type.encode(data) - assert isinstance(encoded, bytes) - - # DJBlobType.decode() should unpack back to original - decoded = blob_type.decode(encoded) - assert decoded == data diff --git a/tests/unit/test_codecs.py b/tests/unit/test_codecs.py new file mode 100644 index 00000000..ada62674 --- /dev/null +++ b/tests/unit/test_codecs.py @@ -0,0 +1,429 @@ +""" +Tests for the Codec system. +""" + +import pytest + +import datajoint as dj +from datajoint.codecs import ( + Codec, + _codec_registry, + get_codec, + is_codec_registered, + list_codecs, + resolve_dtype, + unregister_codec, +) +from datajoint.errors import DataJointError + + +class TestCodecRegistry: + """Tests for the codec registry functionality.""" + + def setup_method(self): + """Clear any test codecs from registry before each test.""" + for name in list(_codec_registry.keys()): + if name.startswith("test_"): + del _codec_registry[name] + + def teardown_method(self): + """Clean up test codecs after each test.""" + for name in list(_codec_registry.keys()): + if name.startswith("test_"): + del _codec_registry[name] + + def test_register_codec_auto(self): + """Test auto-registration via __init_subclass__.""" + + class TestCodec(Codec): + name = "test_decorator" + + def get_dtype(self, is_external: bool) -> str: + return "bytes" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert is_codec_registered("test_decorator") + assert get_codec("test_decorator").name == "test_decorator" + + def test_register_codec_skip(self): + """Test skipping registration with register=False.""" + + class TestCodec(Codec, register=False): + name = "test_skip" + + def get_dtype(self, is_external: bool) -> str: + return "varchar(255)" + + def encode(self, value, *, key=None, store_name=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + assert not is_codec_registered("test_skip") + + def test_register_codec_idempotent(self): + """Test that defining the same codec class twice is idempotent.""" + + class TestCodec(Codec): + name = "test_idempotent" + + def get_dtype(self, is_external: bool) -> str: + return "int32" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + # Redefine the same name should not raise (same class) + assert is_codec_registered("test_idempotent") + + def test_register_duplicate_name_different_class(self): + """Test that registering different classes with same name raises error.""" + + class TestCodec1(Codec): + name = "test_duplicate" + + def get_dtype(self, is_external: bool) -> str: + return "int32" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + with pytest.raises(DataJointError, match="already registered"): + + class TestCodec2(Codec): + name = "test_duplicate" + + def get_dtype(self, is_external: bool) -> str: + return "varchar(100)" + + def encode(self, value, *, key=None, store_name=None): + return str(value) + + def decode(self, stored, *, key=None): + return stored + + def test_unregister_codec(self): + """Test unregistering a codec.""" + + class TestCodec(Codec): + name = 
"test_unregister" + + def get_dtype(self, is_external: bool) -> str: + return "int32" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert is_codec_registered("test_unregister") + unregister_codec("test_unregister") + assert not is_codec_registered("test_unregister") + + def test_get_codec_not_found(self): + """Test that getting an unregistered codec raises error.""" + with pytest.raises(DataJointError, match="Unknown codec"): + get_codec("nonexistent_codec") + + def test_list_codecs(self): + """Test listing registered codecs.""" + + class TestCodec(Codec): + name = "test_list" + + def get_dtype(self, is_external: bool) -> str: + return "int32" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + codecs = list_codecs() + assert "test_list" in codecs + assert codecs == sorted(codecs) # Should be sorted + + def test_get_codec_strips_brackets(self): + """Test that get_codec accepts names with or without angle brackets.""" + + class TestCodec(Codec): + name = "test_brackets" + + def get_dtype(self, is_external: bool) -> str: + return "int32" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + assert get_codec("test_brackets") is get_codec("") + + +class TestCodecValidation: + """Tests for the validate method.""" + + def setup_method(self): + for name in list(_codec_registry.keys()): + if name.startswith("test_"): + del _codec_registry[name] + + def teardown_method(self): + for name in list(_codec_registry.keys()): + if name.startswith("test_"): + del _codec_registry[name] + + def test_validate_called_default(self): + """Test that default validate accepts any value.""" + + class TestCodec(Codec): + name = "test_validate_default" + + def get_dtype(self, is_external: bool) -> str: + return "bytes" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + t = get_codec("test_validate_default") + # Default validate should not raise for any value + t.validate(None) + t.validate(42) + t.validate("string") + t.validate([1, 2, 3]) + + def test_validate_custom(self): + """Test custom validation logic.""" + + class PositiveIntCodec(Codec): + name = "test_positive_int" + + def get_dtype(self, is_external: bool) -> str: + return "int32" + + def encode(self, value, *, key=None, store_name=None): + return value + + def decode(self, stored, *, key=None): + return stored + + def validate(self, value): + if not isinstance(value, int): + raise TypeError(f"Expected int, got {type(value).__name__}") + if value < 0: + raise ValueError("Value must be positive") + + t = get_codec("test_positive_int") + t.validate(42) # Should pass + + with pytest.raises(TypeError): + t.validate("not an int") + + with pytest.raises(ValueError): + t.validate(-1) + + +class TestCodecChaining: + """Tests for codec chaining (dtype referencing another codec).""" + + def setup_method(self): + for name in list(_codec_registry.keys()): + if name.startswith("test_"): + del _codec_registry[name] + + def teardown_method(self): + for name in list(_codec_registry.keys()): + if name.startswith("test_"): + del _codec_registry[name] + + def test_resolve_native_dtype(self): + """Test resolving a native dtype.""" + final_dtype, chain, store = resolve_dtype("bytes") + assert final_dtype == "bytes" + assert chain == [] + 
assert store is None
+
+    def test_resolve_custom_dtype(self):
+        """Test resolving a custom dtype."""
+
+        class TestCodec(Codec):
+            name = "test_resolve"
+
+            def get_dtype(self, is_external: bool) -> str:
+                return "varchar(100)"
+
+            def encode(self, value, *, key=None, store_name=None):
+                return value
+
+            def decode(self, stored, *, key=None):
+                return stored
+
+        final_dtype, chain, store = resolve_dtype("<test_resolve>")
+        assert final_dtype == "varchar(100)"
+        assert len(chain) == 1
+        assert chain[0].name == "test_resolve"
+        assert store is None
+
+    def test_resolve_chained_dtype(self):
+        """Test resolving a chained dtype."""
+
+        class InnerCodec(Codec):
+            name = "test_inner"
+
+            def get_dtype(self, is_external: bool) -> str:
+                return "bytes"
+
+            def encode(self, value, *, key=None, store_name=None):
+                return value
+
+            def decode(self, stored, *, key=None):
+                return stored
+
+        class OuterCodec(Codec):
+            name = "test_outer"
+
+            def get_dtype(self, is_external: bool) -> str:
+                return "<test_inner>"
+
+            def encode(self, value, *, key=None, store_name=None):
+                return value
+
+            def decode(self, stored, *, key=None):
+                return stored
+
+        final_dtype, chain, store = resolve_dtype("<test_outer>")
+        assert final_dtype == "bytes"
+        assert len(chain) == 2
+        assert chain[0].name == "test_outer"
+        assert chain[1].name == "test_inner"
+        assert store is None
+
+    def test_circular_reference_detection(self):
+        """Test that circular codec references are detected."""
+
+        class CodecA(Codec):
+            name = "test_circular_a"
+
+            def get_dtype(self, is_external: bool) -> str:
+                return "<test_circular_b>"
+
+            def encode(self, value, *, key=None, store_name=None):
+                return value
+
+            def decode(self, stored, *, key=None):
+                return stored
+
+        class CodecB(Codec):
+            name = "test_circular_b"
+
+            def get_dtype(self, is_external: bool) -> str:
+                return "<test_circular_a>"
+
+            def encode(self, value, *, key=None, store_name=None):
+                return value
+
+            def decode(self, stored, *, key=None):
+                return stored
+
+        with pytest.raises(DataJointError, match="Circular codec reference"):
+            resolve_dtype("<test_circular_a>")
+
+
+class TestExportsAndAPI:
+    """Test that the public API is properly exported."""
+
+    def test_exports_from_datajoint(self):
+        """Test that Codec and helpers are exported from datajoint."""
+        assert hasattr(dj, "Codec")
+        assert hasattr(dj, "get_codec")
+        assert hasattr(dj, "list_codecs")
+
+
+class TestBlobCodec:
+    """Tests for the built-in BlobCodec."""
+
+    def test_blob_is_registered(self):
+        """Test that blob is automatically registered."""
+        assert is_codec_registered("blob")
+
+    def test_blob_properties(self):
+        """Test BlobCodec properties."""
+        blob_codec = get_codec("blob")
+        assert blob_codec.name == "blob"
+        assert blob_codec.get_dtype(is_external=False) == "bytes"
+        assert blob_codec.get_dtype(is_external=True) == ""
+
+    def test_blob_encode_decode_roundtrip(self):
+        """Test that encode/decode is a proper roundtrip."""
+        import numpy as np
+
+        blob_codec = get_codec("blob")
+
+        # Test with various data types
+        test_data = [
+            {"key": "value", "number": 42},
+            [1, 2, 3, 4, 5],
+            np.array([1.0, 2.0, 3.0]),
+            "simple string",
+            (1, 2, 3),
+            None,
+        ]
+
+        for original in test_data:
+            encoded = blob_codec.encode(original)
+            assert isinstance(encoded, bytes)
+            decoded = blob_codec.decode(encoded)
+            if isinstance(original, np.ndarray):
+                np.testing.assert_array_equal(decoded, original)
+            else:
+                assert decoded == original
+
+    def test_blob_encode_produces_valid_blob_format(self):
+        """Test that encoded data has valid blob protocol header."""
+        blob_codec = get_codec("blob")
+        encoded = blob_codec.encode({"test": "data"})
+
+        # Should start with compression prefix or protocol header
+        valid_prefixes = (b"ZL123\0", b"mYm\0", b"dj0\0")
+        assert any(encoded.startswith(p) for p in valid_prefixes)
+
+    def test_blob_in_list_codecs(self):
+        """Test that blob appears in list_codecs."""
+        codecs = list_codecs()
+        assert "blob" in codecs
+
+    def test_blob_handles_serialization(self):
+        """Test that BlobCodec handles serialization internally.
+
+        With the new design:
+        - Plain bytes columns store/return raw bytes (no serialization)
+        - <blob> handles pack/unpack in encode/decode
+        """
+        blob_codec = get_codec("blob")
+
+        # BlobCodec.encode() should produce packed bytes
+        data = {"key": "value"}
+        encoded = blob_codec.encode(data)
+        assert isinstance(encoded, bytes)
+
+        # BlobCodec.decode() should unpack back to original
+        decoded = blob_codec.decode(encoded)
+        assert decoded == data
diff --git a/tests/unit/test_settings.py b/tests/unit/test_settings.py
index d7122969..66d817f0 100644
--- a/tests/unit/test_settings.py
+++ b/tests/unit/test_settings.py
@@ -160,7 +160,9 @@ def test_attribute_access(self):
         # Host can be localhost or db (docker), just verify it's a string
         assert isinstance(dj.config.database.host, str)
         assert len(dj.config.database.host) > 0
-        assert dj.config.database.port == 3306
+        # Port may be 3306 (default) or a random port (testcontainers)
+        assert isinstance(dj.config.database.port, int)
+        assert 1 <= dj.config.database.port <= 65535
         # safemode may be modified by conftest fixtures
         assert isinstance(dj.config.safemode, bool)
@@ -169,7 +171,9 @@ def test_dict_style_access(self):
         # Host can be localhost or db (docker), just verify it's a string
         assert isinstance(dj.config["database.host"], str)
         assert len(dj.config["database.host"]) > 0
-        assert dj.config["database.port"] == 3306
+        # Port may be 3306 (default) or a random port (testcontainers)
+        assert isinstance(dj.config["database.port"], int)
+        assert 1 <= dj.config["database.port"] <= 65535
         # safemode may be modified by conftest fixtures
         assert isinstance(dj.config["safemode"], bool)
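
Usage note (not part of the patch above): the new codec tests imply the following pattern for defining and using a custom `dj.Codec`. This is a minimal sketch reconstructed from `tests/schema_codecs.py` and `tests/unit/test_codecs.py`; the schema name, the insert/fetch calls, and the `<graph>` spelling inside the table definition are illustrative assumptions, while the `Codec` base class, the `name` attribute, `get_dtype(is_external)`, `encode`/`decode`, and automatic registration on subclassing are taken directly from the diff.

```python
import networkx as nx

import datajoint as dj


class GraphCodec(dj.Codec):
    """Store NetworkX graphs as edge lists; registered automatically on subclassing."""

    name = "graph"  # referenced in table definitions as <graph> (assumed spelling)

    def get_dtype(self, is_external: bool) -> str:
        # Chain onto the built-in blob codec, which handles serialization of the edge list.
        return "<blob>"

    def encode(self, obj, *, key=None, store_name=None):
        # Reduce the graph to a plain edge list before blob serialization.
        assert isinstance(obj, nx.Graph)
        return list(obj.edges)

    def decode(self, stored, *, key=None):
        # Rebuild the graph from the stored edge list.
        return nx.Graph(stored)


schema = dj.schema("codec_demo")  # hypothetical schema name


@schema
class Connectivity(dj.Manual):
    definition = """
    connid : int
    ---
    conn_graph = null : <graph>
    """


# Round trip: encode runs on insert, decode on fetch.
Connectivity.insert1(dict(connid=0, conn_graph=nx.lollipop_graph(4, 2)))
graph = (Connectivity & dict(connid=0)).fetch1("conn_graph")  # returns an nx.Graph
```

Because `get_dtype` chains to the blob codec, the edge list itself is serialized by the built-in blob machinery rather than by `GraphCodec`, which only converts between `nx.Graph` objects and plain edge lists.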