diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/Dockerfile b/managed-connectivity/community-contributed-connectors/teradata-connector/Dockerfile
new file mode 100644
index 0000000..4ca50e7
--- /dev/null
+++ b/managed-connectivity/community-contributed-connectors/teradata-connector/Dockerfile
@@ -0,0 +1,65 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM debian:11-slim
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install every OS package in one layer: a cached `update` layer can otherwise
+# be paired with a later `install` and fail on stale package indexes. apt-get
+# is used because apt warns that its CLI is not stable for scripts.
+RUN apt-get update \
+    && apt-get install -y git procps tini wget \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV SPARK_EXTRA_JARS_DIR=/opt/spark/jars/
+RUN mkdir -p "${SPARK_EXTRA_JARS_DIR}"
+COPY terajdbc4.jar "${SPARK_EXTRA_JARS_DIR}"
+
+ENV RUNNING_IN_CONTAINER=Y
+
+ENV CONDA_HOME=/opt/miniconda3
+ENV PYSPARK_PYTHON=${CONDA_HOME}/bin/python
+ENV PATH=${CONDA_HOME}/bin:${PATH}
+
+# Download, install and configure Miniconda in a single layer so the installer
+# script can be deleted without remaining in an earlier image layer.
+RUN wget -q -O /tmp/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_24.9.2-0-Linux-x86_64.sh \
+    && bash /tmp/miniconda.sh -b -p "${CONDA_HOME}" \
+    && rm -f /tmp/miniconda.sh \
+    && ${CONDA_HOME}/bin/conda config --system --set always_yes True \
+    && ${CONDA_HOME}/bin/conda config --system --set auto_update_conda False \
+    && ${CONDA_HOME}/bin/conda config --system --prepend channels conda-forge \
+    && ${CONDA_HOME}/bin/conda config --system --set channel_priority strict
+
+RUN ${CONDA_HOME}/bin/conda install mamba -n base -c conda-forge \
+    && ${CONDA_HOME}/bin/mamba install \
+        conda \
+        google-cloud-dataproc \
+        google-cloud-logging \
+        google-cloud-monitoring \
+        google-cloud-storage
+
+COPY requirements.txt .
+RUN python -m pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH=/opt/python/packages
+RUN mkdir -p "${PYTHONPATH}/src/"
+COPY src/ "${PYTHONPATH}/src/"
+COPY main.py .
+
+# Run as the unprivileged `spark` user (UID/GID 1099) rather than root.
+RUN groupadd -g 1099 spark
+RUN useradd -u 1099 -g 1099 -d /home/spark -m spark
+USER spark
diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/README.md b/managed-connectivity/community-contributed-connectors/teradata-connector/README.md
new file mode 100644
index 0000000..9b6110e
--- /dev/null
+++ b/managed-connectivity/community-contributed-connectors/teradata-connector/README.md
@@ -0,0 +1,566 @@
+# Teradata Connector
+
+This custom connector extracts metadata from Teradata databases for import into [Dataplex Universal Catalog](https://cloud.google.com/dataplex/docs/introduction).
+
+Custom connectors are part of the [Managed Connectivity framework](https://cloud.google.com/dataplex/docs/managed-connectivity-overview) and are responsible for the export of metadata from external systems into correctly formatted import files. See [Develop Custom Connectors](https://cloud.google.com/dataplex/docs/develop-custom-connector) for more information.
+
+This is not an officially supported Google product and is provided on an as-is basis, without warranty. This project is not eligible for the [Google Open Source Software Vulnerability Rewards Program](https://bughunters.google.com/open-source-security).
+ +## Overview + +### Extracted metadata + +|Object|Metadata Extracted| +|------|------------------| +|Tables|Table name, column names, column data types, column NULL/NOT NULL, column default value, table and column comments| +|Views|View name, column names, column data types, column NULL/NOT NULL, column default value, view and column comments| + +### Dataplex entry hierarchy + +``` +teradata-instance (host) + └── teradata-database (host) + └── teradata-schema (database from DBC.DatabasesV) + ├── teradata-table (with dataplex-types.global.schema aspect for columns) + └── teradata-view (with dataplex-types.global.schema aspect for columns) +``` + +System databases (DBC, SysAdmin, SystemFe, TDStats, etc.) are automatically excluded. + +## Parameters + +|Parameter|Description|Default|Required/Optional| +|---------|-----------|-------|-----------------| +|target_project_id|Google Cloud Project ID for the generated metadata||REQUIRED| +|target_location_id|Google Cloud Region ID, or `global`||REQUIRED| +|target_entry_group_id|Entry Group ID for the imported entries||REQUIRED| +|host|Teradata server hostname||REQUIRED| +|port|Teradata DBS port number|1025|OPTIONAL| +|user|Teradata username||REQUIRED for TD2; OPTIONAL for LDAP/JWT| +|database|Scope extraction to a specific database. If omitted, all non-system databases are extracted||OPTIONAL| +|password_secret|Secret Manager ID for password. Format: `projects/PROJECT_ID/secrets/SECRET_NAME`||See [Password methods](#password-methods)| +|password_file|Path to a file containing the password||See [Password methods](#password-methods)| +|password|Password provided directly (least secure)||See [Password methods](#password-methods)| +|logmech|Teradata logon mechanism: TD2, LDAP, or JWT|TD2|OPTIONAL| +|logdata|Additional logon data for the selected logmech (e.g., LDAP credentials, JWT tokens)||OPTIONAL| +|logdata_secret|Secret Manager ID for logdata. Format: `projects/PROJECT_ID/secrets/SECRET_NAME`. 
Mutually exclusive with `--logdata`||OPTIONAL| +|query_band|Teradata session query band for tracking. See [Query band](#query-band)|See below|OPTIONAL| +|charset|JDBC session character set|UTF8|OPTIONAL| +|local_output_only|Write metadata file locally only, do not upload to Cloud Storage|False|OPTIONAL| +|output_bucket|GCS bucket for metadata output (no `gs://` prefix). Required unless `--local_output_only`||REQUIRED; OPTIONAL with `--local_output_only`| +|output_folder|Folder within the GCS bucket. Required unless `--local_output_only`||REQUIRED; OPTIONAL with `--local_output_only`| +|jar|Path to JDBC jar file|terajdbc4.jar|OPTIONAL| +|min_expected_entries|Minimum number of entries expected; if fewer are extracted, the file is not uploaded to GCS|-1|OPTIONAL| + +The `TERADATA_PASSWORD` environment variable can also be used to provide the password. See [Password methods](#password-methods) for the full priority order. + +Note: **target_project_id**, **target_location_id** and **target_entry_group_id** are string values in the generated metadata file and define the import scope. They do not need to match the project where the connector runs. See [components of a metadata job](https://cloud.google.com/dataplex/docs/import-metadata#components) for details. + +### Password methods + +The connector resolves the password using the first available method in this priority order: + +|Priority|Method|Description| +|--------|------|-----------| +|1|`--password_secret`|Google Secret Manager (recommended for production)| +|2|`--password_file`|Path to a local file containing the password| +|3|`TERADATA_PASSWORD`|Environment variable| +|4|`--password`|CLI argument (a security warning is printed to stderr)| + +If multiple methods are provided, the highest-priority method is used. For **TD2** (default), at least one password method is required. For **LDAP** and **JWT**, password is optional and defaults to empty if not provided. Empty or whitespace-only values are rejected for all methods. 
+ +### Authentication methods + +The connector supports multiple Teradata logon mechanisms via the `--logmech` parameter: + +|Method|`--user`|`--password`|`--logdata`|Notes| +|------|:------:|:------:|:---------:|-----| +|**TD2** (default)|Required|Required|Optional|Traditional username/password| +|**LDAP**|Optional|Optional|Optional|Enterprise directory authentication| +|**JWT**|Optional|Optional|Optional (for token)|Token-based authentication| + +When using LDAP or JWT, the `--logdata` parameter (or `--logdata_secret` for secure retrieval from Secret Manager) can pass additional authentication data such as LDAP credentials or JWT tokens. + +### Query band + +The `--query_band` parameter sets a Teradata session query band for tracking and telemetry. If not provided, the following default is applied: + +``` +org=teradata-internal-telem;appname=teradata-dataplex-connector; +``` + +Custom query bands must use `key=value;` format. The `org` and `appname` keys are always enforced -- if omitted, defaults are added; if a custom `appname` is provided, the default is appended (e.g., `appname=myapp_teradata-dataplex-connector;`). + +Restrictions: +* Only alphanumeric characters, hyphens, underscores, dots, equals, semicolons, commas, and spaces are allowed +* Reserved names (`proxyuser`, `proxyrole`) are rejected +* Maximum length: 2048 characters + +## Getting started + +### Prerequisites + +* **Python 3.x** + ```bash + sudo apt update + sudo apt install python3 python3-dev python3-venv python3-pip + ``` +* **Python virtual environment** + ```bash + python3 -m venv env + source env/bin/activate + ``` + Run `source env/bin/activate` each time before using the connector. + +* **Java Runtime Environment (JRE)** + ```bash + sudo apt install default-jre + ``` +* **PySpark** + ```bash + pip3 install pyspark + ``` + +#### Windows prerequisites + +PySpark on Windows requires Hadoop's `winutils.exe`. 
Download or build `winutils.exe` for your Hadoop version, place it at `C:\hadoop\bin\winutils.exe`, and set the environment variables: + +```shell +set HADOOP_HOME=C:\hadoop +set PYSPARK_PYTHON=C:\Path\To\python.exe +set PYSPARK_DRIVER_PYTHON=C:\Path\To\python.exe +``` + +### Install + +1. Clone the repository: + ```bash + git clone https://github.com/GoogleCloudPlatform/cloud-dataplex.git + cd cloud-dataplex/managed-connectivity/community-contributed-connectors/teradata-connector + ``` + +2. Install Python dependencies: + ```bash + pip3 install -r requirements.txt + ``` + +3. Download the Teradata JDBC driver **terajdbc4.jar** from [Teradata Downloads](https://downloads.teradata.com/) and place it in the connector directory. + + Note: The Teradata JDBC driver is not available on Maven Central and must be downloaded manually. Use `--jar` to specify a different version or path. + +### Create a database user + +Best practice is to create a dedicated database user with the minimum privileges required: +* SELECT on DBC.DatabasesV +* SELECT on DBC.TablesV +* SELECT on DBC.ColumnsV + +Note: When using LDAP or JWT authentication (`--logmech LDAP` or `--logmech JWT`), a dedicated database user may not be required. Authentication is handled by the external identity provider, though the authenticated identity still requires the privileges listed above. 
+ +### Store the password in Secret Manager + +Create a secret: + +```bash +echo -n "YOUR_PASSWORD" | gcloud secrets create teradata-password \ + --project=PROJECT_ID --data-file=- +``` + +To update an existing secret with a new version: + +```bash +echo -n "YOUR_NEW_PASSWORD" | gcloud secrets versions add teradata-password \ + --project=PROJECT_ID --data-file=- +``` + +### GCP authentication and authorization + +Before running the connector, ensure your session is authenticated as a Google Cloud identity with the required IAM roles: + +* `roles/storage.objectUser` -- required when using `--output_bucket` +* `roles/secretmanager.secretAccessor` -- required when using `--password_secret` or `--logdata_secret` + +```bash +gcloud auth application-default login +``` + +Note: If you are not running in a Google Cloud managed environment, first install the [Google Cloud CLI](https://cloud.google.com/sdk/docs/install-sdk). + +## Run the connector + +Run from the connector root directory, substituting placeholder values for your environment. + +### Basic usage (TD2 with Secret Manager) + +```shell +python3 main.py \ + --target_project_id PROJECT_ID \ + --target_location_id us-central1 \ + --target_entry_group_id teradata \ + --host TERADATA_HOST \ + --user USERNAME \ + --password_secret projects/PROJECT_ID/secrets/teradata-password \ + --output_bucket OUTPUT_BUCKET \ + --output_folder teradata_metadata +``` + +To scope extraction to a single database, add `--database DATABASE_NAME`. +For local output only (no GCS upload), replace the output options with `--local_output_only`. 
+ +### LDAP authentication + +```shell +python3 main.py \ + --target_project_id PROJECT_ID \ + --target_location_id us-central1 \ + --target_entry_group_id teradata \ + --host TERADATA_HOST \ + --logmech LDAP \ + --logdata_secret projects/PROJECT_ID/secrets/ldap-credentials \ + --local_output_only +``` + +If you prefer to pass logdata directly (not recommended for production), use `--logdata` instead of `--logdata_secret`. + +### JWT authentication + +```shell +python3 main.py \ + --target_project_id PROJECT_ID \ + --target_location_id us-central1 \ + --target_entry_group_id teradata \ + --host TERADATA_HOST \ + --logmech JWT \ + --logdata_secret projects/PROJECT_ID/secrets/jwt-token \ + --local_output_only +``` + +### Alternative password methods + +```shell +# Using a password file +python3 main.py \ + --target_project_id PROJECT_ID \ + --target_location_id us-central1 \ + --target_entry_group_id teradata \ + --host TERADATA_HOST \ + --user USERNAME \ + --password_file /path/to/password.txt \ + --local_output_only +``` + +```shell +# Using TERADATA_PASSWORD environment variable +export TERADATA_PASSWORD="YOUR_PASSWORD" +python3 main.py \ + --target_project_id PROJECT_ID \ + --target_location_id us-central1 \ + --target_entry_group_id teradata \ + --host TERADATA_HOST \ + --user USERNAME \ + --local_output_only +``` + +### Custom query band + +Add `--query_band` to any command to set a session query band: + +```shell +python3 main.py \ + --target_project_id PROJECT_ID \ + --target_location_id us-central1 \ + --target_entry_group_id teradata \ + --host TERADATA_HOST \ + --user USERNAME \ + --password_secret projects/PROJECT_ID/secrets/teradata-password \ + --query_band "org=myorg;appname=myapp;env=prod;" \ + --output_bucket OUTPUT_BUCKET \ + --output_folder teradata_metadata +``` + +## Import metadata into Dataplex Universal Catalog + +### Connector output + +The connector generates a JSONL metadata import file as described [in the 
documentation](https://cloud.google.com/dataplex/docs/import-metadata#metadata-import-file). The file is always written to the local `output/` directory. If `--output_bucket` and `--output_folder` are provided (and `--local_output_only` is not set), the file is also uploaded to Cloud Storage automatically. + +A sample output file is available in the [sample/](sample/) directory. + +### Validate the output + +Use the validation script to verify the generated JSONL file is compatible with the Dataplex import API before uploading: + +```bash +python3 tests/validate_output.py output/teradata-TERADATA_HOST.jsonl +``` + +The script checks each entry for: +* Required top-level keys (`entry`, `aspectKeys`, `updateMask`) +* Correct camelCase field naming (no snake_case) +* Valid `entrySource` with `system: "teradata"` +* Aspect structure and key consistency +* Valid schema fields (mode, dataType, metadataType) +* Parent-child hierarchy integrity + +It prints a summary of entry counts by type (instances, databases, schemas, tables, views) and total column count. Exits with code 0 on success or 1 if errors are found. + +### Upload to Cloud Storage + +If you used `--local_output_only`, upload the file manually before importing: + +```bash +gsutil cp output/teradata-TERADATA_HOST.jsonl gs://OUTPUT_BUCKET/import/ +``` + +### Create Dataplex catalog resources + +Before importing, the Entry Group, Entry Types, and Aspect Types must exist in the target project. Resources must be created in this dependency order: + +``` +template.json --> Aspect Types --> Entry Types + \ + Entry Group --> Metadata Import +``` + +Aspect Types must exist before Entry Types (because Entry Types reference them). The Entry Group must exist before import (because entries are stored in it). 
+ +This connector requires: + +|Catalog Object|IDs| +|---------|---| +|Entry Group|Defined by `--target_entry_group_id`| +|Entry Types|teradata-instance, teradata-database, teradata-schema, teradata-table, teradata-view| +|Aspect Types|teradata-instance, teradata-database, teradata-schema, teradata-table, teradata-view| + +#### Using the setup script (Linux/macOS) + +```bash +PROJECT_ID=my-project-id LOCATION=us-central1 bash scripts/setup_dataplex_resources.sh +``` + +#### Manual setup + +**1. Create a metadata template file** (`template.json`): + +```json +{"name":"marker","type":"record","recordFields":[{"name":"description","type":"string","index":1,"constraints":{"required":false}}]} +``` + +**2. Create the Entry Group:** + +```bash +gcloud dataplex entry-groups create teradata \ + --project=PROJECT_ID \ + --location=us-central1 \ + --description="Entry group for Teradata metadata" +``` + +**3. Create Aspect Types** (one per entry level): + +```bash +gcloud dataplex aspect-types create teradata-instance --project=PROJECT_ID --location=us-central1 --display-name="Teradata Instance" --metadata-template-file-name=template.json +gcloud dataplex aspect-types create teradata-database --project=PROJECT_ID --location=us-central1 --display-name="Teradata Database" --metadata-template-file-name=template.json +gcloud dataplex aspect-types create teradata-schema --project=PROJECT_ID --location=us-central1 --display-name="Teradata Schema" --metadata-template-file-name=template.json +gcloud dataplex aspect-types create teradata-table --project=PROJECT_ID --location=us-central1 --display-name="Teradata Table" --metadata-template-file-name=template.json +gcloud dataplex aspect-types create teradata-view --project=PROJECT_ID --location=us-central1 --display-name="Teradata View" --metadata-template-file-name=template.json +``` + +**4. 
Create Entry Types** (instance, database, schema -- single required aspect each): + +```bash +gcloud dataplex entry-types create teradata-instance --project=PROJECT_ID --location=us-central1 --display-name="Teradata Instance" --required-aspects=type=projects/PROJECT_ID/locations/us-central1/aspectTypes/teradata-instance +gcloud dataplex entry-types create teradata-database --project=PROJECT_ID --location=us-central1 --display-name="Teradata Database" --required-aspects=type=projects/PROJECT_ID/locations/us-central1/aspectTypes/teradata-database +gcloud dataplex entry-types create teradata-schema --project=PROJECT_ID --location=us-central1 --display-name="Teradata Schema" --required-aspects=type=projects/PROJECT_ID/locations/us-central1/aspectTypes/teradata-schema +``` + +**5. Create Entry Types** (table, view -- require both custom aspect and global schema): + +```bash +gcloud dataplex entry-types create teradata-table --project=PROJECT_ID --location=us-central1 --display-name="Teradata Table" --required-aspects=type=projects/PROJECT_ID/locations/us-central1/aspectTypes/teradata-table --required-aspects=type=projects/dataplex-types/locations/global/aspectTypes/schema +gcloud dataplex entry-types create teradata-view --project=PROJECT_ID --location=us-central1 --display-name="Teradata View" --required-aspects=type=projects/PROJECT_ID/locations/us-central1/aspectTypes/teradata-view --required-aspects=type=projects/dataplex-types/locations/global/aspectTypes/schema +``` + +### Run the metadata import + +After uploading the JSONL file to GCS and creating the catalog resources: + +```bash +gcloud dataplex metadata-jobs create \ + --project=PROJECT_ID \ + --location=us-central1 \ + --type=IMPORT \ + --import-source-storage-uri=gs://OUTPUT_BUCKET/import/ \ + --import-entry-sync-mode=FULL \ + --import-aspect-sync-mode=INCREMENTAL \ + --import-entry-groups=projects/PROJECT_ID/locations/us-central1/entryGroups/teradata \ + 
--import-entry-types=projects/PROJECT_ID/locations/us-central1/entryTypes/teradata-instance,projects/PROJECT_ID/locations/us-central1/entryTypes/teradata-database,projects/PROJECT_ID/locations/us-central1/entryTypes/teradata-schema,projects/PROJECT_ID/locations/us-central1/entryTypes/teradata-table,projects/PROJECT_ID/locations/us-central1/entryTypes/teradata-view \ + --import-aspect-types=projects/PROJECT_ID/locations/us-central1/aspectTypes/teradata-instance,projects/PROJECT_ID/locations/us-central1/aspectTypes/teradata-database,projects/PROJECT_ID/locations/us-central1/aspectTypes/teradata-schema,projects/PROJECT_ID/locations/us-central1/aspectTypes/teradata-table,projects/PROJECT_ID/locations/us-central1/aspectTypes/teradata-view,projects/dataplex-types/locations/global/aspectTypes/schema +``` + +A sample metadata import request is available at [sample/metadata_import_request.json](sample/metadata_import_request.json). + +### Monitor import jobs + +List all metadata jobs: + +```bash +gcloud dataplex metadata-jobs list --project=PROJECT_ID --location=us-central1 +``` + +Check the status of a specific job: + +```bash +gcloud dataplex metadata-jobs describe JOB_ID --project=PROJECT_ID --location=us-central1 +``` + +View warning and error logs: + +```bash +gcloud logging read \ + "resource.type=dataplex.googleapis.com/MetadataJob \ + AND resource.labels.metadata_job_id=JOB_ID \ + AND severity>=WARNING" \ + --project=PROJECT_ID \ + --format="value(jsonPayload.message)" +``` + +See [manage entries and create custom sources](https://cloud.google.com/dataplex/docs/ingest-custom-sources) for more information. + +## Dataproc Serverless + +Follow these instructions to build a Docker container and run the connector with [Dataproc Serverless](https://cloud.google.com/dataproc-serverless/docs). + +### Build the container (one-time) + +1. Ensure [Docker](https://docs.docker.com/engine/install/) is installed. + +2. 
Create an Artifact Registry repository (if one does not already exist): + ```bash + gcloud artifacts repositories create docker-repo \ + --repository-format=docker \ + --location=us-central1 \ + --project=PROJECT_ID + ``` + +3. Configure Docker to authenticate with Artifact Registry: + ```bash + gcloud auth configure-docker us-central1-docker.pkg.dev + ``` + +4. Edit [build_and_push_docker.sh](build_and_push_docker.sh) and set `PROJECT_ID` and `REGION`. Make sure `terajdbc4.jar` is present in the connector directory (see [Install](#install)), because the Dockerfile copies it into the image and the build fails without it. + +5. Build and push: + ```bash + chmod +x build_and_push_docker.sh + ./build_and_push_docker.sh + ``` + This builds a container image called **catalog-teradata-pyspark** and pushes it to Artifact Registry (~5 minutes). + +### Set up IAM roles + +Grant the required IAM roles to the service account that will run the Dataproc job. If `--service-account` is not provided, the default Compute Engine service account is used. + +```bash +SA="my-sa@my-project-id.iam.gserviceaccount.com" +PROJECT_ID="my-project-id" + +gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member="serviceAccount:${SA}" --role="roles/secretmanager.secretAccessor" +gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member="serviceAccount:${SA}" --role="roles/storage.objectUser" +gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member="serviceAccount:${SA}" --role="roles/dataproc.worker" +gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member="serviceAccount:${SA}" --role="roles/dataplex.entryOwner" +gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member="serviceAccount:${SA}" --role="roles/dataplex.catalogEditor" +``` + +You can also use this [script](../common_scripts/grant_SA_dataproc_roles.sh) to grant the required roles. + +### Submit a job + +1. Create or choose a Cloud Storage bucket for Dataproc (used as `--deps-bucket`). + +2. 
Identify the subnet for Dataproc Serverless: + ```bash + gcloud compute networks subnets list \ + --project=PROJECT_ID \ + --regions=us-central1 \ + --format="table(name,network.basename())" + ``` + +3. Submit the job: + ```shell + gcloud dataproc batches submit pyspark \ + --project=PROJECT_ID \ + --region=us-central1 \ + --batch=teradata-metadata-0001 \ + --deps-bucket=DEPS_BUCKET \ + --container-image=us-central1-docker.pkg.dev/PROJECT_ID/docker-repo/catalog-teradata-pyspark:latest \ + --service-account=SERVICE_ACCOUNT_EMAIL \ + --jars=terajdbc4.jar \ + --subnet=SUBNET_NAME \ + main.py \ + -- --target_project_id PROJECT_ID \ + --target_location_id us-central1 \ + --target_entry_group_id teradata \ + --host TERADATA_HOST \ + --port 1025 \ + --user USERNAME \ + --password_secret projects/PROJECT_ID/secrets/teradata-password \ + --output_bucket OUTPUT_BUCKET \ + --output_folder import + ``` + + Notes: + * Use `--network=default` instead of `--subnet` if your project uses the default network. + * To use a different JDBC jar version, store it in GCS: `--jars=gs://BUCKET/path/to/terajdbc4.jar` + +4. Monitor the job: + ```bash + gcloud dataproc batches describe BATCH_ID \ + --project=PROJECT_ID --region=us-central1 + ``` + +See the [documentation](https://cloud.google.com/sdk/gcloud/reference/dataproc/batches/submit/pyspark) for more information about Dataproc Serverless. + +## Cloud Workflows (automated pipeline) + +An end-to-end metadata extraction and import pipeline with monitoring can be created using [Workflows](https://cloud.google.com/workflows) and scheduled to run on a regular basis. + +A Teradata-specific workflow template is included at [teradata-connector-workflow.yaml](teradata-connector-workflow.yaml). 
+ +### Deploy and execute + +```bash +# Deploy the workflow +gcloud workflows deploy teradata-metadata-import \ + --project=PROJECT_ID \ + --location=us-central1 \ + --source=teradata-connector-workflow.yaml + +# Execute the workflow +gcloud workflows execute teradata-metadata-import \ + --project=PROJECT_ID \ + --location=us-central1 \ + --data='{ + "PROJECT_ID": "my-project-id", + "CLOUD_REGION": "us-central1", + "TERADATA_HOST": "teradata.example.com", + "TERADATA_PORT": "1025", + "TERADATA_USER": "dataplexagent", + "PASSWORD_SECRET": "projects/my-project-id/secrets/teradata-password", + "OUTPUT_BUCKET": "my-project-dataplex-teradata", + "SERVICE_ACCOUNT": "my-sa@my-project-id.iam.gserviceaccount.com", + "CONTAINER_IMAGE": "us-central1-docker.pkg.dev/my-project-id/docker-repo/catalog-teradata-pyspark:latest", + "DEPS_BUCKET": "my-project-dataplex-teradata" + }' +``` + +You can also use the generic [byo-connector.yaml](https://github.com/GoogleCloudPlatform/cloud-dataplex/blob/main/managed-connectivity/cloud-workflows/byo-connector/templates/byo-connector.yaml) template. Follow the documentation: [Import metadata from a custom source using Workflows](https://cloud.google.com/dataplex/docs/import-using-workflows-custom-source). + +## Known limitations + +* **Non-ASCII column names** -- Non-ASCII characters in column names (e.g., Chinese, Japanese, accented characters) are encoded to `_u_` format for Dataplex compatibility. +* **Special characters in column names** -- Column names containing ASCII special characters (e.g., `!@#$%^&*{}|,?:;~`) are passed through as-is. The Dataplex import API may reject entries with these characters in schema field paths, resulting in `INVALID_UPDATE_ENTRY_REQUEST` errors. The connector does not alter these names to preserve the original metadata. Tables with affected columns will be partially imported (the entry is created but the schema aspect is rejected). 
diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/build_and_push_docker.sh b/managed-connectivity/community-contributed-connectors/teradata-connector/build_and_push_docker.sh
new file mode 100644
index 0000000..62dadc4
--- /dev/null
+++ b/managed-connectivity/community-contributed-connectors/teradata-connector/build_and_push_docker.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+set -e
+# Copyright 2026 Teradata
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Edit PROJECT_ID and REGION to match your environment
+PROJECT_ID=PROJECT_ID
+REGION=us-central1
+
+# Fail fast if the placeholder above was never edited, rather than building
+# the image and only then failing at the gcloud/push steps.
+if [ "${PROJECT_ID}" = "PROJECT_ID" ]; then
+  echo "ERROR: set PROJECT_ID (and REGION) in $0 before running." >&2
+  exit 1
+fi
+
+IMAGE_NAME="catalog-teradata-pyspark"
+IMAGE_VERSION="0.0.1"
+IMAGE="${IMAGE_NAME}:${IMAGE_VERSION}"
+# REPO_IMAGE has no tag, so `docker tag` applies the default `latest` tag.
+REPO_IMAGE="${REGION}-docker.pkg.dev/${PROJECT_ID}/docker-repo/${IMAGE_NAME}"
+
+docker build -t "${IMAGE}" .
+
+# Tag and push to Artifact Registry
+gcloud config set project "${PROJECT_ID}"
+gcloud auth configure-docker "${REGION}-docker.pkg.dev"
+docker tag "${IMAGE}" "${REPO_IMAGE}"
+docker push "${REPO_IMAGE}"
diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/main.py b/managed-connectivity/community-contributed-connectors/teradata-connector/main.py
new file mode 100644
index 0000000..7511be8
--- /dev/null
+++ b/managed-connectivity/community-contributed-connectors/teradata-connector/main.py
@@ -0,0 +1,18 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from src.common.bootstrap import run  # connector entry point, defined in src.common.bootstrap
+
+if __name__ == '__main__':
+    run()  # start the connector only when executed as a script, not on import
diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/requirements.txt b/managed-connectivity/community-contributed-connectors/teradata-connector/requirements.txt
new file mode 100644
index 0000000..6a8d732
--- /dev/null
+++ b/managed-connectivity/community-contributed-connectors/teradata-connector/requirements.txt
@@ -0,0 +1,5 @@
+google-cloud-dataplex
+google-cloud-storage
+google-cloud-secret-manager
+google-cloud-logging
+teradatasql
diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/sample/metadata_import_request.json b/managed-connectivity/community-contributed-connectors/teradata-connector/sample/metadata_import_request.json
new file mode 100644
index 0000000..4c43f10
--- /dev/null
+++ b/managed-connectivity/community-contributed-connectors/teradata-connector/sample/metadata_import_request.json
@@ -0,0 +1,29 @@
+{
+  "type": "IMPORT",
+  "import_spec": {
+    "source_storage_uri": "gs://your-metadata-file-gcs-bucket/teradata/",
+    "scope": {
+      "entry_groups": [
+        "projects/your-gcp-project/locations/us-central1/entryGroups/teradata"
+      ],
+      "entry_types": [
+        "projects/your-gcp-project/locations/us-central1/entryTypes/teradata-instance",
+        "projects/your-gcp-project/locations/us-central1/entryTypes/teradata-database",
+        "projects/your-gcp-project/locations/us-central1/entryTypes/teradata-schema",
+        "projects/your-gcp-project/locations/us-central1/entryTypes/teradata-table",
+        "projects/your-gcp-project/locations/us-central1/entryTypes/teradata-view"
+      ],
+      "aspect_types": [
+        "projects/your-gcp-project/locations/us-central1/aspectTypes/teradata-instance",
+        "projects/your-gcp-project/locations/us-central1/aspectTypes/teradata-database",
+        "projects/your-gcp-project/locations/us-central1/aspectTypes/teradata-schema",
+        "projects/your-gcp-project/locations/us-central1/aspectTypes/teradata-table",
+        
"projects/your-gcp-project/locations/us-central1/aspectTypes/teradata-view", + "projects/dataplex-types/locations/global/aspectTypes/schema" + ] + }, + "entry_sync_mode": "FULL", + "aspect_sync_mode": "INCREMENTAL", + "log_level": "DEBUG" + } +} \ No newline at end of file diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/sample/teradata-sample-output.jsonl b/managed-connectivity/community-contributed-connectors/teradata-connector/sample/teradata-sample-output.jsonl new file mode 100644 index 0000000..6df6432 --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/sample/teradata-sample-output.jsonl @@ -0,0 +1,5 @@ +{"entry": {"name": "projects/my-project/locations/us-central1/entryGroups/teradata/entries/host.clearscape.teradata.com", "entryType": "projects/my-project/locations/us-central1/entryTypes/teradata-instance", "aspects": {"my-project.us-central1.teradata-instance": {"aspectType": "my-project.us-central1.teradata-instance", "data": {}}}, "fullyQualifiedName": "custom:`host-clearscape-teradata-com`", "parentEntry": "", "entrySource": {"displayName": "host.clearscape.teradata.com", "system": "teradata"}}, "aspectKeys": ["my-project.us-central1.teradata-instance"], "updateMask": ["aspects"]} +{"entry": {"name": "projects/my-project/locations/us-central1/entryGroups/teradata/entries/host.clearscape.teradata.com/databases/host.clearscape.teradata.com", "entryType": "projects/my-project/locations/us-central1/entryTypes/teradata-database", "aspects": {"my-project.us-central1.teradata-database": {"aspectType": "my-project.us-central1.teradata-database", "data": {}}}, "fullyQualifiedName": "custom:`host-clearscape-teradata-com`.host-clearscape-teradata-com", "parentEntry": "projects/my-project/locations/us-central1/entryGroups/teradata/entries/host.clearscape.teradata.com", "entrySource": {"displayName": "host.clearscape.teradata.com", "system": "teradata"}}, "aspectKeys": 
["my-project.us-central1.teradata-database"], "updateMask": ["aspects"]} +{"entry": {"name": "projects/my-project/locations/us-central1/entryGroups/teradata/entries/host.clearscape.teradata.com/databases/host.clearscape.teradata.com/database_schemas/retail", "entryType": "projects/my-project/locations/us-central1/entryTypes/teradata-schema", "aspects": {"my-project.us-central1.teradata-schema": {"aspectType": "my-project.us-central1.teradata-schema", "data": {}}}, "fullyQualifiedName": "custom:`host-clearscape-teradata-com`.host-clearscape-teradata-com.retail", "parentEntry": "projects/my-project/locations/us-central1/entryGroups/teradata/entries/host.clearscape.teradata.com/databases/host.clearscape.teradata.com", "entrySource": {"displayName": "retail", "system": "teradata"}}, "aspectKeys": ["my-project.us-central1.teradata-schema"], "updateMask": ["aspects"]} +{"entry": {"name": "projects/my-project/locations/us-central1/entryGroups/teradata/entries/host.clearscape.teradata.com/databases/host.clearscape.teradata.com/database_schemas/retail/tables/customers", "entryType": "projects/my-project/locations/us-central1/entryTypes/teradata-table", "aspects": {"my-project.us-central1.teradata-table": {"aspectType": "my-project.us-central1.teradata-table", "data": {}}, "dataplex-types.global.schema": {"aspectType": "dataplex-types.global.schema", "data": {"fields": [{"name": "customer_id", "dataType": "NUMBER", "metadataType": "NUMBER", "mode": "REQUIRED"}, {"name": "first_name", "dataType": "STRING", "metadataType": "STRING", "mode": "NULLABLE"}, {"name": "last_name", "dataType": "STRING", "metadataType": "STRING", "mode": "NULLABLE"}, {"name": "email", "dataType": "STRING", "metadataType": "STRING", "mode": "NULLABLE"}, {"name": "created_at", "dataType": "TIMESTAMP", "metadataType": "TIMESTAMP", "mode": "NULLABLE"}]}}}, "fullyQualifiedName": "custom:`host-clearscape-teradata-com`.host-clearscape-teradata-com.retail.customers", "parentEntry": 
"projects/my-project/locations/us-central1/entryGroups/teradata/entries/host.clearscape.teradata.com/databases/host.clearscape.teradata.com/database_schemas/retail", "entrySource": {"displayName": "customers", "system": "teradata"}}, "aspectKeys": ["dataplex-types.global.schema", "my-project.us-central1.teradata-table"], "updateMask": ["aspects"]} +{"entry": {"name": "projects/my-project/locations/us-central1/entryGroups/teradata/entries/host.clearscape.teradata.com/databases/host.clearscape.teradata.com/database_schemas/retail/views/active_customers", "entryType": "projects/my-project/locations/us-central1/entryTypes/teradata-view", "aspects": {"my-project.us-central1.teradata-view": {"aspectType": "my-project.us-central1.teradata-view", "data": {}}, "dataplex-types.global.schema": {"aspectType": "dataplex-types.global.schema", "data": {"fields": [{"name": "customer_id", "dataType": "NUMBER", "metadataType": "NUMBER", "mode": "REQUIRED"}, {"name": "first_name", "dataType": "STRING", "metadataType": "STRING", "mode": "NULLABLE"}, {"name": "last_name", "dataType": "STRING", "metadataType": "STRING", "mode": "NULLABLE"}, {"name": "email", "dataType": "STRING", "metadataType": "STRING", "mode": "NULLABLE"}]}}}, "fullyQualifiedName": "custom:`host-clearscape-teradata-com`.host-clearscape-teradata-com.retail.active_customers", "parentEntry": "projects/my-project/locations/us-central1/entryGroups/teradata/entries/host.clearscape.teradata.com/databases/host.clearscape.teradata.com/database_schemas/retail", "entrySource": {"displayName": "active_customers", "system": "teradata"}}, "aspectKeys": ["dataplex-types.global.schema", "my-project.us-central1.teradata-view"], "updateMask": ["aspects"]} diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/scripts/setup_dataplex_resources.sh b/managed-connectivity/community-contributed-connectors/teradata-connector/scripts/setup_dataplex_resources.sh new file mode 100644 index 0000000..341fb29 --- 
/dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/scripts/setup_dataplex_resources.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# Copyright 2026 Teradata +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Setup script for Teradata Dataplex connector. +# Creates the required Entry Group, Aspect Types, and Entry Types +# in Dataplex Universal Catalog before metadata can be imported. +# +# Usage: +# PROJECT_ID=my-project LOCATION=us-central1 ./setup_dataplex_resources.sh +# PROJECT_ID=my-project LOCATION=us-central1 ENTRY_GROUP_ID=my-group ./setup_dataplex_resources.sh + +set -e + +# Configuration +PROJECT_ID="${PROJECT_ID:-YOUR_PROJECT_ID}" +LOCATION="${LOCATION:-us-central1}" +ENTRY_GROUP_ID="${ENTRY_GROUP_ID:-teradata}" + +echo "Using Project: $PROJECT_ID" +echo "Using Location: $LOCATION" +echo "Target Entry Group: $ENTRY_GROUP_ID" + +# 1. Create Entry Group +echo "----------------------------------------------------------------" +echo "Creating Entry Group: $ENTRY_GROUP_ID..." +gcloud dataplex entry-groups create "$ENTRY_GROUP_ID" \ + --project="$PROJECT_ID" \ + --location="$LOCATION" \ + --description="Entry group for Teradata metadata" || echo "Entry Group might already exist." + +# 2. Create Aspect Types +echo "----------------------------------------------------------------" +echo "Creating Aspect Types..." 
+ +# Create metadata template file (JSON format) +cat > template.json < DataFrame: + """Returns dataframe of schemas to extract objects from""" + pass \ No newline at end of file diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/argument_validator.py b/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/argument_validator.py new file mode 100644 index 0000000..014361f --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/argument_validator.py @@ -0,0 +1,156 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from src.common.gcs_uploader import checkDestination +from src.common.secret_manager import get_password +from typing import Optional +import argparse +import re +import logging + +GCP_REGIONS = ['asia-east1', 'asia-east2', 'asia-northeast1', 'asia-northeast2', 'asia-northeast3', 'asia-south1', 'asia-south2', 'asia-southeast1', 'asia-southeast2', 'australia-southeast1', 'australia-southeast2', 'europe-central2', 'europe-north1', 'europe-southwest1', 'europe-west1', 'europe-west2', 'europe-west3', + 'europe-west4', 'europe-west6', 'europe-west8', 'europe-west9', 'europe-west12', 'me-central1', 'me-west1', 'northamerica-northeast1', 'northamerica-northeast2', 'southamerica-east1', 'southamerica-east2', 'us-central1', 'us-east1', 'us-east4', 'us-east5', 'us-south1', 'us-west1', 'us-west2', 'us-west3', 'us-west4'] + +DEFAULT_QUERY_BAND_ORG = "teradata-internal-telem" +DEFAULT_QUERY_BAND_APPNAME = "teradata-dataplex-connector" +DEFAULT_QUERY_BAND = f"org={DEFAULT_QUERY_BAND_ORG};appname={DEFAULT_QUERY_BAND_APPNAME};" +MAX_QUERY_BAND_LENGTH = 2048 +QUERY_BAND_RESERVED_NAMES = {"proxyuser", "proxyrole"} + +# Standard validation checks and value replacements. 
# Additional checks can be applied in cmd_reader for specific data sources
def validateArguments(parsed_args):
    """Validate the common command-line arguments and resolve the password.

    Args:
        parsed_args: argparse namespace carrying ``local_output_only``,
            ``output_bucket``, ``output_folder``, ``target_location_id`` and
            ``password_secret``.

    Returns:
        The same namespace; ``password`` is set from Secret Manager when
        ``password_secret`` was supplied.

    Raises:
        Exception: if the output destination, the target location or the
            secret ID is invalid.
    """
    # Treat None/False alike so an unset flag cannot slip past the first
    # check and reach checkDestination() with a missing bucket.
    if not parsed_args.local_output_only:
        if parsed_args.output_bucket is None or parsed_args.output_folder is None:
            raise Exception("both --output_bucket and --output_folder must be supplied if not using --local_output_only")

        if not checkDestination(parsed_args.output_bucket):
            raise Exception(f"--output_bucket {parsed_args.output_bucket} is not valid")

    if parsed_args.target_location_id not in (GCP_REGIONS + ['global']):
        raise Exception(f"--target_location_id must be valid google cloud region or 'global' : {parsed_args.target_location_id}")

    if parsed_args.password_secret is not None:
        validateSecretID(parsed_args.password_secret)
        parsed_args.password = get_password(parsed_args.password_secret)

    return parsed_args


def validateQueryBand(query_band: Optional[str]) -> str:
    """Validate and normalize a Teradata query band string.

    Returns a normalized query band with org and appname enforced and placed
    first, followed by the remaining pairs in their original order.
    Returns DEFAULT_QUERY_BAND if input is None or empty.

    Raises:
        SystemExit: on disallowed characters, malformed segments, reserved
            names, or when the result exceeds MAX_QUERY_BAND_LENGTH.
    """
    if query_band is None or query_band.strip() == "":
        return DEFAULT_QUERY_BAND

    # Whitelist allowed characters to prevent SQL injection via SET QUERY_BAND
    allowed_pattern = r"^[A-Za-z0-9\-_\.=;, ]+$"
    if not re.match(allowed_pattern, query_band):
        raise SystemExit(
            f"Error: invalid --query_band value '{query_band}'. "
            "Must contain only letters, numbers, hyphens, underscores, "
            "dots, equals, semicolons, commas, and spaces."
        )

    # Parse key-value pairs. A dict keeps first-insertion order, so a repeated
    # key keeps its original position while the last value wins.
    pairs = {}
    for part in query_band.strip().split(";"):
        part = part.strip()
        if not part:
            continue
        if "=" not in part:
            raise SystemExit(
                f"Error: --query_band has malformed segment '{part}'. "
                "Expected format: name=value;"
            )
        key, value = part.split("=", 1)
        key = key.strip().lower()
        if not key:
            raise SystemExit(
                f"Error: --query_band has segment with empty key: '{part}'."
            )
        pairs[key] = value.strip()

    # Reject Teradata reserved names
    for key in pairs:
        if key in QUERY_BAND_RESERVED_NAMES:
            raise SystemExit(
                f"Error: --query_band contains reserved name '{key}'. "
                "PROXYUSER and PROXYROLE require Trusted Session privileges and are not allowed."
            )

    # Enforce org
    if not pairs.get("org"):
        pairs["org"] = DEFAULT_QUERY_BAND_ORG

    # Enforce appname: user-supplied names are suffixed with the connector name
    if not pairs.get("appname"):
        pairs["appname"] = DEFAULT_QUERY_BAND_APPNAME
    elif not pairs["appname"].endswith(DEFAULT_QUERY_BAND_APPNAME):
        pairs["appname"] = f"{pairs['appname']}_{DEFAULT_QUERY_BAND_APPNAME}"

    # Build final query band with required ordering: org -> appname -> rest
    final_order = ["org", "appname"] + [k for k in pairs if k not in ("org", "appname")]
    result = "".join(f"{k}={pairs[k]};" for k in final_order)

    # Enforce maximum length
    if len(result) > MAX_QUERY_BAND_LENGTH:
        raise SystemExit(
            f"Error: --query_band exceeds maximum length of {MAX_QUERY_BAND_LENGTH} characters "
            f"(current: {len(result)}). Reduce the number or length of key-value pairs."
        )

    return result


def validateSecretID(secretpath: str) -> bool:
    """Return True if secretpath looks like projects/PROJECT/secrets/NAME.

    Raises:
        Exception: if the path does not match that format.
    """
    pattern = r"^projects/[^/]+/secrets/[^/]+$"

    if not re.match(pattern, secretpath):
        raise Exception(f"{secretpath} is not a valid Secret ID. Format is projects/PROJECTID/secrets/SECRETNAME.\nExiting.")
    return True


# Validates that a value for at least one of the given list of arguments has been supplied
def checkOptionProvided(args: argparse.Namespace, checkParams: list):
    """Return True if any name in checkParams is set to a non-None value on args."""
    return any(getattr(args, name, None) is not None for name in checkParams)


# true/false argument type
def true_or_false(arg):
    """Convert a (possibly abbreviated) true/false value to a bool.

    Any non-empty, case-insensitive prefix of "TRUE" or "FALSE" is accepted.

    Raises:
        argparse.ArgumentTypeError: if the value is empty or matches neither
            word. The message is also logged at fatal level, as before.
    """
    ua = str(arg).upper()
    # An empty string is a prefix of every string, so it must be excluded
    # explicitly or it would be read as True.
    if ua and 'TRUE'.startswith(ua):
        return True
    if ua and 'FALSE'.startswith(ua):
        return False

    message = f"Received parameter value '{arg}' but expected true or false"
    logging.fatal(message)
    raise argparse.ArgumentTypeError(message)


# ---------------------------------------------------------------------------
# diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/bootstrap.py b/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/bootstrap.py
# new file mode 100644
# index 0000000..a1ed952
# --- /dev/null
# +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/bootstrap.py
# @@ -0,0 +1,133 @@
# ---------------------------------------------------------------------------
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The entrypoint of a pipeline."""
from typing import Dict
import os
import importlib
import sys
import google.cloud.logging as gcp_logging
import logging
from src import cmd_reader
from src.constants import EntryType
from src.constants import SOURCE_TYPE
from src.constants import DB_OBJECT_TYPES_TO_PROCESS
from src.constants import TOP_ENTRY_HIERARCHY
from src.constants import generateFileName
from src.constants import CONNECTOR_MODULE
from src.constants import CONNECTOR_CLASS
from src.common import entry_builder
from src.common import gcs_uploader
from src.common import top_entry_builder
from src.common.util import isRunningInContainer
from src.common.ExternalSourceConnector import IExternalSourceConnector


def write_jsonl(output_file, json_strings):
    """Write each string to output_file on its own line (JSONL format)."""
    output_file.writelines(f"{line}\n" for line in json_strings)


def process_dataset(
    connector: IExternalSourceConnector,
    config: Dict[str, str],
    schema_name: str,
    entry_type: EntryType,
):
    """Extract one object type from one schema and return it as JSON strings.

    Args:
        connector: source connector used to read the raw dataset.
        config: connector configuration.
        schema_name: schema to extract from.
        entry_type: the entry type (e.g. table or view) to extract.

    Returns:
        A list of JSON strings, one per built entry.
    """
    df_raw = connector.get_dataset(schema_name, entry_type)
    df = entry_builder.build_dataset(config, df_raw, schema_name, entry_type)
    return df.toJSON().collect()


def run():
    """Run the extraction pipeline end to end.

    Reads the arguments, instantiates the configured connector, writes the
    top-level, schema and object entries to ./output/<file>, and uploads the
    file to Cloud Storage unless running local-only or the number of object
    entries is below ``min_expected_entries``.

    Exits with status 1 on argument errors or when the schema list cannot be
    read; re-raises any error from connector construction.
    """

    print(f"\nExtracting metadata from {SOURCE_TYPE}")

    try:
        config = cmd_reader.read_args()
    except Exception as ex:
        print(f"Error in arguments: {ex}")
        sys.exit(1)

    local_output_only = config['local_output_only']
    if local_output_only:
        print("File will be generated in local 'output' directory only")

    # Build the output file name from connection details (computed once)
    FILENAME = generateFileName(config)
    # Always bound, so later references cannot hit an unassigned name
    FOLDERNAME = None if local_output_only else config['output_folder']

    # Instantiate connector class
    ConnectorClass = getattr(importlib.import_module(CONNECTOR_MODULE), CONNECTOR_CLASS)

    try:
        connector = ConnectorClass(config)
    except Exception as ex:
        print(f"Error setting up connector for {SOURCE_TYPE}: {ex}")
        # Re-raise as-is to keep the original exception type and traceback
        raise

    entries_count = 0

    output_path = './output'
    # exist_ok avoids the check-then-create race of exists() + mkdir()
    os.makedirs(output_path, exist_ok=True)

    try:
        with open(f"{output_path}/{FILENAME}", "w", encoding="utf-8") as file:
            # First write the top level entry types to file which can be generated without processing the schemas
            for entry in TOP_ENTRY_HIERARCHY:
                file.writelines(top_entry_builder.create(config, entry))
                file.write("\n")

            # Collect list of schemas for extract
            try:
                df_raw_schemas = connector.get_db_schemas()
            except Exception as ex:
                print(f"Error during metadata extraction from db: {ex}")
                sys.exit(1)

            schemas = [schema.SCHEMA_NAME for schema in df_raw_schemas.select("SCHEMA_NAME").collect()]
            schemas_json = entry_builder.build_schemas(config, df_raw_schemas).toJSON().collect()

            write_jsonl(file, schemas_json)

            print("Processing schemas..")

            # Collect metadata for target db objects in each schema
            for schema in schemas:
                for object_type in DB_OBJECT_TYPES_TO_PROCESS:
                    objects_json = process_dataset(connector, config, schema, object_type)
                    print(f"Processed {len(objects_json)} {object_type.name}S in {schema}")
                    entries_count += len(objects_json)
                    write_jsonl(file, objects_json)

        print(f"{entries_count} rows written to file {FILENAME}")

        # If min_expected_entries is set, the file must meet that minimum.
        # An unset/None value means "no minimum" instead of a TypeError.
        min_expected_entries = config.get('min_expected_entries') or 0
        if entries_count < min_expected_entries:
            print(f"Row count is less than min_expected_entries value of {min_expected_entries}. Will not upload to Cloud Storage bucket.")
        elif not local_output_only:
            print(f"Uploading to Cloud Storage bucket: {config['output_bucket']}/{FOLDERNAME}")
            gcs_uploader.upload(config, output_path, FILENAME, FOLDERNAME)
    finally:
        # Runs on normal completion, on errors and on sys.exit() alike
        if hasattr(connector, 'close'):
            connector.close()

    print("Finished")


# ---------------------------------------------------------------------------
# diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/connection_jar.py b/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/connection_jar.py
# new file mode 100644
# index 0000000..5a272be
# --- /dev/null
# +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/connection_jar.py
# @@ -0,0 +1,47 @@
# ---------------------------------------------------------------------------
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Jar files and paths for when connector is running as local script

from pathlib import Path
from src.common.util import isRunningInContainer


# Returns jar path, allowing override with --jar option
def getJarPath(config: dict[str, str], jars_to_include: list[str]) -> str:
    """Return the jar path(s) to use for the JDBC connection.

    Args:
        config: connector configuration; an optional ``jar`` entry overrides
            ``jars_to_include``.
        jars_to_include: default jar file names.

    Returns:
        The user-supplied jar (used verbatim when it starts with "." or "/",
        otherwise resolved against the base directory), or a comma-separated
        list of the default jars resolved against the base directory.
    """
    # jar directory path depending on whether local script or running in container
    base_jar_path = Path("/opt/spark/jars" if isRunningInContainer() else ".")

    user_jar = config.get('jar')
    if user_jar is not None:
        # if file path to jar provided then use it, otherwise base path + jar name
        if user_jar.startswith((".", "/")):
            return user_jar
        return str(base_jar_path.joinpath(user_jar))

    # Build path for one or more jar files
    return ",".join(str(base_jar_path.joinpath(jar)) for jar in jars_to_include)


# ---------------------------------------------------------------------------
# diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/entry_builder.py b/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/entry_builder.py
# new file mode 100644
# index 0000000..cde8c20
# --- /dev/null
# +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/entry_builder.py
# @@ -0,0 +1,279 @@
# ---------------------------------------------------------------------------
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Creates entries with PySpark."""
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from src.datatype_mapper import get_catalog_metadata_type
from src.datatype_mapper import get_readable_type_name
from src.constants import SOURCE_TYPE
from src.constants import COLLECTION_ENTRY
from src.constants import EntryType
from src import name_builder as nb
from src.name_builder import encode_non_ascii
from enum import Enum

# DB-specific value which indicates true
from src.constants import IS_NULLABLE_TRUE


# Property names from google.cloud.dataplex_v1.types.Entry in camel Case
class JSONKeys(Enum):
    NAME = 'name'
    MODE = 'mode'
    ENTRY = 'entry'
    ENTRY_TYPE = 'entryType'
    ENTRY_SOURCE = 'entrySource'
    ASPECT_KEYS = 'aspectKeys'
    ASPECT_TYPE = 'aspectType'
    DISPLAY_NAME = 'displayName'
    UPDATE_MASK = 'updateMask'
    FQN = 'fullyQualifiedName'
    PARENT_ENTRY = 'parentEntry'
    ASPECTS = 'aspects'
    DATA = 'data'
    DATA_TYPE = 'dataType'
    METADATA_TYPE = 'metadataType'
    DESCRIPTION = 'description'
    ENTRY_ASPECT = 'entry_aspect'
    FIELDS = 'fields'
    SYSTEM = 'system'
    PLATFORM = 'platform'
    SCHEMA = 'schema'
    COLUMNS = 'columns'
    DEFAULT_VALUE = 'defaultValue'


# Enum representing the Spark dataframe columns
class Columns(Enum):
    TABLE_NAME = 'TABLE_NAME'
    DATA_TYPE = 'DATA_TYPE'
    COLUMN_NAME = 'COLUMN_NAME'
    IS_NULLABLE = 'IS_NULLABLE'
    SCHEMA_NAME = 'SCHEMA_NAME'
    COLUMN_COMMENT = 'COLUMN_COMMENT'
    TABLE_COMMENT = 'TABLE_COMMENT'
    COLUMN_DEFAULT_VALUE = 'DATA_DEFAULT'
    COLUMN_LENGTH = 'COLUMN_LENGTH'
    DECIMAL_TOTAL_DIGITS = 'DECIMAL_TOTAL_DIGITS'
    DECIMAL_FRACTIONAL_DIGITS = 'DECIMAL_FRACTIONAL_DIGITS'
    TABLE_CREATE_TIME = 'TABLE_CREATE_TIME'
    TABLE_LAST_ALTER_TIME = 'TABLE_LAST_ALTER_TIME'


# Dataplex constants
class DataplexTypesSchema(Enum):
    NULLABLE = 'NULLABLE'
    REQUIRED = 'REQUIRED'


# universal catalog system AspectType for database tables and schemas
SCHEMA_KEY = "dataplex-types.global.schema"


@F.udf(returnType=StringType())
def sanitize_column_name_udf(name: str):
    """Encode non-ASCII characters in a column name via encode_non_ascii.

    Dataplex import rejects non-ASCII characters in column names, so each one
    is replaced by a token derived from its Unicode code point (e.g. _u6D4B_,
    _u1F389_ for an emoji). ASCII special characters (e.g. !@#$) are left
    untouched on purpose: if Dataplex rejects them during import, that is a
    Dataplex constraint, not something the connector should silently alter.
    """
    return encode_non_ascii(name)


@F.udf(returnType=StringType())
def choose_metadata_type_udf(data_type: str):
    """Map the native source type to the Dataplex metadata type."""
    return get_catalog_metadata_type(data_type)


@F.udf(returnType=StringType())
def readable_type_name_udf(data_type: str):
    """Map a short type code to its human-readable type name."""
    return get_readable_type_name(data_type)


def create_entry_source(column, entryType: EntryType, comment):
    """Build the entrySource struct: display name, system and, for tables
    and views only, the comment as description."""
    struct_parts = [
        F.lit(JSONKeys.DISPLAY_NAME.value), column,
        F.lit(JSONKeys.SYSTEM.value), F.lit(SOURCE_TYPE),
    ]

    # Only tables and views carry their comment in the description field
    if entryType in [EntryType.TABLE, EntryType.VIEW]:
        struct_parts += [F.lit(JSONKeys.DESCRIPTION.value), F.lit(comment)]

    return F.named_struct(*struct_parts)


def create_entry_aspect(entry_aspect_name):
    """Build a one-entry aspects map keyed by the aspect name, with empty data."""
    aspect_body = F.named_struct(
        F.lit(JSONKeys.ASPECT_TYPE.value),
        F.lit(entry_aspect_name),
        F.lit(JSONKeys.DATA.value),
        F.create_map(),
    )
    return F.create_map(F.lit(entry_aspect_name), aspect_body)


def convert_to_import_items(df, aspect_keys):
    """Wrap entry columns into import items.

    The entry columns are nested under "entry", the given aspect keys go to
    "aspectKeys", "updateMask" is set to ["aspects"], and the now-nested
    top-level entry columns are dropped.
    """
    entry_columns = [
        JSONKeys.NAME.value,
        JSONKeys.FQN.value,
        JSONKeys.PARENT_ENTRY.value,
        JSONKeys.ENTRY_SOURCE.value,
        JSONKeys.ASPECTS.value,
        JSONKeys.ENTRY_TYPE.value,
    ]

    aspect_key_literals = [F.lit(key) for key in aspect_keys]

    items = df.withColumn(JSONKeys.ENTRY.value, F.struct(entry_columns))
    items = items.withColumn(JSONKeys.ASPECT_KEYS.value, F.array(aspect_key_literals))
    items = items.withColumn(JSONKeys.UPDATE_MASK.value, F.array(F.lit(JSONKeys.ASPECTS.value)))
    return items.drop(*entry_columns)


def build_schemas(config, df_raw_schemas):
    """Create a dataframe with database schemas from the list of usernames.
    Args:
        df_raw_schemas - a dataframe with only one column called SCHEMA_NAME
    Returns:
        A dataframe with Dataplex-readable schemas.
    """
    entry_type = COLLECTION_ENTRY
    entry_aspect_name = nb.create_entry_aspect_name(config, entry_type)

    # For schema, parent name is the name of the database
    parent_name = nb.create_parent_name(config, entry_type)

    # Per-row name builders wrapped as Spark UDFs
    create_name_udf = F.udf(
        lambda schema_name: nb.create_name(config, entry_type, schema_name),
        StringType())
    create_fqn_udf = F.udf(
        lambda schema_name: nb.create_fqn(config, entry_type, schema_name),
        StringType())

    # Fills the project and location into the entry type string
    full_entry_type = entry_type.value.format(
        project=config["target_project_id"],
        location=config["target_location_id"])

    # Convert list of schema names to Dataplex-compatible form
    column = F.col(Columns.SCHEMA_NAME.value)
    entry_source = create_entry_source(
        column, entry_type, F.col(JSONKeys.DESCRIPTION.value))

    df = df_raw_schemas.withColumn(JSONKeys.NAME.value, create_name_udf(column))
    df = df.withColumn(JSONKeys.FQN.value, create_fqn_udf(column))
    df = df.withColumn(JSONKeys.PARENT_ENTRY.value, F.lit(parent_name))
    df = df.withColumn(JSONKeys.ENTRY_TYPE.value, F.lit(full_entry_type))
    df = df.withColumn(JSONKeys.ENTRY_SOURCE.value, entry_source)
    df = df.withColumn(JSONKeys.ASPECTS.value, create_entry_aspect(entry_aspect_name))
    df = df.drop(column)

    return convert_to_import_items(df, [entry_aspect_name])


def build_dataset(config, df_raw, db_schema, entry_type):
    """Build table entries from a flat list of columns.
    Args:
        df_raw - a plain dataframe with TABLE_NAME, COLUMN_NAME, DATA_TYPE,NULLABLE,COMMENT columns
        db_schema - parent database schema
        entry_type - entry type: table or view
    Returns:
        A dataframe with Dataplex-readable data of tables of views.
    """

    # Column-level normalisation, in this order:
    # 1. Missing DATA_TYPE becomes 'UNKNOWN'
    # 2. IS_NULLABLE is turned into mode (NULLABLE/REQUIRED)
    # 3. metadataType and a readable dataType are derived from DATA_TYPE
    # 4. COLUMN_NAME is renamed to name and non-ASCII characters are encoded
    # 5. COLUMN_COMMENT is renamed to description
    # 6. DATA_DEFAULT is renamed to defaultValue
    # 7. Missing description / TABLE_COMMENT become ''
    mode_column = F.when(
        F.col(Columns.IS_NULLABLE.value) == IS_NULLABLE_TRUE,
        DataplexTypesSchema.NULLABLE.value,
    ).otherwise(DataplexTypesSchema.REQUIRED.value)

    df = df_raw.na.fill(value='UNKNOWN', subset=[Columns.DATA_TYPE.value])
    df = df.withColumn(JSONKeys.MODE.value, mode_column)
    df = df.drop(Columns.IS_NULLABLE.value)
    df = df.withColumn(JSONKeys.METADATA_TYPE.value, choose_metadata_type_udf(Columns.DATA_TYPE.value))
    df = df.withColumn(JSONKeys.DATA_TYPE.value, readable_type_name_udf(Columns.DATA_TYPE.value))
    df = df.drop(Columns.DATA_TYPE.value)
    df = df.withColumnRenamed(Columns.COLUMN_NAME.value, JSONKeys.NAME.value)
    df = df.withColumn(JSONKeys.NAME.value, sanitize_column_name_udf(F.col(JSONKeys.NAME.value)))
    df = df.withColumnRenamed(Columns.COLUMN_COMMENT.value, JSONKeys.DESCRIPTION.value)
    df = df.withColumnRenamed(Columns.COLUMN_DEFAULT_VALUE.value, JSONKeys.DEFAULT_VALUE.value)
    df = df.na.fill(value='', subset=[JSONKeys.DESCRIPTION.value])
    df = df.na.fill(value='', subset=[Columns.TABLE_COMMENT.value])

    # Denormalise: one row per table, with the per-column structs gathered
    # into the array column "fields"
    aspect_columns = [
        JSONKeys.NAME.value,
        JSONKeys.MODE.value,
        JSONKeys.DATA_TYPE.value,
        JSONKeys.METADATA_TYPE.value,
        JSONKeys.DESCRIPTION.value,
        JSONKeys.DEFAULT_VALUE.value,
    ]
    df = df.withColumn(JSONKeys.COLUMNS.value, F.struct(aspect_columns))
    df = df.groupby(Columns.TABLE_NAME.value, Columns.TABLE_COMMENT.value) \
        .agg(F.collect_list(JSONKeys.COLUMNS.value).alias(JSONKeys.FIELDS.value))

    # From here on "description" is the table-level comment
    df = df.withColumnRenamed(Columns.TABLE_COMMENT.value, JSONKeys.DESCRIPTION.value)

    # Schema aspect: the collected fields wrapped under the system schema key
    entry_aspect_name = nb.create_entry_aspect_name(config, entry_type)
    schema_data = F.create_map(
        F.lit(JSONKeys.FIELDS.value), F.col(JSONKeys.FIELDS.value))
    schema_aspect = F.create_map(
        F.lit(SCHEMA_KEY),
        F.named_struct(
            F.lit(JSONKeys.ASPECT_TYPE.value),
            F.lit(SCHEMA_KEY),
            F.lit(JSONKeys.DATA.value),
            schema_data,
        ),
    )
    df = df.withColumn(JSONKeys.SCHEMA.value, schema_aspect)
    df = df.withColumn(JSONKeys.ENTRY_ASPECT.value, create_entry_aspect(entry_aspect_name))
    df = df.drop(JSONKeys.FIELDS.value)

    # Merge separate aspect columns into 'aspects' map
    merged_aspects = F.map_concat(
        JSONKeys.SCHEMA.value, JSONKeys.ENTRY_ASPECT.value,
    ).alias(JSONKeys.ASPECTS.value)
    df = df.select(
        F.col(Columns.TABLE_NAME.value),
        F.col(JSONKeys.DESCRIPTION.value),
        merged_aspects,
    )

    # Per-row name builders for the general information and hierarchy names
    create_name_udf = F.udf(
        lambda table_name: nb.create_name(config, entry_type, db_schema, table_name),
        StringType())
    create_fqn_udf = F.udf(
        lambda table_name: nb.create_fqn(config, entry_type, db_schema, table_name),
        StringType())

    parent_name = nb.create_parent_name(config, entry_type, db_schema)

    full_entry_type = entry_type.value.format(
        project=config["target_project_id"],
        location=config["target_location_id"])

    # Fill the top-level fields
    column = F.col(Columns.TABLE_NAME.value)
    entry_source = create_entry_source(
        column, entry_type, F.col(JSONKeys.DESCRIPTION.value))

    df = df.withColumn(JSONKeys.NAME.value, create_name_udf(column))
    df = df.withColumn(JSONKeys.FQN.value, create_fqn_udf(column))
    df = df.withColumn(JSONKeys.ENTRY_TYPE.value, F.lit(full_entry_type))
    df = df.withColumn(JSONKeys.PARENT_ENTRY.value, F.lit(parent_name))
    df = df.withColumn(JSONKeys.ENTRY_SOURCE.value, entry_source)
    df = df.drop(column)
    df = df.drop(JSONKeys.DESCRIPTION.value)

    return convert_to_import_items(df, [SCHEMA_KEY, entry_aspect_name])


# ---------------------------------------------------------------------------
# diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/gcs_uploader.py b/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/gcs_uploader.py
# new file mode 100644
# index 0000000..4fbadc1
# --- /dev/null
# +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/gcs_uploader.py
# @@ -0,0 +1,40 @@
# ---------------------------------------------------------------------------
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Sends files to Cloud Storage."""
from typing import Dict
from google.cloud import storage
import logging


def upload(config: Dict[str, str], fileDirectory: str, filename: str, folder: str):
    """Upload a local file to the configured Cloud Storage bucket.

    Args:
        config: connector configuration; ``output_bucket`` names the bucket.
        fileDirectory: local directory containing the file.
        filename: name of the file, used both locally and as the object name.
        folder: destination folder (object name prefix) inside the bucket.
    """
    client = storage.Client()
    bucket = client.get_bucket(config["output_bucket"])

    # Strip surrounding slashes so "folder/" or "/folder" do not produce
    # object names such as "folder//file" or "/folder/file"; an empty
    # folder uploads to the bucket root.
    prefix = folder.strip("/") if folder else ""
    blob_name = f"{prefix}/{filename}" if prefix else filename

    blob = bucket.blob(blob_name)
    blob.upload_from_filename(f"{fileDirectory}/{filename}")


def checkDestination(bucketpath: str):
    """Check that the Cloud Storage output bucket exists.

    Args:
        bucketpath: bucket name, without the gs:// prefix.

    Returns:
        True when the bucket exists.

    Raises:
        Exception: if the name carries a gs:// prefix or the bucket does
            not exist.
    """
    # Validate the argument first: no client (and no credentials lookup) is
    # needed to reject a malformed bucket name.
    if bucketpath.startswith("gs://"):
        raise Exception(f"Please provide output Cloud Storage bucket {bucketpath} without gs:// prefix")

    client = storage.Client()
    bucket = client.bucket(bucketpath)

    if not bucket.exists():
        raise Exception(f"Cloud Storage bucket {bucketpath} does not exist")

    return True


# ---------------------------------------------------------------------------
# diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/secret_manager.py b/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/secret_manager.py
# new file mode 100644
# index 0000000..f6e9a60
# --- /dev/null
# +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/src/common/secret_manager.py
# @@ -0,0 +1,25 @@
# ---------------------------------------------------------------------------
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Retrieve password from Secret Manager
def get_password(secret_path: str) -> str:
    """Read a password from Secret Manager.

    Args:
        secret_path: Secret resource name, with or without a trailing
            ``/versions/<id>`` part.

    Returns:
        The secret payload decoded as UTF-8.
    """
    client = secretmanager.SecretManagerServiceClient()
    # Look for the "/versions/" path segment rather than the bare word, so a
    # project or secret id that merely contains "versions" still gets the
    # latest version appended.
    if "/versions/" not in secret_path:
        # If not specified, we need the latest version of a password
        secret_path = f"{secret_path}/versions/latest"
    response = client.access_secret_version(request={"name": secret_path})
    return response.payload.data.decode("UTF-8")
@dataclasses.dataclass(slots=True)
class ImportItem:
    """Container for one import item: an entry plus its keys and mask."""

    # Entry carried by this import item.
    entry: dataplex_v1.Entry = dataclasses.field(default_factory=dataplex_v1.Entry)
    # Keys of the aspects attached to the entry.
    aspect_keys: List[str] = dataclasses.field(default_factory=list)
    # Update mask that accompanies the entry.
    update_mask: List[str] = dataclasses.field(default_factory=list)


def _dict_factory(data: object):
    """Build a dict from (key, value) pairs, expanding proto messages.

    Usable as ``dict_factory`` for ``dataclasses.asdict``; keys are kept as is.
    """
    return {
        key: proto.Message.to_dict(value) if isinstance(value, proto.Message) else value
        for key, value in data
    }


def _to_camel_case(keyname_str: str) -> str:
    """Convert one of the known snake_case keys to camelCase.

    Anything that is not a string, or is not in the list of known keys, is
    returned unchanged.
    """
    # Only known keys are converted, to prevent unexpected changes to aspect
    # keys and similar values. Every entry contains an underscore.
    convertible = (
        "entry_type", "aspect_key", "aspect_type", "source_type",
        "parent_entry", "fully_qualified_name", "update_mask",
        "aspect_keys", "entry_source", "display_name",
    )
    if not isinstance(keyname_str, str) or keyname_str not in convertible:
        return keyname_str

    return re.sub(r"[-_]([a-zA-Z])", lambda match: match.group(1).upper(), keyname_str)


def _dict_factory_camcelCase(data: object):
    """Build a dict from (key, value) pairs with camelCase property names."""

    def convert_recursive(obj: object):
        """Walk proto messages, dicts, lists and tuples, renaming dict keys."""
        if isinstance(obj, proto.Message):
            # Turn the message into a plain dict, then process that dict.
            return convert_recursive(proto.Message.to_dict(obj))
        if isinstance(obj, dict):
            return {
                _to_camel_case(key): convert_recursive(val)
                for key, val in obj.items()
            }
        if isinstance(obj, (list, tuple)):
            # Rebuild the same container type around the converted items.
            return type(obj)(map(convert_recursive, obj))
        # Scalars (int, str, bool, ...) pass through untouched.
        return obj

    return {
        _to_camel_case(field_name): convert_recursive(field_value)
        for field_name, field_value in data
    }


def _create_entry(config: Dict[str, str], entry_type: EntryType):
    """Create the Dataplex entry for a top-level entry type."""
    result = dataplex_v1.Entry()
    result.name = nb.create_name(config, entry_type)

    result.entry_type = entry_type.value.format(
        project=config["target_project_id"],
        location=config["target_location_id"],
    )

    result.fully_qualified_name = nb.create_fqn(config, entry_type)
    result.parent_entry = nb.create_parent_name(config, entry_type)

    # The display name is the last path segment of the entry name.
    result.entry_source = dataplex_v1.EntrySource(
        display_name=result.name.split("/")[-1],
        system=SOURCE_TYPE,
    )

    # Attach the mandatory aspect, keyed by its aspect name, with empty data.
    aspect_key = nb.create_entry_aspect_name(config, entry_type)
    mandatory_aspect = dataplex_v1.Aspect()
    mandatory_aspect.aspect_type = aspect_key
    mandatory_aspect.data = {}
    result.aspects[aspect_key] = mandatory_aspect

    return result


def _entry_to_import_item(entry: dataplex_v1.Entry):
    """Wrap an entry in an ImportItem that updates its aspects."""
    return ImportItem(
        entry=entry,
        aspect_keys=list(entry.aspects.keys()),
        update_mask=["aspects"],
    )


def create(config, entry_type: EntryType):
    """Create an entry, pack it as an import item and return it as JSON."""
    item = _entry_to_import_item(_create_entry(config, entry_type))
    as_camel_case = dataclasses.asdict(item, dict_factory=_dict_factory_camcelCase)
    return json.dumps(as_camel_case)
# Loads file at a given path and returns the content as a string
def loadReferencedFile(file_path) -> str:
    """Return the UTF-8 text content of the file at ``file_path``.

    On any error the message is printed and the process exits with status 1.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Error while reading file {file_path}: {e}")
        sys.exit(1)


# Convert string to camel case - dataplex v1 property names
def to_camel_case(text) -> str:
    """Drop each ``-`` or ``_`` that precedes a letter and upper-case the letter."""
    return re.sub(r"[-_]([a-zA-Z])", lambda x: x[1].upper(), text)


# folder name with timestamp
def generateFolderName(SOURCE_TYPE: str) -> str:
    """Return ``<SOURCE_TYPE>/<YYYYMMDD>-<HHMMSS>`` for the current local time.

    Every component is zero-padded. Without padding, distinct moments could
    produce the same name (for example month 1 / day 11 and month 11 / day 1)
    and the names would not sort chronologically.
    """
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    return f"{SOURCE_TYPE}/{timestamp}"


# True if running in container
def isRunningInContainer() -> bool:
    """Report whether RUNNING_IN_CONTAINER is set to a truthy value."""
    return os.environ.get("RUNNING_IN_CONTAINER", "").lower() in ("yes", "y", "on", "true", "1")
# Returns True if a file exists at the given path.
# filepath can also be a comma-separated list of multiple jar paths
def fileExists(filepath : str) -> bool:
    """Verify that every path in ``filepath`` points at an existing file.

    ``filepath`` is either one path or several paths joined by commas.
    Returns True when all of them exist; raises Exception naming the first
    path that does not.
    """
    # Splitting a value without commas yields the value itself, so one loop
    # covers both the single-path and the multi-path form.
    for candidate in filepath.split(","):
        if not os.path.isfile(candidate):
            raise Exception(f"Jar file not found: {candidate}")

    return True
# Marker for a nullable column; compared against the Y/N nullable flag.
IS_NULLABLE_TRUE = "Y"


class EntryType(enum.Enum):
    """Entry type resource-name templates, from instance down to table/view.

    Each value is a template with ``{project}`` and ``{location}``
    placeholders to be filled in with ``str.format``.
    """

    INSTANCE = "projects/{project}/locations/{location}/entryTypes/teradata-instance"
    DATABASE = "projects/{project}/locations/{location}/entryTypes/teradata-database"
    DB_SCHEMA = "projects/{project}/locations/{location}/entryTypes/teradata-schema"
    TABLE = "projects/{project}/locations/{location}/entryTypes/teradata-table"
    VIEW = "projects/{project}/locations/{location}/entryTypes/teradata-view"


# Top-level entries written before schema processing
TOP_ENTRY_HIERARCHY: List[EntryType] = [EntryType.INSTANCE, EntryType.DATABASE]

# EntryType under which tables/views are organized
COLLECTION_ENTRY: EntryType = EntryType.DB_SCHEMA

# DB objects to extract metadata for
DB_OBJECT_TYPES_TO_PROCESS: List[EntryType] = [EntryType.TABLE, EntryType.VIEW]


def generateFileName(config: dict) -> str:
    """Return the output file name, ``<SOURCE_TYPE>-<host>.jsonl``."""
    host = config['host']
    return f"{SOURCE_TYPE}-{host}.jsonl"
# Values matched as a whole string: the DBC.ColumnsV short codes plus the few
# full type names that are compared for equality rather than by prefix.
_EXACT_METADATA_TYPES = {
    type_code: metadata_type
    for metadata_type, type_codes in (
        ("DATETIME", ("DA", "AT", "TZ", "DATE")),
        ("TIMESTAMP", ("TS", "SZ")),
        ("NUMBER", ("I", "I1", "I2", "I8", "D", "F", "N",
                    "INTEGER", "SMALLINT", "BIGINT", "REAL")),
        ("STRING", ("CV", "CF", "CO", "LV", "JN", "XM", "GF", "GV", "GL")),
        ("BYTES", ("BV", "BF", "BO")),
        ("BOOLEAN", ("BOOLEAN",)),
        ("OTHER", (
            # Interval short codes
            "YR", "YM", "MO", "DY", "DH", "DM", "DS",
            "HR", "HM", "HS", "MI", "MS", "SC",
            # Period short codes
            "PD", "PT", "PS", "PM", "PZ",
            # UDT, Dataset, Array short codes
            "UT", "DT", "A1", "AN",
        )),
    )
    for type_code in type_codes
}

# Full type names may carry size qualifiers (DECIMAL(18,2), VARCHAR(100), ...)
# and are therefore matched by prefix. Order matters: BYTEINT is tested before
# BYTE, and TIMESTAMP before TIME.
_PREFIX_METADATA_TYPES = (
    (("BYTEINT", "FLOAT", "DOUBLE", "DECIMAL", "NUMERIC", "NUMBER"), "NUMBER"),
    (("LONG VARCHAR", "LONG VARGRAPHIC", "VARCHAR", "VARGRAPHIC",
      "CHAR", "GRAPHIC", "CLOB", "JSON", "XML"), "STRING"),
    (("LONG VARBYTE", "VARBYTE", "BLOB", "BYTE"), "BYTES"),
    (("TIMESTAMP",), "TIMESTAMP"),
    (("TIME",), "DATETIME"),
)


def get_catalog_metadata_type(data_type: str) -> str:
    """Map a Teradata type (short code or full name) to a metadata type.

    The input is stripped and upper-cased before matching. ``None`` and any
    unrecognised value map to ``"OTHER"``.
    """
    if data_type is None:
        return "OTHER"
    normalized = data_type.strip().upper()

    exact_match = _EXACT_METADATA_TYPES.get(normalized)
    if exact_match is not None:
        return exact_match

    for prefixes, metadata_type in _PREFIX_METADATA_TYPES:
        if normalized.startswith(prefixes):
            return metadata_type

    # Geospatial, Interval, Period, UDT, Dataset, Array, etc.
    return "OTHER"
_SHORT_CODE_TO_NAME = {
    # Numeric
    "I": "INTEGER",
    "I1": "BYTEINT",
    "I2": "SMALLINT",
    "I8": "BIGINT",
    "D": "DECIMAL",
    "F": "FLOAT",
    "N": "NUMBER",
    # String
    "CV": "VARCHAR",
    "CF": "CHAR",
    "CO": "CLOB",
    "LV": "LONG VARCHAR",
    "JN": "JSON",
    "XM": "XML",
    "GF": "GRAPHIC",
    "GV": "VARGRAPHIC",
    "GL": "LONG VARGRAPHIC",
    # Bytes
    "BV": "VARBYTE",
    "BF": "BYTE",
    "BO": "BLOB",
    # Date/Time
    "DA": "DATE",
    "AT": "TIME",
    "TS": "TIMESTAMP",
    "SZ": "TIMESTAMP WITH TIME ZONE",
    "TZ": "TIME WITH TIME ZONE",
    # Interval
    "YR": "INTERVAL YEAR",
    "YM": "INTERVAL YEAR TO MONTH",
    "MO": "INTERVAL MONTH",
    "DY": "INTERVAL DAY",
    "DH": "INTERVAL DAY TO HOUR",
    "DM": "INTERVAL DAY TO MINUTE",
    "DS": "INTERVAL DAY TO SECOND",
    "HR": "INTERVAL HOUR",
    "HM": "INTERVAL HOUR TO MINUTE",
    "HS": "INTERVAL HOUR TO SECOND",
    "MI": "INTERVAL MINUTE",
    "MS": "INTERVAL MINUTE TO SECOND",
    "SC": "INTERVAL SECOND",
    # Period. PM is the TIMESTAMP WITH TIME ZONE period and PZ the
    # TIME WITH TIME ZONE period, per the Teradata ColumnType code list
    # (the two were previously swapped).
    "PD": "PERIOD(DATE)",
    "PT": "PERIOD(TIME)",
    "PS": "PERIOD(TIMESTAMP)",
    "PM": "PERIOD(TIMESTAMP WITH TIME ZONE)",
    "PZ": "PERIOD(TIME WITH TIME ZONE)",
    # Other
    "UT": "UDT",
    "DT": "DATASET",
    "A1": "ARRAY",
    "AN": "MULTI-DIMENSIONAL ARRAY",
}


def get_readable_type_name(data_type: str) -> str:
    """Convert a Teradata short code to a human-readable type name.

    The value is stripped and upper-cased first. If it is not a known short
    code (for example it is already a full name), that normalized value is
    returned. ``None`` yields ``"UNKNOWN"``.
    """
    if data_type is None:
        return "UNKNOWN"
    dt = data_type.strip().upper()
    return _SHORT_CODE_TO_NAME.get(dt, dt)
Returns None if input is None. + """ + if text is None: + return text + if all(ord(ch) <= 127 for ch in text): + return text + return "".join( + f"_u{ord(ch):04X}_" if ord(ch) > 127 else ch + for ch in text + ) + + +def _sanitize_entry_id(segment: str) -> str: + """Replace characters not allowed in Dataplex entry IDs. + + Non-ASCII characters (Chinese, etc.) are converted to + _u_ format (e.g. _u6D4B_, _u1F389_ for emoji). + Other invalid characters (including '/') are replaced with + underscores. + """ + encoded = encode_non_ascii(segment) or "" + return "".join( + ch if ch in _ALLOWED_ENTRY_ID_CHARS else "_" + for ch in encoded + ) + + +def _sanitize_fqn_segment(segment: str) -> str: + """Replace dots with hyphens so Dataplex FQN parser doesn't split on them.""" + return segment.replace(".", "-") + + +def create_fqn( + config: Dict[str, str], + entry_type: EntryType, + schema_name: str = "", + table_name: str = "", +) -> str: + """Creates a fully qualified name.""" + host = _sanitize_fqn_segment(config["host"]) + + if entry_type == EntryType.INSTANCE: + return f"{FQN_PREFIX}:`{host}`" + + if entry_type == EntryType.DATABASE: + instance = create_fqn(config, EntryType.INSTANCE) + return f"{instance}.{host}" + + if entry_type == EntryType.DB_SCHEMA: + database = create_fqn(config, EntryType.DATABASE) + return f"{database}.{schema_name}" + + if entry_type in [EntryType.TABLE, EntryType.VIEW]: + database = create_fqn(config, EntryType.DATABASE) + return f"{database}.{schema_name}.{table_name}" + + return "" + + +def create_name( + config: Dict[str, str], + entry_type: EntryType, + schema_name: str = "", + table_name: str = "", +) -> str: + """Creates a Dataplex v2 hierarchy name (resource path).""" + if entry_type == EntryType.INSTANCE: + name_prefix = ( + f"projects/{config['target_project_id']}/" + f"locations/{config['target_location_id']}/" + f"entryGroups/{config['target_entry_group_id']}/" + f"entries/" + ) + return name_prefix + 
_sanitize_entry_id(config["host"].replace(":", "@")) + + if entry_type == EntryType.DATABASE: + instance = create_name(config, EntryType.INSTANCE) + return f"{instance}/databases/{_sanitize_entry_id(config['host'])}" + + if entry_type == EntryType.DB_SCHEMA: + database = create_name(config, EntryType.DATABASE) + return f"{database}/database_schemas/{_sanitize_entry_id(schema_name)}" + + if entry_type == EntryType.TABLE: + db_schema = create_name( + config, EntryType.DB_SCHEMA, schema_name + ) + return f"{db_schema}/tables/{_sanitize_entry_id(table_name)}" + + if entry_type == EntryType.VIEW: + db_schema = create_name( + config, EntryType.DB_SCHEMA, schema_name + ) + return f"{db_schema}/views/{_sanitize_entry_id(table_name)}" + + return "" + + +def create_parent_name( + config: Dict[str, str], + entry_type: EntryType, + parent_name: str = "", +) -> str: + """Generates a Dataplex v2 name of the parent.""" + if entry_type == EntryType.DATABASE: + return create_name(config, EntryType.INSTANCE) + + if entry_type == EntryType.DB_SCHEMA: + return create_name(config, EntryType.DATABASE) + + if entry_type in DB_OBJECT_TYPES_TO_PROCESS: + return create_name(config, EntryType.DB_SCHEMA, parent_name) + + return "" + + +def create_entry_aspect_name( + config: Dict[str, str], entry_type: EntryType +) -> str: + """Generates an entry aspect name.""" + last_segment = entry_type.value.split("/")[-1] + return ( + f"{config['target_project_id']}" + f".{config['target_location_id']}" + f".{last_segment}" + ) diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/src/teradata_connector.py b/managed-connectivity/community-contributed-connectors/teradata-connector/src/teradata_connector.py new file mode 100644 index 0000000..7f3c48a --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/src/teradata_connector.py @@ -0,0 +1,355 @@ +# Copyright 2026 Teradata +# +# Licensed under the Apache License, Version 2.0 (the 
# Teradata system databases to exclude from metadata extraction
SYSTEM_DATABASES = (
    "DBC", "SysAdmin", "SystemFe", "TDQCD", "TDStats",
    "tdwm", "SYSLIB", "SYSBAR", "SYSJDBC", "SYSSPATIAL",
    "SysUDTLib", "dbcmngr", "LockLogShredder", "SQLJ",
    "Crashdumps", "Default", "EXTUSER", "TDPUSER",
    "TDMaps", "TD_SYSXML", "TD_SERVER_DB",
    "All", "Sys_Calendar", "SYSUDTLIB", "SYSUIF",
    "External_AP", "SYSGATEWAY", "TD_SYSFNLIB",
    "TD_SYSGPL",
)


class TeradataConnector(IExternalSourceConnector):
    """Reads metadata from Teradata and returns Spark DataFrames.

    Two connections are used: Spark JDBC for dictionary queries that return
    DataFrames, and a native ``teradatasql`` connection for the view list
    and ``HELP COLUMN``. The instance can be used as a context manager;
    ``close()`` releases both.
    """

    def __init__(self, config: Dict[str, str]):
        """Validate the configuration and open both connections.

        Args:
            config: Connector settings. ``host`` and ``port`` are required;
                ``charset``, ``logmech``, ``logdata``, ``user``, ``password``,
                ``query_band`` and ``database`` are optional.
        """
        jar_path = getJarPath(config, [JDBC_JAR])
        fileExists(jar_path)

        # Validate config before starting heavyweight resources
        query_band = validateQueryBand(config.get("query_band"))

        self._config = config
        self._query_band = query_band

        # Build the URL and options before Spark starts, so a missing
        # host/port fails without leaving a SparkSession behind.
        charset = config.get("charset", "UTF8")
        self._url = (
            f"jdbc:teradata://{config['host']}"
            f"/DBS_PORT={config['port']},CHARSET={charset}"
        )

        # Append LOGMECH / LOGDATA to the JDBC URL so the
        # Teradata JDBC driver receives them as connection
        # parameters (Spark's BasicConnectionProvider does not
        # forward custom options as Teradata properties).
        if config.get("logmech"):
            self._url += f",LOGMECH={config['logmech']}"
        if config.get("logdata"):
            self._url += f",LOGDATA={config['logdata']}"

        self._connectOptions = {
            "driver": "com.teradata.jdbc.TeraDriver",
            "url": self._url,
        }
        if config.get("user"):
            self._connectOptions["user"] = config["user"]
        if config.get("password"):
            self._connectOptions["password"] = config["password"]

        # Safe: validateQueryBand whitelist guarantees no single quotes in _query_band
        self._connectOptions["sessionInitStatement"] = (
            f"SET QUERY_BAND = '{self._query_band}' FOR SESSION"
        )

        self._spark = (
            SparkSession.builder
            .appName("TeradataIngestor")
            .config("spark.jars", jar_path)
            .config("spark.log.level", "ERROR")
            .getOrCreate()
        )

        # Native Python connection for HELP COLUMN (views)
        td_connect_params = {
            "host": config["host"],
            "dbs_port": str(config["port"]),
        }
        for option in ("user", "password", "logmech", "logdata"):
            if config.get(option):
                td_connect_params[option] = config[option]

        try:
            self._td_conn = teradatasql.connect(**td_connect_params)

            # Safe: validateQueryBand whitelist guarantees no single quotes in _query_band
            with self._td_conn.cursor() as cur:
                cur.execute(
                    f"SET QUERY_BAND = '{self._query_band}' FOR SESSION"
                )
        except Exception:
            # Release the SparkSession (and the connection, if it was
            # opened) before reporting the failure to the caller.
            self.close()
            raise

    def _execute(self, query: str) -> DataFrame:
        """Execute a query via JDBC and return a DataFrame."""
        return (
            self._spark.read.format("jdbc")
            .options(**self._connectOptions)
            .option("query", query)
            .load()
        )

    def get_db_schemas(self) -> DataFrame:
        """Get database/schema names, excluding system databases.

        When ``config["database"]`` is set, only that database is returned.
        The result has a single ``SCHEMA_NAME`` column.
        """
        exclusion_list = ",".join(f"'{db}'" for db in SYSTEM_DATABASES)

        # Optional: scope to a single database. Single quotes are doubled,
        # as for every other value placed inside a SQL string literal here.
        db_filter = ""
        database = self._config.get("database")
        if database:
            safe_database = database.replace("'", "''")
            db_filter = f"AND DatabaseName = '{safe_database}'"

        query = f"""
            SELECT TRIM(DatabaseName) AS SCHEMA_NAME
            FROM DBC.DatabasesV
            WHERE DatabaseName NOT IN ({exclusion_list})
            {db_filter}
        """
        return self._execute(query)

    def get_dataset(
        self, schema_name: str, entry_type: EntryType
    ) -> DataFrame:
        """Get table or view metadata with columns.

        Args:
            schema_name: the Teradata database/schema to query
            entry_type: EntryType.TABLE selects tables; any other value
                selects views
        """
        if entry_type == EntryType.TABLE:
            return self._get_tables(schema_name)
        return self._get_views(schema_name)

    def _get_tables(self, schema_name: str) -> DataFrame:
        """Get table metadata from DBC.ColumnsV, ordered by table name.

        One row per column of every object in ``schema_name`` whose
        TableKind is 'T' or 'O'.
        """
        safe_schema = schema_name.replace("'", "''")
        query = f"""
            SELECT
                TRIM(c.TableName) AS TABLE_NAME,
                TRIM(c.ColumnName) AS COLUMN_NAME,
                TRIM(c.ColumnType) AS DATA_TYPE,
                CASE WHEN c.Nullable = 'Y'
                     THEN 'Y' ELSE 'N'
                END AS IS_NULLABLE,
                t.CommentString AS TABLE_COMMENT,
                c.CommentString AS COLUMN_COMMENT,
                c.DefaultValue AS DATA_DEFAULT,
                t.CreateTimeStamp AS TABLE_CREATE_TIME,
                t.LastAlterTimeStamp AS TABLE_LAST_ALTER_TIME
            FROM DBC.ColumnsV c
            INNER JOIN DBC.TablesV t
                ON t.TableName = c.TableName
                AND t.DatabaseName = c.DatabaseName
            WHERE c.DatabaseName = '{safe_schema}'
              AND t.TableKind IN ('T', 'O')
        """
        return self._execute(query).orderBy("TABLE_NAME")

    def _execute_td(self, query: str) -> List[Tuple[Any, ...]]:
        """Execute a query via teradatasql and return rows."""
        with self._td_conn.cursor() as cur:
            cur.execute(query)
            return cur.fetchall()

    def _get_views(self, schema_name: str) -> DataFrame:
        """Get view metadata using teradatasql + HELP COLUMN.

        DBC.ColumnsV returns NULL types for view columns and
        DBC.ColumnsQV requires the QVCI feature which is unstable
        and often not enabled. HELP COLUMN reliably resolves
        view column types via the teradatasql Python driver.

        Returns a DataFrame with the schema of ``_view_column_schema()``;
        it is empty when the schema has no views.
        """
        # Step 1: Get list of views with table-level metadata
        safe_schema = schema_name.replace("'", "''")
        view_list_query = f"""
            SELECT TRIM(TableName),
                   CommentString,
                   CreateTimeStamp,
                   LastAlterTimeStamp
            FROM DBC.TablesV
            WHERE DatabaseName = '{safe_schema}'
              AND TableKind = 'V'
            ORDER BY TableName
        """
        views = self._execute_td(view_list_query)

        if not views:
            return self._spark.createDataFrame(
                [], self._view_column_schema()
            )

        # Step 2: For each view, get column metadata via HELP COLUMN.
        # Quote identifiers to handle special characters and reserved words.
        quoted_schema = schema_name.replace('"', '""')
        all_rows = []
        for view_name, table_comment, create_time, alter_time in views:
            quoted_view = view_name.replace('"', '""')
            help_query = f'HELP COLUMN "{quoted_schema}"."{quoted_view}".*'
            # Rows are collected per view and merged only on success, so a
            # failure part-way through never leaves partial rows next to
            # the fallback rows for the same view.
            view_rows = []
            try:
                with self._td_conn.cursor() as cur:
                    cur.execute(help_query)
                    col_descriptions = cur.description
                    columns = cur.fetchall()

                # Build column name index from cursor description
                col_idx = {
                    desc[0]: i
                    for i, desc in enumerate(col_descriptions)
                }

                for col in columns:
                    col_name = col[col_idx["Column Name"]]
                    col_type = col[col_idx["Type"]]
                    nullable = col[col_idx["Nullable"]]
                    max_len = col[col_idx["Max Length"]]
                    dec_total = col[col_idx["Decimal Total Digits"]]
                    dec_frac = col[col_idx["Decimal Fractional Digits"]]
                    # "Comment" is optional in the HELP COLUMN result.
                    comment = col[col_idx["Comment"]] \
                        if "Comment" in col_idx else None

                    view_rows.append((
                        view_name,
                        col_name.strip() if col_name else "",
                        col_type.strip() if col_type else None,
                        max_len,
                        dec_total,
                        dec_frac,
                        "Y" if nullable == "Y" else "N",
                        table_comment,
                        comment,
                        None,  # DATA_DEFAULT not in HELP COLUMN
                        create_time,
                        alter_time,
                    ))
            except Exception as e:
                # Fall back to DBC.ColumnsV so the view still appears
                # in the catalog (with NULL types) rather than being
                # silently dropped.
                msg = str(e).split("\n")[0].strip()
                print(f"Warning: HELP COLUMN failed for "
                      f"{schema_name}.{view_name}, falling back to "
                      f"DBC.ColumnsV: {msg}")
                view_rows = self._get_view_fallback(
                    schema_name, view_name, table_comment,
                    create_time, alter_time,
                )
            all_rows.extend(view_rows)

        return self._spark.createDataFrame(
            all_rows, self._view_column_schema()
        )

    def _get_view_fallback(
        self, schema_name, view_name, table_comment,
        create_time, alter_time,
    ) -> List[Tuple[Any, ...]]:
        """Fall back to DBC.ColumnsV for a view when HELP COLUMN fails.

        Returns rows with NULL types — the view still appears in the
        catalog rather than being silently dropped. If this query fails
        as well, a warning is printed and an empty list is returned.
        """
        rows = []
        try:
            safe_schema = schema_name.replace("'", "''")
            safe_view = view_name.replace("'", "''")
            fallback_query = f"""
                SELECT TRIM(ColumnName), Nullable
                FROM DBC.ColumnsV
                WHERE DatabaseName = '{safe_schema}'
                  AND TableName = '{safe_view}'
            """
            for col_name, nullable in self._execute_td(fallback_query):
                rows.append((
                    view_name,
                    col_name.strip() if col_name else "",
                    None,  # DATA_TYPE unknown
                    None,  # COLUMN_LENGTH
                    None,  # DECIMAL_TOTAL_DIGITS
                    None,  # DECIMAL_FRACTIONAL_DIGITS
                    "Y" if nullable == "Y" else "N",
                    table_comment,
                    None,  # COLUMN_COMMENT
                    None,  # DATA_DEFAULT
                    create_time,
                    alter_time,
                ))
        except Exception as e2:
            msg2 = str(e2).split("\n")[0].strip()
            print(f"Warning: DBC.ColumnsV fallback also failed for "
                  f"{schema_name}.{view_name}: {msg2}")
        return rows

    def close(self) -> None:
        """Close the teradatasql connection and stop the SparkSession.

        Safe to call more than once and on a partially initialised
        instance. Errors raised while closing are deliberately ignored
        (best-effort cleanup).
        """
        if getattr(self, "_td_conn", None) is not None:
            try:
                self._td_conn.close()
            except Exception:
                pass
            finally:
                self._td_conn = None
        if getattr(self, "_spark", None) is not None:
            try:
                self._spark.stop()
            except Exception:
                pass
            finally:
                self._spark = None

    def __enter__(self) -> "TeradataConnector":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.close()

    @staticmethod
    def _view_column_schema():
        """Schema of the rows built by _get_views and _get_view_fallback.

        NOTE(review): _get_tables selects no COLUMN_LENGTH / DECIMAL_*
        columns, so the table and view DataFrames differ in shape — confirm
        that downstream code expects this.
        """
        from pyspark.sql.types import (
            StructType, StructField, StringType,
            IntegerType, TimestampType,
        )
        return StructType([
            StructField("TABLE_NAME", StringType()),
            StructField("COLUMN_NAME", StringType()),
            StructField("DATA_TYPE", StringType()),
            StructField("COLUMN_LENGTH", IntegerType()),
            StructField("DECIMAL_TOTAL_DIGITS", IntegerType()),
            StructField("DECIMAL_FRACTIONAL_DIGITS", IntegerType()),
            StructField("IS_NULLABLE", StringType()),
            StructField("TABLE_COMMENT", StringType()),
            StructField("COLUMN_COMMENT", StringType()),
            StructField("DATA_DEFAULT", StringType()),
            StructField("TABLE_CREATE_TIME", TimestampType()),
            StructField("TABLE_LAST_ALTER_TIME", TimestampType()),
        ])
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Cloud Workflow for end-to-end Teradata metadata extraction and import. +# Submits a Dataproc Serverless PySpark job to extract metadata from +# Teradata, then triggers a Dataplex metadata import job. +# +# Deploy: +# gcloud workflows deploy teradata-metadata-import \ +# --location=us-central1 \ +# --source=teradata-connector-workflow.yaml +# +# Execute: +# gcloud workflows execute teradata-metadata-import \ +# --location=us-central1 \ +# --data='{ ... }' + +main: + params: [args] + steps: + - init: + assign: + - project: ${args.PROJECT_ID} + - region: ${args.CLOUD_REGION} + - entryGroupId: "teradata" + - teradataHost: ${args.TERADATA_HOST} + - teradataPort: ${args.TERADATA_PORT} + - teradataUser: ${args.TERADATA_USER} + - passwordSecret: ${args.PASSWORD_SECRET} + - outputBucket: ${args.OUTPUT_BUCKET} + - outputFolder: "teradata" + - serviceAccount: ${args.SERVICE_ACCOUNT} + - containerImage: ${args.CONTAINER_IMAGE} + - depsBucket: ${args.DEPS_BUCKET} + + - submit_pyspark_extract_job: + call: http.post + args: + url: ${"https://dataproc.googleapis.com/v1/projects/" + project + "/locations/" + region + "/batches"} + auth: + type: OAuth2 + body: + pysparkBatch: + mainPythonFileUri: "file:///main.py" + jars: + - "file:///opt/spark/jars/terajdbc4.jar" + args: + - ${"--target_project_id=" + project} + - ${"--target_location_id=" + region} + - ${"--target_entry_group_id=" + entryGroupId} + - ${"--host=" + teradataHost} + - ${"--port=" + teradataPort} + - ${"--user=" + teradataUser} + - ${"--password_secret=" + passwordSecret} + - ${"--output_bucket=" + outputBucket} + - ${"--output_folder=" + outputFolder} + runtimeConfig: + containerImage: ${containerImage} + environmentConfig: + executionConfig: + serviceAccount: ${serviceAccount} + result: batch_result + + - wait_for_batch: + call: googleapis.dataproc.v1.projects.locations.batches.get + args: + name: 
${batch_result.body.name} + result: batch_status + + - submit_import_job: + call: http.post + args: + url: ${"https://dataplex.googleapis.com/v1/projects/" + project + "/locations/" + region + "/metadataJobs"} + auth: + type: OAuth2 + body: + type: "IMPORT" + import_spec: + source_storage_uri: ${"gs://" + outputBucket + "/" + outputFolder + "/"} + entry_sync_mode: "FULL" + aspect_sync_mode: "INCREMENTAL" + scope: + entry_groups: + - ${"projects/" + project + "/locations/" + region + "/entryGroups/" + entryGroupId} + entry_types: + - ${"projects/" + project + "/locations/" + region + "/entryTypes/teradata-instance"} + - ${"projects/" + project + "/locations/" + region + "/entryTypes/teradata-database"} + - ${"projects/" + project + "/locations/" + region + "/entryTypes/teradata-schema"} + - ${"projects/" + project + "/locations/" + region + "/entryTypes/teradata-table"} + - ${"projects/" + project + "/locations/" + region + "/entryTypes/teradata-view"} + aspect_types: + - ${"projects/" + project + "/locations/" + region + "/aspectTypes/teradata-instance"} + - ${"projects/" + project + "/locations/" + region + "/aspectTypes/teradata-database"} + - ${"projects/" + project + "/locations/" + region + "/aspectTypes/teradata-schema"} + - ${"projects/" + project + "/locations/" + region + "/aspectTypes/teradata-table"} + - ${"projects/" + project + "/locations/" + region + "/aspectTypes/teradata-view"} + - "projects/dataplex-types/locations/global/aspectTypes/schema" + result: import_result + + - return_result: + return: ${import_result} diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/tests/__init__.py b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_argument_validator.py 
b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_argument_validator.py new file mode 100644 index 0000000..825bf7f --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_argument_validator.py @@ -0,0 +1,152 @@ +# Copyright 2026 Teradata +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for argument_validator with mocked GCP dependencies.""" + +import argparse +from unittest import mock +from unittest.mock import MagicMock + +import pytest + +from src.common.argument_validator import ( + validateArguments, + validateSecretID, + checkOptionProvided, + true_or_false, +) + + +def _make_args(**kwargs): + """Create a namespace with default valid args.""" + defaults = { + "local_output_only": True, + "output_bucket": None, + "output_folder": None, + "target_location_id": "us-central1", + "password_secret": None, + } + defaults.update(kwargs) + return argparse.Namespace(**defaults) + + +class TestValidateArguments: + """Tests for validateArguments().""" + + @mock.patch( + "src.common.argument_validator.get_password", + return_value="secret123", + ) + def test_password_secret_resolved(self, mock_pw): + args = _make_args( + password_secret="projects/proj/secrets/mysecret" + ) + result = validateArguments(args) + assert result.password == "secret123" + + def test_output_bucket_required_when_not_local(self): + args = _make_args( + local_output_only=False, + output_bucket=None, + 
output_folder=None, + ) + with pytest.raises(Exception, match="output_bucket"): + validateArguments(args) + + @mock.patch("src.common.argument_validator.checkDestination") + def test_invalid_bucket_rejected(self, mock_check): + mock_check.return_value = False + args = _make_args( + local_output_only=False, + output_bucket="bad-bucket", + output_folder="folder", + ) + with pytest.raises(Exception, match="not valid"): + validateArguments(args) + + def test_invalid_region_rejected(self): + args = _make_args(target_location_id="invalid-region") + with pytest.raises(Exception, match="target_location_id"): + validateArguments(args) + + @mock.patch("src.common.argument_validator.checkDestination") + def test_valid_region_accepted(self, mock_check): + mock_check.return_value = True + args = _make_args( + local_output_only=False, + output_bucket="good-bucket", + output_folder="folder", + target_location_id="us-east1", + ) + result = validateArguments(args) + assert result.target_location_id == "us-east1" + + @mock.patch("src.common.argument_validator.checkDestination") + def test_global_region_accepted(self, mock_check): + mock_check.return_value = True + args = _make_args( + local_output_only=False, + output_bucket="bucket", + output_folder="folder", + target_location_id="global", + ) + result = validateArguments(args) + assert result.target_location_id == "global" + + +class TestValidateSecretID: + """Tests for validateSecretID().""" + + def test_valid_secret_id(self): + assert validateSecretID( + "projects/my-project/secrets/my-secret" + ) is True + + def test_invalid_secret_id_missing_projects(self): + with pytest.raises(Exception, match="not a valid Secret ID"): + validateSecretID("secrets/my-secret") + + def test_invalid_secret_id_extra_slash(self): + with pytest.raises(Exception, match="not a valid Secret ID"): + validateSecretID( + "projects/proj/secrets/mysecret/versions/1" + ) + + +class TestCheckOptionProvided: + """Tests for checkOptionProvided().""" + + def 
test_option_present(self): + args = argparse.Namespace(foo="bar", baz=None) + assert checkOptionProvided(args, ["foo"]) is True + + def test_option_missing(self): + args = argparse.Namespace(foo=None) + assert checkOptionProvided(args, ["foo"]) is False + + def test_option_not_in_namespace(self): + args = argparse.Namespace(foo="bar") + assert checkOptionProvided(args, ["missing"]) is False + + +class TestTrueOrFalse: + """Tests for true_or_false().""" + + @pytest.mark.parametrize("val", ["true", "True", "TRUE", "T", "t"]) + def test_true_values(self, val): + assert true_or_false(val) is True + + @pytest.mark.parametrize("val", ["false", "False", "FALSE", "F", "f"]) + def test_false_values(self, val): + assert true_or_false(val) is False diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_auth_methods.py b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_auth_methods.py new file mode 100644 index 0000000..3ec143c --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_auth_methods.py @@ -0,0 +1,254 @@ +# Copyright 2026 Teradata +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for --logmech and --logdata CLI arguments.""" + +from unittest import mock + +import pytest + +from src.cmd_reader import read_args + + +# Base args with user + password (TD2 default scenario) +BASE_ARGS = [ + "main.py", + "--target_project_id", "test-project", + "--target_location_id", "us-central1", + "--target_entry_group_id", "teradata", + "--host", "10.25.56.44", + "--user", "testuser", + "--password", "testpass", + "--local_output_only", +] + +# Base args without user/password (for LDAP/JWT tests) +BASE_ARGS_NO_CREDS = [ + "main.py", + "--target_project_id", "test-project", + "--target_location_id", "us-central1", + "--target_entry_group_id", "teradata", + "--host", "10.25.56.44", + "--local_output_only", +] + + +class TestDefaultBehavior: + """Default (no --logmech) must behave like TD2.""" + + def test_default_requires_user_and_password(self): + """No --logmech still requires --user and a password method.""" + with mock.patch("sys.argv", BASE_ARGS): + config = read_args() + assert config["logmech"] is None + assert config["user"] == "testuser" + assert config["password"] == "testpass" + + def test_default_missing_user_raises(self): + args = [ + "main.py", + "--target_project_id", "test-project", + "--target_location_id", "us-central1", + "--target_entry_group_id", "teradata", + "--host", "10.25.56.44", + "--password", "testpass", + "--local_output_only", + ] + with mock.patch("sys.argv", args): + with pytest.raises(SystemExit): + read_args() + + def test_default_missing_password_raises(self): + args = [ + "main.py", + "--target_project_id", "test-project", + "--target_location_id", "us-central1", + "--target_entry_group_id", "teradata", + "--host", "10.25.56.44", + "--user", "testuser", + "--local_output_only", + ] + with mock.patch("sys.argv", args): + with pytest.raises(SystemExit): + read_args() + + +class TestLogmechValidation: + """Logmech value validation and normalization.""" + + @pytest.mark.parametrize("logmech", [ + "TD2", "LDAP", 
"JWT", + ]) + def test_valid_logmech_accepted(self, logmech): + args = BASE_ARGS + ["--logmech", logmech] + with mock.patch("sys.argv", args): + config = read_args() + assert config["logmech"] == logmech + + @pytest.mark.parametrize("logmech", [ + "BROWSER", "INVALID", "browser", "OAuth", "", "TDNEGO", "KRB5", + ]) + def test_invalid_logmech_rejected(self, logmech): + args = BASE_ARGS + ["--logmech", logmech] + with mock.patch("sys.argv", args): + with pytest.raises(SystemExit): + read_args() + + @pytest.mark.parametrize("input_val,expected", [ + ("ldap", "LDAP"), + ("td2", "TD2"), + ("jwt", "JWT"), + ("Ldap", "LDAP"), + ]) + def test_logmech_normalized_to_uppercase(self, input_val, expected): + args = BASE_ARGS + ["--logmech", input_val] + with mock.patch("sys.argv", args): + config = read_args() + assert config["logmech"] == expected + + +class TestLDAPAuth: + """LDAP — user/password optional.""" + + def test_ldap_without_credentials(self): + args = BASE_ARGS_NO_CREDS + ["--logmech", "LDAP"] + with mock.patch("sys.argv", args): + config = read_args() + assert config["logmech"] == "LDAP" + assert config["user"] == "" + assert config["password"] == "" + + def test_ldap_with_user_and_password(self): + args = BASE_ARGS + ["--logmech", "LDAP"] + with mock.patch("sys.argv", args): + config = read_args() + assert config["logmech"] == "LDAP" + assert config["user"] == "testuser" + assert config["password"] == "testpass" + + def test_ldap_with_logdata(self): + args = BASE_ARGS_NO_CREDS + [ + "--logmech", "LDAP", + "--logdata", "authcid=user realm=CORP", + ] + with mock.patch("sys.argv", args): + config = read_args() + assert config["logmech"] == "LDAP" + assert config["logdata"] == "authcid=user realm=CORP" + + def test_ldap_with_creds_and_logdata(self): + args = BASE_ARGS + [ + "--logmech", "LDAP", + "--logdata", "authcid=user realm=CORP", + ] + with mock.patch("sys.argv", args): + config = read_args() + assert config["logmech"] == "LDAP" + assert config["user"] == 
"testuser" + assert config["password"] == "testpass" + assert config["logdata"] == "authcid=user realm=CORP" + + +class TestJWTAuth: + """JWT — user/password optional, logdata optional.""" + + def test_jwt_without_credentials(self): + args = BASE_ARGS_NO_CREDS + ["--logmech", "JWT"] + with mock.patch("sys.argv", args): + config = read_args() + assert config["logmech"] == "JWT" + assert config["user"] == "" + assert config["password"] == "" + + def test_jwt_with_logdata(self): + args = BASE_ARGS_NO_CREDS + [ + "--logmech", "JWT", + "--logdata", "token=eyJhbGciOiJSUzI1NiJ9...", + ] + with mock.patch("sys.argv", args): + config = read_args() + assert config["logmech"] == "JWT" + assert config["logdata"] == "token=eyJhbGciOiJSUzI1NiJ9..." + + +class TestTD2Auth: + """TD2 — explicit logmech, user + password required.""" + + def test_td2_with_credentials(self): + args = BASE_ARGS + ["--logmech", "TD2"] + with mock.patch("sys.argv", args): + config = read_args() + assert config["logmech"] == "TD2" + assert config["user"] == "testuser" + + def test_td2_missing_user_raises(self): + args = BASE_ARGS_NO_CREDS + [ + "--logmech", "TD2", + "--password", "testpass", + ] + with mock.patch("sys.argv", args): + with pytest.raises(SystemExit): + read_args() + + def test_td2_missing_password_raises(self): + args = BASE_ARGS_NO_CREDS + [ + "--logmech", "TD2", + "--user", "testuser", + ] + with mock.patch("sys.argv", args): + with pytest.raises(SystemExit): + read_args() + + +class TestLogdata: + """--logdata argument handling.""" + + def test_logdata_without_logmech_accepted(self): + args = BASE_ARGS + ["--logdata", "somedata"] + with mock.patch("sys.argv", args): + config = read_args() + assert config["logdata"] == "somedata" + assert config["logmech"] is None + + def test_logdata_default_is_none(self): + with mock.patch("sys.argv", BASE_ARGS): + config = read_args() + assert config["logdata"] is None + + @mock.patch("src.cmd_reader.get_password", return_value="authcid=user") + 
@mock.patch("src.cmd_reader.validateSecretID") + def test_logdata_secret_resolves(self, mock_validate, mock_get): + args = BASE_ARGS + [ + "--logdata_secret", + "projects/my-proj/secrets/ldap-logdata", + ] + with mock.patch("sys.argv", args): + config = read_args() + assert config["logdata"] == "authcid=user" + mock_validate.assert_called_once_with( + "projects/my-proj/secrets/ldap-logdata" + ) + mock_get.assert_called_once_with( + "projects/my-proj/secrets/ldap-logdata" + ) + + def test_logdata_and_logdata_secret_mutual_exclusion(self): + args = BASE_ARGS + [ + "--logdata", "somedata", + "--logdata_secret", + "projects/my-proj/secrets/ldap-logdata", + ] + with mock.patch("sys.argv", args): + with pytest.raises(SystemExit): + read_args() diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_cmd_reader_charset.py b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_cmd_reader_charset.py new file mode 100644 index 0000000..5755a2c --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_cmd_reader_charset.py @@ -0,0 +1,173 @@ +# Copyright 2026 Teradata +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for the --charset CLI argument and JDBC URL construction.""" + +from unittest import mock + +import pytest + +from src.cmd_reader import read_args + + +# Required CLI args for cmd_reader.read_args() to succeed +BASE_ARGS = [ + "main.py", + "--target_project_id", "test-project", + "--target_location_id", "us-central1", + "--target_entry_group_id", "teradata", + "--host", "10.25.56.44", + "--user", "testuser", + "--password", "testpass", + "--local_output_only", +] + + +class TestCharsetArgument: + """Tests for --charset CLI argument parsing.""" + + def test_default_charset_is_utf8(self): + """When --charset is not specified, default is UTF8.""" + with mock.patch("sys.argv", BASE_ARGS): + config = read_args() + assert config["charset"] == "UTF8" + + def test_custom_charset_utf16(self): + """--charset UTF16 is parsed correctly.""" + args = BASE_ARGS + ["--charset", "UTF16"] + with mock.patch("sys.argv", args): + config = read_args() + assert config["charset"] == "UTF16" + + def test_custom_charset_ascii(self): + """--charset ASCII is parsed correctly.""" + args = BASE_ARGS + ["--charset", "ASCII"] + with mock.patch("sys.argv", args): + config = read_args() + assert config["charset"] == "ASCII" + + def test_custom_charset_kanjisjis(self): + """Japanese Shift-JIS charset is parsed correctly.""" + args = BASE_ARGS + ["--charset", "KANJISJIS_0S"] + with mock.patch("sys.argv", args): + config = read_args() + assert config["charset"] == "KANJISJIS_0S" + + def test_charset_included_in_config(self): + """charset key exists in the returned config dict.""" + with mock.patch("sys.argv", BASE_ARGS): + config = read_args() + assert "charset" in config + + def test_charset_lowercase_preserved(self): + """Lowercase charset value is passed through as-is.""" + args = BASE_ARGS + ["--charset", "utf8"] + with mock.patch("sys.argv", args): + config = read_args() + assert config["charset"] == "utf8" + + def test_charset_stripped_of_whitespace(self): + """Leading/trailing 
whitespace is stripped.""" + args = BASE_ARGS + ["--charset", " UTF16 "] + with mock.patch("sys.argv", args): + config = read_args() + assert config["charset"] == "UTF16" + + def test_charset_rejects_comma_injection(self): + """Comma in charset would inject JDBC URL params — must be rejected.""" + args = BASE_ARGS + ["--charset", "UTF8,LOGMECH=LDAP"] + with mock.patch("sys.argv", args): + with pytest.raises(SystemExit): + read_args() + + def test_charset_rejects_slash(self): + """Slash in charset would break JDBC URL parsing — must be rejected.""" + args = BASE_ARGS + ["--charset", "UTF8/EXTRA"] + with mock.patch("sys.argv", args): + with pytest.raises(SystemExit): + read_args() + + def test_charset_rejects_special_chars(self): + """Special characters in charset must be rejected.""" + args = BASE_ARGS + ["--charset", "UTF-8"] + with mock.patch("sys.argv", args): + with pytest.raises(SystemExit): + read_args() + + +class TestJdbcUrlConstruction: + """Tests for JDBC URL charset parameter. + + The JDBC URL is constructed in TeradataConnector.__init__ which + requires PySpark and a live connection. We test the URL construction + logic directly here. + """ + + def _build_jdbc_url(self, config): + """Mirror of the JDBC URL construction in teradata_connector.py. + + Keep in sync with TeradataConnector.__init__ (teradata_connector.py:56-59). 
+ """ + charset = config.get("charset", "UTF8") + return ( + f"jdbc:teradata://{config['host']}" + f"/DBS_PORT={config['port']},CHARSET={charset}" + ) + + def test_default_charset_in_url(self): + config = {"host": "10.25.56.44", "port": 1025} + url = self._build_jdbc_url(config) + assert url == "jdbc:teradata://10.25.56.44/DBS_PORT=1025,CHARSET=UTF8" + + def test_utf16_charset_in_url(self): + config = {"host": "10.25.56.44", "port": 1025, "charset": "UTF16"} + url = self._build_jdbc_url(config) + assert url == "jdbc:teradata://10.25.56.44/DBS_PORT=1025,CHARSET=UTF16" + + def test_ascii_charset_in_url(self): + config = {"host": "10.25.56.44", "port": 1025, "charset": "ASCII"} + url = self._build_jdbc_url(config) + assert url == "jdbc:teradata://10.25.56.44/DBS_PORT=1025,CHARSET=ASCII" + + def test_custom_charset_in_url(self): + config = {"host": "td-server", "port": 1025, "charset": "KANJISJIS_0S"} + url = self._build_jdbc_url(config) + assert "CHARSET=KANJISJIS_0S" in url + + def test_missing_charset_defaults_to_utf8(self): + """Config without charset key should default to UTF8.""" + config = {"host": "10.25.56.44", "port": 1025} + url = self._build_jdbc_url(config) + assert "CHARSET=UTF8" in url + + def test_custom_port_with_charset(self): + config = {"host": "10.25.56.44", "port": 2025, "charset": "UTF16"} + url = self._build_jdbc_url(config) + assert url == "jdbc:teradata://10.25.56.44/DBS_PORT=2025,CHARSET=UTF16" + + def test_hostname_with_charset(self): + config = {"host": "td-server.example.com", "port": 1025, "charset": "UTF8"} + url = self._build_jdbc_url(config) + assert url == "jdbc:teradata://td-server.example.com/DBS_PORT=1025,CHARSET=UTF8" + + def test_lowercase_charset_in_url(self): + """Lowercase charset flows through to JDBC URL as-is.""" + config = {"host": "10.25.56.44", "port": 1025, "charset": "utf8"} + url = self._build_jdbc_url(config) + assert url == "jdbc:teradata://10.25.56.44/DBS_PORT=1025,CHARSET=utf8" + + def 
test_mixed_case_charset_in_url(self): + config = {"host": "10.25.56.44", "port": 1025, "charset": "Utf16"} + url = self._build_jdbc_url(config) + assert url == "jdbc:teradata://10.25.56.44/DBS_PORT=1025,CHARSET=Utf16" diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_constants.py b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_constants.py new file mode 100644 index 0000000..9922e7e --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_constants.py @@ -0,0 +1,80 @@ +# Copyright 2026 Teradata +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for Teradata constants.""" + +from src.constants import ( + SOURCE_TYPE, + JDBC_JAR, + CONNECTOR_MODULE, + CONNECTOR_CLASS, + IS_NULLABLE_TRUE, + EntryType, + TOP_ENTRY_HIERARCHY, + COLLECTION_ENTRY, + DB_OBJECT_TYPES_TO_PROCESS, + generateFileName, +) + + +def test_source_type(): + assert SOURCE_TYPE == "teradata" + + +def test_jdbc_jar(): + assert JDBC_JAR == "terajdbc4.jar" + + +def test_connector_module(): + assert CONNECTOR_MODULE == "src.teradata_connector" + assert CONNECTOR_CLASS == "TeradataConnector" + + +def test_nullable_value(): + assert IS_NULLABLE_TRUE == "Y" + + +def test_entry_type_hierarchy(): + """Teradata uses 5-level hierarchy matching Oracle.""" + assert hasattr(EntryType, "INSTANCE") + assert hasattr(EntryType, "DATABASE") + assert hasattr(EntryType, "DB_SCHEMA") + assert hasattr(EntryType, "TABLE") + assert hasattr(EntryType, "VIEW") + + +def test_entry_type_values_contain_teradata(): + for et in EntryType: + assert "teradata-" in et.value + + +def test_top_entry_hierarchy(): + assert TOP_ENTRY_HIERARCHY == [ + EntryType.INSTANCE, EntryType.DATABASE + ] + + +def test_collection_entry(): + assert COLLECTION_ENTRY == EntryType.DB_SCHEMA + + +def test_db_object_types(): + assert DB_OBJECT_TYPES_TO_PROCESS == [ + EntryType.TABLE, EntryType.VIEW + ] + + +def test_generate_filename(): + config = {"host": "server.example.com"} + assert generateFileName(config) == "teradata-server.example.com.jsonl" diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_datatype_mapper.py b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_datatype_mapper.py new file mode 100644 index 0000000..ce7d409 --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_datatype_mapper.py @@ -0,0 +1,218 @@ +# Copyright 2026 Teradata +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Teradata datatype mapper.""" + +import pytest +from src.datatype_mapper import get_catalog_metadata_type +from src.datatype_mapper import get_readable_type_name + + +class TestNumericShortCodes: + @pytest.mark.parametrize("td_type", [ + "I", "I1", "I2", "I8", "D", "F", "N", + ]) + def test_numeric_short_codes(self, td_type): + assert get_catalog_metadata_type(td_type) == "NUMBER" + + +class TestNumericTypes: + @pytest.mark.parametrize("td_type", [ + "INTEGER", "SMALLINT", "BYTEINT", "BIGINT", + "FLOAT", "REAL", "DOUBLE", "DOUBLE PRECISION", + "DECIMAL", "NUMERIC", "NUMBER", + ]) + def test_numeric_types(self, td_type): + assert get_catalog_metadata_type(td_type) == "NUMBER" + + @pytest.mark.parametrize("td_type", [ + "DECIMAL(18,2)", "DECIMAL(10,0)", "NUMERIC(10)", + "NUMBER(10)", "FLOAT(53)", "DOUBLE PRECISION", + ]) + def test_numeric_types_with_size(self, td_type): + assert get_catalog_metadata_type(td_type) == "NUMBER" + + def test_numeric_with_whitespace(self): + assert get_catalog_metadata_type(" INTEGER ") == "NUMBER" + + def test_numeric_case_insensitive(self): + assert get_catalog_metadata_type("integer") == "NUMBER" + assert get_catalog_metadata_type("Float") == "NUMBER" + + def test_byteint_not_confused_with_byte(self): + assert get_catalog_metadata_type("BYTEINT") == "NUMBER" + + +class TestStringShortCodes: + @pytest.mark.parametrize("td_type", [ + "CV", "CF", "CO", "LV", "JN", "XM", "GF", "GV", "GL", + ]) + def test_string_short_codes(self, td_type): + assert get_catalog_metadata_type(td_type) == 
"STRING" + + +class TestStringTypes: + @pytest.mark.parametrize("td_type", [ + "VARCHAR", "VARCHAR(100)", "CHAR", "CHAR(50)", + "CLOB", "LONG VARCHAR", "LONG VARGRAPHIC", + "JSON", "XML", + "GRAPHIC", "GRAPHIC(100)", "VARGRAPHIC", "VARGRAPHIC(200)", + ]) + def test_string_types(self, td_type): + assert get_catalog_metadata_type(td_type) == "STRING" + + @pytest.mark.parametrize("td_type", [ + "CLOB(1000000)", "LONG VARCHAR(32000)", + "JSON(32000)", "XML(2000)", + ]) + def test_string_types_with_size(self, td_type): + assert get_catalog_metadata_type(td_type) == "STRING" + + +class TestBytesShortCodes: + @pytest.mark.parametrize("td_type", ["BV", "BF", "BO"]) + def test_bytes_short_codes(self, td_type): + assert get_catalog_metadata_type(td_type) == "BYTES" + + +class TestBytesTypes: + @pytest.mark.parametrize("td_type", [ + "BYTE", "VARBYTE", "BLOB", "LONG VARBYTE", + ]) + def test_bytes_types(self, td_type): + assert get_catalog_metadata_type(td_type) == "BYTES" + + @pytest.mark.parametrize("td_type", [ + "BYTE(10)", "VARBYTE(100)", "BLOB(1000000)", + "LONG VARBYTE(32000)", + ]) + def test_bytes_types_with_size(self, td_type): + assert get_catalog_metadata_type(td_type) == "BYTES" + + +class TestTimestampShortCodes: + @pytest.mark.parametrize("td_type", ["TS", "SZ"]) + def test_timestamp_short_codes(self, td_type): + assert get_catalog_metadata_type(td_type) == "TIMESTAMP" + + +class TestTimestampTypes: + @pytest.mark.parametrize("td_type", [ + "TIMESTAMP", "TIMESTAMP WITH TIME ZONE", + "TIMESTAMP(6)", "TIMESTAMP(0) WITH TIME ZONE", + ]) + def test_timestamp_types(self, td_type): + assert get_catalog_metadata_type(td_type) == "TIMESTAMP" + + +class TestDateTimeShortCodes: + @pytest.mark.parametrize("td_type", ["DA", "AT", "TZ"]) + def test_datetime_short_codes(self, td_type): + assert get_catalog_metadata_type(td_type) == "DATETIME" + + +class TestDateTimeTypes: + def test_date(self): + assert get_catalog_metadata_type("DATE") == "DATETIME" + + 
@pytest.mark.parametrize("td_type", [ + "TIME", "TIME WITH TIME ZONE", "TIME(6)", + ]) + def test_time_types(self, td_type): + assert get_catalog_metadata_type(td_type) == "DATETIME" + + +class TestBooleanType: + def test_boolean(self): + assert get_catalog_metadata_type("BOOLEAN") == "BOOLEAN" + + +class TestOtherShortCodes: + @pytest.mark.parametrize("td_type", [ + # Interval short codes + "YR", "YM", "MO", "DY", "DH", "DM", "DS", + "HR", "HM", "HS", "MI", "MS", "SC", + # Period short codes + "PD", "PT", "PS", "PM", "PZ", + # UDT, Dataset, Array short codes + "UT", "DT", "A1", "AN", + ]) + def test_other_short_codes(self, td_type): + assert get_catalog_metadata_type(td_type) == "OTHER" + + +class TestOtherTypes: + @pytest.mark.parametrize("td_type", [ + "INTERVAL YEAR", "PERIOD(DATE)", "UDT", + "ST_GEOMETRY", "ARRAY", "DATASET", + "MBR", "MBB", "UNKNOWN_TYPE", + ]) + def test_other_types(self, td_type): + assert get_catalog_metadata_type(td_type) == "OTHER" + + +class TestNullAndEmpty: + def test_none(self): + assert get_catalog_metadata_type(None) == "OTHER" + + def test_empty_string(self): + assert get_catalog_metadata_type("") == "OTHER" + + +# --- Tests for get_readable_type_name --- + +class TestReadableTypeName: + @pytest.mark.parametrize("short_code,expected", [ + ("I", "INTEGER"), + ("I1", "BYTEINT"), + ("I2", "SMALLINT"), + ("I8", "BIGINT"), + ("D", "DECIMAL"), + ("F", "FLOAT"), + ("N", "NUMBER"), + ("CV", "VARCHAR"), + ("CF", "CHAR"), + ("CO", "CLOB"), + ("DA", "DATE"), + ("TS", "TIMESTAMP"), + ("BV", "VARBYTE"), + ("JN", "JSON"), + ("XM", "XML"), + ("GF", "GRAPHIC"), + ("GV", "VARGRAPHIC"), + ("YR", "INTERVAL YEAR"), + ("PD", "PERIOD(DATE)"), + ("UT", "UDT"), + ("DT", "DATASET"), + ("A1", "ARRAY"), + ]) + def test_short_code_to_readable(self, short_code, expected): + assert get_readable_type_name(short_code) == expected + + def test_full_name_passthrough(self): + assert get_readable_type_name("INTEGER") == "INTEGER" + assert 
get_readable_type_name("VARCHAR") == "VARCHAR" + + def test_unknown_passthrough(self): + assert get_readable_type_name("SOMECUSTOMTYPE") == "SOMECUSTOMTYPE" + + def test_none_returns_unknown(self): + assert get_readable_type_name(None) == "UNKNOWN" + + def test_whitespace_handling(self): + assert get_readable_type_name(" I ") == "INTEGER" + + def test_case_insensitive(self): + assert get_readable_type_name("cv") == "VARCHAR" + assert get_readable_type_name("Da") == "DATE" diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_gcs_uploader.py b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_gcs_uploader.py new file mode 100644 index 0000000..8434582 --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_gcs_uploader.py @@ -0,0 +1,74 @@ +# Copyright 2026 Teradata +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for GCS uploader with mocked storage client.""" + +from unittest import mock +from unittest.mock import MagicMock + +import pytest + +from src.common.gcs_uploader import upload, checkDestination + + +class TestUpload: + """Tests for upload().""" + + @mock.patch("src.common.gcs_uploader.storage") + def test_upload_calls_gcs(self, mock_storage): + mock_client = MagicMock() + mock_storage.Client.return_value = mock_client + mock_bucket = MagicMock() + mock_client.get_bucket.return_value = mock_bucket + mock_blob = MagicMock() + mock_bucket.blob.return_value = mock_blob + + config = {"output_bucket": "my-bucket"} + upload(config, "/tmp/output", "metadata.jsonl", "run1") + + mock_client.get_bucket.assert_called_once_with("my-bucket") + mock_bucket.blob.assert_called_once_with("run1/metadata.jsonl") + mock_blob.upload_from_filename.assert_called_once_with( + "/tmp/output/metadata.jsonl" + ) + + +class TestCheckDestination: + """Tests for checkDestination().""" + + @mock.patch("src.common.gcs_uploader.storage") + def test_valid_bucket(self, mock_storage): + mock_client = MagicMock() + mock_storage.Client.return_value = mock_client + mock_bucket = MagicMock() + mock_bucket.exists.return_value = True + mock_client.bucket.return_value = mock_bucket + + assert checkDestination("my-bucket") is True + + @mock.patch("src.common.gcs_uploader.storage") + def test_bucket_not_exists(self, mock_storage): + mock_client = MagicMock() + mock_storage.Client.return_value = mock_client + mock_bucket = MagicMock() + mock_bucket.exists.return_value = False + mock_client.bucket.return_value = mock_bucket + + with pytest.raises(Exception, match="does not exist"): + checkDestination("bad-bucket") + + @mock.patch("src.common.gcs_uploader.storage") + def test_gs_prefix_rejected(self, mock_storage): + with pytest.raises(Exception, match="without gs://"): + checkDestination("gs://my-bucket") diff --git 
a/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_name_builder.py b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_name_builder.py new file mode 100644 index 0000000..53bc67e --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_name_builder.py @@ -0,0 +1,268 @@ +# Copyright 2026 Teradata +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Teradata name builder.""" + +import pytest +from src.constants import EntryType +from src.name_builder import ( + create_fqn, + create_name, + create_parent_name, + create_entry_aspect_name, + _sanitize_entry_id, +) + +CONFIG = { + "target_project_id": "my-project", + "target_location_id": "us-central1", + "target_entry_group_id": "teradata", + "host": "td-server.example.com", +} + +PREFIX = ( + "projects/my-project/locations/us-central1" + "/entryGroups/teradata/entries" +) + + +class TestCreateFqn: + def test_instance(self): + assert create_fqn(CONFIG, EntryType.INSTANCE) == ( + "custom:`td-server-example-com`" + ) + + def test_database(self): + assert create_fqn(CONFIG, EntryType.DATABASE) == ( + "custom:`td-server-example-com`.td-server-example-com" + ) + + def test_db_schema(self): + assert create_fqn( + CONFIG, EntryType.DB_SCHEMA, schema_name="retail" + ) == ( + "custom:`td-server-example-com`" + ".td-server-example-com.retail" + ) + + def test_table(self): + assert create_fqn( + CONFIG, 
EntryType.TABLE, + schema_name="retail", table_name="orders" + ) == ( + "custom:`td-server-example-com`" + ".td-server-example-com.retail.orders" + ) + + def test_view(self): + assert create_fqn( + CONFIG, EntryType.VIEW, + schema_name="retail", table_name="v_orders" + ) == ( + "custom:`td-server-example-com`" + ".td-server-example-com.retail.v_orders" + ) + + +class TestCreateName: + def test_instance(self): + assert create_name(CONFIG, EntryType.INSTANCE) == ( + f"{PREFIX}/td-server.example.com" + ) + + def test_database(self): + assert create_name(CONFIG, EntryType.DATABASE) == ( + f"{PREFIX}/td-server.example.com" + "/databases/td-server.example.com" + ) + + def test_db_schema(self): + assert create_name( + CONFIG, EntryType.DB_SCHEMA, schema_name="retail" + ) == ( + f"{PREFIX}/td-server.example.com" + "/databases/td-server.example.com" + "/database_schemas/retail" + ) + + def test_table(self): + assert create_name( + CONFIG, EntryType.TABLE, + schema_name="retail", table_name="orders" + ) == ( + f"{PREFIX}/td-server.example.com" + "/databases/td-server.example.com" + "/database_schemas/retail/tables/orders" + ) + + def test_view(self): + assert create_name( + CONFIG, EntryType.VIEW, + schema_name="retail", table_name="v_orders" + ) == ( + f"{PREFIX}/td-server.example.com" + "/databases/td-server.example.com" + "/database_schemas/retail/views/v_orders" + ) + + def test_host_colon_replaced(self): + config = {**CONFIG, "host": "server:1025"} + name = create_name(config, EntryType.INSTANCE) + assert "server@1025" in name + assert ":" not in name.split("/entries/")[1] + + def test_host_colon_sanitized_in_database(self): + """DATABASE segment must not contain ':' from host:port.""" + config = {**CONFIG, "host": "server:1025"} + db_name = create_name(config, EntryType.DATABASE) + db_segment = db_name.split("/databases/")[1] + assert ":" not in db_segment + assert "server_1025" in db_segment + + def test_chinese_table_name_sanitized(self): + """Non-ASCII table names 
must be converted to _u_ format.""" + name = create_name( + CONFIG, EntryType.TABLE, + schema_name="retail", table_name="测试中文" + ) + assert "测试中文" not in name + assert "_u6D4B_" in name # 测 = U+6D4B + + def test_chinese_schema_name_sanitized(self): + """Non-ASCII schema names must be converted to _u_ format.""" + name = create_name( + CONFIG, EntryType.DB_SCHEMA, + schema_name="数据库" + ) + assert "数据库" not in name + assert "_u" in name + + def test_space_in_table_name_replaced(self): + """Spaces replaced with underscores.""" + name = create_name( + CONFIG, EntryType.TABLE, + schema_name="retail", table_name="my table" + ) + assert "my_table" in name + + def test_normal_table_name_unchanged(self): + """Normal ASCII names should pass through unchanged.""" + name = create_name( + CONFIG, EntryType.TABLE, + schema_name="retail", table_name="orders_2024" + ) + assert name.endswith("/tables/orders_2024") + + def test_special_chars_in_table_name(self): + """Curly braces and other special chars replaced with underscores.""" + name = create_name( + CONFIG, EntryType.TABLE, + schema_name="retail", table_name="test{table}" + ) + assert "{" not in name + assert "}" not in name + + +class TestSanitizeEntryId: + """Tests for the _sanitize_entry_id helper.""" + + def test_ascii_unchanged(self): + assert _sanitize_entry_id("orders") == "orders" + + def test_dots_preserved(self): + assert _sanitize_entry_id("my.table") == "my.table" + + def test_hyphens_preserved(self): + assert _sanitize_entry_id("my-table") == "my-table" + + def test_underscores_preserved(self): + assert _sanitize_entry_id("my_table") == "my_table" + + def test_chinese_to_unicode_format(self): + """Chinese chars become _u_ code points.""" + result = _sanitize_entry_id("测试") + assert result == "_u6D4B__u8BD5_" + assert "测" not in result + + def test_space_replaced(self): + assert _sanitize_entry_id("my table") == "my_table" + + def test_curly_braces_replaced(self): + result = _sanitize_entry_id("test{1}") + assert 
"{" not in result + assert "}" not in result + + def test_hash_replaced(self): + result = _sanitize_entry_id("test#1") + assert "#" not in result + + def test_allowed_special_chars_preserved(self): + """Chars in Dataplex allowed set should pass through.""" + assert _sanitize_entry_id("a-b.c_d") == "a-b.c_d" + assert _sanitize_entry_id("a~b!c") == "a~b!c" + assert _sanitize_entry_id("a+b=c") == "a+b=c" + assert _sanitize_entry_id("%") == "%" + assert _sanitize_entry_id("a%b-c") == "a%b-c" + + def test_emoji_supplementary_plane(self): + """Supplementary plane chars (>U+FFFF) use 5+ hex digits.""" + result = _sanitize_entry_id("test_🎉") + assert result == "test__u1F389_" + assert "🎉" not in result + + +class TestCreateParentName: + def test_instance_has_no_parent(self): + assert create_parent_name(CONFIG, EntryType.INSTANCE) == "" + + def test_database_parent_is_instance(self): + parent = create_parent_name(CONFIG, EntryType.DATABASE) + assert parent == create_name(CONFIG, EntryType.INSTANCE) + + def test_schema_parent_is_database(self): + parent = create_parent_name(CONFIG, EntryType.DB_SCHEMA) + assert parent == create_name(CONFIG, EntryType.DATABASE) + + def test_table_parent_is_schema(self): + parent = create_parent_name( + CONFIG, EntryType.TABLE, parent_name="retail" + ) + assert parent == create_name( + CONFIG, EntryType.DB_SCHEMA, schema_name="retail" + ) + + def test_view_parent_is_schema(self): + parent = create_parent_name( + CONFIG, EntryType.VIEW, parent_name="retail" + ) + assert parent == create_name( + CONFIG, EntryType.DB_SCHEMA, schema_name="retail" + ) + + +class TestCreateEntryAspectName: + def test_instance_aspect(self): + assert create_entry_aspect_name(CONFIG, EntryType.INSTANCE) == ( + "my-project.us-central1.teradata-instance" + ) + + def test_table_aspect(self): + assert create_entry_aspect_name(CONFIG, EntryType.TABLE) == ( + "my-project.us-central1.teradata-table" + ) + + def test_schema_aspect(self): + assert 
create_entry_aspect_name(CONFIG, EntryType.DB_SCHEMA) == ( + "my-project.us-central1.teradata-schema" + ) diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_password_resolution.py b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_password_resolution.py new file mode 100644 index 0000000..28e9899 --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_password_resolution.py @@ -0,0 +1,231 @@ +# Copyright 2026 Teradata +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for password resolution priority in cmd_reader.""" + +import os +from unittest import mock + +import pytest + +from src.cmd_reader import read_args + + +# Required CLI args (no password method included) +BASE_ARGS = [ + "main.py", + "--target_project_id", "test-project", + "--target_location_id", "us-central1", + "--target_entry_group_id", "teradata", + "--host", "10.25.56.44", + "--user", "testuser", + "--local_output_only", +] + + +class TestPasswordFile: + """Tests for --password_file resolution.""" + + def test_password_file_reads_content(self, tmp_path): + pw_file = tmp_path / "pw.txt" + pw_file.write_text("file_secret") + args = BASE_ARGS + ["--password_file", str(pw_file)] + with mock.patch("sys.argv", args): + config = read_args() + assert config["password"] == "file_secret" + + def test_password_file_takes_precedence_over_password( + self, tmp_path, capsys + ): + pw_file = tmp_path / "pw.txt" + pw_file.write_text("file_secret") + args = BASE_ARGS + [ + "--password_file", str(pw_file), + "--password", "cli_secret", + ] + with mock.patch("sys.argv", args): + config = read_args() + assert config["password"] == "file_secret" + stderr = capsys.readouterr().err + assert "WARNING" not in stderr + + def test_password_file_strips_whitespace(self, tmp_path): + pw_file = tmp_path / "pw.txt" + pw_file.write_text(" file_secret\n") + args = BASE_ARGS + ["--password_file", str(pw_file)] + with mock.patch("sys.argv", args): + config = read_args() + assert config["password"] == "file_secret" + + def test_password_file_empty(self, tmp_path): + pw_file = tmp_path / "pw.txt" + pw_file.write_text(" \n") + args = BASE_ARGS + ["--password_file", str(pw_file)] + with mock.patch("sys.argv", args): + with pytest.raises( + SystemExit, match="password file is empty" + ): + read_args() + + def test_password_file_not_found(self): + args = BASE_ARGS + [ + "--password_file", "/nonexistent/path/pw.txt", + ] + with mock.patch("sys.argv", args): + with pytest.raises(SystemExit, 
match="password file not found"): + read_args() + + def test_password_file_unreadable(self, tmp_path): + pw_file = tmp_path / "pw.txt" + args = BASE_ARGS + ["--password_file", str(pw_file)] + with mock.patch("sys.argv", args), \ + mock.patch( + "builtins.open", + side_effect=PermissionError("Permission denied"), + ): + with pytest.raises( + SystemExit, match="unable to read password file" + ): + read_args() + + def test_password_file_invalid_utf8(self, tmp_path): + pw_file = tmp_path / "pw.bin" + pw_file.write_bytes(b"\xff\xfe invalid utf-8") + args = BASE_ARGS + ["--password_file", str(pw_file)] + with mock.patch("sys.argv", args): + with pytest.raises( + SystemExit, match="invalid UTF-8" + ): + read_args() + + +class TestEnvironmentVariable: + """Tests for TERADATA_PASSWORD env var resolution.""" + + def test_env_var_used_when_no_other_method(self): + with mock.patch("sys.argv", BASE_ARGS), \ + mock.patch.dict( + os.environ, {"TERADATA_PASSWORD": "env_secret"} + ): + config = read_args() + assert config["password"] == "env_secret" + + @pytest.mark.parametrize("env_password", ["", " "]) + def test_env_var_empty_or_whitespace_fails(self, env_password): + env = os.environ.copy() + env["TERADATA_PASSWORD"] = env_password + with mock.patch("sys.argv", BASE_ARGS), \ + mock.patch.dict(os.environ, env, clear=True): + with pytest.raises( + SystemExit, match="TERADATA_PASSWORD is empty" + ): + read_args() + + def test_env_var_ignored_when_password_file_set(self, tmp_path): + pw_file = tmp_path / "pw.txt" + pw_file.write_text("file_secret") + args = BASE_ARGS + ["--password_file", str(pw_file)] + with mock.patch("sys.argv", args), \ + mock.patch.dict( + os.environ, {"TERADATA_PASSWORD": "env_secret"} + ): + config = read_args() + assert config["password"] == "file_secret" + + +class TestCliPassword: + """Tests for --password CLI argument.""" + + @pytest.mark.parametrize("empty_pw", ["", " "]) + def test_cli_password_empty_or_whitespace_fails(self, empty_pw): + args = 
BASE_ARGS + ["--password", empty_pw] + env = os.environ.copy() + env.pop("TERADATA_PASSWORD", None) + with mock.patch("sys.argv", args), \ + mock.patch.dict(os.environ, env, clear=True): + with pytest.raises( + SystemExit, match="--password value is empty" + ): + read_args() + + def test_cli_password_prints_warning(self, capsys): + args = BASE_ARGS + ["--password", "cli_secret"] + env = os.environ.copy() + env.pop("TERADATA_PASSWORD", None) + with mock.patch("sys.argv", args), \ + mock.patch.dict(os.environ, env, clear=True): + config = read_args() + assert config["password"] == "cli_secret" + stderr = capsys.readouterr().err + assert "WARNING" in stderr + assert "--password_secret" in stderr + + def test_cli_password_lower_priority_than_env(self, capsys): + args = BASE_ARGS + ["--password", "cli_secret"] + with mock.patch("sys.argv", args), \ + mock.patch.dict( + os.environ, {"TERADATA_PASSWORD": "env_secret"} + ): + config = read_args() + # Env var (priority #3) beats --password (priority #4) + assert config["password"] == "env_secret" + # No warning because env var was used, not --password + stderr = capsys.readouterr().err + assert "WARNING" not in stderr + + +class TestPasswordSecretPrecedence: + """--password_secret takes highest precedence.""" + + def test_secret_overrides_all_others(self, tmp_path): + pw_file = tmp_path / "pw.txt" + pw_file.write_text("file_secret") + args = BASE_ARGS + [ + "--password_secret", + "projects/proj/secrets/mysecret", + "--password_file", str(pw_file), + "--password", "cli_secret", + ] + with mock.patch("sys.argv", args), \ + mock.patch.dict( + os.environ, {"TERADATA_PASSWORD": "env_secret"} + ), \ + mock.patch( + "src.common.argument_validator.get_password", + return_value="gcp_secret", + ), \ + mock.patch( + "src.common.argument_validator.checkDestination", + return_value=True, + ): + config = read_args() + assert config["password"] == "gcp_secret" + + +class TestNoPasswordProvided: + """When no password method is given, clear 
error is raised.""" + + def test_no_password_lists_all_options(self): + env = os.environ.copy() + env.pop("TERADATA_PASSWORD", None) + with mock.patch("sys.argv", BASE_ARGS), \ + mock.patch.dict(os.environ, env, clear=True): + with pytest.raises(SystemExit) as exc_info: + read_args() + + message = str(exc_info.value) + assert "no password provided" in message + assert "--password_secret" in message + assert "--password_file" in message + assert "TERADATA_PASSWORD" in message diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_query_band.py b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_query_band.py new file mode 100644 index 0000000..b488945 --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_query_band.py @@ -0,0 +1,188 @@ +# Copyright 2026 Teradata +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for query band validation and normalization.""" + +from unittest import mock + +import pytest + +from src.common.argument_validator import ( + validateQueryBand, + DEFAULT_QUERY_BAND, + DEFAULT_QUERY_BAND_ORG, + DEFAULT_QUERY_BAND_APPNAME, +) +from src.cmd_reader import read_args + +# Required CLI args for read_args() to succeed +BASE_ARGS = [ + "main.py", + "--target_project_id", "test-project", + "--target_location_id", "us-central1", + "--target_entry_group_id", "teradata", + "--host", "10.25.56.44", + "--user", "testuser", + "--password", "testpass", + "--local_output_only", +] + + +class TestValidateQueryBandDefaults: + """None, empty, or whitespace input returns the default query band.""" + + def test_none_returns_default(self): + assert validateQueryBand(None) == DEFAULT_QUERY_BAND + + def test_empty_string_returns_default(self): + assert validateQueryBand("") == DEFAULT_QUERY_BAND + + def test_whitespace_only_returns_default(self): + assert validateQueryBand(" ") == DEFAULT_QUERY_BAND + + def test_default_contains_org_and_appname(self): + assert DEFAULT_QUERY_BAND == ( + f"org={DEFAULT_QUERY_BAND_ORG};" + f"appname={DEFAULT_QUERY_BAND_APPNAME};" + ) + + +class TestValidateQueryBandValidInput: + """Valid query band strings are normalized correctly.""" + + def test_custom_key_gets_defaults_prepended(self): + result = validateQueryBand("team=analytics;") + assert result.startswith(f"org={DEFAULT_QUERY_BAND_ORG};") + assert f"appname={DEFAULT_QUERY_BAND_APPNAME};" in result + assert "team=analytics;" in result + + def test_custom_org_preserved(self): + result = validateQueryBand("org=myorg;") + assert result.startswith("org=myorg;") + + def test_custom_appname_gets_default_appended(self): + result = validateQueryBand("appname=myapp;") + assert f"appname=myapp_{DEFAULT_QUERY_BAND_APPNAME};" in result + + def test_exact_default_appname_not_duplicated(self): + result = validateQueryBand(f"appname={DEFAULT_QUERY_BAND_APPNAME};") + assert 
result.count(DEFAULT_QUERY_BAND_APPNAME) == 1 + + def test_trailing_semicolon_added(self): + result = validateQueryBand("team=analytics") + assert result.endswith(";") + assert "team=analytics;" in result + + def test_extra_keys_preserved_after_org_appname(self): + result = validateQueryBand("org=myorg;appname=myapp;env=prod;") + parts = result.split(";") + # org first, appname second, env after + assert parts[0].startswith("org=") + assert parts[1].startswith("appname=") + assert "env=prod" in parts[2] + + def test_ordering_org_appname_first(self): + """Even if user provides appname before org, output is org → appname → rest.""" + result = validateQueryBand("appname=myapp;org=myorg;team=z;") + parts = result.split(";") + assert parts[0] == "org=myorg" + assert parts[1].startswith("appname=myapp") + assert parts[2] == "team=z" + + def test_whitespace_in_value_preserved(self): + result = validateQueryBand("team=data analytics;") + assert "team=data analytics;" in result + + def test_dots_in_value_allowed(self): + result = validateQueryBand("version=1.2.3;") + assert "version=1.2.3;" in result + + def test_uppercase_org_recognized(self): + """ORG=myorg should be normalized to org=myorg, not create a duplicate.""" + result = validateQueryBand("ORG=myorg;") + assert result.count("org=") == 1 + assert "org=myorg;" in result + + def test_mixed_case_appname_recognized(self): + """APPNAME=myapp should be normalized to appname key.""" + result = validateQueryBand("APPNAME=myapp;") + assert result.count("appname=") == 1 + assert f"appname=myapp_{DEFAULT_QUERY_BAND_APPNAME};" in result + + def test_uppercase_keys_no_duplicates(self): + """ORG and APPNAME should not produce duplicate org/appname entries.""" + result = validateQueryBand("ORG=myorg;APPNAME=myapp;team=z;") + assert result.count("org=") == 1 + assert result.count("appname=") == 1 + assert "team=z;" in result + + +class TestValidateQueryBandRejection: + """Invalid query band strings are rejected with SystemExit.""" 
+ + def test_single_quote_rejected(self): + with pytest.raises(SystemExit): + validateQueryBand("team=ana'lytics;") + + def test_double_quote_rejected(self): + with pytest.raises(SystemExit): + validateQueryBand('team="analytics";') + + def test_parentheses_rejected(self): + with pytest.raises(SystemExit): + validateQueryBand("team=(analytics);") + + def test_reserved_name_proxyuser(self): + with pytest.raises(SystemExit, match="reserved name"): + validateQueryBand("proxyuser=admin;") + + def test_reserved_name_proxyrole_case_insensitive(self): + with pytest.raises(SystemExit, match="reserved name"): + validateQueryBand("PROXYROLE=admin;") + + def test_malformed_segment_no_equals(self): + with pytest.raises(SystemExit, match="malformed segment"): + validateQueryBand("noequals;") + + def test_empty_key_rejected(self): + """Segment like '=value;' has an empty key and must be rejected.""" + with pytest.raises(SystemExit, match="empty key"): + validateQueryBand("=value;") + + def test_exceeds_max_length(self): + long_value = "a" * 2049 + with pytest.raises(SystemExit, match="maximum length"): + validateQueryBand(f"key={long_value};") + + +class TestQueryBandCLIIntegration: + """Tests for --query_band via cmd_reader.read_args().""" + + def test_default_when_not_specified(self): + with mock.patch("sys.argv", BASE_ARGS): + config = read_args() + assert config["query_band"] == DEFAULT_QUERY_BAND + + def test_custom_query_band(self): + args = BASE_ARGS + ["--query_band", "team=analytics;"] + with mock.patch("sys.argv", args): + config = read_args() + assert "team=analytics;" in config["query_band"] + assert config["query_band"].startswith(f"org={DEFAULT_QUERY_BAND_ORG};") + + def test_invalid_query_band_rejected(self): + args = BASE_ARGS + ["--query_band", "team='evil;"] + with mock.patch("sys.argv", args): + with pytest.raises(SystemExit): + read_args() diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_sanitize_column_name.py 
b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_sanitize_column_name.py new file mode 100644 index 0000000..fc08fa2 --- /dev/null +++ b/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_sanitize_column_name.py @@ -0,0 +1,80 @@ +# Copyright 2026 Teradata +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for column name sanitization. + +The sanitize_column_name_udf in entry_builder.py only converts non-ASCII +characters to _u_ format. Special characters (!@#$ etc.) are kept +as-is since they are valid Teradata column names. 
+""" + +from src.name_builder import encode_non_ascii as sanitize_column_name + + +class TestSanitizeColumnName: + + # --- ASCII names pass through unchanged --- + + def test_normal_name_unchanged(self): + assert sanitize_column_name("order_id") == "order_id" + + def test_special_chars_preserved(self): + """Special characters are valid Teradata column names — keep as-is.""" + assert sanitize_column_name("!@#$%^&*{}|,?:;~") == "!@#$%^&*{}|,?:;~" + + def test_comma_preserved(self): + assert sanitize_column_name("c decimal (4,2)") == "c decimal (4,2)" + + def test_spaces_preserved(self): + assert sanitize_column_name(" col name ") == " col name " + + def test_empty_string_unchanged(self): + assert sanitize_column_name("") == "" + + def test_none_returns_none(self): + assert sanitize_column_name(None) is None + + # --- Non-ASCII converted to _u_ --- + + def test_chinese_characters(self): + result = sanitize_column_name("测试列") + assert result == "_u6D4B__u8BD5__u5217_" + assert "测" not in result + + def test_japanese_characters(self): + result = sanitize_column_name("にほんご") + assert result == "_u306B__u307B__u3093__u3054_" + + def test_mixed_ascii_and_chinese(self): + result = sanitize_column_name("col_测试") + assert result == "col__u6D4B__u8BD5_" + + def test_accented_characters(self): + result = sanitize_column_name("café") + assert result == "caf_u00E9_" + + def test_single_non_ascii(self): + result = sanitize_column_name("ü") + assert result == "_u00FC_" + + def test_emoji(self): + result = sanitize_column_name("col_🎉") + assert result.startswith("col_") + assert "🎉" not in result + + def test_mixed_special_and_non_ascii(self): + """Special chars kept, non-ASCII encoded.""" + result = sanitize_column_name("!@#测试") + assert result.startswith("!@#") + assert "_u6D4B_" in result diff --git a/managed-connectivity/community-contributed-connectors/teradata-connector/tests/test_secret_manager.py 
# Copyright 2026 Teradata
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for Secret Manager password retrieval."""

from unittest import mock
from unittest.mock import MagicMock

from src.common.secret_manager import get_password


def _install_fake_client(patched_module, secret_text):
    """Wire the patched ``secretmanager`` module to serve *secret_text*.

    Args:
        patched_module: the mock that replaces
            ``src.common.secret_manager.secretmanager``.
        secret_text: value the mocked ``payload.data.decode`` returns.

    Returns:
        ``(client, response)`` mocks so a test can assert on the calls made.
    """
    response = MagicMock()
    response.payload.data.decode.return_value = secret_text

    client = MagicMock()
    client.access_secret_version.return_value = response

    patched_module.SecretManagerServiceClient.return_value = client
    return client, response


class TestGetPassword:
    """Tests for get_password()."""

    @mock.patch("src.common.secret_manager.secretmanager")
    def test_appends_versions_latest(self, mock_sm):
        client, _ = _install_fake_client(mock_sm, "my_secret")

        secret = get_password("projects/proj/secrets/mysecret")

        expected_request = {
            "name": "projects/proj/secrets/mysecret/versions/latest"
        }
        client.access_secret_version.assert_called_once_with(
            request=expected_request
        )
        assert secret == "my_secret"

    @mock.patch("src.common.secret_manager.secretmanager")
    def test_preserves_explicit_version(self, mock_sm):
        client, _ = _install_fake_client(mock_sm, "versioned")
        versioned_path = "projects/proj/secrets/mysecret/versions/3"

        secret = get_password(versioned_path)

        client.access_secret_version.assert_called_once_with(
            request={"name": versioned_path}
        )
        assert secret == "versioned"

    @mock.patch("src.common.secret_manager.secretmanager")
    def test_decodes_utf8(self, mock_sm):
        _, response = _install_fake_client(mock_sm, "p@ss")

        secret = get_password("projects/p/secrets/s")

        response.payload.data.decode.assert_called_with("UTF-8")
        assert secret == "p@ss"
"""Tests for TeradataConnector with mocked PySpark and teradatasql."""

import sys
from unittest import mock
from unittest.mock import MagicMock, patch, PropertyMock

import pytest

# Stub the heavy dependencies before the connector module is imported, so
# the import succeeds without PySpark or teradatasql installed on the runner.
for _heavy_module in (
    "pyspark",
    "pyspark.sql",
    "pyspark.sql.types",
    "teradatasql",
):
    sys.modules.setdefault(_heavy_module, MagicMock())

from src.constants import EntryType
import src.teradata_connector as _tc_module  # force module registration


BASE_CONFIG = {
    "host": "td-server.example.com",
    "port": 1025,
    "user": "testuser",
    "password": "testpass",
}


def _instantiate(cfg, session, td_conn):
    """Create a TeradataConnector with its external collaborators patched.

    Returns ``(connector, teradatasql_mock)`` so callers can inspect the
    arguments passed to ``teradatasql.connect``.
    """
    with patch("src.teradata_connector.getJarPath", return_value="/fake.jar"), \
            patch("src.teradata_connector.fileExists", return_value=True), \
            patch("src.teradata_connector.SparkSession") as spark_cls, \
            patch("src.teradata_connector.teradatasql") as td_module:

        # builder.appName(...).config(...).config(...).getOrCreate()
        after_app_name = spark_cls.builder.appName.return_value
        after_configs = after_app_name.config.return_value.config.return_value
        after_configs.getOrCreate.return_value = session

        td_module.connect.return_value = td_conn

        from src.teradata_connector import TeradataConnector
        built = TeradataConnector(cfg)
    return built, td_module


def _build_connector(config=None, spark=None, td_conn=None):
    """Build a TeradataConnector with all external deps mocked."""
    merged = {**BASE_CONFIG, **(config or {})}
    built, _ = _instantiate(
        merged, spark or MagicMock(), td_conn or MagicMock()
    )
    return built


def _spark_returning(frame):
    """Return a mock session whose JDBC read chain loads *frame*."""
    session = MagicMock()
    reader = session.read.format.return_value.options.return_value
    reader.option.return_value.load.return_value = frame
    return session


def _last_query(session):
    """Return the second positional argument of the last ``.option()`` call."""
    reader = session.read.format.return_value.options.return_value
    return reader.option.call_args[0][1]


def _td_conn_yielding(cursor):
    """Return a mock connection whose ``cursor()`` context yields *cursor*."""
    conn = MagicMock()
    cursor_ctx = conn.cursor.return_value
    cursor_ctx.__enter__ = MagicMock(return_value=cursor)
    cursor_ctx.__exit__ = MagicMock(return_value=False)
    return conn


class TestTeradataConnectorInit:
    """Tests for __init__ and connection setup."""

    def test_init_sets_jdbc_url(self):
        url = _build_connector()._url
        for fragment in (
            "td-server.example.com",
            "DBS_PORT=1025",
            "CHARSET=UTF8",
        ):
            assert fragment in url

    def test_init_custom_charset(self):
        url = _build_connector({"charset": "UTF16"})._url
        assert "CHARSET=UTF16" in url

    def test_init_connect_options(self):
        options = _build_connector()._connectOptions
        assert options["driver"] == "com.teradata.jdbc.TeraDriver"
        assert options["user"] == "testuser"
        assert options["password"] == "testpass"

    def test_init_with_logmech(self):
        url = _build_connector({"logmech": "LDAP"})._url
        assert "LOGMECH=LDAP" in url

    def test_init_without_logmech(self):
        assert "LOGMECH" not in _build_connector()._url

    def test_init_logdata_appended_to_url(self):
        overrides = {"logmech": "LDAP", "logdata": "authdata"}
        assert "LOGDATA=authdata" in _build_connector(overrides)._url

    def test_init_logdata_not_in_url_when_absent(self):
        assert "LOGDATA" not in _build_connector()._url

    def test_init_teradatasql_receives_logmech_logdata(self):
        cfg = {
            **BASE_CONFIG,
            "logmech": "JWT",
            "logdata": "eyJhbGciOi...",
        }
        _, td_module = _instantiate(cfg, MagicMock(), MagicMock())

        connect_kwargs = td_module.connect.call_args[1]
        assert connect_kwargs["logmech"] == "JWT"
        assert connect_kwargs["logdata"] == "eyJhbGciOi..."

    def test_init_user_password_omitted_when_empty(self):
        cfg = {
            **BASE_CONFIG,
            "user": "",
            "password": "",
            "logmech": "LDAP",
            "logdata": "authdata",
        }
        connector, td_module = _instantiate(cfg, MagicMock(), MagicMock())

        for credential in ("user", "password"):
            assert credential not in connector._connectOptions

        connect_kwargs = td_module.connect.call_args[1]
        for credential in ("user", "password"):
            assert credential not in connect_kwargs


class TestGetDbSchemas:
    """Tests for get_db_schemas()."""

    def test_get_db_schemas_executes_query(self):
        frame = MagicMock()
        session = _spark_returning(frame)

        loaded = _build_connector(spark=session).get_db_schemas()

        session.read.format.assert_called_with("jdbc")
        assert loaded == frame

    def test_get_db_schemas_excludes_system_dbs(self):
        session = _spark_returning(MagicMock())

        _build_connector(spark=session).get_db_schemas()

        sql = _last_query(session)
        assert "DBC" in sql
        assert "NOT IN" in sql

    def test_get_db_schemas_with_database_filter(self):
        session = _spark_returning(MagicMock())
        connector = _build_connector(
            config={"database": "MyDB"}, spark=session
        )

        connector.get_db_schemas()

        assert "MyDB" in _last_query(session)


class TestGetDataset:
    """Tests for get_dataset() dispatch."""

    def test_get_dataset_table(self):
        frame = MagicMock()
        frame.orderBy.return_value = frame
        session = _spark_returning(frame)

        connector = _build_connector(spark=session)
        connector.get_dataset("test_schema", EntryType.TABLE)

        frame.orderBy.assert_called_with("TABLE_NAME")

    def test_get_dataset_view_empty(self):
        session = MagicMock()

        # Cursor that reports no views at all.
        empty_cursor = MagicMock()
        empty_cursor.execute.return_value = None
        empty_cursor.fetchall.return_value = []

        connector = _build_connector(
            spark=session, td_conn=_td_conn_yielding(empty_cursor)
        )
        # The view-list query itself is stubbed to come back empty.
        connector._execute_td = MagicMock(return_value=[])

        connector.get_dataset("test_schema", EntryType.VIEW)

        session.createDataFrame.assert_called_once()


class TestGetTables:
    """Tests for _get_tables()."""

    @staticmethod
    def _query_for(schema_name):
        """Run ``_get_tables(schema_name)`` and return the SQL it issued."""
        frame = MagicMock()
        frame.orderBy.return_value = frame
        session = _spark_returning(frame)

        _build_connector(spark=session)._get_tables(schema_name)
        return _last_query(session)

    def test_get_tables_query_contains_schema(self):
        sql = self._query_for("my_schema")
        assert "my_schema" in sql
        assert "TableKind IN ('T', 'O')" in sql

    def test_get_tables_escapes_quotes(self):
        sql = self._query_for("schema'name")
        assert "schema''name" in sql


class TestGetViews:
    """Tests for _get_views() with HELP COLUMN."""

    def test_get_views_with_columns(self):
        session = MagicMock()

        # Step 1: rows returned by the view-list query.
        listed_views = [("view1", "comment", None, None)]

        # Step 2: cursor that answers HELP COLUMN for that view.
        help_cursor = MagicMock()
        help_cursor.description = [
            (label, None)
            for label in (
                "Column Name",
                "Type",
                "Nullable",
                "Max Length",
                "Decimal Total Digits",
                "Decimal Fractional Digits",
                "Comment",
            )
        ]
        help_cursor.fetchall.return_value = [
            ("col1", "CV", "Y", 100, None, None, "col comment"),
        ]

        connector = _build_connector(
            spark=session, td_conn=_td_conn_yielding(help_cursor)
        )
        connector._execute_td = MagicMock(return_value=listed_views)

        connector._get_views("test_schema")

        session.createDataFrame.assert_called_once()
        produced = session.createDataFrame.call_args[0][0]
        assert len(produced) == 1
        first = produced[0]
        assert first[0] == "view1"
        assert first[1] == "col1"

    def test_get_views_help_column_fallback(self):
        """When HELP COLUMN fails, falls back to DBC.ColumnsV."""
        session = MagicMock()

        # Construct with a default connection, then swap in one whose
        # cursor raises as soon as a statement is executed.
        connector = _build_connector(spark=session)
        failing_cursor = MagicMock()
        failing_cursor.execute.side_effect = Exception("HELP COLUMN failed")
        connector._td_conn = _td_conn_yielding(failing_cursor)

        listed_views = [("bad_view", "comment", None, None)]
        fallback_columns = [("fallback_col", "Y")]

        # First _execute_td call: view list. Second: fallback columns.
        connector._execute_td = MagicMock(
            side_effect=[listed_views, fallback_columns]
        )

        connector._get_views("test_schema")

        session.createDataFrame.assert_called_once()
        produced = session.createDataFrame.call_args[0][0]
        assert len(produced) == 1
        first = produced[0]
        assert first[0] == "bad_view"
        assert first[1] == "fallback_col"


class TestClose:
    """Tests for close() and context manager."""

    def test_close_connections(self):
        td_conn = MagicMock()
        session = MagicMock()
        connector = _build_connector(spark=session, td_conn=td_conn)

        connector.close()

        td_conn.close.assert_called_once()
        session.stop.assert_called_once()
        assert connector._td_conn is None
        assert connector._spark is None

    def test_close_idempotent(self):
        connector = _build_connector()
        connector._td_conn = None
        connector._spark = None
        connector.close()  # should not raise

    def test_close_handles_exception(self):
        td_conn = MagicMock()
        td_conn.close.side_effect = Exception("close failed")

        connector = _build_connector(spark=MagicMock(), td_conn=td_conn)
        connector.close()  # should not raise

        assert connector._td_conn is None

    def test_context_manager(self):
        connector = _build_connector()
        with connector as entered:
            assert entered is connector
        assert connector._td_conn is None
"""Tests for top_entry_builder output format."""

import json
import pytest
from src.constants import EntryType
from src.common.top_entry_builder import create

CONFIG = {
    "target_project_id": "my-project",
    "target_location_id": "us-central1",
    "target_entry_group_id": "teradata",
    "host": "td-server.example.com",
}


def _entry_of(entry_type):
    """Build the import item for *entry_type* and return its ``entry`` dict."""
    return json.loads(create(CONFIG, entry_type))["entry"]


class TestImportItemFormat:
    """Verify output matches Dataplex import API requirements."""

    @pytest.fixture(params=[EntryType.INSTANCE, EntryType.DATABASE])
    def import_item(self, request):
        return json.loads(create(CONFIG, request.param))

    def test_top_level_keys(self, import_item):
        for top_key in ("entry", "aspectKeys", "updateMask"):
            assert top_key in import_item

    def test_update_mask_is_array(self, import_item):
        """Docs require update_mask to be ArrayType(StringType())."""
        mask = import_item["updateMask"]
        assert isinstance(mask, list)
        assert mask == ["aspects"]

    def test_aspect_keys_is_array(self, import_item):
        keys = import_item["aspectKeys"]
        assert isinstance(keys, list)
        assert len(keys) > 0

    def test_entry_has_required_fields(self, import_item):
        entry = import_item["entry"]
        # parentEntry may be an empty string for the instance entry, but
        # the key itself must be present.
        for required in (
            "name",
            "entryType",
            "fullyQualifiedName",
            "aspects",
            "parentEntry",
        ):
            assert required in entry

    def test_entry_has_entry_source(self, import_item):
        """Docs require entrySource on all entries."""
        entry = import_item["entry"]
        assert "entrySource" in entry
        source = entry["entrySource"]
        assert "displayName" in source
        assert "system" in source
        assert source["system"] == "teradata"

    def test_aspects_have_correct_structure(self, import_item):
        for aspect_name, payload in import_item["entry"]["aspects"].items():
            assert "aspectType" in payload
            assert "data" in payload
            assert payload["aspectType"] == aspect_name

    def test_camel_case_keys(self, import_item):
        """Ensure all keys are camelCase, not snake_case."""
        entry = import_item["entry"]
        spellings = (
            ("entry_type", "entryType"),
            ("fully_qualified_name", "fullyQualifiedName"),
            ("parent_entry", "parentEntry"),
            ("entry_source", "entrySource"),
        )
        for snake, camel in spellings:
            assert snake not in entry
            assert camel in entry


class TestInstanceEntry:
    """Checks specific to the instance-level entry."""

    def test_instance_fqn(self):
        entry = _entry_of(EntryType.INSTANCE)
        assert entry["fullyQualifiedName"] == (
            "custom:`td-server-example-com`"
        )

    def test_instance_parent_empty(self):
        assert _entry_of(EntryType.INSTANCE)["parentEntry"] == ""

    def test_instance_entry_type(self):
        entry_type = _entry_of(EntryType.INSTANCE)["entryType"]
        assert entry_type.endswith("teradata-instance")


class TestDatabaseEntry:
    """Checks specific to the database-level entry."""

    def test_database_parent_is_instance(self):
        parent = _entry_of(EntryType.DATABASE)["parentEntry"]
        assert "/entries/td-server.example.com" in parent
        assert "/databases/" not in parent

    def test_database_name_contains_databases(self):
        assert "/databases/" in _entry_of(EntryType.DATABASE)["name"]
"""Validate a JSONL output file against Dataplex import API requirements."""

import json
import sys
from pathlib import Path

VALID_METADATA_TYPES = {
    "NUMBER", "STRING", "BYTES", "TIMESTAMP", "DATETIME", "BOOLEAN", "OTHER"
}
VALID_MODES = {"NULLABLE", "REQUIRED"}

# Aspect key under which table/view column schemas are stored.
_SCHEMA_ASPECT_KEY = "dataplex-types.global.schema"

# Suffixes (after "teradata-") used to classify entries in the report.
_ENTRY_KINDS = ("instance", "database", "schema", "table", "view")

# snake_case spellings that must not appear as entry keys.
_SNAKE_CASE_KEYS = frozenset({
    "entry_type", "fully_qualified_name", "parent_entry",
    "entry_source", "aspect_keys", "update_mask",
    "display_name", "aspect_type", "data_type", "metadata_type",
    "default_value",
})


def _validate_schema_fields(schema_aspect, prefix):
    """Return errors for the ``data.fields`` list of a schema aspect dict."""
    schema_data = schema_aspect.get("data", {})
    if not isinstance(schema_data, dict):
        return [f"{prefix}: schema 'data' must be an object (dictionary)"]

    fields = schema_data.get("fields", [])
    if not isinstance(fields, list):
        return [f"{prefix}: schema 'fields' must be a list"]

    errors = []
    if not fields:
        errors.append(f"{prefix}: schema has no fields")

    for i, field in enumerate(fields):
        fp = f"{prefix}, field[{i}]"
        if not isinstance(field, dict):
            errors.append(f"{fp}: must be an object (dictionary)")
            continue

        if "name" not in field:
            errors.append(f"{fp}: missing 'name'")

        if "mode" not in field:
            errors.append(f"{fp}: missing 'mode'")
        else:
            mode = field["mode"]
            # isinstance first: an unhashable value (list/dict) would make
            # the set membership test raise instead of reporting.
            if not isinstance(mode, str) or mode not in VALID_MODES:
                errors.append(f"{fp}: invalid mode '{mode}'")

        if "dataType" not in field:
            errors.append(f"{fp}: missing 'dataType'")

        if "metadataType" not in field:
            errors.append(f"{fp}: missing 'metadataType'")
        else:
            metadata_type = field["metadataType"]
            if (
                not isinstance(metadata_type, str)
                or metadata_type not in VALID_METADATA_TYPES
            ):
                errors.append(
                    f"{fp}: invalid metadataType "
                    f"'{metadata_type}'"
                )

    return errors


def validate_import_item(item, line_num):
    """Validate a single import item.

    Malformed shapes (a non-object item, ``entrySource`` that is not an
    object, a non-string ``name``, and so on) are reported as errors rather
    than raising, so one bad line cannot abort validation of the file.

    Args:
        item: the value parsed from one JSONL line.
        line_num: 1-based line number, used as the prefix of every message.

    Returns:
        A list of error strings; empty when the item passes every check.
    """
    prefix = f"Line {line_num}"

    if not isinstance(item, dict):
        return [f"{prefix}: import item must be a JSON object"]

    errors = []

    # 1. Top-level keys
    for key in ("entry", "aspectKeys", "updateMask"):
        if key not in item:
            errors.append(f"{prefix}: missing top-level key '{key}'")

    if "entry" not in item:
        return errors

    entry = item["entry"]
    if not isinstance(entry, dict):
        errors.append(f"{prefix}: 'entry' must be a JSON object")
        return errors

    # 2. updateMask must be ["aspects"]
    if item.get("updateMask") != ["aspects"]:
        errors.append(
            f"{prefix}: updateMask should be ['aspects'], "
            f"got {item.get('updateMask')}"
        )

    # 3. aspectKeys must be a non-empty list
    aspect_keys = item.get("aspectKeys", [])
    if not isinstance(aspect_keys, list) or not aspect_keys:
        errors.append(f"{prefix}: aspectKeys must be a non-empty list")

    # 4. Required entry fields
    for field in (
        "name", "entryType", "fullyQualifiedName", "aspects", "entrySource"
    ):
        if field not in entry:
            errors.append(f"{prefix}: entry missing '{field}'")

    # 5. No snake_case keys (must be camelCase)
    for key in entry:
        if key in _SNAKE_CASE_KEYS:
            errors.append(f"{prefix}: snake_case key '{key}' found in entry")

    # 6. entrySource validation
    entry_source = entry.get("entrySource", {})
    if not isinstance(entry_source, dict):
        errors.append(f"{prefix}: entrySource must be an object (dictionary)")
    else:
        if "displayName" not in entry_source:
            errors.append(f"{prefix}: entrySource missing 'displayName'")
        if entry_source.get("system") != "teradata":
            errors.append(
                f"{prefix}: entrySource.system should be 'teradata', "
                f"got '{entry_source.get('system')}'"
            )

    # 7. Aspects structure
    aspects = entry.get("aspects", {})
    if not isinstance(aspects, dict):
        errors.append(f"{prefix}: aspects must be an object (dictionary)")
        aspects = {}

    for key, aspect in aspects.items():
        if not isinstance(aspect, dict):
            errors.append(
                f"{prefix}: aspect '{key}' must be an object (dictionary)"
            )
            continue
        if "aspectType" not in aspect:
            errors.append(f"{prefix}: aspect '{key}' missing 'aspectType'")
        elif aspect["aspectType"] != key:
            errors.append(
                f"{prefix}: aspect key '{key}' != "
                f"aspectType '{aspect['aspectType']}'"
            )
        if "data" not in aspect:
            errors.append(f"{prefix}: aspect '{key}' missing 'data'")

    # 8. aspectKeys must match aspects keys
    if isinstance(aspect_keys, list):
        # Only strings can match an aspects key; filtering also keeps
        # unhashable entries out of the set.
        aspect_key_set = {k for k in aspect_keys if isinstance(k, str)}
        if any(not isinstance(k, str) for k in aspect_keys):
            errors.append(f"{prefix}: aspectKeys entries must be strings")
    else:
        aspect_key_set = set()
    actual_keys = set(aspects)
    if aspect_key_set != actual_keys:
        errors.append(
            f"{prefix}: aspectKeys {aspect_key_set} != "
            f"aspects keys {actual_keys}"
        )

    # 9. Name pattern
    name = entry.get("name", "")
    if not isinstance(name, str):
        errors.append(f"{prefix}: name must be a string")
    else:
        if not name.startswith("projects/"):
            errors.append(f"{prefix}: name doesn't start with 'projects/'")
        if "/entryGroups/" not in name:
            errors.append(f"{prefix}: name missing '/entryGroups/'")
        if "/entries/" not in name:
            errors.append(f"{prefix}: name missing '/entries/'")

    # 10. Schema aspect validation (tables and views). A non-dict schema
    # aspect has already been reported in step 7.
    schema_aspect = aspects.get(_SCHEMA_ASPECT_KEY)
    if isinstance(schema_aspect, dict):
        errors.extend(_validate_schema_fields(schema_aspect, prefix))

    return errors


def validate_hierarchy(items):
    """Check parent-child relationships are consistent.

    Items that are not objects, or whose ``entry`` is not an object, are
    skipped here; their shape is reported by ``validate_import_item``.

    Args:
        items: list of (line_num, parsed_item) tuples.

    Returns:
        A list of error strings, one per non-empty ``parentEntry`` that
        does not equal the ``name`` of any entry in *items*.
    """
    def entry_of(item):
        entry = item.get("entry") if isinstance(item, dict) else None
        return entry if isinstance(entry, dict) else None

    # Collect names only from items that have a valid entry with a name.
    entry_names = set()
    for _line_num, item in items:
        entry = entry_of(item)
        if entry is None:
            continue
        name = entry.get("name")
        if isinstance(name, str) and name:
            entry_names.add(name)

    errors = []
    for line_num, item in items:
        entry = entry_of(item)
        if entry is None:
            continue
        parent = entry.get("parentEntry", "")
        # Non-string parents cannot match a name and may be unhashable.
        if isinstance(parent, str) and parent and parent not in entry_names:
            errors.append(
                f"Line {line_num}: parentEntry '{parent}' "
                f"not found in any entry name"
            )
    return errors


def _tally(stats, item):
    """Update the report counters from one parsed line.

    Tolerates any JSON shape; only well-formed parts contribute to counts.
    """
    stats["total"] += 1

    entry = item.get("entry") if isinstance(item, dict) else None
    if not isinstance(entry, dict):
        return

    # Classify entry type
    entry_type = entry.get("entryType", "")
    if isinstance(entry_type, str):
        for kind in _ENTRY_KINDS:
            if entry_type.endswith(f"teradata-{kind}"):
                stats[kind] += 1
                break

    # Count fields
    aspects = entry.get("aspects")
    schema_aspect = (
        aspects.get(_SCHEMA_ASPECT_KEY) if isinstance(aspects, dict) else None
    )
    data = (
        schema_aspect.get("data") if isinstance(schema_aspect, dict) else None
    )
    fields = data.get("fields") if isinstance(data, dict) else None
    if isinstance(fields, list):
        stats["fields_total"] += len(fields)


def main():
    """Validate the JSONL file named by ``sys.argv[1]`` and print a report.

    Exits with status 1 on a usage error, a missing file, or any validation
    error; exits with status 0 when every entry passes.
    """
    if len(sys.argv) < 2:
        print("Usage: python validate_output.py <output.jsonl>")
        sys.exit(1)

    filepath = Path(sys.argv[1])
    if not filepath.exists():
        print(f"File not found: {filepath}")
        sys.exit(1)

    items = []
    all_errors = []
    stats = {
        "total": 0, "instance": 0, "database": 0,
        "schema": 0, "table": 0, "view": 0,
        "fields_total": 0,
    }

    with open(filepath, encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                item = json.loads(line)
            except json.JSONDecodeError as e:
                all_errors.append(f"Line {line_num}: invalid JSON: {e}")
                continue

            items.append((line_num, item))
            _tally(stats, item)
            all_errors.extend(validate_import_item(item, line_num))

    # Validate hierarchy
    all_errors.extend(validate_hierarchy(items))

    # Report
    print("=== Dataplex Import Validation Report ===")
    print(f"File: {filepath}")
    print(f"Total entries: {stats['total']}")
    print(f" Instances: {stats['instance']}")
    print(f" Databases: {stats['database']}")
    print(f" Schemas: {stats['schema']}")
    print(f" Tables: {stats['table']}")
    print(f" Views: {stats['view']}")
    print(f" Total fields (columns): {stats['fields_total']}")
    print()

    if all_errors:
        print(f"FAILED: {len(all_errors)} error(s) found:")
        for err in all_errors:
            print(f" - {err}")
        sys.exit(1)
    else:
        print("PASSED: All entries are Dataplex-compatible.")
        sys.exit(0)


if __name__ == "__main__":
    main()