From fe48990133d62eb1c99aa593a150256c44e75807 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Wed, 28 Jan 2026 15:43:33 -0500 Subject: [PATCH 1/3] feat: add request translator for IndexEmbed to semantic_text Add _translate_embed_to_semantic_text() method to PineconeDBControlRequestFactory that converts create_index_for_model parameters (cloud, region, embed) to the new API format using deployment and schema with semantic_text field type. Translation logic: - cloud/region -> ServerlessDeployment - IndexEmbed.model -> semantic_text field model - IndexEmbed.metric -> semantic_text field metric (if provided) - IndexEmbed.field_map values -> schema field names - Default read_parameters: {"input_type": "query"} - Default write_parameters: {"input_type": "passage"} Refs: SDK-105 --- pinecone/db_control/request_factory.py | 104 +++++++++++ .../db_control/test_index_request_factory.py | 163 ++++++++++++++++++ 2 files changed, 267 insertions(+) diff --git a/pinecone/db_control/request_factory.py b/pinecone/db_control/request_factory.py index 9874c610..5cd81c6d 100644 --- a/pinecone/db_control/request_factory.py +++ b/pinecone/db_control/request_factory.py @@ -492,6 +492,110 @@ def _translate_legacy_request( return deployment_dict, schema_dict + @staticmethod + def _translate_embed_to_semantic_text( + cloud: CloudProvider | str, + region: AwsRegion | GcpRegion | AzureRegion | str, + embed: IndexEmbed | CreateIndexForModelEmbedTypedDict, + ) -> tuple[dict[str, Any], dict[str, Any]]: + """Translate embed-based request to deployment + schema format. + + This method converts `create_index_for_model` parameters (cloud, region, embed) + to the new API format using deployment and schema structures with a semantic_text + field type. + + :param cloud: The cloud provider (aws, gcp, azure). + :param region: The cloud region. + :param embed: The IndexEmbed configuration or equivalent dict. + :returns: A tuple of (deployment_dict, schema_dict) for the new API format. 
+ + **Translation Example:** + + * Input: ``cloud="aws"``, ``region="us-east-1"``, + ``embed=IndexEmbed(model="multilingual-e5-large", metric="cosine", field_map={"text": "synopsis"})`` + * Output deployment: ``{"deployment_type": "serverless", "cloud": "aws", "region": "us-east-1"}`` + * Output schema: ``{"fields": {"synopsis": {"type": "semantic_text", "model": "multilingual-e5-large", ...}}}`` + + Example:: + + deployment, schema = PineconeDBControlRequestFactory._translate_embed_to_semantic_text( + cloud="aws", + region="us-east-1", + embed=IndexEmbed( + model="multilingual-e5-large", + metric="cosine", + field_map={"text": "synopsis"} + ) + ) + """ + # Convert enum values to strings + cloud = convert_enum_to_string(cloud) + region = convert_enum_to_string(region) + + # Create ServerlessDeployment for cloud/region + deployment = ServerlessDeployment(cloud=cloud, region=region) + deployment_dict = deployment.to_dict() + + # Parse embed configuration + model: str + metric: str | None + field_map: dict[str, str] + read_parameters: dict[str, Any] | None + write_parameters: dict[str, Any] | None + + if isinstance(embed, IndexEmbed): + model = embed.model + metric = embed.metric + field_map = embed.field_map + read_parameters = embed.read_parameters + write_parameters = embed.write_parameters + else: + # Dict-based embed + raw_model = embed.get("model") + if raw_model is None: + raise ValueError("model is required in embed") + model = convert_enum_to_string(raw_model) + raw_metric = embed.get("metric") + metric = convert_enum_to_string(raw_metric) if raw_metric is not None else None + raw_field_map = embed.get("field_map") + if raw_field_map is None: + raise ValueError("field_map is required in embed") + field_map = raw_field_map + read_parameters = embed.get("read_parameters") + write_parameters = embed.get("write_parameters") + + # Extract field name from field_map values + # field_map is like {"text": "synopsis"} where "synopsis" is the target field name + if not 
field_map: + raise ValueError("field_map must contain at least one mapping") + + # Build schema with semantic_text fields + schema_dict: dict[str, Any] = {"fields": {}} + + for source_field, target_field in field_map.items(): + # Build the semantic_text field configuration + field_config: dict[str, Any] = {"type": "semantic_text", "model": model} + + # Include metric if provided + if metric is not None: + field_config["metric"] = convert_enum_to_string(metric) + + # Apply default read_parameters if not provided or empty + if read_parameters: + field_config["read_parameters"] = read_parameters + else: + field_config["read_parameters"] = {"input_type": "query"} + + # Apply default write_parameters if not provided or empty + if write_parameters: + field_config["write_parameters"] = write_parameters + else: + field_config["write_parameters"] = {"input_type": "passage"} + + schema_dict["fields"][target_field] = field_config + + return deployment_dict, schema_dict + @staticmethod def create_index_request( name: str, diff --git a/tests/unit/db_control/test_index_request_factory.py b/tests/unit/db_control/test_index_request_factory.py index 38911506..9bd29708 100644 --- a/tests/unit/db_control/test_index_request_factory.py +++ b/tests/unit/db_control/test_index_request_factory.py @@ -549,3 +549,166 @@ def test_translate_invalid_vector_type_typo(self): dimension=1536, vector_type="desnse", # Typo ) + + +class TestTranslateEmbedToSemanticText: + """Tests for _translate_embed_to_semantic_text method.""" + + def test_basic_index_embed_translation(self): + """Test basic IndexEmbed to semantic_text translation.""" + from pinecone.db_control.models import IndexEmbed + + embed = IndexEmbed( + model="multilingual-e5-large", metric="cosine", field_map={"text": "synopsis"} + ) + deployment, schema = PineconeDBControlRequestFactory._translate_embed_to_semantic_text( + cloud="aws", region="us-east-1", embed=embed + ) + + assert deployment == { + "deployment_type": "serverless", + 
"cloud": "aws", + "region": "us-east-1", + } + assert schema == { + "fields": { + "synopsis": { + "type": "semantic_text", + "model": "multilingual-e5-large", + "metric": "cosine", + "read_parameters": {"input_type": "query"}, + "write_parameters": {"input_type": "passage"}, + } + } + } + + def test_embed_translation_with_custom_parameters(self): + """Test IndexEmbed translation with custom read/write parameters.""" + from pinecone.db_control.models import IndexEmbed + + embed = IndexEmbed( + model="multilingual-e5-large", + metric="dotproduct", + field_map={"text": "content"}, + read_parameters={"input_type": "search_query", "truncate": "END"}, + write_parameters={"input_type": "search_document", "truncate": "END"}, + ) + deployment, schema = PineconeDBControlRequestFactory._translate_embed_to_semantic_text( + cloud="gcp", region="us-central1", embed=embed + ) + + assert deployment == { + "deployment_type": "serverless", + "cloud": "gcp", + "region": "us-central1", + } + assert schema["fields"]["content"]["read_parameters"] == { + "input_type": "search_query", + "truncate": "END", + } + assert schema["fields"]["content"]["write_parameters"] == { + "input_type": "search_document", + "truncate": "END", + } + + def test_embed_translation_without_metric(self): + """Test IndexEmbed translation without metric (should not include metric in output).""" + from pinecone.db_control.models import IndexEmbed + + embed = IndexEmbed(model="multilingual-e5-large", field_map={"text": "description"}) + deployment, schema = PineconeDBControlRequestFactory._translate_embed_to_semantic_text( + cloud="aws", region="us-west-2", embed=embed + ) + + assert "metric" not in schema["fields"]["description"] + assert schema["fields"]["description"]["type"] == "semantic_text" + assert schema["fields"]["description"]["model"] == "multilingual-e5-large" + + def test_embed_translation_with_dict(self): + """Test dict-based embed configuration translation.""" + embed = { + "model": 
"multilingual-e5-large", + "metric": "euclidean", + "field_map": {"text": "body"}, + } + deployment, schema = PineconeDBControlRequestFactory._translate_embed_to_semantic_text( + cloud="aws", region="us-east-1", embed=embed + ) + + assert deployment["deployment_type"] == "serverless" + assert schema["fields"]["body"]["type"] == "semantic_text" + assert schema["fields"]["body"]["model"] == "multilingual-e5-large" + assert schema["fields"]["body"]["metric"] == "euclidean" + assert schema["fields"]["body"]["read_parameters"] == {"input_type": "query"} + assert schema["fields"]["body"]["write_parameters"] == {"input_type": "passage"} + + def test_embed_translation_with_enum_cloud_region(self): + """Test translation with enum values for cloud and region.""" + from pinecone.db_control.models import IndexEmbed + + embed = IndexEmbed( + model="multilingual-e5-large", metric="cosine", field_map={"text": "synopsis"} + ) + deployment, schema = PineconeDBControlRequestFactory._translate_embed_to_semantic_text( + cloud=CloudProvider.AWS, region=AwsRegion.US_EAST_1, embed=embed + ) + + assert deployment["cloud"] == "aws" + assert deployment["region"] == "us-east-1" + + def test_embed_translation_multiple_field_mappings(self): + """Test IndexEmbed translation with multiple field mappings.""" + from pinecone.db_control.models import IndexEmbed + + embed = IndexEmbed( + model="multilingual-e5-large", + metric="cosine", + field_map={"text": "title", "description": "content"}, + ) + deployment, schema = PineconeDBControlRequestFactory._translate_embed_to_semantic_text( + cloud="aws", region="us-east-1", embed=embed + ) + + assert "title" in schema["fields"] + assert "content" in schema["fields"] + assert schema["fields"]["title"]["type"] == "semantic_text" + assert schema["fields"]["content"]["type"] == "semantic_text" + + def test_embed_translation_missing_model_raises_error(self): + """Test that missing model in dict embed raises ValueError.""" + embed = {"field_map": {"text": 
"synopsis"}} + with pytest.raises(ValueError, match="model is required"): + PineconeDBControlRequestFactory._translate_embed_to_semantic_text( + cloud="aws", region="us-east-1", embed=embed + ) + + def test_embed_translation_missing_field_map_raises_error(self): + """Test that missing field_map in dict embed raises ValueError.""" + embed = {"model": "multilingual-e5-large"} + with pytest.raises(ValueError, match="field_map is required"): + PineconeDBControlRequestFactory._translate_embed_to_semantic_text( + cloud="aws", region="us-east-1", embed=embed + ) + + def test_embed_translation_empty_field_map_raises_error(self): + """Test that empty field_map raises ValueError.""" + from pinecone.db_control.models import IndexEmbed + + embed = IndexEmbed(model="multilingual-e5-large", field_map={}) + with pytest.raises(ValueError, match="field_map must contain at least one mapping"): + PineconeDBControlRequestFactory._translate_embed_to_semantic_text( + cloud="aws", region="us-east-1", embed=embed + ) + + def test_embed_translation_with_metric_enum(self): + """Test IndexEmbed translation with Metric enum value.""" + from pinecone.db_control.models import IndexEmbed + + embed = IndexEmbed( + model="multilingual-e5-large", metric=Metric.EUCLIDEAN, field_map={"text": "synopsis"} + ) + deployment, schema = PineconeDBControlRequestFactory._translate_embed_to_semantic_text( + cloud="aws", region="us-east-1", embed=embed + ) + + assert schema["fields"]["synopsis"]["metric"] == "euclidean" From 02d230d15d669b1f30cd5aac20dc43b51469d977 Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Wed, 28 Jan 2026 15:50:24 -0500 Subject: [PATCH 2/3] fix: copy parameter dicts to avoid shared references across fields When multiple fields exist in field_map and custom read/write parameters are provided, each field now gets an independent copy of the dictionary instead of sharing the same reference. 
--- pinecone/db_control/request_factory.py | 6 ++-- .../db_control/test_index_request_factory.py | 33 +++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/pinecone/db_control/request_factory.py b/pinecone/db_control/request_factory.py index 5cd81c6d..c5f19192 100644 --- a/pinecone/db_control/request_factory.py +++ b/pinecone/db_control/request_factory.py @@ -581,14 +581,16 @@ def _translate_embed_to_semantic_text( field_config["metric"] = convert_enum_to_string(metric) # Apply default read_parameters if not provided or empty + # Use dict() to create a copy to avoid shared references across fields if read_parameters: - field_config["read_parameters"] = read_parameters + field_config["read_parameters"] = dict(read_parameters) else: field_config["read_parameters"] = {"input_type": "query"} # Apply default write_parameters if not provided or empty + # Use dict() to create a copy to avoid shared references across fields if write_parameters: - field_config["write_parameters"] = write_parameters + field_config["write_parameters"] = dict(write_parameters) else: field_config["write_parameters"] = {"input_type": "passage"} diff --git a/tests/unit/db_control/test_index_request_factory.py b/tests/unit/db_control/test_index_request_factory.py index 9bd29708..9ff9f1d5 100644 --- a/tests/unit/db_control/test_index_request_factory.py +++ b/tests/unit/db_control/test_index_request_factory.py @@ -712,3 +712,36 @@ def test_embed_translation_with_metric_enum(self): ) assert schema["fields"]["synopsis"]["metric"] == "euclidean" + + def test_embed_translation_multiple_fields_independent_copies(self): + """Test that multiple field mappings get independent copies of parameters.""" + from pinecone.db_control.models import IndexEmbed + + embed = IndexEmbed( + model="multilingual-e5-large", + metric="cosine", + field_map={"text": "title", "description": "content"}, + read_parameters={"input_type": "search_query"}, + write_parameters={"input_type": 
"search_document"}, + ) + deployment, schema = PineconeDBControlRequestFactory._translate_embed_to_semantic_text( + cloud="aws", region="us-east-1", embed=embed + ) + + # Verify both fields have correct parameters + assert schema["fields"]["title"]["read_parameters"] == {"input_type": "search_query"} + assert schema["fields"]["content"]["read_parameters"] == {"input_type": "search_query"} + + # Verify dictionaries are independent copies (not shared references) + assert ( + schema["fields"]["title"]["read_parameters"] + is not schema["fields"]["content"]["read_parameters"] + ) + assert ( + schema["fields"]["title"]["write_parameters"] + is not schema["fields"]["content"]["write_parameters"] + ) + + # Verify modifying one doesn't affect the other + schema["fields"]["title"]["read_parameters"]["extra"] = "value" + assert "extra" not in schema["fields"]["content"]["read_parameters"] From 266687b6f5b8d0a63323fa78895025e9831fc73c Mon Sep 17 00:00:00 2001 From: Jen Hamon Date: Wed, 28 Jan 2026 16:00:04 -0500 Subject: [PATCH 3/3] style: iterate field_map.values() instead of unpacking an unused key --- pinecone/db_control/request_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pinecone/db_control/request_factory.py b/pinecone/db_control/request_factory.py index c5f19192..fb99a10d 100644 --- a/pinecone/db_control/request_factory.py +++ b/pinecone/db_control/request_factory.py @@ -572,7 +572,7 @@ def _translate_embed_to_semantic_text( # Build schema with semantic_text fields schema_dict: dict[str, Any] = {"fields": {}} - for source_field, target_field in field_map.items(): + for target_field in field_map.values(): # Build the semantic_text field configuration field_config: dict[str, Any] = {"type": "semantic_text", "model": model}