Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions pinecone/db_control/request_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,112 @@ def _translate_legacy_request(

return deployment_dict, schema_dict

@staticmethod
def _translate_embed_to_semantic_text(
    cloud: CloudProvider | str,
    region: AwsRegion | GcpRegion | AzureRegion | str,
    embed: IndexEmbed | CreateIndexForModelEmbedTypedDict,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Translate embed-based request to deployment + schema format.

    This method converts `create_index_for_model` parameters (cloud, region, embed)
    to the new API format using deployment and schema structures with a semantic_text
    field type.

    Every *value* of ``field_map`` becomes one ``semantic_text`` field in the schema;
    the keys of ``field_map`` are not used. All fields share the same model and
    metric. ``read_parameters`` / ``write_parameters`` fall back to
    ``{"input_type": "query"}`` / ``{"input_type": "passage"}`` when missing or
    empty, and each field receives its own copy of the parameter dict.

    :param cloud: The cloud provider (aws, gcp, azure).
    :param region: The cloud region.
    :param embed: The IndexEmbed configuration or equivalent dict.
    :returns: A tuple of (deployment_dict, schema_dict) for the new API format.
    :raises ValueError: If a dict-based ``embed`` has no ``model`` or no
        ``field_map``, or if ``field_map`` is empty.

    **Translation Example:**

    * Input: ``cloud="aws"``, ``region="us-east-1"``,
      ``embed=IndexEmbed(model="multilingual-e5-large", metric="cosine", field_map={"text": "synopsis"})``
    * Output deployment: ``{"deployment_type": "serverless", "cloud": "aws", "region": "us-east-1"}``
    * Output schema: ``{"fields": {"synopsis": {"type": "semantic_text", "model": "multilingual-e5-large", ...}}}``

    Example::

        deployment, schema = PineconeDBControlRequestFactory._translate_embed_to_semantic_text(
            cloud="aws",
            region="us-east-1",
            embed=IndexEmbed(
                model="multilingual-e5-large",
                metric="cosine",
                field_map={"text": "synopsis"}
            )
        )
    """
    # Convert enum values to strings
    cloud = convert_enum_to_string(cloud)
    region = convert_enum_to_string(region)

    # Create ServerlessDeployment for cloud/region
    deployment = ServerlessDeployment(cloud=cloud, region=region)
    deployment_dict = deployment.to_dict()

    # Parse embed configuration
    model: str
    metric: str | None
    field_map: dict[str, str]
    read_parameters: dict[str, Any] | None
    write_parameters: dict[str, Any] | None

    if isinstance(embed, IndexEmbed):
        model = embed.model
        metric = embed.metric
        field_map = embed.field_map
        read_parameters = embed.read_parameters
        write_parameters = embed.write_parameters
    else:
        # Dict-based embed
        raw_model = embed.get("model")
        if raw_model is None:
            raise ValueError("model is required in embed")
        model = convert_enum_to_string(raw_model)
        raw_metric = embed.get("metric")
        metric = convert_enum_to_string(raw_metric) if raw_metric is not None else None
        raw_field_map = embed.get("field_map")
        if raw_field_map is None:
            raise ValueError("field_map is required in embed")
        field_map = raw_field_map
        read_parameters = embed.get("read_parameters")
        write_parameters = embed.get("write_parameters")

    # field_map is like {"text": "synopsis"} where "synopsis" is the target field name
    if not field_map:
        raise ValueError("field_map must contain at least one mapping")

    # Build schema with semantic_text fields
    schema_dict: dict[str, Any] = {"fields": {}}

    # The metric is the same for every field, so normalise it once here instead
    # of on each loop iteration. For the dict-based path it has already been
    # converted above, so this second call only acts as a defensive safeguard
    # for the IndexEmbed path.
    if metric is not None:
        metric = convert_enum_to_string(metric)

    # Only the target field names (the values of field_map) are needed.
    for target_field in field_map.values():
        # Build the semantic_text field configuration
        field_config: dict[str, Any] = {"type": "semantic_text", "model": model}

        # Include metric if provided
        if metric is not None:
            field_config["metric"] = metric

        # Apply defaults when parameters are not provided or empty. dict() makes
        # a per-field copy so fields never share a mutable parameter dict.
        field_config["read_parameters"] = (
            dict(read_parameters) if read_parameters else {"input_type": "query"}
        )
        field_config["write_parameters"] = (
            dict(write_parameters) if write_parameters else {"input_type": "passage"}
        )

        schema_dict["fields"][target_field] = field_config

    return deployment_dict, schema_dict
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New function defined but never called in production code

Low Severity

The _translate_embed_to_semantic_text method is defined and has unit tests, but is never called from any production code path. The only references outside the definition itself are in the docstring example and test files. This is scaffolding code that hasn't been integrated into the actual request flow (e.g., create_index_for_model_request doesn't call it).

Fix in Cursor Fix in Web


@staticmethod
def create_index_request(
name: str,
Expand Down
196 changes: 196 additions & 0 deletions tests/unit/db_control/test_index_request_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,3 +549,199 @@ def test_translate_invalid_vector_type_typo(self):
dimension=1536,
vector_type="desnse", # Typo
)


class TestTranslateEmbedToSemanticText:
    """Tests for _translate_embed_to_semantic_text method."""

    @staticmethod
    def _translate(cloud, region, embed):
        """Invoke the factory method under test with keyword arguments."""
        return PineconeDBControlRequestFactory._translate_embed_to_semantic_text(
            cloud=cloud, region=region, embed=embed
        )

    def test_basic_index_embed_translation(self):
        """A minimal IndexEmbed yields a serverless deployment and one semantic_text field."""
        from pinecone.db_control.models import IndexEmbed

        config = IndexEmbed(
            model="multilingual-e5-large", metric="cosine", field_map={"text": "synopsis"}
        )

        deployment, schema = self._translate("aws", "us-east-1", config)

        expected_deployment = {
            "deployment_type": "serverless",
            "cloud": "aws",
            "region": "us-east-1",
        }
        expected_schema = {
            "fields": {
                "synopsis": {
                    "type": "semantic_text",
                    "model": "multilingual-e5-large",
                    "metric": "cosine",
                    "read_parameters": {"input_type": "query"},
                    "write_parameters": {"input_type": "passage"},
                }
            }
        }
        assert deployment == expected_deployment
        assert schema == expected_schema

    def test_embed_translation_with_custom_parameters(self):
        """Caller-supplied read/write parameters are passed through instead of defaults."""
        from pinecone.db_control.models import IndexEmbed

        custom_read = {"input_type": "search_query", "truncate": "END"}
        custom_write = {"input_type": "search_document", "truncate": "END"}
        config = IndexEmbed(
            model="multilingual-e5-large",
            metric="dotproduct",
            field_map={"text": "content"},
            read_parameters={"input_type": "search_query", "truncate": "END"},
            write_parameters={"input_type": "search_document", "truncate": "END"},
        )

        deployment, schema = self._translate("gcp", "us-central1", config)

        assert deployment == {
            "deployment_type": "serverless",
            "cloud": "gcp",
            "region": "us-central1",
        }
        content_field = schema["fields"]["content"]
        assert content_field["read_parameters"] == custom_read
        assert content_field["write_parameters"] == custom_write

    def test_embed_translation_without_metric(self):
        """Without a metric on the embed, the field config carries no metric key."""
        from pinecone.db_control.models import IndexEmbed

        config = IndexEmbed(model="multilingual-e5-large", field_map={"text": "description"})

        _, schema = self._translate("aws", "us-west-2", config)

        description_field = schema["fields"]["description"]
        assert "metric" not in description_field
        assert description_field["type"] == "semantic_text"
        assert description_field["model"] == "multilingual-e5-large"

    def test_embed_translation_with_dict(self):
        """A plain dict embed is translated the same way, including defaults."""
        config = {
            "model": "multilingual-e5-large",
            "metric": "euclidean",
            "field_map": {"text": "body"},
        }

        deployment, schema = self._translate("aws", "us-east-1", config)

        assert deployment["deployment_type"] == "serverless"
        body_field = schema["fields"]["body"]
        assert body_field["type"] == "semantic_text"
        assert body_field["model"] == "multilingual-e5-large"
        assert body_field["metric"] == "euclidean"
        assert body_field["read_parameters"] == {"input_type": "query"}
        assert body_field["write_parameters"] == {"input_type": "passage"}

    def test_embed_translation_with_enum_cloud_region(self):
        """Enum cloud/region arguments appear as plain strings in the deployment."""
        from pinecone.db_control.models import IndexEmbed

        config = IndexEmbed(
            model="multilingual-e5-large", metric="cosine", field_map={"text": "synopsis"}
        )

        deployment, _ = self._translate(CloudProvider.AWS, AwsRegion.US_EAST_1, config)

        assert deployment["cloud"] == "aws"
        assert deployment["region"] == "us-east-1"

    def test_embed_translation_multiple_field_mappings(self):
        """Each field_map value produces its own semantic_text field."""
        from pinecone.db_control.models import IndexEmbed

        config = IndexEmbed(
            model="multilingual-e5-large",
            metric="cosine",
            field_map={"text": "title", "description": "content"},
        )

        _, schema = self._translate("aws", "us-east-1", config)

        fields = schema["fields"]
        assert "title" in fields
        assert "content" in fields
        assert fields["title"]["type"] == "semantic_text"
        assert fields["content"]["type"] == "semantic_text"

    def test_embed_translation_missing_model_raises_error(self):
        """A dict embed without a model is rejected with ValueError."""
        config = {"field_map": {"text": "synopsis"}}

        with pytest.raises(ValueError, match="model is required"):
            self._translate("aws", "us-east-1", config)

    def test_embed_translation_missing_field_map_raises_error(self):
        """A dict embed without a field_map is rejected with ValueError."""
        config = {"model": "multilingual-e5-large"}

        with pytest.raises(ValueError, match="field_map is required"):
            self._translate("aws", "us-east-1", config)

    def test_embed_translation_empty_field_map_raises_error(self):
        """An empty field_map is rejected with ValueError."""
        from pinecone.db_control.models import IndexEmbed

        config = IndexEmbed(model="multilingual-e5-large", field_map={})

        with pytest.raises(ValueError, match="field_map must contain at least one mapping"):
            self._translate("aws", "us-east-1", config)

    def test_embed_translation_with_metric_enum(self):
        """A Metric enum on the embed ends up as its string value in the schema."""
        from pinecone.db_control.models import IndexEmbed

        config = IndexEmbed(
            model="multilingual-e5-large", metric=Metric.EUCLIDEAN, field_map={"text": "synopsis"}
        )

        _, schema = self._translate("aws", "us-east-1", config)

        assert schema["fields"]["synopsis"]["metric"] == "euclidean"

    def test_embed_translation_multiple_fields_independent_copies(self):
        """Parameter dicts are copied per field rather than shared between fields."""
        from pinecone.db_control.models import IndexEmbed

        config = IndexEmbed(
            model="multilingual-e5-large",
            metric="cosine",
            field_map={"text": "title", "description": "content"},
            read_parameters={"input_type": "search_query"},
            write_parameters={"input_type": "search_document"},
        )

        _, schema = self._translate("aws", "us-east-1", config)

        title_field = schema["fields"]["title"]
        content_field = schema["fields"]["content"]

        # Both fields carry the same parameter values...
        assert title_field["read_parameters"] == {"input_type": "search_query"}
        assert content_field["read_parameters"] == {"input_type": "search_query"}

        # ...but as distinct dict objects, not shared references.
        assert title_field["read_parameters"] is not content_field["read_parameters"]
        assert title_field["write_parameters"] is not content_field["write_parameters"]

        # Mutating one field's parameters must leave the other untouched.
        title_field["read_parameters"]["extra"] = "value"
        assert "extra" not in content_field["read_parameters"]