-
Notifications
You must be signed in to change notification settings - Fork 115
feat: add request translator for IndexEmbed to semantic_text #588
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -492,6 +492,112 @@ def _translate_legacy_request( | |
|
|
||
| return deployment_dict, schema_dict | ||
|
|
||
| @staticmethod | ||
| def _translate_embed_to_semantic_text( | ||
| cloud: CloudProvider | str, | ||
| region: AwsRegion | GcpRegion | AzureRegion | str, | ||
| embed: IndexEmbed | CreateIndexForModelEmbedTypedDict, | ||
| ) -> tuple[dict[str, Any], dict[str, Any]]: | ||
| """Translate embed-based request to deployment + schema format. | ||
|
|
||
| This method converts `create_index_for_model` parameters (cloud, region, embed) | ||
| to the new API format using deployment and schema structures with a semantic_text | ||
| field type. | ||
|
|
||
| :param cloud: The cloud provider (aws, gcp, azure). | ||
| :param region: The cloud region. | ||
| :param embed: The IndexEmbed configuration or equivalent dict. | ||
| :returns: A tuple of (deployment_dict, schema_dict) for the new API format. | ||
|
|
||
| **Translation Example:** | ||
|
|
||
| * Input: ``cloud="aws"``, ``region="us-east-1"``, | ||
| ``embed=IndexEmbed(model="multilingual-e5-large", metric="cosine", field_map={"text": "synopsis"})`` | ||
| * Output deployment: ``{"deployment_type": "serverless", "cloud": "aws", "region": "us-east-1"}`` | ||
| * Output schema: ``{"fields": {"synopsis": {"type": "semantic_text", "model": "multilingual-e5-large", ...}}}`` | ||
|
|
||
| Example:: | ||
|
|
||
| deployment, schema = PineconeDBControlRequestFactory._translate_embed_to_semantic_text( | ||
| cloud="aws", | ||
| region="us-east-1", | ||
| embed=IndexEmbed( | ||
| model="multilingual-e5-large", | ||
| metric="cosine", | ||
| field_map={"text": "synopsis"} | ||
| ) | ||
| ) | ||
| """ | ||
| # Convert enum values to strings | ||
| cloud = convert_enum_to_string(cloud) | ||
| region = convert_enum_to_string(region) | ||
|
|
||
| # Create ServerlessDeployment for cloud/region | ||
| deployment = ServerlessDeployment(cloud=cloud, region=region) | ||
| deployment_dict = deployment.to_dict() | ||
|
|
||
| # Parse embed configuration | ||
| model: str | ||
| metric: str | None | ||
| field_map: dict[str, str] | ||
| read_parameters: dict[str, Any] | None | ||
| write_parameters: dict[str, Any] | None | ||
|
|
||
| if isinstance(embed, IndexEmbed): | ||
| model = embed.model | ||
| metric = embed.metric | ||
| field_map = embed.field_map | ||
| read_parameters = embed.read_parameters | ||
| write_parameters = embed.write_parameters | ||
| else: | ||
| # Dict-based embed | ||
| raw_model = embed.get("model") | ||
| if raw_model is None: | ||
| raise ValueError("model is required in embed") | ||
| model = convert_enum_to_string(raw_model) | ||
| raw_metric = embed.get("metric") | ||
| metric = convert_enum_to_string(raw_metric) if raw_metric is not None else None | ||
| raw_field_map = embed.get("field_map") | ||
| if raw_field_map is None: | ||
| raise ValueError("field_map is required in embed") | ||
| field_map = raw_field_map | ||
| read_parameters = embed.get("read_parameters") | ||
| write_parameters = embed.get("write_parameters") | ||
|
|
||
| # Extract field name from field_map values | ||
| # field_map is like {"text": "synopsis"} where "synopsis" is the target field name | ||
| if not field_map: | ||
| raise ValueError("field_map must contain at least one mapping") | ||
|
|
||
| # Build schema with semantic_text fields | ||
| schema_dict: dict[str, Any] = {"fields": {}} | ||
|
|
||
| for _, target_field in field_map.items(): | ||
| # Build the semantic_text field configuration | ||
| field_config: dict[str, Any] = {"type": "semantic_text", "model": model} | ||
|
|
||
| # Include metric if provided | ||
| if metric is not None: | ||
| field_config["metric"] = convert_enum_to_string(metric) | ||
|
|
||
| # Apply default read_parameters if not provided or empty | ||
| # Use dict() to create a copy to avoid shared references across fields | ||
| if read_parameters: | ||
| field_config["read_parameters"] = dict(read_parameters) | ||
| else: | ||
| field_config["read_parameters"] = {"input_type": "query"} | ||
|
|
||
| # Apply default write_parameters if not provided or empty | ||
| # Use dict() to create a copy to avoid shared references across fields | ||
| if write_parameters: | ||
| field_config["write_parameters"] = dict(write_parameters) | ||
| else: | ||
| field_config["write_parameters"] = {"input_type": "passage"} | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| schema_dict["fields"][target_field] = field_config | ||
|
|
||
| return deployment_dict, schema_dict | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. New function defined but never called in production codeLow Severity The |
||
|
|
||
| @staticmethod | ||
| def create_index_request( | ||
| name: str, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Redundant enum conversion call on already-converted metric
Low Severity
The
convert_enum_to_string(metric)call at line 581 is redundant becausemetricis already guaranteed to be a string (orNone) at this point. ForIndexEmbedobjects, themetricattribute is converted to a string inIndexEmbed.__init__(line 62 of index_embed.py). For dict-based embeds, the conversion already happens at line 559. Callingconvert_enum_to_stringon an already-converted string value just returns it unchanged, adding unnecessary overhead.