diff --git a/pinecone/db_data/index.py b/pinecone/db_data/index.py
index 537dac94..08c1afcc 100644
--- a/pinecone/db_data/index.py
+++ b/pinecone/db_data/index.py
@@ -10,6 +10,7 @@
 
 from pinecone.openapi_support import ApiClient
 from pinecone.core.openapi.db_data.api.vector_operations_api import VectorOperationsApi
+from pinecone.core.openapi.db_data.api.document_operations_api import DocumentOperationsApi
 from pinecone.core.openapi.db_data import API_VERSION
 from pinecone.core.openapi.db_data.models import (
     QueryResponse as OpenAPIQueryResponse,
@@ -30,6 +31,10 @@
     QueryResponse,
     UpsertResponse,
     UpdateResponse,
+    TextQuery,
+    VectorQuery,
+    DocumentSearchResponse,
+    Document,
 )
 from .interfaces import IndexInterface
 from .request_factory import IndexRequestFactory
@@ -158,6 +163,9 @@ class Index(PluginAware, IndexInterface):
     _namespace_resource: "NamespaceResource" | None
     """ :meta private: """
 
+    _document_api: DocumentOperationsApi | None
+    """ :meta private: """
+
     def __init__(
         self,
         api_key: str,
@@ -202,6 +210,9 @@ def __init__(
         self._namespace_resource = None
         """ :meta private: """
 
+        self._document_api = None
+        """ :meta private: """
+
         # Pass the same api_client to the ImportFeatureMixin
         super().__init__(api_client=self._api_client)
 
@@ -253,6 +264,13 @@ def namespace(self) -> "NamespaceResource":
             )
         return self._namespace_resource
 
+    @property
+    def document_api(self) -> DocumentOperationsApi:
+        """:meta private:"""
+        if self._document_api is None:
+            self._document_api = DocumentOperationsApi(api_client=self._api_client)
+        return self._document_api
+
     def _openapi_kwargs(self, kwargs: dict[str, Any]) -> dict[str, Any]:
         return filter_dict(kwargs, OPENAPI_ENDPOINT_PARAMS)
 
@@ -800,6 +818,105 @@ def search_records(
         """
         return self.search(namespace, query=query, rerank=rerank, fields=fields)
 
+    @validate_and_convert_errors
+    def search_documents(
+        self,
+        namespace: str,
+        score_by: TextQuery | VectorQuery,
+        filter: FilterTypedDict | None = None,
+        include_fields: list[str] | None = None,
+        top_k: int = 10,
+    ) -> DocumentSearchResponse:
+        """Search for documents in a namespace.
+
+        This operation searches a namespace using text or vector queries and returns
+        matching documents with their scores.
+
+        Args:
+            namespace: The namespace to search in.
+            score_by: A :class:`~pinecone.TextQuery` or :class:`~pinecone.VectorQuery`
+                object defining how to rank results.
+            filter: Optional metadata filter. Supports ``$text_match`` for FTS filtering. [optional]
+            include_fields: Optional list of fields to include in results. Use ``["*"]``
+                to return all fields. [optional]
+            top_k: Number of results to return. Defaults to 10.
+
+        Returns:
+            DocumentSearchResponse: Response containing matching documents and usage info.
+
+        Raises:
+            ValueError: If ``namespace`` is ``None``.
+
+        Examples:
+
+        .. code-block:: python
+
+            from pinecone import Pinecone, text_query, vector_query
+
+            pc = Pinecone()
+            index = pc.Index(host="example-index-host")
+
+            # Simple text search
+            results = index.search_documents(
+                namespace="movies",
+                score_by=text_query("title", 'return "pink panther"'),
+                filter={"genre": {"$eq": "comedy"}},
+                top_k=10,
+            )
+
+            # Access results
+            for doc in results.documents:
+                print(f"{doc.id}: {doc.score}")
+                print(f"Title: {doc.title}")
+
+            # Vector search with text filter
+            results = index.search_documents(
+                namespace="logs",
+                score_by=vector_query("embedding", values=[0.1, 0.2, 0.3]),
+                filter={
+                    "service": {"$eq": "payment-gateway"},
+                    "message": {"$text_match": '+error +\"connection refused\"'},
+                },
+                include_fields=["message", "timestamp"],
+                top_k=10,
+            )
+
+        """
+        if namespace is None:
+            raise ValueError("Namespace is required when searching documents")
+
+        request = IndexRequestFactory.search_documents_request(
+            score_by=score_by, top_k=top_k, filter=filter, include_fields=include_fields
+        )
+
+        result = self.document_api.search_documents(namespace, request)
+
+        # Convert OpenAPI response to our dataclass
+        documents: list[Document] = []
+        if hasattr(result, "documents") and result.documents:
+            for doc in result.documents:
+                # Pop id/score (tolerating the "_id"/"_score" aliases); the rest become fields
+                doc_dict = doc.to_dict() if hasattr(doc, "to_dict") else dict(doc)
+                doc_id = doc_dict.pop("id", doc_dict.pop("_id", ""))
+                score = doc_dict.pop("score", doc_dict.pop("_score", 0.0))
+                documents.append(Document(id=doc_id, score=score, **doc_dict))
+
+        # Extract usage info
+        usage = getattr(result, "usage", None)
+
+        # Extract response info
+        from pinecone.utils.response_info import extract_response_info
+
+        response_info = None
+        if hasattr(result, "_response_info"):
+            response_info = result._response_info
+        if response_info is None:
+            response_info = extract_response_info({})
+
+        return DocumentSearchResponse(
+            documents=documents, usage=usage, _response_info=response_info
+        )
+
     @validate_and_convert_errors
     def delete(
         self,
diff --git a/pinecone/db_data/index_asyncio.py b/pinecone/db_data/index_asyncio.py
index 0cb24342..1db2815f 100644
--- a/pinecone/db_data/index_asyncio.py
+++ b/pinecone/db_data/index_asyncio.py
@@ -16,6 +16,7 @@
 
 from pinecone.openapi_support import AsyncioApiClient
 from pinecone.core.openapi.db_data.api.vector_operations_api import AsyncioVectorOperationsApi
+from pinecone.core.openapi.db_data.api.document_operations_api import AsyncioDocumentOperationsApi
 from pinecone.core.openapi.db_data import API_VERSION
 from pinecone.core.openapi.db_data.models import (
     QueryResponse as OpenAPIQueryResponse,
@@ -56,6 +57,10 @@
     QueryResponse,
     UpsertResponse,
     UpdateResponse,
+    TextQuery,
+    VectorQuery,
+    DocumentSearchResponse,
+    Document,
 )
 
 from pinecone.openapi_support import OPENAPI_ENDPOINT_PARAMS
@@ -187,6 +192,9 @@ async def main():
     _namespace_resource: "NamespaceResourceAsyncio" | None
     """ :meta private: """
 
+    _document_api: AsyncioDocumentOperationsApi | None
+    """ :meta private: """
+
     def __init__(
         self,
         api_key: str,
@@ -224,6 +232,9 @@ def __init__(
         self._namespace_resource = None
         """ :meta private: """
 
+        self._document_api = None
+        """ :meta private: """
+
     async def __aenter__(self) -> Self:
         return self
 
@@ -304,6 +315,13 @@ def namespace(self) -> "NamespaceResourceAsyncio":
             self._namespace_resource = NamespaceResourceAsyncio(api_client=self._api_client)
         return self._namespace_resource
 
+    @property
+    def document_api(self) -> AsyncioDocumentOperationsApi:
+        """:meta private:"""
+        if self._document_api is None:
+            self._document_api = AsyncioDocumentOperationsApi(api_client=self._api_client)
+        return self._document_api
+
     @validate_and_convert_errors
     async def upsert(
         self,
@@ -795,6 +813,95 @@ async def search_records(
     ) -> SearchRecordsResponse:
         return await self.search(namespace, query=query, rerank=rerank, fields=fields)
 
+    @validate_and_convert_errors
+    async def search_documents(
+        self,
+        namespace: str,
+        score_by: TextQuery | VectorQuery,
+        filter: dict[str, Any] | None = None,
+        include_fields: list[str] | None = None,
+        top_k: int = 10,
+    ) -> DocumentSearchResponse:
+        """Search for documents in a namespace.
+
+        This operation searches a namespace using text or vector queries and returns
+        matching documents with their scores.
+
+        Args:
+            namespace: The namespace to search in.
+            score_by: A :class:`~pinecone.TextQuery` or :class:`~pinecone.VectorQuery`
+                object defining how to rank results.
+            filter: Optional metadata filter. Supports ``$text_match`` for FTS filtering. [optional]
+            include_fields: Optional list of fields to include in results. Use ``["*"]``
+                to return all fields. [optional]
+            top_k: Number of results to return. Defaults to 10.
+
+        Returns:
+            DocumentSearchResponse: Response containing matching documents and usage info.
+
+        Raises:
+            ValueError: If ``namespace`` is ``None``.
+
+        Examples:
+
+        .. code-block:: python
+
+            import asyncio
+            from pinecone import Pinecone, text_query, vector_query
+
+            async def main():
+                pc = Pinecone()
+                async with pc.IndexAsyncio(host="example-index-host") as index:
+                    # Simple text search
+                    results = await index.search_documents(
+                        namespace="movies",
+                        score_by=text_query("title", 'return "pink panther"'),
+                        filter={"genre": {"$eq": "comedy"}},
+                        top_k=10,
+                    )
+
+                    # Access results
+                    for doc in results.documents:
+                        print(f"{doc.id}: {doc.score}")
+
+            asyncio.run(main())
+
+        """
+        if namespace is None:
+            raise ValueError("Namespace is required when searching documents")
+
+        request = IndexRequestFactory.search_documents_request(
+            score_by=score_by, top_k=top_k, filter=filter, include_fields=include_fields
+        )
+
+        result = await self.document_api.search_documents(namespace, request)
+
+        # Convert OpenAPI response to our dataclass
+        documents: list[Document] = []
+        if hasattr(result, "documents") and result.documents:
+            for doc in result.documents:
+                # Pop id/score (tolerating the "_id"/"_score" aliases); the rest become fields
+                doc_dict = doc.to_dict() if hasattr(doc, "to_dict") else dict(doc)
+                doc_id = doc_dict.pop("id", doc_dict.pop("_id", ""))
+                score = doc_dict.pop("score", doc_dict.pop("_score", 0.0))
+                documents.append(Document(id=doc_id, score=score, **doc_dict))
+
+        # Extract usage info
+        usage = getattr(result, "usage", None)
+
+        # Extract response info
+        from pinecone.utils.response_info import extract_response_info
+
+        response_info = None
+        if hasattr(result, "_response_info"):
+            response_info = result._response_info
+        if response_info is None:
+            response_info = extract_response_info({})
+
+        return DocumentSearchResponse(
+            documents=documents, usage=usage, _response_info=response_info
+        )
+
     def _openapi_kwargs(self, kwargs: dict[str, Any]) -> dict[str, Any]:
         return filter_dict(kwargs, OPENAPI_ENDPOINT_PARAMS)
 
diff --git a/pinecone/db_data/interfaces.py b/pinecone/db_data/interfaces.py
index a8224de1..e2f19efe 100644
--- a/pinecone/db_data/interfaces.py
+++ b/pinecone/db_data/interfaces.py
@@ -32,6 +32,9 @@
     UpdateResponse,
     SparseValues,
     Vector,
+    TextQuery,
+    VectorQuery,
+    DocumentSearchResponse,
 )
 from pinecone.utils import require_kwargs
 
@@ -453,6 +456,57 @@ def search_records(
         """
         pass
 
+    @abstractmethod
+    def search_documents(
+        self,
+        namespace: str,
+        score_by: TextQuery | VectorQuery,
+        filter: FilterTypedDict | None = None,
+        include_fields: list[str] | None = None,
+        top_k: int = 10,
+    ) -> DocumentSearchResponse:
+        """Search for documents in a namespace.
+
+        This operation searches a namespace using text or vector queries and returns
+        matching documents with their scores.
+
+        Args:
+            namespace: The namespace to search in.
+            score_by: A :class:`~pinecone.TextQuery` or :class:`~pinecone.VectorQuery`
+                object defining how to rank results.
+            filter: Optional metadata filter. Supports ``$text_match`` for FTS filtering. [optional]
+            include_fields: Optional list of fields to include in results. Use ``["*"]``
+                to return all fields. [optional]
+            top_k: Number of results to return. Defaults to 10.
+
+        Returns:
+            DocumentSearchResponse: Response containing matching documents and usage info.
+
+        Examples:
+
+        .. code-block:: python
+
+            from pinecone import Pinecone, text_query, vector_query
+
+            pc = Pinecone()
+            index = pc.Index(host="example-index-host")
+
+            # Simple text search
+            results = index.search_documents(
+                namespace="movies",
+                score_by=text_query("title", 'return "pink panther"'),
+                filter={"genre": {"$eq": "comedy"}},
+                top_k=10,
+            )
+
+            # Access results
+            for doc in results.documents:
+                print(f"{doc.id}: {doc.score}")
+                print(f"Title: {doc.title}")
+
+        """
+        pass
+
     @abstractmethod
     def delete(
         self,
diff --git a/pinecone/db_data/request_factory.py b/pinecone/db_data/request_factory.py
index c40c3e09..cfaad4b8 100644
--- a/pinecone/db_data/request_factory.py
+++ b/pinecone/db_data/request_factory.py
@@ -18,6 +18,7 @@
     SearchRecordsVector,
     UpsertRecord,
     Vector as OpenApiVector,
+    DocumentSearchRequest,
 )
 from ..utils import parse_non_empty_args, convert_enum_to_string
 from .vector_factory import VectorFactory
@@ -35,7 +36,15 @@
     SearchQueryVectorTypedDict,
 )
 
-from .dataclasses import Vector, SparseValues, SearchQuery, SearchRerank, SearchQueryVector
+from .dataclasses import (
+    Vector,
+    SparseValues,
+    SearchQuery,
+    SearchRerank,
+    SearchQueryVector,
+    TextQuery,
+    VectorQuery,
+)
 
 logger = logging.getLogger(__name__)
 """ :meta private: """
@@ -325,3 +334,36 @@ def upsert_records_args(namespace: str, records: list[dict[str, Any]]) -> dict[s
             )
 
         return {"namespace": namespace, "upsert_record": records_to_upsert}
+
+    @staticmethod
+    def search_documents_request(
+        score_by: TextQuery | VectorQuery,
+        top_k: int = 10,
+        filter: FilterTypedDict | None = None,
+        include_fields: list[str] | None = None,
+    ) -> DocumentSearchRequest:
+        """Build a DocumentSearchRequest for the search_documents API.
+
+        :param score_by: TextQuery or VectorQuery; sent as a one-item ``score_by`` list.
+        :param top_k: Number of results to return (default 10).
+        :param filter: Optional metadata filter; left out of the request when ``None``.
+        :param include_fields: Optional field names to return; ``["*"]`` is sent as ``"*"``.
+        :returns: DocumentSearchRequest for the API call, built with ``_check_type=False``.
+        """
+        # Convert score_by to the API format
+        score_by_dict = score_by.as_dict()
+
+        request_args: dict[str, Any] = {"top_k": top_k, "score_by": [score_by_dict]}
+
+        if filter is not None:
+            request_args["filter"] = filter
+
+        # Handle include_fields - API accepts "*" string or list of field names
+        if include_fields is not None:
+            if include_fields == ["*"]:
+                request_args["include_fields"] = "*"
+            else:
+                request_args["include_fields"] = include_fields
+
+        result: DocumentSearchRequest = DocumentSearchRequest(**request_args, _check_type=False)
+        return result
diff --git a/tests/unit/data/test_search_documents.py b/tests/unit/data/test_search_documents.py
new file mode 100644
index 00000000..8b3c3230
--- /dev/null
+++ b/tests/unit/data/test_search_documents.py
@@ -0,0 +1,181 @@
+"""Tests for search_documents functionality."""
+
+from unittest.mock import MagicMock
+
+from pinecone.db_data.request_factory import IndexRequestFactory
+from pinecone.db_data.dataclasses import (
+    TextQuery,
+    VectorQuery,
+    DocumentSearchResponse,
+    Document,
+    SparseValues,
+)
+from pinecone.core.openapi.db_data.models import DocumentSearchRequest
+
+
+class TestSearchDocumentsRequestFactory:
+    """Tests for the search_documents_request factory method."""
+
+    def test_text_query_basic(self):
+        """Test request creation with basic text query."""
+        request = IndexRequestFactory.search_documents_request(
+            score_by=TextQuery(field="title", query="pink panther"), top_k=10
+        )
+        assert isinstance(request, DocumentSearchRequest)
+        assert request.top_k == 10
+        assert request.score_by is not None
+        assert len(request.score_by) == 1
+        assert request.score_by[0]["type"] == "text"
+        assert request.score_by[0]["field"] == "title"
+        assert request.score_by[0]["text_query"] == "pink panther"
+
+    def test_text_query_with_boost_and_slop(self):
+        """Test request creation with text query boost and slop."""
+        request = IndexRequestFactory.search_documents_request(
+            score_by=TextQuery(field="title", query="pink panther", boost=2.0, slop=3), top_k=5
+        )
+        assert request.top_k == 5
+        assert request.score_by[0]["boost"] == 2.0
+        assert request.score_by[0]["slop"] == 3
+
+    def test_vector_query_dense(self):
+        """Test request creation with dense vector query."""
+        request = IndexRequestFactory.search_documents_request(
+            score_by=VectorQuery(field="embedding", values=[0.1, 0.2, 0.3]), top_k=20
+        )
+        assert isinstance(request, DocumentSearchRequest)
+        assert request.top_k == 20
+        assert request.score_by is not None
+        assert len(request.score_by) == 1
+        assert request.score_by[0]["type"] == "vector"
+        assert request.score_by[0]["field"] == "embedding"
+        assert request.score_by[0]["values"] == [0.1, 0.2, 0.3]
+
+    def test_vector_query_sparse(self):
+        """Test request creation with sparse vector query."""
+        sparse = SparseValues(indices=[1, 5, 10], values=[0.5, 0.3, 0.2])
+        request = IndexRequestFactory.search_documents_request(
+            score_by=VectorQuery(field="sparse_embedding", sparse_values=sparse), top_k=10
+        )
+        assert request.score_by[0]["type"] == "vector"
+        assert request.score_by[0]["field"] == "sparse_embedding"
+        assert "sparse_values" in request.score_by[0]
+
+    def test_with_filter(self):
+        """Test request creation with metadata filter."""
+        request = IndexRequestFactory.search_documents_request(
+            score_by=TextQuery(field="title", query="panther"),
+            top_k=10,
+            filter={"genre": {"$eq": "comedy"}},
+        )
+        assert request.filter == {"genre": {"$eq": "comedy"}}
+
+    def test_with_include_fields_wildcard(self):
+        """Test request creation with wildcard include_fields."""
+        request = IndexRequestFactory.search_documents_request(
+            score_by=TextQuery(field="title", query="panther"), top_k=10, include_fields=["*"]
+        )
+        assert request.include_fields == "*"
+
+    def test_with_include_fields_list(self):
+        """Test request creation with specific include_fields."""
+        request = IndexRequestFactory.search_documents_request(
+            score_by=TextQuery(field="title", query="panther"),
+            top_k=10,
+            include_fields=["title", "year", "genre"],
+        )
+        assert request.include_fields == ["title", "year", "genre"]
+
+    def test_default_top_k(self):
+        """Test that default top_k is 10."""
+        request = IndexRequestFactory.search_documents_request(
+            score_by=TextQuery(field="title", query="panther")
+        )
+        assert request.top_k == 10
+
+    def test_with_all_parameters(self):
+        """Test request creation with all parameters."""
+        request = IndexRequestFactory.search_documents_request(
+            score_by=TextQuery(field="title", query="panther", boost=1.5),
+            top_k=25,
+            filter={"year": {"$gte": 2000}},
+            include_fields=["title", "year"],
+        )
+        assert request.top_k == 25
+        assert request.filter == {"year": {"$gte": 2000}}
+        assert request.include_fields == ["title", "year"]
+        assert request.score_by[0]["boost"] == 1.5
+
+
+class TestDocumentClass:
+    """Tests for the Document class."""
+
+    def test_document_creation(self):
+        """Test creating a Document with id, score, and fields."""
+        doc = Document(id="doc1", score=0.95, title="Test Title", year=2020)
+        assert doc.id == "doc1"
+        assert doc.score == 0.95
+        assert doc.title == "Test Title"
+        assert doc.year == 2020
+
+    def test_document_dict_access(self):
+        """Test dict-style access to Document fields."""
+        doc = Document(id="doc1", score=0.95, title="Test Title")
+        assert doc["id"] == "doc1"
+        assert doc["score"] == 0.95
+        assert doc["title"] == "Test Title"
+
+    def test_document_get_with_default(self):
+        """Test get() method with default value."""
+        doc = Document(id="doc1", score=0.95)
+        assert doc.get("title") is None
+        assert doc.get("title", "N/A") == "N/A"
+        assert doc.get("id") == "doc1"
+
+    def test_document_contains(self):
+        """Test 'in' operator for Document."""
+        doc = Document(id="doc1", score=0.95, title="Test")
+        assert "id" in doc
+        assert "score" in doc
+        assert "title" in doc
+        assert "missing" not in doc
+
+    def test_document_keys(self):
+        """Test keys() method."""
+        doc = Document(id="doc1", score=0.95, title="Test")
+        assert doc.keys() == ["id", "score", "title"]
+
+    def test_document_to_dict(self):
+        """Test to_dict() method."""
+        doc = Document(id="doc1", score=0.95, title="Test", year=2020)
+        result = doc.to_dict()
+        assert result == {"id": "doc1", "score": 0.95, "title": "Test", "year": 2020}
+
+
+class TestDocumentSearchResponse:
+    """Tests for the DocumentSearchResponse class."""
+
+    def test_response_creation(self):
+        """Test creating a DocumentSearchResponse."""
+        docs = [
+            Document(id="doc1", score=0.95, title="Title 1"),
+            Document(id="doc2", score=0.85, title="Title 2"),
+        ]
+        response = DocumentSearchResponse(documents=docs)
+        assert len(response.documents) == 2
+        assert response.documents[0].id == "doc1"
+        assert response.documents[1].id == "doc2"
+
+    def test_response_with_usage(self):
+        """Test DocumentSearchResponse with usage info."""
+        docs = [Document(id="doc1", score=0.95)]
+        # Mock usage object
+        usage = MagicMock()
+        usage.read_units = 5
+        response = DocumentSearchResponse(documents=docs, usage=usage)
+        assert response.usage.read_units == 5
+
+    def test_empty_response(self):
+        """Test DocumentSearchResponse with no documents."""
+        response = DocumentSearchResponse(documents=[])
+        assert len(response.documents) == 0