pinecone_text.dense.jina_encoder

 1from typing import Union, List, Any, Optional
 2from pinecone_text.dense.base_dense_ecoder import BaseDenseEncoder
 3import os
 4import requests
 5
 6JINA_API_URL: str = "https://api.jina.ai/v1/embeddings"
 7
 8
 9class JinaEncoder(BaseDenseEncoder):
10    """
11    JinaAI's text embedding wrapper. See https://jina.ai/embeddings/
12
13    Note: You should provide an API key in the environment variable JINA_API_KEY.
14          Or you can pass it as argument to the constructor as `api_key`.
15    """
16
17    def __init__(
18        self,
19        model_name: str = "jina-embeddings-v2-base-en",
20        api_key: Optional[str] = None,
21        **kwargs: Any,
22    ):
23        """
24        Initialize the OpenAI encoder.
25
26        :param model_name: The name of the embedding model to use.
27        :param kwargs: Additional arguments
28        """
29        if api_key is None:
30            api_key = os.environ.get("JINA_API_KEY", None)
31
32        if api_key is None:
33            raise ValueError(
34                "JinaEncoder requires an API key to work. Please provide `api_key` argument or set `JINA_API_KEY` environment variable"
35            )
36        super().__init__()
37        self._model_name = model_name
38        self._session = requests.Session()
39        self._session.headers.update(
40            {
41                "Authorization": f"Bearer {api_key}",
42                "Accept-Encoding": "identity",
43                "Content-type": "application/json",
44            }
45        )
46
47    def encode_documents(
48        self, texts: Union[str, List[str]]
49    ) -> Union[List[float], List[List[float]]]:
50        return self._encode(texts)
51
52    def encode_queries(
53        self, texts: Union[str, List[str]]
54    ) -> Union[List[float], List[List[float]]]:
55        return self._encode(texts)
56
57    def _encode(
58        self, texts: Union[str, List[str]]
59    ) -> Union[List[float], List[List[float]]]:
60        if isinstance(texts, str):
61            texts_input = [texts]
62        elif isinstance(texts, list):
63            texts_input = texts
64        else:
65            raise ValueError(
66                f"texts must be a string or list of strings, got: {type(texts)}"
67            )
68
69        resp = self._session.post(  # type: ignore
70            JINA_API_URL, json={"input": texts_input, "model": self._model_name}
71        ).json()
72        if "data" not in resp:
73            raise RuntimeError(resp["detail"])
74
75        embeddings = resp["data"]
76
77        # Sort resulting embeddings by index
78        sorted_embeddings = sorted(embeddings, key=lambda e: e["index"])  # type: ignore
79
80        # Return just the embeddings
81        res = [result["embedding"] for result in sorted_embeddings]
82
83        if isinstance(texts, str):
84            res = res[0]
85        return res
class JinaEncoder(BaseDenseEncoder):
    """
    JinaAI's text embedding wrapper. See https://jina.ai/embeddings/

    Note: You should provide an API key in the environment variable JINA_API_KEY.
          Or you can pass it as argument to the constructor as `api_key`.
    """

    def __init__(
        self,
        model_name: str = "jina-embeddings-v2-base-en",
        api_key: Optional[str] = None,
        **kwargs: Any,
    ):
        """
        Initialize the Jina encoder.

        :param model_name: The name of the embedding model to use.
        :param api_key: The Jina API key. When omitted, the value of the
            `JINA_API_KEY` environment variable is used instead.
        :param kwargs: Additional arguments (accepted, but not used by this constructor).
        :raises ValueError: If no API key is passed and `JINA_API_KEY` is not set.
        """
        if api_key is None:
            api_key = os.environ.get("JINA_API_KEY", None)

        if api_key is None:
            raise ValueError(
                "JinaEncoder requires an API key to work. Please provide `api_key` argument or set `JINA_API_KEY` environment variable"
            )
        super().__init__()
        self._model_name = model_name
        # One session is kept for the lifetime of the encoder so the auth and
        # content headers are set once and sent with every request.
        self._session = requests.Session()
        self._session.headers.update(
            {
                "Authorization": f"Bearer {api_key}",
                "Accept-Encoding": "identity",
                "Content-type": "application/json",
            }
        )

    def encode_documents(
        self, texts: Union[str, List[str]]
    ) -> Union[List[float], List[List[float]]]:
        """
        Encode documents to dense vectors.

        Documents and queries are encoded the same way; both delegate to `_encode`.

        :param texts: A single document or a list of documents to encode.
        :return: One embedding for a single string, or a list of embeddings for a list.
        """
        return self._encode(texts)

    def encode_queries(
        self, texts: Union[str, List[str]]
    ) -> Union[List[float], List[List[float]]]:
        """
        Encode queries to dense vectors.

        Documents and queries are encoded the same way; both delegate to `_encode`.

        :param texts: A single query or a list of queries to encode.
        :return: One embedding for a single string, or a list of embeddings for a list.
        """
        return self._encode(texts)

    def _encode(
        self, texts: Union[str, List[str]]
    ) -> Union[List[float], List[List[float]]]:
        """
        Send the texts to the embeddings endpoint and return their embeddings.

        :param texts: A single string or a list of strings.
        :return: One embedding for a single string, or a list of embeddings
            (ordered by the `index` field of the response items) for a list.
        :raises ValueError: If `texts` is neither a string nor a list.
        :raises RuntimeError: If the response has no `data` field.
        """
        if isinstance(texts, str):
            texts_input = [texts]
        elif isinstance(texts, list):
            texts_input = texts
        else:
            raise ValueError(
                f"texts must be a string or list of strings, got: {type(texts)}"
            )

        resp = self._session.post(  # type: ignore
            JINA_API_URL, json={"input": texts_input, "model": self._model_name}
        ).json()
        if "data" not in resp:
            # Prefer the `detail` message when present; otherwise surface the
            # whole payload so an unexpected error shape still raises
            # RuntimeError instead of a KeyError.
            detail = resp.get("detail", resp) if isinstance(resp, dict) else resp
            raise RuntimeError(detail)

        embeddings = resp["data"]

        # Sort resulting embeddings by index
        sorted_embeddings = sorted(embeddings, key=lambda e: e["index"])  # type: ignore

        # Return just the embeddings
        res = [result["embedding"] for result in sorted_embeddings]

        # A single input string yields a single vector, not a one-element list.
        if isinstance(texts, str):
            res = res[0]
        return res

JinaAI's text embedding wrapper. See https://jina.ai/embeddings/

Note: Provide an API key through the JINA_API_KEY environment variable, or pass it to the constructor as the api_key argument.

JinaEncoder( model_name: str = 'jina-embeddings-v2-base-en', api_key: Optional[str] = None, **kwargs: Any)
18    def __init__(
19        self,
20        model_name: str = "jina-embeddings-v2-base-en",
21        api_key: Optional[str] = None,
22        **kwargs: Any,
23    ):
24        """
25        Initialize the OpenAI encoder.
26
27        :param model_name: The name of the embedding model to use.
28        :param kwargs: Additional arguments
29        """
30        if api_key is None:
31            api_key = os.environ.get("JINA_API_KEY", None)
32
33        if api_key is None:
34            raise ValueError(
35                "JinaEncoder requires an API key to work. Please provide `api_key` argument or set `JINA_API_KEY` environment variable"
36            )
37        super().__init__()
38        self._model_name = model_name
39        self._session = requests.Session()
40        self._session.headers.update(
41            {
42                "Authorization": f"Bearer {api_key}",
43                "Accept-Encoding": "identity",
44                "Content-type": "application/json",
45            }
46        )

Initialize the Jina encoder.

Parameters
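  • model_name: The name of the embedding model to use.
  • api_key: The Jina API key. If omitted, the JINA_API_KEY environment variable is used.
  • kwargs: Additional keyword arguments (accepted, but not used by this constructor).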
def encode_documents( self, texts: Union[str, List[str]]) -> Union[List[float], List[List[float]]]:
48    def encode_documents(
49        self, texts: Union[str, List[str]]
50    ) -> Union[List[float], List[List[float]]]:
51        return self._encode(texts)

Encode documents to dense vectors (for upsert to Pinecone).

Arguments:
  • texts: a single document (a string) or a list of documents (strings) to encode
def encode_queries( self, texts: Union[str, List[str]]) -> Union[List[float], List[List[float]]]:
53    def encode_queries(
54        self, texts: Union[str, List[str]]
55    ) -> Union[List[float], List[List[float]]]:
56        return self._encode(texts)

Encode queries to dense vectors.

Arguments:
  • texts: a single query (a string) or a list of queries (strings) to encode