pinecone_datasets.catalog

from datetime import datetime
import warnings
import os
import json
from ssl import SSLCertVerificationError
from typing import List, Optional, Union, Any, Dict
import s3fs
import gcsfs
from pydantic import BaseModel, ValidationError, Field
import pandas as pd

from pinecone_datasets import cfg
from pinecone_datasets.fs import get_cloud_fs


class DenseModelMetadata(BaseModel):
    name: str
    tokenizer: Optional[str]
    dimension: int


class SparseModelMetdata(BaseModel):
    name: Optional[str]
    tokenizer: Optional[str]


def get_time_now() -> str:
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")


class DatasetMetadata(BaseModel):
    name: str
    created_at: str
    documents: int
    queries: int
    source: Optional[str]
    license: Optional[str]
    bucket: Optional[str]
    task: Optional[str]
    dense_model: DenseModelMetadata
    sparse_model: Optional[SparseModelMetdata]
    description: Optional[str]
    tags: Optional[List[str]]
    args: Optional[Dict[str, Any]]

    @staticmethod
    def empty() -> "DatasetMetadata":
        return DatasetMetadata(
            name="",
            created_at=get_time_now(),
            documents=0,
            queries=0,
            dense_model=DenseModelMetadata(name="", dimension=0),
        )

    def is_empty(self) -> bool:
        return self.name == "" and self.documents == 0 and self.queries == 0


class Catalog(BaseModel):
    datasets: List[DatasetMetadata] = []

    @staticmethod
    def load(**kwargs) -> "Catalog":
        public_datasets_base_path = os.environ.get(
            "DATASETS_CATALOG_BASEPATH", cfg.Storage.endpoint
        )
        fs = get_cloud_fs(public_datasets_base_path, **kwargs)
        if not fs:
            raise ValueError(
                "Public datasets are only supported on cloud storage, with valid s3:// or gs:// paths"
            )
        collected_datasets = []
        try:
            for entry in fs.listdir(public_datasets_base_path):
                if entry["type"] == "directory":
                    try:
                        prefix = "gs" if isinstance(fs, gcsfs.GCSFileSystem) else "s3"
                        # Use a distinct name for the file handle so it does not
                        # shadow the directory entry used in the warnings below.
                        with fs.open(f"{prefix}://{entry['name']}/metadata.json") as metadata_file:
                            try:
                                this_dataset_json = json.load(metadata_file)
                            except json.JSONDecodeError:
                                warnings.warn(
                                    f"Not a JSON: Invalid metadata.json for {entry['name']}, skipping"
                                )
                                # Skip this dataset; otherwise validation below
                                # would run against undefined data.
                                continue
                            try:
                                this_dataset = DatasetMetadata(**this_dataset_json)
                                collected_datasets.append(this_dataset)
                            except ValidationError:
                                warnings.warn(
                                    f"metadata file for dataset: {entry['name']} is not valid, skipping"
                                )
                    except FileNotFoundError:
                        # Directories without a metadata.json are not datasets.
                        pass
            return Catalog(datasets=collected_datasets)
        except SSLCertVerificationError:
            raise ValueError("There is an issue with loading the public catalog")

    def list_datasets(self, as_df: bool) -> Union[List[str], pd.DataFrame]:
        if as_df:
            return pd.DataFrame([ds.dict() for ds in self.datasets])
        else:
            return [dataset.name for dataset in self.datasets]
class DenseModelMetadata(pydantic.main.BaseModel):
    name: str
    tokenizer: Optional[str]
    dimension: int
class SparseModelMetdata(pydantic.main.BaseModel):
    name: Optional[str]
    tokenizer: Optional[str]
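Both model-metadata classes are ordinary pydantic models. A minimal construction sketch, with illustrative field values (the model name and dimension are placeholders, not recommendations):

from pinecone_datasets.catalog import DenseModelMetadata, SparseModelMetdata

# Placeholder values; tokenizer is Optional and may be omitted.
dense = DenseModelMetadata(
    name="all-MiniLM-L6-v2",
    tokenizer=None,
    dimension=384,
)
sparse = SparseModelMetdata(name="bm25")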
def get_time_now() -> str:
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
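The return value is a plain strftime-formatted string with microsecond precision; an illustrative call (the output below is a sample, not a fixed value):

>>> get_time_now()
'2023-01-01 12:00:00.000000'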
class DatasetMetadata(pydantic.main.BaseModel):
    name: str
    created_at: str
    documents: int
    queries: int
    source: Optional[str]
    license: Optional[str]
    bucket: Optional[str]
    task: Optional[str]
    dense_model: DenseModelMetadata
    sparse_model: Optional[SparseModelMetdata]
    description: Optional[str]
    tags: Optional[List[str]]
    args: Optional[Dict[str, Any]]
@staticmethod
def empty() -> pinecone_datasets.catalog.DatasetMetadata:
def is_empty(self) -> bool:
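Building a catalog entry by hand is a matter of filling the required fields; every Optional field defaults to None under pydantic v1. A sketch with placeholder names and counts:

from pinecone_datasets.catalog import (
    DatasetMetadata,
    DenseModelMetadata,
    get_time_now,
)

# All names and counts below are illustrative placeholders.
meta = DatasetMetadata(
    name="my-dataset",
    created_at=get_time_now(),
    documents=10_000,
    queries=100,
    dense_model=DenseModelMetadata(name="my-encoder", dimension=384),
)

assert not meta.is_empty()
assert DatasetMetadata.empty().is_empty()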
class Catalog(pydantic.main.BaseModel):
    datasets: List[DatasetMetadata] = []
@staticmethod
def load(**kwargs) -> pinecone_datasets.catalog.Catalog:
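load() resolves the catalog root from the DATASETS_CATALOG_BASEPATH environment variable, falling back to cfg.Storage.endpoint, and forwards any keyword arguments to get_cloud_fs. A sketch of pointing it at a different bucket (the bucket name is hypothetical):

import os

os.environ["DATASETS_CATALOG_BASEPATH"] = "s3://my-datasets-bucket"  # hypothetical bucket

from pinecone_datasets.catalog import Catalog

catalog = Catalog.load()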
def list_datasets(self, as_df: bool) -> Union[List[str], pandas.core.frame.DataFrame]:
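Typical use pairs load() with list_datasets(). A minimal sketch; the column selection assumes the DataFrame keys match the DatasetMetadata field names, which is what ds.dict() produces:

from pinecone_datasets.catalog import Catalog

catalog = Catalog.load()

# Dataset names only:
for name in catalog.list_datasets(as_df=False):
    print(name)

# One row of metadata per dataset:
df = catalog.list_datasets(as_df=True)
print(df[["name", "documents", "queries"]])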