pinecone_datasets.catalog
from datetime import datetime
import warnings
import os
import json
from ssl import SSLCertVerificationError
from typing import List, Optional, Union, Any, Dict
import s3fs
import gcsfs
from pydantic import BaseModel, ValidationError, Field
import pandas as pd

from pinecone_datasets import cfg
from pinecone_datasets.fs import get_cloud_fs


class DenseModelMetadata(BaseModel):
    name: str
    tokenizer: Optional[str]
    dimension: int


class SparseModelMetdata(BaseModel):
    name: Optional[str]
    tokenizer: Optional[str]


def get_time_now() -> str:
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")


class DatasetMetadata(BaseModel):
    name: str
    created_at: str
    documents: int
    queries: int
    source: Optional[str]
    license: Optional[str]
    bucket: Optional[str]
    task: Optional[str]
    dense_model: DenseModelMetadata
    sparse_model: Optional[SparseModelMetdata]
    description: Optional[str]
    tags: Optional[List[str]]
    args: Optional[Dict[str, Any]]

    @staticmethod
    def empty() -> "DatasetMetadata":
        return DatasetMetadata(
            name="",
            created_at=get_time_now(),
            documents=0,
            queries=0,
            dense_model=DenseModelMetadata(name="", dimension=0),
        )

    def is_empty(self) -> bool:
        return self.name == "" and self.documents == 0 and self.queries == 0


class Catalog(BaseModel):
    datasets: List[DatasetMetadata] = []

    @staticmethod
    def load(**kwargs) -> "Catalog":
        # Base path of the public catalog; can be overridden via the env var.
        public_datasets_base_path = os.environ.get(
            "DATASETS_CATALOG_BASEPATH", cfg.Storage.endpoint
        )
        fs = get_cloud_fs(public_datasets_base_path, **kwargs)
        if not fs:
            raise ValueError(
                "Public datasets are only supported on cloud storage, with valid s3:// or gs:// paths"
            )
        collected_datasets = []
        try:
            for f in fs.listdir(public_datasets_base_path):
                if f["type"] == "directory":
                    try:
                        prefix = "gs" if isinstance(fs, gcsfs.GCSFileSystem) else "s3"
                        # Name the handle `metadata_file` so it does not shadow the
                        # directory entry `f`, which the warnings below reference.
                        with fs.open(
                            f"{prefix}://{f['name']}/metadata.json"
                        ) as metadata_file:
                            try:
                                this_dataset_json = json.load(metadata_file)
                            except json.JSONDecodeError:
                                warnings.warn(
                                    f"Invalid metadata.json for {f['name']} (not valid JSON), skipping"
                                )
                                # Skip this dataset; there is no parsed JSON to validate.
                                continue
                            try:
                                this_dataset = DatasetMetadata(**this_dataset_json)
                                collected_datasets.append(this_dataset)
                            except ValidationError:
                                warnings.warn(
                                    f"metadata file for dataset {f['name']} is not valid, skipping"
                                )
                    except FileNotFoundError:
                        pass
            return Catalog(datasets=collected_datasets)
        except SSLCertVerificationError:
            raise ValueError("There is an issue with loading the public catalog")

    def list_datasets(self, as_df: bool) -> Union[List[str], pd.DataFrame]:
        if as_df:
            return pd.DataFrame([ds.dict() for ds in self.datasets])
        else:
            return [dataset.name for dataset in self.datasets]
class DenseModelMetadata(pydantic.main.BaseModel):
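A minimal construction sketch for this model; the embedding-model name and dimension below are illustrative placeholders, not values taken from the public catalog. tokenizer is optional, so pydantic defaults it to None.

    # Hypothetical values, for illustration only.
    dense = DenseModelMetadata(name="text-embedding-ada-002", dimension=1536)
    print(dense.dict())
    # {'name': 'text-embedding-ada-002', 'tokenizer': None, 'dimension': 1536}

    # Omitting a required field raises pydantic.ValidationError:
    # DenseModelMetadata(name="no-dimension")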
class SparseModelMetdata(pydantic.main.BaseModel):
def get_time_now() -> str:
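get_time_now returns the current local time with microsecond precision, in the exact format that DatasetMetadata.empty() stores in created_at:

    stamp = get_time_now()
    print(stamp)  # e.g. '2024-01-02 13:45:30.123456'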
class DatasetMetadata(pydantic.main.BaseModel):
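A short round-trip sketch mirroring what Catalog.load does with each metadata.json it finds; every field value here is made up for illustration. Optional fields (source, license, tags, and so on) may be omitted and default to None.

    # Illustrative catalog entry, not a real public dataset.
    raw = {
        "name": "example-dataset",
        "created_at": get_time_now(),
        "documents": 100_000,
        "queries": 1_000,
        "dense_model": {"name": "example-dense-model", "dimension": 768},
        "sparse_model": {"name": "example-sparse-model"},
    }
    meta = DatasetMetadata(**raw)  # raises ValidationError if a required field is missing
    assert not meta.is_empty()

    placeholder = DatasetMetadata.empty()  # zeroed-out sentinel
    assert placeholder.is_empty()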
class Catalog(pydantic.main.BaseModel):
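A hedged usage sketch: Catalog.load lists the directories under the catalog base path (cfg.Storage.endpoint, unless the DATASETS_CATALOG_BASEPATH environment variable overrides it), reads each directory's metadata.json, and warns about and skips entries that are missing or fail validation. The bucket path below is a placeholder; keyword arguments are forwarded to the underlying filesystem via get_cloud_fs.

    import os
    from pinecone_datasets.catalog import Catalog

    # Optional override; "s3://my-bucket/my-datasets" is a placeholder path.
    os.environ["DATASETS_CATALOG_BASEPATH"] = "s3://my-bucket/my-datasets"

    catalog = Catalog.load()
    print(catalog.list_datasets(as_df=False))  # e.g. ['dataset-a', 'dataset-b']

    df = catalog.list_datasets(as_df=True)  # DataFrame, one row per DatasetMetadata
    print(df[["name", "documents", "queries"]].head())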