Page cover

Hdf5

A simple custom class to create, access, and destroy an HDF5 file.


import pandas as pd
import numpy as np
import h5py
import os
from pandas import HDFStore
from datetime import datetime,timedelta,date
from dataclasses import dataclass
from typing import Any
from collections import defaultdict



class MyHDF5:
    """Thin convenience wrapper around a ``pandas.HDFStore`` file.

    The store is opened in ``__init__`` and kept in ``self.cursor`` until
    ``close`` or ``destroy`` is called. Datasets are addressed by string
    keys, either through the named methods (``insert_df``, ``extract_data``,
    ``remove_data``, ``append_df``) or through the mapping-style dunder
    methods (``store[key]``, ``store[key] = df``, ``del store[key]``,
    ``key in store``, ``len(store)``).
    """

    # Working directory captured when the class body is executed. Kept as a
    # public attribute for backward compatibility; the class no longer
    # changes the process working directory to locate the file.
    current_path = os.getcwd()
    # Open pandas.HDFStore handle; assigned by __init__.
    cursor: Any = None
    # Compression level passed to pandas.HDFStore as ``complevel``.
    zip_level = 0
    # File name (``<name>.h5``) without any directory component.
    hdf5_name = ""
    # Path actually handed to pandas.HDFStore (directory + file name).
    file_path = ""

    def __init__(self, name: str, path: str = None, compressed_level: int = 0) -> None:
        """Open (creating if needed) the store ``<name>.h5``.

        Args:
            name: File name without the ``.h5`` extension.
            path: Directory that holds the file. When omitted or empty the
                file name is used as-is, i.e. relative to the working
                directory at call time.
            compressed_level: Value forwarded as ``complevel``; ``0`` keeps
                the class default ``zip_level``.
        """
        self.hdf5_name = f"{name}.h5"
        if compressed_level:
            self.zip_level = compressed_level
        # Build the full path instead of chdir-ing into ``path``: the store
        # is opened exactly once, in the requested directory, and the
        # process working directory is left alone.
        self.file_path = self._resolve_file_path(self.hdf5_name, path)
        self.cursor = pd.HDFStore(self.file_path, complevel=self.zip_level)
        print(self.cursor.info())

    @staticmethod
    def _resolve_file_path(file_name: str, path: str = None) -> str:
        """Return ``file_name`` joined onto ``path`` when a directory is given."""
        return os.path.join(path, file_name) if path else file_name

    @staticmethod
    def _describe_size(df) -> Any:
        """Return a size description of ``df`` for the insert log message."""
        # ``get_mem_usage`` is an optional module-level helper that is not
        # defined alongside this class; use it when the surrounding module
        # provides it, otherwise fall back to pandas' own accounting.
        helper = globals().get("get_mem_usage")
        if callable(helper):
            return helper(df)
        return f"{int(np.sum(df.memory_usage(deep=True)))} bytes"

    def insert_df(self, key, df, timestamp=False):
        """Store ``df`` under ``key``, replacing any dataset already there.

        Args:
            key: Key of the dataset inside the store.
            df: Object to store (passed to ``HDFStore.put`` as ``value``).
            timestamp: When truthy, the data is written to
                ``<key>/<YYYYmmdd_HHMMSS>`` using the current local time.

        Returns:
            The key the data was written to, or ``None`` when the write
            failed. Failures are reported on stdout rather than raised.
        """
        data_location = key
        if timestamp:
            time_label = datetime.now().strftime("%Y%m%d_%H%M%S")
            data_location = f"{key}/{time_label}"

        print("Inserting...")
        try:
            self.cursor.put(key=data_location, value=df)
        except Exception as e:
            # Inserting stays best-effort (no exception reaches the caller),
            # but the failure is now reported instead of silently dropped.
            print(f"Insert df failed, key: <<{data_location}>>, error: {e}")
            return None
        print(
            f"Insert df success, key: <<{data_location}>>, "
            f"dataset size: {self._describe_size(df)}"
        )
        return data_location

    def extract_data(self, key):
        """Return the dataset stored under ``key``."""
        return self.cursor[key]

    def remove_data(self, key):
        """Delete the dataset stored under ``key``.

        Raises:
            KeyError: If the store reports that ``key`` does not exist.
        """
        try:
            del self.cursor[key]
        except KeyError as err:
            raise KeyError(f"{key} not found") from err
        print(f"Remove dataset {key} success")

    def append_df(self, key, df: pd.DataFrame):
        """Concatenate ``df`` after the dataset at ``key`` and store the result.

        The existing dataset is read in full, concatenated with ``df`` via
        ``pd.concat`` and written back under the same key.
        """
        self.cursor[key] = pd.concat([self.cursor[key], df])

    def __repr__(self) -> str:
        """Return the store's ``info()`` text."""
        return self.cursor.info()

    def is_open(self) -> bool:
        """Return the ``is_open`` flag of the underlying store."""
        return self.cursor.is_open

    @property
    def keys(self) -> list:
        """Sorted list of the keys reported by the store."""
        return sorted(self.cursor.keys())

    def __getitem__(self, key: str) -> Any:
        """Return the dataset stored under ``key``."""
        return self.cursor[key]

    def __setitem__(self, key, value) -> None:
        """Store ``value`` under ``key``."""
        self.cursor[key] = value

    def __delitem__(self, key) -> None:
        """Delete the dataset stored under ``key``."""
        del self.cursor[key]

    def __contains__(self, key: str) -> bool:
        """Return whether ``key`` names a node in the store.

        ``key`` may match the node's exact pathname or the pathname without
        its leading ``/``.
        """
        node = self.cursor.get_node(key)
        if node is None:
            return False
        name = node._v_pathname
        return name == key or name[1:] == key

    def __len__(self) -> int:
        """Return the number of groups in the store."""
        return len(self.cursor.groups())

    def items(self):
        """Yield ``(pathname, group)`` pairs for every group in the store."""
        for g in self.cursor.groups():
            yield g._v_pathname, g

    def close(self) -> None:
        """Close the store, printing (not raising) any error."""
        try:
            self.cursor.close()
            print("Close success")
        except Exception as e:
            print(e)

    def check_groups(self) -> list:
        """Return the groups of the store."""
        return self.cursor.groups()

    def destroy(self):
        """Close the store if it is open and delete its file from disk."""
        if self.is_open():
            print("Close connection...")
            self.close()

        print(f"Destroy << {self.hdf5_name}>>... ")
        # Remove the file that was actually opened, which may live in another
        # directory than the current one; fall back to the bare file name for
        # instances that never recorded a full path.
        os.remove(self.file_path or self.hdf5_name)
        print("Finish")

Last updated

Was this helpful?