Useful functions
Other useful functions, notes, and classes.
```python
import numpy as np
import pandas as pd
import glob
import shutil
import os
import sqlite3
from copy import deepcopy
from datetime import datetime, timedelta, date
```

About dataframe
```python
# Get memory used by a dataframe
def get_mem_usage(df):
    print(f"{df.memory_usage(deep=True).sum() / 1024**2:3.2f} MB")
```
```python
# Get basic info: shape, dtypes, missing values, memory use
def get_general_info(df):
    # Look up the variable name bound to this dataframe (works only for globals)
    names = [x for x in globals() if globals()[x] is df]
    name = names[0] if names else "df"
    print(f"Dataframe <<{name}>> has {df.shape[0]} rows, {df.shape[1]} columns")
    print("=======================================")
    print("Column types:\n")
    print(df.dtypes)
    print("=======================================")
    print("Missing values per column (%):")
    percent_missing = df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': df.columns,
                                     'percent_missing': percent_missing})
    missing_value_df['percent_missing'] = ["{:.2f}%".format(x) for x in missing_value_df['percent_missing']]
    print(missing_value_df)
    print("=======================================")
    print(f"Memory use: {df.memory_usage(deep=True).sum() / 1024**2:3.2f} MB")
    print("=======================================")
    print("Missing values per column (count):")
    print(df.isnull().sum())
```
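A quick usage sketch; `sales` is a made-up dataframe. Note that `get_general_info` finds the dataframe's name via `globals()`, so it should be called on a module-level variable:

```python
# Hypothetical example dataframe
sales = pd.DataFrame({
    'region': ['north', 'south', 'north', None],
    'amount': [120.5, 98.0, None, 45.2],
})

get_mem_usage(sales)      # prints the dataframe's memory footprint in MB
get_general_info(sales)   # shape, dtypes, missing values, memory use
```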
```python
# Cast every column to the same target type
def change_col_format(df, target_type):
    for c in df.columns:
        df[c] = df[c].astype(target_type)
    return df
```
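Usage sketch with a made-up frame; note that casting to `str` shows up as the `object` dtype:

```python
mixed = pd.DataFrame({'a': [1, 2], 'b': [3.5, 4.5]})   # hypothetical dataframe
as_str = change_col_format(mixed, str)
print(as_str.dtypes)   # both columns are now object (Python str)
```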
```python
# Optimize a df: convert low-cardinality columns to 'category' to save memory
def get_optimize_df(df):
    # astype returns a new dataframe, so the result must be assigned
    df = df.astype({col: 'category' for col in df.columns
                    if df[col].nunique() / df[col].shape[0] < 0.5})
    return df
```
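The heuristic: a column where fewer than half the values are unique compresses well as `category`. A quick demonstration of the memory effect; the dataframe and column names are made up:

```python
# Hypothetical dataframe: 'city' repeats heavily, so it qualifies for 'category';
# 'value' is all-unique, so it is left alone
raw = pd.DataFrame({'city': ['paris', 'lyon'] * 50_000,
                    'value': range(100_000)})

get_mem_usage(raw)                # memory before optimization
optimized = get_optimize_df(raw)
get_mem_usage(optimized)          # 'city' stored as category: noticeably smaller
```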
```python
# Save / read pickle
def save_as_pickle(df, name, path=None):
    try:
        if path is None:
            df.to_pickle(f"{name}.pkl")
            print(f"Dataframe saved as pickle in => {os.getcwd()}")
        else:
            full_path = os.path.join(path, f"{name}.pkl")
            df.to_pickle(full_path)
            print(f"Dataframe saved as pickle in => {full_path}")
    except Exception:
        print("Save failed. Make sure it's a dataframe and the path is correct")
```
```python
# Read every pickle file in a directory into a dict of dataframes
def read_pickle_as_df(path=None):
    result = {}
    target_path = path if path is not None else os.getcwd()
    for p in glob.glob(os.path.join(target_path, "*.pkl")):
        # splitext is safer than splitting on "." for names containing dots
        name = os.path.splitext(os.path.basename(p))[0]
        result[name] = pd.read_pickle(p)
    return result
```
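A round-trip usage sketch; `df_sales` and the `/tmp` path are made-up examples:

```python
df_sales = pd.DataFrame({'amount': [1, 2, 3]})       # hypothetical dataframe

save_as_pickle(df_sales, "df_sales", path="/tmp")    # "/tmp" is just an example path
frames = read_pickle_as_df("/tmp")
print(frames["df_sales"])
```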
```python
# Read a big CSV file in chunks to limit peak memory
def read_large_csv(name, chunk_size=1_000_000, encoding='utf-8'):
    reader = pd.read_csv(name, iterator=True, encoding=encoding)
    chunks = []
    while True:
        try:
            chunks.append(reader.get_chunk(chunk_size))
        except StopIteration:
            break
    return pd.concat(chunks, ignore_index=True)
```
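For reference, `pd.read_csv` also accepts a `chunksize` argument that returns the chunk iterator directly, which shortens the helper to one statement; `big.csv` is a placeholder filename:

```python
# Equivalent one-liner using pandas' chunksize argument
def read_large_csv_v2(name, chunk_size=1_000_000, encoding='utf-8'):
    # read_csv with chunksize returns an iterator of dataframes
    return pd.concat(pd.read_csv(name, chunksize=chunk_size, encoding=encoding),
                     ignore_index=True)

df = read_large_csv_v2("big.csv")   # "big.csv" is a placeholder filename
```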
Files
Time
Test iterable
Get df from list of tuples
Split a list into groups of almost equal file size
Add extra params to a command
Filter out empty files
Convert a file size to MB, GB...
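A minimal sketch of the kind of size formatter this title describes; the function name `convert_file_size` and the 1024 cutoffs are assumptions, not the original code:

```python
# Hypothetical helper: format a byte count as a human-readable size
def convert_file_size(num_bytes):
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if num_bytes < 1024:
            return f"{num_bytes:3.2f} {unit}"
        num_bytes /= 1024
    return f"{num_bytes:3.2f} PB"

print(convert_file_size(123_456_789))   # "117.74 MB"
```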