import hashlib
import abc
from collections import namedtuple

import pandas as pd
# Build data type
basic_cols = ['a', 'b', 'c', ..., 'hash_id']
fields = " ".join(basic_cols)
Data = namedtuple('Data', fields, defaults=[''] * len(basic_cols))
class DataFactory(abc.ABC):
    # Class-level cache so the source file is only parsed once
    source = []

    def __init__(self, file_name):
        if not DataFactory.source:
            print("Reading data...")
            with open(file_name, encoding='utf-8') as file:
                file_iter = iter(file)
                # Skip the header line
                _ = next(file_iter)
                for line in file_iter:
                    # Append an empty slot for the hash_id column
                    each_line = line.strip("\n").split(";") + ['']
                    DataFactory.source.append(Data(*each_line))
            print("Finish")
            print("Add hash_id ...")
            DataFactory.source = list(map(self.create_hash_id, DataFactory.source))

    @staticmethod
    def create_hash_id(each):
        # Hash the concatenation of all field values, with spaces removed
        text = "".join(x.replace(" ", "") for x in each._asdict().values())
        return each._replace(hash_id=hashlib.sha256(text.encode('utf-8')).hexdigest())

    @abc.abstractmethod
    def produce_data(self):
        pass
class D1_Factory(DataFactory):
    def __init__(self, file_name):
        super().__init__(file_name)
        print("Apply D1 filter...")
        self.d1_data = list(filter(self.apply_filter, self.source))
        print("Done")

    @staticmethod
    def apply_filter(e):
        # Placeholder conditions; the remaining clauses are elided in the original
        return e.a == 'xxx' and e.b in [1, 24] and ...

    def produce_data(self):
        return pd.DataFrame(self.d1_data, columns=Data._fields)
class C1_Factory(DataFactory):
    def __init__(self, file_name):
        super().__init__(file_name)
        print("Apply C1 filter...")
        self.c1_data = list(filter(self.apply_filter, self.source))
        print("Done")

    @staticmethod
    def apply_filter(e):
        return e.c == 'c1'

    def produce_data(self):
        return pd.DataFrame(self.c1_data, columns=Data._fields)
D1_data = D1_Factory('data.csv').produce_data()
C1_data = C1_Factory('data.csv').produce_data()
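
# --- Illustrative sketch (not part of the original code) ---------------------
# A minimal check of what create_hash_id does, assuming Data has only the
# columns a, b, c plus hash_id: spaces are stripped from every field, the
# values are concatenated, and the SHA-256 hex digest fills the hash_id slot.
sample = Data('a1', 'b 2', 'c1')  # hash_id falls back to its '' default
hashed = DataFactory.create_hash_id(sample)
assert hashed.hash_id == hashlib.sha256('a1b2c1'.encode('utf-8')).hexdigest()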