I needed to optimize how we store and manage our data. The data was plain floating-point numbers, so I decided to first compare how Python handles saving it as CSV versus in the HDF5 format. As a quick check, I generated random numbers and measured both the file size of the stored data and the time it took to save it. Results:
1. The type of ordering (Row, Column, Square) didn't matter for CSV or HDF5 data format for time to save as well as the file size.
2. HDF5 performed significantly better in time and consistently better than CSV in size. The crossover is somewhere around 10,000 floating-point numbers: above that, HDF5 wins; below that, CSV appears to do better.
3. To read back, you can use,
# Open read-only; the context manager closes the file even if the read fails.
with h5py.File('ColH.h5', 'r') as h5f:
    # [:] reads the whole dataset into memory, so bb stays usable after close.
    bb = h5f['ColData'][:]
Note: there is NO loss of information when storing the data in HDF5.
Plots and Code below.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 6 13:49:32 2018
@author: pranjal.bordia
"""
import numpy as np
import os
import h5py
import time
#### Benchmark: CSV vs HDF5 for column, row and square array layouts
# For every entry of DataPoints, random floats are written in three layouts
# (N x 1 column, 1 x N row, and a square array) to both CSV and HDF5. The
# resulting file sizes are recorded for all layouts; the save time is
# recorded for the square layout only.
DataPoints = [10, 100, 1000, 10000, 100000, 1000000, 10000000]

# CSV file sizes in bytes and CSV save times in seconds, one per DataPoints entry.
Size_Col = []
Size_Row = []
Size_Sq = []
Time_CSV = []

# HDF5 file sizes in bytes and HDF5 save times in seconds, one per DataPoints entry.
Size_ColH = []
Size_RowH = []
Size_SqH = []
Time_H5 = []

for dataP in DataPoints:
    xx = np.random.rand(dataP, 1)
    yy = np.random.rand(1, dataP)
    # The square array holds floor(sqrt(dataP))**2 values, which is slightly
    # fewer than dataP whenever dataP is not a perfect square.
    side = int(np.floor(np.sqrt(dataP)))
    zz = np.random.rand(side, side)

    # --- CSV ---
    np.savetxt('Col.csv', xx, delimiter=',')
    np.savetxt('Row.csv', yy, delimiter=',')
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    np.savetxt('Square.csv', zz, delimiter=',')
    end = time.perf_counter()
    Time_CSV.append(end - start)

    # --- HDF5 ---
    with h5py.File('ColH.h5', 'w') as hf:
        hf.create_dataset("ColData", data=xx)
    with h5py.File('RowH.h5', 'w') as hf:
        hf.create_dataset("RowData", data=yy)
    start = time.perf_counter()
    with h5py.File('SquareH.h5', 'w') as hf:
        hf.create_dataset("SquareData", data=zz)
    # Stop the clock after the with-block so closing the file is included.
    end = time.perf_counter()
    Time_H5.append(end - start)

    # Record the on-disk size of every file written in this iteration.
    for sizes, path in (
        (Size_Col, 'Col.csv'),
        (Size_ColH, 'ColH.h5'),
        (Size_Row, 'Row.csv'),
        (Size_RowH, 'RowH.h5'),
        (Size_Sq, 'Square.csv'),
        (Size_SqH, 'SquareH.h5'),
    ):
        sizes.append(os.stat(path).st_size)
# NOTE(review): figure/plot/legend/loglog/xlabel/ylabel/grid are not imported
# in this file; presumably the script is run in a pylab-style interactive
# namespace (e.g. IPython/Spyder with %pylab) -- confirm before running it as
# a standalone script.

# Figure 1: file size (in kB) against the number of points, log-log axes.
figure()
# (sizes in bytes, legend label, marker size, line width); the order is kept
# so every curve gets the same default colour as before.
for sizes, label, ms, lw in (
    (Size_Col, 'Col Format', 7, 1.1),
    (Size_ColH, 'H5 Col Format', 7, 1.1),
    (Size_Row, 'Row Format', 4, 1),
    (Size_RowH, 'H5 Row Format', 4, 1),
    (Size_Sq, 'Square Format', 4, 1),
    (Size_SqH, 'H5 Square Format', 4, 1),
):
    # np.array (imported above) replaces the bare `array`, which is only
    # defined inside a pylab namespace.
    plot(DataPoints, np.array(sizes) / 1000, 'o-', label=label, alpha=0.5, ms=ms, lw=lw)
legend(loc=2)
loglog()
xlabel('Total number of Points')
ylabel('File Size(kB)')
grid(which='major')
grid(which='minor', linestyle='--')

# Figure 2: time to save the square array, CSV against HDF5, log-log axes.
figure()
plot(DataPoints, Time_CSV, 'o-', label='CSV')
plot(DataPoints, Time_H5, 'o-', label='H5')
legend(loc=2)
loglog()
xlabel('Total number of Points')
ylabel('Time (s)')
grid(which='major')
grid(which='minor', linestyle='--')
1. The type of ordering (Row, Column, Square) didn't matter for CSV or HDF5 data format for time to save as well as the file size.
2. HDF5 performed significantly better in time and consistently better than CSV in size. The crossover is somewhere around 10,000 floating-point numbers: above that, HDF5 wins; below that, CSV appears to do better.
3. To read back, you can use,
# Open read-only; the context manager closes the file even if the read fails.
with h5py.File('ColH.h5', 'r') as h5f:
    # [:] reads the whole dataset into memory, so bb stays usable after close.
    bb = h5f['ColData'][:]
Note: there is NO loss of information when storing the data in HDF5.
Plots and Code below.
![]() |
![]() |
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 6 13:49:32 2018
@author: pranjal.bordia
"""
import numpy as np
import os
import h5py
import time
#### Benchmark: CSV vs HDF5 for column, row and square array layouts
# For every entry of DataPoints, random floats are written in three layouts
# (N x 1 column, 1 x N row, and a square array) to both CSV and HDF5. The
# resulting file sizes are recorded for all layouts; the save time is
# recorded for the square layout only.
DataPoints = [10, 100, 1000, 10000, 100000, 1000000, 10000000]

# CSV file sizes in bytes and CSV save times in seconds, one per DataPoints entry.
Size_Col = []
Size_Row = []
Size_Sq = []
Time_CSV = []

# HDF5 file sizes in bytes and HDF5 save times in seconds, one per DataPoints entry.
Size_ColH = []
Size_RowH = []
Size_SqH = []
Time_H5 = []

for dataP in DataPoints:
    xx = np.random.rand(dataP, 1)
    yy = np.random.rand(1, dataP)
    # The square array holds floor(sqrt(dataP))**2 values, which is slightly
    # fewer than dataP whenever dataP is not a perfect square.
    side = int(np.floor(np.sqrt(dataP)))
    zz = np.random.rand(side, side)

    # --- CSV ---
    np.savetxt('Col.csv', xx, delimiter=',')
    np.savetxt('Row.csv', yy, delimiter=',')
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    np.savetxt('Square.csv', zz, delimiter=',')
    end = time.perf_counter()
    Time_CSV.append(end - start)

    # --- HDF5 ---
    with h5py.File('ColH.h5', 'w') as hf:
        hf.create_dataset("ColData", data=xx)
    with h5py.File('RowH.h5', 'w') as hf:
        hf.create_dataset("RowData", data=yy)
    start = time.perf_counter()
    with h5py.File('SquareH.h5', 'w') as hf:
        hf.create_dataset("SquareData", data=zz)
    # Stop the clock after the with-block so closing the file is included.
    end = time.perf_counter()
    Time_H5.append(end - start)

    # Record the on-disk size of every file written in this iteration.
    for sizes, path in (
        (Size_Col, 'Col.csv'),
        (Size_ColH, 'ColH.h5'),
        (Size_Row, 'Row.csv'),
        (Size_RowH, 'RowH.h5'),
        (Size_Sq, 'Square.csv'),
        (Size_SqH, 'SquareH.h5'),
    ):
        sizes.append(os.stat(path).st_size)
# NOTE(review): figure/plot/legend/loglog/xlabel/ylabel/grid are not imported
# in this file; presumably the script is run in a pylab-style interactive
# namespace (e.g. IPython/Spyder with %pylab) -- confirm before running it as
# a standalone script.

# Figure 1: file size (in kB) against the number of points, log-log axes.
figure()
# (sizes in bytes, legend label, marker size, line width); the order is kept
# so every curve gets the same default colour as before.
for sizes, label, ms, lw in (
    (Size_Col, 'Col Format', 7, 1.1),
    (Size_ColH, 'H5 Col Format', 7, 1.1),
    (Size_Row, 'Row Format', 4, 1),
    (Size_RowH, 'H5 Row Format', 4, 1),
    (Size_Sq, 'Square Format', 4, 1),
    (Size_SqH, 'H5 Square Format', 4, 1),
):
    # np.array (imported above) replaces the bare `array`, which is only
    # defined inside a pylab namespace.
    plot(DataPoints, np.array(sizes) / 1000, 'o-', label=label, alpha=0.5, ms=ms, lw=lw)
legend(loc=2)
loglog()
xlabel('Total number of Points')
ylabel('File Size(kB)')
grid(which='major')
grid(which='minor', linestyle='--')

# Figure 2: time to save the square array, CSV against HDF5, log-log axes.
figure()
plot(DataPoints, Time_CSV, 'o-', label='CSV')
plot(DataPoints, Time_H5, 'o-', label='H5')
legend(loc=2)
loglog()
xlabel('Total number of Points')
ylabel('Time (s)')
grid(which='major')
grid(which='minor', linestyle='--')


Comments
Post a Comment