稀疏矩阵的内存占用与拼接(concat/hstack):Pandas vs NumPy vs SciPy

时间:2018-01-17 21:32:28

标签: python pandas numpy sparse-matrix

我试图理解Python中的稀疏矩阵,并进行了这样的比较:

import scipy.sparse as sp
import pandas as pd
import numpy as np
import sys
# --- pandas: dense frame vs. sparse representation ---
# Build a small, mostly-zero integer frame one column at a time.
x_p = pd.DataFrame()
x_p["A"] = [0, 1, 0, 2]
x_p["B"] = [1, 1, 0, 0]
x_p["C"] = [1, 0, 0, 0]
sys.getsizeof(x_p)

# DataFrame.to_sparse() was removed in pandas 1.0. A sparse frame is now a
# regular DataFrame whose columns carry a SparseDtype; zeros are the fill
# value and therefore not stored explicitly.
x_ps = x_p.astype(pd.SparseDtype("int64", fill_value=0))
print(x_ps)
sys.getsizeof(x_ps)

# DataFrame.to_dense() was removed together with to_sparse(); the .sparse
# accessor converts the sparse columns back to ordinary dense columns.
x_pd = x_ps.sparse.to_dense()
print(x_pd)
sys.getsizeof(x_pd)

# --- pandas concat ---
# The byte counts in the trailing comments are the measurements reported in
# the original post; they depend on the pandas version and will differ on
# other installations.

# Column-wise concat of two sparse frames ...
hp = pd.concat([x_ps, x_ps], axis=1)
sys.getsizeof(hp)  # 296

# ... and of two dense frames: the post measured the same size for both.
hp = pd.concat([x_pd, x_pd], axis=1)
sys.getsizeof(hp)  # 296

# Explicitly converting the concatenated result to sparse gave a smaller
# footprint. DataFrame.to_sparse() was removed in pandas 1.0, so the
# conversion is done by casting every column to a SparseDtype instead.
hp = pd.concat([x_ps, x_ps], axis=1).astype(
    pd.SparseDtype("int64", fill_value=0)
)
sys.getsizeof(hp)  # 184

# --- pandas -> NumPy ---
# Variant 1: build a new ndarray from the frame.
x_n = np.array(x_p)
print(x_n)
sys.getsizeof(x_n)  # size 208

# Variant 2: take the frame's underlying values.
# NOTE(review): the size reported here is roughly half of variant 1;
# presumably .values hands back an array that does not own its buffer, so
# only the array header is counted -- confirm.
x_n = x_p.values
print(x_n)
sys.getsizeof(x_n)  # size 112 Almost 2 times less memory usage?!

# Variant 3: wrap the frame as a numpy matrix.
x_n_mat = np.asmatrix(x_p)
sys.getsizeof(x_n_mat)  # size 136


# --- NumPy: side-by-side stacking of the matrix with itself ---
# Via hstack; the result is again a numpy matrix.
hn = np.hstack([x_n_mat, x_n_mat])
print(hn)
sys.getsizeof(hn)  # size 328

# Via concatenate along the column axis; same reported size.
hn = np.concatenate([x_n_mat, x_n_mat], axis=1)
print(hn)
sys.getsizeof(hn)  # size 328

# --- NumPy -> pandas ---
# Wrap the numpy matrix in a DataFrame (the post notes the ndarray `x_n`
# gives the same result).
x_n_p = pd.DataFrame(x_n_mat)
sys.getsizeof(x_n_p)  # size 200


# --- SciPy csr_matrix ---
# NOTE: sys.getsizeof is shallow -- it does not follow references into
# contained objects -- so the identical "56" readings below do not include
# the arrays held by the sparse matrix.

# pandas -> SciPy csr_matrix.
# A csr_matrix carries no labels, so remember the frame's index and column
# names for later.
row_names = x_p.index
col_names = x_p.columns.tolist()

x_sp = sp.csr_matrix(x_p)
print(x_sp)
sys.getsizeof(x_sp)  # size 56

# sparse pandas frame -> SciPy csr_matrix
x_sps = sp.csr_matrix(x_ps)
print(x_sps)
sys.getsizeof(x_sps)  # size 56

# numpy matrix (or ndarray) -> SciPy csr_matrix
x_sn = sp.csr_matrix(x_n_mat)
print(x_sn)
sys.getsizeof(x_sn)  # size 56

# --- SciPy hstack ---
# Stack the sparse matrix with itself column-wise, keeping CSR format.
h = sp.hstack([x_sp, x_sp], format="csr")
print(h)
# NOTE: sys.getsizeof is shallow, so this reading does not grow with the
# stacked contents.
sys.getsizeof(h)  # size 56

# --- SciPy -> NumPy and pandas ---
# SciPy -> NumPy
hd = h.todense()  # -> numpy matrix
sys.getsizeof(hd)  # size 136

hd = h.toarray()  # -> numpy ndarray
sys.getsizeof(hd)  # size 304

# SciPy -> numpy matrix -> pandas DataFrame (goes through a dense copy)
hdp = pd.DataFrame(h.todense())
hdp.index = row_names  # restore the index saved from the pandas frame
# `h` holds the original columns repeated side by side, so the saved names
# are repeated to match its width; assigning the bare `col_names` would
# raise a length-mismatch error.
hdp.columns = col_names * (hdp.shape[1] // len(col_names))
# Measure the DataFrame that was just built (the original measured `hd`,
# the ndarray from the previous step, so its "304" was not this object).
sys.getsizeof(hdp)

# SciPy -> numpy ndarray -> pandas DataFrame
hdp = pd.DataFrame(h.toarray())
sys.getsizeof(hdp)

有一些意想不到的情况:例如,当我们把两个相同的矩阵水平堆叠(hstack)之后,sys.getsizeof 报告的内存大小并没有增加。

第二,从 pandas DataFrame 得到 NumPy 数组时,用 .values 得到的数组在内存中报告的大小,大约只有用 np.array 创建的数组的一半。

第三,如果我们连接2个pandas稀疏数据帧,它会创建密集的数据帧,应该再次手动转换为稀疏数据帧。

也许这会有用。

0 个答案:

没有答案
相关问题