python: float64与float32转换、压缩比较与转换偏差

Python 原生类型中只有 float(即 64 位双精度),并没有 float64 和 float32 之分,但 numpy 中区分 float64 和 float32。因此,二者的互转需要借助特定的库来完成,比如 numpy 和 pandas(DataFrame)。

结论:
1、互转的类型中,DataFrame和np.array是比较方便互转的。
2、float64 的大小约为 float32 的两倍;
3、float64 的压缩耗时也约为 float32 的两倍(有时可达三倍);
4、转换偏差存在,大小因数值而异;

一、转换与压缩

import pickle
import sys
import pandas as pd
import random
import gzip
import time as t
import numpy as np

# Single-cell frame, built and immediately down-cast to float32.
df = pd.DataFrame([123456789.0]).astype('float32')

# Number of values generated per column.
n = 10000

# Four columns of plain Python floats, each value being 10.0 plus a
# uniform random fraction. Columns are generated in the order listed.
_dict_f64 = {
    column: [random.random() + 10.0 for _ in range(n)]
    for column in ("open", "high", "low", "close")
}
# np.array 与 list 的情况
# Three representations of the "open" column: the raw Python list, a
# float64 ndarray built from it, and that ndarray down-cast to float32.
open_f64_list = _dict_f64["open"]
open_f64_np = np.array(open_f64_list)
open_f32 = open_f64_np.astype('float32')

print("np.array[32和64] 与 f64_list比较: ")
# sys.getsizeof is shallow: for the list it does not follow references
# into the float objects the list holds.
for label, obj in (
    ("open_f64_list size : ", open_f64_list),
    ("open_f64_np   size : ", open_f64_np),
    ("open_f32_np   size : ", open_f32),
):
    print(f"{label}{sys.getsizeof(obj)}")

# list 不能直接转换元素类型;np.array 可以用 astype 从 f64 转为 f32

print("df_64和df_32转换与大小比较:")

# Frame built from the dict of float lists, plus a float32 copy of it.
df_f64 = pd.DataFrame(_dict_f64)
df_f32 = df_f64.astype('float32')

size_f64 = sys.getsizeof(df_f64)
size_f32 = sys.getsizeof(df_f32)
print(f"df_f64 size : {size_f64}")
print(f"df_f32 size : {size_f32}")

print("f64和f32类型pickle二进制文件大小比较:")

# Serialize both frames and the raw dict of lists, in the same order as before.
pk_f64, pk_dict, pk_f32 = (
    pickle.dumps(payload) for payload in (df_f64, _dict_f64, df_f32)
)

# sys.getsizeof reports the in-memory size of each bytes object, which is
# what the original comparison used (not len() of the payload).
for label, blob in (
    ("pk_f64 size      : ", pk_f64),
    ("pk_f32 size      : ", pk_f32),
    ("pk_f64_dict size : ", pk_dict),
):
    print(f"{label}{sys.getsizeof(blob)}")

## 可以看一下差异

print("f64和f32类型二进制pickle文件压缩时间开销比较:")

# Time each gzip pass with time.perf_counter(): it is a monotonic,
# high-resolution clock meant for measuring short intervals, whereas
# time.time() follows the wall clock and can jump if the clock is adjusted.
t0 = t.perf_counter()
gzip_f64 = gzip.compress(pk_f64)
elapsed_f64 = t.perf_counter() - t0
print(f"pk_64 -> compress cost time : {elapsed_f64} seconds!")

t1 = t.perf_counter()
gzip_f32 = gzip.compress(pk_f32)
elapsed_f32 = t.perf_counter() - t1
print(f"pk_32 -> compress cost time : {elapsed_f32} seconds!")

输出:

np.array[32和64] 与 f64_list比较:
open_f64_list size : 87616
open_f64_np   size : 80112
open_f32_np   size : 40112
df_64和df_32转换与大小比较:
df_f64 size : 320144
df_f32 size : 160144
f64和f32类型pickle二进制文件大小比较:
pk_f64 size      : 320725
pk_f32 size      : 160725
pk_f64_dict size : 360210
f64和f32类型二进制pickle文件压缩时间开销比较:
pk_64 -> compress cost time : 0.02867913246154785 seconds!

pk_32 -> compress cost time : 0.009557962417602539 seconds!

二、float64和np.float64

1、float64 list 与np.float64 array

# The same two values held three ways: a plain list, an ndarray built from
# Python floats, and an ndarray built from np.float64 scalars.
c = [1.0, 2.0]
a = np.array(c)
b = np.array([np.float64(value) for value in c])

# sys.getsizeof is shallow, so the list figure excludes its float objects.
for label, obj in ((" a: ", a), (" b: ", b), (" c: ", c)):
    print(f"{label}{sys.getsizeof(obj)}")

输出:
a: 128
b: 128
c: 72
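可见,元素很少时 np.array 的 sys.getsizeof 比 list 大:ndarray 除数据外还有固定的对象开销(由上面的输出推算约 112 字节);而 list 的 sys.getsizeof 只统计列表本身,不包含其中的 float 对象,因此两者并不直接可比。a 与 b 大小相同,说明用原生 float 还是 np.float64 构造数组,结果没有区别。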

2、dict类型中比较


# Values per column, and the constant offset added to every random fraction.
n = 100000
k = 50000.0

_columns = ("open", "high", "low", "close")

# Plain Python lists of floats, one list per column.
_dict_f64_primitive = {
    name: [random.random() + k for _ in range(n)] for name in _columns
}

# The very same values, each list converted to an ndarray.
_dict_f64_np = {
    name: np.array(values) for name, values in _dict_f64_primitive.items()
}

# Freshly generated values, converted to ndarrays straight away.
_dict_f64_np2 = {
    name: np.array([random.random() + k for _ in range(n)]) for name in _columns
}
# NOTE: sys.getsizeof is shallow. It measures only the dict object itself,
# not the lists or ndarrays stored as its values, so these three numbers do
# not reflect how much memory the payloads actually occupy.
print(f"_dict_f64_primitive  size : {sys.getsizeof(_dict_f64_primitive)}")
print(f"_dict_f64_np         size : {sys.getsizeof(_dict_f64_np)}")
print(f"_dict_f64_np2        size : {sys.getsizeof(_dict_f64_np2)}\n")

# Timestamps come from time.perf_counter(), a monotonic high-resolution
# clock intended for interval measurement (time.time() tracks the wall clock).
# Consecutive timestamps bracket exactly one operation each.
t0 = t.perf_counter()
pk_primitive = pickle.dumps(_dict_f64_primitive)
t1 = t.perf_counter()
pk_np = pickle.dumps(_dict_f64_np)
t2 = t.perf_counter()
pk_np2 = pickle.dumps(_dict_f64_np2)
t3 = t.perf_counter()

gzip_primitive = gzip.compress(pk_primitive)
t4 = t.perf_counter()
gzip_np = gzip.compress(pk_np)
t5 = t.perf_counter()
gzip_np2 = gzip.compress(pk_np2)
t6 = t.perf_counter()

print(f"pk_primitive  -> binary cost time :{t1-t0} seconds")
print(f"pk_np         -> binary cost time :{t2-t1} seconds")
print(f"pk_np2        -> binary cost time :{t3-t2} seconds\n")

# The first label previously read "cpmpress"; corrected to match the others.
print(f"pk_primitive  -> compress cost time :{t4-t3} seconds")
print(f"pk_np         -> compress cost time :{t5-t4} seconds")
print(f"pk_np2        -> compress cost time :{t6-t5} seconds")

输出:

_dict_f64_primitive  size : 232
_dict_f64_np         size : 232
_dict_f64_np2        size : 232

pk_primitive  -> binary cost time :0.012560844421386719 seconds
pk_np         -> binary cost time :0.006933927536010742 seconds
pk_np2        -> binary cost time :0.0059435367584228516 seconds

pk_primitive  -> cpmpress cost time :3.5468034744262695 seconds
pk_np         -> compress cost time :2.197758674621582 seconds
pk_np2        -> compress cost time :2.230668783187866 seconds

三者的 sys.getsizeof 结果相同(都是 232),这是因为 sys.getsizeof 只统计 dict 对象本身,不包含其中存放的 list 或 ndarray,并不代表实际占用空间相同。但序列化和压缩用时有一定差异,感觉numpy要快一些。

三、转换的偏差

def float32_roundtrip(number):
    """Round-trip *number* through float32 and measure the precision loss.

    Args:
        number: A Python float to convert.

    Returns:
        A tuple ``(f_64, f_32, f_32_64, error)`` where ``f_64`` is the value
        as ``np.float64``, ``f_32`` is the value as ``np.float32``,
        ``f_32_64`` is ``f_32`` widened back to ``np.float64``, and ``error``
        is ``f_64 - f_32_64``.

    Raises:
        TypeError: If *number* is not a float.
    """
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not isinstance(number, float):
        raise TypeError(f"expected float, got {type(number).__name__}")
    f_64 = np.float64(number)
    f_32 = np.float32(number)
    f_32_64 = np.float64(f_32)
    return f_64, f_32, f_32_64, f_64 - f_32_64


numbers = [12.345888888888888888888,123456789.0,123456789.978654412,123456782229.978,3309.07,7896.353,123456789.88]
for number in numbers:
    f_64, f_32, f_32_64, error = float32_roundtrip(number)

    print(f"f_64    : {type(f_64)}      value :{f_64} ")
    print(f"f_32    : {type(f_32)}      value :{f_32}  error : {error}" )
    # This line previously printed f_32 under the f_32_64 label.
    print(f"f_32_64 : {type(f_32_64)}   value :{f_32_64} \n" )

输出:

f_64    : <class 'numpy.float64'>      value :12.345888888888888
f_32    : <class 'numpy.float32'>      value :12.3458890914917  error : -2.0260281097250754e-07
f_32_64 : <class 'numpy.float64'>   value :12.3458890914917

f_64    : <class 'numpy.float64'>      value :123456789.0
f_32    : <class 'numpy.float32'>      value :123456792.0  error : -3.0
f_32_64 : <class 'numpy.float64'>   value :123456792.0

f_64    : <class 'numpy.float64'>      value :123456789.97865441
f_32    : <class 'numpy.float32'>      value :123456792.0  error : -2.0213455855846405
f_32_64 : <class 'numpy.float64'>   value :123456792.0

f_64    : <class 'numpy.float64'>      value :123456782229.978
f_32    : <class 'numpy.float32'>      value :123456782336.0  error : -106.02200317382812
f_32_64 : <class 'numpy.float64'>   value :123456782336.0

f_64    : <class 'numpy.float64'>      value :3309.07
f_32    : <class 'numpy.float32'>      value :3309.070068359375  error : -6.835937483629095e-05
f_32_64 : <class 'numpy.float64'>   value :3309.070068359375

f_64    : <class 'numpy.float64'>      value :7896.353
f_32    : <class 'numpy.float32'>      value :7896.35302734375  error : -2.734374993451638e-05
f_32_64 : <class 'numpy.float64'>   value :7896.35302734375

f_64    : <class 'numpy.float64'>      value :123456789.88
f_32    : <class 'numpy.float32'>      value :123456792.0  error : -2.1200000047683716
f_32_64 : <class 'numpy.float64'>   value :123456792.0
比如:123456789.0 【f64】 ->【f32】这么简单的转换,却存在乍看难以理解的偏差:

f_32 : 123456792.0,error : -3.0

原因是 float32 只有 24 位有效二进制位(约 7 位十进制有效数字),在 2^26~2^27(约 6700 万~1.34 亿)这个区间内,相邻两个可表示的数相差 8,所以 123456789 只能被舍入到最近的 123456792。

那么问题是:如何减少这种转换带来的偏差?目测了一下,数值在1000万以下的float,转换误差总体上在 0~0.1 之间。但这不能成为数据库层面采用 float32 的理由:数据库层面还是需要保真,否则就失去了可依赖性;具体到应用层面,则见仁见智。

Original: https://blog.csdn.net/wowotuo/article/details/126763543
Author: songroom
Title: python: float64与float32转换、压缩比较与转换偏差

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/750773/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球