1、series数据的生成和访问
2、DataFrame数据生成的几种方法
3、时间序列的生成和处理
4、DataFrame数据的全方位访问
5、DataFrame数据的规整化处理
6、DataFrame数据的分组和聚合
7、DataFrame数据的高效遍历
8、DataFrame数据的导入和导出
数据结构:
| Index(索引) | Value(值) |
| --- | --- |
| 2019-01-11 | 11.12 |
| 2019-01-12 | 12.12 |
Series数据生成的方法
s = pandas.Series(data=None, index=None, dtype=None)
data:支持列表(list)、字典(dict)、ndarray、标量等
index:索引标签
dtype:数据类型
Series的生成
import numpy as np
import pandas as pd
# Four ways to build a pandas Series: from a list, an ndarray, a dict and a scalar.

# 1) From a list of values plus an explicit list of index labels.
#    Mixing ints and floats in the values yields a float64 Series.
_list_labels = ['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14', '2019-02-15']
_list_values = [11, 12, 12.32, 13.42, 14.56]
s_list_1 = pd.Series(data=_list_values, index=_list_labels)
print(s_list_1)
# Printed output:
#   2019-01-11    11.00
#   2019-01-12    12.00
#   2019-01-13    12.32
#   2019-01-14    13.42
#   2019-02-15    14.56
#   dtype: float64

# 2) From a NumPy array; with no index given, a default 0..n-1 index is used.
_four_ints = np.arange(4)
s_ndarry_1 = pd.Series(data=_four_ints)
print(s_ndarry_1)
# Printed output (the recorded run showed int32; the integer width presumably
# follows NumPy's platform default, so other machines may show int64):
#   0    0
#   1    1
#   2    2
#   3    3
#   dtype: int32

# 3) From a dict: the keys become the index labels, the values the data.
_price_by_day = {
    '2019-01-11': 11.12,
    '2019-01-12': 11.12,
    '2019-01-13': 11.13,
    '2019-01-14': 11.14,
}
s_dict_2 = pd.Series(data=_price_by_day)
print(s_dict_2)
# Printed output:
#   2019-01-11    11.12
#   2019-01-12    11.12
#   2019-01-13    11.13
#   2019-01-14    11.14
#   dtype: float64

# 4) From a scalar: the value is repeated once for every index label.
_scalar_labels = ['2019-01-11', '2019-01-13']
s_scalar = pd.Series(data=5.2, index=_scalar_labels)
print(s_scalar)
# Printed output:
#   2019-01-11    5.2
#   2019-01-13    5.2
#   dtype: float64
Series数据访问的方法:
Series.values
Series.index
索引访问
切片访问
# Accessing Series data. Relies on `s_list_1`, the list-based Series created
# earlier in this file.

# All index labels of the Series.
print(s_list_1.index)

# A single label returns the scalar stored under that label.
print(s_list_1['2019-01-14'])

# A list of labels returns a sub-Series containing just those labels.
print(s_list_1[['2019-01-12', '2019-01-14']])
# Printed output:
#   2019-01-12    12.00
#   2019-01-14    13.42
#   dtype: float64

# Slice access: per the recorded output below, `[:2]` yields the first two rows.
# The original had this statement inside the output string literal, so it was
# never executed; it now runs as real code.
print(s_list_1[:2])
# Printed output:
#   2019-01-11    11.0
#   2019-01-12    12.0
#   dtype: float64
DataFrame 数据生成的方法
df = pandas.DataFrame(data, index, columns, dtype)
data:嵌套字典、列表、二维 ndarray 等
DataFrame 可以看作由多列 Series 组成
| index | value | value |
| --- | --- | --- |
| 2019-01-11 | 11.21 | 11.13 |
| 2019-01-12 | 11.11 | 12.13 |
import pandas as pd
import numpy as np
# Build a DataFrame from a dict of lists: each key becomes a column and the
# explicit `index` supplies the row labels.
# Fix: the third label was '2019-01-45', which is not a valid calendar date and
# would make any later pd.to_datetime() conversion of the index fail.
df_list_dict = pd.DataFrame(
    {
        'close': [11.12, 12.13, 13.56, 14.88],
        'open': [35.12, 67.7, 89.0, 68.99],
    },
    index=['2019-01-12', '2019-01-14', '2019-01-15', '2019-06-23'],
)
print(df_list_dict)
# Printed output:
#               close   open
#   2019-01-12  11.12  35.12
#   2019-01-14  12.13  67.70
#   2019-01-15  13.56  89.00
#   2019-06-23  14.88  68.99
# Build a DataFrame from a nested list: every inner list is one row, while
# `index` and `columns` name the rows and columns respectively.
_ohlc_rows = [
    [11.12, 11.13, 11.14, 11.15],
    [21.11, 21.13, 21.14, 12.16],
]
_row_labels = ['2019-01-11', '2019-01-16']
_column_labels = ['Close', 'Open', 'Low', 'High']
df_list_b = pd.DataFrame(_ohlc_rows, index=_row_labels, columns=_column_labels)
print(df_list_b)
# Printed output:
#               Close   Open    Low   High
#   2019-01-11  11.12  11.13  11.14  11.15
#   2019-01-16  21.11  21.13  21.14  12.16
# Build a DataFrame from a NumPy structured array: the named fields of the
# array's dtype become the DataFrame columns.
# The field types matter, as the recorded output below shows: float16 keeps only
# a few significant digits (11.12 is stored as 11.117188) and uint32 drops the
# fractional part (11.13 becomes 11).
_record_dtype = np.dtype([
    ('close', np.float16),
    ('open', np.uint32),
    ('high', np.float16),
])
_records = [(11.12, 11.13, 11.14), (21.11, 21.22, 21.14)]
ndarray_b = np.array(_records, dtype=_record_dtype)
print(ndarray_b)
df_ndarry = pd.DataFrame(
    data=ndarray_b,
    index=['2019-01-11', '2019-01-13'],
    columns=['close', 'open', 'high'],
)
print(df_ndarry)
# Printed output:
#   [(11.12, 11, 11.14) (21.11, 21, 21.14)]
#                   close  open       high
#   2019-01-11  11.117188    11  11.140625
#   2019-01-13  21.109375    21  21.140625
# Build a DataFrame from a dict of Series: each key becomes a column, and the
# rows are labelled by the index labels the Series carry.
_close_column = pd.Series(
    [11.12, 11.13, 11.14],
    index=['2019-01-11', '2019-01-12', '2019-01-16'],
)
_open_column = pd.Series(
    [21.11, 21.13, 21.16],
    index=['2019-01-11', '2019-01-12', '2019-01-16'],
)
serries_data = {'close': _close_column, 'open': _open_column}
df_series = pd.DataFrame(serries_data)
print(df_series)
# Printed output:
#               close   open
#   2019-01-11  11.12  21.11
#   2019-01-12  11.13  21.13
#   2019-01-16  11.14  21.16
Python 内置的时间处理模块:time、datetime、calendar
时间序列的三种格式:
timestamp时间戳–从新纪元开始按秒计算的偏移量
struct_time时间元组–元组格式的年、月、日
format time格式化时间–使时间更具有可读性
datetime的相关用法
from datetime import date,time,datetime,timedelta
datetime_obj=datetime(2016,10,26,10,23,15,1)
print(f'datetime:{datetime_obj}')
re_datetime_obj=datetime_obj.replace(day=27,hour=20)
print(f'datetime:{re_datetime_obj}')
print(f'datetime.isoformat:{datetime_obj.isoformat()}')
print(f'dtrtime:{datetime_obj.strftime("%Y-%m-%d %X")}')
print(f'datetime.now():{datetime.now()}')
delta_obj=datetime.strptime("2019-10-18 04:20:00","%Y-%m-%d %X")-datetime.strptime("2019-10-01 04:20:00","%Y-%m-%d %X")
print(delta_obj.days,delta_obj.total_seconds())
dt=datetime.now()
dt1=dt+timedelta(days=1,hours=1)
dt2=dt+timedelta(days=-1)
dt3=dt-timedelta(days=1)
print(f"{dt1}\n{dt2}\n{dt3}\n")
"""
2022-04-14 22:41:54.777538
2022-04-12 21:41:54.777538
2022-04-12 21:41:54.777538
"""
Timestamp、to_datetime、date_range()、period_range()
生成Timestamp()对象:
pd.Timestamp() 代表时间戳 例如 2022-01-01 02:03:04
pd.to_datetime() 时期转换为时间戳 例如 2022-01-01 00:01:01
生成Timedelta对象
可以对datetime实现加减
生成日期范围序列
pd.date_range()
生成日期范围序列,其中两个很重要的参数是 freq(变化的频率,如年、月、日等)和 periods(生成的周期个数)
pd.period_range()
其属性与上面类似 其不同之处见下面的代码案例
案例:
from datetime import date,time,datetime,timedelta
import pandas as pd
# Three equivalent ways to construct the same pandas Timestamp:
# from date/time components, from a datetime.datetime, and from a string.
ts = pd.Timestamp(year=2019, month=1, day=1, hour=2, minute=3, second=4)
_py_datetime = datetime(2019, 1, 1, 2, 3, 4)
ts1 = pd.Timestamp(_py_datetime)
ts2 = pd.Timestamp("2019-1-1 2:3:4")
for _label, _stamp in (('ts', ts), ('ts1', ts1), ('ts2', ts2)):
    print(f'pd.Timestamp-{_label}:{_stamp}')
# Printed output:
#   pd.Timestamp-ts:2019-01-01 02:03:04
#   pd.Timestamp-ts1:2019-01-01 02:03:04
#   pd.Timestamp-ts2:2019-01-01 02:03:04
# pd.to_datetime() converts a datetime, a string, or a list of strings into
# pandas time objects (`ts` and `ts1` are rebound here).
print("///")
_py_datetime = datetime(2019, 1, 1, 2, 3, 4)
ts = pd.to_datetime(_py_datetime)
ts1 = pd.to_datetime("2019-1-1 0:1:1")
print(f'ts:{ts}')
print(f'ts1:{ts1}')
# Printed output:
#   ts:2019-01-01 02:03:04
#   ts1:2019-01-01 00:01:01

# A list of strings is converted into a DatetimeIndex.
_raw_stamps = ["2019-1-1 0:1:1", "2019-1-1 0:1:1", "2019-3-1 0:1:1"]
dt_list = pd.to_datetime(_raw_stamps)
print(f'pd.to_datetime()-list:{dt_list}')
# Printed output:
#   pd.to_datetime()-list:DatetimeIndex(['2019-01-01 00:01:01', '2019-01-01 00:01:01',
#                  '2019-03-01 00:01:01'],
#                 dtype='datetime64[ns]', freq=None)
# pd.Timedelta arithmetic: adding a Timedelta to a Timestamp shifts it.
print("------------")
_midnight = datetime(2019, 1, 1)
dt0 = pd.to_datetime(_midnight)
_shift = pd.Timedelta(days=5, minutes=50, seconds=20)
dt1 = dt0 + _shift
print(f"datetime-1{dt0}\ndatetime2-2{dt1}")
# Printed output:
#   datetime-12019-01-01 00:00:00
#   datetime2-22019-01-06 00:50:20
# date_range() versus period_range(): the former yields points in time
# (DatetimeIndex), the latter yields spans of time (PeriodIndex).
# `date_rng` is rebound by each call, as in the original.

# Monthly frequency, 12 entries.
# Fix: the month-end alias for date_range() is 'M' on older pandas and 'ME' on
# pandas >= 2.2 (where 'M' is deprecated for offsets). Passing the offset
# object itself avoids the alias and works on both.
date_rng = pd.date_range('2019-01-01', freq=pd.offsets.MonthEnd(), periods=12)
print(f'{date_rng}')
# For periods, 'M' remains the monthly alias.
date_rng = pd.period_range('2019-01-01', freq='M', periods=12)
print(f'{date_rng}')
# Printed output (the recorded run showed freq='M'; newer pandas shows 'ME'):
#   DatetimeIndex(['2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30',
#                  '2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31',
#                  '2019-09-30', '2019-10-31', '2019-11-30', '2019-12-31'],
#                 dtype='datetime64[ns]', freq='M')
#   PeriodIndex(['2019-01', '2019-02', '2019-03', '2019-04', '2019-05', '2019-06',
#                '2019-07', '2019-08', '2019-09', '2019-10', '2019-11', '2019-12'],
#               dtype='period[M]')

# Weekly frequency anchored on Sunday, 12 entries.
# Fix: use the canonical upper-case alias 'W-SUN' instead of 'W-sun'; the
# recorded output itself reports the frequency as 'W-SUN'.
date_rng = pd.date_range('2019-01-01', freq='W-SUN', periods=12)
print(f'{date_rng}')
date_rng = pd.period_range('2019-01-01', freq='W-SUN', periods=12)
print(f'{date_rng}')
# Printed output:
#   DatetimeIndex(['2019-01-06', '2019-01-13', '2019-01-20', '2019-01-27',
#                  '2019-02-03', '2019-02-10', '2019-02-17', '2019-02-24',
#                  '2019-03-03', '2019-03-10', '2019-03-17', '2019-03-24'],
#                 dtype='datetime64[ns]', freq='W-SUN')
#   PeriodIndex(['2018-12-31/2019-01-06', '2019-01-07/2019-01-13',
#                '2019-01-14/2019-01-20', '2019-01-21/2019-01-27',
#                '2019-01-28/2019-02-03', '2019-02-04/2019-02-10',
#                '2019-02-11/2019-02-17', '2019-02-18/2019-02-24',
#                '2019-02-25/2019-03-03', '2019-03-04/2019-03-10',
#                '2019-03-11/2019-03-17', '2019-03-18/2019-03-24'],
#               dtype='period[W-SUN]')
方法:
DataFrame.index
DataFrame.columns
DataFrame.values
DataFrame.loc
DataFrame.iloc
假设生成的一个DataFrame为df_series
| 函数 | 描述 |
| --- | --- |
| df_series.index | 访问 df_series 全部行索引 |
| df_series.columns | 访问 df_series 全部列索引 |
| df_series.values | 访问 df_series 全部元素数值 |
| df_series.列名字 或者 df_series['列名字'] | 访问某一列的元素 |
| df_series[0:1] | 访问第一行元素,也可改变切片的值来改变访问的行数 |
| df_series.loc[[pd.to_datetime('行名字')], ['列名', '列名']] | 读取某一行中指定列标签对应的元素 |
| df_series.loc[:, ['列名']] | 读取某一列的所有元素 |
| df_series.loc[pd.to_datetime('行名')] | 访问某一行的所有元素 |
| df_series.iloc[0:2, 0:1] | 数字可以改变,按范围访问某几行、某几列的元素 |
| df_series.iloc[[0, 2], [0, 1]] | 数字可以改变,按固定位置访问指定的某几行、某几列的元素 |
| df_series.loc[df_series.index[[0, 2]], '列名'] | 数字可以改变,先用 index 把位置转换为行标签,再访问某一列的几行元素 |
| df_series.iloc[[0, 2], df_series.columns.get_loc('列名')] | 同上(先用 get_loc 把列名转换为位置) |
| df_series.iloc[[0, 2], df_series.columns.get_indexer(['列名', '列名'])] | 访问指定多列的几行元素 |

案例如下:
import pandas as pd
# Walk-through of the DataFrame access APIs: index / columns / values,
# column access, row slicing, label-based .loc and position-based .iloc.

# Sample frame: two price columns sharing a DatetimeIndex. Note that the row
# labels are not in chronological order.
dates = pd.DatetimeIndex(['2019-01-11', '2019-02-13', '2019-01-16'])
series_data = {
    'close': pd.Series([11.12, 11.13, 11.14], index=dates),
    'open': pd.Series([21.11, 21.32, 21.44], index=dates),
}
df_series = pd.DataFrame(series_data, columns=['close', 'open'])
print(df_series)
# Printed output:
#               close   open
#   2019-01-11  11.12  21.11
#   2019-02-13  11.13  21.32
#   2019-01-16  11.14  21.44

# `dates` is rebound to a four-day daily range and printed; `df_series` was
# built from the earlier value and keeps its own index.
dates = pd.date_range('2019-01-11', freq='D', periods=4)
print(dates)
print("\\\\\\\\\\\\\\")

# Row index, column index and the underlying values.
print(df_series.index)
print(df_series.columns)
print(df_series.values)
# Printed output of the three statements above, in order:
#   DatetimeIndex(['2019-01-11', '2019-02-13', '2019-01-16'], dtype='datetime64[ns]', freq=None)
#   Index(['close', 'open'], dtype='object')
#   [[11.12 21.11]
#    [11.13 21.32]
#    [11.14 21.44]]

# One column, by key and by attribute; both print the same Series.
print(df_series['close'])
print(df_series.close)
# Printed output (once per statement):
#   2019-01-11    11.12
#   2019-02-13    11.13
#   2019-01-16    11.14
#   Name: close, dtype: float64

# A plain slice on the frame selects rows; here only the first row.
print(df_series[0:1])
# Printed output:
#               close   open
#   2019-01-11  11.12  21.11

# .loc with a list of row labels and a list of column labels returns a DataFrame.
print(df_series.loc[[pd.to_datetime('2019-01-16')], ['close', 'open']])
# Printed output:
#               close   open
#   2019-01-16  11.14  21.44

# .loc with ':' for the rows and a one-element column list: the whole column
# as a one-column DataFrame.
print(df_series.loc[:, ['close']])
# Printed output:
#               close
#   2019-01-11  11.12
#   2019-02-13  11.13
#   2019-01-16  11.14

# Fix: the original message used unescaped single quotes inside a
# single-quoted string, which is a syntax error. Double quotes delimit it now.
print("访问'2019-01-16'这行的元素内容:")
# .loc with a single row label returns that row as a Series.
print(df_series.loc[pd.to_datetime('2019-01-16')])
# Printed output:
#   close    11.14
#   open     21.44
#   Name: 2019-01-16 00:00:00, dtype: float64

print("以下是iloc的访问:")
# .iloc with position ranges: rows 0-1, column 0.
print(df_series.iloc[0:2, 0:1])
# Printed output:
#               close
#   2019-01-11  11.12
#   2019-02-13  11.13

# .iloc with a row range only: the first row with all columns.
print(df_series.iloc[0:1])
# Printed output:
#               close   open
#   2019-01-11  11.12  21.11

print("//")
# .iloc with explicit position lists: rows 0 and 2, column 0.
print(df_series.iloc[[0, 2], [0]])

print('以下为index访问标签')
# Convert row positions to row labels through .index, then use them with .loc.
print(df_series.index[[0, 2]])
print(df_series.loc[df_series.index[[0, 2]], 'open'])
# Printed output:
#   DatetimeIndex(['2019-01-11', '2019-01-16'], dtype='datetime64[ns]', freq=None)
#   2019-01-11    21.11
#   2019-01-16    21.44

print('')
# Convert a column name to its position with get_loc, then use it with .iloc.
print(df_series.columns.get_loc('open'))
print(df_series.iloc[[0, 2], df_series.columns.get_loc('open')])
# Printed output of the .iloc statement:
#   2019-01-11    21.11
#   2019-01-16    21.44
#   Name: open, dtype: float64

print('/')
# get_indexer maps several column names to positions in one call; the
# selected columns come back in the requested order.
print(df_series.columns.get_indexer(['open', 'close']))
print(df_series.iloc[[0, 2], df_series.columns.get_indexer(['open', 'close'])])
# Printed output:
#   [1 0]
#                open  close
#   2019-01-11  21.11  11.12
#   2019-01-16  21.44  11.14
Original: https://blog.csdn.net/qq_45126531/article/details/124145832
Author: 大葱一根
Title: pandas数据分析
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/750823/
转载文章受原作者版权保护。转载请注明原作者出处!