pandas数据分析

1、series数据的生成和访问
2、DataFrame数据生成的几种方法
3、时间序列的生成和处理
4、DataFrame数据的全方位访问
5、DataFrame数据的规整化处理
6、DataFrame数据的分组和聚合
7、DataFrame数据的高效遍历
8、DataFrame数据的导入和导出

数据结构:

Index(索引) | Value(值)
2019-01-11 | 11.12
2019-01-12 | 12.12

Series数据生成的方法
s=pandas.Series(data=None,index=None,dtype=None)
data 支持列表(list)、字典(dict)、ndarray、标量等
dtype 数据类型

Series的生成


import numpy as np
import pandas as pd

# Series built from a plain list of values plus an explicit list of string
# labels. Mixing ints and floats makes the whole Series float64.
s_list_1 = pd.Series(
    data=[11, 12, 12.32, 13.42, 14.56],
    index=['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14', '2019-02-15'],
)
print(s_list_1)
# Printed output:
# 2019-01-11    11.00
# 2019-01-12    12.00
# 2019-01-13    12.32
# 2019-01-14    13.42
# 2019-02-15    14.56
# dtype: float64

# Series built from a NumPy array; no index is passed, so the rows are
# labelled 0..3.
s_ndarry_1 = pd.Series(data=np.arange(4))
print(s_ndarry_1)
# Printed output (recorded as int32; the integer width presumably depends on
# the platform / NumPy build, so int64 may be shown instead):
# 0    0
# 1    1
# 2    2
# 3    3
# dtype: int32

# Series built from a dict: the keys become the index labels and the dict
# values become the data.
s_dict_2 = pd.Series(
    {
        '2019-01-11': 11.12,
        '2019-01-12': 11.12,
        '2019-01-13': 11.13,
        '2019-01-14': 11.14,
    }
)
print(s_dict_2)
# Printed output:
# 2019-01-11    11.12
# 2019-01-12    11.12
# 2019-01-13    11.13
# 2019-01-14    11.14
# dtype: float64

# Series built from a scalar: the value is repeated once per index label.
s_scalar = pd.Series(data=5.2, index=['2019-01-11', '2019-01-13'])
print(s_scalar)
# Printed output:
# 2019-01-11    5.2
# 2019-01-13    5.2
# dtype: float64

Series数据访问的方法:

Series.values
Series.index
索引访问
切片访问


# Access demos for the Series `s_list_1` created in the previous snippet.

# Row labels of the Series.
print(s_list_1.index)

# Label access: a single label yields the scalar stored under it ...
print(s_list_1['2019-01-14'])
# ... while a list of labels yields a sub-Series in the requested order.
print(s_list_1[['2019-01-12', '2019-01-14']])
# Printed output of the list-of-labels lookup:
# 2019-01-12    12.00
# 2019-01-14    13.42
# dtype: float64

# Slice access: the first two rows. This call used to sit inside the
# output-record string below it, so it never actually ran.
print(s_list_1[:2])
# Printed output:
# 2019-01-11    11.0
# 2019-01-12    12.0
# dtype: float64

DataFrame 数据生成的方法
df=pandas.DataFrame(data,index,columns,dtype)
data : 嵌套字典、列表、二维ndarray
DataFrame 可以看作由多列Series组成

index | value | value
2019-01-11 | 11.21 | 11.13
2019-01-12 | 11.11 | 12.13


import pandas as pd
import numpy as np

# DataFrame from a dict of equal-length lists: each key becomes a column.
# NOTE(review): '2019-01-45' is not a valid calendar date; it is accepted only
# because these index labels are plain strings -- confirm the intended day.
df_list_dict = pd.DataFrame(
    data={
        'close': [11.12, 12.13, 13.56, 14.88],
        'open': [35.12, 67.7, 89.0, 68.99],
    },
    index=['2019-01-12', '2019-01-14', '2019-01-45', '2019-06-23'],
)
print(df_list_dict)
# Printed output:
#             close   open
# 2019-01-12  11.12  35.12
# 2019-01-14  12.13  67.70
# 2019-01-45  13.56  89.00
# 2019-06-23  14.88  68.99

# DataFrame from a nested list: each inner list is one row, so the row and
# column labels are supplied explicitly.
df_list_b = pd.DataFrame(
    data=[
        [11.12, 11.13, 11.14, 11.15],
        [21.11, 21.13, 21.14, 12.16],
    ],
    index=['2019-01-11', '2019-01-16'],
    columns=['Close', 'Open', 'Low', 'High'],
)
print(df_list_b)
# Printed output:
#             Close   Open    Low   High
# 2019-01-11  11.12  11.13  11.14  11.15
# 2019-01-16  21.11  21.13  21.14  12.16

# DataFrame from a structured NumPy array: every field carries its own dtype.
# As the recorded output shows, the uint32 'open' field drops the fractional
# part (11.13 -> 11) and the float16 fields are stored imprecisely
# (11.12 -> 11.117188).
ndarray_b = np.array(
    [(11.12, 11.13, 11.14), (21.11, 21.22, 21.14)],
    dtype=[
        ('close', np.float16),
        ('open', np.uint32),
        ('high', np.float16),
    ],
)
print(ndarray_b)
df_ndarry = pd.DataFrame(
    data=ndarray_b,
    index=['2019-01-11', '2019-01-13'],
    columns=['close', 'open', 'high'],
)
print(df_ndarry)
# Printed output (the array first, then the DataFrame):
# [(11.12, 11, 11.14) (21.11, 21, 21.14)]
#                 close  open       high
# 2019-01-11  11.117188    11  11.140625
# 2019-01-13  21.109375    21  21.140625

# DataFrame from a dict of Series: each Series becomes one column; both
# Series here share the same index labels.
serries_data = {
    'close': pd.Series(
        data=[11.12, 11.13, 11.14],
        index=['2019-01-11', '2019-01-12', '2019-01-16'],
    ),
    'open': pd.Series(
        data=[21.11, 21.13, 21.16],
        index=['2019-01-11', '2019-01-12', '2019-01-16'],
    ),
}

df_series = pd.DataFrame(data=serries_data)
print(df_series)
# Printed output:
#             close   open
# 2019-01-11  11.12  21.11
# 2019-01-12  11.13  21.13
# 2019-01-16  11.14  21.16

Python 内置的时间处理模块:time、datetime、calendar

时间序列的三种格式:
timestamp时间戳–从新纪元开始按秒计算的偏移量
struct_time时间元组–元组格式的年、月、日
format time格式化时间–使时间更具有可读性

datetime的相关用法


from datetime import date,time,datetime,timedelta

# Explicit 24-hour time format. The previous "%X" directive means "the
# locale's time representation", so formatting and, above all, parsing could
# change or fail under a non-default locale.
datetime_format = "%Y-%m-%d %H:%M:%S"

# Build a datetime from year, month, day, hour, minute, second, microsecond.
datetime_obj = datetime(2016, 10, 26, 10, 23, 15, 1)
print(f'datetime:{datetime_obj}')

# replace() returns a new datetime with only the named fields changed.
re_datetime_obj = datetime_obj.replace(day=27, hour=20)
print(f'datetime:{re_datetime_obj}')

# ISO 8601 text form of the datetime.
print(f'datetime.isoformat:{datetime_obj.isoformat()}')

# Custom text form via strftime().
print(f'dtrtime:{datetime_obj.strftime(datetime_format)}')

# Current local time (naive: no timezone attached).
print(f'datetime.now():{datetime.now()}')

# Subtracting two parsed datetimes yields a timedelta.
delta_obj = (
    datetime.strptime("2019-10-18 04:20:00", datetime_format)
    - datetime.strptime("2019-10-01 04:20:00", datetime_format)
)
print(delta_obj.days, delta_obj.total_seconds())

# timedelta arithmetic: adding a negative delta equals subtracting a positive
# one, so dt2 and dt3 are the same instant.
dt = datetime.now()
dt1 = dt + timedelta(days=1, hours=1)
dt2 = dt + timedelta(days=-1)
dt3 = dt - timedelta(days=1)
print(f"{dt1}\n{dt2}\n{dt3}\n")
# Printed output recorded by the author (values depend on when it is run):
# 2022-04-14 22:41:54.777538
# 2022-04-12 21:41:54.777538
# 2022-04-12 21:41:54.777538

Timestamp、to_datetime、date_range()、period_range()

生成Timestamp()对象:
pd.Timestamp() 代表时间戳 例如 2022-01-01 02:03:04

pd.to_datetime() 将日期转换为时间戳 例如 2022-01-01 00:01:01

生成Timedelta对象
可以对datetime实现加减

生成日期范围序列
pd.date_range()
生成日期范围,其中很重要的两个参数是 freq(变化的单位:年、月、日等)和 periods(生成的个数)

pd.period_range()
其属性与上面类似 其不同之处见下面的代码案例
案例:


from datetime import date,time,datetime,timedelta

import pandas as pd

# Three equivalent ways to build the same pd.Timestamp: positional fields,
# a stdlib datetime, and a date-time string.
ts = pd.Timestamp(2019, 1, 1, 2, 3, 4)
ts1 = pd.Timestamp(datetime(2019, 1, 1, hour=2, minute=3, second=4))
ts2 = pd.Timestamp("2019-1-1 2:3:4")
print(f'pd.Timestamp-ts:{ts}')
print(f'pd.Timestamp-ts1:{ts1}')
print(f'pd.Timestamp-ts2:{ts2}')

# Printed output:
# pd.Timestamp-ts:2019-01-01 02:03:04
# pd.Timestamp-ts1:2019-01-01 02:03:04
# pd.Timestamp-ts2:2019-01-01 02:03:04
print("///")

# pd.to_datetime() converts a datetime or a string into a Timestamp.
ts = pd.to_datetime(datetime(2019, 1, 1, hour=2, minute=3, second=4))
ts1 = pd.to_datetime("2019-1-1 0:1:1")
print(f'ts:{ts}')
print(f'ts1:{ts1}')
# Printed output:
# ts:2019-01-01 02:03:04
# ts1:2019-01-01 00:01:01

# A list of strings is converted into a DatetimeIndex.
dt_list = pd.to_datetime(["2019-1-1 0:1:1", "2019-1-1 0:1:1", "2019-3-1 0:1:1"])
print(f'pd.to_datetime()-list:{dt_list}')
# Printed output:
# pd.to_datetime()-list:DatetimeIndex(['2019-01-01 00:01:01', '2019-01-01 00:01:01',
#                '2019-03-01 00:01:01'],
#               dtype='datetime64[ns]', freq=None)
print("------------")

# pd.Timedelta supports date arithmetic on Timestamps.
dt0 = pd.to_datetime(datetime(2019, 1, 1, hour=0, minute=0, second=0))
dt1 = dt0 + pd.Timedelta(days=5, minutes=50, seconds=20)
print(f"datetime-1{dt0}\ndatetime2-2{dt1}")
# Printed output:
# datetime-12019-01-01 00:00:00
# datetime2-22019-01-06 00:50:20

# Monthly ranges. date_range yields month-END timestamps, period_range yields
# whole-month periods. The month-end frequency is passed as an offset object
# because the string alias 'M' is deprecated for date_range since pandas 2.2
# (replaced by 'ME'), while 'ME' is rejected by older versions. 'M' remains
# the period alias for period_range.
date_rng = pd.date_range('2019-01-01', freq=pd.offsets.MonthEnd(), periods=12)
print(f'{date_rng}')
date_rng = pd.period_range('2019-01-01', freq='M', periods=12)
print(f'{date_rng}')
# Printed output (freq is shown as 'ME' instead of 'M' on pandas >= 2.2):
# DatetimeIndex(['2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30',
#                '2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31',
#                '2019-09-30', '2019-10-31', '2019-11-30', '2019-12-31'],
#               dtype='datetime64[ns]', freq='M')
# PeriodIndex(['2019-01', '2019-02', '2019-03', '2019-04', '2019-05', '2019-06',
#              '2019-07', '2019-08', '2019-09', '2019-10', '2019-11', '2019-12'],
#             dtype='period[M]')

# Weekly ranges anchored on Sunday, written with the canonical upper-case
# alias 'W-SUN' (the spelling pandas itself reports in the output below).
date_rng = pd.date_range('2019-01-01', freq='W-SUN', periods=12)
print(f'{date_rng}')

date_rng = pd.period_range('2019-01-01', freq='W-SUN', periods=12)
print(f'{date_rng}')
# Printed output:
# DatetimeIndex(['2019-01-06', '2019-01-13', '2019-01-20', '2019-01-27',
#                '2019-02-03', '2019-02-10', '2019-02-17', '2019-02-24',
#                '2019-03-03', '2019-03-10', '2019-03-17', '2019-03-24'],
#               dtype='datetime64[ns]', freq='W-SUN')
# PeriodIndex(['2018-12-31/2019-01-06', '2019-01-07/2019-01-13',
#              '2019-01-14/2019-01-20', '2019-01-21/2019-01-27',
#              '2019-01-28/2019-02-03', '2019-02-04/2019-02-10',
#              '2019-02-11/2019-02-17', '2019-02-18/2019-02-24',
#              '2019-02-25/2019-03-03', '2019-03-04/2019-03-10',
#              '2019-03-11/2019-03-17', '2019-03-18/2019-03-24'],
#             dtype='period[W-SUN]')

方法:
DataFrame.index
DataFrame.columns
DataFrame.values
DataFrame.loc
DataFrame.iloc
假设生成的一个DataFrame为df_series

函数 | 描述
df_series.index | 访问df_series全部行索引
df_series.columns | 访问df_series全部列索引
df_series.values | 访问df_series全部元素数值
df_series.列名 或 df_series['列名'] | 访问某一列的元素
df_series[0:1] | 访问第一行元素,也可改变切片范围来改变访问的行数
df_series.loc[[pd.to_datetime('行名')], ['列名', '列名']] | 读取某一行中指定列标签对应的元素
df_series.loc[:, ['列名']] | 读取某一列的所有元素
df_series.loc[pd.to_datetime('行名')] | 访问某一行的所有元素
df_series.iloc[0:2, 0:1] | 数字可以改变,代表按位置范围访问某几行某几列的元素
df_series.iloc[[0, 2], [0, 1]] | 数字可以改变,代表按位置访问指定的某行某列的元素(固定位置)
df_series.loc[df_series.index[[0, 2]], '列名'] | 数字可以改变,先用index把位置转换为行标签,再访问某一列的几行元素
df_series.iloc[[0, 2], df_series.columns.get_loc('列名')] | 同上
df_series.iloc[[0, 2], df_series.columns.get_indexer(['列名', '列名'])] | 访问指定列的几行元素

案例如下:


import pandas as pd
# Build a two-column DataFrame indexed by dates. The index is deliberately
# left in the given (unsorted) order.
dates = pd.DatetimeIndex(['2019-01-11', '2019-02-13', '2019-01-16'])
series_data = {
    'close': pd.Series([11.12, 11.13, 11.14], index=dates),
    'open': pd.Series([21.11, 21.32, 21.44], index=dates),
}
df_series = pd.DataFrame(series_data, columns=['close', 'open'])
print(df_series)
# Printed output:
#             close   open
# 2019-01-11  11.12  21.11
# 2019-02-13  11.13  21.32
# 2019-01-16  11.14  21.44

# A separate daily range, printed only for illustration; df_series keeps the
# index it was built with.
dates = pd.date_range('2019-01-11', freq='D', periods=4)
print(dates)

print("\\\\\\\\\\\\\\")

# Row labels, column labels and the raw values.
print(df_series.index)

print(df_series.columns)

print(df_series.values)
# Printed output:
# row index
# DatetimeIndex(['2019-01-11', '2019-02-13', '2019-01-16'], dtype='datetime64[ns]', freq=None)
# column index
# Index(['close', 'open'], dtype='object')
# values
# [[11.12 21.11]
#  [11.13 21.32]
#  [11.14 21.44]]

# Column access: by key and by attribute (both print the same Series).
print(df_series['close'])
print(df_series.close)
# Printed output (once per call):
# 2019-01-11    11.12
# 2019-02-13    11.13
# 2019-01-16    11.14
# Name: close, dtype: float64

# Slicing the frame directly selects rows by position.
print(df_series[0:1])
# Printed output:
#             close   open
# 2019-01-11  11.12  21.11

# .loc with a list of row labels and a list of column labels -> DataFrame.
print(df_series.loc[[pd.to_datetime('2019-01-16')], ['close', 'open']])
# Printed output:
#             close   open
# 2019-01-16  11.14  21.44

# .loc with all rows and a one-element column list -> one-column DataFrame.
print(df_series.loc[:, ['close']])

# Printed output:
#            close
# 2019-01-11  11.12
# 2019-02-13  11.13
# 2019-01-16  11.14

# The original literal nested single quotes inside a single-quoted string,
# which is a SyntaxError; the outer quotes are now double quotes.
print("访问'2019-01-16'这行的元素内容:")
# .loc with a single row label -> Series holding that row.
print(df_series.loc[pd.to_datetime('2019-01-16')])
# Printed output:
# close    11.14
# open     21.44
# Name: 2019-01-16 00:00:00, dtype: float64

print("以下是iloc的访问:")
# .iloc with positional slices: first two rows, first column.
print(df_series.iloc[0:2, 0:1])
# Printed output:
#            close
# 2019-01-11  11.12
# 2019-02-13  11.13

# .iloc with a row slice only: first row, all columns.
print(df_series.iloc[0:1])
# Printed output:
#             close   open
# 2019-01-11  11.12  21.11
print("//")

# .iloc with explicit position lists: rows 0 and 2, column 0.
print(df_series.iloc[[0, 2], [0]])

print('以下为index访问标签')

# Turn row positions into labels via .index, then use them with .loc.
print(df_series.index[[0, 2]])
print(df_series.loc[df_series.index[[0, 2]], 'open'])
# Printed output:
# DatetimeIndex(['2019-01-11', '2019-01-16'], dtype='datetime64[ns]', freq=None)
# 2019-01-11    21.11
# 2019-01-16    21.44
print('')

# Turn a column label into its position via get_loc, then use it with .iloc.
print(df_series.columns.get_loc('open'))
print(df_series.iloc[[0, 2], df_series.columns.get_loc('open')])
# Printed output of the .iloc lookup:
# 2019-01-11    21.11
# 2019-01-16    21.44
# Name: open, dtype: float64

print('/')
# get_indexer maps several column labels to positions at once.
print(df_series.columns.get_indexer(['open', 'close']))
print(df_series.iloc[[0, 2], df_series.columns.get_indexer(['open', 'close'])])

# Printed output:
# [1 0]
#              open  close
# 2019-01-11  21.11  11.12
# 2019-01-16  21.44  11.14

Original: https://blog.csdn.net/qq_45126531/article/details/124145832
Author: 大葱一根
Title: pandas数据分析

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/750823/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球