import pandas as pd
import numpy as np
# Raw records for the exercises: one list per column, ten rows in total.
# 'age' contains two missing values (np.nan).
data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}
# Row labels; later statements address rows by these letters (e.g. row 'f').
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
# Build the frame that the following `df...` statements operate on. Without
# this line `df` is undefined when the first exercise touches it.
df = pd.DataFrame(data, index=labels)
1,显示当前版本信息
# Print the version report for the installed pandas (the "INSTALLED VERSIONS" listing).
pd.show_versions()
"""
INSTALLED VERSIONS
0 animal 10 non-null object
1 age 8 non-null float64
2 visits 10 non-null int64
3 priority 10 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes
"""
4,索引
# Positional selection of the first four rows.
df.head(4)
5,指定选择数据范围
# Keep only the rows whose 'visits' value is greater than 1.
df.loc[df['visits'].gt(1)]
6,查看缺失值
# Rows where 'age' is missing.
df.loc[df['age'].isna()]
7,通过给定范围查找某一属性
# Rows that are cats AND younger than 3 (missing ages compare as False).
df.loc[df['animal'].eq('cat') & df['age'].lt(3)]
8,改变数值
# Overwrite the age stored in row 'f', then repeat the young-cat filter
# so the effect of the assignment is visible.
df.loc['f', 'age'] = 1.5
df.loc[df['animal'].eq('cat') & df['age'].lt(3)]
9,groupby求均值
# Mean age per animal kind.
df['age'].groupby(df['animal']).mean()
"""
animal
cat 2.333333
dog 5.000000
snake 2.500000
Name: age, dtype: float64
"""
10,计算相同属性值的个数
# Number of rows for each distinct value in the 'animal' column.
df.loc[:, 'animal'].value_counts()
"""
cat 4
dog 4
snake 2
Name: animal, dtype: int64
"""
11,属性值进行映射
# Translate the 'yes'/'no' strings in 'priority' into booleans.
df['priority'] = df['priority'].map(dict(yes=True, no=False))
df.head(5)
12,属性值进行替换
# Substitute every 'snake' entry in 'animal' with 'tangyudi'.
df['animal'] = df['animal'].replace({'snake': 'tangyudi'})
df.head(5)
13,数据透视表
# Pivot: one row per animal, one column per visit count, cells hold mean age.
pd.pivot_table(
    df,
    values='age',
    index='animal',
    columns='visits',
    aggfunc='mean',
)
14,每行数据减去该行的均值,组成新的数据
# Fresh 5x3 frame of uniform random numbers in [0, 1).
df = pd.DataFrame(np.random.random((5, 3)))
df.head(5)
# Subtract each row's own mean from every value in that row.
df.sub(df.mean(axis='columns'), axis='index')
15,统计不重复的行数
# Count the rows that have no duplicate anywhere in the frame
# (keep=False marks every member of a duplicated group).
(~df.duplicated(keep=False)).sum()
"""
5
"""
# Same count, obtained by dropping all duplicated rows and measuring what is left.
df.drop_duplicates(keep=False).shape[0]
"""
5
"""
16,给定数据,分别求滑动窗口的均值(加入补0操作)
# Twelve observations split across groups 'a' and 'b'; some values are missing.
df = pd.DataFrame({
    'group': list('aabbabbbabab'),
    'value': [1, 2, 3, np.nan, 2, 3, np.nan, 1, 7, 3, np.nan, 8],
})
df.head(12)
# Grouped view of the raw values (not referenced by the statements below).
g1 = df.groupby(['group'])['value']
# Grouped view after replacing missing values with 0.
g2 = df.fillna(0).groupby(['group'])['value']
# Per-group rolling window of up to three rows: windowed sum divided by the
# windowed count, both taken on the zero-filled data.
window_total = g2.rolling(3, min_periods=1).sum()
window_size = g2.rolling(3, min_periods=1).count()
s = window_total / window_size
# Drop the group level and restore the original row order for display.
s.reset_index(level=0, drop=True).sort_index()
"""
0 1.000000
1 1.500000
2 3.000000
3 1.500000
4 1.666667
5 2.000000
6 1.000000
7 1.333333
8 3.666667
9 1.333333
10 3.000000
11 4.000000
Name: value, dtype: float64
"""
17,指定时间序列进行计算
# Daily timestamps from 2022-05-04 through 2022-10-14 (both ends included).
dt = pd.date_range('2022-05-04', '2022-10-14', freq='D')
# One uniform random value per day, indexed by date.
s = pd.Series(np.random.rand(len(dt)), index=dt)
s.head(5)
"""
2022-05-04 0.456731
2022-05-05 0.525470
2022-05-06 0.491474
2022-05-07 0.468223
2022-05-08 0.629639
Freq: D, dtype: float64
"""
# Total of the values that fall on a Wednesday (weekday number 2, Monday = 0).
s.loc[s.index.dayofweek == 2].sum()
"""
11.524919039461853
"""
18,重采样
# Average the daily values within each calendar month (month-end labels).
# NOTE(review): recent pandas releases reportedly prefer the alias 'ME' over
# 'M' for month-end resampling; confirm against the installed version.
s.resample(rule='M').mean()
"""
2022-05-31 0.487334
2022-06-30 0.508629
2022-07-31 0.522216
2022-08-31 0.500842
2022-09-30 0.555139
2022-10-31 0.449785
Freq: M, dtype: float64
"""
19,对缺失值进行线性插值填充
# Deliberately messy flight records used by the clean-up exercises.
df = pd.DataFrame({
    'From_To': [
        'LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm',
        'Budapest_PaRis', 'Brussels_londOn',
    ],
    'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
    'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
    'Airline': [
        'KLM(!)', ' (12)', '(British Airways. )',
        '12. Air France', '"Swiss Air"',
    ],
})
df.head(5)
# Fill the gaps in FlightNumber by linear interpolation between neighbours,
# then turn the now-complete column into integers.
df['FlightNumber'] = df['FlightNumber'].interpolate(method='linear')
df['FlightNumber'] = df['FlightNumber'].astype(int)
df.head(5)
20,将From_To这一列展开成两个特征
# Break 'From_To' at the underscore into two separate columns.
temp = df['From_To'].str.split('_', expand=True)
temp.columns = ['From', 'To']
# Normalise the casing of both city columns (first letter upper, rest lower).
temp = temp.apply(lambda column: column.str.capitalize())
# Attach the two new columns to the main frame by index.
df = df.join(temp)
df.head(5)
21,删除From_To这一列
# Remove the 'From_To' column from the frame.
df = df.drop(columns='From_To')
df.head(5)
22,去掉airline中多余的字符
# Keep only the first run of letters/whitespace in each airline name and trim
# surrounding blanks. The pattern is a raw string so that `\s` reaches the
# regex engine as written instead of being an invalid string escape.
df['Airline'] = df['Airline'].str.extract(r'([a-zA-Z\s]+)', expand=False).str.strip()
df.head()
23,将RecentDelays中的数据分开写
# Spread each list in 'RecentDelays' across columns, one list element per column.
delays = df['RecentDelays'].apply(pd.Series)
# Name the resulting columns delay_1, delay_2, ... in positional order.
delays.columns = [
    'delay_{}'.format(position)
    for position, _ in enumerate(delays.columns, start=1)
]
delays
24,多重索引
# Outer level: three letters; inner level: the integers 0..9.
letters = list('ABC')
numbers = [*range(10)]
# Cartesian product of both levels gives 30 (letter, number) index entries.
mi = pd.MultiIndex.from_product((letters, numbers))
# One uniform random value per index entry.
s = pd.Series(np.random.rand(30), index=mi)
s
"""
A 0 0.773126
1 0.030788
2 0.440044
3 0.751953
4 0.073763
5 0.750470
6 0.422485
7 0.256091
8 0.867278
9 0.167302
B 0 0.617402
1 0.884274
2 0.745445
3 0.017106
4 0.289594
5 0.346788
6 0.430361
7 0.900921
8 0.933771
9 0.550062
C 0 0.343099
1 0.767047
2 0.963959
3 0.971750
4 0.321016
5 0.544492
6 0.649962
7 0.934818
8 0.266529
9 0.155592
dtype: float64
"""
25,定位数据
# Outer labels up to and including 'B', inner labels from 5 onward.
s.loc[(slice(None, 'B'), slice(5, None))]
"""
A 5 0.750470
6 0.422485
7 0.256091
8 0.867278
9 0.167302
B 5 0.346788
6 0.430361
7 0.900921
8 0.933771
9 0.550062
dtype: float64
"""
26,按索引计算
# Sum the values that share the same second-level (numeric) index label.
# Grouping by the level replaces the `level=` keyword of Series.sum, which
# current pandas no longer accepts; the per-label totals are the same.
s.groupby(level=1).sum()
"""
0 1.733627
1 1.682110
2 2.149448
3 1.740809
4 0.684373
5 1.641749
6 1.502808
7 2.091831
8 2.067578
9 0.872956
dtype: float64
"""
27,变换索引
# Exchange the two index levels so the number comes first and the letter second.
new = s.swaplevel(i=0, j=1)
new
"""
0 A 0.773126
1 A 0.030788
2 A 0.440044
3 A 0.751953
4 A 0.073763
5 A 0.750470
6 A 0.422485
7 A 0.256091
8 A 0.867278
9 A 0.167302
0 B 0.617402
1 B 0.884274
2 B 0.745445
3 B 0.017106
4 B 0.289594
5 B 0.346788
6 B 0.430361
7 B 0.900921
8 B 0.933771
9 B 0.550062
0 C 0.343099
1 C 0.767047
2 C 0.963959
3 C 0.971750
4 C 0.321016
5 C 0.544492
6 C 0.649962
7 C 0.934818
8 C 0.266529
9 C 0.155592
dtype: float64
1
"""
Original: https://blog.csdn.net/qq_41264055/article/details/124566915
Author: beyond谚语
Title: Pandas(数据分析处理库)—小练习
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/742317/
转载文章受原作者版权保护。转载请注明原作者出处!