Pandas(数据分析处理库)—小练习

import pandas as pd
import numpy as np
# Sample records used by the exercises: one entry per animal, with NaN
# marking an unknown age.
data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}

# Row labels for the frame; later exercises address rows by these letters
# (e.g. df.loc['f', 'age']).
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

# Build the frame the following exercises operate on.  Without this line
# `df` is undefined when the first exercise that uses it runs.
df = pd.DataFrame(data, index=labels)
1,显示当前版本信息
# Print the installed pandas version together with its dependency versions.
pd.show_versions()
"""
INSTALLED VERSIONS
……(版本信息输出从略;原文第 2、3 步在转载时缺失,以下几行是 df.info() 输出的片段)
 0   animal    10 non-null     object
 1   age       8 non-null      float64
 2   visits    10 non-null     int64
 3   priority  10 non-null     object
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes
"""
4,索引
# Show the first four rows by position.
df.head(4)

Pandas(数据分析处理库)---小练习
5,指定选择数据范围
# Keep only the rows whose visit count is greater than one.
df.loc[df['visits'].gt(1)]

Pandas(数据分析处理库)---小练习
6,查看缺失值
# Select the rows where the age is missing (NaN).
df.loc[df['age'].isna()]

Pandas(数据分析处理库)---小练习
7,通过给定范围查找某一属性
# Cats younger than three; rows with a missing age never satisfy `< 3`.
df.loc[df['animal'].eq('cat') & df['age'].lt(3)]

Pandas(数据分析处理库)---小练习
8,改变数值
# Overwrite the age stored in row 'f', then re-run the "cats younger than
# three" filter to see the effect of the change.
df.at['f', 'age'] = 1.5
df.loc[df['animal'].eq('cat') & df['age'].lt(3)]

Pandas(数据分析处理库)---小练习
9,groupby求均值
# Average age per animal type.
df['age'].groupby(df['animal']).mean()
"""
animal
cat      2.333333
dog      5.000000
snake    2.500000
Name: age, dtype: float64
"""
10,计算相同属性值的个数
# Count how many rows there are for each animal type.
df['animal'].value_counts()
"""
cat      4
dog      4
snake    2
Name: animal, dtype: int64
"""
11,属性值进行映射
# Translate the 'yes' / 'no' strings in the priority column into booleans.
yes_no_to_bool = {'yes': True, 'no': False}
df['priority'] = df['priority'].map(yes_no_to_bool)
df.head()

Pandas(数据分析处理库)---小练习
12,属性值进行替换
# Rename every 'snake' entry in the animal column.
df['animal'] = df['animal'].replace({'snake': 'tangyudi'})
df.head()

Pandas(数据分析处理库)---小练习
13,数据透视表
# Mean age for every (animal, visits) combination: animals become the rows,
# visit counts become the columns.
pd.pivot_table(
    df,
    values='age',
    index='animal',
    columns='visits',
    aggfunc='mean',
)

Pandas(数据分析处理库)---小练习
14,提取均值组成新的数据
# Replace df with a 5x3 frame of uniform random numbers in [0, 1).
random_values = np.random.random(size=(5, 3))
df = pd.DataFrame(random_values)
df.head()

Pandas(数据分析处理库)---小练习
# Subtract each row's own mean from every value in that row.
df.subtract(df.mean(axis='columns'), axis='index')

Pandas(数据分析处理库)---小练习
15,统计不重复(唯一)行的个数
# Number of rows that have no duplicate anywhere in the frame
# (keep=False flags every member of a duplicated group).
(~df.duplicated(keep=False)).sum()
"""
5
"""
# Same count, obtained by discarding every duplicated row and measuring
# what is left.
df.drop_duplicates(keep=False).shape[0]
"""
5
"""
16,给定数据,分别求滑动窗口的均值(加入补0操作)
# Twelve observations split over groups 'a' and 'b'; NaN marks a missing
# value.
group_labels = list('aabbabbbabab')
values = [1, 2, 3, np.nan, 2, 3, np.nan, 1, 7, 3, np.nan, 8]
df = pd.DataFrame({'group': group_labels, 'value': values})
df.head(12)

Pandas(数据分析处理库)---小练习
# Per-group rolling mean over a window of up to three rows.  Missing values
# are replaced by 0 first, so a filled-in zero counts as an observation in
# both the sum and the count.
g1 = df.groupby(['group'])['value']  # NaN-preserving grouping; not used below
g2 = df.fillna(0).groupby(['group'])['value']
rolling_sum = g2.rolling(3, min_periods=1).sum()
rolling_count = g2.rolling(3, min_periods=1).count()
s = rolling_sum / rolling_count
# Drop the group level and restore the original row order.
s.reset_index(level=0, drop=True).sort_index()
"""
0     1.000000
1     1.500000
2     3.000000
3     1.500000
4     1.666667
5     2.000000
6     1.000000
7     1.333333
8     3.666667
9     1.333333
10    3.000000
11    4.000000
Name: value, dtype: float64
"""
17,指定时间序列进行计算
# One random value per calendar day from 2022-05-04 through 2022-10-14.
dt = pd.date_range('2022-05-04', '2022-10-14', freq='D')
s = pd.Series(np.random.rand(dt.size), index=dt)
s.head(5)
"""
2022-05-04    0.456731
2022-05-05    0.525470
2022-05-06    0.491474
2022-05-07    0.468223
2022-05-08    0.629639
Freq: D, dtype: float64
"""
# Total of the values that fall on a Wednesday (Monday is 0, so 2 is Wednesday).
s.loc[s.index.dayofweek == 2].sum()
"""
11.524919039461853
"""
18,重采样
# Downsample the daily series to one mean per calendar month, labelled by
# the month's last day.  An explicit MonthEnd offset replaces the 'M' string
# alias, which pandas 2.2 deprecated in favour of 'ME'; the offset object is
# accepted by both older and newer pandas releases.
s.resample(pd.offsets.MonthEnd()).mean()
"""
2022-05-31    0.487334
2022-06-30    0.508629
2022-07-31    0.522216
2022-08-31    0.500842
2022-09-30    0.555139
2022-10-31    0.449785
Freq: M, dtype: float64
"""
19,对缺失值进行插值填充(interpolate)
# Deliberately messy flight records used by the clean-up exercises that
# follow: inconsistent casing, missing flight numbers, list-valued delays
# and airline names wrapped in stray punctuation.
flight_records = {
    'From_To': [
        'LoNDon_paris',
        'MAdrid_miLAN',
        'londON_StockhOlm',
        'Budapest_PaRis',
        'Brussels_londOn',
    ],
    'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
    'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
    'Airline': [
        'KLM(!)',
        ' (12)',
        '(British Airways. )',
        '12. Air France',
        '"Swiss Air"',
    ],
}
df = pd.DataFrame(flight_records)
df.head()

Pandas(数据分析处理库)---小练习
# Fill the missing flight numbers by interpolating between their neighbours,
# then store the column as integers.
interpolated = df['FlightNumber'].interpolate()
df['FlightNumber'] = interpolated.astype(int)
df.head()

Pandas(数据分析处理库)---小练习
20,将From_To这一列展开成两个特征
# Split "origin_destination" on the underscore into two columns.
temp = df['From_To'].str.split('_', expand=True)
temp.columns = ['From', 'To']

# Normalise the casing: first letter upper-case, the rest lower-case.
for city_column in ('From', 'To'):
    temp[city_column] = temp[city_column].str.capitalize()

# Attach the two new columns to the original frame.
df = df.join(temp)
df.head()

Pandas(数据分析处理库)---小练习
21,删除From_To这一列
# The combined column is no longer needed once From / To exist.
df = df.drop(columns='From_To')
df.head()

Pandas(数据分析处理库)---小练习
22,去掉airline中多余的字符
# Keep only the first run of letters/whitespace in each airline name and trim
# the surrounding blanks.  The pattern is a raw string so that `\s` reaches
# the regex engine as written instead of being an invalid string escape
# (which current Python versions warn about).
df['Airline'] = df['Airline'].str.extract(r'([a-zA-Z\s]+)', expand=False).str.strip()
df.head()

Pandas(数据分析处理库)---小练习
23,将RecentDelays中的数据分开写
# Spread each list of delays over its own columns, one column per list
# position, and name the columns delay_1, delay_2, ...
delays = df['RecentDelays'].apply(pd.Series)
delays.columns = [
    'delay_{}'.format(position)
    for position, _ in enumerate(delays.columns, start=1)
]
delays

Pandas(数据分析处理库)---小练习
24,多重索引
# Build a two-level index from every (letter, number) pair and attach one
# random value to each of the 30 combinations.
letters = ['A', 'B', 'C']
numbers = list(range(10))
mi = pd.MultiIndex.from_product([letters, numbers])
s = pd.Series(np.random.rand(30), index=mi)
s
"""
A  0    0.773126
   1    0.030788
   2    0.440044
   3    0.751953
   4    0.073763
   5    0.750470
   6    0.422485
   7    0.256091
   8    0.867278
   9    0.167302
B  0    0.617402
   1    0.884274
   2    0.745445
   3    0.017106
   4    0.289594
   5    0.346788
   6    0.430361
   7    0.900921
   8    0.933771
   9    0.550062
C  0    0.343099
   1    0.767047
   2    0.963959
   3    0.971750
   4    0.321016
   5    0.544492
   6    0.649962
   7    0.934818
   8    0.266529
   9    0.155592
dtype: float64
"""
25,定位数据
# Outer labels up to and including 'B', inner labels from 5 onwards.
idx = pd.IndexSlice
s.loc[idx[:'B', 5:]]
"""
A  5    0.750470
   6    0.422485
   7    0.256091
   8    0.867278
   9    0.167302
B  5    0.346788
   6    0.430361
   7    0.900921
   8    0.933771
   9    0.550062
dtype: float64
"""
26,按索引计算
# Sum the values that share the same inner (level-1) index label.
# Series.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level first is the supported equivalent.
s.groupby(level=1).sum()
"""
0    1.733627
1    1.682110
2    2.149448
3    1.740809
4    0.684373
5    1.641749
6    1.502808
7    2.091831
8    2.067578
9    0.872956
dtype: float64
"""
27,变换索引
# Exchange the two index levels so the number comes first and the letter
# second; the row order itself is left as it was.
new = s.swaplevel(i=0, j=1)
new
"""
0  A    0.773126
1  A    0.030788
2  A    0.440044
3  A    0.751953
4  A    0.073763
5  A    0.750470
6  A    0.422485
7  A    0.256091
8  A    0.867278
9  A    0.167302
0  B    0.617402
1  B    0.884274
2  B    0.745445
3  B    0.017106
4  B    0.289594
5  B    0.346788
6  B    0.430361
7  B    0.900921
8  B    0.933771
9  B    0.550062
0  C    0.343099
1  C    0.767047
2  C    0.963959
3  C    0.971750
4  C    0.321016
5  C    0.544492
6  C    0.649962
7  C    0.934818
8  C    0.266529
9  C    0.155592
dtype: float64
"""

Original: https://blog.csdn.net/qq_41264055/article/details/124566915
Author: beyond谚语
Title: Pandas(数据分析处理库)—小练习

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/742317/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球