pandas基础_1

数据使用泰坦尼克数据

基本操作

读取训练集数据

df = pd.read_csv('train.csv')

常用的基本信息查询

df.head() #默认查看前5条,可以使用head(10)查看前10条
df.tail() #默认查看最后5行数据
df.info() #返回当前信息
df.index() #索引
df.columns #列名
df.dtype #类型
df.values #值
df.describe()
#有数值的列的count(),mean(),std(),min(),0.25,0.5,0.75,max()

取指定的数据,取得的数据为Series结构(DataFrame中的一行或者一列)

age = df['Age']

可以重新设置自己想要的索引

df = df.set_index('Name')

把名字设置为索引之后,可以根据索引提取想要的数据

age['Allen, Mr. William Henry']

索引结构

df['Age'] # 定位某一列
df[['Age','Fare']][:5] #定位两个列,注意要用[]来括住这两个列

loc 用label来去定位
iloc 用position来去定位

df.iloc[0]
df.iloc[0:5] #此时默认提出全部列
df.iloc[0:5,1:3] #提出0-4行,1-2列

df.set_index('Name')
df.loc['Allen, Mr. William Henry','Fare']
df.loc['Moran, Mr. James':'Andersson, Mr. Anders Johan',:]
df['Fare'] > 40 #大于40返回True,小于40返回False
df[df['Fare'] > 40] #返回大于40的行
df[df['Sex'] == 'male']
df.loc[df['Sex'] == 'male','Age'].mean() #求男性的平均年龄
(df['Age'] > 70).sum()

GROUP BY

df1 = pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],'value':[0,5,10,5,10,15,10,15,20]}
df.groupby('key').sum()

#还可以进行numpy里的操作
df1.groupby('key').aggregate(np.mean)

#对泰坦尼克数据根据性别计算年龄均值
df.groupby('Sex')['Age'].mean()

#根据性别计算幸存率
df.groupby('Sex')['Survived'].mean()

数值计算

#首先构造一个DataFrame
df2 = pd.DataFrame([[1,2,3],[4,5,6]])
df2.index=['a','b']
df2.columns=['A','B','C']
print(df2)

数值计算的一些基本操作: sum(), mean(), max(), min(), median()

print(df2.sum()) #df2.sum(axis = 0)
'''
A    5
B    7
C    9
'''
print(df2.sum(axis = 1)) #df2.sum(axis = 'columns)
'''
a    6
b    15
'''

二元统计

df = pd.read_csv('train.csv')

df.cov()
#返回特征之间的协方差

df.corr()
#返回特征之间的相关系数,若两个系数相关性强,则可以选择删除其中一个系数

df['Age'].value_counts() #统计所有频数
df['Age'].value_counts(ascending = True) #升序排列
#此时会统计每个数的频数,可以指定参数bins,限定范围计算频数
df['Age'].value_counts(ascending = True, bins = 5)
'''
(64.084, 80.0]       11
(48.168, 64.084]     69
(0.339, 16.336]     100
(32.252, 48.168]    188
(16.336, 32.252]    346
'''

df['Age'].count()
#返回不为0的列有多少个

对象操作

data = [10,11,12]
index = ['a','b','c']
s = pd.Series(data = data,index = index)
'''
a    10
b    11
c    12
'''

pandas的series类似表格中的一个列(column),类似一维数组,可以保存任何数据类型,由索引和列组成 pandas.Series(data,index,dtype,name,copy)
数值的修改:

s[0] #10
s[0:2]
'''
a    10
b    11
'''

mask = [True,False,True]
s[mask]
'''
a    10
c    12
'''

s.loc['b'] # 11
s.iloc[1] # 11

s1 = s.copy()
s1['a'] = 100
s1.replace(to_replace = 100, value = 101, inplace = False)
 #inplace指定为False,则只会展示结果,但不会改变s1,当inplace改为True时才会修改s1的值

索引的修改:

s1.index
#Index(['a', 'b', 'c'], dtype='object')
s1.index = ['a','b','d']
#Index(['a', 'b', 'd'], dtype='object')

s1.rename(index = {'a':'A'},inplace=True)
#Index(['A', 'b', 'c'], dtype='object')

增加:

s2 = pd.Series([100,110],index = ['h','k'])
s1 = s1.append(s2)
s1['j'] = 500

s3 = s1.append(s2,ignore_index=False) #不会生成新索引
s3 = s1.append(s2,ignore_index=True) #自动生成新索引
print(s3.index)
#RangeIndex(start=0, stop=5, step=1)

删除:

del s1['A']
s1.drop(['b','c'],inplace = True)
data =[[1,2,3],[4,5,6]]
index=['a','b']
columns=['A','B','C']
df = pd.DataFrame(data = data, index = index, columns = columns)

查操作是类似的 loc()iloc()
修改:

df['a']['A'] = 150 #改值
df.index = ['f','g'] #改索引

增加:

df.loc['c'] = [1,2,3] #增加一行

#增加多行
data =[[1,2,3],[4,5,6]]
index=['j','h']
columns=['A','B','C']
df2 = pd.DataFrame(data = data, index = index, columns = columns)

df3 = pd.concat([df,df2])
#还可以指定轴
df3 = pd.concat([df,df2],axis = 1)

#增加列
df2['Tang'] = [10,11]

df4 = pd.DataFrame([[10,11].[12,13]],index=['j','h'],columns=['D','E']) #保证index相同才能新增多列
df5 = pd.concat([df2,df4],axis = 1)

删除:

df5.drop(['j'],axis=0,inplace = True) #删除单行
del df5['Tang'] #删除单列
df5.drop(['A','B','C'],axis = 1,inplace = True) # 删除多列

merge函数

left = pd.DataFrame({'key':['k0','k1','k2','k3'],
                     'A':['A0','A1','A2','A3'],
                     'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['k0','k1','k2','k3'],
                     'C':['C0','C1','C2','C3'],
                     'D':['D0','D1','D2','D3']})
pd.merge(left,right,on='key') #on指定以什么键合并
'''
    A   B   key C   D
0   A0  B0  K0  C0  D0
1   A1  B1  K1  C1  D1
2   A2  B2  K2  C2  D2
3   A3  B3  K3  C3  D3
'''

合并时有重复的列,但并不是主键,则会生成_x,_y

left = pd.DataFrame({'key1':['k0','k1','k2','k3'],
                     'key2':['k0','k1','k2','k3'],
                     'A':['A0','A1','A2','A3'],
                     'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['k0','k1','k2','k3'],
                     'key2':['k0','k1','k2','k3'],
                     'C':['C0','C1','C2','C3'],
                     'D':['D0','D1','D2','D3']})
print(pd.merge(left,right,on='key1'))
'''
    key1    key2_x  A       B       key2_y  C   D
0   k0      k0      A0      B0      k0      C0  D0
1   k1      k1      A1      B1      k1      C1  D1
2   k2      k2      A2      B2      k2      C2  D2
3   k3      k3      A3      B3      k3      C3  D3
'''

连接的不同类型

left = pd.DataFrame({'key1':['k0','k1','k2','k3'],
                     'key2':['k0','k1','k2','k3'],
                     'A':['A0','A1','A2','A3'],
                     'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['k0','k1','k2','k3'],
                     'key2':['k0','k1','k2','k4'],
                     'C':['C0','C1','C2','C3'],
                     'D':['D0','D1','D2','D3']})
pd.merge(left,right,on=['key1','key2'])
#此时只有0,1,2三列,默认为内连接,指定参数how决定什么连接方式,how的取值有inner(默认),outer,left,right
pd.merge(left,right,on=['key1','key2'],how = 'outer')
#指定外连接,此时每一列都会出现,但连接不到的地方会出现NaN
pd.merge(left,right,on=['key1','key2'],how = 'outer',indicator = True) #指定器indicator会在每行的最后告诉你是both、left_only、right_only

显示设置

get:查询,set:设置
pd.get_option('display.max_rows')
pd.set_option('display.max_rows',6) #设置显示出来的行数为6行
pd.set_option('display.max_columns',6) #设置显示出来的列数为6列
pd.set_option('display.max_colwodth',100) #设置字符串的长度显示
pd.set_option('display.precision',20) #设置小数的精度

pivot数据透视表

example = pd.DataFrame({'Month':['Jan','Jan','Jan','Feb','Feb','Feb','Mar','Mar','Mar',],
                     'Category':['cola','tea','coffee','cola','tea','coffee','cola','tea','coffee'],
                     'Amount':[74,235,32,46,346,32,57,96,76]})
example_pivot = example.pivot(index = 'Category',columns= 'Month',values='Amount')
#这样使用要求index+columns可以生成唯一字段
print(example_pivot)
#index为透视表的行,columns为列 values是统计值
'''
Month     Feb  Jan  Mar
Category
coffee     32   32   76
cola       46   74   57
tea       346  235   96
'''
print(example_pivot.sum(axis=1))
'''
Category
coffee    140
cola      177
tea       677
'''
print(example_pivot.sum(axis=0))
'''
Month
Feb    424
Jan    341
Mar    229
'''

数据透视表pivot在实际数据中的使用方式:

df = pd.read_csv('train.csv')
df_pivot = df.pivot_table(index = 'Sex',columns='Pclass',values='Fare')
print(df_pivot)
#默认求平均值,可以设置aggfunc参数,求max(),min(),count()等
'''
Pclass           1          2          3
Sex
female  106.125798  21.970121  16.118810
male     67.226127  19.741782  12.661633
'''
df_pivot = df.pivot_table(index = 'Sex',columns='Pclass',values='Fare',aggfunc='max')
print(df_pivot)
'''
Pclass         1     2      3
Sex
female  512.3292  65.0  69.55
male    512.3292  73.5  69.55
'''

#当数据透视表统计count时,还可以使用crosstab
df_crosstab = pd.crosstab(index=df['Sex'],columns=df['Pclass'])

df_pivot = df.pivot_table(index = 'Pclass',columns='Sex',values='Survived',aggfunc='mean')
print(df_pivot)
'''
Sex       female      male
Pclass
1       0.968085  0.368852
2       0.921053  0.157407
3       0.500000  0.135447
'''
df['Underaged'] = df['Age'] < 18
df_pivot = df.pivot_table(index = 'Underaged',columns='Sex',values='Survived',aggfunc='mean')
print(df_pivot)
'''
Sex          female      male
Underaged
False      0.752896  0.165703
True       0.690909  0.396552
'''

时间操作

import datetime
dt = datetime.datetime(year=2017,month=11,day=24,hour=10,minute=30)
print(dt) #2017-11-24 10:30:00

在pandas中的时间操作

ts = pd.Timestamp('2017-11-24') #&#x65F6;&#x95F4;&#x6233;
ts #Timestamp('2017-11-24 00:00:00')
ts.month # 11
ts.day #24
ts +pd.Timedelta('5 days') #Timestamp('2017-11-29 00:00"00')
pd.to_datetime('2017-11-24')
pd.to_datetime('24/11/2017')

#&#x8BFB;&#x5165;&#x8868;&#xFF0C;&#x5BF9;&#x8868;&#x4E2D;&#x65F6;&#x95F4;&#x8FDB;&#x884C;&#x8F6C;&#x6362;

data = pd.resd_csv('time.csv')
data['Time'] = pd.to_datetime(data['Time']) #&#x628A;&#x539F;&#x672C;&#x7684;Time&#x5217;&#x8F6C;&#x6362;&#x4E3A;&#x65F6;&#x95F4;&#x683C;&#x5F0F;
data = data.set_index('Time') #&#x628A;Time&#x8BBE;&#x7F6E;&#x4E3A;&#x7D22;&#x5F15;

data = pd.read_csv('time.csv',index_col=0,parse_dates=True) #&#x4E5F;&#x53EF;&#x4EE5;&#x5BFC;&#x5165;&#x7684;&#x65F6;&#x5019;&#x4E00;&#x6B65;&#x8BBE;&#x7F6E;&#x597D;

'''

                        L06_347         LS06_347    LS06_348    month
Time
2009-01-01 00:00:00     0.137417        0.097500    0.016833    1
2009-01-01 03:00:00     0.131250        0.088833    0.016417    1
2009-01-01 06:00:00     0.113500        0.091250    0.016750    1
'''

#&#x53D6;&#x6570;&#x636E;
data[pd.Timestamp('2012-01-01 09:00'):pd.Timestamp('2012-01-01 19:00')] #&#x4E0D;&#x52A0;Timestamp&#x4E5F;&#x53EF;&#x4EE5;&#x53D6;&#x51FA;&#x6765;

#&#x60F3;&#x76F4;&#x63A5;&#x53D6;2013&#x5E74;&#x7684;&#x6570;&#x636E;
data['2013']
#&#x53D6;2012&#x5E74;1&#x6708;&#x5230;3&#x6708;
data['2012-01':'2012-03']
#&#x53E6;&#x4E00;&#x79CD;&#x53D6;&#x6CD5;&#xFF0C;&#x53D6;&#x6240;&#x6709;1&#x6708;&#x7684;&#x6570;&#x636E;&#xFF08;12&#x5E74;1&#x6708;&#x4EFD;&#x3001;13&#x5E74;1&#x6708;&#x4EFD;&#xFF09;
data[data.index.month == 1]
data[(data.index.hour > 8) & (data.index.hour <12)] #所有8-12点 #取时间还可以用between_time,这个就包含两头的端点 data.between_time('08:00','12:00') < code></12)]>

时间序列重采样

#&#x8BA1;&#x7B97;&#x6BCF;&#x5929;&#x7684;&#x5747;&#x503C;
data.resample('D').mean().head() #&#x591A;&#x5929;&#x6307;&#x5B9A;3D&#x5373;&#x53EF;
'''
                L06_347         LS06_347    LS06_348    month
Time
2009-01-01      0.125010        0.092281    0.016635    1
2009-01-02      0.124146        0.095781    0.016406    1
'''
data.resample('M').mean().plot() #&#x53EF;&#x4EE5;&#x76F4;&#x63A5;&#x753B;&#x56FE;

常用操作

按照指定顺序进行排序

data = pd.DataFrame({'group':['a','a','a','b','b','b','c','c','c',],
                     'data':[4,3,2,1,12,3,4,5,7]})
data.sort_values(by=['group','data'],ascending=[False,True],inplace=True)
#group&#x964D;&#x5E8F;&#xFF0C;&#x518D;data&#x5347;&#x5E8F;
print(data)
'''
   data group
6     4     c
7     5     c
8     7     c
3     1     b
5     3     b
4    12     b
2     2     a
1     3     a
0     4     a
'''

删除重复值

data1 = pd.DataFrame({'k1':['one']*3+['two']*4,
                      'k2':[3,2,1,3,3,4,4]})
data1 = data1.drop_duplicates() #&#x53BB;&#x6389;&#x91CD;&#x590D;&#x7684;&#x503C;
data1 = data1.drop_duplicates(subset='k1') #&#x6307;&#x5B9A;&#x67D0;&#x4E00;&#x5217;&#x53BB;&#x91CD;
'''
    k1  k2
0  one   3
3  two   3
'''
print(data1)

map映射

data = pd.DataFrame({'food':['A1','A2','B1','B2','B3','C1','C2'],
                     'data':[1,2,3,4,5,6,7]})
food_map = {'A1':'A','A2':'A','B1':'B','B2':'B','B3':'B','C1':'C','C2':'C'}
data['food_map'] = data['food'].map(food_map)
#map&#x6620;&#x5C04;&#xFF0C;map&#x5FC5;&#x987B;&#x662F;&#x5B57;&#x5178;
print(data)
'''
  food  data food_map
0   A1     1        A
1   A2     2        A
2   B1     3        B
3   B2     4        B
4   B3     5        B
5   C1     6        C
6   C2     7        C
'''

添加比率的另一种写法,以及去除某列

df = pd.DataFrame({'data1':np.random.randn(3),
                   'data2':np.random.randn(3)})
df = df.assign(ration = df['data1']/df['data2'])
print(df)
'''
      data1     data2    ration
0  1.764052  2.240893  0.787209
1  0.400157  1.867558  0.214268
2  0.978738 -0.977278 -1.001494
'''
df.drop('ration',axis='columns',inplace=True)
print(df)
'''
      data1     data2
0  1.764052  2.240893
1  0.400157  1.867558
2  0.978738 -0.977278
'''
age = [13,23,35,23,36,24,12,17,22,30]
bins = [10,20,30,40]
bins_res = pd.cut(age,bins)
print(bins_res)
#[(10, 20], (20, 30], (30, 40], (20, 30], (30, 40], (20, 30], (10, 20], (10, 20], (20, 30], (20, 30]]
#print(bins_res.labels)
print(pd.value_counts(bins_res))
'''
(20, 30]    5
(10, 20]    3
(30, 40]    2
'''
#&#x8FD8;&#x53EF;&#x4EE5;&#x6307;&#x5B9A;&#x7EC4;&#x540D;
group_name = ['10&#x4EE3;','20&#x4EE3;','30&#x4EE3;']
print(pd.value_counts(pd.cut(age,bins,labels=group_name)))
'''
20&#x4EE3;    5
10&#x4EE3;    3
30&#x4EE3;    2
'''

查看空值

df = pd.DataFrame([range(3),[0,np.nan,0],[0,0,np.nan],range(3)])
print(df.isnull())
'''
       0      1      2
0  False  False  False
1  False   True  False
2  False  False   True
3  False  False  False
'''
print(df.isnull().any())#&#x8FD8;&#x53EF;&#x4EE5;&#x6307;&#x5B9A;axis=1
'''
0    False
1     True
2     True
'''
#&#x627E;&#x5230;&#x7F3A;&#x5931;&#x503C;&#x6240;&#x5728;&#x4F4D;&#x7F6E;
print(df[df.isnull().any(axis = 1)])
df = df.fillna(5) #&#x7528;5&#x6765;&#x586B;&#x5145;&#x7F3A;&#x5931;&#x503C;
print(df)

Original: https://blog.csdn.net/qq_45003520/article/details/123551945
Author: indigo女孩
Title: pandas基础_1

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/698261/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球