数据使用泰坦尼克数据
基本操作
读取训练集数据
df = pd.read_csv('train.csv')
常用的基本信息查询
df.head() #默认查看前5条,可以使用head(10)查看前10条
df.tail() #默认查看最后5行数据
df.info() #返回当前信息
df.index #索引(index是属性,不加括号)
df.columns #列名
df.dtypes #各列的数据类型
df.values #值
df.describe()
#有数值的列的count(),mean(),std(),min(),0.25,0.5,0.75,max()
取指定的数据,取得的数据为Series结构(DataFrame中的一行或者一列)
age = df['Age']
可以重新设置自己想要的索引
df = df.set_index('Name')
把名字设置为索引之后,可以根据索引提取想要的数据
age['Allen, Mr. William Henry']
索引结构
df['Age'] # 定位某一列
df[['Age','Fare']][:5] #定位两个列,注意要用[]来括住这两个列
loc
用label来去定位
iloc
用position来去定位
df.iloc[0]
df.iloc[0:5] #此时默认提出全部列
df.iloc[0:5,1:3] #提出0-4行,1-2列
df.set_index('Name')
df.loc['Allen, Mr. William Henry','Fare']
df.loc['Moran, Mr. James':'Andersson, Mr. Anders Johan',:]
df['Fare'] > 40 #大于40返回True,小于等于40返回False
df[df['Fare'] > 40] #返回大于40的行
df[df['Sex'] == 'male']
df.loc[df['Sex'] == 'male','Age'].mean() #求男性的平均年龄
(df['Age'] > 70).sum()
GROUP BY
df1 = pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],'value':[0,5,10,5,10,15,10,15,20]})
df1.groupby('key').sum()
#还可以进行numpy里的操作
df1.groupby('key').aggregate(np.mean)
#对泰坦尼克数据根据性别计算年龄均值
df.groupby('Sex')['Age'].mean()
#根据性别计算幸存率
df.groupby('Sex')['Survived'].mean()
数值计算
#首先构造一个DataFrame
df2 = pd.DataFrame([[1,2,3],[4,5,6]])
df2.index=['a','b']
df2.columns=['A','B','C']
print(df2)
数值计算的一些基本操作: sum(), mean(), max(), min(), median() 等
print(df2.sum()) #df2.sum(axis = 0)
'''
A 5
B 7
C 9
'''
print(df2.sum(axis = 1)) #df2.sum(axis = 'columns')
'''
a 6
b 15
'''
二元统计
df = pd.read_csv('train.csv')
df.cov()
#返回特征之间的协方差
df.corr()
#返回特征之间的相关系数,若两个特征相关性强,则可以选择删除其中一个特征
df['Age'].value_counts() #统计所有频数
df['Age'].value_counts(ascending = True) #升序排列
#此时会统计每个数的频数,可以指定参数bins,限定范围计算频数
df['Age'].value_counts(ascending = True, bins = 5)
'''
(64.084, 80.0] 11
(48.168, 64.084] 69
(0.339, 16.336] 100
(32.252, 48.168] 188
(16.336, 32.252] 346
'''
df['Age'].count()
#返回该列非缺失值(非NaN)的个数
对象操作
data = [10,11,12]
index = ['a','b','c']
s = pd.Series(data = data,index = index)
'''
a 10
b 11
c 12
'''
pandas的series类似表格中的一个列(column),类似一维数组,可以保存任何数据类型,由索引和列组成 pandas.Series(data,index,dtype,name,copy)
数值的修改:
s[0] #10
s[0:2]
'''
a 10
b 11
'''
mask = [True,False,True]
s[mask]
'''
a 10
c 12
'''
s.loc['b'] # 11
s.iloc[1] # 11
s1 = s.copy()
s1['a'] = 100
s1.replace(to_replace = 100, value = 101, inplace = False)
#inplace指定为False,则只会展示结果,但不会改变s1,当inplace改为True时才会修改s1的值
索引的修改:
s1.index
#Index(['a', 'b', 'c'], dtype='object')
s1.index = ['a','b','d']
#Index(['a', 'b', 'd'], dtype='object')
s1.rename(index = {'a':'A'},inplace=True)
#Index(['A', 'b', 'd'], dtype='object')
增加:
s2 = pd.Series([100,110],index = ['h','k'])
s1 = s1.append(s2)
s1['j'] = 500
s3 = s1.append(s2,ignore_index=False) #不会生成新索引
s3 = s1.append(s2,ignore_index=True) #自动生成新索引
print(s3.index)
#RangeIndex(start=0, stop=5, step=1)
删除:
del s1['A']
s1.drop(['b','c'],inplace = True)
data =[[1,2,3],[4,5,6]]
index=['a','b']
columns=['A','B','C']
df = pd.DataFrame(data = data, index = index, columns = columns)
查操作与Series类似,使用 loc 和 iloc
修改:
df.loc['a','A'] = 150 #改值(df['a']会按列名查找而报错,应使用loc按行、列标签定位)
df.index = ['f','g'] #改索引
增加:
df.loc['c'] = [1,2,3] #增加一行
#增加多行
data =[[1,2,3],[4,5,6]]
index=['j','h']
columns=['A','B','C']
df2 = pd.DataFrame(data = data, index = index, columns = columns)
df3 = pd.concat([df,df2])
#还可以指定轴
df3 = pd.concat([df,df2],axis = 1)
#增加列
df2['Tang'] = [10,11]
df4 = pd.DataFrame([[10,11],[12,13]],index=['j','h'],columns=['D','E']) #保证index相同才能新增多列
df5 = pd.concat([df2,df4],axis = 1)
删除:
df5.drop(['j'],axis=0,inplace = True) #删除单行
del df5['Tang'] #删除单列
df5.drop(['A','B','C'],axis = 1,inplace = True) # 删除多列
merge函数
left = pd.DataFrame({'key':['k0','k1','k2','k3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['k0','k1','k2','k3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
pd.merge(left,right,on='key') #on指定以什么键合并
'''
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
'''
合并时有重复的列,但并不是主键,则会生成_x,_y
left = pd.DataFrame({'key1':['k0','k1','k2','k3'],
'key2':['k0','k1','k2','k3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['k0','k1','k2','k3'],
'key2':['k0','k1','k2','k3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(pd.merge(left,right,on='key1'))
'''
key1 key2_x A B key2_y C D
0 k0 k0 A0 B0 k0 C0 D0
1 k1 k1 A1 B1 k1 C1 D1
2 k2 k2 A2 B2 k2 C2 D2
3 k3 k3 A3 B3 k3 C3 D3
'''
连接的不同类型
left = pd.DataFrame({'key1':['k0','k1','k2','k3'],
'key2':['k0','k1','k2','k3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['k0','k1','k2','k3'],
'key2':['k0','k1','k2','k4'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
pd.merge(left,right,on=['key1','key2'])
#此时只有0,1,2三列,默认为内连接,指定参数how决定什么连接方式,how的取值有inner(默认),outer,left,right
pd.merge(left,right,on=['key1','key2'],how = 'outer')
#指定外连接,此时每一列都会出现,但连接不到的地方会出现NaN
pd.merge(left,right,on=['key1','key2'],how = 'outer',indicator = True) #指定indicator参数后,每行最后一列会标明该行来源:both、left_only、right_only
显示设置
get:查询,set:设置
pd.get_option('display.max_rows')
pd.set_option('display.max_rows',6) #设置显示出来的行数为6行
pd.set_option('display.max_columns',6) #设置显示出来的列数为6列
pd.set_option('display.max_colwidth',100) #设置字符串的长度显示
pd.set_option('display.precision',20) #设置小数的精度
pivot数据透视表
example = pd.DataFrame({'Month':['Jan','Jan','Jan','Feb','Feb','Feb','Mar','Mar','Mar',],
'Category':['cola','tea','coffee','cola','tea','coffee','cola','tea','coffee'],
'Amount':[74,235,32,46,346,32,57,96,76]})
example_pivot = example.pivot(index = 'Category',columns= 'Month',values='Amount')
#这样使用要求index+columns可以生成唯一字段
print(example_pivot)
#index为透视表的行,columns为列 values是统计值
'''
Month Feb Jan Mar
Category
coffee 32 32 76
cola 46 74 57
tea 346 235 96
'''
print(example_pivot.sum(axis=1))
'''
Category
coffee 140
cola 177
tea 677
'''
print(example_pivot.sum(axis=0))
'''
Month
Feb 424
Jan 341
Mar 229
'''
数据透视表pivot在实际数据中的使用方式:
df = pd.read_csv('train.csv')
df_pivot = df.pivot_table(index = 'Sex',columns='Pclass',values='Fare')
print(df_pivot)
#默认求平均值,可以设置aggfunc参数,求max(),min(),count()等
'''
Pclass 1 2 3
Sex
female 106.125798 21.970121 16.118810
male 67.226127 19.741782 12.661633
'''
df_pivot = df.pivot_table(index = 'Sex',columns='Pclass',values='Fare',aggfunc='max')
print(df_pivot)
'''
Pclass 1 2 3
Sex
female 512.3292 65.0 69.55
male 512.3292 73.5 69.55
'''
#当数据透视表统计count时,还可以使用crosstab
df_crosstab = pd.crosstab(index=df['Sex'],columns=df['Pclass'])
df_pivot = df.pivot_table(index = 'Pclass',columns='Sex',values='Survived',aggfunc='mean')
print(df_pivot)
'''
Sex female male
Pclass
1 0.968085 0.368852
2 0.921053 0.157407
3 0.500000 0.135447
'''
df['Underaged'] = df['Age'] < 18
df_pivot = df.pivot_table(index = 'Underaged',columns='Sex',values='Survived',aggfunc='mean')
print(df_pivot)
'''
Sex female male
Underaged
False 0.752896 0.165703
True 0.690909 0.396552
'''
时间操作
import datetime
dt = datetime.datetime(year=2017,month=11,day=24,hour=10,minute=30)
print(dt) #2017-11-24 10:30:00
在pandas中的时间操作
ts = pd.Timestamp('2017-11-24') #时间戳
ts #Timestamp('2017-11-24 00:00:00')
ts.month # 11
ts.day #24
ts + pd.Timedelta('5 days') #Timestamp('2017-11-29 00:00:00')
pd.to_datetime('2017-11-24')
pd.to_datetime('24/11/2017')
#读入表,对表中时间进行转换
data = pd.read_csv('time.csv')
data['Time'] = pd.to_datetime(data['Time']) #把原本的Time列转换为时间格式
data = data.set_index('Time') #把Time设置为索引
data = pd.read_csv('time.csv',index_col=0,parse_dates=True) #也可以导入的时候一步设置好
'''
L06_347 LS06_347 LS06_348 month
Time
2009-01-01 00:00:00 0.137417 0.097500 0.016833 1
2009-01-01 03:00:00 0.131250 0.088833 0.016417 1
2009-01-01 06:00:00 0.113500 0.091250 0.016750 1
'''
#取数据
data[pd.Timestamp('2012-01-01 09:00'):pd.Timestamp('2012-01-01 19:00')] #不加Timestamp也可以取出来
#想直接取2013年的数据
data['2013']
#取2012年1月到3月
data['2012-01':'2012-03']
#另一种取法,取所有1月的数据(12年1月份、13年1月份)
data[data.index.month == 1]
data[(data.index.hour > 8) & (data.index.hour < 12)] #所有8-12点的数据。取时间段还可以用between_time,它包含两头的端点: data.between_time('08:00','12:00')
时间序列重采样
#计算每天的均值
data.resample('D').mean().head() #多天指定3D即可
'''
L06_347 LS06_347 LS06_348 month
Time
2009-01-01 0.125010 0.092281 0.016635 1
2009-01-02 0.124146 0.095781 0.016406 1
'''
data.resample('M').mean().plot() #可以直接画图
常用操作
按照指定顺序进行排序
data = pd.DataFrame({'group':['a','a','a','b','b','b','c','c','c',],
'data':[4,3,2,1,12,3,4,5,7]})
data.sort_values(by=['group','data'],ascending=[False,True],inplace=True)
#group降序,再data升序
print(data)
'''
data group
6 4 c
7 5 c
8 7 c
3 1 b
5 3 b
4 12 b
2 2 a
1 3 a
0 4 a
'''
删除重复值
data1 = pd.DataFrame({'k1':['one']*3+['two']*4,
'k2':[3,2,1,3,3,4,4]})
data1 = data1.drop_duplicates() #去掉重复的值
data1 = data1.drop_duplicates(subset='k1') #指定某一列去重
'''
k1 k2
0 one 3
3 two 3
'''
print(data1)
map映射
data = pd.DataFrame({'food':['A1','A2','B1','B2','B3','C1','C2'],
'data':[1,2,3,4,5,6,7]})
food_map = {'A1':'A','A2':'A','B1':'B','B2':'B','B3':'B','C1':'C','C2':'C'}
data['food_map'] = data['food'].map(food_map)
#map映射,map必须是字典
print(data)
'''
food data food_map
0 A1 1 A
1 A2 2 A
2 B1 3 B
3 B2 4 B
4 B3 5 B
5 C1 6 C
6 C2 7 C
'''
添加比率的另一种写法,以及去除某列
df = pd.DataFrame({'data1':np.random.randn(3),
'data2':np.random.randn(3)})
df = df.assign(ration = df['data1']/df['data2'])
print(df)
'''
data1 data2 ration
0 1.764052 2.240893 0.787209
1 0.400157 1.867558 0.214268
2 0.978738 -0.977278 -1.001494
'''
df.drop('ration',axis='columns',inplace=True)
print(df)
'''
data1 data2
0 1.764052 2.240893
1 0.400157 1.867558
2 0.978738 -0.977278
'''
age = [13,23,35,23,36,24,12,17,22,30]
bins = [10,20,30,40]
bins_res = pd.cut(age,bins)
print(bins_res)
#[(10, 20], (20, 30], (30, 40], (20, 30], (30, 40], (20, 30], (10, 20], (10, 20], (20, 30], (20, 30]]
#print(bins_res.labels)
print(pd.value_counts(bins_res))
'''
(20, 30] 5
(10, 20] 3
(30, 40] 2
'''
#还可以指定组名
group_name = ['10代','20代','30代']
print(pd.value_counts(pd.cut(age,bins,labels=group_name)))
'''
20代 5
10代 3
30代 2
'''
查看空值
df = pd.DataFrame([range(3),[0,np.nan,0],[0,0,np.nan],range(3)])
print(df.isnull())
'''
0 1 2
0 False False False
1 False True False
2 False False True
3 False False False
'''
print(df.isnull().any())#还可以指定axis=1
'''
0 False
1 True
2 True
'''
#找到缺失值所在位置
print(df[df.isnull().any(axis = 1)])
df = df.fillna(5) #用5来填充缺失值
print(df)
Original: https://blog.csdn.net/qq_45003520/article/details/123551945
Author: indigo女孩
Title: pandas基础_1
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/698261/
转载文章受原作者版权保护。转载请注明原作者出处!