Pandas再来一次
文章目录
配合机器学习食用更佳。
一、Series和DataFrame
- series的创建(列表、数组、字典)、values/index、索引(默认、指定) 取一个值 取多个值、遍历
- dataframe的创建、values/index/columns、df.sort_index(axis=0) 行排序和列排序、df.sort_values(by='列名') # 根据值排序,可以指定行还是列
- df.describe()/df.T
import pandas as pd
import numpy as np
s1 = pd.Series([1,2,3,-5])
print(s1)
0 1
1 2
2 3
3 -5
dtype: int64
s1.values
array([ 1, 2, 3, -5])
s1.index
RangeIndex(start=0, stop=4, step=1)
for i in s1.index:
print(i)
0
1
2
3
s1[0]
1
s2 = pd.Series([1,2,3,4],index=['a','b','c','d'])
print(s2)
a 1
b 2
c 3
d 4
dtype: int64
s2[0]
1
s2['a']
1
s2[['a','b','c','d']]
a 1
b 2
c 3
d 4
dtype: int64
s2[[0,1,2]]
a 1
b 2
c 3
dtype: int64
for i in s1:
print(i)
1
2
3
-5
for i in s2:
print(i)
1
2
3
4
'a' in s2
True
dic1 = {'apple':5,'pen':3}
s3 = pd.Series(dic1)
print(s3)
apple 5
pen 3
dtype: int64
data = {'year':[2014,2015,2016,2017],
'income':[1000,2000,3000,4000],
'pay':[5000,20000,30000,40000]}
df1 = pd.DataFrame(data)
df1
   year  income    pay
0  2014    1000   5000
1  2015    2000  20000
2  2016    3000  30000
3  2017    4000  40000
df2 = pd.DataFrame(np.arange(12).reshape(3,-1))
print(df2)
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
df3 = pd.DataFrame(np.arange(12).reshape(3,4),index=['a','b','c'],columns=['A','B','C','D'])
print(df3)
A B C D
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
df1.index
RangeIndex(start=0, stop=4, step=1)
df1.columns
Index(['year', 'income', 'pay'], dtype='object')
df1.values
array([[ 2014, 1000, 5000],
[ 2015, 2000, 20000],
[ 2016, 3000, 30000],
[ 2017, 4000, 40000]])
df1.describe()
              year       income           pay
count     4.000000     4.000000      4.000000
mean   2015.500000  2500.000000  23750.000000
std       1.290994  1290.994449  14930.394056
min    2014.000000  1000.000000   5000.000000
25%    2014.750000  1750.000000  16250.000000
50%    2015.500000  2500.000000  25000.000000
75%    2016.250000  3250.000000  32500.000000
max    2017.000000  4000.000000  40000.000000
print(df1)
year income pay
0 2014 1000 5000
1 2015 2000 20000
2 2016 3000 30000
3 2017 4000 40000
print(df1.T)
0 1 2 3
year 2014 2015 2016 2017
income 1000 2000 3000 4000
pay 5000 20000 30000 40000
df1.sort_index(axis=1)
   income    pay  year
0    1000   5000  2014
1    2000  20000  2015
2    3000  30000  2016
3    4000  40000  2017
df1.sort_index(axis=0)
   year  income    pay
0  2014    1000   5000
1  2015    2000  20000
2  2016    3000  30000
3  2017    4000  40000
df3.sort_values(by='a',axis=1)
   A  B   C   D
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11
df3
   A  B   C   D
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11
二、选择数据
- 取行、取列、取多行、取多列、取元素、取True元素、切片
- df[] # 只能取列 可以取多列
- df.loc[] # 按照标签取
- df.iloc[] # 按照位置取
- df[df>6] # 取True元素
dates = pd.date_range('20170101',periods=6)
df1 = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
print(df1)
A B C D
2017-01-01 0 1 2 3
2017-01-02 4 5 6 7
2017-01-03 8 9 10 11
2017-01-04 12 13 14 15
2017-01-05 16 17 18 19
2017-01-06 20 21 22 23
df1['A']
2017-01-01 0
2017-01-02 4
2017-01-03 8
2017-01-04 12
2017-01-05 16
2017-01-06 20
Freq: D, Name: A, dtype: int64
df1[0:2]
            A  B  C  D
2017-01-01  0  1  2  3
2017-01-02  4  5  6  7
df1['20170102':'20170104']
             A   B   C   D
2017-01-02   4   5   6   7
2017-01-03   8   9  10  11
2017-01-04  12  13  14  15
df1.loc['20170102']
A 4
B 5
C 6
D 7
Name: 2017-01-02 00:00:00, dtype: int64
df1.loc['20170101','A']
0
df1.loc[:,'A']
2017-01-01 0
2017-01-02 4
2017-01-03 8
2017-01-04 12
2017-01-05 16
2017-01-06 20
Freq: D, Name: A, dtype: int64
df1.loc[:,['A','B']]
             A   B
2017-01-01   0   1
2017-01-02   4   5
2017-01-03   8   9
2017-01-04  12  13
2017-01-05  16  17
2017-01-06  20  21
df1.loc[::2,['A','B']]
             A   B
2017-01-01   0   1
2017-01-03   8   9
2017-01-05  16  17
df1.iloc[2]
A 8
B 9
C 10
D 11
Name: 2017-01-03 00:00:00, dtype: int64
df1.iloc[[1,2]]
            A  B   C   D
2017-01-02  4  5   6   7
2017-01-03  8  9  10  11
df1.iloc[1:3,2:4]
             C   D
2017-01-02   6   7
2017-01-03  10  11
df1 > 6
                A      B      C      D
2017-01-01  False  False  False  False
2017-01-02  False  False  False   True
2017-01-03   True   True   True   True
2017-01-04   True   True   True   True
2017-01-05   True   True   True   True
2017-01-06   True   True   True   True
df1.A > 6
2017-01-01 False
2017-01-02 False
2017-01-03 True
2017-01-04 True
2017-01-05 True
2017-01-06 True
Freq: D, Name: A, dtype: bool
df1[df1>6]
               A     B     C     D
2017-01-01   NaN   NaN   NaN   NaN
2017-01-02   NaN   NaN   NaN   7.0
2017-01-03   8.0   9.0  10.0  11.0
2017-01-04  12.0  13.0  14.0  15.0
2017-01-05  16.0  17.0  18.0  19.0
2017-01-06  20.0  21.0  22.0  23.0
三、赋值及操作(增、删、改)
- 增:在尾部增加一行、一列、在指定位置插入一列(insert) 添加一行只能先拆开后合并(注:下文使用的 DataFrame.append 已在 pandas 2.0 中移除,新版本请改用 pd.concat)
- 删:del pop
- 改:找到相应位置改即可
df1
             A   B   C   D
2017-01-01   0   1   2   3
2017-01-02   4   5   6   7
2017-01-03   8   9  10  11
2017-01-04  12  13  14  15
2017-01-05  16  17  18  19
2017-01-06  20  21  22  23
df1.loc['20170101','A'] = 10000
df1
                A   B   C   D
2017-01-01  10000   1   2   3
2017-01-02      4   5   6   7
2017-01-03      8   9  10  11
2017-01-04     12  13  14  15
2017-01-05     16  17  18  19
2017-01-06     20  21  22  23
df1[df1>10] = -1
df1
            A  B   C   D
2017-01-01 -1  1   2   3
2017-01-02  4  5   6   7
2017-01-03  8  9  10  -1
2017-01-04 -1 -1  -1  -1
2017-01-05 -1 -1  -1  -1
2017-01-06 -1 -1  -1  -1
df1['E'] = 10
df1
            A  B   C   D   E
2017-01-01 -1  1   2   3  10
2017-01-02  4  5   6   7  10
2017-01-03  8  9  10  -1  10
2017-01-04 -1 -1  -1  -1  10
2017-01-05 -1 -1  -1  -1  10
2017-01-06 -1 -1  -1  -1  10
df1['F'] = pd.Series([1,2,3,4,5,6],index=dates)
df1
            A  B   C   D   E  F
2017-01-01 -1  1   2   3  10  1
2017-01-02  4  5   6   7  10  2
2017-01-03  8  9  10  -1  10  3
2017-01-04 -1 -1  -1  -1  10  4
2017-01-05 -1 -1  -1  -1  10  5
2017-01-06 -1 -1  -1  -1  10  6
s1 = pd.Series([1,2,3,4],index=['A','B','C','D'],)
df2 = df1.append(s1,ignore_index=True)
df2
     A    B     C    D     E    F
0 -1.0  1.0   2.0  3.0  10.0  1.0
1  4.0  5.0   6.0  7.0  10.0  2.0
2  8.0  9.0  10.0 -1.0  10.0  3.0
3 -1.0 -1.0  -1.0 -1.0  10.0  4.0
4 -1.0 -1.0  -1.0 -1.0  10.0  5.0
5 -1.0 -1.0  -1.0 -1.0  10.0  6.0
6  1.0  2.0   3.0  4.0   NaN  NaN
df2.loc[7,['A','B','C']] = [1,2,3]
df2
     A    B     C    D     E    F
0 -1.0  1.0   2.0  3.0  10.0  1.0
1  4.0  5.0   6.0  7.0  10.0  2.0
2  8.0  9.0  10.0 -1.0  10.0  3.0
3 -1.0 -1.0  -1.0 -1.0  10.0  4.0
4 -1.0 -1.0  -1.0 -1.0  10.0  5.0
5 -1.0 -1.0  -1.0 -1.0  10.0  6.0
6  1.0  2.0   3.0  4.0   NaN  NaN
7  1.0  2.0   3.0  NaN   NaN  NaN
s2 = pd.Series([1,2,4,5,6],index=['A','B','C','D','E'])
s2.name = 'wang'
df2.append(s2)
         A    B     C    D     E    F
0     -1.0  1.0   2.0  3.0  10.0  1.0
1      4.0  5.0   6.0  7.0  10.0  2.0
2      8.0  9.0  10.0 -1.0  10.0  3.0
3     -1.0 -1.0  -1.0 -1.0  10.0  4.0
4     -1.0 -1.0  -1.0 -1.0  10.0  5.0
5     -1.0 -1.0  -1.0 -1.0  10.0  6.0
6      1.0  2.0   3.0  4.0   NaN  NaN
7      1.0  2.0   3.0  NaN   NaN  NaN
wang   1.0  2.0   4.0  5.0   6.0  NaN
df1
            A  B   C   D   E  F
2017-01-01 -1  1   2   3  10  1
2017-01-02  4  5   6   7  10  2
2017-01-03  8  9  10  -1  10  3
2017-01-04 -1 -1  -1  -1  10  4
2017-01-05 -1 -1  -1  -1  10  5
2017-01-06 -1 -1  -1  -1  10  6
df1.insert(1,'Z',df2['E'])
df1
            A   Z   G  B   C   D   E  F
2017-01-01 -1 NaN NaN  1   2   3  10  1
2017-01-02  4 NaN NaN  5   6   7  10  2
2017-01-03  8 NaN NaN  9  10  -1  10  3
2017-01-04 -1 NaN NaN -1  -1  -1  10  4
2017-01-05 -1 NaN NaN -1  -1  -1  10  5
2017-01-06 -1 NaN NaN -1  -1  -1  10  6
df2['E']
0 10.0
1 10.0
2 10.0
3 10.0
4 10.0
5 10.0
6 NaN
7 NaN
Name: E, dtype: float64
g = df1.pop('G')
g
2017-01-01 NaN
2017-01-02 NaN
2017-01-03 NaN
2017-01-04 NaN
2017-01-05 NaN
2017-01-06 NaN
Freq: D, Name: G, dtype: float64
del df1['Z']
df1
            A  B   C   D   E  F
2017-01-01 -1  1   2   3  10  1
2017-01-02  4  5   6   7  10  2
2017-01-03  8  9  10  -1  10  3
2017-01-04 -1 -1  -1  -1  10  4
2017-01-05 -1 -1  -1  -1  10  5
2017-01-06 -1 -1  -1  -1  10  6
df2 = df1.drop(['A','B'],axis=1)
df2
             C   D   E  F
2017-01-01   2   3  10  1
2017-01-02   6   7  10  2
2017-01-03  10  -1  10  3
2017-01-04  -1  -1  10  4
2017-01-05  -1  -1  10  5
2017-01-06  -1  -1  10  6
df1
            A  B   C   D   E  F
2017-01-01 -1  1   2   3  10  1
2017-01-02  4  5   6   7  10  2
2017-01-03  8  9  10  -1  10  3
2017-01-04 -1 -1  -1  -1  10  4
2017-01-05 -1 -1  -1  -1  10  5
2017-01-06 -1 -1  -1  -1  10  6
四、处理丢失数据
- dropna
- fillna
- isnull
- np.any
- np.all
dates = np.arange(20170101,20170105)
df1 = pd.DataFrame(np.arange(12).reshape(4,3),index=dates,columns=['A','B','C'])
df1
          A   B   C
20170101  0   1   2
20170102  3   4   5
20170103  6   7   8
20170104  9  10  11
df2 = pd.DataFrame(df1,index=dates,columns=['A','B','C','D','E'])
df2
          A   B   C    D    E
20170101  0   1   2  NaN  NaN
20170102  3   4   5  NaN  NaN
20170103  6   7   8  NaN  NaN
20170104  9  10  11  NaN  NaN
s1 = pd.Series([3,4,6],index=dates[:3])
s2 = pd.Series([32,5,2],index=dates[1:])
df2['D'] = s1
df2['E'] = s2
df2
          A   B   C    D     E
20170101  0   1   2  3.0   NaN
20170102  3   4   5  4.0  32.0
20170103  6   7   8  6.0   5.0
20170104  9  10  11  NaN   2.0
df2.dropna(axis=0,how='any')
          A  B  C    D     E
20170102  3  4  5  4.0  32.0
20170103  6  7  8  6.0   5.0
df2.dropna(axis=0,how='all')
          A   B   C    D     E
20170101  0   1   2  3.0   NaN
20170102  3   4   5  4.0  32.0
20170103  6   7   8  6.0   5.0
20170104  9  10  11  NaN   2.0
df2.dropna(axis=1,how='any')
          A   B   C
20170101  0   1   2
20170102  3   4   5
20170103  6   7   8
20170104  9  10  11
df2.dropna(axis=1,how='all')
          A   B   C    D     E
20170101  0   1   2  3.0   NaN
20170102  3   4   5  4.0  32.0
20170103  6   7   8  6.0   5.0
20170104  9  10  11  NaN   2.0
df2.fillna(value=0)
          A   B   C    D     E
20170101  0   1   2  3.0   0.0
20170102  3   4   5  4.0  32.0
20170103  6   7   8  6.0   5.0
20170104  9  10  11  0.0   2.0
df2.isnull()
              A      B      C      D      E
20170101  False  False  False  False   True
20170102  False  False  False  False  False
20170103  False  False  False  False  False
20170104  False  False  False   True  False
np.any(df2.isnull())
True
np.all(df2.isnull())
False
五、读取并写入文件
- read
- to
file = pd.read_csv('../people.csv',encoding='gbk')
file
   地点  名字      职位       工资  在职情况
0  北京  小红  渠道合作经理  15000  在职
1  北京  小明  行政专员      8000   离职
2  北京  小白  行政专员      9000   在职
3  上海  小绿  商户经理     12000   在职
4  上海  小黄  商户经理     10000   离职
5  上海  小黑  团队长       12000   在职
6  广州  小紫  渠道合作主管  20000  在职
7  广州  小粉  渠道合作主管  20000  在职
8  广州  小青  经理         10000   离职
9  广州  小蓝  团队长       13000   在职
file.loc[1,'地点'] = '志哥'
file
   地点  名字      职位       工资  在职情况
0  北京  小红  渠道合作经理  15000  在职
1  志哥  小明  行政专员      8000   离职
2  北京  小白  行政专员      9000   在职
3  上海  小绿  商户经理     12000   在职
4  上海  小黄  商户经理     10000   离职
5  上海  小黑  团队长       12000   在职
6  广州  小紫  渠道合作主管  20000  在职
7  广州  小粉  渠道合作主管  20000  在职
8  广州  小青  经理         10000   离职
9  广州  小蓝  团队长       13000   在职
file.to_csv('peo2.csv')
六、concat拼接
- 0行1列 outer:默认,缺少部分填空 inner:缺少部分去掉 ignore_index=True不考虑原来的index
df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['a','b','c','d'])
df2 = pd.DataFrame(np.arange(12,24).reshape((3,4)),columns=['a','b','c','d'])
df3 = pd.DataFrame(np.arange(24,36).reshape((3,4)),columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
a b c d
0 12 13 14 15
1 16 17 18 19
2 20 21 22 23
a b c d
0 24 25 26 27
1 28 29 30 31
2 32 33 34 35
df4 = pd.concat([df1,df2,df3],axis=0)
df4
    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
0  12  13  14  15
1  16  17  18  19
2  20  21  22  23
0  24  25  26  27
1  28  29  30  31
2  32  33  34  35
df4 = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
df4
    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
5  20  21  22  23
6  24  25  26  27
7  28  29  30  31
8  32  33  34  35
df5 = pd.concat([df1,df2,df3],axis=1)
df5
   a  b   c   d   a   b   c   d   a   b   c   d
0  0  1   2   3  12  13  14  15  24  25  26  27
1  4  5   6   7  16  17  18  19  28  29  30  31
2  8  9  10  11  20  21  22  23  32  33  34  35
df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['a','b','c','f'])
df2 = pd.DataFrame(np.arange(12,24).reshape((3,4)),columns=['a','c','d','e'])
print(df1)
print(df2)
a b c f
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
a c d e
0 12 13 14 15
1 16 17 18 19
2 20 21 22 23
df6 = pd.concat([df1,df2],join='outer',ignore_index=True)
df6
    a    b   c     f     d     e
0   0  1.0   2   3.0   NaN   NaN
1   4  5.0   6   7.0   NaN   NaN
2   8  9.0  10  11.0   NaN   NaN
3  12  NaN  13   NaN  14.0  15.0
4  16  NaN  17   NaN  18.0  19.0
5  20  NaN  21   NaN  22.0  23.0
df7 = pd.concat([df1,df2],join='inner',ignore_index=True)
df7
    a   c
0   0   2
1   4   6
2   8  10
3  12  13
4  16  17
5  20  21
df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['a','b','c','f'])
df2 = pd.DataFrame(np.arange(12,24).reshape((4,3)),columns=['a','c','d'])
print(df1)
print(df2)
a b c f
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
a c d
0 12 13 14
1 15 16 17
2 18 19 20
3 21 22 23
df8 = pd.concat([df1,df2],axis=0)
df8
    a    b   c     f     d
0   0  1.0   2   3.0   NaN
1   4  5.0   6   7.0   NaN
2   8  9.0  10  11.0   NaN
0  12  NaN  13   NaN  14.0
1  15  NaN  16   NaN  17.0
2  18  NaN  19   NaN  20.0
3  21  NaN  22   NaN  23.0
df8 = pd.concat([df1,df2],axis=1)
df8
     a    b     c     f   a   c   d
0  0.0  1.0   2.0   3.0  12  13  14
1  4.0  5.0   6.0   7.0  15  16  17
2  8.0  9.0  10.0  11.0  18  19  20
3  NaN  NaN   NaN   NaN  21  22  23
七、merge合并
- outer、inner(默认)、left、right
- suffixes 为columns名增加后缀
- left_index right_index 左index索引和右index索引拼接
- indicator 显示merge信息
left = pd.DataFrame({'key':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
print(right)
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
res = pd.merge(left,right,on='key')
res
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3
pd.concat([left,right],axis=1)
  key   A   B key   C   D
0  K0  A0  B0  K0  C0  D0
1  K1  A1  B1  K1  C1  D1
2  K2  A2  B2  K2  C2  D2
3  K3  A3  B3  K3  C3  D3
left = pd.DataFrame({'key1':['K0','K0','K1','K2'],
'key2':['K0','K1','K0','K1'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['K0','K1','K1','K3'],
'key2':['K0','K0','K0','K0'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
print(right)
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K3 K0 C3 D3
res = pd.merge(left,right,on=['key1','key2'],how='outer')
res
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K3   K0  NaN  NaN   C3   D3
res = pd.merge(left,right,on=['key1','key2'],how='inner')
res
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
res = pd.merge(left,right,on=['key1','key2'],how='left')
res
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN
res = pd.merge(left,right,on=['key1','key2'],how='outer',indicator=True)
res
  key1 key2    A    B    C    D      _merge
0   K0   K0   A0   B0   C0   D0        both
1   K0   K1   A1   B1  NaN  NaN   left_only
2   K1   K0   A2   B2   C1   D1        both
3   K1   K0   A2   B2   C2   D2        both
4   K2   K1   A3   B3  NaN  NaN   left_only
5   K3   K0  NaN  NaN   C3   D3  right_only
res = pd.merge(left,right,on=['key1','key2'],how='outer',indicator='indicator_column')
res
  key1 key2    A    B    C    D indicator_column
0   K0   K0   A0   B0   C0   D0             both
1   K0   K1   A1   B1  NaN  NaN        left_only
2   K1   K0   A2   B2   C1   D1             both
3   K1   K0   A2   B2   C2   D2             both
4   K2   K1   A3   B3  NaN  NaN        left_only
5   K3   K0  NaN  NaN   C3   D3       right_only
left = pd.DataFrame({'A':['A0','A1','A2'],
'B':['B0','B1','B2']},
index = ['K0','K1','K2'])
right = pd.DataFrame({'C':['C0','C2','C3'],
'D':['D0','D2','D3']},
index=['K0','K2','K3'])
print(left)
print(right)
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
res = pd.merge(left,right,left_index=True,right_index=True,how='outer')
res
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
boys = pd.DataFrame({'k':['K0','K1','K2'],'age':[1,2,3]})
girls = pd.DataFrame({'k':['K0','K0','K3'],'age':[4,5,6]})
print(boys)
print(girls)
k age
0 K0 1
1 K1 2
2 K2 3
k age
0 K0 4
1 K0 5
2 K3 6
res = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='outer')
res
    k  age_boy  age_girl
0  K0      1.0       4.0
1  K0      1.0       5.0
2  K1      2.0       NaN
3  K2      3.0       NaN
4  K3      NaN       6.0
八、pandas的plot
- 画二维数据 index和value
import matplotlib.pyplot as plt
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()
print(data.head())
0 1.034952
1 1.375026
2 1.193759
3 1.519312
4 1.499621
dtype: float64
data = pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=['A','B','C','D'])
data = data.cumsum()
print(data.head())
A B C D
0 0.026638 -0.482280 -1.087122 0.081180
1 -0.855294 0.472860 -1.712153 -0.122494
2 -2.542649 0.781326 -1.618599 -1.055795
3 -2.540139 1.202146 0.936892 -2.537980
4 -1.896418 0.601826 1.585485 -2.012536
data.plot()
plt.show()
ax = data.plot.scatter(x='A',y='B',color='Blue',label='class 1')
data.plot.scatter(x='A',y='C',color='Green',label='class 2',ax=ax)
plt.show()
Original: https://blog.csdn.net/qq_49821869/article/details/122399570
Author: aJupyter
Title: Python进阶—Pandas
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/675766/
转载文章受原作者版权保护。转载请注明原作者出处!