【星海随笔】pandas全总结

import pandas as pd

pandas总结

载入CSV文件,并展示

df = pd.read_csv('1ttdata.csv')
df

展示元数据

df.info()
type(df)
df.index

支持的读取方式

df.head()
df.tail()

df.values[0]

df.keys()

df['XXXkey']
df['Age'].head(10)[:5]

dg = df.set_index('Name')

将数据分类


train_data = data.sample(frac = 0.8)
test_data = data.drop(train_data.index)

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

np.random.shuffle()

通过一个volume或者index

dg = df.set_index('Name')
test1 = dg['age']
test1['Braund, Mr. Owen Harris']

test1.mean()

test1.min()

test1.max()

自定义DataFrame

data = {'country':['aaa','bbb','ccc'],
        'population':[10,12,14]}
df_data = pd.DataFrame(data)
df_data
df_date.describe()

显示图表的5行,并展示2到3列

df.iloc[:5,1:3]

设置Name列为索引

df = df.set_index('Name')
df.loc['Cumings, Mrs. John Bradley (Florence Briggs Thayer)']

df.loc['Cumings, Mrs. John Bradley (Florence Briggs Thayer)','Age']

根据索引拿出部分

df.loc['Cumings, Mrs. John Bradley (Florence Briggs Thayer)':'Allen, Mr. William Henry']

df.loc['Heikkinen, Miss. Laina','Age'] = 50

df['Age'] > 30

df.loc[df['Sex'] == 'male' ,'Age'].mean()

df.iloc[1]

t1.loc[1,'Age'] = 50

t1 = df.copy()

打印出名称和年龄

df = pd.DataFrame({'name':['A','B','C','A','A','B'],
                  'age':[8,7,9,6,5,4]})
for name in ['A','B','C']:
    print(name,df[df['name'] == name])

根据性别分类分别去求年龄的平均值

df.groupby('Sex')['Age'].mean()

自定义数组

df = pd.DataFrame([[1,2,3],[4,5,6]],index = ['a','b'],columns = ['A','B','C'])

df.sum()

df.sum( axis = 1 )

df.sum(axis = 'columns')

df['Age'].value_counts()

df.cov()

df.corr()

df['Age'].value_counts()

df['Age'].value_counts(ascending = True)

df['Age'].value_counts(ascending = True, bins=5)

df['Age'].count()

s1 = df.append(s2,ignore_index=True)

del s1['k']

s1.drop(891)

s1.drop(2,inplace = True)

df.iloc[0]

df.loc['a']

改操作

df.loc['a']['A'] = 150

进行拼接

df3 = pd.concat([s1,s2],axis=1)

增加列,也可以,axis=1 去拼接

df['test'] = [xxx,xxx,xxx,...]

del df['Age']

df3.drop('h',axis=0,inplace = True)

df3.drop(['Age','Cabin'],axis=1)

构造一个多维表

left = pd.DataFrame({'key':['KO','K1','K3'],
                   'A':['A1','A2','A3'],
                   'B':['B1','B2','B3']})
right = pd.DataFrame({'key':['KO','K1','K3'],
                   'C':['C1','C2','C3'],
                   'D':['D1','D2','D3']})

pd.merge(left,right)

pd.merge(left,right,on='key')

'''
key A   B   key2_x  C   D   key2_y
0   KO  A1  B1  K0  C1  D1  K0
1   K1  A2  B2  K2  C2  D2  K2
2   K3  A3  B3  K3  C3  D3  K3
'''
pd.merge(left,right,on=['key','key2'])

特殊情况

left = pd.DataFrame({'key':['KO','K1','K3'],
                   'A':['A1','A2','A3'],
                   'B':['B1','B2','B3'],
                    'key2':['K0','K2','K3']})
right = pd.DataFrame({'key':['KO','K1','K3'],
                   'C':['C1','C2','C3'],
                   'D':['D1','D2','D3'],
                     'key2':['K0','K2','K4']})

pd.merge(left,right,on=['key','key2'])

pd.merge(left,right,on=['key','key2'],how='outer')

'''
key A   B   key2    C   D
0   KO  A1  B1  K0  C1  D1
1   K1  A2  B2  K2  C2  D2
2   K3  A3  B3  K3  NaN NaN
3   K3  NaN NaN K4  C3  D3
'''
pd.merge(left,right,on=['key','key2'],how='outer',indicator = True)

pd.set_option('display.max_rows',100)

pd.Series(index = range(0,100) )

pd.set_option('display.max_columns',30)

a = pd.DataFrame(columns = range(0,40))

pd.get_option('display.max_colwidth')

pd.Series(index = ['A'],data=['t'* 60])

pd.get_option('display.precision')

pd.set_option('display.precision',10)

数据透视表

a.pivot(index = 'A',columns = 'C',values = 'D')

生成表格

df.pivot_table(index = 'Sex',columns='Pclass',values='Fare')

df.pivot_table(index = 'Sex',columns='Pclass',values='Fare',aggfunc='mean')

df.pivot_table(index = 'Sex',columns='Pclass',values='Fare',aggfunc='count')

pd.crosstab(index = df['Sex'],columns = df['Pclass'])

df['Underaged'] = df['Age']  18

df.pivot_table(index = 'Underaged',columns='Sex',values='Survived',aggfunc='sum')

df.pivot_table(index = 'Underaged',columns='Pclass',values='Fare',aggfunc='count')

df[df['Underaged']]

a = df[df['Underaged']]
a[a['Survived'] == 1]

pandas时间和datetime时间的区别

a = datetime.datetime(year=2020,month=12,day=24,hour=10,minute=30)
b = pd.Timestamp('2020-12-23-10:30')

a.day
b.day

b + pd.Timedelta('50 0000 0000')

b + pd.Timedelta('5 minute')

s = pd.Series(['2017-11-24 00:00:00','2017-11-25 00:00:00','2017-11-26 00:00:00'])
ts = pd.to_datetime(s)
'''
0   2017-11-24
1   2017-11-25
2   2017-11-26
dtype: datetime64[ns]
'''
ts.dt.day

a = list(ts.dt.day)

pandas时间的对时间表格的使用

pd.Series(pd.date_range(start='2022-5-12',periods = 10,freq = '24H'))

data = pd.read_csv('flowdata.csv')
data['Time'] = pd.to_datetime(data['Time'])

data = pd.read_csv('flowdata.csv',index_col = 0 ,parse_dates = True)

data[pd.Timestamp('2012-01-01 09:00'):pd.Timestamp('2012-01-01 19:00')]

data['2012-01':'2012-02']

data[data.index.month == 1]

data[(data.index.day == 1) & (data.index.month == 1)]

data.resample('D').mean()

data.resample('3D').max()

%matplotlib notebook

data.resample('M').mean().plot()

condas常用 的命令

进行排序

data.sort_values(by=['group','data'],ascending = [False,True])

data.sort_values(by='data')

data.drop_duplicates().count()

data.drop_duplicates(subset='group')


def food_map(series):
    if series['group'] == 'a':
        return 'A'
    elif series['group'] == 'b':
        return 'B'
data['food_map'] = data.apply(food_map,axis = 'columns')
data

data.apply(print,axis = 'columns')

对数据快速转换

Upper_dict = {
    '通州':'河北',}
    '廊坊':'河北',
    '朝阳':'北京'
}

data['new_flat'] = data['flat'].map(Upper_dict)

df.assign(ration = df['data1']/df['data2'])

df.drop('ration',axis='columns',inplace=True)

ages = [15,18,20,21,22,34,41,52,53,63,79]
bins = [10,20,30,40,50,90]
bins_res = pd.cut(ages,bins)

bins_res
'''
outs:
[(10, 20], (10, 20], (10, 20], (20, 30], (20, 30], ..., (40, 50], (50, 90], (50, 90], (50, 90], (50, 90]]
Length: 11
Categories (5, interval[int64]): [(10, 20] < (20, 30] < (30, 40] < (40, 50] < (50, 90]]
'''
bins_res.codes

'''
outs:
array([0, 0, 0, 1, 1, 2, 3, 4, 4, 4, 4], dtype=int8)
'''
pd.cut(ages,[10,30,50,80]).codes

'''
array([0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2], dtype=int8)
'''

group_names = ['youth','old']
a = pd.cut(ages,[10,20,80],labels=group_names)
pd.value_counts(a)

'''
old      8
youth    3
dtype: int64
'''

condas 缺失值的 应用

df = pd.DataFrame([range(3),[0, np.nan ,0], [0,0,np.nan],range(3) ])

df.isnull()

df.isnull().any()

df.isnull().any(axis = 1)

df[df.isnull().any(axis=1)]

0 1 2
0 0 1.0 2.0
1 0 NaN 0.0
2 0 0.0 NaN
3 0 1.0 2.0

groupby

df = pd.DataFrame({'A':['foo','foo','foo','bar',
                       'foo','bar','foo','bar'],
                  'B':['one','two','one','two',
                      'one','two','one','one'],
                  'C':np.random.randn(8),
                 'D':np.random.randn(8)})
df.groupby(['A','B']).count()

s = pd.Series([1,2,3,4,3,2],[8,7,6,6,4,5])
grouped = s.groupby(level = 0,sort = False )
grouped.count()
grouped.sum()
grouped.last()
grouped.first()

修改索引的名称

df.groupby(['X']).get_group('A')

index = pd.MultiIndex.from_arrays(arrays,name = ['first','second'])
s = pd.Series(np.random.randn(8),index = index )

s.groupby(level = 'first')

进行细化计算

groupd = df.groupby(['A','B'])
groupd.aggregate[np.sum]

df.groupby(['A','B']).sum().reset_index()

grouped.describe().head()

grouped = df.groupby('A')
grouped['C'].agg([np.sum,np.mean,np.std])

Original: https://blog.csdn.net/weixin_41997073/article/details/124695366
Author: 活跃的煤矿打工人
Title: 【星海随笔】pandas全总结

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/754941/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球