pandas数据处理大全(必备)

目录

文章目录

pandas读取文件

import pandas as pd

'''读取txt文件为dataframe'''
df1 = pd.read_csv("./read.txt")

'''读取csv文件为dataframe'''
df2 = pd.read_csv('./read.csv')

df3 = pd.read_csv('./read.csv',header=None)
df3.columns=['列1','列2','列3','列4']

df4=pd.read_csv('read.csv',usecols=[0,1,2,3])

'''读取excel文件为dataframe'''
df5=pd.read_excel('read.xlsx')

df6=pd.read_excel('read.xlsx',sheet_name='sheet1')

df7=pd.read_excel('read.xlsx',sheet_name=[0,1])
data1=df7.values

df8=pd.read_excel('read.xlsx')
data2=df8.ix[0].values
data3=df8.ix[[1,2]].values

df9=pd.read_excel('read.xlsx')
data4=df9['data'].values

df10=pd.read_excel('read.xlsx')
data5=df9.ix[1,2]
data6=df10.ix[[1,2],['列1','列2']].values

df11=pd.read_excel('read.xlsx')
data7=df11.ix[:,['列1','列2']].values

df12=pd.read_excel('read.xlsx')
print("输出行号列表:\n",df12.index.values)

df13=pd.read_excel('read.xlsx')
print("输出列标题:\n",df13.columns.values)

df14=pd.read_excel('read.xlsx')
print("三行:\n",df14.sample(3).values)

df15=pd.read_excel('read.xlsx')
data15=[]
for i in df15.index.values:

    row_data=df15.ix[i,['列1','列2','列3','列4']].to_dict()
    data15.append(row_data)
print("最终获取到的数据是:\n",data15)

pandas存储文件

import pandas as pd
import os

file_path='./save.excel'

if os.path.exists(file_path):
    os.remove(file_path)

'''写入txt文件, header=0不保存列名,index=0不保存行名'''
pd_data = pd.DataFrame(dataframe_data,columns=('列名1','列名2','列名3'))
pd_data.to_csv('./save.txt')

'''写入csv文件, header=0不保存列名,index=0不保存行名'''
pd_data = pd.DataFrame(dataframe_data,columns=('列名1','列名2','列名3'))
pd_data.to_csv('./save.csv',header=0,index=0)

'''写入excel文件, header=0不保存列名,index=0不保存行名'''
pd_data = pd.DataFrame(dataframe_data,columns=('列名1','列名2','列名3'))
pd_data.to_excel(file_path,sheet_name='sheet_1')

pd_data1 = pd.DataFrame(dataframe_data1,columns=('列名1','列名2','列名3'))
pd_data2 = pd.DataFrame(dataframe_data2,columns=('列名1','列名2'))
writer = pd.ExcelWriter(file_path)
pd_data1.to_excel(excel_writer=writer, sheet_name='sheet_1',startcol=1,startrow=1)
pd_data2.to_excel(excel_writer=writer, sheet_name='sheet_2',startcol=1,startrow=1)
writer.save()
writer.close()

pandas处理空值和缺失值

import pandas as pd
import numpy as np
df = pd.DataFrame({"name1": ['AAAAAA', None, 'CCCCCCC'],
                   "name2": [np.nan, '', 'DDDDDDD'],
                   "name3": [pd.NaT, pd.Timestamp("1999-01-01"),pd.NaT]})

df1=df[(df['name1'].notna()) & (df['name1']!='')]

df2=df.dropna()

df3=df.dropna(axis=1)

df4=df.dropna(how='all')

df5=df.dropna(thresh=3)

df6=df.dropna(subset=['列1', '列2'])

df7=df.fillna(axis=1,method='ffill')

df8=df.fillna(axis=0,method='ffill')

df9=df.fillna(999)

df10=df.fillna(value={'列1':1,'列2':2,'列3':3})

df11=df.isna()
df12=df.isnull()

pandas创建空dataframe

result =pd.DataFrame(columns=('列名1','列名1','列名3'))

dataframe索引值的修改

import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(12).reshape((3,4)),index = ['one', 'two', 'three'],columns=('列名1','列名2','列名3','列名4'))

df=df.reindex(index=['a','b','c'],columns=['a','b','c','d'])

dataframe选择行与列

import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(12).reshape((3,4)),index = ['one', 'two', 'three'],columns=('列名1','列名2','列名3','列名4'))

print("列名1 列为:\n",df["列名1"])
print("列名1=0.5 列为:\n",df.loc[df['列名1'] == 0.5])

print("one 行为:\n",df.loc['one'])
print("列名1=0.5 列为:\n",df.loc[df['列名1'].isin(['one', 'two'])])

print("列名1,列名2 列为:\n",df[["列名1","列名2"]])

print("one,two 行为:\n",df.loc[['one','two']])

print("根据索引值为:\n",df.iloc[[0,1]])

print("布尔型索引为:\n",df[df>0.5])
print("布尔型索引为:\n",df[df != 0.5])
print("布尔型索引为:\n",df[df[["列名1", "列名2"]] > 0.5])
print("布尔型索引为:\n",df.loc[df['列名1'] != 0.5])
print("布尔型索引为:\n",df[(df['列名1']==0.5)& (df['列名2']==0.4)])

dataframe转置

import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(25).reshape((5,5)))
print(df)

print(df.T)

dataframe添加数据

import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(25).reshape((5,5)))

df['f'] = 10

df.loc[5]=5

dataframe修改数据

import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(25).reshape((5,5)),columns=['q','w','e','r','t'])

df['q'] = 0

df.loc[5]=5

df[['q','w']] = 1

df.loc[[0,1]] = 2

dataframe删除数据

import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(25).reshape((5,5)),columns=['q','w','e','r','t'])

del df['q']

df.drop('w', axis=1,inplace=True)

df.drop(0,inplace=True)

dataframe计算

import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.random.rand(25).reshape((5,5)),columns=['q','w','e','r','t'])
df2 = pd.DataFrame(np.random.rand(9).reshape((3,3)),columns=['q','w','e'])

print("和为:\n",df1+df2)
print("差为:\n",df1-df2)
print("乘为:\n",df1*df2)
print("商为:\n",df1/df2)

dataframe排序

import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(25).reshape((5,5)),columns=['q','w','e','r','t'])

df.sort_values('q', ascending=False,inplace=True)

df.sort_values(['q','w'], ascending=False,inplace=True)
print(df)

df.sort_values(2, axis=1, ascending=False,inplace=True)
print(df)

df.sort_index(ascending=False,inplace=True)
print(df)

df.sort_index(axis=1,ascending=False,inplace=True)
print(df)

dataframe删除重复的行数据

import pandas as pd
import numpy as np
df= pd.DataFrame(np.random.rand(25).reshape((5,5)),columns=['q','w','e','r','t'])

df.drop_duplicates()

df.drop_duplicates(['q','w'])

dataframe合并

import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.random.rand(25).reshape((5,5)),index = ['one', 'two', 'three','four','five'],columns=('列名1','列名2','列名3','列名4','列名5'))
df2 = pd.DataFrame(np.random.rand(16).reshape((4,4)),index = ['one', 'two', 'three','four'],columns=('列名1','列名2','列名3','列名4'))
df3 = pd.DataFrame(np.random.rand(9).reshape((3,3)),index = ['one', 'two', 'three'],columns=('列名1','列名2','列名3'))

res1 = pd.concat([df1, df2], axis=0, ignore_index=True,join='outer')

res2 = pd.concat([df1, df2], axis=1, ignore_index=True,join='outer')

res3 = pd.concat([df1, df2], axis=0,join='inner')

res4 = pd.concat([df1, df2], axis=1,join='inner')

res5 = df1.append(df2, ignore_index=True)

res6 = df1.append([df2, df3], ignore_index=True)

dataframe按照关键字分类

import pandas as pd
import numpy as np
df = pd.DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})

res1=df.groupby('key1').mean()
res2=df.groupby('key1').apply(np.mean)
group=df.groupby('key1')
res3=group['data1'].agg('mean')

res4=df.groupby(['key1','key2']).mean()

dataframe透视表

import pandas as pd
import numpy as np
df = pd.DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})

res1=pd.pivot_table(df, index='key1', columns='key2')
res2=df.pivot_table(['data1'], index='key1',columns='key2')

dataframe替换,更改列的数据类型

import pandas as pd
import numpy as np
df = pd.DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})

df1 = df['data1'].replace(0.5, 0)
df2 = df['data1'].replace([0.1,0.2,0.3,0.4,0.5], 0)

if df['实发工资']>0.5:
    df['实发工资']=df['应发工资']+20000

df=pd.DataFrame(df,dtype='float')

Original: https://blog.csdn.net/sinat_39616953/article/details/120058558
Author: 鼠小米
Title: pandas数据处理大全(必备)

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/740726/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球