目录
文章目录
- 目录
* - pandas读取文件
- pandas存储文件
- pandas处理空值和缺失值
- pandas创建空dataframe
- dataframe索引值的修改
- dataframe选择行与列
- dataframe转置
- dataframe添加数据
- dataframe修改数据
- dataframe删除数据
- dataframe计算
- dataframe排序
- dataframe删除重复的行数据
- dataframe合并
- dataframe按照关键字分类
- dataframe透视表
- dataframe替换,更改列的数据类型
pandas读取文件
import pandas as pd
'''读取txt文件为dataframe'''
df1 = pd.read_csv("./read.txt")
'''读取csv文件为dataframe'''
df2 = pd.read_csv('./read.csv')
df3 = pd.read_csv('./read.csv',header=None)
df3.columns=['列1','列2','列3','列4']
df4=pd.read_csv('read.csv',usecols=[0,1,2,3])
'''读取excel文件为dataframe'''
df5=pd.read_excel('read.xlsx')
df6=pd.read_excel('read.xlsx',sheet_name='sheet1')
df7=pd.read_excel('read.xlsx',sheet_name=[0,1])
data1=df7.values
df8=pd.read_excel('read.xlsx')
data2=df8.ix[0].values
data3=df8.ix[[1,2]].values
df9=pd.read_excel('read.xlsx')
data4=df9['data'].values
df10=pd.read_excel('read.xlsx')
data5=df9.ix[1,2]
data6=df10.ix[[1,2],['列1','列2']].values
df11=pd.read_excel('read.xlsx')
data7=df11.ix[:,['列1','列2']].values
df12=pd.read_excel('read.xlsx')
print("输出行号列表:\n",df12.index.values)
df13=pd.read_excel('read.xlsx')
print("输出列标题:\n",df13.columns.values)
df14=pd.read_excel('read.xlsx')
print("三行:\n",df14.sample(3).values)
df15=pd.read_excel('read.xlsx')
data15=[]
for i in df15.index.values:
row_data=df15.ix[i,['列1','列2','列3','列4']].to_dict()
data15.append(row_data)
print("最终获取到的数据是:\n",data15)
pandas存储文件
import pandas as pd
import os
file_path='./save.excel'
if os.path.exists(file_path):
os.remove(file_path)
'''写入txt文件, header=0不保存列名,index=0不保存行名'''
pd_data = pd.DataFrame(dataframe_data,columns=('列名1','列名2','列名3'))
pd_data.to_csv('./save.txt')
'''写入csv文件, header=0不保存列名,index=0不保存行名'''
pd_data = pd.DataFrame(dataframe_data,columns=('列名1','列名2','列名3'))
pd_data.to_csv('./save.csv',header=0,index=0)
'''写入excel文件, header=0不保存列名,index=0不保存行名'''
pd_data = pd.DataFrame(dataframe_data,columns=('列名1','列名2','列名3'))
pd_data.to_excel(file_path,sheet_name='sheet_1')
pd_data1 = pd.DataFrame(dataframe_data1,columns=('列名1','列名2','列名3'))
pd_data2 = pd.DataFrame(dataframe_data2,columns=('列名1','列名2'))
writer = pd.ExcelWriter(file_path)
pd_data1.to_excel(excel_writer=writer, sheet_name='sheet_1',startcol=1,startrow=1)
pd_data2.to_excel(excel_writer=writer, sheet_name='sheet_2',startcol=1,startrow=1)
writer.save()
writer.close()
pandas处理空值和缺失值
import pandas as pd
import numpy as np
df = pd.DataFrame({"name1": ['AAAAAA', None, 'CCCCCCC'],
"name2": [np.nan, '', 'DDDDDDD'],
"name3": [pd.NaT, pd.Timestamp("1999-01-01"),pd.NaT]})
df1=df[(df['name1'].notna()) & (df['name1']!='')]
df2=df.dropna()
df3=df.dropna(axis=1)
df4=df.dropna(how='all')
df5=df.dropna(thresh=3)
df6=df.dropna(subset=['列1', '列2'])
df7=df.fillna(axis=1,method='ffill')
df8=df.fillna(axis=0,method='ffill')
df9=df.fillna(999)
df10=df.fillna(value={'列1':1,'列2':2,'列3':3})
df11=df.isna()
df12=df.isnull()
pandas创建空dataframe
result =pd.DataFrame(columns=('列名1','列名1','列名3'))
dataframe索引值的修改
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(12).reshape((3,4)),index = ['one', 'two', 'three'],columns=('列名1','列名2','列名3','列名4'))
df=df.reindex(index=['a','b','c'],columns=['a','b','c','d'])
dataframe选择行与列
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(12).reshape((3,4)),index = ['one', 'two', 'three'],columns=('列名1','列名2','列名3','列名4'))
print("列名1 列为:\n",df["列名1"])
print("列名1=0.5 列为:\n",df.loc[df['列名1'] == 0.5])
print("one 行为:\n",df.loc['one'])
print("列名1=0.5 列为:\n",df.loc[df['列名1'].isin(['one', 'two'])])
print("列名1,列名2 列为:\n",df[["列名1","列名2"]])
print("one,two 行为:\n",df.loc[['one','two']])
print("根据索引值为:\n",df.iloc[[0,1]])
print("布尔型索引为:\n",df[df>0.5])
print("布尔型索引为:\n",df[df != 0.5])
print("布尔型索引为:\n",df[df[["列名1", "列名2"]] > 0.5])
print("布尔型索引为:\n",df.loc[df['列名1'] != 0.5])
print("布尔型索引为:\n",df[(df['列名1']==0.5)& (df['列名2']==0.4)])
dataframe转置
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(25).reshape((5,5)))
print(df)
print(df.T)
dataframe添加数据
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(25).reshape((5,5)))
df['f'] = 10
df.loc[5]=5
dataframe修改数据
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(25).reshape((5,5)),columns=['q','w','e','r','t'])
df['q'] = 0
df.loc[5]=5
df[['q','w']] = 1
df.loc[[0,1]] = 2
dataframe删除数据
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(25).reshape((5,5)),columns=['q','w','e','r','t'])
del df['q']
df.drop('w', axis=1,inplace=True)
df.drop(0,inplace=True)
dataframe计算
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.random.rand(25).reshape((5,5)),columns=['q','w','e','r','t'])
df2 = pd.DataFrame(np.random.rand(9).reshape((3,3)),columns=['q','w','e'])
print("和为:\n",df1+df2)
print("差为:\n",df1-df2)
print("乘为:\n",df1*df2)
print("商为:\n",df1/df2)
dataframe排序
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.rand(25).reshape((5,5)),columns=['q','w','e','r','t'])
df.sort_values('q', ascending=False,inplace=True)
df.sort_values(['q','w'], ascending=False,inplace=True)
print(df)
df.sort_values(2, axis=1, ascending=False,inplace=True)
print(df)
df.sort_index(ascending=False,inplace=True)
print(df)
df.sort_index(axis=1,ascending=False,inplace=True)
print(df)
dataframe删除重复的行数据
import pandas as pd
import numpy as np
df= pd.DataFrame(np.random.rand(25).reshape((5,5)),columns=['q','w','e','r','t'])
df.drop_duplicates()
df.drop_duplicates(['q','w'])
dataframe合并
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.random.rand(25).reshape((5,5)),index = ['one', 'two', 'three','four','five'],columns=('列名1','列名2','列名3','列名4','列名5'))
df2 = pd.DataFrame(np.random.rand(16).reshape((4,4)),index = ['one', 'two', 'three','four'],columns=('列名1','列名2','列名3','列名4'))
df3 = pd.DataFrame(np.random.rand(9).reshape((3,3)),index = ['one', 'two', 'three'],columns=('列名1','列名2','列名3'))
res1 = pd.concat([df1, df2], axis=0, ignore_index=True,join='outer')
res2 = pd.concat([df1, df2], axis=1, ignore_index=True,join='outer')
res3 = pd.concat([df1, df2], axis=0,join='inner')
res4 = pd.concat([df1, df2], axis=1,join='inner')
res5 = df1.append(df2, ignore_index=True)
res6 = df1.append([df2, df3], ignore_index=True)
dataframe按照关键字分类
import pandas as pd
import numpy as np
df = pd.DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
res1=df.groupby('key1').mean()
res2=df.groupby('key1').apply(np.mean)
group=df.groupby('key1')
res3=group['data1'].agg('mean')
res4=df.groupby(['key1','key2']).mean()
dataframe透视表
import pandas as pd
import numpy as np
df = pd.DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
res1=pd.pivot_table(df, index='key1', columns='key2')
res2=df.pivot_table(['data1'], index='key1',columns='key2')
dataframe替换,更改列的数据类型
import pandas as pd
import numpy as np
df = pd.DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
df1 = df['data1'].replace(0.5, 0)
df2 = df['data1'].replace([0.1,0.2,0.3,0.4,0.5], 0)
if df['实发工资']>0.5:
df['实发工资']=df['应发工资']+20000
df=pd.DataFrame(df,dtype='float')
Original: https://blog.csdn.net/sinat_39616953/article/details/120058558
Author: 鼠小米
Title: pandas数据处理大全(必备)
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/740726/
转载文章受原作者版权保护。转载请注明原作者出处!