pandas数据处理

pandas数据处理

对数据进行分类统计排序

算法设计逻辑

  1. 从全部数据中选取电影分类数据Gerne
  2. 将gerne数据处理成电影类型名称集合
  3. 构造values全为0的 电影数量 x 电影类型的 DataFrame
  4. 将各符合电影种类的电影类型对应位置Values设为1
  5. 各类电影所在列值相加,得到各类电影的总数
  6. 画图
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

file_path = './IMDB-Movie-Data.csv'
df = pd.read_csv(file_path)
print(df.head(1))
print(df.info())

print(df['Genre'])
#统计分类的列表
temp_list = df['Genre'].str.split(',').tolist()
print('temp_list')
print(temp_list)
gerne_list = list( set([i for j in temp_list for i in j ]))

print('gerne_list')
print(gerne_list)

zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(gerne_list))),columns = gerne_list)
print('zeros_df')
print(zeros_df)

for i in range(df.shape[0]):
    zeros_df.loc[i,temp_list[i]] = 1

print(zeros_df.head(4))
gerne_count = zeros_df.sum(axis=0)
print('gerne_count')
print(gerne_count)

gerne_count = gerne_count.sort_values(ascending=True)
print('gerne_count')
print(gerne_count)
_x = gerne_count.index
_y = gerne_count.values

plt.figure(figsize=(20,8),dpi=80)
plt.bar(range(len(_x)),_y)
plt.xticks(range(len(_x)),_x)
plt.show()

数据的合并

使用join,merge合并数据
合并数据时,join()中没有的数据用NAN代替

import  numpy as np
import pandas as pd

list1 = np.array([[1,2,3],[4,4,5],[6,346,63],[235,564,5]])
print(list1)
print(list1.shape[0])
print(list1.shape[1])

df1 = pd.DataFrame(np.ones((2,4)),index = ['A','B'],columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3,3)),index =['A','B','C'],columns=list('xyz'))
df3 = pd.DataFrame(np.zeros((3,3)),columns = list('fax'))

print('df1')
print(df1)
print('df2')
print(df2)
print('df3')
print(df3)
t1 = df1.join(df2)
t2 = df2.join(df1)
t3 = df1.merge(df3,on='a')
print(t1)
print(t2)
print(t3)

使用merge合并数据时,数据将按照指定的方式进行合并

df4.loc['A','c']= 'a'
df4.loc['B','c']= 'c'
df4.loc['C','c']= 'c'
print('df4')
print(df4)

df5.loc['A','x']='c'
df5.loc['B','x']='d'
print('df5')
print(df5)

t1 = df4.merge(df5,left_on='c',right_on='x',how='inner')
print('t1')
print(t1)

pandas数据处理
how=’inner’: 取交集(默认连接方式)
pandas数据处理
how=’outer’:取并集
how=’left’:按照左侧数据连接
how=’right’:按照右侧数据连接

分组和聚合

import pandas as pd
import numpy as np

file_path = './starbucks_store_worldwide.csv'
df = pd.read_csv(file_path)

print(df.head(1))
print(df.info())

grouped = df.groupby(by='Country')
print('grouped')
print(grouped)

for i,j in grouped:
    print(i)
    print('-'*100)
    print(j)
    print("*"*100)

us_data = df[ df['Country'] =='US']
# print(us_data)
#
country_count = grouped['Brand'].count()
print(country_count['US'])
print(country_count['CN'])

统计中国每个省份的店铺的数量
china_data = df[df['Country'] == 'CN']
grouped_china = china_data.groupby(by = 'State/Province').count()['Brand']
#
print(grouped_china)

数据按照多个条件进行分组
multi_grouped = df['Brand'].groupby(by = [df['Country'],df['State/Province']]).count()
print(multi_grouped,type(multi_grouped))
print("*"*100)

multi_grouped1 = df[['Brand']].groupby(by = [df['Country'],df['State/Province']]).count()
multi_grouped2 = df.groupby(by = [df['Country'],df['State/Province']])[['Brand']].count()
multi_grouped3 = df.groupby(by = [df['Country'],df['State/Province']]).count()[['Brand']]
print(multi_grouped1,type(multi_grouped1))
print("*"*100)
print(multi_grouped2,type(multi_grouped2))
print("*"*100)
print(multi_grouped3,type(multi_grouped3))
print("*"*100)

索引的方法和属性
print(multi_grouped3.index)

关于groupby用法详解,找到了一篇不错的文章: link.

索引和复合索引

import pandas as pd
import numpy as np

df1 = pd.DataFrame([[100,1,1,1],[1,1,1,1]],index=['A','B'],columns=list('abcd'))
print('--------------1---------------')
print(df1)

df1.index = ['X','Y']
print('--------------2----------------')
print(df1)
print('--------------3----------------')
print(df1.reindex(list('Xdef'))) #reindex不会改变df1的索引
print('---------------4----------------')
print(df1.set_index(['c','d'],drop=False))#set_index不会改变df1的值

a = pd.DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','one','two','two','two'],'d':list('hjklmno')})
print('---------------5----------------')
print(a)

a1是Dataframe
a1 = a.set_index(['c','d'])
print('----------a1-----------')
print(a1)

a2 = a1['a']
print('--------------a2-----------')
print(a2)

a3 = a2['one']
a4 = a2['two']['m']
print('-----------a3----------')
print(a3)
print('-----------a4----------')
print(a4)

c是series
c = a.set_index(['c','d'])['a']
## cc是DataFrame
cc = a.set_index(['c','d'])[['a']]
print('--------------c------------')
print(c)
print('--------------cc-----------')
print(cc)
#series取索引,直接取
print(c['one','k'])
#Dataframe取索引,使用标签
print('Dataframe取值')
print(a1.loc['one'].loc['j'])

a1交换里外层索引
c1 = a1.swaplevel()
print('c1')
print(c1)

分组,聚合及索引的应用

Task1 将店铺中数量排名前25的省份及店铺数量画图表示

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager

my_font = font_manager.FontProperties(fname="C:\Windows\Fonts\STXINGKA.TTF")
file_path = './starbucks_store_worldwide.csv'
df = pd.read_csv(file_path)

data1 = df.groupby(by='Country').count()['Brand'].sort_values(ascending=False)[:10]
print(data1)
data2 = df[df['Country'] == "CN"].groupby(by='City').count()['Brand'].sort_values(ascending=False)[:25]
print(data2)

_x1 = data1.index
_y1 = data1.values

_x2 = data2.index
_y2 = data2.values

plt.figure(figsize=(18,15),dpi=80)
plt.bar(range(len(_x1)),_y1)
plt.xticks(range(len(_x1)),_x1)
plt.show()

plt.barh(range(len(_x2)),_y2,height=0.3,color='orange')
plt.yticks(range(len(_x2)),_x2,fontproperties=my_font)
plt.show()

Task2 不同年份书籍数量以及不同年份书籍的平均评分情况

import pandas as pd
from matplotlib import pyplot as plt

file_path = './books.csv'
df = pd.read_csv(file_path)

print(df.head(1))
print(df.info())

Task1-------------------------
data1 = df[pd.notnull(df['original_publication_year'])] #处理缺失数据:删掉缺失
booknumber_peryear = data1.groupby(by='original_publication_year').count()['id']
print('每年书籍')
print(booknumber_peryear)
#-----------------------------------

data2 = df[pd.notnull(df['average_rating'])] #删掉缺失数据
average_rating = data2['average_rating'].groupby(by=data2['original_publication_year']).mean()
print('average_rating')
print(average_rating)

_x = average_rating.index
_y = average_rating.values

plt.figure(figsize=(20,8),dpi=80)
plt.plot(range(len(_x)),_y)
plt.xticks(list(range(len(_x)))[::10],_x[::10].astype(int),rotation=45)

plt.show()

Original: https://blog.csdn.net/Lyfyh1314/article/details/121407863
Author: Lyfyh1314
Title: pandas数据处理

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/675810/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球