电子产品销售分析练习项目

读取数据

导入所需科学计算库

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
忽略小警告
import warnings
warnings.filterwarnings('ignore')

读取数据

df = pd.read_csv('D:\数据分析相关笔记等\数据集\电子产品销售分析数据集\电子产品销售分析.csv')
df.head()

Unnamed: 0event_timeorder_idproduct_idcategory_idcategory_codebrandpriceuser_idagesexlocal002020-04-24 11:50:39 UTC229435993205453698615159662235090899062.268105e+18electronics.tabletsamsung162.011.515916e+1824.0女海南112020-04-24 11:50:39 UTC229435993205453698615159662235090899062.268105e+18electronics.tabletsamsung162.011.515916e+1824.0女海南222020-04-24 14:37:43 UTC229444402405808622022739483190571836582.268105e+18electronics.audio.headphonehuawei77.521.515916e+1838.0女北京332020-04-24 14:37:43 UTC229444402405808622022739483190571836582.268105e+18electronics.audio.headphonehuawei77.521.515916e+1838.0女北京442020-04-24 19:16:21 UTC229458426315407423622739483168174244392.268105e+18NaNkarcher217.571.515916e+1832.0女广东

df['event_time'].unique()
array(['2020-04-24 11:50:39 UTC', '2020-04-24 14:37:43 UTC',
       '2020-04-24 19:16:21 UTC', ..., '2020-11-21 10:10:01 UTC',
       '2020-11-21 10:10:13 UTC', '2020-11-21 10:10:30 UTC'], dtype=object)
df['age'].unique()
array([24., 38., 32., 20., 21., 16., 49., 43., 37., 27., 48., 47., 28.,
       41., 22., 18., 44., 50., 46., 25., 34., 45., 17., 30., 36., 35.,
       39., 19., 40., 33., 31., 42., 23., 29., 26.])

清洗数据

提取数据时,处理与业务流程不符合数据,售价为负

df[df['price'] < 0]

处理空值

df.info()
RangeIndex: 564169 entries, 0 to 564168
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype
 0   Unnamed: 0     419890 non-null  int64
 1   event_time     419890 non-null  datetime64[ns, UTC]
 2   order_id       419890 non-null  int64
 3   product_id     419890 non-null  int64
 4   category_id    419890 non-null  int64
 5   category_code  419890 non-null  object
 6   brand          419890 non-null  object
 7   price          419890 non-null  float64
 8   user_id        419890 non-null  int64
 9   age            419890 non-null  int64
 10  sex            419890 non-null  object
 11  local          419890 non-null  object
 12  month          419890 non-null  int64
 13  quater         419890 non-null  period[Q-DEC]
dtypes: datetime64[ns, UTC](1), float64(1), int64(7), object(4), period[Q-DEC](1)
memory usage: 48.1+ MB

处理年龄组

bins = [16, 20, 30, 40, 60]
labels = ['16-19', '20-29', '30-39', '40-60']
df['agegroup'] = pd.cut(df['age'], bins, right = False, labels = labels)

查看数据

df.describe()

Unnamed: 0order_idproduct_idcategory_idpriceuser_idagemonthcount4.198900e+054.198900e+054.198900e+054.198900e+05419890.0000004.198900e+05419890.000000419890.000000mean1.799809e+062.370509e+181.676753e+182.274301e+18254.2816191.515916e+1833.1440697.754607std7.529273e+052.014144e+163.171234e+172.438523e+16321.1676132.372070e+0710.1198002.448404min0.000000e+002.294360e+181.515966e+182.268105e+180.0000001.515916e+1816.0000001.00000025%1.669817e+062.354505e+181.515966e+182.268105e+1839.3300001.515916e+1824.0000007.00000050%1.854132e+062.375487e+181.515966e+182.268105e+18138.8700001.515916e+1833.0000008.00000075%2.463694e+062.388441e+181.515966e+182.268105e+18347.2000001.515916e+1842.00000010.000000max2.633520e+062.388441e+182.388434e+182.374499e+1811574.0500001.515916e+1850.00000011.000000

price的均值在254.28元,中位数在138.87元,可能存在极值影响 年龄的平均值、中位数均在33岁,说明该数据服从正态分布或近似正态分布

数据分析

查看全年销售额的增长率

销售额数据分组汇总
sales_month = df.groupby(['month'])['price'].sum()
计算2-11月销售增长率
sales_rate_2 = sales_month[2] / sales_month[1] - 1
sales_rate_3 = sales_month[3] / sales_month[2] - 1
sales_rate_4 = sales_month[4] / sales_month[3] - 1
sales_rate_5 = sales_month[5] / sales_month[4] - 1
sales_rate_6 = sales_month[6] / sales_month[5] - 1
sales_rate_7 = sales_month[7] / sales_month[6] - 1
sales_rate_8 = sales_month[8] / sales_month[7] - 1
sales_rate_9 = sales_month[9] / sales_month[8] - 1
sales_rate_10 = sales_month[10] / sales_month[9] - 1
sales_rate_11 = sales_month[11] / sales_month[10] - 1
设置2-11月的增长率标签
sales_rate_2_label = '%.2f%%' % (sales_rate_2 * 100)
sales_rate_3_label = '%.2f%%' % (sales_rate_3 * 100)
sales_rate_4_label = '%.2f%%' % (sales_rate_4 * 100)
sales_rate_5_label = '%.2f%%' % (sales_rate_5 * 100)
sales_rate_6_label = '%.2f%%' % (sales_rate_6 * 100)
sales_rate_7_label = '%.2f%%' % (sales_rate_7 * 100)
sales_rate_8_label = '%.2f%%' % (sales_rate_8 * 100)
sales_rate_9_label = '%.2f%%' % (sales_rate_9 * 100)
sales_rate_10_label = '%.2f%%' % (sales_rate_10 * 100)
sales_rate_11_label = '%.2f%%' % (sales_rate_11 * 100)
将1-11月的销售额以及增长率转换为DataFrame
sales = pd.DataFrame({
    'sales_month':sales_month,
    'sales_rate':[0, sales_rate_2, sales_rate_3, sales_rate_4, sales_rate_5, sales_rate_6,
                  sales_rate_7, sales_rate_8, sales_rate_9, sales_rate_10, sales_rate_11],
    'sales_rate_label':['0.00%', sales_rate_2_label, sales_rate_3_label, sales_rate_4_label,
                        sales_rate_5_label, sales_rate_6_label, sales_rate_7_label,
                        sales_rate_8_label, sales_rate_9_label, sales_rate_10_label, sales_rate_11_label]
})
绘制图形
from matplotlib import ticker
设置中文字符
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
设置背景样式
plt.style.use('ggplot')
x轴数据
x = ['{}月'.format(values) for values in sales.index.tolist()]
y轴数据
y1 = sales['sales_month']
y2 = sales['sales_rate']
创建画布
fig = plt.figure(figsize = (20, 8), dpi = 80)
ax1 = fig.add_subplot(111)
ax2 = ax1.twinx()
绘制图形
ax1.bar(x, y1, color = 'pink', label = '销售额')
ax2.plot(x, y2, color = 'black',marker = '*', label = '增长率')
ax1.set_xticks(x)
ax1.set_ylim(0, 30000000)
ax2.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1, decimals=1))
ax1.set_xlabel('月份')
ax1.set_ylabel('销售额')
ax2.set_ylabel('增长率')
ax1.set_title("销售额与增长率")
plt.legend()
plt.show()

电子产品销售分析练习项目

各省销售情况

各省销售占比

按省分类
sales_area = df.groupby('local')['price'].sum()
绘图
sales_area.plot(figsize=(20,8),kind = 'pie',autopct='%1.1f%%',title='2020年总销售额占比')
# 准备数据
labels = sales_area.index
size = sales_area.values
# 设置画布
plt.figure(figsize = (20, 8), dpi = 80)
# 饼图属性
plt.pie(size, labels = labels, autopct = '%.2f%%')
# 长宽更改
plt.axis('equal')
plt.title('2020年总销售额占比', fontsize='x-large')
plt.legend()
plt.show()

电子产品销售分析练习项目
各地区每一年的销售额
month_sales_area = df.groupby(by=['local','month'])['price'].sum()
将分组后的多层索引转换为列数据
month_sales_area = month_sales_area.reset_index(level=[0, 1])
使用数据透视表重新整理数据
month_sales_area = pd.pivot_table(month_sales_area,
                                  index='local',
                                  columns='month',
                                  values='price')
绘制图形
month_sales_area.plot(figsize=(20, 8), kind='bar', title='2020年不同省份销售额对比')
plt.show()

电子产品销售分析练习项目

各省8月份的销售额均大于其他月份,说明在8月份消费者更容易消费

前四个月的销售额很低迷,商家应针对不同月份退出不同优惠活动

各省份不同类别产品销售比

各省份不同类型产品的销售额
category_sales_area = df.groupby(by = ['local','category_code'])['price'].sum().reset_index().sort_values('price',ascending = False).head(50)
category_sales_area = category_sales_area.groupby(by=['local','category_code'])['price'].sum()
将分组后的多层索引设置为列数据
category_sales_area = category_sales_area.reset_index()
数据透视表整理数据
category_sales_area = pd.pivot_table(category_sales_area,
                                    index='local',
                                    columns='category_code',
                                    values='price')
绘制图形
category_sales_area.plot(figsize=(20, 8), kind ='bar', title='不同类别产品T50在各省份销售对比')
plt.show()

电子产品销售分析练习项目

Original: https://blog.csdn.net/yhlauvchuchu/article/details/126049577
Author: yhlauvchuchu
Title: 电子产品销售分析练习项目

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/743033/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球