Data Analysis Internship Code Summary [Advanced]: Python

import pandas as pd
import numpy as np
file_name0 =[r'信息明细表-2021-1.csv',
            r'信息明细表-2021-2.csv',
             r'信息明细表-2021-3.csv',
             ...,
             r'信息明细表-2021-50.csv'
            ]
# read every monthly file into a list of DataFrames, then combine them into one table
df = []
for i in range(len(file_name0)):
    df.append(pd.read_csv(file_name0[i]))
data = pd.concat(df)

# write the combined table out; utf_8_sig keeps the Chinese headers readable in Excel
data.to_csv('信息明细表-2021-1-50.csv', encoding='utf_8_sig', index=False)
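
With fifty files that follow one naming pattern, listing them by hand is easy to get wrong. A minimal alternative sketch, assuming all the files sit in the working directory and match 信息明细表-2021-*.csv:

import glob
import pandas as pd

# pick up every matching file instead of typing 50 paths
files = glob.glob('信息明细表-2021-*.csv')
data = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)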
# strip the '公司 ' prefix from the name column
data['name'] = data['name'].str.replace('公司 ', '')
# drop duplicates on name, keeping the first occurrence
data.drop_duplicates('name', keep='first', inplace=True)
# interval between going online and being repaired
dataset['repair_interval'] = pd.to_datetime(dataset['repair_time']) - pd.to_datetime(dataset['online_time'])

# convert the "xx days" timedelta into the plain number xx
dataset.repair_interval = dataset.repair_interval.map(lambda x: x/np.timedelta64(1,'D'))

# extract the month by taking the first 6 characters of the date string
dataset['activate_month'] = dataset['activate_time'].map(lambda x: x[:6])
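
If the columns are parsed as datetimes anyway, the .dt accessors cover both steps; a small sketch under that assumption (note that .dt.days truncates to whole days, while dividing by np.timedelta64(1, 'D') keeps the fraction):

# whole-day interval straight from the timedelta
dataset['repair_interval'] = (
    pd.to_datetime(dataset['repair_time']) - pd.to_datetime(dataset['online_time'])
).dt.days

# month as a period such as 2021-06, instead of slicing the string
dataset['activate_month'] = pd.to_datetime(dataset['activate_time']).dt.to_period('M')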

The nastiest dates I have run into looked like this:
0 14/八月/21 3:00 下午
1 29/七月/21 1:57 下午
2 29/三月/21 3:07 下午
3 05/七月/21 9:37 上午
4 16/六月/21 11:05 上午
Workaround (clumsy, but it works):

First drop the time of day and the 上午/下午 (AM/PM) marker:
data1['已更新'] = data1['已更新'].str.split(' ', expand=True)[0]
Then turn the Chinese month names into digits; this block could clearly be shortened (a tighter sketch follows the output below):
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('十二月','12'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('十一月','11'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('十月','10'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('九月','9'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('八月','8'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('七月','7'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('六月','6'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('五月','5'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('四月','4'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('三月','3'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('二月','2'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('一月','1'))

data1['已更新'] = pd.to_datetime(data1['已更新'], format='%d/%m/%y', errors='coerce')
data1['已更新']

output:
0 2021-08-14
1 2021-08-04
2 2021-08-04
3 2021-07-06
4 2021-07-06
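
As promised above, the twelve replace calls can be collapsed into a single mapping. A minimal sketch, assuming 已更新 still holds the raw strings such as '14/八月/21 3:00 下午'; the months are listed from 十二月 down to 一月, mirroring the original order (which also keeps things safe if the replacements are ever applied one at a time):

# Chinese month name -> month number, longest names first
cn_month = {
    '十二月': '12', '十一月': '11', '十月': '10',
    '九月': '9', '八月': '8', '七月': '7', '六月': '6',
    '五月': '5', '四月': '4', '三月': '3', '二月': '2', '一月': '1',
}

pattern = '|'.join(cn_month)                          # one regex for all month names
s = data1['已更新'].str.split(' ', expand=True)[0]      # drop the time-of-day part
s = s.str.replace(pattern, lambda m: cn_month[m.group(0)], regex=True)
data1['已更新'] = pd.to_datetime(s, format='%d/%m/%y', errors='coerce')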

#assign all the column names directly
ans.columns = ['WK41', 'WK42', 'WK43', 'WK44', 'WK45', 'WK46']
#rename specific columns
ans.rename(columns={'日期':'repair_time'}, inplace=True)
ans.rename(columns={'总价':'total_cost'}, inplace=True)
#overwrite the index directly
row_name = ['0~10','10~20','20~30','30~40','40~50','50~60','60~70','70~80','80~90','90~100']
ans.index = pd.Index(row_name)

5.1 Find the rows of table A that do not appear in table B

To keep only the newly added rows:

def anti_join(x, y, on):
    """
    :param x: left table
    :param y: right table
    :param on: join key(s); can be omitted if there is no special requirement
    :return: the rows of x that are not present in y
    """
    ans = pd.merge(left=x, right=y, how='left', indicator=True, on=on)
    ans = ans.loc[ans._merge == 'left_only', :].drop(columns='_merge')
    return ans
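
A quick usage sketch (the table and column names here are invented for illustration):

# rows that are in this week's export but not in last week's
new_rows = anti_join(df_this_week, df_last_week, on='device_name')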

5.2 Pick out the value at its first occurrence

def get_first_ele(x):
    # return the first non-null value in the row, or NaN if the row is all null
    if x.first_valid_index() is None:
        return np.nan
    else:
        return x[x.first_valid_index()]

# first calendar week in which each device shows up
df_online_week = df1.groupby(['device_name','weekofyear'])['weekofyear'].first().unstack().apply(get_first_ele, axis=1)
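
If weekofyear is numeric and increases through the year, the same "first week each device appears" result can probably be had in one line; this is a sketch, not the author's original code:

# earliest week number observed for each device
df_online_week = df1.groupby('device_name')['weekofyear'].min()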

Fuzzy-match df_1 against df_2, using df_2 as the reference:

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    s = df_2[key2].tolist()
    # for every value of df_1[key1], keep the top `limit` candidates from df_2[key2]
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))
    df_1['matches'] = m

    # keep the best candidate whose score clears the threshold, otherwise None
    m2 = df_1['matches'].apply(
        lambda x: [i[0] for i in x if i[1] >= threshold][0]
        if len([i[0] for i in x if i[1] >= threshold]) > 0 else None
    )
    df_1['final_matches'] = m2
    return df_1

df1 = fuzzy_merge(file1, refer, 'title', '名称', threshold=45)  # pick the threshold to suit your data
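
For reference, process.extract returns (candidate, score) tuples, which is what the lambda above filters; a tiny illustration with made-up strings:

from fuzzywuzzy import process

candidates = ['北京分公司', '上海分公司', '深圳分公司']
process.extract('北京公司', candidates, limit=2)
# -> a list of (match, score) pairs, e.g. [('北京分公司', 90), ('上海分公司', 45)]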

For rows where nothing cleared the threshold, fall back to the original value:

df1['final_matches'].fillna(df1['title'], inplace=True)

Aggregate the matched table with groupby, add the mean and the count with agg, and sort in descending order:

dg1 = df1.groupby('final_matches')['duration'].agg(['mean', 'count']).sort_values(by='mean', ascending=False)

Note: a whitespace-only string is not None, so it will not be caught by fillna / isna.
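
A small sketch of what that means in practice (the column name is hypothetical):

# whitespace-only cells look non-null to pandas, so turn them into NaN first
df1['title'] = df1['title'].replace(r'^\s*$', np.nan, regex=True)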

I feel the code below could be tidied up (a pd.cut version is sketched after the function); the output looks roughly like this:
OUTPUT:
99%~100% 42
98%~99% 64
97%~98% 60
96%~97% 36
95%~96% 37
94%~95% 13
90%~94% 40
80%~90% 18
0%~80% 5

row_name = ['99%~100%','95%~99%','90%~95%','80%~90%','0%~80%']
def alist(df2):
    # count how many rows fall into each TT bucket
    data1 = df2[(df2['TT'] > 0.99) & (df2['TT'] <= 1)]['tt'].count()
    data2 = df2[(df2['TT'] > 0.95) & (df2['TT'] <= 0.99)]['tt'].count()
    data3 = df2[(df2['TT'] > 0.90) & (df2['TT'] <= 0.95)]['tt'].count()
    data4 = df2[(df2['TT'] > 0.80) & (df2['TT'] <= 0.90)]['tt'].count()
    data5 = df2[(df2['TT'] > 0) & (df2['TT'] <= 0.80)]['tt'].count()
    # "dynamic variable names": pull data1..data5 back out of locals()
    list1 = []
    names = locals()
    for i in range(1, 6):
        list1.append(names.get('data' + str(i)))
    return pd.Series(list1, index=pd.Index(row_name))
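
As noted above, the same bucketing can be done without dynamic variable names. A sketch using pd.cut, assuming the same TT column and the five buckets from row_name:

bins = [0, 0.80, 0.90, 0.95, 0.99, 1.0]
labels = ['0%~80%', '80%~90%', '90%~95%', '95%~99%', '99%~100%']

# pd.cut drops each TT value into a right-closed (lower, upper] bucket,
# matching the comparisons in alist above
counts = pd.cut(df2['TT'], bins=bins, labels=labels).value_counts()
counts = counts.reindex(labels[::-1])   # order from 99%~100% down to 0%~80%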
# turn the numeric resolution into a string and glue it onto the title
data2['数量'] = data2['resolution'].apply(str)
data2['title_num'] = data2['title'] + "_" + data2['数量']

   name   title   resolution   数量   title_num
0  站     #超时    9            9     #超时_9
1  站     断开     1            1     断开_1

result2 = (
    data2.groupby(data2["name"])
      .agg(
          # new_column_name = (source column, aggregation function)
          title_temp=("title_num", lambda x: "， ".join(x)),
      )
      .reset_index()
)
result2

output:
   name   title_temp
0  *站    #超时_9, 断开_1, 手动_3…
1  *站    超时_2, 伸出_1

stack and unstack are also well worth using, as is the re module.
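
A tiny stack/unstack illustration on made-up data (not from the article):

wide = pd.DataFrame(
    {'WK41': [1, 2], 'WK42': [3, 4]},
    index=['device_a', 'device_b'],
)

long_form = wide.stack()        # wide -> long: one (device, week) value per row
back = long_form.unstack()      # long -> wide again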

Original: https://blog.csdn.net/weixin_44625028/article/details/121518943
Author: 巫巫9
Title: 数据分析实习代码总结【进阶】Python
