import pandas as pd
import numpy as np
file_name0 =[r'信息明细表-2021-1.csv',
r'信息明细表-2021-2.csv',
r'信息明细表-2021-3.csv',
...,
r'信息明细表-2021-50.csv'
]
df = []
for i in range(len(file_name0)):
df.append(pd.read_csv(file_name0[i]))
data = pd.concat(df)
data.to_csv('信息明细表-2021-1-50.csv',encoding='utf_8_sig', index=None)
df.name = df.name.str.replace('公司 ', '')
#去重保留第一个
df.drop_duplicates('name', keep='first', inplace=True)
dataset['repair_interval'] = pd.DataFrame(pd.to_datetime(dataset['repair_time'])-pd.to_datetime(dataset['online_time']))
将xx days转为 xx数值
dataset.repair_interval = dataset.repair_interval.map(lambda x: x/np.timedelta64(1,'D'))
取出月份
dataset['activate_month'] = dataset['activate_time'].map(lambda x: x[:6])
遇到过一次最恶心的日期长这样:
0 14/八月/21 3:00 下午
1 29/七月/21 1:57 下午
2 29/三月/21 3:07 下午
3 05/七月/21 9:37 上午
4 16/六月/21 11:05 上午
解决方法:(笨但有用)
去掉 几点和上下午
data1['已更新'] = data1['已更新'].str.split(' ',expand=True)[0]
月份改成数字, 此处应该可以简化代码
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('十二月','12'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('十一月','11'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('十月','10'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('九月','9'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('八月','8'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('七月','7'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('六月','6'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('五月','5'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('四月','4'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('三月','3'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('二月','2'))
data1['已更新'] = data1['已更新'].map(lambda x: str(x).replace('一月','1'))
data1['已更新'] =pd.to_datetime(data1['已更新'], format='%d/%m/%y', errors='coerce')
data1['已更新']
output:
0 2021-08-14
1 2021-08-04
2 2021-08-04
3 2021-07-06
4 2021-07-06
…
#直接指定修改
ans.columns = ['WK41', 'WK42','WK43', 'WK44', 'WK45', 'WK46']
#修改指定类名
ans.rename(columns={'日期':'repair_time'}, inplace=True)
rans.rename(columns={'总价':'total_cost'}, inplace=True)
#直接修改index
row_name = ['0~10','10~20','20~30','30~40','40~50','50~60','60~70','70~80','80~90','90~100']
ans.index = pd.core.indexes.base.Index(row_name)
5.1找出表A中不含B的那一部分
若只取新增的
def anti_join(x, y, on):
"""
:param x:
:param y:
:param on:如没有特殊需求,可以不要这个参数
:return: 返回x中不包含y的部分
"""
ans = pd.merge(left=x, right=y, how='left', indicator=True, on=on)
ans = ans.loc[ans._merge == 'left_only', :].drop(columns='_merge')
return ans
5.2 找出第一次/个出现时的数
def get_first_ele(x):
if x.first_valid_index() is None:
return np.nan
else:
return x[x.first_valid_index()]
df_online_week = df1.groupby(['device_name','weekofyear'])['weekofyear'].first().unstack().apply(get_first_ele, axis=1)
以df_2为参考,对df_1进行模糊匹配
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
s = df_2[key2].tolist()
m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))
df_1['matches'] = m
m2 = df_1['matches'].apply(lambda x: [i[0] for i in x if i[1]>=threshold][0] if len([i[0] for i in x if i[1] >=threshold]) > 0 else None)
df_1['final_matches'] = m2
return df_1
df1 =fuzzy_merge(file1,refer, 'title', '名称', threshold=45 )
#threshold 自行选择
对于未匹配成功的,则用原来的代入:
df1['final_matches'].fillna(df1['title'],inplace=True)
对匹配后的表汇总(groupby),并加上平均值、计数(agg),降序排列
dg1 = df1.groupby('final_matches')['uration'].agg(['mean', 'count']).sort_values(by='mean', ascending=False)
注意:空格 != None
感觉代码可以优化,
类似输出
OUTPUT:
99%~100% 42
98%~99% 64
97%~98% 60
96%~97% 36
95%~96% 37
94%~95% 13
90%~94% 40
80%~90% 18
0%~80% 5
row_name = ['99%~100%','95%~99%','90%~95%','80%~90%','0%~80%']
def alist(df2):
data1 = df2[(df2['TT']>0.99)&(df2['TT']<=1)]['tt'].count() data2="df2[(df2['TT']">0.95)&(df2['TT']<=0.99)]['tt'].count() data3="df2[(df2['TT']">0.90)&(df2['TT']<=0.95)]['tt'].count() data4="df2[(df2['TT']">0.80)&(df2['TT']<=0.90)]['tt'].count() data5="df2[(df2['TT']">0)&(df2['TT']<=0.80)]['tt'].count() # 动态变量名 list1="[]" names="locals()" for i in range(1,10): df="names.get('data'+str(i))" list1.append(df) list1.index="pd.core.indexes.base.Index(row_name)" return < code></=0.80)]['tt'].count()></=0.90)]['tt'].count()></=0.95)]['tt'].count()></=0.99)]['tt'].count()></=1)]['tt'].count()>
data2['数量'] = data2['resolution'].apply(str)
data2['title_num'] = data2['title'] + "_" + data2['数量']
name title resolution 数量 title_num
0 站 #超时 9 9 #超时_9
1 站 断开 1 1 断开_1
…
result2 = (
data2.groupby(data2["name"])
.agg(
# 新列名 = (原列名,函数)
title_temp=("title_num", lambda x : ", ".join(x)),
)
.reset_index()
)
result2
output:
name title_temp
0 *站 #超时_9,断开_1, 手动_3…
1 *站 超时_2, 伸出_1
…
stack,unstack也很好用
还有re
Original: https://blog.csdn.net/weixin_44625028/article/details/121518943
Author: 巫巫9
Title: 数据分析实习代码总结【进阶】Python
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/739573/
转载文章受原作者版权保护。转载请注明原作者出处!