import pandas as pd
import tushare as ts
import matplotlib.pyplot as plt
# --- Stock 600519 (cached as maotai.csv): daily K-line analysis ----------
# Download the daily bars, cache them as CSV and work from the cached copy.
df = ts.get_k_data(code='600519', start='2000-09-01')
df.to_csv('./maotai.csv')
df = pd.read_csv('./maotai.csv')
# to_csv() also wrote the row index; it comes back as the 'Unnamed: 0' column.
df.drop(labels='Unnamed: 0', axis=1, inplace=True)
print(df.head())
df['date'] = pd.to_datetime(df['date'])
print(df.head())
df.set_index('date', inplace=True)
print(df.head())

# Dates on which the close ended more than 3% above the open.
intraday_change = (df['close'] - df['open']) / df['open']
date_rise = df.loc[intraday_change > 0.03].index
print(date_rise)

# Dates on which the open gapped more than 2% below the previous close.
# `.index` added so this holds dates, consistent with `date_rise` above.
prev_close = df['close'].shift(1)
open_gap = (df['open'] - prev_close) / prev_close
date_decrease = df.loc[open_gap < -0.02].index
print(date_decrease)

# Strategy back-test over a fixed window:
#   * buy SHARES_PER_MONTH shares at the open of the first trading day of
#     every month;
#   * at the open of the last trading day of every completed year sell the
#     shares bought during that year;
#   * value the shares still held at the last close of the window.
# NOTE(review): resample aliases 'M'/'A' are kept for older pandas; pandas
# >= 2.2 prefers 'ME'/'YE' - confirm against the installed version.
SHARES_PER_MONTH = 100
window = df.loc['2010-01':'2021-10']
df_b = window.resample('M').first()
df_bo = df_b['open']
pay = df_bo.sum() * SHARES_PER_MONTH

# Year-end sales must come from the same window as the purchases; the last
# year of the window is dropped because its shares are valued separately.
df_r = window.resample('A').last()
df_r = df_r['open'].iloc[:-1]
rec = df_r.sum() * (12 * SHARES_PER_MONTH)

# Shares bought during the final year of the window and never sold.
final_year = window.index[-1].year
shares_left = SHARES_PER_MONTH * int((df_bo.index.year == final_year).sum())
last_month = window['close'].iloc[-1] * shares_left
rec_sum = rec + last_month
profit = rec_sum - pay
print('profit is ', profit)

# 5-day and 30-day moving averages of the close.
ma5 = df['close'].rolling(5).mean()
ma30 = df['close'].rolling(30).mean()
plt.plot(ma5, label='MA5')
plt.plot(ma30, label='MA30')
plt.legend()
plt.show()

# Cross detection compares today's ordering of the averages with yesterday's.
# fill_value=False keeps the shifted series boolean (no NaN in the first row).
a1 = ma5 > ma30
a2 = ma5 < ma30
was_above = a1.shift(1, fill_value=False)
was_below = a2.shift(1, fill_value=False)

# Death cross: MA5 was above MA30 yesterday and is below it today.
death_ex = a2 & was_above
death_ex_date = df.loc[death_ex].index
print(death_ex_date)

# Golden cross: MA5 was below MA30 yesterday and is above it today.
gold_ex = a1 & was_below
gold_ex_date = df.loc[gold_ex].index
print('gold_ex_date are', gold_ex_date)
DataFrame格式的数据用to_csv()写入文件,用pd.read_csv()读取。
查看数据类型用info();shift()函数将DataFrame中的列整体下移或上移:参数为1表示下移一行,-1表示上移一行;resample()函数进行重采样,.first()取每组的第一个值,.last()取每组的最后一个值。
rolling().mean()函数计算移动窗口的均值。
pandas中的to_datetime()函数将日期转换为时间序列,参数format用来指定时间的格式,例如format='%Y%m%d'。
import pandas as pd
import numpy as np
from pandas import DataFrame
# --- US state population density ------------------------------------------
abb = pd.read_csv('./state-abbrevs.csv')
areas = pd.read_csv('./state-areas.csv')
population = pd.read_csv('./state-population.csv')

# Outer join so population rows whose region code has no abbreviation entry
# are kept (their 'state' is left null and repaired below).
abb_pop = pd.merge(abb, population, left_on='abbreviation', right_on='state/region', how='outer')
abb_pop = abb_pop.drop(labels='abbreviation', axis=1)
print(abb_pop.isnull().any(axis=0))

# Region codes whose full state name is missing after the join.
a = abb_pop.loc[abb_pop['state'].isnull()]
b = a['state/region']
print(b.unique())

# Fill in the full names for the 'USA' and 'PR' region codes.
# NOTE(review): the original wrote placeholder names ('United state',
# 'ppprrr'). The merge with `areas` below matches on the state name, so the
# name must be spelled as in state-areas.csv or the rows end up without an
# area and are dropped. 'Puerto Rico' is presumed to be that spelling -
# confirm against the file.
index_usa = abb_pop.loc[abb_pop['state/region'] == 'USA'].index
index_pr = abb_pop.loc[abb_pop['state/region'] == 'PR'].index
abb_pop.loc[index_usa, 'state'] = 'United States'
abb_pop.loc[index_pr, 'state'] = 'Puerto Rico'
print(abb_pop.loc[abb_pop['state/region'] == 'USA'])

# Attach the areas (joined on the columns the two tables share).
abb_pop_areas = pd.merge(abb_pop, areas, how='outer')
print(abb_pop_areas.head())

# Rows without an area cannot have a density; remove them.
index_null_area = abb_pop_areas.loc[abb_pop_areas['area (sq. mi)'].isnull()].index
abb_pop_areas.drop(labels=index_null_area, axis=0, inplace=True)

# Total population of every region in 2010.
pop_2010 = abb_pop_areas.query('ages == "total" & year == 2010')
print(pop_2010.head())

# Population density ('midu') = population / area, sorted ascending.
abb_pop_areas['midu'] = abb_pop_areas['population'] / abb_pop_areas['area (sq. mi)']
print(abb_pop_areas.head())
abb_pop_areas = abb_pop_areas.sort_values(by='midu', axis=0, ascending=True)
print(abb_pop_areas.head())
pandas中的合并函数merge(),其中参数how='outer'表示外连接:外连接会保留两张表中的所有数据,内连接只保留能够匹配上的数据。
DataFrame中用isnull()查看缺失的数据。
series中的去重操作是unique().
DataFrame中drop()函数的inplace参数决定是否直接修改原对象。默认为False,此时原对象的内容不变,drop()返回删除后的新对象;如果需要得到改变后的内容,要把结果重新赋值,即data = data.drop(['test','test2'], axis=1)。如果将inplace设为True,则原对象的内容直接被修改,函数返回None。
DataFrame中的查询函数query()。按值排序的函数sort_values(by='',axis=,ascending=True),ascending=True表示升序,False表示降序。
import pandas as pd
# Party of each candidate, keyed by the candidate name ('Last, First') that
# is looked up from the `cand_nm` column.
# NOTE(review): 'Democrat', 'Santorum, Rick', 'McCotter, Thaddeus G' and
# 'Perry, Rick' replace misspelled entries; the keys must match the data
# file's spelling exactly or map() yields NaN - confirm against the file.
parties = {
    'Bachmann, Michelle': 'Republican',
    'Romney, Mitt': 'Republican',
    'Obama, Barack': 'Democrat',
    "Roemer, Charles E. 'Buddy' III": 'Reform',
    'Pawlenty, Timothy': 'Republican',
    'Johnson, Gary Earl': 'Libertarian',
    'Paul, Ron': 'Republican',
    'Santorum, Rick': 'Republican',
    'Cain, Herman': 'Republican',
    'Gingrich, Newt': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Perry, Rick': 'Republican',
}

# Upper-case three-letter month abbreviation -> month number.
months = {
    'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
    'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12,
}

# Candidates singled out for closer analysis; every name is a key of `parties`.
of_interest = [
    'Obama, Barack',
    'Romney, Mitt',
    'Santorum, Rick',
    'Paul, Ron',
    'Gingrich, Newt',
]
# --- Election contributions: load, clean, summarise by party ------------
df = pd.read_csv('./usa_election.txt')
print(df.head())
# info() prints its report itself and returns None, so it is not wrapped in print().
df.info()

# Replace every missing value with a visible placeholder.
df.fillna(value='Not Provide', inplace=True)

# Remove rows whose contribution amount is not positive.
# NOTE(review): the comparison operator was lost from the original line
# (`df['contb_receipt_amt']0`); `<= 0` is the presumed intent - confirm.
drop_index = df.loc[df['contb_receipt_amt'] <= 0].index
df = df.drop(labels=drop_index, axis=0)
df.info()

# Attach each candidate's party; names missing from `parties` become NaN.
df['party'] = df['cand_nm'].map(parties)
print(df.head())
print(df['party'].unique())
print(df['party'].value_counts())

# Total amount contributed to each party.
print(df.groupby('party')['contb_receipt_amt'].sum())
def transform(d):
    """Convert a 'DD-MON-YY' date string to 'YYYY-M-DD'.

    The month abbreviation is translated through the module-level `months`
    table and is not zero-padded; the two-digit year is prefixed with '20'.

    Args:
        d: Date string made of exactly three '-'-separated parts
            (day, month abbreviation, two-digit year).

    Returns:
        The reformatted date string, e.g. '01-JAN-12' -> '2012-1-01'.

    Raises:
        ValueError: If `d` does not split into exactly three parts.
        KeyError: If the month part is not a key of `months`.
    """
    day, month, year = d.split('-')
    month = months[month]
    return '20' + year + '-' + str(month) + '-' + day
# Rewrite every receipt date through transform().
receipt_dates = df['contb_receipt_dt'].map(transform)
df['contb_receipt_dt'] = receipt_dates
print(df['contb_receipt_dt'])

# Contributions from donors whose occupation is exactly 'DISABLED VETERAN',
# totalled per candidate.
is_disabled_veteran = df['contbr_occupation'] == 'DISABLED VETERAN'
df_old = df.loc[is_disabled_veteran]
veteran_totals = df_old.groupby(by='cand_nm')['contb_receipt_amt'].sum()
print(veteran_totals)
DataFrame中的空值填充函数是fillna();统计各个值出现次数的函数是value_counts()。
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# --- CDNOW purchase history: monthly trends, per-user stats, RFM table ---
# The file has no header row; fields are separated by runs of whitespace.
df = pd.read_csv(
    './CDNOW_master.txt',
    header=None,
    sep=r'\s+',
    names=['user_id', 'order_dt', 'order_product', 'order_amount'],
)
print(df.head())
df['order_dt'] = pd.to_datetime(df['order_dt'], format='%Y%m%d')
print(df.head())

# Truncate every order date to the first day of its month.
# (astype('datetime64[M]') is rejected by pandas 2.x, hence the Period round trip.)
df['month'] = df['order_dt'].dt.to_period('M').dt.to_timestamp()
print(df.head())

# Monthly totals.
by_month = df.groupby('month')
amount_sum = by_month['order_amount'].sum()      # money spent per month
print(amount_sum)
product_sum = by_month['order_product'].sum()    # products bought per month
print(product_sum)
# The original repeated the amount sum here; the number of orders is meant.
times_sum = by_month['user_id'].count()          # orders placed per month
print(times_sum)
# Distinct buyers per month (count() would count orders, not people).
people_sum = by_month['user_id'].nunique()
print(people_sum)

# Per-user totals.
by_user = df.groupby('user_id')
user_amount = by_user['order_amount'].sum()
user_times = by_user['order_dt'].count()
user_product = by_user['order_product'].sum()

# Keep only users below a cut-off so the histogram is not dominated by outliers.
# NOTE(review): the query strings were truncated in the original
# ("query('order_amount )"); the `<=` comparison and both thresholds are a
# reconstruction - confirm the intended limits.
MAX_USER_AMOUNT = 1000
MAX_USER_PRODUCT = 100
user_amount = user_amount[user_amount <= MAX_USER_AMOUNT]
user_product = user_product[user_product <= MAX_USER_PRODUCT]
plt.hist(user_product)
plt.show()

# Number of users per first-purchase month and per last-purchase month.
# sort_index() puts the months in chronological order for the line plot.
user_first = by_user['month'].min()
user_first.value_counts().sort_index().plot()
user_last = by_user['month'].max()
user_last.value_counts().sort_index().plot()
plt.show()

# Users whose first and last purchase month coincide (True) vs. the rest (False).
user_judge = by_user['month'].agg(['min', 'max'])
new_num = (user_judge['min'] == user_judge['max']).value_counts()
print(new_num)

# RFM table: R = days between the data set's last order date and the user's
# most recent order, F = total products bought, M = total amount spent.
# Recency needs the user's latest order, hence 'max' (the original used 'min').
rfm = df.pivot_table(
    index='user_id',
    aggfunc={'order_product': 'sum', 'order_amount': 'sum', 'order_dt': 'max'},
)
date_max = df['order_dt'].max()
rfm['R'] = (date_max - rfm['order_dt']) / np.timedelta64(1, 'D')
rfm.drop(labels='order_dt', axis=1, inplace=True)
# Rename by column name rather than by position, then fix the column order.
rfm = rfm.rename(columns={'order_amount': 'M', 'order_product': 'F'})[['M', 'F', 'R']]
# Segment name for every R/F/M flag combination ('1' = value >= 0).
# '111' reads '重要价值客户', matching its counterpart '110' ('一般价值客户');
# the original '重要保证客户' was a typo.
_RFM_LABELS = {
    '111': '重要价值客户',
    '011': '重要保持客户',
    '101': '重要挽留客户',
    '001': '重要发展客户',
    '110': '一般价值客户',
    '010': '一般保持客户',
    '100': '一般挽留客户',
    '000': '一般发展客户',
}


def rfm_func(x):
    """Return the customer-segment label for one RFM row.

    Each value of the row is turned into the flag '1' when it is >= 0 and
    '0' otherwise (NaN compares false and therefore yields '0'); the flags
    of 'R', 'F' and 'M' are concatenated in that order and looked up in the
    segment table.

    Args:
        x: pandas Series with the entries 'R', 'F' and 'M'.

    Returns:
        The segment name as a string.
    """
    level = x.map(lambda value: '1' if value >= 0 else '0')
    label = level['R'] + level['F'] + level['M']
    return _RFM_LABELS[label]
# Subtract each column's mean from that column, then label every user row
# with rfm_func.
rfm_centered = rfm.apply(lambda column: column - column.mean())
rfm['label'] = rfm_centered.apply(rfm_func, axis=1)
print(rfm)
pandas中的pd.read_csv()函数的参数header=None表示文件没有表头行(列索引),sep='\s+'表示以一个或多个空白字符作为分隔符,参数names用来指定各列的名字。
matplotlib.pyplot中的散点图函数scatter(),直方图函数hist()。
agg([func1,func2]):对分组后的结果进行指定聚合。
DataFrame中的透视表函数pivot_table()的参数aggfunc指定聚合方式,默认是求均值。
更改rfm表格的列索引名称使用columns属性。DataFrame中的df.apply(func)可以对df的每一行或每一列进行func运算,默认按列运算。
转置使用属性.T。
DataFrame中apply和applymap的区别:apply把DataFrame的每一行或每一列作为输入,applymap把每一个元素作为输入。
applymap返回DataFrame;apply返回Series或DataFrame,取决于func的返回值。
Original: https://blog.csdn.net/esfuerzos/article/details/120810900
Author: esfuerzos
Title: 数据分析项目 总结
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/740295/
转载文章受原作者版权保护。转载请注明原作者出处!