【2019bike】数据处理记录

import collections
import csv
import os
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd

零、数据筛选

0-1 搭建grid号

from utils2019 import get_pos_lola

# Annotate the raw June/October ride CSVs with grid ids and ride duration.
filename=['./data/datajun','./data/dataoct']
import pandas as pd

for fn in filename:
    start_grid=[]
    end_grid=[]
    df=pd.read_csv(fn+'.csv')
    # Map every trip's start/end coordinates to a grid cell id via the
    # project helper get_pos_lola (imported from utils2019 above).
    for i in range(len(df)):
        start_grid.append(get_pos_lola(df.loc[i]["starting_lng"],df.loc[i]['starting_lat']))
        end_grid.append(get_pos_lola(df.loc[i]["dest_lng"],df.loc[i]['dest_lat']))
    df["start_grid"]=start_grid
    df["end_grid"]=end_grid
    # Ride duration in whole minutes (finished_time - begun_time).
    df["ridetime"]=pd.DataFrame((pd.to_datetime(df['finished_time']) -
              pd.to_datetime(df['begun_time'])).apply(lambda x: int(x/np.timedelta64(1,'m'))))
    print(df.head(3))
    df.to_csv(fn+"_new.csv")

0-2数据筛选

  • 添加grid-后的数据
    【2019bike】数据处理记录
  • 调用函数筛选数据
from utils2019 import ready_data

# Filter June's grid-tagged records into <month>_ready.csv.
month = "jun"
ready_data(month)

# NOTE(review): ready_data is invoked twice in the original transcript; the
# second call just redoes (and re-saves) the same filtering — harmless but redundant.
ready_data(month)
  • 结果展示
# Load the cleaned month file back to inspect the result.
month = "jun"
root = "../data/"
leaf = root+month +"_ready.csv"
pd.read_csv(leaf)
文件的列名为:
Index(['starting_lng', 'starting_lat', 'dest_lng', 'dest_lat', 'start_grid',
       'end_grid', 'ridetime', 'start_day', 'start_hour', 'start_minute',
       'end_day', 'end_minute', 'end_hour'],
      dtype='object')
文件的大小为:
(374944, 13)

一、以时间为轴的流量数据

对每个月的每日数据进行保存

1.1 数据结构搭建和保存

from tqdm import trange

class Obtain_timeflow(object):
    """Build per-time-slot in/out flow tensors for one month of trips.

    Reads ``<root><month>_ready.csv`` (produced by ``ready_data``) and
    aggregates trip starts/ends into an array of shape
    (24, 60/interval, 500, 2) per day — 500 grid cells, last axis =
    (start flow, end flow).  Per-day arrays are cached on disk under
    ``<root><month>/<day>.npy`` and reused on later runs.
    """

    def __init__(self,month):
        # `month` may be given numerically (2/3/6/10) or by name ("jun", ...).
        self.root = "../data/"
        if isinstance(month,(int,float)) :
            d = {2:"feb",3:"mar",6:"jun",10:"oct"}
            self.month = d[int(month)]
        else:
            self.month = month.lower()
        # number of days in this month (2019 — no leap handling needed)
        self.DAY = self.repre_day(self.month)
        path = self.root+self.month +"_ready.csv"
        if os.path.exists(path):
            self.df =pd.read_csv(path,index_col=False)
            self.columns = self.df.columns
        else:
            print("没有该文件!")  # file not found — self.df stays unset

    def build_flow(self,interval=None):
        """Assemble the whole month's flow tensor.

        interval: minutes per time slot (default 15).
        Returns the tensor reshaped to (days*24*slots, 500, 2) and also saves
        it to <root><month>_interval<interval>.npy.
        """
        self.remove_file()  # drop the bogus "<month>/0.npy" cache file if present
        if interval is None:
            interval = 15
        Inval = int(60/interval)  # number of slots per hour
        Monarr = []
        for day in trange(1,self.DAY+1):
            if not os.path.exists(self.root+self.month+"/"):
                os.makedirs(self.root+self.month+"/")
            inter_path =self.root+self.month+"/"+str(day)+".npy"
            if os.path.exists(inter_path):
                # cached day: reuse instead of recomputing
                Monarr.append(np.load(inter_path))
                continue
            Dayarr=np.zeros((24,Inval,500,2))
            for hour in range(24):
                rs=self.day_record(day,hour,"start",Inval)
                re=self.day_record(day,hour,"end",  Inval )

                # stack start-flow and end-flow along the feature axis
                Dayarr[hour]=np.concatenate((rs,re),axis=2)
            np.save(inter_path,Dayarr)

            Monarr.append(Dayarr)
        Monarr = np.array(Monarr)
        print("concate data shape:",Monarr.shape)
        print("every day sumflow")
        # NOTE(review): despite the label above, this prints the MEAN flow per day
        print(Monarr.mean(axis=(1,2,3,4)))
        Monarr = Monarr.reshape(-1,500,2)
        print("flow feature data shape:",Monarr.shape)
        Mon_path = self.root+self.month+"_interval"+str(interval)+".npy"
        np.save(Mon_path,Monarr)
        return Monarr

    def day_record(self,day,hour,point,Inval):
        """Flow counts for one (day, hour): returns an array (Inval, 500, 1).

        point: "start" or "end" — selects which trip endpoint is counted.
        """
        d = str(point) +"_day"
        h = str(point) +"_hour"
        m = str(point) +"_minute"
        g = str(point) +"_grid"
        if d not in self.columns:
            print("列名不准确")  # warning: expected column names are missing
        record=np.zeros((500,Inval))

        # all trips of this day+hour, keeping only (grid id, minute)
        d1=self.df[(self.df[d]==day) &(self.df[h]==hour)][[g,m]]

        for node in set(d1[g]):

            L= d1[d1[g]==node][m].tolist()
            if len(L) ==0:
                record[node] = np.zeros(shape=(Inval,))
            else:
                record[node] = self.node_flow_feature(L,Inval)
        # (500, Inval) -> (Inval, 500, 1) so hours stack along axis 0 upstream
        r=record.transpose(1, 0)
        r=np.expand_dims(r, axis=2)
        return r

    def remove_file(self,filename=None):
        # Default target is "<month>/0.npy" — days are numbered from 1, so a
        # 0.npy file can only be stale junk.
        if filename is None :
            filename =self.root+self.month+"/"+str(0)+".npy"
        if os.path.exists(filename):
            os.remove(filename)
    def remove_dir(self,):
        """Delete every cached per-day .npy under <root><month>/ (e.g. before
        rebuilding at a different interval)."""
        floot_dir=self.root+self.month+"/"
        fl = os.listdir(floot_dir)
        for p in fl:
            path = floot_dir+p
            self.remove_file(path)

    @classmethod
    def repre_day(self,month):
        """Days in the given month name (2019 data: feb=28, jun=30, else 31)."""
        if month.lower() =="feb":
            return 28
        elif month.lower() =="jun":
            return 30
        else:
            return 31
    @classmethod
    def node_flow_feature(self,L,Inval):
        """Histogram of minute values L into Inval equal slots of the hour."""
        R = np.zeros(shape=(Inval,))
        mod = 60/Inval
        for m in L:
            # minute 60 (rounded-up timestamps) is folded into the last slot
            if m ==60:
                m -=1
            index = int(m//mod )
            R[index] += 1
        return R

调用1:


# Example call 1: build June's flow tensor with the default 15-minute slots.
month = "jun"
otf=Obtain_timeflow(month)
Monarr=otf.build_flow()

调用2:


# Example call 2: 5-minute slots; clear the per-day cache first so flows are
# rebuilt at the new resolution instead of reusing the 15-minute files.
month = "jun"
interval = 5
otf=Obtain_timeflow(month)
otf.remove_dir()
Monarr=otf.build_flow(interval)

1.2 滑动窗口数据

def windows_slices(month,interval,in_window,out_window):
    """Slice a month's flow tensor into (feature, target) sliding windows.

    Loads ../data/<month>_interval<interval>.npy of shape (T, 500, 2),
    transposes it to (N, F, T), and cuts every contiguous window of
    in_window+out_window time steps into an input/output pair.

    Returns (feature, target):
        feature: (S, N, in_window, F)
        target : (S, N, F, out_window)
    Both arrays are also saved next to the input file.
    """
    filename = "../data/"+month+"_interval"+str(interval)+".npy"
    Marr = np.load(filename)
    # (T, N, F) -> (N, F, T): time last, so windows are plain slices
    Marr = np.transpose(Marr,(1,2,0))
    print(month+" data shape is:",Marr.shape)

    window = in_window+out_window
    # Fix: the original used range(T-window), which silently dropped the last
    # valid window; T-window+1 windows of length `window` fit in T steps.
    n_windows = Marr.shape[2]-window+1

    feature, target = [], []
    for i in range(n_windows):
        # (N, F, in_window) -> (N, in_window, F)
        feature.append(Marr[:, :, i:i+in_window].transpose((0,2,1)))
        target.append(Marr[:, :, i+in_window:i+window])
    feature= np.array(feature)
    print("feature shape [S, N,W,F] = ",feature.shape)
    np.save("../data/"+month+"_interval"+str(interval)+"_feature.npy",feature)
    target = np.array(target)
    print( "target shape [S, N, F,W] = ",target.shape)
    np.save("../data/"+month+"_interval"+str(interval)+"_target.npy",target)
    return feature,target

调用:

# Example: 15-minute slots, 6 input steps predicting the next 2 steps.
month = "jun"
interval = 15
in_window = 6
out_window = 2
feature,target = windows_slices(month,interval, in_window,out_window)

out:

jun data shape is: (500, 2, 2880)
feature shape [S, N,W,F] =  (2872, 500, 6, 2)
target shape [S, N, F,W] =  (2872, 500, 2, 2)

二、空间矩阵的搭建

grid性质搭建ing

2.1 grid 4个经纬度【左右上下】顺序

# Bounding box of the study area: [lon_left, lat_bottom, lon_right, lat_top].
Grid=[118.750802,32.019297,118.818880,32.063531]
def grid_longitude_latitude(Grid):
    """Partition the bounding box into a 20x25 grid of cells.

    Grid: [lon_left, lat_bottom, lon_right, lat_top].
    Returns a dict mapping cell id (0..499, row-major from the south-west
    corner) to [lon_left, lat_bottom, lon_right, lat_top, row, col].

    Fix: the original referenced `collections` without importing it, raising
    NameError unless the caller happened to import it; it also shadowed the
    builtin `id`.
    """
    import collections
    import numpy as np

    num_jing = 25  # longitude divisions (columns)
    num_wei = 20   # latitude divisions (rows)

    lons = np.linspace(Grid[0], Grid[2], num=num_jing + 1)
    lats = np.linspace(Grid[1], Grid[3], num=num_wei + 1)

    grids = collections.defaultdict(list)
    cell_id = 0
    for i in range(num_wei):
        for j in range(num_jing):
            # cell corners followed by its (row, col) position
            grids[cell_id].extend([lons[j], lats[i], lons[j + 1], lats[i + 1], i, j])
            cell_id += 1
    return grids

2.2grid 空间邻居矩阵


def get_neighbor_matrix(grids,num_nodes=500):
    """Symmetric 0/1 spatial adjacency matrix for the grid cells.

    grids: dict id -> list where index 4 is the row (latitude band) and
           index 5 the column (longitude band), as produced by
           grid_longitude_latitude.
    num_nodes: number of cells; generalised to a parameter — the default 500
           preserves the original behaviour.

    Two cells are linked when they are vertical, horizontal, or
    "(+row, +col)" diagonal neighbours.
    NOTE(review): the opposite diagonal (row+1, col-1) is never linked —
    presumably intentional; confirm before relying on full 8-neighbourhoods.
    """
    import numpy as np
    NM = np.zeros((num_nodes, num_nodes))

    for i in range(num_nodes):
        for j in range(i, num_nodes):
            vertical = (grids[i][5] == grids[j][5]) and (grids[i][4] + 1 == grids[j][4])
            horizontal = (grids[i][4] == grids[j][4]) and (grids[i][5] + 1 == grids[j][5])
            diagonal = (grids[i][4] + 1 == grids[j][4]) and (grids[i][5] + 1 == grids[j][5])
            if vertical or horizontal or diagonal:
                NM[i][j] = 1
                NM[j][i] = 1
    return NM

数据保存

# Build the 500x500 spatial neighbour matrix and persist it to disk.
Grid = [118.750802,32.019297,118.818880,32.063531]
grids = grid_longitude_latitude(Grid)
NM = get_neighbor_matrix(grids)
fpath="../data/Spatial_neighbor.npy"
np.save(fpath,NM)
NM1=np.load(fpath)  # reload to verify the file round-trips
NM1

三、文本矩阵的搭建

3.1 grid的pois特征搭建

数据样貌:

【2019bike】数据处理记录
统计每个poi在每个grid区域的个数,并保存
# For each of 11 POI categories (AMAP type codes), count how many POIs fall
# in each of the 500 grid cells; save the result as a (500, 11) feature matrix.
csvs=["100000","120000","140000","150000","160000","170000","050000","060000","070000","080000","090000"]
pops = np.zeros(shape=(11,500))
for k in range(11):
    df = pd.read_csv("../poi_data/解析结果_118.750802,32.063531#118.818880,32.019297types_"+csvs[k]+".csv")
    Cter=defaultdict(lambda : 0) # defaultdict: missing grids count as 0
    for i in range(len(df)):
        grid=get_pos_lola(df.loc[i]["wgs84_lon"], df.loc[i]["wgs84_lat"])
        if grid != -1:
            Cter[grid] +=1
    for j in range(500):
        pops[k][j] = Cter[j]

# (11, 500) -> (500, 11): one row of POI counts per grid cell
poi = np.transpose(pops,(1,0))
np.save("../poi_data/characteristic500.npy",poi)

结果展示:

【2019bike】数据处理记录

3.2 pois特征关系提取–fastDTW

fast-DTW距离矩阵

def dis_DTW(feature=None):
    """Pairwise fastDTW distance matrix over the per-node POI feature vectors.

    feature: (NUM, F) array; loaded from the common data folder when None.
    Returns the symmetric (NUM, NUM) distance matrix, zero on the diagonal,
    and saves it to ../data/common/dis_DTW.npy.
    """
    if feature is None:
        feature = np.load("../data/common/"+"/characteristic500.npy")
    print("feature data shape is ",feature.shape)

    n_nodes = feature.shape[0]
    dist = np.zeros(shape=(n_nodes, n_nodes))
    # upper triangle only; mirror each distance to keep the matrix symmetric
    for row in trange(n_nodes):
        for col in range(row + 1, n_nodes):
            d, _path = fastdtw(feature[row], feature[col])
            dist[row][col] = d
            dist[col][row] = d

    np.save("../data/common/dis_DTW.npy", dist)
    return dist

获得归一化后PA矩阵

def get_matrix(ADJ,D=None,Thre=None):
    """Build and save the POI-similarity adjacency matrix PN_<ADJ>.npy.

    ADJ : "weighted"  -> inverse-distance weights, min-max normalised to [0,1];
          "unweighted"-> boolean matrix of pairs whose DTW distance < Thre.
    D   : precomputed DTW distance matrix (loaded from disk when None).
    Thre: distance cut-off; defaults to min(mean, median) of D divided by 5.
    """
    if D is None:
        D =np.load("../data/common/dis_DTW.npy")

    if Thre is None:
        Thre = int(min(np.mean(D),np.median(D))/5)
        print("median and mean limit the value is :",Thre)
    NUM = D.shape[0]
    A = np.zeros(shape=(NUM,NUM))
    fname = "../data/common/PN_"+ADJ+".npy"
    if ADJ =="weighted":
        for i in range(NUM):
            for j in range(NUM):
                dij = round(D[i][j],1)
                if dij == 0:
                    # identical feature vectors (incl. the diagonal): max weight
                    A[i][j] = Thre
                elif dij > Thre:
                    A[i][j] = 0  # too dissimilar: no edge
                else:
                    A[i][j] = round(1/dij,2)

        # min-max normalise the weights into [0, 1]
        big_A = np.max(A)
        small_A = np.min(A)
        A = (A - small_A) / (big_A - small_A)
        np.save(fname,A)
        return np.round(A,2)
    elif ADJ =="unweighted":
        # Fix: the original thresholded the still-all-zero A (``A < Thre``),
        # which marked every pair as adjacent; the distance matrix D is the
        # intended operand.
        Au = D < Thre
        np.save(fname,Au)
        return Au

调用:

# Build the weighted POI-similarity adjacency from freshly computed distances.
ADJ ="weighted"
D = dis_DTW()
A=get_matrix(ADJ,D,Thre=None)

注:函数fastdtw():见链接fastdtw
验证矩阵:

【2019bike】数据处理记录

; utils.py

import numpy as np
def node_flow_feature(data,node):
    """Count how many of `node`'s trips fall in each 5-minute slot of an hour.

    data : two-column table — first column the station/grid id, second column
           the minute-of-hour (expected 0..60; 60 is folded into the last
           slot, as are any larger values).
    node : station id to count.
    Returns a length-12 numpy array; slot k covers minutes [5k, 5k+5).

    Fixes vs the original: the caller's DataFrame is no longer mutated (the
    original overwrote data.columns), and the 12-branch if/elif ladder is
    replaced by direct integer bucketing (minute // 5, clamped to 0..11).
    """
    T = np.zeros(shape=(12,))
    stations = data.iloc[:, 0]
    minutes = data.iloc[:, 1]
    for t in minutes[stations == node]:
        # clamp out-of-range minutes into the valid slot range
        T[min(max(int(t), 0) // 5, 11)] += 1
    return T

import decimal
def get_pos_lola(lo,la):
    """Map a (longitude, latitude) point to its grid cell id, or -1 if outside.

    The bounding box is split into 25 longitude x 20 latitude cells;
    id = col + 25*row with rows counted from the southern edge (0..499).
    Decimal arithmetic is kept from the original to make the floor divisions
    reproducible.

    Fix: the comparison operators on the range checks were garbled in the
    scraped source ("0dx_lo24"); reconstructed as inclusive range checks,
    which the surrounding elif/else structure implies.
    """
    lo=decimal.Decimal(lo)
    la=decimal.Decimal(la)

    # bounding box corners: lower-left (lo_l, la_l), upper-right (lo_r, la_r)
    lo_l=decimal.Decimal(118.750802)
    la_l=decimal.Decimal(32.019297)
    lo_r=decimal.Decimal(118.818880)
    la_r=decimal.Decimal(32.063531)

    # cell width (longitude) and height (latitude)
    d_lo=(lo_r-lo_l)/25
    d_la=(la_r-la_l)/20

    dx_lo=(lo-lo_l)//d_lo   # column index
    dy_la=(la-la_l)//d_la   # row index

    if 0 <= dx_lo <= 24 and 0 <= dy_la <= 19:
        return int(dx_lo+25*dy_la)
    elif 0 <= dx_lo <= 24 and la == la_r:
        # point exactly on the northern edge belongs to the top row (19)
        return int(dx_lo+25*decimal.Decimal(19))
    else:
        return -1
def ready_data(month):
    """Clean a month's grid-tagged trip file and write <month>_ready.csv.

    Drops rider metadata columns, removes trips whose start or end fell
    outside the grid (grid id == -1), expands the raw timestamps into
    day/hour/minute columns, then saves the result and prints its columns
    and shape.
    """
    root = "../data/"
    filename = root + month + "_grid.csv"
    if not os.path.exists(filename):
        print("文件不存在!")
        return

    df = pd.read_csv(filename)

    # rider metadata and the stray index column are not needed downstream
    df.drop(['vehicle_id', 'is_mkt_card', 'birth_year', "Unnamed: 0"],
            axis=1, inplace=True)

    # keep only trips that both start AND end inside the grid
    outside = df[(df["start_grid"] == -1) | (df["end_grid"] == -1)].index
    df.drop(outside, inplace=True)

    begun = pd.to_datetime(df['begun_time'])
    finished = pd.to_datetime(df['finished_time'])
    df['start_day'] = begun.dt.day
    df['start_hour'] = begun.dt.hour
    df['start_minute'] = begun.dt.minute
    df['end_day'] = finished.dt.day
    df['end_minute'] = finished.dt.minute
    df['end_hour'] = finished.dt.hour

    # raw timestamps are fully expanded above and can be discarded
    df.drop(columns=["begun_time", "finished_time"], inplace=True)

    savefile = root + month + "_ready.csv"
    df.to_csv(savefile, index=False)
    print("文件的列名为:")
    print(df.columns)
    print("文件的大小为:")
    print(df.shape)

Original: https://blog.csdn.net/panbaoran913/article/details/121621691
Author: panbaoran913
Title: 【2019bike】数据处理记录

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/756772/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球