在已爬取的上海二手房数据基础上,调用高德接口,获取房屋经纬度及人民广场经纬度。在原有数据上添加一列“房子到人民广场的距离”,然后利用k-means进行聚类分析。本文只记录数据预处理部分。
REF:
1.1 数据导入
import pickle
import pandas as pd
import re
import numpy as np
import requests
import json
import math
from tqdm import tqdm, trange
读取
# Load the crawled data sets. Each file is opened in a `with` block so the
# handle is closed as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted crawler run.
with open('./shanghai_ershou_v2.pkl', 'rb') as pkl_file:
    shanghai_ershou = pickle.load(pkl_file)
with open('./hangzhou_new.pkl', 'rb') as pkl_file:
    hangzhou_new = pickle.load(pkl_file)

# One count per top-level key: how many entries are stored under that key.
l = [len(houses) for houses in shanghai_ershou.values()]
print('爬取房屋的总数:',sum(l))
转化为dataframe
# Build the combined frame with a single concat. Concatenating inside the loop
# would copy the ever-growing accumulated frame on every iteration.
# The leading empty frame takes its columns from the row labels of the
# 'locationbeicaipg1' entry, so those columns come first in the result.
frames = [pd.DataFrame(columns=pd.DataFrame(shanghai_ershou['locationbeicaipg1']).index)]
# Each entry is transposed so that its original row labels become columns.
frames.extend(pd.DataFrame(page).T for page in shanghai_ershou.values())
shanghai_ershou_df = pd.concat(frames, ignore_index=True)
1.2 已有数据进行数据预处理
发现dataframe中每个元素都是一个列表(有可能为空)
def extract_0(x):
    """Return the first element of ``x``, or ``None`` when there is none.

    Args:
        x: Usually a list taken from a DataFrame cell; may be empty, ``None``
            or any other non-subscriptable value.

    Returns:
        ``x[0]`` when that lookup succeeds, otherwise ``None``.
    """
    try:
        return x[0]
    except (LookupError, TypeError):
        # LookupError: empty sequence or missing key 0.
        # TypeError: value is not subscriptable (None, NaN, numbers, ...).
        return None
# Work on a separate copy so that the column edits below do not modify
# shanghai_ershou_df.
df_shanghai = shanghai_ershou_df.copy(deep=True)
dataframe中每一列数据都从列表中提取出来
# Replace every cell by its first list element (None when the cell is empty),
# one column at a time.
for column_name in df_shanghai.columns:
    unwrapped = df_shanghai[column_name].map(extract_0)
    df_shanghai[column_name] = unwrapped
对总价进行处理
# Cast the total price column to floating point.
df_shanghai['total_price'] = df_shanghai['total_price'].astype(float)
对单价进行处理
# Keep only the leading run of digits/commas, strip the thousands separators,
# then cast to float. str.extract yields a one-column frame; column 0 is the
# captured group.
unit_price_captured = df_shanghai['unit_price'].str.extract(r'([\d,]+)')
unit_price_plain = unit_price_captured.replace(',', '', regex=True)
df_shanghai['unit_price'] = unit_price_plain.astype('float')[0]
对面积进行处理
# Capture the number that directly precedes "平米" in the `info` text.
# The pattern is a raw string so `\d` reaches the regex engine as written
# instead of relying on an invalid string escape.
# NOTE(review): the captured values remain strings; cast to float before
# numeric use if that is what downstream code expects.
df_shanghai['area'] = df_shanghai['info'].str.extract(r'([\d.]+)平米', expand=False)
1.3 批量调用接口并解析出经纬度
# Geocode every row through the AMap REST API and keep the raw response text.
# NOTE(review): the API key is embedded in the URL below; consider moving it
# out of the source before sharing this script.
url = 'https://restapi.amap.com/v3/geocode/geo?key=c00a9fc63a97c64fe63bf1ff051a285e&address=上海市{}&city=上海市'
res_dict = {}
# A single loop over all rows. The former inner `for i in tqdm(range(5))`
# shadowed the row index, so only rows 0-4 were ever requested.
for i in trange(df_shanghai.shape[0]):
    try:
        # The query address is the text in column 5 followed by column 0.
        location = df_shanghai.iloc[i, 5] + df_shanghai.iloc[i, 0]
        # The timeout keeps one stalled connection from blocking the whole run.
        res = requests.get(url.format(location.rstrip()), timeout=10).text
    except (TypeError, AttributeError, requests.RequestException):
        # Missing address parts or a failed request: record None for this row.
        res = None
    res_dict[i] = res
df_shanghai['api'] = pd.Series(res_dict)
def parse_location(res):
    """Extract the ``location`` string of the first geocode from a response.

    Args:
        res: Raw JSON text of a geocoding response, or ``None`` when the
            request failed.

    Returns:
        The ``location`` value of the first entry in ``geocodes``, or ``None``
        when ``res`` is ``None``, is not valid JSON, or has no geocodes.
    """
    if res is None:
        return None
    try:
        geocodes = json.loads(res).get('geocodes')
    except (ValueError, TypeError, AttributeError):
        # Not JSON text, not a string at all, or not a JSON object.
        return None
    if not geocodes:
        # Key missing or an empty result list: nothing was matched.
        return None
    return geocodes[0].get('location')
经纬度
# Parse each stored API response into its location string.
df_shanghai['location'] = df_shanghai['api'].map(parse_location)
经度,纬度
# Split the location text: the number before the comma goes to `longitude`,
# the number after it to `latitude`, both as floats.
# Raw-string patterns keep `\d` from being read as a string escape.
df_shanghai['longitude'] = (
    df_shanghai['location'].str.extract(r'([\d.]+),', expand=False).astype('float')
)
df_shanghai['latitude'] = (
    df_shanghai['location'].str.extract(r',([\d.]+)', expand=False).astype('float')
)
1.4 计算距离
获取人民广场的经纬度
# Geocode People's Square once; it is the reference point for all distances.
# NOTE(review): the API key is embedded in the URL; consider moving it out of
# the source before sharing this script.
url = 'https://restapi.amap.com/v3/geocode/geo?key=c00a9fc63a97c64fe63bf1ff051a285e&address=上海市{}&city=上海市'
location = '人民广场'
# The timeout prevents a stalled connection from hanging the script.
res = requests.get(url.format(location.rstrip()), timeout=10).text
rg_location = json.loads(res).get('geocodes')[0].get('location')
# One raw-string pattern captures both numbers around the comma.
matchObj = re.search(r'([\d.]+),([\d.]+)', rg_location)
rg_longitude = float(matchObj.group(1))  # longitude
rg_latitude = float(matchObj.group(2))  # latitude
定义用于计算距离的函数
def angle2radian(x):
    """Convert an angle from degrees to radians."""
    scaled = x * math.pi
    return scaled / 180
def rec2sphere(lng1, lat1):
    """Convert longitude/latitude in radians to Cartesian coordinates.

    Despite its name, this maps spherical coordinates to rectangular ones on a
    sphere of radius 6371 (the same radius get_distance uses, documented there
    as kilometres).

    Args:
        lng1: Longitude in radians.
        lat1: Latitude in radians.

    Returns:
        Tuple ``(x, y, z)`` of the point on the sphere.
    """
    R = 6371
    # Latitude sets the height (z) and the radius of the horizontal circle;
    # longitude is the angle around that circle.
    horizontal_radius = R * math.cos(lat1)
    x1 = horizontal_radius * math.cos(lng1)
    y1 = horizontal_radius * math.sin(lng1)
    z1 = R * math.sin(lat1)
    return x1, y1, z1
def get_chord_length(x1, y1, z1, x2, y2, z2):
    """Return the straight-line distance between two Cartesian points."""
    offsets = (x1 - x2, y1 - y2, z1 - z2)
    squared_total = offsets[0]**2 + offsets[1]**2 + offsets[2]**2
    return np.sqrt(squared_total)
def get_distance(lng1, lat1, lng2, lat2):
    """Return the distance (km) between two points given in degrees.

    Both points are converted to radians and then to Cartesian coordinates
    with rec2sphere. The chord between them is measured and turned into the
    matching arc length on a sphere of radius 6371.
    """
    R = 6371
    # Degrees -> radians -> Cartesian coordinates for each end point.
    start_xyz = rec2sphere(angle2radian(lng1), angle2radian(lat1))
    end_xyz = rec2sphere(angle2radian(lng2), angle2radian(lat2))
    # Straight-line (chord) distance through the sphere.
    chord = get_chord_length(*start_xyz, *end_xyz)
    # Half the chord over the radius is the sine of half the central angle.
    central_angle = math.asin(chord / 2 / R) * 2
    return central_angle * R
对dataframe的每一行计算距离
# Distance from every row's coordinates to People's Square, keyed by row
# position, then stored as the `distance_rg` column.
longitudes = df_shanghai['longitude'].tolist()
latitudes = df_shanghai['latitude'].tolist()
distance_dict = {
    row: get_distance(longitudes[row], latitudes[row], rg_longitude, rg_latitude)
    for row in trange(df_shanghai.shape[0])
}
df_shanghai['distance_rg'] = pd.Series(distance_dict)
1.5 数据存储
# Remove the intermediate geocoding columns in place, then write the cleaned
# table to CSV.
df_shanghai.drop(['api', 'location'], axis=1, inplace=True)
df_shanghai.to_csv(path_or_buf='sh_ershou_clean_v2.csv')
step2:
Original: https://blog.csdn.net/nikita_zj/article/details/122342746
Author: nikita_zj
Title: 聚类尝试-kmeans-step1数据预处理
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/550262/
转载文章受原作者版权保护。转载请注明原作者出处!