歩行量(survery_data)とWi-Fiアドレス数(wifi_data)との比較を、階層線形モデルで行う。
手順:
マニュアルページ:http://www.statsmodels.org/stable/mixed_linear.html
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# テストデータ
df = {}
df[1]= pd.read_csv('/home/tamada/survey_wifi20181130.csv', names = ('point','date','time','wifi_data','survey_data'))
df[2] = pd.read_csv('/home/tamada/survey_wifi20181201.csv', names = ('point','date','time','wifi_data','survey_data'))
df[3] = pd.read_csv('/home/tamada/survey_wifi20181202.csv', names = ('point','date','time','wifi_data','survey_data'))
plt.scatter(df[1]['wifi_data'], df[1]['survey_data'], label = "20181130")
plt.scatter(df[2]['wifi_data'], df[2]['survey_data'], label = "20181201")
plt.scatter(df[3]['wifi_data'], df[3]['survey_data'], label = "20181202")
plt.xlabel("wifi_data")
plt.ylabel("survey_data")
plt.legend()
plt.show()
# テストデータ日を除いた日を訓練データ(train_df)として係数算出 (OLS)
train_df = {}
train_df[1] = pd.concat([df[2],df[3]])
train_df[2] = pd.concat([df[3],df[1]])
train_df[3] = pd.concat([df[1],df[2]])
import statsmodels.api as sm
result = {}
for i in train_df.keys():
Y = train_df[i]['survey_data'].values
df_X = train_df[i][["wifi_data"]]
X = df_X.values
X1 = sm.add_constant(X)
model = sm.OLS(Y,X1)
result[i] = model.fit()
print(result[i].summary())
# 混合線形モデル(mixedlm)での係数算出
import statsmodels.formula.api as smf
for i in train_df.keys():
md = smf.mixedlm("survey_data ~ wifi_data", train_df[i], groups=train_df[i]["point"], re_formula="~wifi_data")
mdf = md.fit()
# print(mdf.fe_params)
# mixedlmでの地点ごとの係数を(目的日、地点)ごとに算出
# coef[テストデータ用ID][地点ID] 訓練データは地点データを除いた2日間
import statsmodels.formula.api as smf
coef = {}
for i in df.keys():
md = smf.mixedlm("survey_data ~ wifi_data", train_df[i], groups=train_df[i]["point"], re_formula="~wifi_data")
mdf = md.fit()
Intercept_common = mdf.fe_params.Intercept # 共通切片
coef_common = mdf.fe_params.wifi_data # 共通傾き
random_coef = mdf.random_effects # ランダム項
coef[i] = {}
for j in random_coef.keys(): # 共通とランダムの和を計算
coef[i][j] = [random_coef[j].Group+ Intercept_common, random_coef[j].wifi_data+ coef_common]
# import pprint
# pprint.pprint(coef) # 係数を書き出してみる
# 訓練データから得られた回帰係数を用いてテストデータ日のWi-Fiデータより歩行者数の予測値を計算
dfa = {} # 結果を入れるdataframe
for i in df.keys():
predict_dict = {}
for j in df[i].index:
item = df[i].loc[j]
this_coef = coef[i][item.point] # 地点ごとの係数
predict_dict[j] = {"predict_val": item.wifi_data*this_coef[1]+ this_coef[0]}
#print(predict_dict)
predict_fr = pd.DataFrame(predict_dict).T
dfa[i] = pd.concat([df[i],predict_fr], axis=1)
# 結果表示と描画
print(dfa[1][['predict_val','survey_data']].corr())
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(17, 5))
dfa[1]['color'] = dfa[1]['point'].astype(int) # 色は整数値で指定
dfa[1].plot(kind="scatter", ax=axes[0], x="predict_val", y="survey_data",
title="訓練データ12/1,12/2 テストデータ 11/30",
c="color", colormap='Accent', colorbar=False)
print(dfa[2][['predict_val','survey_data']].corr())
dfa[2]['color'] = dfa[2]['point'].astype(int) # 色は整数値で指定
dfa[2].plot(kind="scatter", ax=axes[1], x="predict_val", y="survey_data",
title="訓練データ11/30,12/1 テストデータ 12/1",
c="color", colormap='Accent', colorbar=False)
print(dfa[3][['predict_val','survey_data']].corr())
dfa[3]['color'] = dfa[3]['point'].astype(int) # 色は整数値で指定
dfa[3].plot(kind="scatter", ax=axes[2], x="predict_val", y="survey_data",
title="訓練データ11/30, 2/1 テストデータ 12/2",
c="color", colormap='Accent', colorbar=False)
plt.savefig("comp_kofu_2018.png")
# 予測値と歩行量調査値との差(率)が大きい順に並べてみる
# 地点名を各日のデータフレームに追加
name_data= pd.read_csv('/home/tamada/kofupointname.csv', encoding='cp932',names = ('point','name'))
point_name = {}
for i,n in name_data.iterrows():
point_name[n['point']] = n['name']
name_ser = [point_name[val['point']] for i,val in dfa[1].iterrows()]
for i in dfa:
# 地点名を追加
dfa[i]['point_name'] = name_ser
# 予測と歩行量調査との差を追加
dfa[i]['diff'] = dfa[i]['predict_val'] - dfa[i]['survey_data']
# 差の率を追加
dfa[i]['diff_ratio'] = abs(dfa[i]['predict_val'] - dfa[i]['survey_data'])/ dfa[i]['predict_val']
# 並べ替え
dfa[i].sort_values('diff_ratio',ascending=False)
# dfa[1] # 表示
# 地点ごとの誤差の合計
# groupbyを使う 参考: http://sinhrks.hatenablog.com/entry/2014/10/13/005327
daily_sum = {}
for i in dfa:
daily_sum[i] =dfa[i].groupby('point_name')['survey_data','predict_val','diff'].sum().sort_values('diff', ascending=False)
# ラベルの変更
daily_sum[1].columns = ["survey1130", "predict1130","diff1130"]
daily_sum[2].columns = ["survey1201", "predict1201","diff1201"]
daily_sum[3].columns = ["survey1202", "predict1202","diff1202"]
# 3つのデータフレームのマージ
three_day_sum = pd.merge(daily_sum[1], daily_sum[2], on=['point_name'])
three_day_sum = pd.merge(three_day_sum, daily_sum[3], on=['point_name'])
# 各日の誤差率
# 3日間の合計
three_day_sum['歩行合計'] = three_day_sum['survey1130'] + three_day_sum['survey1201'] + three_day_sum['survey1202']
three_day_sum['推計合計'] = three_day_sum['predict1130'] + three_day_sum['predict1201'] + three_day_sum['predict1202']
three_day_sum['差合計'] = three_day_sum['diff1130'] + three_day_sum['diff1201'] + three_day_sum['diff1202']
three_day_sum['誤差率'] = three_day_sum['差合計'] / three_day_sum['歩行合計']
three_day_sum
three_day_sum[['歩行合計', '推計合計', '差合計', '誤差率']]
循環的に訓練データとテストデータを入れ替えて評価しているので、プラスマイナス逆になったものを使うことになって、誤差が相殺されてしまっているかもしれない。
以下は元のまま
mdf.fittedvalues[:5]
df_all['fitted_lmer'] = mdf.fittedvalues
df_all['handcount'] = d2['survey_data']
df_all.head()
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
df_all['color'] = df_all['point'].astype(int) # 色は整数値で指定
df_all.plot(kind="scatter", ax=axes[0], x="fitted_olm", y="handcount", c="color", colormap='Accent', colorbar=False)
df_all.plot(kind="scatter", ax=axes[1], x="fitted_lmer", y="handcount", c="color",colormap='tab20', colorbar=False)
print(df_all[['fitted_olm','survey_data']].corr())
print(df_all[['fitted_lmer','handcount']].corr())
name_data= pd.read_csv('/home/tamada/kofupointname.csv', encoding='cp932',names = ('point','name'))
name_data
pd.merge(df_all,name_data, on='point')
point_num = df_all["point"].unique()
ig, axes = plt.subplots(nrows=25, ncols=1, figsize=(5, 100))
for point in point_num:
pd.merge(df_all,name_data, on='point')
df_kofu = df_all[df_all["point"] == point].reset_index(drop = True)
df_kofu.plot(kind="scatter", ax=axes[point], x="fitted_lmer", y="handcount", c='color',colormap='tab20', colorbar=False,label=point)
df_all.plot(kind = "scatter" , y = "fitted_lmer", x ="wifi_data")