scikit-learnメモ

Contents

1 LogisticRegressionが算出する直線の意味
- 1.1 softmax関数にとおすまえの式をイコールとおくと決定境界になる
2 fit/fit_transform/transform
3 scikit-learnでの学習済みモデルの保存と読み込み

LogisticRegressionが算出する直線の意味

2値分類においては1本の線が算出される。coef_属性で係数(傾き)、intercept_属性で切片が得られる。係数を各変数にかけて切片を足した式をゼロとおき(w0·x0 + w1·x1 + b = 0)、グラフの縦軸にしたい変数について解くと、グラフの式(x1 = -(w0·x0 + b) / w1)を得ることができる。なぜゼロとおくかというと、「そういうモデルだから」という理由のほかに、リンク関数であるロジット関数(log(p/(1-p)))にp = 0.5を入れるとちょうどゼロになる、つまり2つのクラスの確率が等しくなる点の集まりがこの線だからとも解釈できる。

2値分類においてはこの線は決定境界にぴたりと一致する。

いちおうコードもメモ。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
import mglearn

# Two-class toy data: 500 two-feature points drawn around two blob centres.
X, y = datasets.make_blobs(n_samples=500, n_features=2, centers=2, random_state=1)
log_reg = LogisticRegression()
log_reg.fit(X, y)

# Predict the class on a dense grid and paint it as filled contours,
# then overlay the training points.
x0_, x1_ = np.mgrid[-14:2:0.01, -7:7:0.01]
grid_points = np.column_stack((x0_.ravel(), x1_.ravel()))
z = log_reg.predict(grid_points)
plt.contourf(x0_, x1_, z.reshape(x0_.shape))
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

# Solve w0 * x0 + w1 * x1 + intercept = 0 for x1 to get the fitted line.
x_axis = np.linspace(-14, 2, 500)
w0, w1 = log_reg.coef_[0]
boundary = -(w0 * x_axis + log_reg.intercept_) / w1
plt.plot(x_axis, boundary, linewidth=5)
plt.title('LogisticRegression(2値分類)')
plt.show()

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from sklearn import datasets

from sklearn.linear_model import LogisticRegression

import mglearn

# Two-class toy data: 500 two-feature points drawn around two blob centres.
X, y = datasets.make_blobs(n_samples=500, n_features=2, centers=2, random_state=1)
log_reg = LogisticRegression()
log_reg.fit(X, y)

# Paint the predicted class over a dense grid, then overlay the samples.
x0_, x1_ = np.mgrid[-14:2:0.01, -7:7:0.01]
grid_points = np.column_stack((x0_.ravel(), x1_.ravel()))
z = log_reg.predict(grid_points)
plt.contourf(x0_, x1_, z.reshape(x0_.shape))
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

# The fitted line is w0 * x0 + w1 * x1 + intercept = 0, solved here for x1.
x_axis = np.linspace(-14, 2, 500)
w0, w1 = log_reg.coef_[0]
boundary = -(w0 * x_axis + log_reg.intercept_) / w1
plt.plot(x_axis, boundary, linewidth=5)
plt.title('LogisticRegression(2値分類)')
plt.show()

3値分類ではこのようになる。

# Three-class toy data: 500 two-feature points drawn around three blob centres.
X, y = datasets.make_blobs(n_samples=500, n_features=2, centers=3, random_state=7)

log_reg = LogisticRegression()
log_reg.fit(X, y)

# Paint the predicted class over a grid, then overlay the samples.
x0, x1 = np.mgrid[-12:12:0.01, -2:8:0.1]
grid_points = np.column_stack((x0.ravel(), x1.ravel()))
z = log_reg.predict(grid_points)
plt.contourf(x0, x1, z.reshape(x0.shape))
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

# Each row of coef_ is paired with its intercept; draw the line
# coef[0] * x0 + coef[1] * x1 + intercept = 0, solved for x1.
x_axis = np.linspace(-10, 10, 500)
for coef, intercept in zip(log_reg.coef_, log_reg.intercept_):
    line = -(coef[0] * x_axis + intercept) / coef[1]
    plt.plot(x_axis, line, linewidth=5)
plt.title('LogisticRegression(3値分類)')
plt.show()

# Three-class toy data: 500 two-feature points drawn around three blob centres.
X, y = datasets.make_blobs(n_samples=500, n_features=2, centers=3, random_state=7)

log_reg = LogisticRegression().fit(X, y)

# Paint the predicted class over a grid, then overlay the samples.
x0, x1 = np.mgrid[-12:12:0.01, -2:8:0.1]
z = log_reg.predict(np.c_[x0.ravel(), x1.ravel()])
plt.contourf(x0, x1, z.reshape(x0.shape))
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

# Each row of coef_ is paired with its intercept; draw the line
# coef[0] * x0 + coef[1] * x1 + intercept = 0, solved for x1.
x_axis = np.linspace(-10, 10, 500)
for coef, intercept in zip(log_reg.coef_, log_reg.intercept_):
    # The plot call is the loop body and must be indented under the `for`.
    plt.plot(x_axis, -(coef[0] * x_axis + intercept) / coef[1], linewidth=5)
plt.title('LogisticRegression(3値分類)')
plt.show()

3値分類でもなにか明確な意味があるのではないかと思い、ググったり、実験して調べてみた。各線は任意の2つのグループの真ん中に引かれているのではないかとあたりをつけつつ調べたが、そんなことはなかった。2値分類とはちがい、綺麗な意味はなさそうだ。ひとつ気づく点は、線の交点が決定境界上にあること。ほかになにか本質的な関係がありそうではあるがいまはわからない。ググっても出てこない(後述するが、ずっとまえから持っている本に載っていた)。

そもそもの話、ロジスティック回帰は何本かの線を引き、交差エントロピー誤差が最小になるようにパラメーター(coef_, intercept_)を更新していくだけだ。算出される線そのものと決定境界には直接的な関係はないと思う(ただし後述のとおり、クラスごとの式を2つずつ等しいとおいて得られる直線は決定境界と一致する)。少なくとも同一のものではないことだけはたしかだ。2値分類で両者が一致するので、3値以上の分類でもわかりやすい意味があるのではと勘違いして時間を使ってしまった。

ついでにKerasでつくったシンプルなNNではこうなる。

from keras import models, layers

# A model with one Dense layer: 3 units with softmax activation.
# NOTE(review): X and y are not defined in this snippet; presumably the
# three-class data from the preceding snippet -- confirm.
simple_NN = models.Sequential([layers.Dense(3, activation='softmax')])
simple_NN.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

history = simple_NN.fit(X, y, epochs=100)

# predict() output is reduced with argmax over axis 1 to get one class
# index per grid point.
x0, x1 = np.mgrid[-12:12:0.01, -2:8:0.1]
grid_points = np.column_stack((x0.ravel(), x1.ravel()))
z = simple_NN.predict(grid_points)
predicted_class = z.argmax(axis=1)
plt.contourf(x0, x1, predicted_class.reshape(x0.shape))
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

# Presumably get_weights() yields (kernel, bias) and kernel is
# (n_features, 3), so coefs.T gives one weight row per output unit -- confirm.
x_axis = np.linspace(-10, 10, 500)
coefs, intercepts = simple_NN.get_weights()
for coef, intercept in zip(coefs.T, intercepts):
    line = -(coef[0] * x_axis + intercept) / coef[1]
    plt.plot(x_axis, line, linewidth=5)
plt.title('simple NN(3値分類)')
plt.show()

from keras import models, layers

# A model with one Dense layer: 3 units with softmax activation.
# NOTE(review): X and y are not defined in this snippet; presumably the
# three-class data from the preceding snippet -- confirm.
simple_NN = models.Sequential()
simple_NN.add(layers.Dense(3, activation='softmax'))
simple_NN.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = simple_NN.fit(X, y, epochs=100)

# predict() output is reduced with argmax over axis 1 to get one class
# index per grid point.
x0, x1 = np.mgrid[-12:12:0.01, -2:8:0.1]
z = simple_NN.predict(np.c_[x0.ravel(), x1.ravel()])
plt.contourf(x0, x1, z.argmax(axis=1).reshape(x0.shape))
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

# Presumably get_weights() yields (kernel, bias) and kernel is
# (n_features, 3), so coefs.T gives one weight row per output unit -- confirm.
x_axis = np.linspace(-10, 10, 500)
coefs, intercepts = simple_NN.get_weights()
for coef, intercept in zip(coefs.T, intercepts):
    # The plot call is the loop body and must be indented under the `for`.
    plt.plot(x_axis, -(coef[0] * x_axis + intercept) / coef[1], linewidth=5)
plt.title('simple NN(3値分類)')
plt.show()

線が点にかぶってるが、決定境界はきれいでちゃんと分離できている。この例では決定境界の交点と線の交点がほぼ重なっているが、重なっていない場合もある。

softmax関数にとおすまえの式をイコールとおくと決定境界になる

「詳解ディープラーニング」p.117より、softmax関数にとおすまえのクラスごとの式(w_i·x + b_i)を2クラスずつ等しいとおく(w_i·x + b_i = w_j·x + b_j)と、決定境界と一致する直線になることがわかった。これは要するに、3クラスから2クラスを選ぶ3通りの組み合わせそれぞれについて、その2クラスである確率が等しくなる線を求めている(合計3本)ということ。

# make_blobs is reached through the `datasets` module, which is how this file
# imports it; a bare `make_blobs` would raise NameError.
X, y = datasets.make_blobs(n_samples=100, random_state=8)

model_log = LogisticRegression().fit(X, y)
coefs, intercepts = model_log.coef_, model_log.intercept_
coefs, intercepts  # notebook-style display; recorded output follows
# (array([[-0.86790082,  1.33365625],
#         [ 0.9853658 , -0.64157825],
#         [-0.80453798, -0.33672346]]),
#  array([-0.23312217, -3.7817332 , -0.07957079]))

mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

x_axis = np.linspace(X[:, 0].min(), X[:, 0].max(), 500)
# Per-class zero lines (disabled):
# for coef, intercept in zip(model_log.coef_, model_log.intercept_):
#     plt.plot(x_axis, -(coef[0] * x_axis + intercept) / coef[1])

# Draw the three lines on which two classes are equally probable: for each
# class pair (i, j), set the two pre-softmax expressions equal, i.e.
# (coefs[i] - coefs[j]) . x + (intercepts[i] - intercepts[j]) = 0, solved for x1.
for i, j in ((0, 1), (1, 2), (2, 0)):
    coef_diff = coefs[i] - coefs[j]
    intercept_diff = intercepts[i] - intercepts[j]
    plt.plot(x_axis, -(coef_diff[0] * x_axis + intercept_diff) / coef_diff[1])

# Overlay the predicted class on a grid spanning the data range.
n_samples = 500
x0_ = np.linspace(X[:, 0].min(), X[:, 0].max(), n_samples)
x1_ = np.linspace(X[:, 1].min(), X[:, 1].max(), n_samples)
x0, x1 = np.meshgrid(x0_, x1_)
grid_points = np.c_[x0.ravel(), x1.ravel()]
pred = model_log.predict(grid_points)
plt.contourf(x0_, x1_, pred.reshape(x0.shape), alpha=0.5)
plt.title('LogisticRegression')

plt.grid()
plt.show()

# make_blobs is reached through the `datasets` module, which is how this file
# imports it; a bare `make_blobs` would raise NameError.
X, y = datasets.make_blobs(n_samples=100, random_state=8)

model_log = LogisticRegression().fit(X, y)
coefs, intercepts = model_log.coef_, model_log.intercept_
coefs, intercepts  # notebook-style display; recorded output follows
# (array([[-0.86790082,  1.33365625],
#         [ 0.9853658 , -0.64157825],
#         [-0.80453798, -0.33672346]]),
#  array([-0.23312217, -3.7817332 , -0.07957079]))

mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

x_axis = np.linspace(X[:, 0].min(), X[:, 0].max(), 500)
# Per-class zero lines (disabled):
# for coef, intercept in zip(model_log.coef_, model_log.intercept_):
#     plt.plot(x_axis, -(coef[0] * x_axis + intercept) / coef[1])

# Draw the three lines on which two classes are equally probable: for each
# class pair (i, j), set the two pre-softmax expressions equal, i.e.
# (coefs[i] - coefs[j]) . x + (intercepts[i] - intercepts[j]) = 0, solved for x1.
for i, j in ((0, 1), (1, 2), (2, 0)):
    coef_diff = coefs[i] - coefs[j]
    intercept_diff = intercepts[i] - intercepts[j]
    plt.plot(x_axis, -(coef_diff[0] * x_axis + intercept_diff) / coef_diff[1])

# Overlay the predicted class on a grid spanning the data range.
n_samples = 500
x0_ = np.linspace(X[:, 0].min(), X[:, 0].max(), n_samples)
x1_ = np.linspace(X[:, 1].min(), X[:, 1].max(), n_samples)
x0, x1 = np.meshgrid(x0_, x1_)
grid_points = np.c_[x0.ravel(), x1.ravel()]
pred = model_log.predict(grid_points)
plt.contourf(x0_, x1_, pred.reshape(x0.shape), alpha=0.5)
plt.title('LogisticRegression')

plt.grid()
plt.show()

決定境界と3本の直線がぴたり一致。ちなみに「詳解ディープラーニング」では2本しか線を引いていなかったが、実際には3本ある。

以下はKerasによるシンプルなNN。2本はLogisticRegressionとおなじになったが、1本だけ違う線になった。エポック数を増やしても違う線のまま。なんでだろ。

fit/fit_transform/transform

fitは変換式を計算し、transformは変換式を使ってデータを変換する。fit_transformはそれらを同時に行う。

テストデータは、訓練データで計算した変換式で変換しなければならない。うっかりテストデータで計算した変換式でテストデータを変換しないこと。

訓練データにはfit_transformを使い、テストデータにはtransformを使えばいい。

import numpy as np
from sklearn import preprocessing

train_data = np.array([1, 3, 5, 7, 9])
test_data = np.array([2, 100])

# reshape(-1, 1) turns each 1-D array into a single column.
train_column = train_data.reshape(-1, 1)
test_column = test_data.reshape(-1, 1)

# Correct: compute the transform on the training data, then reuse that same
# fitted scaler to transform the test data.
scaler = preprocessing.StandardScaler()
scaled_train_data = scaler.fit_transform(train_column)
scaled_test_data = scaler.transform(test_column)

# Wrong (shown on purpose): calling fit_transform on the test data computes a
# new transform from the test data instead of reusing the training one.
scaler = preprocessing.StandardScaler()
scaled_train_data = scaler.fit_transform(train_column)
scaled_test_data = scaler.fit_transform(test_column)

import numpy as np

from sklearn import preprocessing

train_data = np.array([1, 3, 5, 7, 9])
test_data = np.array([2, 100])

# reshape(-1, 1) turns each 1-D array into a single column.
train_column = train_data.reshape(-1, 1)
test_column = test_data.reshape(-1, 1)

# Correct: fit on the training data, then transform the test data with the
# scaler that was fitted on the training data.
scaler = preprocessing.StandardScaler()
scaled_train_data = scaler.fit_transform(train_column)
scaled_test_data = scaler.transform(test_column)

# Wrong (shown on purpose): fit_transform on the test data derives a fresh
# transform from the test data itself.
scaler = preprocessing.StandardScaler()
scaled_train_data = scaler.fit_transform(train_column)
scaled_test_data = scaler.fit_transform(test_column)

scikit-learnでの学習済みモデルの保存と読み込み

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; import the standalone joblib package instead.
import joblib

# Save the classifier clf (compressed).
joblib.dump(clf, 'clf.pkl', compress=True)

# Load the classifier clf back.
clf = joblib.load('clf.pkl')

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; import the standalone joblib package instead.
import joblib

# Save the classifier clf (compressed).
joblib.dump(clf, 'clf.pkl', compress=True)

# Load the classifier clf back.
clf = joblib.load('clf.pkl')

joblibモジュール(joblib.dump / joblib.load)を使うのが推奨されているみたいだが、普通にpickleモジュールで保存することもできる。

import pickle

# Save the classifier clf. The `with` block closes the file even if
# dumping fails.
with open('clf.pkl', 'wb') as f:
    pickle.dump(clf, f)

# Load the classifier clf back.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('clf.pkl', 'rb') as f:
    clf = pickle.load(f)

import pickle

# Save the classifier clf. The `with` block closes the file even if
# dumping fails.
with open('clf.pkl', 'wb') as f:
    pickle.dump(clf, f)

# Load the classifier clf back.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('clf.pkl', 'rb') as f:
    clf = pickle.load(f)