Data Preparation¶
In [12]:
import pandas as pd
# read the perch data and convert the DataFrame to a NumPy array
df = pd.read_csv('http://bit.ly/perch_csv')
perch_full = df.to_numpy()
print(perch_full)
[[ 8.4 2.11 1.41]
[13.7 3.53 2. ]
[15. 3.82 2.43]
[16.2 4.59 2.63]
[17.4 4.59 2.94]
[18. 5.22 3.32]
[18.7 5.2 3.12]
[19. 5.64 3.05]
[19.6 5.14 3.04]
[20. 5.08 2.77]
[21. 5.69 3.56]
[21. 5.92 3.31]
[21. 5.69 3.67]
[21.3 6.38 3.53]
[22. 6.11 3.41]
[22. 5.64 3.52]
[22. 6.11 3.52]
[22. 5.88 3.52]
[22. 5.52 4. ]
[22.5 5.86 3.62]
[22.5 6.79 3.62]
[22.7 5.95 3.63]
[23. 5.22 3.63]
[23.5 6.28 3.72]
[24. 7.29 3.72]
[24. 6.38 3.82]
[24.6 6.73 4.17]
[25. 6.44 3.68]
[25.6 6.56 4.24]
[26.5 7.17 4.14]
[27.3 8.32 5.14]
[27.5 7.17 4.34]
[27.5 7.05 4.34]
[27.5 7.28 4.57]
[28. 7.82 4.2 ]
[28.7 7.59 4.64]
[30. 7.62 4.77]
[32.8 10.03 6.02]
[34.5 10.26 6.39]
[35. 11.49 7.8 ]
[36.5 10.88 6.86]
[36. 10.61 6.74]
[37. 10.84 6.26]
[37. 10.57 6.37]
[39. 11.14 7.49]
[39. 11.14 6. ]
[39. 12.43 7.35]
[40. 11.93 7.11]
[40. 11.73 7.22]
[40. 12.38 7.46]
[40. 11.14 6.63]
[42. 12.8 6.87]
[43. 11.93 7.28]
[43. 12.51 7.42]
[43.5 12.6 8.14]
[44. 12.49 7.6 ]]
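Before converting to NumPy it can help to glance at the DataFrame itself. A minimal inspection sketch, assuming the CSV holds the perch's length, height, and width columns as in the book's dataset:

# quick inspection of the raw DataFrame (column names assumed: length/height/width)
print(df.columns.tolist())  # expected: ['length', 'height', 'width']
print(df.shape)             # expected: (56, 3) -- 56 perch, 3 features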
In [13]:
import numpy as np
perch_weight = np.array([5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0, 110.0,
115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0, 130.0,
150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0, 197.0,
218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0, 514.0,
556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0, 820.0,
850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0, 1000.0,
1000.0])
In [14]:
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(perch_full,perch_weight, random_state = 42)
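With 56 samples and train_test_split's default test_size of 0.25, the split should leave 42 training and 14 test samples. A quick sanity check:

# sanity check: default test_size=0.25 splits 56 samples into 42 train / 14 test
print(train_input.shape, test_input.shape)  # expected: (42, 3) (14, 3)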
In [15]:
from sklearn.preprocessing import PolynomialFeatures
In [16]:
poly = PolynomialFeatures()
poly.fit([[2,3]])
print(poly.transform([[2,3]]))
#fit must be called before transform is possible; the fit_transform method combines the two into one call
[[1. 2. 3. 4. 6. 9.]]
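As the comment notes, the two steps can be collapsed into one call. A minimal equivalent sketch:

# fit_transform is equivalent to calling fit and then transform
poly = PolynomialFeatures()
print(poly.fit_transform([[2, 3]]))  # [[1. 2. 3. 4. 6. 9.]]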
In [17]:
poly = PolynomialFeatures(include_bias=False)
poly.fit([[2,3]])
print(poly.transform([[2,3]]))
#even without setting include_bias=False, scikit-learn models automatically ignore the intercept term added to the features
[[2. 3. 4. 6. 9.]]
In [18]:
poly = PolynomialFeatures(include_bias=False)
poly.fit(train_input)
train_poly = poly.transform(train_input)
print(train_poly.shape)
(42, 9)
In [19]:
#how to check how the 9 features were created
#(deprecated in scikit-learn 1.0; newer versions use get_feature_names_out())
poly.get_feature_names()
Out[19]:
['x0', 'x1', 'x2', 'x0^2', 'x0 x1', 'x0 x2', 'x1^2', 'x1 x2', 'x2^2']
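To see how these names line up with concrete values, a small sketch pairing each name with the transform of a three-feature sample (the input [2, 3, 4] is hypothetical, chosen only for illustration):

# pair each generated feature name with its value for the sample [2, 3, 4]
demo = PolynomialFeatures(include_bias=False)
values = demo.fit_transform([[2, 3, 4]])[0]
for name, value in zip(demo.get_feature_names(), values):
    print(name, '=', value)  # e.g. x0 = 2.0, x0^2 = 4.0, x0 x1 = 6.0, ...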
In [20]:
test_poly = poly.transform(test_input)
It is good practice to always transform the test set with the transformer fitted on the training set¶
Training a Multiple Regression Model¶
In [21]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(train_poly,train_target)
Out[21]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [22]:
print(lr.score(train_poly,train_target))
0.9903183436982124
In [23]:
print(lr.score(test_poly,test_target))
0.9714559911594132
In [24]:
poly = PolynomialFeatures(degree=5,include_bias=False)
poly.fit(train_input)
train_poly = poly.transform(train_input)
test_poly = poly.transform(test_input)
print(train_poly.shape)
(42, 55)
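The jump from 9 to 55 features is pure combinatorics: with 3 input features, the number of monomials of total degree at most 5, minus the bias term, is C(3+5, 5) − 1 = 56 − 1 = 55. A quick check with math.comb (Python 3.8+; scipy.special.comb works on older versions):

from math import comb
# monomials of degree <= 5 in 3 variables, excluding the constant (bias) term
print(comb(3 + 5, 5) - 1)  # 55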
In [25]:
lr.fit(train_poly,train_target)
print(lr.score(train_poly,train_target))
0.9999999999991096
In [26]:
print(lr.score(test_poly,test_target))
-144.40579242335605
Such a model is badly overfit to the training set, so it produces a terrible score on the test set¶
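A negative value is possible because score() reports the coefficient of determination, R² = 1 − SS_res/SS_tot: when the model's squared error exceeds that of simply predicting the mean target, R² drops below zero. A sketch computing it by hand (run after the cells above):

import numpy as np
from sklearn.metrics import r2_score

# R^2 = 1 - sum((y - y_hat)^2) / sum((y - y_mean)^2)
pred = lr.predict(test_poly)
ss_res = np.sum((test_target - pred) ** 2)
ss_tot = np.sum((test_target - test_target.mean()) ** 2)
print(1 - ss_res / ss_tot)          # matches lr.score(test_poly, test_target)
print(r2_score(test_target, pred))  # same value via sklearn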
In [28]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(train_poly)
train_scaled = ss.transform(train_poly)
test_scaled = ss.transform(test_poly)
#the transformer fitted on the training set must also be used to transform the test set
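To see why the training-set statistics must be reused, a small illustration-only sketch contrasting the correct scaler with one (wrongly) fitted on the test set:

# WRONG approach, shown only for contrast: a scaler fitted on the test set
# learns different means/scales, so train and test would not be comparable
ss_wrong = StandardScaler().fit(test_poly)
print(ss.mean_[:3])        # statistics learned from the training set
print(ss_wrong.mean_[:3])  # different statistics learned from the test set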
Ridge Regression¶
In [29]:
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(train_scaled,train_target)
print(ridge.score(train_scaled,train_target))
0.9896101671037343
In [30]:
print(ridge.score(test_scaled,test_target))
0.9790693977615398
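Ridge adds an L2 penalty, alpha · Σcoef², to the least-squares loss, shrinking coefficients toward zero. A sketch contrasting its coefficient magnitudes with an unregularized fit on the same scaled features (exact numbers depend on the data):

# compare coefficient magnitudes: plain least squares vs. ridge's L2 shrinkage
lr_plain = LinearRegression().fit(train_scaled, train_target)
print(np.abs(lr_plain.coef_).max())  # unregularized coefficients blow up
print(np.abs(ridge.coef_).max())     # ridge keeps them much smaller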
In [32]:
import matplotlib.pyplot as plt
train_score = []
test_score = []
In [33]:
alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alpha_list:
    # create the ridge model for this alpha
    ridge = Ridge(alpha=alpha)
    # train the ridge model
    ridge.fit(train_scaled, train_target)
    # store the train and test scores
    train_score.append(ridge.score(train_scaled, train_target))
    test_score.append(ridge.score(test_scaled, test_target))
In [34]:
plt.plot(np.log10(alpha_list),train_score)
plt.plot(np.log10(alpha_list), test_score)
plt.xlabel('alpha')
plt.ylabel('R^2')
plt.show()
In [35]:
ridge = Ridge(alpha = 0.1)
ridge.fit(train_scaled,train_target)
print(ridge.score(train_scaled,train_target))
print(ridge.score(test_scaled,test_target))
0.9903815817570366
0.9827976465386927
Lasso Regression¶
In [36]:
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(train_scaled, train_target)
print(lasso.score(train_scaled,train_target))
0.9897898972080961
In [37]:
print(lasso.score(test_scaled,test_target))
0.9800593698421883
In [38]:
train_score = []
test_score = []
alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alpha_list:
    # create the lasso model for this alpha
    lasso = Lasso(alpha=alpha, max_iter=10000)
    # train the lasso model
    lasso.fit(train_scaled, train_target)
    # store the train and test scores
    train_score.append(lasso.score(train_scaled, train_target))
    test_score.append(lasso.score(test_scaled, test_target))
/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_coordinate_descent.py:476: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 18778.697957792876, tolerance: 518.2793833333334
/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_coordinate_descent.py:476: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 12972.821345404844, tolerance: 518.2793833333334
In [39]:
plt.plot(np.log10(alpha_list),train_score)
plt.plot(np.log10(alpha_list),test_score)
plt.xlabel('alpha')
plt.ylabel('R^2')
plt.show()
In [40]:
lasso = Lasso(alpha=10)
lasso.fit(train_scaled,train_target)
print(lasso.score(train_scaled,train_target))
print(lasso.score(test_scaled,test_target))
0.9888067471131867
0.9824470598706695
In [41]:
# count how many coefficients lasso shrank to exactly zero
print(np.sum(lasso.coef_ == 0))
40
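So lasso zeroed out 40 of the 55 coefficients, effectively selecting 15 features. A sketch listing the survivors, reusing the fitted degree-5 transformer:

# list the features whose lasso coefficients are nonzero
names = np.array(poly.get_feature_names())  # get_feature_names_out() in sklearn >= 1.0
mask = lasso.coef_ != 0
print(mask.sum(), 'features kept')  # 55 - 40 = 15
print(names[mask])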
Source: 혼자 공부하는 머신러닝+딥러닝 / 박해선