W707 s-taiji@is.titech.ac.jp 1 / 1
(lm) lm AIC 2 / 1
: y = β 1 x 1 + β 2 x 2 + + β d x d + β d+1 + ϵ (ϵ N(0, σ 2 )) y R: x R d : β i (i = 1,..., d):, β d+1 : ( ) (d = 1) y = β 1 x 1 + β 2 + ϵ (d > 1) y = β 1 x 1 + β 2 x 2 + + β d x d + β d+1 + ϵ 3 / 1
price 2e+07 4e+07 6e+07 8e+07 20 40 60 80 100 area 4 / 1
n ( ) (y i, x i ) R R d (i = 1,..., n) y 1 x 1 Y =. R n 1 1.., X =.. R n (d+1), ϵ = ϵ R n, x n 1 y n β ( ), Y = X β + ϵ. ϵ n ( ) ˆβ = (X X ) 1 X Y. Cramer-Rao ( ) 5 / 1
X ˆβ ( ( ) ) 1 1 n(ˆβ β ) N 0, σ 2 n X X. ( ) ˆβ = (X X ) 1 X Y = (X X ) 1 X (X β + ϵ) n(ˆβ β ) = n(x X ) 1 X ϵ. (1) ϵ i N(0, σ 2 ) 0. E[n(X X ) 1 X ϵϵ X (X X ) 1 ] = n(x X ) 1 X E[ϵϵ ]X (X X ) 1 = σ ( 2 1 n X X ) 1. σ 2 ( ) 6 / 1
ˆϵ ϵ ˆϵ = Y X ˆβ = X (β ˆβ) + ϵ = (I X (X X ) 1 X )ϵ S = ( 1 n X X ) 1 n( ˆβ j βj )/ S jj t(n (d + 1)) ˆϵ 2 /(n (d + 1)) ( n (d + 1) t ). β j : β j = 0 n ˆβ j / S jj ˆϵ 2 /(n (d + 1)) t α(n (d + 1)) βj = 0 t α t α (βj = 0 ˆβ j ) 7 / 1
1 ˆϵ = Y X ˆβ = X (β ˆβ) + ϵ = (I X (X X ) 1 X )ϵ 0 ˆβ β = (X X ) 1 X ϵ (Eq. (1)) E[ˆϵ(ˆβ β ) ] = O ˆϵ ˆβ β ( ) 2 (I X (X X ) 1 X ) n (d + 1) ˆϵ 2 /σ 2 n (d + 1) χ 2 3 n( ˆβ j β j )/ S jj N(0, σ 2 ) n( ˆβj β j )/ S jj 4 ˆϵ 2 /(n (d+1)) χ2 t σ 2 8 / 1
: F (H 0 ) Ȳ = H 0 : β 1 = β 2 = = β d = 0 vs H 1 : F = X ˆβ 1Ȳ 2 /d ˆϵ 2 F (d, n d 1). /(n (d + 1)) n i=1 y i n (R 2 ) (R 2 A ) R 2 = 1 Y X ˆβ 2 Y 1Ȳ, 2 R2 A = 1 Y X ˆβ 2 /(n d 1) Y 1Ȳ 2 /(n 1) 9 / 1
AIC (overfit), AIC (Akaike Information Criterion) d M d AIC : ˆθ Md M d AIC = 2 log(p({x i } n i=1 ˆθ Md )) + 2m. AIC = 2 + 2 AIC 10 / 1
AIC ( ) ( ) AIC KL-divergence ( KL-divergence) ( ) ( ) 11 / 1
( ) http://www.land.mlit.go.jp/webland/download.html 12 / 1
RC SRC ( ) {0, 1} RC 1, SRC 0 13 / 1
: lm(formula, data, subset, weights, na.action, method = qr, model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, contrasts = NULL, offset,...) formula data subset weights na.action NA 14 / 1
formula y x y x1 + x2 y x1 * x2 y x - 1 y 1 + x + I(xˆ2) y x z y., data = y = β 1 x + b + ϵ y = β 1 x1 + β 2 x2 + b + ϵ y = β 1 x1 + β 2 x2 + β 3 x1x2 + b + ϵ ( ) y = β 1 x + ϵ : y = b + β 1 x + β 2 x 2 + ϵ z y x y x 1,..., x d y = b + d j=1 β jx j + ϵ x <- seq(-10,10); y<- 3*x + rnorm(21); lm(y ~ x) # y x x <- seq(-10,10); z <- seq(-10,10)^2; y<- 3*x + 4*z + rnorm(21); datain <- data.frame(x=x,z=z,y=y) lm(y ~ x, data=datain) # y x lm(y ~., data=datain) # y y 15 / 1
x <- read.csv("setagaya_manshion.csv") sman = data.frame(price=x[[1]],walk=x[[3]],area=x[[5]], str=ifelse(x[[6]]==" ",1,0),kenpei=x[[7]],youseki=x[[8]], tikunen=x[[9]],kyuko=x[[10]]) # 0-1 plot(sman) # 0 10 20 0.0 0.6 100 400 0.0 0.6 price 2e+07 0 10 20 walk area 20 80 0.0 0.6 str kenpei 40 60 80 100 400 youseki tikunen 1970 2000 0.0 0.6 kyuko 16 / 1
(lm) sman.lm <- lm(price ~ area,data=sman) # OK plot(sman$area,sman$price, xlab="area",ylab="price") # abline(sman.lm, lwd=1, col="blue") price 2e+07 4e+07 6e+07 8e+07 20 40 60 80 100 area 17 / 1
(summary) > summary(sman.lm) Call: lm(formula = price ~ area, data = sman) Residuals: Min 1Q Median 3Q Max -16082568-5634345 -789511 4806399 16822060 Coefficients: Estimate Std. Error t value Pr(> t ) (Intercept) -1029167 1519818-0.677 0.5 area 673025 26408 25.485 <2e-16 *** --- Signif. codes: 0 *** 0.001 ** 0.01 * 0.05. 0.1 1 Residual standard error: 7594000 on 128 degrees of freedom Multiple R-squared: 0.8354, Adjusted R-squared: 0.8341 F-statistic: 649.5 on 1 and 128 DF, p-value: < 2.2e-16 18 / 1
(1) Call: lm(formula = price ~ area, data = sman) Residuals: Min 1Q Median 3Q Max -16082568-5634345 -789511 4806399 16822060 y i ŷ i = y i ˆβ x 19 / 1
(2) Coefficients: Estimate Std. Error t value Pr(> t ) (Intercept) -1029167 1519818-0.677 0.5 area 673025 26408 25.485 <2e-16 *** --- Signif. codes: 0 *** 0.001 ** 0.01 * 0.05. 0.1 1 t- p- (Intercept). t- p- 0 y area ( α = 0.05 ) 20 / 1
(3) Residual standard error: 7594000 on 128 degrees of freedom (σ ) 130 1 + 130-2=128. F-statistic: 649.5 on 1 and 128 DF, p-value: < 2.2e-16 β = 0 F (d, n d 1) ( ) (α = 0.05) Multiple R-squared: 0.8354, Adjusted R-squared: 0.8341 (Adjusted R-squared) Adjusted 21 / 1
par(mfrow=c(2,2)) # Figure 2x2 plot(sman.lm) Residuals Standardized residuals 2e+07 0e+00 2e+07 0.0 0.5 1.0 1.5 Residuals vs Fitted 96 58 1e+07 3e+07 5e+07 7e+07 39 Fitted values Scale Location 39 96 58 1e+07 3e+07 5e+07 7e+07 Standardized residuals Standardized residuals 2 1 0 1 2 2 1 0 1 2 Normal Q Q 96 58 39 2 1 0 1 2 Theoretical Quantiles Residuals vs Leverage 58 43 45 Cook s distance 0.00 0.01 0.02 0.03 0.04 0.05 ( ) : ŷ i vs y i ŷ i ( ) Q-Q : Q-Q ( ) Scale-Location : ( ) Cook : ( ). 0.5 Fitted values Leverage 22 / 1
predict interval= confidence ( ˆβ X ) area.plot <- seq(min(sman$area)*0.9,max(sman$area)*1.1,by=1) # sman.con <- predict(sman.lm, data.frame(area=area.plot),interval="confidence") # price 2e+07 4e+07 6e+07 8e+07 1e+08 20 40 60 80 100 120 area (m^2) 23 / 1
> sman.lm3 <- lm(price ~ area + tikunen, data=sman) > (AIC(sman.lm3)) [1] 4443.881 AIC( ) AIC sman.lmall <- lm(price ~., data=sman) sman.lmaic <- step(sman.lmall) summary(sman.lmaic) step( ) AIC ( ) walk + area + tikunen 24 / 1
> shapiro.test(sman.lmaic$residuals) Shapiro-Wilk normality test data: sman.lmaic$residuals W = 0.9874, p-value = 0.2806 25 / 1
QQ qqnorm(sman.lmaic$residual) Normal Q Q Plot Sample Quantiles 1.5e+07 5.0e+06 5.0e+06 1.5e+07 2 1 0 1 2 Theoretical Quantiles QQ 26 / 1