1 15 R Part 4 2017 7 24 4 : website: email: http://www3.u-toyama.ac.jp/kkarato/ kkarato@eco.u-toyama.ac.jp 1 2 2 3 2.1............................... 3 2.2 2................................. 4 2.3................................ 6 3 10 3.1............................... 10 3.2.................................... 12 3.3........................... 13 3.4........................... 13 3.5............................. 16 3.6................................ 17
1 2 1 R 1. 2. R R 3. r 4. data 11.csv data<- read.csv("data_11.csv") read.csv CSV(Comma Separated Value ) data R ctrl + R attach(data) obs<- length(data[,1]) str(data) # summary(data) 4 4 attach length(data[,1]) 1 str data # R summary data str(data) 2162 44
2 3 2 2.1 : curve, lines, abline x-y curve y = 3 2x curve(3-2*x) lines 2 lines(c(0.2,0.8),c(2,2.5)) 1 (0.2, 2) 2 (0.8, 2.5) 2 abline 1 2 (y = 1 + 2x) abline(1,2) abline(v=0.5) x = 0.5 (vertical line) abline(h=1.5) y = 1.5 (horizontal line) 2 2 abline [18]age.sf [25]age.sp plot(age.sf, age.sp) reg<- lm(age.sf~age.sp) abline(reg) lm (fitting linear models) 2 age.sf~age.sp formula age.sf = α + β age.sp lm reg abline abline(lm(age.sf age.sp)) lm reg
2 4 reg Call: lm(formula = age.sf ~ age.sp) Coefficients: (Intercept) age.sp 2.131 0.959 2.2 2 summary lm summary summary(reg) summary(lm(age.sf~age.sp)) summary(reg) Call: lm(formula = age.sf ~ age.sp) Residuals: Min 1Q Median 3Q Max -20.5875-2.6887-0.3003 2.7407 26.4690 Coefficients: Estimate Std. Error t value Pr(> t ) (Intercept) 2.130918 0.393006 5.422 6.55e-08 *** age.sp 0.958977 0.006999 137.011 < 2e-16 *** --- Signif. codes: 0 *** 0.001 ** 0.01 * 0.05. 0.1 1 Residual standard error: 4.581 on 2160 degrees of freedom Multiple R-squared: 0.8968, Adjusted R-squared: 0.8968 F-statistic: 1.877e+04 on 1 and 2160 DF, p-value: < 2.2e-16 α, β u i age.sf = α + β age.sp + u i Residuals 2 Coefficients α, β (Estimate) (Std. Error) t (t value) p (Pr(> t )) (Intercept) ˆα age.sp
2 5 age.sp ˆβ * Signif. codes *** 0.1% ** 1% * 5%. 10% Residual standard error ˆσ = û 2 i / degrees of freedom Multiple R-squared R 2 Adjusted R-squared adj.r 2 F-statistic 0 H 0 : β = 0 F DF p summary summary names(lm(formula)) names(reg) [1] "coefficients" "residuals" "effects" "rank" [5] "fitted.values" "assign" "qr" "df.residual" [9] "xlevels" "call" "terms" "model" summary Value summary(reg)[6] summary(reg)$sig summary(reg)$sigma 1 1 summary(lm(formula)) Value [3] $res [4] $coef t p [6] $sig [8] $r.sq [9] $adj.r.sq [10] $fstat F [11] $cov
2 6 2.3 [21] hour.sf [18] age.sf [5] sex D<- ifelse(sex==2,1,0) sex==2 D hour.sf!=888 y<- hour.sf[hour.sf!=888] x<- age.sf[hour.sf!=888] D<- D[hour.sf!=888] y = β 1 + β 2 x + β 3 D + u (1) reg<- lm(y~x + D) summary(reg) y x2, x3, x4 y~x2+x3+x4 formula subset subset pref==16 [21] hour.sf subset(hour.sf,pref==16) pref==16 sex==1 [21] hour.sf subset(hour.sf,pref==16 & sex==1)
2 7 data subset(data,pref==16 & sex==1) hour.sf==888 data subset(data,hour.sf!=888) data.frame factor csv data<- read.csv("data_11.csv") data data.frame subset data.frame data2 data2<- data.frame(subset(data, hour.sf!=888)) (1) reg<- lm(hour.sf~age.sf + factor(sex), data=data2) summary(reg) lm 2 data=data2 factor sex 1 2 factor 1 2 lm formula factor Coefficients: Estimate Std. Error t value Pr(> t ) (Intercept) 63.65005 1.76697 36.02 <2e-16 *** age.sf -0.35703 0.03288-10.86 <2e-16 *** factor(sex)2-15.95804 0.80737-19.77 <2e-16 *** lm(y~ x + D) factor(sex)2 sex==2 hour.sf 1 0.357 15.958
2 8 2 2 hour.sf = β 1 age.sf + β 2 age.sf 2 + β 3 factor(sex)2 + u (2) 2 formula I (Inhibit Interpretation/Conversion of Objects) I(age.sf^2) reg<- lm(hour.sf~age.sf + I(age.sf^2) + factor(sex), data=data2) summary(reg) (2) Coefficients: Estimate Std. Error t value Pr(> t ) (Intercept) 27.605957 5.633785 4.900 1.07e-06 *** age.sf 1.169697 0.229275 5.102 3.84e-07 *** I(age.sf^2) -0.015166 0.002255-6.726 2.56e-11 *** factor(sex)2-16.279732 0.795979-20.452 < 2e-16 *** 2 summary(reg)$coef β 2 b2 β 3 b3 b2<- summary(reg)$coef[2,1] b3<- summary(reg)$coef[3,1] b2 = 1.169697 b3 = -0.015166 Ŷ Ŷ = 1.170x 0.015x 2 + extras (3) x extras curve(b2*x+b3*x^2, 20,80, axes=f) axis(1) axes=f axis(1) 30 (3) x 0 x = 38.56 R -b2/(2*b3) [1] 38.5621
2 9 1 data2 hour.sf 1. 2 (job.sf) 2. 10 (age.sf10) (job.sf) factor 1 1 reg<- lm(hour.sf~age.sf + I(age.sf^2) + factor(sex) + factor(job.sf), data=data2) summary(reg) Coefficients: Estimate Std. Error t value Pr(> t ) (Intercept) 36.195526 5.322200 6.801 1.56e-11 *** age.sf 1.002056 0.208864 4.798 1.78e-06 *** I(age.sf^2) -0.012484 0.002071-6.029 2.13e-09 *** factor(sex)2-9.741887 0.859124-11.339 < 2e-16 *** factor(job.sf)2-5.885321 1.442993-4.079 4.80e-05 *** factor(job.sf)3-21.253979 1.584140-13.417 < 2e-16 *** factor(job.sf)4-12.676375 3.859618-3.284 0.00105 ** factor(job.sf)5-10.263944 1.667614-6.155 9.88e-10 *** factor(job.sf)6-5.737439 2.217513-2.587 0.00978 ** factor(job.sf)2 factor(job.sf)6 job.sf==1 5.8 21.2 1 2 reg<- lm(hour.sf~factor(age.sf10) + factor(sex) + factor(job.sf), data=data2) summary(reg) age.sf10 10 factor(age.sf10)30 factor(age.sf10)80 20 age.sf10==20 20 30, 40, 50 60 20 8.0 70 20 14.6
3 10 Coefficients: Estimate Std. Error t value Pr(> t ) (Intercept) 57.3306 2.2986 24.942 < 2e-16 *** factor(age.sf10)30-2.0756 1.9640-1.057 0.290799 factor(age.sf10)40-1.6515 1.9093-0.865 0.387197 factor(age.sf10)50-3.1518 1.9292-1.634 0.102551 factor(age.sf10)60-8.0257 2.0113-3.990 6.95e-05 *** factor(age.sf10)70-14.6730 2.6060-5.630 2.19e-08 *** factor(age.sf10)80-23.0555 5.7030-4.043 5.58e-05 *** factor(sex)2-9.7094 0.8668-11.202 < 2e-16 *** factor(job.sf)2-5.9239 1.4554-4.070 4.97e-05 *** factor(job.sf)3-21.3883 1.5956-13.405 < 2e-16 *** factor(job.sf)4-13.3221 3.8877-3.427 0.000629 *** factor(job.sf)5-10.7186 1.6755-6.397 2.18e-10 *** factor(job.sf)6-5.8525 2.2318-2.622 0.008831 ** 3 3.1 work.sf work.sf==1 work.sf==2 work.sf==3 { 1 Y i = 0 i = 1, 2,, n (4) 2 Y i = α + βx i + u i (5) (5) 2 2 0 1 0 1 Y i 0 1 Y i 1 p i Y i = 0 Y i = 1 1 E(Y i ) = p i V (Y i ) = p i (1 p i )
3 11 (5) 2 Y i = α + βx i + u i (6) Y i < 0 Y i = 0 Y i 0 Y i = 1 Yi (latent variable) Yi 2 Y i α + βx i + u i Yi < 0 Y i = 0 Y i 0 Y i = 1 0 u i (5) 2 2 (generalized linear model; GLM) 3 Y i X i α + βx i Y i E(Y i ) = p i g g(e(y i )) = α + βx i (7) g u i N(0, σ 2 ) E(Y i ) = α + βx i (4) 0 1 2 Pr(Y i = 1) = p i p i = eα+βx i 1+e α+βx i ln *1 E(Y i ) = p i g(p i ) p i 1 p i ln p i 1 p i = α + βx i (8) *1 10
3 12 3.2 (maximum likelihood method; ML) (likelihood) * 2 L = Pr(Y 1 = y 1 ) Pr(Y 2 = y 2 ) Pr(Y n = y n ) (9) y i 0 1 n p 2 B(n, p) Y y Pr(Y = y) = n C y p y (1 p) n y n = 1 Y = 0, 1 Pr(Y = 0) = p 0 (1 p) 1 0 = 1 p, Pr(Y = 1) = p 1 (1 p) 1 1 = p i (Y i = 1) (Y i = 0) Pr(Y i = y i ) = p y i i (1 p i) 1 y i (10) (7) X i Y i = 1 Pr(Y i = 1) = Pr(Y i 0) (11) = Pr(α + βx i + u i 0) = Pr(u i α βx i ) = p i Y i = 0 Pr(Y i = 0) = Pr(Y i < 0) (12) = Pr(α + βx i + u i < 0) = Pr(u i < α βx i ) = 1 p i (12) (13) (10) L = Pr(Y 1 = y 1 ) Pr(Y 2 = y 2 ) Pr(Y n = y n ) = Pr(u 1 α βx 1 ) y 1 Pr(u 1 < α βx 1 ) 1 y 1 Pr(u 2 α βx 2 ) y 2 Pr(u 2 < α βx 2 ) 1 y 2 Pr(u n α βx n ) y n Pr(u n < α βx n ) 1 y n X 1,, X n L α, β L α, β LL = ln L α, β ˆα, ˆβ *2
3 13 3.3 β = 0 0 (likelihood ratio test; LR) LR = 2(LL0 LL) χ 2 (k) (13) LL0 0 k k 2 F (Akaike s Information Criterion; AIC) AIC = 2LL + 2(k + 1) (14) AIC 2 (residual deviance) Dev Dev = 2LL (15) 2 LR AIC LR = Dev0 Dev, AIC = Dev + 2k (McFadden s pseudo R-squared) pr 2 = 1 LL LL0 (16) LR 2 3.4 glm R glm y<- ifelse(work.sf==1 1,0) y.glm<- glm(y~ age.sf + factor(sex), family=binomial(link="logit")) summary(y.glm) 2 (4) work.sf y y age.sf factor(sex) glm family 2 binomial link="logit"
3 14 Coefficients: Estimate Std. Error z value Pr(> z ) (Intercept) 5.527970 0.273707 20.20 <2e-16 *** age.sf -0.077703 0.004228-18.38 <2e-16 *** factor(sex)2-1.381965 0.109007-12.68 <2e-16 *** --- Signif. codes: 0 *** 0.001 ** 0.01 * 0.05. 0.1 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 2877.6 on 2161 degrees of freedom Residual deviance: 2360.4 on 2159 degrees of freedom AIC: 2366.4 Number of Fisher Scoring iterations: 4 z value 2 k = 2 Null deviance 2 0 (Dev0 = 2LL0) n = 2162 Null deviance n 1 Residual deviance LL Dev = 2LL n (k + 1) AIC = Dev + 2(k + 1) = 2360.4 + 2 3 sumamry deviance LL<- y.glm$dev/(-2) LL0<- y.glm$null.dev/(-2) LR<- -2*(LL0-LL) bhat<- y.glm$coef k<- length(bhat)-1 LRp<- 1-pchisq(LR,df=k) AIC<- y.glm$aic pr2<- 1 - LL/LL0 $dev (15) y.glm$dev/(-2) LL $coef length 1 k = 2 (13) p k 2 $aic (16)
3 15 2 2162 ME 5.528 0.274 *** - -0.078 0.004 *** -0.014-1.382 0.109 *** -0.254 LL -1180.209 LR 517.220 LR p 0.000 AIC 2366.417 pr 2 0.180 X 1 Pr(Y = 1) (marginal effects) 10 ME = Pr(Y i = 1) X i = ˆβp (1 p ) (17) p (1 p ) X i ˆp i = Pr(u i ˆα ˆβX i ) p (1 p ) = i ˆp i(1 ˆp i ) n y.glm$fit phat<-y.glm$fit ME<- bhat*mean(phat*(1-phat)) ME (Intercept) age.sf D 1.01781992-0.01430682-0.25444983 bhat phat mean(phat*(1-phat)) age.sf 1 0.0143 (1.43%) D 0.2544 (25.44%) ln p i 1 p i = β 1 + β 2 age.sf i + β 3 D i + u i 2
3 16 2 ownhouse==1 y = 1 y = 0 2 age.sf factor(income.hh) factor(pref) 3.5 (12) p i (12) p i N(0, σ 2 ) (probit model) p i = Pr(u i α + βx i ) = α+βxi f(u i )du i (18) f(u i ) (18) f(u i ) F (α + βx i ) α + βx i F 1 (p i ) F 1 (p i ) = α + βx i (19) glm y 2 y.glm<- glm(y~ age.sf + factor(sex), family=binomial(link="probit")) summary(y.glm) Coefficients: Estimate Std. Error z value Pr(> z ) (Intercept) 3.262107 0.150845 21.63 <2e-16 *** age.sf -0.045510 0.002371-19.19 <2e-16 *** factor(sex)2-0.854726 0.063304-13.50 <2e-16 *** AIC R ME = Pr(Y i = 1) X i = ˆβ i f(α + βx i) n (20)
3 17 3 2162 3.262 0.151 *** - -0.046 0.002 *** -0.014-0.855 0.063 *** -0.265 LL -1180.950 LR 517.220 LR p 0.000 AIC 2367.900 pr 2 0.180 phat<-y.glm$fit ME<- bhat*mean(dnorm(qnorm(phat))) ME (Intercept) age.sf factor(sex)2 1.0103654-0.0140957-0.2647326 3 3 2 3.6 op56: Y i = {1, 2, 3, 4, 5} Y i 2 (7) Y i = βx i + u i (21) ζ j 1 < Y i < ζ j Y i = j (j = 1, 2, 3, 4, 5) (breakpoints) ζ 0, ζ 1, ζ 2, ζ 3, ζ 4, ζ 5 j
3 18 Y i = 1 ζ 0 < Yi < ζ 1 ζ 0 βx i < u i < ζ 1 βx i Y i = 2 ζ 1 < Yi < ζ 2 ζ 1 βx i < u i < ζ 2 βx i Y i = 3 ζ 2 < Yi < ζ 3 ζ 2 βx i < u i < ζ 3 βx i Y i = 4 ζ 3 < Yi < ζ 4 ζ 3 βx i < u i < ζ 4 βx i Y i = 5 ζ 4 < Yi < ζ 5 ζ 4 βx i < u i < ζ 5 βx i ζ 0 =, ζ 5 = ζ 1, ζ 2, ζ 3, ζ 4 Λ Y i = j X i p ij = Pr(Y i = j) = Λ(ζ j βx i ) Λ(ζ j 1 βx i ) (22) F p ij = Pr(Y i = j) = F (ζ j βx i ) F (ζ j 1 βx i ) (23) 2 (p y i1 i1 ) (py i2 i2 ) (py i3 i3 ) (py i4 i4 ) (py i5 i5 ) (22) (23) AIC = 2LL + 2(k + k ζ ) (24) k k ζ 5 ζ 1, ζ 2, ζ 3, ζ 4 : : Pr(Y i = j) = [λ(ˆζ j 1 X ˆβX i ) λ(ˆζ j ˆβX i )] ˆβ i (25) Pr(Y i = j) = [f(ˆζ j 1 X ˆβX i ) f(ˆζ j ˆβX i )] ˆβ i (26) λ f op56 age.sf facto(sex) 6 c06 6 12 c0612 12 18 c1218 factor(edu.sf) pkg MASS R MASS library
3 19 library(mass) polr ologit<- polr(as.factor(op56)~ age.sf + factor(sex) + c06 + c0612 + c1218 + factor(edu.sf), method = "logistic") summary(ologit) op56 as.ordered method = "logistic" method = "probit" AIC LL<- ologit$deviance/(-2) ologit0<- polr(as.ordered(op56)~ 1, method = "logistic") LL0<- ologit0$deviance/(-2) LR<- -2*(LL0-LL); # LRp<- 1-pchisq(LR,df=k) # H0: 0 p bhat<- matrix(ologit$coef) # k<- length(bhat) # kz<- length(ologit$zeta) # - 1 AIC<- -2*LL + 2*(k+kz) # pr2<- 1-(LL/LL0) # polr 0 (null.deviance) polr(as.ordered(op56)~ 1, method = "logistic") LL0 u i ologit$fit (22) phat<- ologit$fit cphat<- array(0,c(ologit$n,4)) cphat[,1]<- phat[,1] cphat[,2]<- phat[,1] + phat[,2] cphat[,3]<- phat[,1] + phat[,2] + phat[,3] cphat[,4]<- phat[,1] + phat[,2] + phat[,3] + phat[,4] lambda<- dlogis(qlogis(cphat)) lambda 4 (25) λ(ζ 1 βx i ) λ(ζ 2 βx i ) λ(ζ 3 βx i ) λ(ζ 4 βx i )
3 20 dl1<- mean(-lambda[,1]) dl2<- mean(lambda[,1]-lambda[,2]) dl3<- mean(lambda[,2]-lambda[,3]) dl4<- mean(lambda[,3]-lambda[,4]) dl5<- mean(lambda[,4]) (25) ME ME<- cbind(bhat*dl1, bhat*dl2, bhat*dl3, bhat*dl4, bhat*dl5) ME [,1] [,2] [,3] [,4] [,5] [1,] -0.0013880099-0.0010179285 0.0002881974 0.001078294 0.001039447 [2,] -0.0037981183-0.0027854363 0.0007886166 0.002950620 0.002844318 [3,] -0.0203954382-0.0149574575 0.0042347764 0.015844473 0.015273646 [4,] -0.0300370033-0.0220283180 0.0062366885 0.023334654 0.022493979 [5,] -0.0336688021-0.0246917801 0.0069907716 0.026156066 0.025213744 [6,] 0.0197321449 0.0144710162-0.0040970545-0.015329185-0.014776922 [7,] 0.0009674851 0.0007095272-0.0002008823-0.000751604-0.000724526 [8,] 0.0173066125 0.0126921970-0.0035934327-0.013444877-0.012960500 [9,] -0.0097591267-0.0071570770 0.0020263217 0.007581510 0.007308372 4 6 12 12 18 6 1 0.14% 0.10% 0.11% 0.10% 6 12 3.00% 2.20% 2.33% 2.25%
3 21 4 2162 Y = 1 Y = 2 Y = 3 Y = 4 Y = 5 age.sf 0.010 0.004 ** -0.0014-0.0010 0.0003 0.0011 0.0010 factor(sex)2 0.028 0.082-0.0038-0.0028 0.0008 0.0030 0.0028 c06 0.150 0.101-0.0204-0.0150 0.0042 0.0158 0.0153 c0612 0.220 0.080 *** -0.0300-0.0220 0.0062 0.0233 0.0225 c1218 0.247 0.074 *** -0.0337-0.0247 0.0070 0.0262 0.0252 factor(edu.sf)2-0.145 0.282 0.0197 0.0145-0.0041-0.0153-0.0148 factor(edu.sf)3-0.007 0.275 0.0010 0.0007-0.0002-0.0008-0.0007 factor(edu.sf)4-0.127 0.291 0.0173 0.0127-0.0036-0.0134-0.0130 factor(edu.sf)5 0.072 0.280-0.0098-0.0072 0.0020 0.0076 0.0073 : 1 2 ζ 1-0.972 0.409 2 3 ζ 2 0.222 0.408 3 4 ζ 3 1.529 0.409 4 5 ζ 4 2.714 0.413 LL -3353.736 LR 22.522 LR p 0.007 AIC 6733.473 pr 2 0.003 12 18 3.37% 2.47% 2.62% 2.52% 0.38% 0.28% 0.30% 0.28% 4 job.sati data3<- data.frame(subset(data, work.sf == 1)) 5 1 as.ordered(6-job.sati) age.sf factor(sex) factor(income.sf) hour.sf library(mass)
3 22 > data3<- data.frame(subset(data, work.sf == 1)) > ologit<- polr(as.ordered(6-job.sati)~ age.sf + factor(sex) + + factor(income.sf) + hour.sf +, method = "logistic", data=data3) > summary(ologit) Re-fitting to get Hessian Call: polr(formula = as.ordered(6 - job.sati) ~ age.sf + factor(sex) + factor(income.sf) + hour.sf, data = data3, method = "logistic") Coefficients: Value Std. Error t value age.sf 0.006856 0.004474 1.532 factor(sex)2 0.436100 0.131630 3.313 factor(income.sf)2 0.533906 0.144522 3.694 factor(income.sf)3 0.647301 0.158829 4.075 factor(income.sf)4 1.536530 0.236342 6.501 hour.sf -0.010312 0.003584-2.877 Intercepts: Value Std. Error t value 1 2-3.2136 0.3643-8.8205 2 3-1.8616 0.3355-5.5490 3 4-0.3207 0.3277-0.9786 4 5 1.4821 0.3301 4.4903 Residual Deviance: 3500.94 AIC: 3520.94