Let’s look at a new example: regressions of hourly wages on years of education (note this is US data)
library(haven)
# Report the working directory this session is running in.
getwd()
## [1] "C:/Users/Ralf Martin/Dropbox/datastories/datastorieshub/code"
# Read the WAGE1 CSV from Dropbox and preview its first rows.
data <- read.csv(
  file = "https://www.dropbox.com/s/9agc2vmamfztlel/WAGE1.csv?dl=1"
)
head(x = data)
## X wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1 1 3.10 11 2 0 0 1 0 2 1 0 0
## 2 2 3.24 12 22 2 0 1 1 3 1 0 0
## 3 3 3.00 11 2 0 0 0 0 2 0 0 0
## 4 4 6.00 8 44 28 0 0 1 0 1 0 0
## 5 5 5.30 12 7 2 0 0 1 1 0 0 0
## 6 6 8.75 16 9 8 0 0 1 0 1 0 0
## west construc ndurman trcommpu trade services profserv profocc clerocc
## 1 1 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 1 0 0 0
## 3 1 0 0 0 1 0 0 0 0
## 4 1 0 0 0 0 0 0 0 1
## 5 1 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 1 1 0
## servocc lwage expersq tenursq
## 1 0 1.131402 4 0
## 2 1 1.175573 484 4
## 3 0 1.098612 4 0
## 4 0 1.791759 1936 784
## 5 0 1.667707 49 4
## 6 0 2.169054 81 64
# Per-column descriptive statistics (min, quartiles, mean, max) of the wage data.
summary(object = data)
## X wage educ exper
## Min. : 1.0 Min. : 0.530 Min. : 0.00 Min. : 1.00
## 1st Qu.:132.2 1st Qu.: 3.330 1st Qu.:12.00 1st Qu.: 5.00
## Median :263.5 Median : 4.650 Median :12.00 Median :13.50
## Mean :263.5 Mean : 5.896 Mean :12.56 Mean :17.02
## 3rd Qu.:394.8 3rd Qu.: 6.880 3rd Qu.:14.00 3rd Qu.:26.00
## Max. :526.0 Max. :24.980 Max. :18.00 Max. :51.00
## tenure nonwhite female married
## Min. : 0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 2.000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean : 5.105 Mean :0.1027 Mean :0.4791 Mean :0.6084
## 3rd Qu.: 7.000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :44.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## numdep smsa northcen south
## Min. :0.000 Min. :0.0000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000
## Median :1.000 Median :1.0000 Median :0.000 Median :0.0000
## Mean :1.044 Mean :0.7224 Mean :0.251 Mean :0.3555
## 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:0.750 3rd Qu.:1.0000
## Max. :6.000 Max. :1.0000 Max. :1.000 Max. :1.0000
## west construc ndurman trcommpu
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.1692 Mean :0.04563 Mean :0.1141 Mean :0.04373
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## trade services profserv profocc
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2871 Mean :0.1008 Mean :0.2586 Mean :0.3669
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## clerocc servocc lwage expersq
## Min. :0.0000 Min. :0.0000 Min. :-0.6349 Min. : 1.0
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 1.2030 1st Qu.: 25.0
## Median :0.0000 Median :0.0000 Median : 1.5369 Median : 182.5
## Mean :0.1673 Mean :0.1407 Mean : 1.6233 Mean : 473.4
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.: 1.9286 3rd Qu.: 676.0
## Max. :1.0000 Max. :1.0000 Max. : 3.2181 Max. :2601.0
## tenursq
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 4.00
## Mean : 78.15
## 3rd Qu.: 49.00
## Max. :1936.00
# Simple regression: hourly wage explained by years of education.
mod1 <- lm(formula = wage ~ educ, data = data)
summary(object = mod1)
##
## Call:
## lm(formula = wage ~ educ, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3396 -2.1501 -0.9674 1.1921 16.6085
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.90485 0.68497 -1.321 0.187
## educ 0.54136 0.05325 10.167 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.378 on 524 degrees of freedom
## Multiple R-squared: 0.1648, Adjusted R-squared: 0.1632
## F-statistic: 103.4 on 1 and 524 DF, p-value: < 2.2e-16
# Scatter plot of wage against education, with the fitted line from mod1.
plot(data$educ, data$wage, col = "blue")
# abline() draws the regression line straight from mod1's coefficients. Unlike
# lines(data$educ, mod1$fitted.values), it does not depend on the row order of
# `data`, and it cannot fail with a length mismatch if lm() dropped rows that
# contain missing values.
abline(mod1, col = "red")
# Auxiliary regression: how experience co-varies with education.
summary(lm(formula = exper ~ educ, data = data))
##
## Call:
## lm(formula = exper ~ educ, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.652 -9.971 -2.971 9.125 30.625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.4615 2.6279 13.494 < 2e-16 ***
## educ -1.4682 0.2043 -7.187 2.3e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.96 on 524 degrees of freedom
## Multiple R-squared: 0.08973, Adjusted R-squared: 0.08799
## F-statistic: 51.65 on 1 and 524 DF, p-value: 2.295e-12
# Wage regression with experience added as a second regressor.
summary(lm(formula = wage ~ educ + exper, data = data))
##
## Call:
## lm(formula = wage ~ educ + exper, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5532 -1.9801 -0.7071 1.2030 15.8370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.39054 0.76657 -4.423 1.18e-05 ***
## educ 0.64427 0.05381 11.974 < 2e-16 ***
## exper 0.07010 0.01098 6.385 3.78e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.257 on 523 degrees of freedom
## Multiple R-squared: 0.2252, Adjusted R-squared: 0.2222
## F-statistic: 75.99 on 2 and 523 DF, p-value: < 2.2e-16
# Auxiliary regression: education on the 0/1 female indicator.
summary(lm(formula = educ ~ female, data = data))
##
## Call:
## lm(formula = educ ~ female, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.3175 -0.7883 -0.3175 1.6825 5.6825
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.7883 0.1668 76.652 <2e-16 ***
## female -0.4709 0.2410 -1.953 0.0513 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.762 on 524 degrees of freedom
## Multiple R-squared: 0.00723, Adjusted R-squared: 0.005335
## F-statistic: 3.816 on 1 and 524 DF, p-value: 0.05129
# Wage regression on education and the female indicator.
summary(lm(formula = wage ~ educ + female, data = data))
##
## Call:
## lm(formula = wage ~ educ + female, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9890 -1.8702 -0.6651 1.0447 15.4998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.62282 0.67253 0.926 0.355
## educ 0.50645 0.05039 10.051 < 2e-16 ***
## female -2.27336 0.27904 -8.147 2.76e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.186 on 523 degrees of freedom
## Multiple R-squared: 0.2588, Adjusted R-squared: 0.256
## F-statistic: 91.32 on 2 and 523 DF, p-value: < 2.2e-16
# Wage regression on education, experience and the female indicator together.
summary(lm(formula = wage ~ educ + exper + female, data = data))
##
## Call:
## lm(formula = wage ~ educ + exper + female, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3856 -1.9652 -0.4931 1.1199 14.8217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.73448 0.75362 -2.302 0.0218 *
## educ 0.60258 0.05112 11.788 < 2e-16 ***
## exper 0.06424 0.01040 6.177 1.32e-09 ***
## female -2.15552 0.27031 -7.974 9.74e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.078 on 522 degrees of freedom
## Multiple R-squared: 0.3093, Adjusted R-squared: 0.3053
## F-statistic: 77.92 on 3 and 522 DF, p-value: < 2.2e-16
library(haven) # make sure libraries such as this are installed. If not go to Tools -> Install Packages
# Read the foreigners CSV from Dropbox, then add crimesPc, defined as
# crimes11 divided by pop11.
df <- read.csv(
  file = "https://www.dropbox.com/s/g1w75gkw7g91zef/foreigners.csv?dl=1"
)
df[["crimesPc"]] <- df$crimes11 / df$pop11
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Merge in further variables from a second CSV, matching rows on "area".
df <- df %>%
  merge(
    read.csv("https://www.dropbox.com/s/gwq2wmmxr8s3v7t/foreigners_more.csv?dl=1"),
    by = "area"
  )
# Bivariate baseline, stored under its own name so that it is not clobbered by
# the model fitted on the next line and stays available for comparison.
reg1_simple <- lm(crimesPc ~ b_migr11, df)
# Same regression with pop11 added as a control; this is the model reported.
reg1 <- lm(crimesPc ~ pop11 + b_migr11, df)
summary(reg1)
##
## Call:
## lm(formula = crimesPc ~ pop11 + b_migr11, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6243 -0.4052 -0.1253 0.2347 13.8304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.124e+00 1.018e-01 11.034 < 2e-16 ***
## pop11 -1.033e-06 5.078e-07 -2.034 0.0428 *
## b_migr11 4.105e-02 5.335e-03 7.694 1.77e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9627 on 321 degrees of freedom
## (96 observations deleted due to missingness)
## Multiple R-squared: 0.1561, Adjusted R-squared: 0.1508
## F-statistic: 29.68 on 2 and 321 DF, p-value: 1.483e-12
# Print the reg1 results (crimesPc on pop11 and b_migr11) once more.
reg1 %>% summary()
##
## Call:
## lm(formula = crimesPc ~ pop11 + b_migr11, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6243 -0.4052 -0.1253 0.2347 13.8304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.124e+00 1.018e-01 11.034 < 2e-16 ***
## pop11 -1.033e-06 5.078e-07 -2.034 0.0428 *
## b_migr11 4.105e-02 5.335e-03 7.694 1.77e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9627 on 321 degrees of freedom
## (96 observations deleted due to missingness)
## Multiple R-squared: 0.1561, Adjusted R-squared: 0.1508
## F-statistic: 29.68 on 2 and 321 DF, p-value: 1.483e-12
# Same two regressors as reg1, entered in the opposite order.
lm(crimesPc ~ b_migr11 + pop11, data = df) %>% summary()
##
## Call:
## lm(formula = crimesPc ~ b_migr11 + pop11, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6243 -0.4052 -0.1253 0.2347 13.8304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.124e+00 1.018e-01 11.034 < 2e-16 ***
## b_migr11 4.105e-02 5.335e-03 7.694 1.77e-13 ***
## pop11 -1.033e-06 5.078e-07 -2.034 0.0428 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9627 on 321 degrees of freedom
## (96 observations deleted due to missingness)
## Multiple R-squared: 0.1561, Adjusted R-squared: 0.1508
## F-statistic: 29.68 on 2 and 321 DF, p-value: 1.483e-12
# Auxiliary regression: b_migr11 on pop11.
lm(b_migr11 ~ pop11, data = df) %>% summary()
##
## Call:
## lm(formula = b_migr11 ~ pop11, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.039 -5.187 -2.698 1.225 40.835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.240e+00 9.530e-01 6.548 2.18e-10 ***
## pop11 3.088e-05 4.883e-06 6.326 8.02e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.863 on 337 degrees of freedom
## (81 observations deleted due to missingness)
## Multiple R-squared: 0.1061, Adjusted R-squared: 0.1035
## F-statistic: 40.01 on 1 and 337 DF, p-value: 8.024e-10
# Crime regression with urate2011 added as a further control.
lm(crimesPc ~ b_migr11 + pop11 + urate2011, data = df) %>% summary()
##
## Call:
## lm(formula = crimesPc ~ b_migr11 + pop11 + urate2011, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.98367 -0.29360 -0.07496 0.18189 2.92237
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.461e-01 8.058e-02 9.259 < 2e-16 ***
## b_migr11 2.242e-02 2.875e-03 7.799 9.54e-14 ***
## pop11 -3.273e-07 2.804e-07 -1.167 0.244
## urate2011 5.527e-02 9.504e-03 5.815 1.50e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5047 on 311 degrees of freedom
## (105 observations deleted due to missingness)
## Multiple R-squared: 0.2674, Adjusted R-squared: 0.2603
## F-statistic: 37.84 on 3 and 311 DF, p-value: < 2.2e-16
# Auxiliary regression: b_migr11 on urate2011.
lm(b_migr11 ~ urate2011, data = df) %>% summary()
##
## Call:
## lm(formula = b_migr11 ~ urate2011, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.693 -6.169 -2.860 1.549 42.717
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.6865 1.5206 4.397 1.49e-05 ***
## urate2011 0.5890 0.1827 3.224 0.00139 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.25 on 326 degrees of freedom
## (92 observations deleted due to missingness)
## Multiple R-squared: 0.03091, Adjusted R-squared: 0.02793
## F-statistic: 10.4 on 1 and 326 DF, p-value: 0.00139
# Crime regression controlling for pop11 and medianage.
lm(crimesPc ~ b_migr11 + pop11 + medianage, data = df) %>% summary()
##
## Call:
## lm(formula = crimesPc ~ b_migr11 + pop11 + medianage, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3870 -0.3786 -0.1240 0.1837 14.0674
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.131e+00 8.862e-01 3.533 0.000472 ***
## b_migr11 2.907e-02 7.465e-03 3.894 0.000120 ***
## pop11 -1.509e-06 5.462e-07 -2.764 0.006045 **
## medianage -4.439e-02 1.947e-02 -2.280 0.023275 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9565 on 320 degrees of freedom
## (96 observations deleted due to missingness)
## Multiple R-squared: 0.1696, Adjusted R-squared: 0.1618
## F-statistic: 21.78 on 3 and 320 DF, p-value: 7.329e-13
# Crime regression with pop11, urate2011 and medianage all included.
lm(crimesPc ~ b_migr11 + pop11 + urate2011 + medianage, data = df) %>%
  summary()
##
## Call:
## lm(formula = crimesPc ~ b_migr11 + pop11 + urate2011 + medianage,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8873 -0.2680 -0.0783 0.1434 3.1754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.689e+00 4.855e-01 7.599 3.57e-13 ***
## b_migr11 5.446e-03 3.879e-03 1.404 0.16130
## pop11 -8.656e-07 2.793e-07 -3.099 0.00212 **
## urate2011 4.016e-02 9.320e-03 4.309 2.20e-05 ***
## medianage -6.305e-02 1.027e-02 -6.138 2.55e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4774 on 310 degrees of freedom
## (105 observations deleted due to missingness)
## Multiple R-squared: 0.3468, Adjusted R-squared: 0.3383
## F-statistic: 41.14 on 4 and 310 DF, p-value: < 2.2e-16
An alternative strategy: Unemployment in 2004 can’t be affected by the surge in migration after 2004
# Uses urate2004 in place of urate2011, on the sub-sample with crimesPc < 15.
lm(
  crimesPc ~ b_migr11 + pop11 + medianage + urate2004,
  data = df %>% filter(crimesPc < 15)
) %>%
  summary()
##
## Call:
## lm(formula = crimesPc ~ b_migr11 + pop11 + medianage + urate2004,
## data = df %>% filter(crimesPc < 15))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9334 -0.3021 -0.0885 0.1659 3.1744
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.037e+00 5.105e-01 7.907 4.44e-14 ***
## b_migr11 2.124e-03 4.075e-03 0.521 0.60257
## pop11 -8.958e-07 2.911e-07 -3.077 0.00227 **
## medianage -6.787e-02 1.086e-02 -6.252 1.31e-09 ***
## urate2004 4.623e-02 1.825e-02 2.534 0.01178 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5011 on 316 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3226, Adjusted R-squared: 0.314
## F-statistic: 37.62 on 4 and 316 DF, p-value: < 2.2e-16
# Crime regression on b_migr11, urate2011 and medianage (pop11 left out).
lm(crimesPc ~ b_migr11 + urate2011 + medianage, data = df) %>% summary()
##
## Call:
## lm(formula = crimesPc ~ b_migr11 + urate2011 + medianage, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9690 -0.2528 -0.0703 0.1431 3.1876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.169745 0.461898 6.862 3.67e-11 ***
## b_migr11 0.005737 0.003931 1.459 0.145442
## urate2011 0.036038 0.009351 3.854 0.000141 ***
## medianage -0.053052 0.009886 -5.366 1.57e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4839 on 311 degrees of freedom
## (105 observations deleted due to missingness)
## Multiple R-squared: 0.3265, Adjusted R-squared: 0.32
## F-statistic: 50.26 on 3 and 311 DF, p-value: < 2.2e-16
# Same specification without pop11, but with urate2004 instead of urate2011.
lm(crimesPc ~ b_migr11 + urate2004 + medianage, data = df) %>% summary()
##
## Call:
## lm(formula = crimesPc ~ b_migr11 + urate2004 + medianage, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0193 -0.2801 -0.0955 0.1499 3.1917
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.520392 0.488534 7.206 4.23e-12 ***
## b_migr11 0.002952 0.004120 0.717 0.4742
## urate2004 0.037707 0.018277 2.063 0.0399 *
## medianage -0.058060 0.010515 -5.522 7.00e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5078 on 317 degrees of freedom
## (99 observations deleted due to missingness)
## Multiple R-squared: 0.3023, Adjusted R-squared: 0.2957
## F-statistic: 45.78 on 3 and 317 DF, p-value: < 2.2e-16
# Scatter plots of b_migr11 against each of the three control variables.
plot(x = df$pop11, y = df$b_migr11)
plot(x = df$urate2011, y = df$b_migr11)
plot(x = df$medianage, y = df$b_migr11)
#df[df$pop11>600000,"area"]
How about differentiating between different groups? Muslims get a lot of hate from English neo-Nazis.
# Add mus_sh as an extra regressor alongside b_migr11 and the other controls.
reg2 <- lm(
  formula = crimesPc ~ b_migr11 + mus_sh + pop11 + medianage + urate2011,
  data = df
)
reg2 %>% summary()
##
## Call:
## lm(formula = crimesPc ~ b_migr11 + mus_sh + pop11 + medianage +
## urate2011, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8951 -0.2700 -0.0780 0.1408 3.1797
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.729e+00 4.903e-01 7.606 3.44e-13 ***
## b_migr11 5.940e-03 3.965e-03 1.498 0.13509
## mus_sh -5.206e-03 8.458e-03 -0.615 0.53870
## pop11 -8.385e-07 2.830e-07 -2.963 0.00329 **
## medianage -6.419e-02 1.045e-02 -6.144 2.48e-09 ***
## urate2011 4.091e-02 9.408e-03 4.349 1.86e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4778 on 309 degrees of freedom
## (105 observations deleted due to missingness)
## Multiple R-squared: 0.3476, Adjusted R-squared: 0.337
## F-statistic: 32.92 on 5 and 309 DF, p-value: < 2.2e-16
library(ggplot2)
# Re-express education in days. This is an exact rescaling of educ, so the
# correlation matrix of the two columns below consists entirely of ones.
data <- mutate(data, educ_in_days = educ * 365)
cor(select(data, educ, educ_in_days))
## educ educ_in_days
## educ 1 1
## educ_in_days 1 1
# Plot the two education measures against each other.
ggplot(data = data, mapping = aes(x = educ, y = educ_in_days)) +
  geom_point()
# Include both educ and its rescaled copy: lm() cannot estimate a separate
# coefficient for educ_in_days and reports it as NA.
reg2 <- lm(formula = wage ~ female + educ + educ_in_days, data = data)
summary(reg2)
##
## Call:
## lm(formula = wage ~ female + educ + educ_in_days, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9890 -1.8702 -0.6651 1.0447 15.4998
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.62282 0.67253 0.926 0.355
## female -2.27336 0.27904 -8.147 2.76e-15 ***
## educ 0.50645 0.05039 10.051 < 2e-16 ***
## educ_in_days NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.186 on 523 degrees of freedom
## Multiple R-squared: 0.2588, Adjusted R-squared: 0.256
## F-statistic: 91.32 on 2 and 523 DF, p-value: < 2.2e-16
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Bring in a third CSV and keep only the areas present in both tables.
evenmore <- read.csv(
  file = "https://www.dropbox.com/s/pwotro2ghawkppg/foreign_evenmore.csv?dl=1"
)
df <- inner_join(df, evenmore, by = "area")
# Crime regression with the shxage* variables and meanage entered together.
# NOTE(review): the sample filter here is crimesPc < 150, whereas the earlier
# urate2004 model filters on crimesPc < 15 -- confirm which cut-off is intended.
rr <- lm(
  crimesPc ~ b_migr11 + urate2011 + pop11 +
    shxage0t17 + shxage18t29 + shxage30t44 + shxage45t64 + meanage,
  df %>% filter(crimesPc < 150)
)
summary(rr)
##
## Call:
## lm(formula = crimesPc ~ b_migr11 + urate2011 + pop11 + shxage0t17 +
## shxage18t29 + shxage30t44 + shxage45t64 + meanage, data = df %>%
## filter(crimesPc < 150))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9592 -0.2153 -0.0735 0.1329 3.1625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.089e+00 3.394e+01 0.238 0.81180
## b_migr11 1.646e-04 5.270e-03 0.031 0.97510
## urate2011 3.746e-02 9.443e-03 3.967 9.07e-05 ***
## pop11 -8.774e-07 2.736e-07 -3.206 0.00149 **
## shxage0t17 -6.445e-02 3.035e-01 -0.212 0.83198
## shxage18t29 -5.900e-03 2.483e-01 -0.024 0.98106
## shxage30t44 -2.058e-02 1.833e-01 -0.112 0.91064
## shxage45t64 -8.662e-02 1.187e-01 -0.730 0.46614
## meanage -6.790e-02 4.269e-01 -0.159 0.87372
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4645 on 306 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.3896, Adjusted R-squared: 0.3737
## F-statistic: 24.41 on 8 and 306 DF, p-value: < 2.2e-16
# Pairwise correlations among the shxage* variables and meanage, computed on
# the rows where all five are observed.
cor(
  x = select(df, shxage0t17, shxage18t29, shxage30t44, shxage45t64, meanage),
  use = "complete.obs"
)
## shxage0t17 shxage18t29 shxage30t44 shxage45t64 meanage
## shxage0t17 1.00000000 0.01257122 0.3281674 -0.2878871 -0.5427118
## shxage18t29 0.01257122 1.00000000 0.5810169 -0.9006728 -0.8061182
## shxage30t44 0.32816735 0.58101695 1.0000000 -0.7408842 -0.8229079
## shxage45t64 -0.28788711 -0.90067279 -0.7408842 1.0000000 0.8938519
## meanage -0.54271181 -0.80611820 -0.8229079 0.8938519 1.0000000
A joint hypothesis test could also be a good idea
library("car")
# Variance inflation factor of every regressor in rr.
vif(rr)
## b_migr11 urate2011 pop11 shxage0t17 shxage18t29 shxage30t44
## 4.493034 1.278986 1.346358 471.751245 1556.049097 348.477568
## shxage45t64 meanage
## 180.462107 2271.595826
Testing multiple restrictions at once; e.g. does age really not matter in the regression above?
library("car")
# Joint F-test that all five age-related coefficients in rr are zero.
linearHypothesis(
  model = rr,
  hypothesis.matrix = c(
    "shxage0t17 =0",
    "shxage18t29=0",
    "shxage30t44=0",
    "shxage45t64=0",
    "meanage=0"
  )
)
## Linear hypothesis test
##
## Hypothesis:
## shxage0t17 = 0
## shxage18t29 = 0
## shxage30t44 = 0
## shxage45t64 = 0
## meanage = 0
##
## Model 1: restricted model
## Model 2: crimesPc ~ b_migr11 + urate2011 + pop11 + shxage0t17 + shxage18t29 +
## shxage30t44 + shxage45t64 + meanage
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 311 79.228
## 2 306 66.011 5 13.217 12.254 7.689e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Reload the WAGE1 data, fit wage on educ and exper, then add a column holding
# the sum educ + exper and preview the result.
wage1 <- read.csv(
  file = "https://www.dropbox.com/s/9agc2vmamfztlel/WAGE1.csv?dl=1"
)
wage_educ_exper <- lm(formula = wage ~ educ + exper, data = wage1)
#< by transforming model
wage1$educPexper <- wage1$educ + wage1$exper
head(x = wage1)
## X wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1 1 3.10 11 2 0 0 1 0 2 1 0 0
## 2 2 3.24 12 22 2 0 1 1 3 1 0 0
## 3 3 3.00 11 2 0 0 0 0 2 0 0 0
## 4 4 6.00 8 44 28 0 0 1 0 1 0 0
## 5 5 5.30 12 7 2 0 0 1 1 0 0 0
## 6 6 8.75 16 9 8 0 0 1 0 1 0 0
## west construc ndurman trcommpu trade services profserv profocc clerocc
## 1 1 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 1 0 0 0
## 3 1 0 0 0 1 0 0 0 0
## 4 1 0 0 0 0 0 0 0 1
## 5 1 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 1 1 0
## servocc lwage expersq tenursq educPexper
## 1 0 1.131402 4 0 13
## 2 1 1.175573 484 4 34
## 3 0 1.098612 4 0 13
## 4 0 1.791759 1936 784 52
## 5 0 1.667707 49 4 19
## 6 0 2.169054 81 64 25
#>
# Reparameterised model: with educPexper = educ + exper as a regressor, the
# coefficient on exper equals (effect of exper) - (effect of educ).
summary(lm(formula = wage ~ educPexper + exper, data = wage1))
##
## Call:
## lm(formula = wage ~ educPexper + exper, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5532 -1.9801 -0.7071 1.2030 15.8370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.39054 0.76657 -4.423 1.18e-05 ***
## educPexper 0.64427 0.05381 11.974 < 2e-16 ***
## exper -0.57418 0.05159 -11.129 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.257 on 523 degrees of freedom
## Multiple R-squared: 0.2252, Adjusted R-squared: 0.2222
## F-statistic: 75.99 on 2 and 523 DF, p-value: < 2.2e-16
library("car")
# Joint F-test that the coefficients on educ and exper are both zero.
linearHypothesis(
  model = wage_educ_exper,
  hypothesis.matrix = c("educ =0", "exper=0")
)
## Linear hypothesis test
##
## Hypothesis:
## educ = 0
## exper = 0
##
## Model 1: restricted model
## Model 2: wage ~ educ + exper
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 525 7160.4
## 2 523 5548.2 2 1612.2 75.99 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1