Returns to education again

library(AER)
library(dplyr)
df=read.csv("https://www.dropbox.com/s/diecbkq03gfid0p/card1993.csv?dl=1") 
#from https://davidcard.berkeley.edu/data_sets.html

lm(wage76 ~ ed76, data = df) %>% summary()
## 
## Call:
## lm(formula = wage76 ~ ed76, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -623.6 -173.7  -33.3  128.3 1687.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  183.934     23.160   7.942 2.78e-15 ***
## ed76          29.566      1.712  17.274  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 251.7 on 3015 degrees of freedom
##   (596 observations deleted due to missingness)
## Multiple R-squared:  0.09006,    Adjusted R-squared:  0.08976 
## F-statistic: 298.4 on 1 and 3015 DF,  p-value: < 2.2e-16
library(ggplot2)
ggplot(data=df, aes(x=ed76,y=wage76)) + geom_point() + theme_minimal()
## Warning: Removed 596 rows containing missing values (geom_point).

#  library(ggplot2)
#  ggplot(data=CollegeDistance, aes(x=distance)) + geom_histogram() + theme_minimal()
 first=lm(ed76~nearc4a, data = df)   
 first %>% summary()
## 
## Call:
## lm(formula = ed76 ~ nearc4a, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.6575  -1.6575  -0.6575   2.3425   5.1935 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.80654    0.06343 201.911   <2e-16 ***
## nearc4a      0.85094    0.09041   9.412   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.717 on 3611 degrees of freedom
## Multiple R-squared:  0.02394,    Adjusted R-squared:  0.02367 
## F-statistic: 88.58 on 1 and 3611 DF,  p-value: < 2.2e-16
iv=ivreg(wage76 ~ ed76  | nearc4a, data=df)
iv %>% summary()
## 
## Call:
## ivreg(formula = wage76 ~ ed76 | nearc4a, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -941.253 -211.663   -6.304  204.517 1683.696 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -943.77     198.71  -4.749 2.13e-06 ***
## ed76          114.59      14.97   7.652 2.64e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 339.4 on 3015 degrees of freedom
## Multiple R-Squared: -0.6547, Adjusted R-squared: -0.6552 
## Wald test: 58.56 on 1 and 3015 DF,  p-value: 2.639e-14
iv=ivreg(wage76 ~ ed76+factor(region)+nearc4b+nearc2 | nearc4a+nearc4b+nearc2+factor(region)
         , data=df )
iv %>% summary()
## 
## Call:
## ivreg(formula = wage76 ~ ed76 + factor(region) + nearc4b + nearc2 | 
##     nearc4a + nearc4b + nearc2 + factor(region), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -853.100 -198.498   -6.864  187.265 1630.953 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -729.039    291.987  -2.497  0.01258 *  
## ed76              98.811     20.916   4.724 2.42e-06 ***
## factor(region)2 -126.976     40.111  -3.166  0.00156 ** 
## factor(region)3  -36.559     34.319  -1.065  0.28683    
## factor(region)4  -21.313     42.946  -0.496  0.61974    
## factor(region)5  -14.037     38.400  -0.366  0.71474    
## factor(region)6  -62.001     30.296  -2.047  0.04079 *  
## factor(region)7   17.031     24.201   0.704  0.48166    
## factor(region)8  -11.016     23.908  -0.461  0.64499    
## factor(region)9  -19.767     35.121  -0.563  0.57360    
## nearc4b           -8.513     15.013  -0.567  0.57075    
## nearc2            27.357     12.357   2.214  0.02691 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 311.4 on 3005 degrees of freedom
## Multiple R-Squared: -0.3884, Adjusted R-squared: -0.3935 
## Wald test: 17.66 on 11 and 3005 DF,  p-value: < 2.2e-16
 first=lm(ed76~nearc4a+factor(region)+nearc4b+nearc2, data = df)   
 first %>% summary()
## 
## Call:
## lm(formula = ed76 ~ nearc4a + factor(region) + nearc4b + nearc2, 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.3438  -1.6767  -0.2662   2.0324   5.9761 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     13.41923    0.18020  74.469  < 2e-16 ***
## nearc4a          0.66714    0.10785   6.186 6.88e-10 ***
## factor(region)2  0.25742    0.29739   0.866   0.3868    
## factor(region)3 -0.88954    0.20580  -4.322 1.58e-05 ***
## factor(region)4 -1.39531    0.21202  -6.581 5.35e-11 ***
## factor(region)5 -1.31319    0.18230  -7.204 7.11e-13 ***
## factor(region)6 -0.17377    0.22849  -0.761   0.4470    
## factor(region)7 -0.15300    0.18226  -0.839   0.4013    
## factor(region)8 -0.15192    0.18456  -0.823   0.4105    
## factor(region)9 -0.54352    0.25847  -2.103   0.0355 *  
## nearc4b          0.25475    0.13183   1.932   0.0534 .  
## nearc2           0.03420    0.09664   0.354   0.7234    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.662 on 3601 degrees of freedom
## Multiple R-squared:  0.06556,    Adjusted R-squared:  0.0627 
## F-statistic: 22.97 on 11 and 3601 DF,  p-value: < 2.2e-16
 linearHypothesis(first,c("nearc4a=0"))
## Linear hypothesis test
## 
## Hypothesis:
## nearc4a = 0
## 
## Model 1: restricted model
## Model 2: ed76 ~ nearc4a + factor(region) + nearc4b + nearc2
## 
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1   3602 25791                                  
## 2   3601 25520  1    271.15 38.261 6.882e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 reduced=lm(wage76~nearc4a+factor(region)+nearc4b+nearc2, data = df)   
 reduced %>% summary()
## 
## Call:
## lm(formula = wage76 ~ nearc4a + factor(region) + nearc4b + nearc2, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -631.80 -164.44  -37.61  120.43 1735.20 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      600.144     18.934  31.697  < 2e-16 ***
## nearc4a           65.301     11.192   5.835 5.97e-09 ***
## factor(region)2  -79.575     31.983  -2.488  0.01290 *  
## factor(region)3 -122.686     21.488  -5.709 1.24e-08 ***
## factor(region)4 -152.045     22.014  -6.907 6.03e-12 ***
## factor(region)5 -142.303     19.136  -7.436 1.34e-13 ***
## factor(region)6  -72.354     24.341  -2.973  0.00298 ** 
## factor(region)7   -1.002     19.112  -0.052  0.95819    
## factor(region)8  -25.757     19.240  -1.339  0.18075    
## factor(region)9  -75.947     26.576  -2.858  0.00430 ** 
## nearc4b           18.110     13.632   1.329  0.18409    
## nearc2            29.110      9.963   2.922  0.00351 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 252.2 on 3005 degrees of freedom
##   (596 observations deleted due to missingness)
## Multiple R-squared:  0.08978,    Adjusted R-squared:  0.08644 
## F-statistic: 26.94 on 11 and 3005 DF,  p-value: < 2.2e-16

The effect of subsidies on unemployment

  library(haven)
  library(AER)
  df=read.csv( "https://www.dropbox.com/s/8pdffaq268v7m8o/unempprep.csv?dl=1")
  #names(df)
  summary(df$DDDNGE)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.30000  0.00000  0.00000 -0.01505  0.00000  0.35000

Simple OLS

     regOLS=lm(  DDDln1Punemp ~ DDDNGE ,df)
   summary(regOLS)
## 
## Call:
## lm(formula = DDDln1Punemp ~ DDDNGE, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5451 -0.1907  0.0165  0.2109  2.8601 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) -0.462220   0.003648 -126.703  < 2e-16 ***
## DDDNGE      -0.221062   0.036012   -6.138 8.62e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3743 on 10762 degrees of freedom
## Multiple R-squared:  0.003489,   Adjusted R-squared:  0.003396 
## F-statistic: 37.68 on 1 and 10762 DF,  p-value: 8.624e-10

IV

List of historic deprivation controls

controls=c("gdp91","manufshare_1991","popdens_1981","current_unemprate1991","actrate_1991","resid_emp_rate92")
fff=paste(controls, collapse  ="+")
fff
## [1] "gdp91+manufshare_1991+popdens_1981+current_unemprate1991+actrate_1991+resid_emp_rate92"

Now run iv:

 fffiv  =paste0( "DDDln1Punemp ~ DDDNGE+     ",fff, "| DDDxnivav +",fff)
 fffiv
## [1] "DDDln1Punemp ~ DDDNGE+     gdp91+manufshare_1991+popdens_1981+current_unemprate1991+actrate_1991+resid_emp_rate92| DDDxnivav +gdp91+manufshare_1991+popdens_1981+current_unemprate1991+actrate_1991+resid_emp_rate92"
 summary(ivreg(  fffiv ,data=df))
## 
## Call:
## ivreg(formula = fffiv, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.52984 -0.18444  0.01339  0.20575  2.74181 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -0.7851275  0.0820543  -9.568  < 2e-16 ***
## DDDNGE                -0.4546888  0.1131435  -4.019 5.89e-05 ***
## gdp91                  0.0030923  0.0003235   9.559  < 2e-16 ***
## manufshare_1991        0.5714105  0.0481121  11.877  < 2e-16 ***
## popdens_1981           0.0004555  0.0004222   1.079    0.281    
## current_unemprate1991 -2.1341897  0.4972815  -4.292 1.79e-05 ***
## actrate_1991           0.4900480  0.0628717   7.794 7.07e-15 ***
## resid_emp_rate92      -0.3915193  0.0876794  -4.465 8.08e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3666 on 10756 degrees of freedom
## Multiple R-Squared: 0.04475, Adjusted R-squared: 0.04413 
## Wald test: 84.77 on 7 and 10756 DF,  p-value: < 2.2e-16

Check first stage

  ffffs  =paste0( "DDDNGE~DDDxnivav+     ",fff)
 
  regFS=lm(  ffffs ,df)
  summary(regFS)     
## 
## Call:
## lm(formula = ffffs, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38214 -0.01537  0.00344  0.02657  0.43048 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -2.210e-01  2.303e-02  -9.599  < 2e-16 ***
## DDDxnivav              1.015e+00  2.932e-02  34.627  < 2e-16 ***
## gdp91                 -4.575e-04  8.145e-05  -5.617 2.00e-08 ***
## manufshare_1991       -1.038e-01  1.146e-02  -9.059  < 2e-16 ***
## popdens_1981           1.969e-04  1.070e-04   1.839  0.06592 .  
## current_unemprate1991  1.061e+00  1.372e-01   7.732 1.16e-14 ***
## actrate_1991          -4.786e-02  1.592e-02  -3.007  0.00265 ** 
## resid_emp_rate92       3.674e-01  2.482e-02  14.803  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09356 on 10756 degrees of freedom
## Multiple R-squared:  0.1284, Adjusted R-squared:  0.1278 
## F-statistic: 226.3 on 7 and 10756 DF,  p-value: < 2.2e-16
  linearHypothesis(regFS,"DDDxnivav=0")
## Linear hypothesis test
## 
## Hypothesis:
## DDDxnivav = 0
## 
## Model 1: restricted model
## Model 2: DDDNGE ~ DDDxnivav + gdp91 + manufshare_1991 + popdens_1981 + 
##     current_unemprate1991 + actrate_1991 + resid_emp_rate92
## 
##   Res.Df     RSS Df Sum of Sq    F    Pr(>F)    
## 1  10757 104.652                                
## 2  10756  94.156  1    10.496 1199 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Reduced Form:

  fffrf  =paste0( "DDDln1Punemp~DDDxnivav+     ",fff)
  summary(lm(  fffrf ,df))
## 
## Call:
## lm(formula = fffrf, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5300 -0.1834  0.0138  0.2023  2.7342 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -0.6846189  0.0898471  -7.620 2.75e-14 ***
## DDDxnivav             -0.4616909  0.1144068  -4.036 5.49e-05 ***
## gdp91                  0.0033003  0.0003178  10.386  < 2e-16 ***
## manufshare_1991        0.6185964  0.0446947  13.840  < 2e-16 ***
## popdens_1981           0.0003659  0.0004176   0.876    0.381    
## current_unemprate1991 -2.6164480  0.5352012  -4.889 1.03e-06 ***
## actrate_1991           0.5118111  0.0621070   8.241  < 2e-16 ***
## resid_emp_rate92      -0.5585603  0.0968272  -5.769 8.21e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.365 on 10756 degrees of freedom
## Multiple R-squared:  0.0527, Adjusted R-squared:  0.05209 
## F-statistic: 85.49 on 7 and 10756 DF,  p-value: < 2.2e-16