ISLR Home

Question

p299

The Wage data set contains a number of other features not explored in this chapter, such as marital status (maritl), job class (jobclass), and others. Explore the relationships between some of these other predictors and wage, and use non-linear fitting techniques in order to fit flexible models to the data. Create plots of the results obtained, and write a summary of your findings.


7a

library(tidyverse) 
library(ISLR)
attach(Wage)
glimpse(Wage)
## Rows: 3,000
## Columns: 11
## $ year       <int> 2006, 2004, 2003, 2003, 2005, 2008, 2009, 2008, 2006, 2004…
## $ age        <int> 18, 24, 45, 43, 50, 54, 44, 30, 41, 52, 45, 34, 35, 39, 54…
## $ maritl     <fct> 1. Never Married, 1. Never Married, 2. Married, 2. Married…
## $ race       <fct> 1. White, 1. White, 1. White, 3. Asian, 1. White, 1. White…
## $ education  <fct> 1. < HS Grad, 4. College Grad, 3. Some College, 4. College…
## $ region     <fct> 2. Middle Atlantic, 2. Middle Atlantic, 2. Middle Atlantic…
## $ jobclass   <fct> 1. Industrial, 2. Information, 1. Industrial, 2. Informati…
## $ health     <fct> 1. <=Good, 2. >=Very Good, 1. <=Good, 2. >=Very Good, 1. <…
## $ health_ins <fct> 2. No, 2. No, 1. Yes, 1. Yes, 1. Yes, 1. Yes, 1. Yes, 1. Y…
## $ logwage    <dbl> 4.318063, 4.255273, 4.875061, 5.041393, 4.318063, 4.845098…
## $ wage       <dbl> 75.04315, 70.47602, 130.98218, 154.68529, 75.04315, 127.11…
library(skimr)
skim(Wage)
Data summary
Name Wage
Number of rows 3000
Number of columns 11
_______________________
Column type frequency:
factor 7
numeric 4
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
maritl 0 1 FALSE 5 2. : 2074, 1. : 648, 4. : 204, 5. : 55
race 0 1 FALSE 4 1. : 2480, 2. : 293, 3. : 190, 4. : 37
education 0 1 FALSE 5 2. : 971, 4. : 685, 3. : 650, 5. : 426
region 0 1 FALSE 1 2. : 3000, 1. : 0, 3. : 0, 4. : 0
jobclass 0 1 FALSE 2 1. : 1544, 2. : 1456
health 0 1 FALSE 2 2. : 2142, 1. : 858
health_ins 0 1 FALSE 2 1. : 2083, 2. : 917

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2005.79 2.03 2003.00 2004.00 2006.00 2008.00 2009.00 ▇▃▃▃▆
age 0 1 42.41 11.54 18.00 33.75 42.00 51.00 80.00 ▃▇▇▃▁
logwage 0 1 4.65 0.35 3.00 4.45 4.65 4.86 5.76 ▁▁▇▇▁
wage 0 1 111.70 41.73 20.09 85.38 104.92 128.68 318.34 ▂▇▂▁▁

plot(jobclass,wage) 

plot(maritl,wage)

# jobclass.fit = lm(wage\~I(jobclass), data = Wage)

Wage vs Marital Status

g1 <- ggplot(Wage, aes(x = maritl, y = wage, fill = maritl)) + geom_boxplot() + theme(legend.position = "none")
g1

Married has the highest average salary.

Wage vs Job Class

g2 <- ggplot(Wage, aes(x = jobclass, y = wage, fill = jobclass)) + geom_boxplot() + theme(legend.position = "none")
g2

require(gridExtra)
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
#grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, ncol = 2)
grid.arrange(g1, g2, ncol = 2)

Wage vs Race

ggplot(Wage, aes(x = race, y = wage, fill = race)) + geom_boxplot() + theme(legend.position = "none")

Wage vs Education

ggplot(Wage, aes(x = education, y = wage, fill = education)) + geom_boxplot() + theme(legend.position = "none")

Wage vs Job Class

ggplot(Wage, aes(x = health_ins, y = wage, fill = health_ins)) + geom_boxplot() + theme(legend.position = "none")

Wage vs Region

ggplot(Wage, aes(x = region, y = wage, fill = region)) + geom_boxplot() + theme(legend.position = "none")

Wage vs Age

g3 <- ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.5) +
  geom_smooth()
g3
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Just a line. Not points.

g3 <- ggplot(Wage, aes(x = age, y = wage)) +
  # geom_point(alpha = 0.5) +
    geom_smooth()

g3
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Misc

# jobclass.fit = lm(wage \~ jobclass + cut(age,4), data = Wage)
# plot(jobclass.fit) 
# summary(jobclass.fit) 

#anova(fit1,fit2,fit3)
#cut(Wage\$jobclass, 4)

Model Summary

model <- lm(wage ~ poly(age, 3, raw = T) + cut(year, breaks = 2002:2009) + maritl + race + education + jobclass + health + health_ins, data = Wage)
summary(model)
## 
## Call:
## lm(formula = wage ~ poly(age, 3, raw = T) + cut(year, breaks = 2002:2009) + 
##     maritl + race + education + jobclass + health + health_ins, 
##     data = Wage)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -101.439  -18.601   -3.308   13.816  209.106 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               3.247e+00  1.973e+01   0.165 0.869297
## poly(age, 3, raw = T)1                    3.589e+00  1.409e+00   2.546 0.010931
## poly(age, 3, raw = T)2                   -4.877e-02  3.192e-02  -1.528 0.126684
## poly(age, 3, raw = T)3                    1.629e-04  2.323e-04   0.701 0.483087
## cut(year, breaks = 2002:2009)(2003,2004]  2.596e+00  2.146e+00   1.209 0.226587
## cut(year, breaks = 2002:2009)(2004,2005]  3.120e+00  2.191e+00   1.424 0.154457
## cut(year, breaks = 2002:2009)(2005,2006]  7.621e+00  2.271e+00   3.356 0.000801
## cut(year, breaks = 2002:2009)(2006,2007]  5.199e+00  2.282e+00   2.278 0.022796
## cut(year, breaks = 2002:2009)(2007,2008]  6.409e+00  2.275e+00   2.817 0.004884
## cut(year, breaks = 2002:2009)(2008,2009]  8.390e+00  2.276e+00   3.686 0.000232
## maritl2. Married                          1.338e+01  1.813e+00   7.382 2.02e-13
## maritl3. Widowed                          4.343e-01  7.970e+00   0.054 0.956547
## maritl4. Divorced                         1.464e-01  2.930e+00   0.050 0.960159
## maritl5. Separated                        7.147e+00  4.860e+00   1.471 0.141523
## race2. Black                             -4.788e+00  2.134e+00  -2.244 0.024904
## race3. Asian                             -2.686e+00  2.589e+00  -1.038 0.299564
## race4. Other                             -5.402e+00  5.633e+00  -0.959 0.337673
## education2. HS Grad                       7.549e+00  2.356e+00   3.205 0.001366
## education3. Some College                  1.798e+01  2.506e+00   7.175 9.08e-13
## education4. College Grad                  3.062e+01  2.535e+00  12.079  < 2e-16
## education5. Advanced Degree               5.313e+01  2.796e+00  19.001  < 2e-16
## jobclass2. Information                    3.453e+00  1.319e+00   2.618 0.008877
## health2. >=Very Good                      6.307e+00  1.413e+00   4.464 8.35e-06
## health_ins2. No                          -1.634e+01  1.405e+00 -11.630  < 2e-16
##                                             
## (Intercept)                                 
## poly(age, 3, raw = T)1                   *  
## poly(age, 3, raw = T)2                      
## poly(age, 3, raw = T)3                      
## cut(year, breaks = 2002:2009)(2003,2004]    
## cut(year, breaks = 2002:2009)(2004,2005]    
## cut(year, breaks = 2002:2009)(2005,2006] ***
## cut(year, breaks = 2002:2009)(2006,2007] *  
## cut(year, breaks = 2002:2009)(2007,2008] ** 
## cut(year, breaks = 2002:2009)(2008,2009] ***
## maritl2. Married                         ***
## maritl3. Widowed                            
## maritl4. Divorced                           
## maritl5. Separated                          
## race2. Black                             *  
## race3. Asian                                
## race4. Other                                
## education2. HS Grad                      ** 
## education3. Some College                 ***
## education4. College Grad                 ***
## education5. Advanced Degree              ***
## jobclass2. Information                   ** 
## health2. >=Very Good                     ***
## health_ins2. No                          ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33.77 on 2976 degrees of freedom
## Multiple R-squared:  0.3503, Adjusted R-squared:  0.3452 
## F-statistic: 69.75 on 23 and 2976 DF,  p-value: < 2.2e-16