Specific requirements

In this programming project, complete the following steps to implement one of the classification techniques and classify a data set:

Data frame information

Data set name:
Salaries for Professors
Description:
The 2008-09 nine-month academic salary for Assistant Professors, Associate Professors and Professors in a college in the U.S. The data were collected as part of the on-going effort of the college’s administration to monitor salary differences between male and female faculty members.
Usage:
Salaries
Format:
A data frame with 397 observations on the following 6 variables.
rank
          a factor with levels AssocProf, AsstProf, Prof.
discipline
          a factor with levels A (“theoretical” departments) or B (“applied” departments).
yrs.since.phd
          years since PhD.
yrs.service
          years of service.
sex
          a factor with levels Female, Male.
salary
          nine-month salary, in dollars.
References:
Fox J. and Weisberg, S. (2011) An R Companion to Applied Regression, Second Edition Sage.
Data download page:
http://vincentarelbundock.github.io/Rdatasets/datasets.html

Set knitr chunk options for integrating R code and its output into the Markdown document

knitr::opts_chunk$set(echo = TRUE)

Import data

library(readr)
input_data <- read_csv("Salaries.csv", col_types = cols(X1 = col_skip(), 
    discipline = col_character(), rank = col_character(), 
    salary = col_number(), sex = col_character(), 
    yrs.service = col_number(), yrs.since.phd = col_number()))
## Warning: Missing column names filled in: 'X1' [1]

Descriptive statistics

Dim

dim(input_data)
## [1] 397   6

Str

str(input_data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 397 obs. of  6 variables:
##  $ rank         : chr  "Prof" "Prof" "AsstProf" "Prof" ...
##  $ discipline   : chr  "B" "B" "B" "B" ...
##  $ yrs.since.phd: num  19 20 4 45 40 6 30 45 21 18 ...
##  $ yrs.service  : num  18 16 3 39 41 6 23 45 20 18 ...
##  $ sex          : chr  "Male" "Male" "Male" "Male" ...
##  $ salary       : num  139750 173200 79750 115000 141500 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_skip(),
##   ..   rank = col_character(),
##   ..   discipline = col_character(),
##   ..   yrs.since.phd = col_number(),
##   ..   yrs.service = col_number(),
##   ..   sex = col_character(),
##   ..   salary = col_number()
##   .. )

Summary

summary(input_data)
##      rank            discipline        yrs.since.phd    yrs.service   
##  Length:397         Length:397         Min.   : 1.00   Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:12.00   1st Qu.: 7.00  
##  Mode  :character   Mode  :character   Median :21.00   Median :16.00  
##                                        Mean   :22.31   Mean   :17.61  
##                                        3rd Qu.:32.00   3rd Qu.:27.00  
##                                        Max.   :56.00   Max.   :60.00  
##      sex                salary      
##  Length:397         Min.   : 57800  
##  Class :character   1st Qu.: 91000  
##  Mode  :character   Median :107300  
##                     Mean   :113706  
##                     3rd Qu.:134185  
##                     Max.   :231545

Tail

tail(input_data)
## # A tibble: 6 x 6
##   rank     discipline yrs.since.phd yrs.service sex   salary
##   <chr>    <chr>              <dbl>       <dbl> <chr>  <dbl>
## 1 Prof     A                     30          19 Male  151292
## 2 Prof     A                     33          30 Male  103106
## 3 Prof     A                     31          19 Male  150564
## 4 Prof     A                     42          25 Male  101738
## 5 Prof     A                     25          15 Male   95329
## 6 AsstProf A                      8           4 Male   81035

Sapply mode

sapply(input_data,mode)
##          rank    discipline yrs.since.phd   yrs.service           sex 
##   "character"   "character"     "numeric"     "numeric"   "character" 
##        salary 
##     "numeric"

Lapply mean

lapply(input_data[,c(3,4,6)],mean)
## $yrs.since.phd
## [1] 22.31486
## 
## $yrs.service
## [1] 17.61461
## 
## $salary
## [1] 113706.5

Lapply median

lapply(input_data[,c(3,4,6)],median)
## $yrs.since.phd
## [1] 21
## 
## $yrs.service
## [1] 16
## 
## $salary
## [1] 107300

Lapply mode

require(modeest)
## Loading required package: modeest
lapply(input_data[,c(3,4,6)],mfv)
## $yrs.since.phd
## [1] 4
## 
## $yrs.service
## [1] 3
## 
## $salary
## [1] 92000

Lapply min

lapply(input_data[,c(3,4,6)],min)
## $yrs.since.phd
## [1] 1
## 
## $yrs.service
## [1] 0
## 
## $salary
## [1] 57800

Lapply max

lapply(input_data[,c(3,4,6)],max)
## $yrs.since.phd
## [1] 56
## 
## $yrs.service
## [1] 60
## 
## $salary
## [1] 231545

Lapply range

lapply(input_data[,c(3,4,6)],range)
## $yrs.since.phd
## [1]  1 56
## 
## $yrs.service
## [1]  0 60
## 
## $salary
## [1]  57800 231545

Lapply variance

lapply(input_data[,c(3,4,6)],var)
## $yrs.since.phd
## [1] 166.0749
## 
## $yrs.service
## [1] 169.1567
## 
## $salary
## [1] 917425865

Lapply standard deviation

lapply(input_data[,c(3,4,6)],sd)
## $yrs.since.phd
## [1] 12.887
## 
## $yrs.service
## [1] 13.00602
## 
## $salary
## [1] 30289.04

Lapply median absolute deviation

lapply(input_data[,c(3,4,6)],mad)
## $yrs.since.phd
## [1] 14.826
## 
## $yrs.service
## [1] 14.826
## 
## $salary
## [1] 29355.48

Plots and Graphs

Box plots

Explanation:

Each box plot shows the median, the interquartile range, and the minimum and maximum values of a variable within each group, along with any outliers.

Observation:

From the box plots, it appears that a professor’s sex, rank, and discipline are each associated with clear differences in the median, maximum, and minimum salaries. It also appears that female professors have, on the whole, fewer years of service than male professors.

library(ggplot2)

# Salary and experience distributions, grouped by sex
ggplot(input_data, aes(x = sex, y = salary, fill = sex)) + geom_boxplot() + xlab("Sex") + ylab("Salary") + ggtitle("Salary by Sex")

ggplot(input_data, aes(x = sex, y = yrs.since.phd, fill = sex)) + geom_boxplot() + xlab("Sex") + ylab("Years Since PhD") + ggtitle("Years Since PhD by Sex")

ggplot(input_data, aes(x = sex, y = yrs.service, fill = sex)) + geom_boxplot() + xlab("Sex") + ylab("Years of Service") + ggtitle("Years of Service by Sex")

# Grouped by rank
ggplot(input_data, aes(x = rank, y = salary, fill = rank)) + geom_boxplot() + xlab("Rank") + ylab("Salary") + ggtitle("Salary by Rank")

ggplot(input_data, aes(x = rank, y = yrs.since.phd, fill = rank)) + geom_boxplot() + xlab("Rank") + ylab("Years Since PhD") + ggtitle("Years Since PhD by Rank")

ggplot(input_data, aes(x = rank, y = yrs.service, fill = rank)) + geom_boxplot() + xlab("Rank") + ylab("Years of Service") + ggtitle("Years of Service by Rank")

# Grouped by discipline
ggplot(input_data, aes(x = discipline, y = salary, fill = discipline)) + geom_boxplot() + xlab("Discipline") + ylab("Salary") + ggtitle("Salary by Discipline")

ggplot(input_data, aes(x = discipline, y = yrs.since.phd, fill = discipline)) + geom_boxplot() + xlab("Discipline") + ylab("Years Since PhD") + ggtitle("Years Since PhD by Discipline")

ggplot(input_data, aes(x = discipline, y = yrs.service, fill = discipline)) + geom_boxplot() + xlab("Discipline") + ylab("Years of Service") + ggtitle("Years of Service by Discipline")
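
To back the visual impressions with numbers, the group medians behind these box plots can be computed directly; a minimal sketch using base R’s aggregate():

# Median salary by sex, rank, and discipline, and median years of service by sex
aggregate(salary ~ sex, data = input_data, FUN = median)
aggregate(salary ~ rank, data = input_data, FUN = median)
aggregate(salary ~ discipline, data = input_data, FUN = median)
aggregate(yrs.service ~ sex, data = input_data, FUN = median)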

Column graphs

Explanation:

These column graphs show the relationships between the dependent variable (salary) and the independent variables.

Observation:

From the column graphs, it appears that Years Since PhD has a stronger impact on Salary than Years of Service. It also appears that rank and discipline individually have a stronger impact on salary than sex does.

library(ggplot2)

# geom_col() already applies stat = "identity"; passing stat explicitly only triggers a warning
ggplot(input_data, aes(x = yrs.since.phd, y = salary, fill = sex)) + geom_col(position = "dodge") + xlab("Years Since PhD") + ylab("Salary") + ggtitle("Salary Vs. Years Since PhD")

ggplot(input_data, aes(x = yrs.service, y = salary, fill = sex)) + geom_col(position = "dodge") + xlab("Years of Service") + ylab("Salary") + ggtitle("Salary Vs. Years of Service")

ggplot(input_data, aes(x = yrs.since.phd, y = salary, fill = rank)) + geom_col(position = "dodge") + xlab("Years Since PhD") + ylab("Salary") + ggtitle("Salary Vs. Years Since PhD")

ggplot(input_data, aes(x = yrs.service, y = salary, fill = rank)) + geom_col(position = "dodge") + xlab("Years of Service") + ylab("Salary") + ggtitle("Salary Vs. Years of Service")

ggplot(input_data, aes(x = yrs.since.phd, y = salary, fill = discipline)) + geom_col(position = "dodge") + xlab("Years Since PhD") + ylab("Salary") + ggtitle("Salary Vs. Years Since PhD")

ggplot(input_data, aes(x = yrs.service, y = salary, fill = discipline)) + geom_col(position = "dodge") + xlab("Years of Service") + ylab("Salary") + ggtitle("Salary Vs. Years of Service")

Regression Model Fitting

Correlation Statistics

Explanation:

The correlation statistics reveal the degree of associations between variables in the data set.

Observation:

Of the two experience variables, “Years Since PhD” has the stronger correlation with Salary. The very strong correlation coefficient of roughly 0.91 between “Years Since PhD” and “Years of Service” suggests the existence of multicollinearity. Regarding the pairwise p-values, the correlation between “Years Since PhD” and “Years of Service” is the most significant, while that between “Years of Service” and Salary is the least significant, which is consistent with the plots and graphs above.

Coefficient R:

cor(input_data[,c(3,4,6)])
##               yrs.since.phd yrs.service    salary
## yrs.since.phd     1.0000000   0.9096491 0.4192311
## yrs.service       0.9096491   1.0000000 0.3347447
## salary            0.4192311   0.3347447 1.0000000

P-Value:

# cor.test() comes from base R's stats package; with() avoids attach() and its masking warnings
with(input_data, cor.test(salary, yrs.since.phd))
## 
##  Pearson's product-moment correlation
## 
## data:  salary and yrs.since.phd
## t = 9.1775, df = 395, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3346160 0.4971402
## sample estimates:
##       cor 
## 0.4192311
with(input_data, cor.test(salary, yrs.service))
## 
##  Pearson's product-moment correlation
## 
## data:  salary and yrs.service
## t = 7.0602, df = 395, p-value = 7.529e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2443740 0.4193506
## sample estimates:
##       cor 
## 0.3347447
with(input_data, cor.test(yrs.since.phd, yrs.service))
## 
##  Pearson's product-moment correlation
## 
## data:  yrs.since.phd and yrs.service
## t = 43.524, df = 395, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8909977 0.9252353
## sample estimates:
##       cor 
## 0.9096491
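
As a more compact alternative to the pairwise cor.test() calls above, the rcorr() function from the Hmisc package returns the full correlation matrix together with the matrix of pairwise p-values in one call; a sketch:

library(Hmisc)
# rcorr() expects a numeric matrix; it returns r (correlations), n, and P (p-values)
rcorr(as.matrix(input_data[,c(3,4,6)]))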

Categorical Dummy Variable Conversion

Explanation:

The variables rank, discipline, and sex are categorical independent variables. For linear regression to work correctly, all categorical independent variables must be encoded as numeric dummy (indicator) variables. The good news is that R makes these conversions automatically; however, the variables must first be converted into factors. (A sketch of the resulting dummy coding appears after the str() output below.)

Converting categorical variables into factors:

input_data$rank = as.factor(input_data$rank)
input_data$discipline = as.factor(input_data$discipline)
input_data$sex = as.factor(input_data$sex)

After converting categorical variables into factors:

str(input_data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 397 obs. of  6 variables:
##  $ rank         : Factor w/ 3 levels "AssocProf","AsstProf",..: 3 3 2 3 3 1 3 3 3 3 ...
##  $ discipline   : Factor w/ 2 levels "A","B": 2 2 2 2 2 2 2 2 2 2 ...
##  $ yrs.since.phd: num  19 20 4 45 40 6 30 45 21 18 ...
##  $ yrs.service  : num  18 16 3 39 41 6 23 45 20 18 ...
##  $ sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 1 ...
##  $ salary       : num  139750 173200 79750 115000 141500 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_skip(),
##   ..   rank = col_character(),
##   ..   discipline = col_character(),
##   ..   yrs.since.phd = col_number(),
##   ..   yrs.service = col_number(),
##   ..   sex = col_character(),
##   ..   salary = col_number()
##   .. )
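
To see the dummy coding that R will generate behind the scenes, the design matrix can be inspected directly with base R’s model.matrix(); a minimal sketch:

# Each factor with k levels becomes k - 1 indicator columns; the first level is the baseline
head(model.matrix(salary ~ rank + discipline + sex, data = input_data))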

1st Stepwise Regression Model Iteration

Explanation:

This is the first stepwise iteration of the regression model. It reveals model statistics such as the R-squared value of the regression and the p-value of each independent variable. The R-squared value indicates the proportion of the variance in the dependent variable that the regression explains. The p-value of each independent variable indicates the significance of its association with the dependent variable while holding the other variables constant.

Observation:

The multiple R-squared value of 0.4547 means the regression explains about 45% of the variance in salary, which, on a scale from 0 to 1, indicates only a moderate fit. Given a significance cut-off of 0.05, yrs.since.phd (p = 0.027) and yrs.service (p = 0.021) are only marginally significant, while sex (p = 0.216) is not significant. Also, given a cut-off of 5, the variance inflation factors of 7.52 for yrs.since.phd and 5.92 for yrs.service indicate strong multicollinearity. These three variables are therefore considered for removal in the next stepwise iteration.

Stepwise:

rg_output = lm(salary~., data=input_data)
summary(rg_output)
## 
## Call:
## lm(formula = salary ~ ., data = input_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65248 -13211  -1775  10384  99592 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    78862.8     4990.3  15.803  < 2e-16 ***
## rankAsstProf  -12907.6     4145.3  -3.114  0.00198 ** 
## rankProf       32158.4     3540.6   9.083  < 2e-16 ***
## disciplineB    14417.6     2342.9   6.154 1.88e-09 ***
## yrs.since.phd    535.1      241.0   2.220  0.02698 *  
## yrs.service     -489.5      211.9  -2.310  0.02143 *  
## sexMale         4783.5     3858.7   1.240  0.21584    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared:  0.4547, Adjusted R-squared:  0.4463 
## F-statistic:  54.2 on 6 and 390 DF,  p-value: < 2.2e-16
anova(rg_output)
## Analysis of Variance Table
## 
## Response: salary
##                Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## rank            2 1.4323e+11 7.1616e+10 140.9788 < 2.2e-16 ***
## discipline      1 1.8430e+10 1.8430e+10  36.2801 3.954e-09 ***
## yrs.since.phd   1 1.6565e+08 1.6565e+08   0.3261   0.56830    
## yrs.service     1 2.5763e+09 2.5763e+09   5.0715   0.02488 *  
## sex             1 7.8068e+08 7.8068e+08   1.5368   0.21584    
## Residuals     390 1.9812e+11 5.0799e+08                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Multicollinearity and Stepwise:

library(car)
## Loading required package: carData
# Variance Inflation Factors (>5?)
vif(rg_output)
##                   GVIF Df GVIF^(1/(2*Df))
## rank          2.013193  2        1.191163
## discipline    1.064105  1        1.031555
## yrs.since.phd 7.518936  1        2.742068
## yrs.service   5.923038  1        2.433729
## sex           1.030805  1        1.015285
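
For comparison with the manual removal performed in the next iteration, base R’s step() function automates stepwise selection; note that it ranks models by AIC rather than by p-values or variance inflation factors, so it may retain a different subset of predictors. A sketch:

# Backward elimination by AIC, starting from the full model
step(rg_output, direction = "backward")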

2nd Stepwise Regression Model Iteration

Explanation:

This is the second stepwise iteration of the regression model. This step reveals the best-fitting model.

Observation:

The independent variables yrs.since.phd, yrs.service and sex are removed in this iteration. As a result, each of the remaining independent variables has a p-value that is effectively zero, indicating a highly significant association with the dependent variable. Also, no remaining independent variable has a variance inflation factor greater than 5, indicating no multicollinearity. Finally, since the model’s overall p-value is below the significance cut-off point of 0.05, the fitted model as a whole is statistically significant.
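
A minimal sketch of the reduced model described above, assuming the second iteration refits with only the retained predictors rank and discipline:

# Refit using only the predictors kept after the first iteration (assumed call)
rg_output2 = lm(salary ~ rank + discipline, data = input_data)
summary(rg_output2)
anova(rg_output2)

library(car)
# Variance Inflation Factors for the reduced model (>5?)
vif(rg_output2)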