Content
The Boston data frame has 506 rows and 14 columns.
Each record in the database describes a Boston suburb or town. The data was drawn from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970.
The Boston data frame has 506 rows and 14 columns.
The attributes are defined as follows (taken from the UCI Machine Learning Repository)
crim
per capita crime rate by town.
zn
proportion of residential land zoned for lots over 25,000 sq.ft.
indus
proportion of non-retail business acres per town.
chas
Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
nox
nitrogen oxides concentration (parts per 10 million).
rm
average number of rooms per dwelling.
age
proportion of owner-occupied units built prior to 1940.
dis
weighted mean of distances to five Boston employment centres.
rad
index of accessibility to radial highways.
tax
full-value property-tax rate per $10,000.
ptratio
pupil-teacher ratio by town.
black
1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
lstat
lower status of the population (percent).
medv
median value of owner-occupied homes in $1000s.
https://www.kaggle.com/vikrishnan/boston-house-prices
Harrison, D. and Rubinfeld, D.L. (1978) Hedonic prices and the demand for clean air. J. Environ. Economics and Management 5, 81–102.
Belsley D.A., Kuh, E. and Welsch, R.E. (1980) Regression Diagnostics. Identifying Influential Data and Sources of Collinearity. New York: Wiley.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 543348 29.1 1241757 66.4 621331 33.2
## Vcells 1026351 7.9 8388608 64.0 1600889 12.3
# Session setup for the report.
# NOTE(review): rm(list = ls()) removes every object from the calling
# environment; confirm this is intended if the document is ever run in a
# shared interactive session.
rm(list = ls())
# Record when the run started (presumably to report elapsed time later; the
# matching end-of-run code is not visible in this section).
start_time <- Sys.time()
# Echo the source of every knitr chunk in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
library(easypackages)
libraries("caret","caretEnsemble","caTools","class","cluster","data.tree","devtools","doSNOW","dplyr","e1071","factoextra","gbm","FNN","FSelector","ggalt","ggforce","ggfortify","ggplot2","gmodels","klaR","lattice","mlbench","modeest","nnet","neuralnet","outliers","parallel","psych","purrr","readr","rpart","rpart.plot","spatialEco","stats","tidyr","randomForest","ROSE","rsample","ROCR","pROC","glmnet")
# Read the Boston housing CSV with an explicit type for every column.
# X1 (presumably the unnamed row-index column) is skipped, chas is read as
# character, and every other column is parsed as a number.
# Warnings are silenced only while the file is read; the previous warning
# level is restored afterwards so later chunks do not inherit warn = -1.
oldw <- getOption("warn")
options(warn = -1)
library(readr)
input_data <- read_csv(
  "Boston.csv",
  col_types = cols(
    X1 = col_skip(),
    age = col_number(), black = col_number(),
    chas = col_character(), crim = col_number(),
    dis = col_number(), indus = col_number(),
    lstat = col_number(), medv = col_number(),
    nox = col_number(), ptratio = col_number(),
    rad = col_number(), rm = col_number(),
    tax = col_number(), zn = col_number()
  )
)
options(warn = oldw)
# Column names of input_data, split by storage type: numeric columns go into
# num.names and character columns into ch.names.
num.names <- names(input_data)[vapply(input_data, is.numeric, logical(1))]
ch.names <- names(input_data)[vapply(input_data, is.character, logical(1))]
dim(input_data)
## [1] 506 14
str(input_data)
## tibble [506 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ crim : num [1:506] 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num [1:506] 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num [1:506] 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : chr [1:506] "0" "0" "0" "0" ...
## $ nox : num [1:506] 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num [1:506] 6.58 6.42 7.18 7 7.15 ...
## $ age : num [1:506] 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num [1:506] 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num [1:506] 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num [1:506] 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num [1:506] 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num [1:506] 397 397 393 395 397 ...
## $ lstat : num [1:506] 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num [1:506] 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_skip(),
## .. crim = col_number(),
## .. zn = col_number(),
## .. indus = col_number(),
## .. chas = col_character(),
## .. nox = col_number(),
## .. rm = col_number(),
## .. age = col_number(),
## .. dis = col_number(),
## .. rad = col_number(),
## .. tax = col_number(),
## .. ptratio = col_number(),
## .. black = col_number(),
## .. lstat = col_number(),
## .. medv = col_number()
## .. )
summary(input_data)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Length:506
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 Class :character
## Median : 0.25651 Median : 0.00 Median : 9.69 Mode :character
## Mean : 3.61352 Mean : 11.36 Mean :11.14
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10
## Max. :88.97620 Max. :100.00 Max. :27.74
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
glimpse(input_data)
## Rows: 506
## Columns: 14
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.088...
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5...
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87,...
## $ chas <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",...
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.5...
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.6...
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9...
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9...
## $ rad <dbl> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4,...
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311,...
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2,...
## $ black <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60, 396...
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17...
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9,...
head(input_data)
## # A tibble: 6 x 14
## crim zn indus chas nox rm age dis rad tax ptratio black
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296 15.3 397.
## 2 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242 17.8 397.
## 3 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242 17.8 393.
## 4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222 18.7 395.
## 5 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222 18.7 397.
## 6 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222 18.7 394.
## # ... with 2 more variables: lstat <dbl>, medv <dbl>
tail(input_data)
## # A tibble: 6 x 14
## crim zn indus chas nox rm age dis rad tax ptratio black
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.224 0 9.69 0 0.585 6.03 79.7 2.50 6 391 19.2 397.
## 2 0.0626 0 11.9 0 0.573 6.59 69.1 2.48 1 273 21 392.
## 3 0.0453 0 11.9 0 0.573 6.12 76.7 2.29 1 273 21 397.
## 4 0.0608 0 11.9 0 0.573 6.98 91 2.17 1 273 21 397.
## 5 0.110 0 11.9 0 0.573 6.79 89.3 2.39 1 273 21 393.
## 6 0.0474 0 11.9 0 0.573 6.03 80.8 2.50 1 273 21 397.
## # ... with 2 more variables: lstat <dbl>, medv <dbl>
sapply(input_data,mode)
## crim zn indus chas nox rm
## "numeric" "numeric" "numeric" "character" "numeric" "numeric"
## age dis rad tax ptratio black
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## lstat medv
## "numeric" "numeric"
lapply(input_data[,num.names],mean)
## $crim
## [1] 3.613524
##
## $zn
## [1] 11.36364
##
## $indus
## [1] 11.13678
##
## $nox
## [1] 0.5546951
##
## $rm
## [1] 6.284634
##
## $age
## [1] 68.5749
##
## $dis
## [1] 3.795043
##
## $rad
## [1] 9.549407
##
## $tax
## [1] 408.2372
##
## $ptratio
## [1] 18.45553
##
## $black
## [1] 356.674
##
## $lstat
## [1] 12.65306
##
## $medv
## [1] 22.53281
lapply(input_data[,num.names],median)
## $crim
## [1] 0.25651
##
## $zn
## [1] 0
##
## $indus
## [1] 9.69
##
## $nox
## [1] 0.538
##
## $rm
## [1] 6.2085
##
## $age
## [1] 77.5
##
## $dis
## [1] 3.20745
##
## $rad
## [1] 5
##
## $tax
## [1] 330
##
## $ptratio
## [1] 19.05
##
## $black
## [1] 391.44
##
## $lstat
## [1] 11.36
##
## $medv
## [1] 21.2
lapply(input_data[,num.names],mfv)
## $crim
## [1] 0.01501 14.33370
##
## $zn
## [1] 0
##
## $indus
## [1] 18.1
##
## $nox
## [1] 0.538
##
## $rm
## [1] 5.713 6.127 6.167 6.229 6.405 6.417
##
## $age
## [1] 100
##
## $dis
## [1] 3.4952
##
## $rad
## [1] 24
##
## $tax
## [1] 666
##
## $ptratio
## [1] 20.2
##
## $black
## [1] 396.9
##
## $lstat
## [1] 6.36 7.79 8.05 14.10 18.13
##
## $medv
## [1] 50
lapply(input_data[,num.names],min)
## $crim
## [1] 0.00632
##
## $zn
## [1] 0
##
## $indus
## [1] 0.46
##
## $nox
## [1] 0.385
##
## $rm
## [1] 3.561
##
## $age
## [1] 2.9
##
## $dis
## [1] 1.1296
##
## $rad
## [1] 1
##
## $tax
## [1] 187
##
## $ptratio
## [1] 12.6
##
## $black
## [1] 0.32
##
## $lstat
## [1] 1.73
##
## $medv
## [1] 5
lapply(input_data[,num.names],max)
## $crim
## [1] 88.9762
##
## $zn
## [1] 100
##
## $indus
## [1] 27.74
##
## $nox
## [1] 0.871
##
## $rm
## [1] 8.78
##
## $age
## [1] 100
##
## $dis
## [1] 12.1265
##
## $rad
## [1] 24
##
## $tax
## [1] 711
##
## $ptratio
## [1] 22
##
## $black
## [1] 396.9
##
## $lstat
## [1] 37.97
##
## $medv
## [1] 50
lapply(input_data[,num.names],range)
## $crim
## [1] 0.00632 88.97620
##
## $zn
## [1] 0 100
##
## $indus
## [1] 0.46 27.74
##
## $nox
## [1] 0.385 0.871
##
## $rm
## [1] 3.561 8.780
##
## $age
## [1] 2.9 100.0
##
## $dis
## [1] 1.1296 12.1265
##
## $rad
## [1] 1 24
##
## $tax
## [1] 187 711
##
## $ptratio
## [1] 12.6 22.0
##
## $black
## [1] 0.32 396.90
##
## $lstat
## [1] 1.73 37.97
##
## $medv
## [1] 5 50
lapply(input_data[,num.names],var)
## $crim
## [1] 73.98658
##
## $zn
## [1] 543.9368
##
## $indus
## [1] 47.06444
##
## $nox
## [1] 0.01342764
##
## $rm
## [1] 0.4936709
##
## $age
## [1] 792.3584
##
## $dis
## [1] 4.434015
##
## $rad
## [1] 75.81637
##
## $tax
## [1] 28404.76
##
## $ptratio
## [1] 4.686989
##
## $black
## [1] 8334.752
##
## $lstat
## [1] 50.99476
##
## $medv
## [1] 84.58672
lapply(input_data[,num.names],sd)
## $crim
## [1] 8.601545
##
## $zn
## [1] 23.32245
##
## $indus
## [1] 6.860353
##
## $nox
## [1] 0.1158777
##
## $rm
## [1] 0.7026171
##
## $age
## [1] 28.14886
##
## $dis
## [1] 2.10571
##
## $rad
## [1] 8.707259
##
## $tax
## [1] 168.5371
##
## $ptratio
## [1] 2.164946
##
## $black
## [1] 91.29486
##
## $lstat
## [1] 7.141062
##
## $medv
## [1] 9.197104
lapply(input_data[,num.names],mad)
## $crim
## [1] 0.3283218
##
## $zn
## [1] 0
##
## $indus
## [1] 9.370032
##
## $nox
## [1] 0.1297275
##
## $rm
## [1] 0.5122383
##
## $age
## [1] 28.98483
##
## $dis
## [1] 1.914259
##
## $rad
## [1] 2.9652
##
## $tax
## [1] 108.2298
##
## $ptratio
## [1] 1.70499
##
## $black
## [1] 8.094996
##
## $lstat
## [1] 7.109067
##
## $medv
## [1] 5.9304
To ensure that R’s data science models work correctly, all categorical dependent variables must be explicitly converted into factors. As for the independent variables, if the variable is both categorical and has more than two levels, then it should be converted into a factor.
# Turn every character column into a factor; all other columns are passed
# through unchanged. The result is rebuilt as a plain data.frame.
input_data <- as.data.frame(
  lapply(input_data, function(column) {
    if (!is.character(column)) {
      return(column)
    }
    as.factor(column)
  })
)
Sorting is useful for examining the data values. By sorting the data, one can tell if there are missing or corrupted data values.
# Reorder the rows by the first column (crim), smallest value first, and
# preview the result.
input_data <- input_data[order(input_data[[1]]), ]
glimpse(input_data)
## Rows: 506
## Columns: 14
## $ crim <dbl> 0.00632, 0.00906, 0.01096, 0.01301, 0.01311, 0.01360, 0.013...
## $ zn <dbl> 18.0, 90.0, 55.0, 35.0, 90.0, 75.0, 80.0, 100.0, 60.0, 90.0...
## $ indus <dbl> 2.31, 2.97, 2.25, 1.52, 1.22, 4.00, 0.46, 1.32, 2.93, 1.21,...
## $ chas <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ nox <dbl> 0.5380, 0.4000, 0.3890, 0.4420, 0.4030, 0.4100, 0.4220, 0.4...
## $ rm <dbl> 6.575, 7.088, 6.453, 7.241, 7.249, 5.888, 7.875, 6.816, 6.6...
## $ age <dbl> 65.2, 20.8, 31.9, 49.3, 21.9, 47.6, 32.0, 40.5, 18.8, 24.8,...
## $ dis <dbl> 4.0900, 7.3073, 7.3073, 7.0379, 8.6966, 7.3197, 5.6484, 8.3...
## $ rad <dbl> 1, 1, 1, 1, 5, 3, 4, 5, 1, 1, 4, 3, 5, 3, 4, 3, 1, 4, 2, 2,...
## $ tax <dbl> 296, 285, 300, 284, 226, 469, 255, 256, 265, 198, 280, 244,...
## $ ptratio <dbl> 15.3, 15.3, 15.3, 15.5, 17.9, 21.1, 14.4, 15.1, 15.6, 13.6,...
## $ black <dbl> 396.90, 394.72, 394.72, 394.74, 395.93, 396.90, 394.23, 392...
## $ lstat <dbl> 4.98, 7.85, 8.23, 5.49, 4.81, 14.80, 2.97, 3.95, 4.38, 3.16...
## $ medv <dbl> 24.0, 32.2, 22.0, 32.7, 35.4, 18.9, 50.0, 31.6, 29.1, 50.0,...
# Reorder the rows by the first column (crim), largest value first, and
# preview the result.
input_data <- input_data[order(-input_data[[1]]), ]
glimpse(input_data)
## Rows: 506
## Columns: 14
## $ crim <dbl> 88.9762, 73.5341, 67.9208, 51.1358, 45.7461, 41.5292, 38.35...
## $ zn <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ indus <dbl> 18.1, 18.1, 18.1, 18.1, 18.1, 18.1, 18.1, 18.1, 18.1, 18.1,...
## $ chas <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ nox <dbl> 0.671, 0.679, 0.693, 0.597, 0.693, 0.693, 0.693, 0.679, 0.5...
## $ rm <dbl> 6.968, 5.957, 5.683, 5.757, 4.519, 5.531, 5.453, 6.202, 5.1...
## $ age <dbl> 91.9, 100.0, 100.0, 100.0, 100.0, 85.4, 100.0, 78.7, 100.0,...
## $ dis <dbl> 1.4165, 1.8026, 1.4254, 1.4130, 1.6582, 1.6074, 1.4896, 1.8...
## $ rad <dbl> 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,...
## $ tax <dbl> 666, 666, 666, 666, 666, 666, 666, 666, 666, 666, 666, 666,...
## $ ptratio <dbl> 20.2, 20.2, 20.2, 20.2, 20.2, 20.2, 20.2, 20.2, 20.2, 20.2,...
## $ black <dbl> 396.90, 16.45, 384.97, 2.60, 88.27, 329.46, 396.90, 18.82, ...
## $ lstat <dbl> 17.21, 20.62, 22.98, 10.11, 36.98, 27.38, 30.59, 14.52, 20....
## $ medv <dbl> 10.4, 8.8, 5.0, 15.0, 7.0, 8.5, 5.0, 10.9, 16.3, 10.4, 5.6,...
These missing values could cause inaccuracies or errors when calculating data limits, central tendency, dispersion tendency, correlation, multicollinearity, p-values, z-scores, variance inflation factors, etc.
# Fill in missing values column by column.
#   * numeric columns: each NA is replaced by the mean of the observed values
#   * character columns: each NA is replaced by the string "NA"
#   * any other column type (e.g. factor) is passed through unchanged
# Only the missing entries are overwritten; observed values are kept as-is.
# The test is done element-wise with is.na() and logical indexing rather than
# `is.numeric(x) && is.na(x)`, because `&&` needs a length-one condition
# (it is an error on R >= 4.3 when given a whole column).
input_data <- as.data.frame(lapply(input_data, function(x) {
  missing <- is.na(x)
  if (!any(missing)) {
    return(x)
  }
  if (is.numeric(x)) {
    x[missing] <- mean(x, na.rm = TRUE)
  } else if (is.character(x)) {
    x[missing] <- "NA"
  }
  x
}))
glimpse(input_data)
## Rows: 506
## Columns: 14
## $ crim <dbl> 88.9762, 73.5341, 67.9208, 51.1358, 45.7461, 41.5292, 38.35...
## $ zn <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ indus <dbl> 18.1, 18.1, 18.1, 18.1, 18.1, 18.1, 18.1, 18.1, 18.1, 18.1,...
## $ chas <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ nox <dbl> 0.671, 0.679, 0.693, 0.597, 0.693, 0.693, 0.693, 0.679, 0.5...
## $ rm <dbl> 6.968, 5.957, 5.683, 5.757, 4.519, 5.531, 5.453, 6.202, 5.1...
## $ age <dbl> 91.9, 100.0, 100.0, 100.0, 100.0, 85.4, 100.0, 78.7, 100.0,...
## $ dis <dbl> 1.4165, 1.8026, 1.4254, 1.4130, 1.6582, 1.6074, 1.4896, 1.8...
## $ rad <dbl> 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,...
## $ tax <dbl> 666, 666, 666, 666, 666, 666, 666, 666, 666, 666, 666, 666,...
## $ ptratio <dbl> 20.2, 20.2, 20.2, 20.2, 20.2, 20.2, 20.2, 20.2, 20.2, 20.2,...
## $ black <dbl> 396.90, 16.45, 384.97, 2.60, 88.27, 329.46, 396.90, 18.82, ...
## $ lstat <dbl> 17.21, 20.62, 22.98, 10.11, 36.98, 27.38, 30.59, 14.52, 20....
## $ medv <dbl> 10.4, 8.8, 5.0, 15.0, 7.0, 8.5, 5.0, 10.9, 16.3, 10.4, 5.6,...
These numeric variables are normalized between 0 and 1. In order to correctly calculate the associations between variables, the values of each variable have to be on the same scale. Also, it is easier to fit smaller numbers onto the axes of a graph.
Normalization: $x' = a + \dfrac{(x - \min(x))\,(b - a)}{\max(x) - \min(x)}$
x = variable subjected to normalization
a = lower bound = 0
b = upper bound = 1
# Min-max scale every numeric column so its smallest value maps to 0 and its
# largest to 1; non-numeric columns are passed through unchanged.
input_data <- as.data.frame(
  lapply(input_data, function(column) {
    if (!is.numeric(column)) {
      return(column)
    }
    bounds <- range(column)
    (column - bounds[1]) / (bounds[2] - bounds[1])
  })
)
str(input_data)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 1 0.826 0.763 0.575 0.514 ...
## $ zn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ indus : num 0.647 0.647 0.647 0.647 0.647 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.588 0.605 0.634 0.436 0.634 ...
## $ rm : num 0.653 0.459 0.407 0.421 0.184 ...
## $ age : num 0.917 1 1 1 1 ...
## $ dis : num 0.0261 0.0612 0.0269 0.0258 0.0481 ...
## $ rad : num 1 1 1 1 1 1 1 1 1 1 ...
## $ tax : num 0.914 0.914 0.914 0.914 0.914 ...
## $ ptratio: num 0.809 0.809 0.809 0.809 0.809 ...
## $ black : num 1 0.04067 0.96992 0.00575 0.22177 ...
## $ lstat : num 0.427 0.521 0.586 0.231 0.973 ...
## $ medv : num 0.12 0.0844 0 0.2222 0.0444 ...
glimpse(input_data)
## Rows: 506
## Columns: 14
## $ crim <dbl> 1.0000000, 0.8264345, 0.7633424, 0.5746830, 0.5141041, 0.46...
## $ zn <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ indus <dbl> 0.6466276, 0.6466276, 0.6466276, 0.6466276, 0.6466276, 0.64...
## $ chas <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ nox <dbl> 0.5884774, 0.6049383, 0.6337449, 0.4362140, 0.6337449, 0.63...
## $ rm <dbl> 0.6528071, 0.4590918, 0.4065913, 0.4207703, 0.1835601, 0.37...
## $ age <dbl> 0.9165808, 1.0000000, 1.0000000, 1.0000000, 1.0000000, 0.84...
## $ dis <dbl> 0.0260891706, 0.0611990652, 0.0268984896, 0.0257708991, 0.0...
## $ rad <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ tax <dbl> 0.9141221, 0.9141221, 0.9141221, 0.9141221, 0.9141221, 0.91...
## $ ptratio <dbl> 0.8085106, 0.8085106, 0.8085106, 0.8085106, 0.8085106, 0.80...
## $ black <dbl> 1.000000000, 0.040672752, 0.969917797, 0.005749155, 0.22177...
## $ lstat <dbl> 0.4271523, 0.5212472, 0.5863687, 0.2312362, 0.9726821, 0.70...
## $ medv <dbl> 0.12000000, 0.08444444, 0.00000000, 0.22222222, 0.04444444,...
dim(input_data)
## [1] 506 14
str(input_data)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 1 0.826 0.763 0.575 0.514 ...
## $ zn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ indus : num 0.647 0.647 0.647 0.647 0.647 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.588 0.605 0.634 0.436 0.634 ...
## $ rm : num 0.653 0.459 0.407 0.421 0.184 ...
## $ age : num 0.917 1 1 1 1 ...
## $ dis : num 0.0261 0.0612 0.0269 0.0258 0.0481 ...
## $ rad : num 1 1 1 1 1 1 1 1 1 1 ...
## $ tax : num 0.914 0.914 0.914 0.914 0.914 ...
## $ ptratio: num 0.809 0.809 0.809 0.809 0.809 ...
## $ black : num 1 0.04067 0.96992 0.00575 0.22177 ...
## $ lstat : num 0.427 0.521 0.586 0.231 0.973 ...
## $ medv : num 0.12 0.0844 0 0.2222 0.0444 ...
summary(input_data)
## crim zn indus chas nox
## Min. :0.0000000 Min. :0.0000 Min. :0.0000 0:471 Min. :0.0000
## 1st Qu.:0.0008511 1st Qu.:0.0000 1st Qu.:0.1734 1: 35 1st Qu.:0.1317
## Median :0.0028121 Median :0.0000 Median :0.3383 Median :0.3148
## Mean :0.0405441 Mean :0.1136 Mean :0.3914 Mean :0.3492
## 3rd Qu.:0.0412585 3rd Qu.:0.1250 3rd Qu.:0.6466 3rd Qu.:0.4918
## Max. :1.0000000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## rm age dis rad
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.4454 1st Qu.:0.4338 1st Qu.:0.08826 1st Qu.:0.1304
## Median :0.5073 Median :0.7683 Median :0.18895 Median :0.1739
## Mean :0.5219 Mean :0.6764 Mean :0.24238 Mean :0.3717
## 3rd Qu.:0.5868 3rd Qu.:0.9390 3rd Qu.:0.36909 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## tax ptratio black lstat
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1756 1st Qu.:0.5106 1st Qu.:0.9457 1st Qu.:0.1440
## Median :0.2729 Median :0.6862 Median :0.9862 Median :0.2657
## Mean :0.4222 Mean :0.6229 Mean :0.8986 Mean :0.3014
## 3rd Qu.:0.9141 3rd Qu.:0.8085 3rd Qu.:0.9983 3rd Qu.:0.4201
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## medv
## Min. :0.0000
## 1st Qu.:0.2672
## Median :0.3600
## Mean :0.3896
## 3rd Qu.:0.4444
## Max. :1.0000
glimpse(input_data)
## Rows: 506
## Columns: 14
## $ crim <dbl> 1.0000000, 0.8264345, 0.7633424, 0.5746830, 0.5141041, 0.46...
## $ zn <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ indus <dbl> 0.6466276, 0.6466276, 0.6466276, 0.6466276, 0.6466276, 0.64...
## $ chas <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ nox <dbl> 0.5884774, 0.6049383, 0.6337449, 0.4362140, 0.6337449, 0.63...
## $ rm <dbl> 0.6528071, 0.4590918, 0.4065913, 0.4207703, 0.1835601, 0.37...
## $ age <dbl> 0.9165808, 1.0000000, 1.0000000, 1.0000000, 1.0000000, 0.84...
## $ dis <dbl> 0.0260891706, 0.0611990652, 0.0268984896, 0.0257708991, 0.0...
## $ rad <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ tax <dbl> 0.9141221, 0.9141221, 0.9141221, 0.9141221, 0.9141221, 0.91...
## $ ptratio <dbl> 0.8085106, 0.8085106, 0.8085106, 0.8085106, 0.8085106, 0.80...
## $ black <dbl> 1.000000000, 0.040672752, 0.969917797, 0.005749155, 0.22177...
## $ lstat <dbl> 0.4271523, 0.5212472, 0.5863687, 0.2312362, 0.9726821, 0.70...
## $ medv <dbl> 0.12000000, 0.08444444, 0.00000000, 0.22222222, 0.04444444,...
head(input_data)
## crim zn indus chas nox rm age dis rad
## 1 1.0000000 0 0.6466276 0 0.5884774 0.6528071 0.9165808 0.02608917 1
## 2 0.8264345 0 0.6466276 0 0.6049383 0.4590918 1.0000000 0.06119907 1
## 3 0.7633424 0 0.6466276 0 0.6337449 0.4065913 1.0000000 0.02689849 1
## 4 0.5746830 0 0.6466276 0 0.4362140 0.4207703 1.0000000 0.02577090 1
## 5 0.5141041 0 0.6466276 0 0.6337449 0.1835601 1.0000000 0.04806809 1
## 6 0.4667072 0 0.6466276 0 0.6337449 0.3774669 0.8496395 0.04344861 1
## tax ptratio black lstat medv
## 1 0.9141221 0.8085106 1.000000000 0.4271523 0.12000000
## 2 0.9141221 0.8085106 0.040672752 0.5212472 0.08444444
## 3 0.9141221 0.8085106 0.969917797 0.5863687 0.00000000
## 4 0.9141221 0.8085106 0.005749155 0.2312362 0.22222222
## 5 0.9141221 0.8085106 0.221771143 0.9726821 0.04444444
## 6 0.9141221 0.8085106 0.829946039 0.7077815 0.07777778
tail(input_data)
## crim zn indus chas nox rm age dis
## 501 8.182544e-05 0.75 0.12976540 0 0.051440329 0.4458709 0.4603502 0.5628950
## 502 7.631796e-05 0.90 0.02785924 0 0.037037037 0.7066488 0.1956746 0.6881030
## 503 7.519399e-05 0.35 0.03885630 0 0.117283951 0.7051159 0.4778579 0.5372696
## 504 5.215248e-05 0.55 0.06561584 0 0.008230453 0.5541291 0.2986612 0.5617674
## 505 3.079694e-05 0.90 0.09200880 0 0.030864198 0.6758000 0.1843460 0.5617674
## 506 0.000000e+00 0.18 0.06781525 0 0.314814815 0.5775053 0.6416066 0.2692031
## rad tax ptratio black lstat medv
## 501 0.08695652 0.53816794 0.9042553 1.0000000 0.36065121 0.3088889
## 502 0.17391304 0.07442748 0.5638298 0.9975541 0.08498896 0.6755556
## 503 0.00000000 0.18511450 0.3085106 0.9945534 0.10375276 0.6155556
## 504 0.00000000 0.21564885 0.2872340 0.9945030 0.17935982 0.3777778
## 505 0.00000000 0.18702290 0.2872340 0.9945030 0.16887417 0.6044444
## 506 0.00000000 0.20801527 0.2872340 1.0000000 0.08967991 0.4222222
sapply(input_data,mode)
## crim zn indus chas nox rm age dis
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## rad tax ptratio black lstat medv
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
lapply(input_data[,num.names],mean)
## $crim
## [1] 0.0405441
##
## $zn
## [1] 0.1136364
##
## $indus
## [1] 0.3913775
##
## $nox
## [1] 0.3491668
##
## $rm
## [1] 0.521869
##
## $age
## [1] 0.6763636
##
## $dis
## [1] 0.2423813
##
## $rad
## [1] 0.3717134
##
## $tax
## [1] 0.4222083
##
## $ptratio
## [1] 0.6229291
##
## $black
## [1] 0.8985678
##
## $lstat
## [1] 0.301409
##
## $medv
## [1] 0.3896179
lapply(input_data[,num.names],median)
## $crim
## [1] 0.002812075
##
## $zn
## [1] 0
##
## $indus
## [1] 0.3383431
##
## $nox
## [1] 0.3148148
##
## $rm
## [1] 0.5072811
##
## $age
## [1] 0.7682801
##
## $dis
## [1] 0.1889487
##
## $rad
## [1] 0.173913
##
## $tax
## [1] 0.2729008
##
## $ptratio
## [1] 0.6861702
##
## $black
## [1] 0.9862323
##
## $lstat
## [1] 0.2657285
##
## $medv
## [1] 0.36
lapply(input_data[,num.names],mfv)
## $crim
## [1] 0.0000976735 0.1610362968
##
## $zn
## [1] 0
##
## $indus
## [1] 0.6466276
##
## $nox
## [1] 0.3148148
##
## $rm
## [1] 0.4123395 0.4916651 0.4993294 0.5112090 0.5449320 0.5472313
##
## $age
## [1] 1
##
## $dis
## [1] 0.2151152
##
## $rad
## [1] 1
##
## $tax
## [1] 0.9141221
##
## $ptratio
## [1] 0.8085106
##
## $black
## [1] 1
##
## $lstat
## [1] 0.1277594 0.1672185 0.1743929 0.3413355 0.4525386
##
## $medv
## [1] 1
lapply(input_data[,num.names],min)
## $crim
## [1] 0
##
## $zn
## [1] 0
##
## $indus
## [1] 0
##
## $nox
## [1] 0
##
## $rm
## [1] 0
##
## $age
## [1] 0
##
## $dis
## [1] 0
##
## $rad
## [1] 0
##
## $tax
## [1] 0
##
## $ptratio
## [1] 0
##
## $black
## [1] 0
##
## $lstat
## [1] 0
##
## $medv
## [1] 0
lapply(input_data[,num.names],max)
## $crim
## [1] 1
##
## $zn
## [1] 1
##
## $indus
## [1] 1
##
## $nox
## [1] 1
##
## $rm
## [1] 1
##
## $age
## [1] 1
##
## $dis
## [1] 1
##
## $rad
## [1] 1
##
## $tax
## [1] 1
##
## $ptratio
## [1] 1
##
## $black
## [1] 1
##
## $lstat
## [1] 1
##
## $medv
## [1] 1
lapply(input_data[,num.names],range)
## $crim
## [1] 0 1
##
## $zn
## [1] 0 1
##
## $indus
## [1] 0 1
##
## $nox
## [1] 0 1
##
## $rm
## [1] 0 1
##
## $age
## [1] 0 1
##
## $dis
## [1] 0 1
##
## $rad
## [1] 0 1
##
## $tax
## [1] 0 1
##
## $ptratio
## [1] 0 1
##
## $black
## [1] 0 1
##
## $lstat
## [1] 0 1
##
## $medv
## [1] 0 1
lapply(input_data[,num.names],var)
## $crim
## [1] 0.009346886
##
## $zn
## [1] 0.05439368
##
## $indus
## [1] 0.06324179
##
## $nox
## [1] 0.05684955
##
## $rm
## [1] 0.01812437
##
## $age
## [1] 0.08403945
##
## $dis
## [1] 0.03666542
##
## $rad
## [1] 0.1433202
##
## $tax
## [1] 0.1034495
##
## $ptratio
## [1] 0.05304424
##
## $black
## [1] 0.05299453
##
## $lstat
## [1] 0.03882837
##
## $medv
## [1] 0.04177122
lapply(input_data[,num.names],sd)
## $crim
## [1] 0.09667929
##
## $zn
## [1] 0.2332245
##
## $indus
## [1] 0.2514792
##
## $nox
## [1] 0.2384314
##
## $rm
## [1] 0.1346268
##
## $age
## [1] 0.2898956
##
## $dis
## [1] 0.1914822
##
## $rad
## [1] 0.3785765
##
## $tax
## [1] 0.3216357
##
## $ptratio
## [1] 0.2303134
##
## $black
## [1] 0.2302054
##
## $lstat
## [1] 0.1970492
##
## $medv
## [1] 0.2043801
lapply(input_data[,num.names],mad)
## $crim
## [1] 0.003690258
##
## $zn
## [1] 0
##
## $indus
## [1] 0.3434762
##
## $nox
## [1] 0.266929
##
## $rm
## [1] 0.09814874
##
## $age
## [1] 0.2985049
##
## $dis
## [1] 0.1740726
##
## $rad
## [1] 0.1289217
##
## $tax
## [1] 0.2065454
##
## $ptratio
## [1] 0.1813819
##
## $black
## [1] 0.02041201
##
## $lstat
## [1] 0.1961663
##
## $medv
## [1] 0.1317867
This box plot reveals the median, quartiles, minimum value, and maximum value of each variable.
It appears that four of these variables have their medians very close to the lower bounds, and one variable has its median very close to the upper bound. Also, it appears that about half of the variables have outliers.
# Box plot with one box per numeric column; warnings are muted while the
# plot is drawn and the previous warning level is restored afterwards.
oldw <- getOption("warn")
options(warn = -1)
boxplot(input_data[num.names])
options(warn = oldw)
These histograms reveal the data value frequency of each variable.
Among the 12 independent variables, it appears that the variable values of age, dis, rad, and rm are the most frequent.
# Histograms of every numeric column except medv, one facet per variable with
# independent axes. Warnings are muted while plotting and restored afterwards.
oldw <- getOption("warn")
options(warn = -1)
input_data[num.names] %>%
  gather(key = "key", value = "value", -medv) %>%
  ggplot(aes(x = value)) +
  facet_wrap(~ key, scales = "free") +
  geom_histogram(binwidth = 0.01)
options(warn = oldw)
The purpose of scatter plots is to reveal the relationship pattern between each pair of dependent and independent variables. They also reveal the central tendency and the dispersion tendency between the dependent and independent variables.
It appears that only for variables rm and lstat, the mean is very close to the median, minimum, and maximum values. The data points for these two variables seem to be the least dispersed when compared to the rest of the variables.
# Scatter plots of medv against every other numeric column, one facet per
# variable with independent axes. Warnings are muted while plotting and
# restored afterwards.
oldw <- getOption("warn")
options(warn = -1)
input_data[num.names] %>%
  gather(key = "key", value = "value", -medv) %>%
  ggplot(aes(value, medv)) +
  facet_wrap(~ key, scales = "free") +
  geom_point(size = 0.1)
options(warn = oldw)
The purpose of line graphs is to reveal the relationship pattern between each pair of dependent and independent variables. They also reveal the central tendency and the dispersion tendency between the dependent and independent variables.
It appears that only for variables rm and lstat, there is a clear relationship pattern between the dependent and independent variables. The data points of all the other variables do not seem to have consistent patterns.
# Line graphs of medv against every other numeric column, one facet per
# variable with independent axes. Warnings are muted while plotting and
# restored afterwards.
oldw <- getOption("warn")
options(warn = -1)
input_data[num.names] %>%
  gather(key = "key", value = "value", -medv) %>%
  ggplot(aes(value, medv)) +
  facet_wrap(~ key, scales = "free") +
  geom_line(size = 0.1)
options(warn = oldw)
This is a test to check for the existence of outliers in each numeric variable in the data frame. The test is based on Z-scores. Because `type = 11` is used, each test checks whether the smallest and the largest value of a variable form a pair of opposite outliers; the null hypothesis is that they are not outliers. If the p-value is smaller than 0.05, the null hypothesis can be rejected in favour of the alternative hypothesis that these two extreme values are outliers. The two-sided test is carried out for this data frame.
All variables have p-values smaller than 0.05. Given a significance cut-off of 0.05, all these variables have outliers.
# Detect outliers via z-score (Grubbs' test).
# With type = 11 the output below is headed "Grubbs test for two opposite
# outliers": the smallest and largest values (0 and 1 after normalization)
# are tested together. two.sided = TRUE presumably requests the two-sided
# p-value; confirm against the outliers package documentation.
grubbs.test(input_data$crim,two.sided=TRUE,type=11)
##
## Grubbs test for two opposite outliers
##
## data: input_data$crim
## G = 10.34348, U = 0.80427, p-value < 2.2e-16
## alternative hypothesis: 0 and 1 are outliers
grubbs.test(input_data$zn,two.sided=TRUE,type=11)
##
## Grubbs test for two opposite outliers
##
## data: input_data$zn
## G = 4.28771, U = 0.97089, p-value < 2.2e-16
## alternative hypothesis: 0 and 1 are outliers
grubbs.test(input_data$indus,two.sided=TRUE,type=11)
##
## Grubbs test for two opposite outliers
##
## data: input_data$indus
## G = 3.9765, U = 0.9836, p-value < 2.2e-16
## alternative hypothesis: 0 and 1 are outliers
grubbs.test(input_data$nox,two.sided=TRUE,type=11)
##
## Grubbs test for two opposite outliers
##
## data: input_data$nox
## G = 4.19408, U = 0.98099, p-value < 2.2e-16
## alternative hypothesis: 0 and 1 are outliers
grubbs.test(input_data$rm,two.sided=TRUE,type=11)
##
## Grubbs test for two opposite outliers
##
## data: input_data$rm
## G = 7.42794, U = 0.94527, p-value = 0.02676
## alternative hypothesis: 0 and 1 are outliers
grubbs.test(input_data$age,two.sided=TRUE,type=11)
##
## Grubbs test for two opposite outliers
##
## data: input_data$age
## G = 3.44952, U = 0.98675, p-value < 2.2e-16
## alternative hypothesis: 0 and 1 are outliers
grubbs.test(input_data$dis,two.sided=TRUE,type=11)
##
## Grubbs test for two opposite outliers
##
## data: input_data$dis
## G = 5.2224, U = 0.9658, p-value < 2.2e-16
## alternative hypothesis: 0 and 1 are outliers
grubbs.test(input_data$rad,two.sided=TRUE,type=11)
##
## Grubbs test for two opposite outliers
##
## data: input_data$rad
## G = 2.64147, U = 0.99264, p-value < 2.2e-16
## alternative hypothesis: 0 and 1 are outliers
grubbs.test(input_data$tax,two.sided=TRUE,type=11)
##
## Grubbs test for two opposite outliers
##
## data: input_data$tax
## G = 3.1091, U = 0.9902, p-value < 2.2e-16
## alternative hypothesis: 0 and 1 are outliers
grubbs.test(input_data$ptratio,two.sided=TRUE,type=11)
##
## Grubbs test for two opposite outliers
##
## data: input_data$ptratio
## G = 4.3419, U = 0.9802, p-value < 2.2e-16
## alternative hypothesis: 0 and 1 are outliers
grubbs.test(input_data$black,two.sided=TRUE,type=11)
##
## Grubbs test for two opposite outliers
##
## data: input_data$black
## G = 4.3439, U = 0.9694, p-value < 2.2e-16
## alternative hypothesis: 0 and 1 are outliers
# Grubbs test for two opposite outliers (type 11) on lstat
grubbs.test(input_data$lstat, type = 11, two.sided = TRUE)
##
## Grubbs test for two opposite outliers
##
## data: input_data$lstat
## G = 5.07488, U = 0.97046, p-value < 2.2e-16
## alternative hypothesis: 0 and 1 are outliers
# Grubbs test for two opposite outliers (type 11) on medv
grubbs.test(input_data$medv, type = 11, two.sided = TRUE)
##
## Grubbs test for two opposite outliers
##
## data: input_data$medv
## G = 4.89284, U = 0.97514, p-value < 2.2e-16
## alternative hypothesis: 0 and 1 are outliers
The correlation statistics reveal the degree of association between variables in the data set. A correlation coefficient ranges from -1 to 1: an absolute value less than 0.5 indicates a weak correlation, and an absolute value equal to or greater than 0.5 indicates a moderate to strong correlation, in either direction.
Several pairs of variables have a correlation coefficient whose absolute value is greater than 0.5 (for example, rad and tax at 0.91). Strong correlations between independent variables are an indication of multicollinearity.
# Draw the pairs.panels matrix for the columns listed in num.names.
# Warnings are muted while plotting and the previous level is restored after.
oldw <- getOption("warn")
options(warn = -1)
pairs.panels(
  input_data[, num.names],
  gap = 0,
  pch = 21,
  bg = c("green", "red", "yellow", "blue", "pink", "purple")
)
options(warn = oldw)
# Pairwise correlation matrix of the columns listed in num.names.
# The current warning level is saved in oldw before warnings are muted.
oldw <- getOption("warn")
options(warn = -1)
cor(x = input_data[, num.names])
## crim zn indus nox rm age
## crim 1.0000000 -0.2004692 0.4065834 0.4209717 -0.2192467 0.3527343
## zn -0.2004692 1.0000000 -0.5338282 -0.5166037 0.3119906 -0.5695373
## indus 0.4065834 -0.5338282 1.0000000 0.7636514 -0.3916759 0.6447785
## nox 0.4209717 -0.5166037 0.7636514 1.0000000 -0.3021882 0.7314701
## rm -0.2192467 0.3119906 -0.3916759 -0.3021882 1.0000000 -0.2402649
## age 0.3527343 -0.5695373 0.6447785 0.7314701 -0.2402649 1.0000000
## dis -0.3796701 0.6644082 -0.7080270 -0.7692301 0.2052462 -0.7478805
## rad 0.6255051 -0.3119478 0.5951293 0.6114406 -0.2098467 0.4560225
## tax 0.5827643 -0.3145633 0.7207602 0.6680232 -0.2920478 0.5064556
## ptratio 0.2899456 -0.3916785 0.3832476 0.1889327 -0.3555015 0.2615150
## black -0.3850639 0.1755203 -0.3569765 -0.3800506 0.1280686 -0.2735340
## lstat 0.4556215 -0.4129946 0.6037997 0.5908789 -0.6138083 0.6023385
## medv -0.3883046 0.3604453 -0.4837252 -0.4273208 0.6953599 -0.3769546
## dis rad tax ptratio black lstat
## crim -0.3796701 0.6255051 0.5827643 0.2899456 -0.3850639 0.4556215
## zn 0.6644082 -0.3119478 -0.3145633 -0.3916785 0.1755203 -0.4129946
## indus -0.7080270 0.5951293 0.7207602 0.3832476 -0.3569765 0.6037997
## nox -0.7692301 0.6114406 0.6680232 0.1889327 -0.3800506 0.5908789
## rm 0.2052462 -0.2098467 -0.2920478 -0.3555015 0.1280686 -0.6138083
## age -0.7478805 0.4560225 0.5064556 0.2615150 -0.2735340 0.6023385
## dis 1.0000000 -0.4945879 -0.5344316 -0.2324705 0.2915117 -0.4969958
## rad -0.4945879 1.0000000 0.9102282 0.4647412 -0.4444128 0.4886763
## tax -0.5344316 0.9102282 1.0000000 0.4608530 -0.4418080 0.5439934
## ptratio -0.2324705 0.4647412 0.4608530 1.0000000 -0.1773833 0.3740443
## black 0.2915117 -0.4444128 -0.4418080 -0.1773833 1.0000000 -0.3660869
## lstat -0.4969958 0.4886763 0.5439934 0.3740443 -0.3660869 1.0000000
## medv 0.2499287 -0.3816262 -0.4685359 -0.5077867 0.3334608 -0.7376627
## medv
## crim -0.3883046
## zn 0.3604453
## indus -0.4837252
## nox -0.4273208
## rm 0.6953599
## age -0.3769546
## dis 0.2499287
## rad -0.3816262
## tax -0.4685359
## ptratio -0.5077867
## black 0.3334608
## lstat -0.7376627
## medv 1.0000000
# Restore the warning level that was saved in oldw
options(warn = oldw)
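The following automates stepwise selection for multiple linear regression: at each step one predictor is added or removed, and the change that yields the lowest AIC is kept.
The resulting model, the most parsimonious one found by this procedure, has 11 predictor variables.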
# Stepwise selection in both directions, starting from the intercept-only
# model; the formula of the all-predictor model is passed as the scope.
m_empty <- lm(medv ~ 1, data = input_data)
m_all <- lm(medv ~ ., data = input_data)
step(m_empty, scope = formula(m_all), direction = "both")
## Start: AIC=-1605.83
## medv ~ 1
##
## Df Sum of Sq RSS AIC
## + lstat 1 11.4785 9.616 -2001.3
## + rm 1 10.1997 10.895 -1938.2
## + ptratio 1 5.4392 15.655 -1754.7
## + indus 1 4.9359 16.159 -1738.7
## + tax 1 4.6308 16.464 -1729.2
## + nox 1 3.8519 17.243 -1705.8
## + crim 1 3.1806 17.914 -1686.5
## + rad 1 3.0722 18.022 -1683.5
## + age 1 2.9974 18.097 -1681.4
## + zn 1 2.7406 18.354 -1674.2
## + black 1 2.3456 18.749 -1663.5
## + dis 1 1.3177 19.777 -1636.5
## + chas 1 0.6479 20.447 -1619.6
## <none> 21.095 -1605.8
##
## Step: AIC=-2001.33
## medv ~ lstat
##
## Df Sum of Sq RSS AIC
## + rm 1 1.9916 7.6244 -2116.8
## + ptratio 1 1.3186 8.2974 -2074.0
## + chas 1 0.3883 9.2277 -2020.2
## + dis 1 0.3814 9.2346 -2019.8
## + age 1 0.1502 9.4657 -2007.3
## + tax 1 0.1355 9.4805 -2006.5
## + black 1 0.0980 9.5180 -2004.5
## + zn 1 0.0792 9.5368 -2003.5
## + crim 1 0.0726 9.5434 -2003.2
## + indus 1 0.0488 9.5672 -2001.9
## <none> 9.6160 -2001.3
## + rad 1 0.0124 9.6036 -2000.0
## + nox 1 0.0024 9.6136 -1999.5
## - lstat 1 11.4785 21.0945 -1605.8
##
## Step: AIC=-2116.77
## medv ~ lstat + rm
##
## Df Sum of Sq RSS AIC
## + ptratio 1 0.8451 6.7793 -2174.2
## + chas 1 0.2709 7.3535 -2133.1
## + black 1 0.2530 7.3714 -2131.8
## + tax 1 0.2100 7.4144 -2128.9
## + dis 1 0.1734 7.4509 -2126.4
## + crim 1 0.1538 7.4706 -2125.1
## + rad 1 0.0891 7.5352 -2120.7
## + indus 1 0.0302 7.5942 -2116.8
## <none> 7.6244 -2116.8
## + zn 1 0.0279 7.5964 -2116.6
## + age 1 0.0100 7.6144 -2115.4
## + nox 1 0.0074 7.6170 -2115.2
## - rm 1 1.9916 9.6160 -2001.3
## - lstat 1 3.2704 10.8948 -1938.2
##
## Step: AIC=-2174.21
## medv ~ lstat + rm + ptratio
##
## Df Sum of Sq RSS AIC
## + dis 1 0.24646 6.5328 -2190.9
## + black 1 0.19244 6.5868 -2186.8
## + chas 1 0.18665 6.5926 -2186.3
## + crim 1 0.06050 6.7187 -2176.8
## + age 1 0.03271 6.7465 -2174.7
## <none> 6.7793 -2174.2
## + tax 1 0.02191 6.7573 -2173.8
## + nox 1 0.01225 6.7670 -2173.1
## + zn 1 0.00739 6.7719 -2172.8
## + rad 1 0.00300 6.7763 -2172.4
## + indus 1 0.00041 6.7788 -2172.2
## - ptratio 1 0.84510 7.6244 -2116.8
## - rm 1 1.51816 8.2974 -2074.0
## - lstat 1 2.47587 9.2551 -2018.7
##
## Step: AIC=-2190.95
## medv ~ lstat + rm + ptratio + dis
##
## Df Sum of Sq RSS AIC
## + nox 1 0.37509 6.1577 -2218.9
## + black 1 0.24822 6.2846 -2208.6
## + chas 1 0.13206 6.4007 -2199.3
## + indus 1 0.11983 6.4130 -2198.3
## + tax 1 0.11869 6.4141 -2198.2
## + crim 1 0.11533 6.4175 -2198.0
## + zn 1 0.07151 6.4613 -2194.5
## + age 1 0.03030 6.5025 -2191.3
## <none> 6.5328 -2190.9
## + rad 1 0.01106 6.5217 -2189.8
## - dis 1 0.24646 6.7793 -2174.2
## - ptratio 1 0.91815 7.4509 -2126.4
## - rm 1 1.29511 7.8279 -2101.4
## - lstat 1 2.64157 9.1744 -2021.1
##
## Step: AIC=-2218.87
## medv ~ lstat + rm + ptratio + dis + nox
##
## Df Sum of Sq RSS AIC
## + chas 1 0.16211 5.9956 -2230.4
## + black 1 0.15399 6.0037 -2229.7
## + zn 1 0.07492 6.0828 -2223.1
## + crim 1 0.06984 6.0879 -2222.6
## + rad 1 0.02641 6.1313 -2219.0
## <none> 6.1577 -2218.9
## + indus 1 0.00845 6.1493 -2217.6
## + tax 1 0.00518 6.1525 -2217.3
## + age 1 0.00012 6.1576 -2216.9
## - nox 1 0.37509 6.5328 -2190.9
## - dis 1 0.60930 6.7670 -2173.1
## - ptratio 1 1.04518 7.2029 -2141.5
## - rm 1 1.25740 7.4151 -2126.8
## - lstat 1 1.80954 7.9672 -2090.5
##
## Step: AIC=-2230.37
## medv ~ lstat + rm + ptratio + dis + nox + chas
##
## Df Sum of Sq RSS AIC
## + black 1 0.13473 5.8609 -2239.9
## + zn 1 0.08119 5.9144 -2235.3
## + crim 1 0.05745 5.9381 -2233.2
## + rad 1 0.02892 5.9667 -2230.8
## <none> 5.9956 -2230.4
## + indus 1 0.01297 5.9826 -2229.5
## + tax 1 0.00207 5.9935 -2228.5
## + age 1 0.00115 5.9944 -2228.5
## - chas 1 0.16211 6.1577 -2218.9
## - nox 1 0.40514 6.4007 -2199.3
## - dis 1 0.56633 6.5619 -2186.7
## - ptratio 1 0.95057 6.9462 -2157.9
## - rm 1 1.22503 7.2206 -2138.3
## - lstat 1 1.73298 7.7286 -2103.9
##
## Step: AIC=-2239.87
## medv ~ lstat + rm + ptratio + dis + nox + chas + black
##
## Df Sum of Sq RSS AIC
## + zn 1 0.09380 5.7671 -2246.0
## + rad 1 0.07127 5.7896 -2244.1
## + crim 1 0.02747 5.8334 -2240.2
## <none> 5.8609 -2239.9
## + indus 1 0.00770 5.8532 -2238.5
## + age 1 0.00466 5.8562 -2238.3
## + tax 1 0.00133 5.8595 -2238.0
## - black 1 0.13473 5.9956 -2230.4
## - chas 1 0.14285 6.0037 -2229.7
## - nox 1 0.30955 6.1704 -2215.8
## - dis 1 0.54485 6.4057 -2196.9
## - ptratio 1 0.89101 6.7519 -2170.3
## - rm 1 1.31270 7.1736 -2139.6
## - lstat 1 1.47731 7.3382 -2128.1
##
## Step: AIC=-2246.03
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn
##
## Df Sum of Sq RSS AIC
## + crim 1 0.04677 5.7203 -2248.2
## + rad 1 0.04623 5.7208 -2248.1
## <none> 5.7671 -2246.0
## + indus 1 0.00792 5.7591 -2244.7
## + tax 1 0.00195 5.7651 -2244.2
## + age 1 0.00074 5.7663 -2244.1
## - zn 1 0.09380 5.8609 -2239.9
## - black 1 0.14734 5.9144 -2235.3
## - chas 1 0.14835 5.9154 -2235.2
## - nox 1 0.30994 6.0770 -2221.5
## - dis 1 0.63035 6.3974 -2195.6
## - ptratio 1 0.67389 6.4410 -2192.1
## - rm 1 1.17756 6.9446 -2154.0
## - lstat 1 1.50741 7.2745 -2130.5
##
## Step: AIC=-2248.15
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
## crim
##
## Df Sum of Sq RSS AIC
## + rad 1 0.11289 5.6074 -2256.2
## <none> 5.7203 -2248.2
## + indus 1 0.00779 5.7125 -2246.8
## + age 1 0.00122 5.7191 -2246.3
## + tax 1 0.00064 5.7196 -2246.2
## - crim 1 0.04677 5.7671 -2246.0
## - black 1 0.10972 5.8300 -2240.5
## - zn 1 0.11309 5.8334 -2240.2
## - chas 1 0.14041 5.8607 -2237.9
## - nox 1 0.28565 6.0059 -2225.5
## - ptratio 1 0.58909 6.3094 -2200.6
## - dis 1 0.66454 6.3848 -2194.5
## - rm 1 1.19485 6.9151 -2154.2
## - lstat 1 1.35971 7.0800 -2142.2
##
## Step: AIC=-2256.24
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
## crim + rad
##
## Df Sum of Sq RSS AIC
## + tax 1 0.13512 5.4723 -2266.6
## <none> 5.6074 -2256.2
## + indus 1 0.01674 5.5907 -2255.8
## + age 1 0.00005 5.6074 -2254.2
## - zn 1 0.08451 5.6919 -2250.7
## - rad 1 0.11289 5.7203 -2248.2
## - crim 1 0.11343 5.7208 -2248.1
## - chas 1 0.13465 5.7421 -2246.2
## - black 1 0.14606 5.7535 -2245.2
## - nox 1 0.38773 5.9951 -2224.4
## - dis 1 0.66241 6.2698 -2201.7
## - ptratio 1 0.70112 6.3085 -2198.6
## - rm 1 1.07781 6.6852 -2169.3
## - lstat 1 1.37545 6.9828 -2147.2
##
## Step: AIC=-2266.58
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
## crim + rad + tax
##
## Df Sum of Sq RSS AIC
## <none> 5.4723 -2266.6
## + indus 1 0.00124 5.4710 -2264.7
## + age 1 0.00003 5.4722 -2264.6
## - chas 1 0.11220 5.5845 -2258.3
## - crim 1 0.12117 5.5935 -2257.5
## - zn 1 0.12732 5.5996 -2256.9
## - black 1 0.13374 5.6060 -2256.4
## - tax 1 0.13512 5.6074 -2256.2
## - rad 1 0.24737 5.7196 -2246.2
## - nox 1 0.26761 5.7399 -2244.4
## - ptratio 1 0.59578 6.0681 -2216.3
## - dis 1 0.71553 6.1878 -2206.4
## - rm 1 0.96971 6.4420 -2186.0
## - lstat 1 1.34493 6.8172 -2157.4
##
## Call:
## lm(formula = medv ~ lstat + rm + ptratio + dis + nox + chas +
## black + zn + crim + rad + tax, data = input_data)
##
## Coefficients:
## (Intercept) lstat rm ptratio dis nox
## 0.48382 -0.42083 0.44090 -0.19772 -0.36478 -0.18766
## chas1 black zn crim rad tax
## 0.06042 0.08188 0.10188 -0.21434 0.15313 -0.13715
Separate data sets are created for training and testing so that we can see how the model performs on data that it has never seen before.
# Randomly assign every row of input_data to group 1 (training) or group 2
# (test) with equal probability. The seed makes the assignment reproducible.
oldw <- getOption("warn")
options(warn = -1)
set.seed(123)
# TRUE is used instead of T: T is an ordinary variable that can be rebound,
# whereas TRUE is a reserved constant.
ind <- sample(2, nrow(input_data), replace = TRUE, prob = c(0.5, 0.5))
df_sample_train <- input_data[ind == 1, ]
df_sample_test <- input_data[ind == 2, ]
options(warn = oldw)
This is the most robust and parsimonious model identified above. It serves as the baseline model for this regularization analysis.
# Baseline model: the stepwise-selected formula fitted on the training rows,
# followed by its coefficient table and fit statistics.
b_model <- lm(
  formula = medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
    crim + rad + tax,
  data = df_sample_train
)
summary(b_model)
##
## Call:
## lm(formula = medv ~ lstat + rm + ptratio + dis + nox + chas +
## black + zn + crim + rad + tax, data = df_sample_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33121 -0.06534 -0.00961 0.04088 0.51566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.63825 0.08090 7.889 1.35e-13 ***
## lstat -0.49948 0.05742 -8.698 7.49e-16 ***
## rm 0.25130 0.07464 3.367 0.000895 ***
## ptratio -0.20979 0.04305 -4.874 2.08e-06 ***
## dis -0.45913 0.07152 -6.419 8.12e-10 ***
## nox -0.26804 0.06521 -4.111 5.54e-05 ***
## chas1 0.11020 0.03201 3.442 0.000689 ***
## black 0.08295 0.03773 2.198 0.028953 *
## zn 0.15168 0.05291 2.867 0.004542 **
## crim -0.17697 0.11333 -1.561 0.119824
## rad 0.18194 0.05068 3.590 0.000407 ***
## tax -0.11750 0.06036 -1.947 0.052835 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1149 on 223 degrees of freedom
## Multiple R-squared: 0.6949, Adjusted R-squared: 0.6799
## F-statistic: 46.18 on 11 and 223 DF, p-value: < 2.2e-16
# Baseline coefficients rounded to two decimals, one row per term
data.frame(coef = round(coef(b_model), digits = 2))
## coef
## (Intercept) 0.64
## lstat -0.50
## rm 0.25
## ptratio -0.21
## dis -0.46
## nox -0.27
## chas1 0.11
## black 0.08
## zn 0.15
## crim -0.18
## rad 0.18
## tax -0.12
# Predict medv for the held-out rows with the baseline model, then compute
# and display R2 of the predictions against the observed values.
b_model_pred <- predict(b_model, newdata = df_sample_test)
b_model_pred_R2 <- R2(b_model_pred, df_sample_test$medv)
b_model_pred_R2
## [1] 0.7482301
The purpose of cross-validation is to estimate how the model performs on data it was not fitted on, which helps to detect and guard against overfitting. Overfitting happens when the resulting model fits the training data closely but performs badly on test data. Meaning, if the data points in the test data set are slightly different from those in the training data set, the model will not be able to make predictions with acceptable accuracy. The purpose of regularization is to further reduce the possibility of overfitting.
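Both the baseline model and the final model produced by the cross-validated training have the same R-squared value of 0.6949 on the training data, because the final model is refitted on the full training set. The cross-validated (resampled) estimate of R-squared is lower, at 0.6741.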
# Save the current warning level in oldw, then mute warnings.
oldw <- getOption("warn")
options(warn = -1)
# Set up doSNOW package for multi-core training. This will speed up the training process.
numCores <- detectCores()
# detectCores() is documented to return NA when the core count cannot be
# determined; fall back to one worker so makeCluster() still gets a valid size.
if (is.na(numCores)) {
  numCores <- 1L
}
c1 <- makeCluster(numCores, type = "SOCK")
registerDoSNOW(c1)
# Fit the baseline formula with 10-fold cross-validation repeated 10 times.
# The seed is set immediately before training for reproducibility.
set.seed(123)
lm <- train(
  medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn + crim + rad +
    tax,
  data = df_sample_train,
  method = "lm",
  trControl = trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 10,
    # TRUE rather than T: T is a rebindable variable, TRUE is a constant.
    verboseIter = TRUE
  )
)
## Aggregating results
## Fitting final model on full training set
# Show the resampling summary of the trained model
print(lm)
## Linear Regression
##
## 235 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 211, 212, 211, 211, 211, 212, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1182872 0.6741241 0.08361395
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Resampling metrics table stored in the train object
lm[["results"]]
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.1182872 0.6741241 0.08361395 0.0301963 0.1287578 0.01732187
# Coefficient table and fit statistics for the trained model
summary(object = lm)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33121 -0.06534 -0.00961 0.04088 0.51566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.63825 0.08090 7.889 1.35e-13 ***
## lstat -0.49948 0.05742 -8.698 7.49e-16 ***
## rm 0.25130 0.07464 3.367 0.000895 ***
## ptratio -0.20979 0.04305 -4.874 2.08e-06 ***
## dis -0.45913 0.07152 -6.419 8.12e-10 ***
## nox -0.26804 0.06521 -4.111 5.54e-05 ***
## chas1 0.11020 0.03201 3.442 0.000689 ***
## black 0.08295 0.03773 2.198 0.028953 *
## zn 0.15168 0.05291 2.867 0.004542 **
## crim -0.17697 0.11333 -1.561 0.119824
## rad 0.18194 0.05068 3.590 0.000407 ***
## tax -0.11750 0.06036 -1.947 0.052835 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1149 on 223 degrees of freedom
## Multiple R-squared: 0.6949, Adjusted R-squared: 0.6799
## F-statistic: 46.18 on 11 and 223 DF, p-value: < 2.2e-16
# Plot the final model held in the train object
plot(lm[["finalModel"]])