Master in Data Science at Utica College

https://programs.online.utica.edu/programs/masters-data-science

Clearing RStudio Memory Usage

# Clear the workspace first, then run the garbage collector, so the memory
# held by the removed objects can actually be reclaimed. (Calling gc() before
# rm() collects nothing from objects that are still referenced.)
# NOTE: rm(list = ls()) wipes every object in the current environment; only
# run this in a session where that is acceptable.
rm(list = ls())
gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  542109 29.0    1238217 66.2   621331 33.2
## Vcells 1019753  7.8    8388608 64.0  1600889 12.3

Time Counter Start

# Capture the current time. Presumably compared with a later Sys.time() to
# report the total run time (the end of the timer is not visible here).
start_time <- Sys.time()

Include the knitr package for integration of R code into Markdown

# Set the default knitr chunk option echo = TRUE so each chunk's source code
# is shown alongside its output in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

All the libraries used in this code

library(easypackages)
libraries("data.table","devtools","dplyr","ggplot2","ggmap","ggrepel","ggsn","lubridate","readr","tidyr","tidyverse","sf") 

Load the NYC Points of Interest dataset

# Read the points-of-interest table from a CSV file in the working directory.
# header = TRUE: the first row holds the column names. Spelled out as TRUE
# because `T` is an ordinary variable that can be reassigned.
POI <- read.csv("Point_Of_Interest.csv", header = TRUE)

Descriptive statistics before data processing

Dimension of data frame

# Print the number of rows and columns of POI.
dim(POI)
## [1] 19010    17

Structure of data frame

# Print each column's name, type and first few values.
str(POI)
## 'data.frame':    19010 obs. of  17 variables:
##  $ SEGMENTID : int  69163 34217 34220 43117 9008897 78137 9004671 17826 67394 34220 ...
##  $ LON       : num  -73.9 -74 -74 -73.9 -73.8 ...
##  $ LAT       : num  40.8 40.8 40.8 40.7 40.9 ...
##  $ COMPLEXID : int  0 0 0 0 515 0 2245 0 0 0 ...
##  $ SAFTYPE   : Factor w/ 4 levels "","G","N","X": 3 1 1 1 1 1 4 3 1 1 ...
##  $ SOS       : int  2 1 1 2 1 NA 2 2 2 1 ...
##  $ PLACEID   : int  12171 13932 2410 3247 8230 5077 7129 13182 15765 5444 ...
##  $ FACI_DOM  : int  10 3 4 1 2 2 3 7 1 3 ...
##  $ BIN       : int  1074333 1024838 1024839 3047371 2093858 2009619 3327986 3332515 4436782 1024843 ...
##  $ BOROUGH   : int  1 1 1 3 2 2 3 3 4 1 ...
##  $ CREATED   : Factor w/ 1147 levels "01/02/2013 12:00:00 AM +0000",..: 447 447 447 447 447 447 447 447 447 447 ...
##  $ MODIFIED  : Factor w/ 1438 levels "01/02/2013 12:00:00 AM +0000",..: 1303 1207 1207 1207 1207 1305 1011 1379 1207 1207 ...
##  $ FACILITY_T: int  2 3 7 9 1 5 1 8 9 3 ...
##  $ SOURCE    : Factor w/ 11 levels "DCP","DOE","DoITT",..: 2 6 9 9 3 3 1 1 5 6 ...
##  $ B7SC      : int  10078501 NA NA NA NA NA 32055002 33361901 NA NA ...
##  $ PRI_ADD   : int  5128938 1023588 1023591 3058824 2101927 2017249 3094096 5125956 0 1023594 ...
##  $ NAME      : Factor w/ 18875 levels "0 BOND STREET",..: 13632 5082 500 6650 3575 9090 1707 4702 16199 16527 ...

Summary statistics of data frame

# Print per-column summaries: quantiles and mean for numeric columns, level
# counts for factor columns, and the number of NA values where present.
summary(POI)
##    SEGMENTID            LON              LAT          COMPLEXID      SAFTYPE  
##  Min.   :     10   Min.   :-74.27   Min.   :40.45   Min.   :   0.0    :11219  
##  1st Qu.:  39984   1st Qu.:-73.98   1st Qu.:40.67   1st Qu.:   0.0   G:  662  
##  Median : 109339   Median :-73.94   Median :40.73   Median :   0.0   N: 2847  
##  Mean   :1623955   Mean   :-73.94   Mean   :40.73   Mean   : 260.6   X: 4282  
##  3rd Qu.: 268553   3rd Qu.:-73.89   3rd Qu.:40.80   3rd Qu.: 218.0            
##  Max.   :9024271   Max.   :-73.71   Max.   :40.95   Max.   :4300.0            
##                                                                               
##       SOS           PLACEID           FACI_DOM           BIN         
##  Min.   :1.000   Min.   :      1   Min.   : 1.000   Min.   :      0  
##  1st Qu.:1.000   1st Qu.:   7054   1st Qu.: 2.000   1st Qu.:1004331  
##  Median :2.000   Median :  13796   Median : 3.000   Median :2010983  
##  Mean   :1.511   Mean   : 325108   Mean   : 4.312   Mean   :2030343  
##  3rd Qu.:2.000   3rd Qu.:1010060   3rd Qu.: 6.000   3rd Qu.:3326732  
##  Max.   :2.000   Max.   :1031244   Max.   :18.000   Max.   :5169276  
##  NA's   :2081                                                        
##     BOROUGH                              CREATED     
##  Min.   :1.000   05/14/2009 12:00:00 AM +0000:12800  
##  1st Qu.:1.000   09/15/2008 12:00:00 AM +0000:  200  
##  Median :3.000   08/17/2011 12:00:00 AM +0000:  121  
##  Mean   :2.564   08/16/2011 12:00:00 AM +0000:   93  
##  3rd Qu.:4.000   09/03/2010 12:00:00 AM +0000:   62  
##  Max.   :5.000   09/20/2012 12:00:00 AM +0000:   53  
##  NA's   :221     (Other)                     : 5681  
##                          MODIFIED       FACILITY_T         SOURCE    
##  11/30/1899 12:00:00 AM +0000: 2139   Min.   : 1.000   OTHER  :5056  
##  11/24/2014 12:00:00 AM +0000: 1543   1st Qu.: 2.000   DoITT  :3392  
##  12/09/2010 12:00:00 AM +0000:  148   Median : 4.000   DCP    :2753  
##  11/13/2017 12:00:00 AM +0000:  100   Mean   : 4.784   DOE    :2017  
##  12/11/2012 12:00:00 AM +0000:   69   3rd Qu.: 7.000   NYPD   :1993  
##  01/29/2013 12:00:00 AM +0000:   68   Max.   :13.000   EMS    :1441  
##  (Other)                     :14943                    (Other):2358  
##       B7SC             PRI_ADD                         NAME      
##  Min.   :10000201   Min.   :       0   DEVRY COLLEGE     :    3  
##  1st Qu.:13289506   1st Qu.:       0   HOLY ROSARY CHURCH:    3  
##  Median :30013851   Median : 1031109   HOLY ROSARY SCHOOL:    3  
##  Mean   :27945586   Mean   : 1810248   PS 12             :    3  
##  3rd Qu.:38334504   3rd Qu.: 3109959   PS 146            :    3  
##  Max.   :57019202   Max.   :10173608   PS 15             :    3  
##  NA's   :11270                         (Other)           :18992

Subsetting Data Set

# Keep only the rows with a known, positive borough code. Rows whose BOROUGH
# is NA are dropped as well.
POI <- POI[!is.na(POI$BOROUGH) & POI$BOROUGH > 0, ]

Add Variable BOROUGHNAME

# Borough codes map to names in this fixed order (1 = Manhattan, ...,
# 5 = Staten Island). The lookup is keyed by the fixed codes rather than by
# whichever codes happen to occur in the data, so a borough that is absent
# from the data cannot shift or break the mapping.
boroughs <- c("Manhattan", "Bronx", "Brooklyn", "Queens", "Staten Island")
names(boroughs) <- seq_along(boroughs)
# Codes actually present in the data; kept in case later code refers to it.
numbers <- sort(unique(POI$BOROUGH))
# Look each code up by name; unknown or missing codes yield NA. unname() keeps
# the new column a plain character vector without a names attribute.
POI$BOROUGHNAME <- unname(boroughs[as.character(POI$BOROUGH)])

Set API Key ~ supply your own Google Maps API key

# Read the Google Maps API key from the environment instead of embedding it in
# the source: a key committed to a shared file is exposed to every reader and
# should be treated as compromised. GGMAP_GOOGLE_API_KEY is the variable name
# ggmap itself uses when it stores a key in .Renviron.
google_api_key <- Sys.getenv("GGMAP_GOOGLE_API_KEY")
if (!nzchar(google_api_key)) {
  stop(
    "Set the GGMAP_GOOGLE_API_KEY environment variable to your Google Maps API key.",
    call. = FALSE
  )
}
ggmap::register_google(key = google_api_key)

Define longitude and latitude of the center of your map (currently Williamsburg, Brooklyn, NYC)

# Named numeric vector (lon, lat) passed to get_googlemap() as the centre of
# the requested basemaps.
center <- c(lon = -73.955, lat = 40.715)

Load the basemap

# Request a Google basemap centred on `center` for each map type and wrap it
# in a ggmap plot object: p1 is the satellite view, p2 the road map.
# Experiment with different maptypes ("roadmap", "satellite", "terrain",
# "hybrid") and zoom levels (3 - 21).
p1 <- center %>%
  get_googlemap(maptype = "satellite", zoom = 13) %>%
  ggmap()
p2 <- center %>%
  get_googlemap(maptype = "roadmap", zoom = 13) %>%
  ggmap()

Plot the POI points upon the basemap

# Build the data needed to draw a segmented scale bar (blue/pink fills).
#
# Arguments:
#   x, y   Lower-left coordinate of the bar.
#   w      Thickness of the bar.
#   n      Division count: the bar is made of n + 1 rectangles and carries
#          n + 2 tick labels.
#   d      Width of each rectangle (longitude degrees).
#   units  Caption placed below the first tick, e.g. "Miles".
#   units_per_degree
#          Factor converting a longitude offset into `units`. Defaults to the
#          55.051 this script has always used.
#          NOTE(review): presumably miles per degree of longitude at the
#          map's latitude - confirm the value for the area being plotted.
#
# Returns an unnamed list:
#   [[1]] rectangle data (xmin, xmax, ymin, ymax, z, fill.col)
#   [[2]] label data (xlab, ylab, text); the last row is the units caption.
scalebar1 <- function(x, y, w, n, d, units, units_per_degree = 55.051) {
  n_segments <- n + 1
  left_edges <- x + (seq_len(n_segments) - 1) * d

  bar <- data.frame(
    xmin = left_edges,
    xmax = left_edges + d,
    ymin = y,
    ymax = y + w,
    # rep_len() alternates the pattern to exactly n_segments values, which
    # also covers n = 0 (indexing an empty rep() result would give NA).
    z = rep_len(c(1, 0), n_segments),
    fill.col = rep_len(c("blue", "pink"), n_segments),
    stringsAsFactors = FALSE
  )

  # One tick per rectangle edge. Each label is the rounded distance at its own
  # tick, not a multiple of a rounded step: stepping a rounded increment makes
  # the labels drift and can produce a different number of labels than ticks.
  tick_offsets <- (seq_len(n_segments + 1) - 1) * d
  labs <- data.frame(
    xlab = c(x + tick_offsets, x),
    ylab = c(rep(y - w * 1.5, n_segments + 1), y - 3 * w),
    text = c(as.character(round(tick_offsets * units_per_degree, 1)), units),
    stringsAsFactors = FALSE
  )

  list(bar, labs)
}

# Scale-bar data for the satellite map: lower-left corner at (-73.97, 40.68),
# thickness 0.001, n = 2, each division 0.015 degrees wide, captioned "Miles".
sb1 <- scalebar1(
  x = -73.97, y = 40.68, w = 0.001, n = 2, d = 0.015, units = "Miles"
)


# Build the data needed to draw a segmented scale bar (azure4/white fills).
#
# Arguments:
#   x, y   Lower-left coordinate of the bar.
#   w      Thickness of the bar.
#   n      Division count: the bar is made of n + 1 rectangles and carries
#          n + 2 tick labels.
#   d      Width of each rectangle (longitude degrees).
#   units  Caption placed below the first tick, e.g. "Miles".
#   units_per_degree
#          Factor converting a longitude offset into `units`. Defaults to the
#          55.051 this script has always used.
#          NOTE(review): presumably miles per degree of longitude at the
#          map's latitude - confirm the value for the area being plotted.
#
# Returns an unnamed list:
#   [[1]] rectangle data (xmin, xmax, ymin, ymax, z, fill.col)
#   [[2]] label data (xlab, ylab, text); the last row is the units caption.
scalebar2 <- function(x, y, w, n, d, units, units_per_degree = 55.051) {
  n_segments <- n + 1
  left_edges <- x + (seq_len(n_segments) - 1) * d

  bar <- data.frame(
    xmin = left_edges,
    xmax = left_edges + d,
    ymin = y,
    ymax = y + w,
    # rep_len() alternates the pattern to exactly n_segments values, which
    # also covers n = 0 (indexing an empty rep() result would give NA).
    z = rep_len(c(1, 0), n_segments),
    fill.col = rep_len(c("azure4", "white"), n_segments),
    stringsAsFactors = FALSE
  )

  # One tick per rectangle edge. Each label is the rounded distance at its own
  # tick, not a multiple of a rounded step: stepping a rounded increment makes
  # the labels drift and can produce a different number of labels than ticks.
  tick_offsets <- (seq_len(n_segments + 1) - 1) * d
  labs <- data.frame(
    xlab = c(x + tick_offsets, x),
    ylab = c(rep(y - w * 1.5, n_segments + 1), y - 3 * w),
    text = c(as.character(round(tick_offsets * units_per_degree, 1)), units),
    stringsAsFactors = FALSE
  )

  list(bar, labs)
}

# Scale-bar data in the azure4/white colour scheme, with the same position
# and size as sb1.
sb2 <- scalebar2(
  x = -73.97, y = 40.68, w = 0.001, n = 2, d = 0.015, units = "Miles"
)


# Satellite basemap overlaid with the points of interest (coloured by
# borough), a north arrow and a scale bar.
# Experiment with different point sizes and colors.
p1a <- p1 +
  geom_point(
    aes(x = LON, y = LAT, col = BOROUGHNAME),
    data = POI, size = 0.6
  ) +
  scale_colour_manual(
    name = "Borough Name",
    values = c(
      "Bronx" = "black", "Brooklyn" = "red",
      "Queens" = "green", "Manhattan" = "blue",
      "Staten Island" = "orange"
    )
  ) +
  # North arrow and its "N" label.
  geom_segment(
    aes(x = -73.989, xend = -73.989, y = 40.685, yend = 40.695),
    arrow = arrow(length = unit(4, "mm")), color = "pink", size = 2
  ) +
  annotate(
    "text",
    x = -73.989, y = 40.682, label = "N",
    color = "pink", size = 10, fontface = "bold"
  ) +
  # Scale bar: rectangles from sb1[[1]], tick labels from sb1[[2]].
  # The fill colours are set as a constant, so no fill mapping is needed, and
  # alpha must lie in [0, 1] (1 = fully opaque).
  geom_rect(
    data = sb1[[1]],
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    inherit.aes = FALSE, show.legend = FALSE,
    color = "green", fill = sb1[[1]]$fill.col, size = 0.5, alpha = 1
  ) +
  geom_text(
    data = sb1[[2]],
    aes(x = xlab, y = ylab, label = text),
    inherit.aes = FALSE, show.legend = FALSE,
    color = "green", size = 3
  ) +
  xlab("Longitude") +
  ylab("Latitude") +
  ggtitle("Google Satellite Map - NYC Points of Interest") +
  theme(
    plot.title = element_text(hjust = 0.5, colour = "black"),
    panel.border = element_rect(colour = "black", fill = NA, size = 1.5)
  )

# Points of interest outside the basemap's extent are dropped when the plot
# is drawn, which is what the "Removed ... rows" warning reports.
p1a
## Warning: Removed 14627 rows containing missing values (geom_point).