Chapter 3 Data Transformation

3.1 NYPD Arrests

To make the arrest dates usable, we convert this feature to the Date class.

df_arrest_ytd$ARREST_DATE <- as.Date(df_arrest_ytd$ARREST_DATE, format = "%m/%d/%Y")

Per the mapping provided by the NYPD (B = Bronx, S = Staten Island, K = Brooklyn, M = Manhattan, Q = Queens), we replace the single-letter codes with full borough names for readability.

df_arrest_ytd$ARREST_BORO[df_arrest_ytd$ARREST_BORO == "B"] <- "Bronx"
df_arrest_ytd$ARREST_BORO[df_arrest_ytd$ARREST_BORO == "S"] <- "Staten Island"
df_arrest_ytd$ARREST_BORO[df_arrest_ytd$ARREST_BORO == "K"] <- "Brooklyn"
df_arrest_ytd$ARREST_BORO[df_arrest_ytd$ARREST_BORO == "M"] <- "Manhattan"
df_arrest_ytd$ARREST_BORO[df_arrest_ytd$ARREST_BORO == "Q"] <- "Queens"

The data also contains offenses that are duplicated under slightly different labels; we consolidate these into a single category.

df_arrest_ytd$OFNS_DESC[df_arrest_ytd$OFNS_DESC == "KIDNAPPING AND RELATED OFFENSES"] <- "KIDNAPPING & RELATED OFFENSES"
df_arrest_ytd$OFNS_DESC[df_arrest_ytd$OFNS_DESC == "KIDNAPPING"] <- "KIDNAPPING & RELATED OFFENSES"
df_arrest_ytd$OFNS_DESC[df_arrest_ytd$OFNS_DESC == "ADMINISTRATIVE CODES"] <- "ADMINISTRATIVE CODE"
arrests_loc_boro <- df_arrest_ytd %>% 
  filter(!OFNS_DESC %in% "") %>%
  group_by(OFNS_DESC, ARREST_BORO, Latitude, Longitude) %>%
  summarise(Total = n())

arrests_boro <- df_arrest_ytd %>% 
  filter(!OFNS_DESC %in% "") %>%
  group_by(OFNS_DESC, ARREST_BORO) %>%
  summarise(Total = n())

write.csv(arrests_boro, './datasets/filtered/arrests_by_offenses_and_borough.csv', row.names = FALSE)

To build a filtered dataset for the interactive component, we summarise the number of offenses committed in each borough. This summary was then unstacked (pivoted to wide format) to produce the final version of the data; a sketch of that step follows.
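The unstacking step itself is not shown above; a minimal sketch using tidyr::pivot_wider, assuming the interactive component expects one column per borough:

library(dplyr)
library(tidyr)

# one row per offense, one column per borough, zero where no arrests occurred
arrests_wide <- arrests_boro %>%
  pivot_wider(names_from = ARREST_BORO, values_from = Total, values_fill = 0)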

3.2 Human Trafficking

We aggregate the FBI human-trafficking records by state, then append the states missing from the data with a count of zero so that every state appears in the summary.

df_fbi_ht_by_states <- df_fbi_ht %>%
  group_by(STATE_NAME) %>%
  summarise(Total = n())

new_info <- data.frame(STATE_NAME = c("New York", "District of Columbia", "Pennsylvania",
                                      "California", "Alabama", "Iowa"),
                       Total = 0)
df_fbi_ht_by_states <- rbind(df_fbi_ht_by_states, new_info)
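Hard-coding the missing states works but is brittle; an alternative sketch using tidyr::complete and base R's state.name vector (an assumption: state.name covers only the 50 states, so District of Columbia must still be added explicitly):

library(tidyr)

# fill in every state not present in the data with a zero count
all_states <- c(state.name, "District of Columbia")
df_fbi_ht_by_states <- df_fbi_ht_by_states %>%
  complete(STATE_NAME = all_states, fill = list(Total = 0))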

3.3 Shootings in NYC

3.3.1 Pre-processing Data

colnames(df_shooting_old) <- toupper(colnames(df_shooting_old))
colnames(df_shooting_ytd) <- toupper(colnames(df_shooting_ytd))
  • The column names in the old and year-to-date data differ in letter case, so we convert both to uppercase before merging the two datasets later.
df_shooting_ytd$OCCUR_DATE <- as.Date(df_shooting_ytd$OCCUR_DATE, format = "%m/%d/%Y")
df_shooting_old$OCCUR_DATE <- as.Date(df_shooting_old$OCCUR_DATE, format = "%m/%d/%Y")

df_shooting_total <- bind_rows(df_shooting_ytd, df_shooting_old)
  • Converting the date column in each dataset to the Date class, then merging the two into a single dataset.

3.3.2 Feature Engineering

  • Creating additional features from the existing ones to support the analysis.
# extract year, month, and day of week from the date - TODO: extract the hour as well
df_shooting_total <- df_shooting_total %>%
  mutate(year = lubridate::year(OCCUR_DATE),
         month = lubridate::month(OCCUR_DATE, label = TRUE),
         day = lubridate::wday(OCCUR_DATE, label = TRUE),
         murder = ifelse(STATISTICAL_MURDER_FLAG == TRUE, 1, 0))
  • Extracted the year, month, and day of the week.
  • Calculated the percentage change in the number of shootings each month (a sketch of this step follows the list).
  • The data is broadly similar across years, so to keep the analysis current it is filtered to the last 5 years.
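Neither the percentage-change calculation nor the five-year filter is shown above; a minimal sketch, assuming the year and month features created earlier:

library(dplyr)

# monthly shooting counts, then month-over-month percentage change
monthly_shootings <- df_shooting_total %>%
  count(year, month, name = "shootings") %>%
  arrange(year, month) %>%
  mutate(pct_change = 100 * (shootings - lag(shootings)) / lag(shootings))

# keep only the most recent five years of data
df_shooting_total <- df_shooting_total %>%
  filter(year >= max(year) - 4)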

3.4 Park Crime Data

The raw data we obtained came as quarterly files covering the years 2015 to 2020. To enable effective analysis, we combine the individual datasets into one large .csv file.

library(readxl)
library(httr)
q1_2015_park <-read_excel('datasets/parks/nyc-park-crime-stats-q1-2015.xlsx', skip=4)
q2_2015_park <-read_excel('datasets/parks/nyc-park-crime-stats-q2-2015.xlsx', skip=4)
q3_2015_park <-read_excel('datasets/parks/nyc-park-crime-stats-q3-2015.xlsx', skip=4)
q4_2015_park <-read_excel('datasets/parks/nyc-park-crime-stats-q4-2015.xlsx', skip=3)

q1_2016_park <-read_excel('datasets/parks/nyc-park-crime-stats-q1-2016.xlsx', skip=3)
q2_2016_park <-read_excel('datasets/parks/nyc-park-crime-stats-q2-2016.xlsx', skip=3)
q3_2016_park <-read_excel('datasets/parks/nyc-park-crime-stats-q3-2016.xlsx', skip=3)
q4_2016_park <-read_excel('datasets/parks/nyc-park-crime-stats-q4-2016.xlsx', skip=3)

q1_2017_park <-read_excel('datasets/parks/nyc-park-crime-stats-q1-2017.xlsx', skip=3)
q2_2017_park <-read_excel('datasets/parks/nyc-park-crime-stats-q2-2017.xlsx', skip=3)
q3_2017_park <-read_excel('datasets/parks/nyc-park-crime-stats-q3-2017.xlsx', skip=3)
q4_2017_park <-read_excel('datasets/parks/nyc-park-crime-stats-q4-2017.xlsx', skip=3)

q1_2018_park <-read_excel('datasets/parks/nyc-park-crime-stats-q1-2018.xlsx', skip=3)
q2_2018_park <-read_excel('datasets/parks/nyc-park-crime-stats-q2-2018.xlsx', skip=4)
q3_2018_park <-read_excel('datasets/parks/nyc-park-crime-stats-q3-2018.xlsx', skip=3)
q4_2018_park <-read_excel('datasets/parks/nyc-park-crime-stats-q4-2018.xlsx', skip=3)

q1_2019_park <-read_excel('datasets/parks/nyc-park-crime-stats-q1-2019.xlsx', skip=3)
q2_2019_park <-read_excel('datasets/parks/nyc-park-crime-stats-q2-2019.xlsx', skip=3)
q3_2019_park <-read_excel('datasets/parks/nyc-park-crime-stats-q3-2019.xlsx', skip=3)
q4_2019_park <-read_excel('datasets/parks/nyc-park-crime-stats-q4-2019.xlsx', skip=3)

q1_2020_park <-read_excel('datasets/parks/nyc-park-crime-stats-q1-2020.xlsx', skip=3)
q2_2020_park <-read_excel('datasets/parks/nyc-park-crime-stats-q2-2020.xlsx', skip=3)
q3_2020_park <-read_excel('datasets/parks/nyc-park-crime-stats-q3-2020.xlsx', skip=3)
q4_2020_park <-read_excel('datasets/parks/nyc-park-crime-stats-q4-2020.xlsx', skip=3)
# quick check that the four 2020 files share the same column layout
names(q1_2020_park)
names(q2_2020_park)
names(q3_2020_park)
names(q4_2020_park)

The data has been read in and must now be combined into one large dataset by concatenating the rows, with year and quarter columns added to each.

q1_2015_park$year <- 2015
q1_2016_park$year <- 2016
q1_2017_park$year <- 2017
q1_2018_park$year <- 2018
q1_2019_park$year <- 2019
q1_2020_park$year <- 2020

q2_2015_park$year <- 2015
q2_2016_park$year <- 2016
q2_2017_park$year <- 2017
q2_2018_park$year <- 2018
q2_2019_park$year <- 2019
q2_2020_park$year <- 2020

q3_2015_park$year <- 2015
q3_2016_park$year <- 2016
q3_2017_park$year <- 2017
q3_2018_park$year <- 2018
q3_2019_park$year <- 2019
q3_2020_park$year <- 2020

q4_2015_park$year <- 2015
q4_2016_park$year <- 2016
q4_2017_park$year <- 2017
q4_2018_park$year <- 2018
q4_2019_park$year <- 2019
q4_2020_park$year <- 2020

q1_2015_park$quarter <- 'q1'
q1_2016_park$quarter <- 'q1'
q1_2017_park$quarter <- 'q1'
q1_2018_park$quarter <- 'q1'
q1_2019_park$quarter <- 'q1'
q1_2020_park$quarter <- 'q1'

q2_2015_park$quarter <- 'q2'
q2_2016_park$quarter <- 'q2'
q2_2017_park$quarter <- 'q2'
q2_2018_park$quarter <- 'q2'
q2_2019_park$quarter <- 'q2'
q2_2020_park$quarter <- 'q2'

q3_2015_park$quarter <- 'q3'
q3_2016_park$quarter <- 'q3'
q3_2017_park$quarter <- 'q3'
q3_2018_park$quarter <- 'q3'
q3_2019_park$quarter <- 'q3'
q3_2020_park$quarter <- 'q3'

q4_2015_park$quarter <- 'q4'
q4_2016_park$quarter <- 'q4'
q4_2017_park$quarter <- 'q4'
q4_2018_park$quarter <- 'q4'
q4_2019_park$quarter <- 'q4'
q4_2020_park$quarter <- 'q4'

colnames(q1_2015_park)<- toupper(colnames(q1_2015_park))
colnames(q2_2015_park)<- toupper(colnames(q2_2015_park))
colnames(q3_2015_park)<- toupper(colnames(q3_2015_park))
colnames(q4_2015_park)<- toupper(colnames(q4_2015_park))

colnames(q1_2016_park)<- toupper(colnames(q1_2016_park))
colnames(q2_2016_park)<- toupper(colnames(q2_2016_park))
colnames(q3_2016_park)<- toupper(colnames(q3_2016_park))
colnames(q4_2016_park)<- toupper(colnames(q4_2016_park))

colnames(q1_2017_park)<- toupper(colnames(q1_2017_park))
colnames(q2_2017_park)<- toupper(colnames(q2_2017_park))
colnames(q3_2017_park)<- toupper(colnames(q3_2017_park))
colnames(q4_2017_park)<- toupper(colnames(q4_2017_park))

colnames(q1_2018_park)<- toupper(colnames(q1_2018_park))
colnames(q2_2018_park)<- toupper(colnames(q2_2018_park))
colnames(q3_2018_park)<- toupper(colnames(q3_2018_park))
colnames(q4_2018_park)<- toupper(colnames(q4_2018_park))

colnames(q1_2019_park)<- toupper(colnames(q1_2019_park))
colnames(q2_2019_park)<- toupper(colnames(q2_2019_park))
colnames(q3_2019_park)<- toupper(colnames(q3_2019_park))
colnames(q4_2019_park)<- toupper(colnames(q4_2019_park))

colnames(q1_2020_park)<- toupper(colnames(q1_2020_park))
colnames(q2_2020_park)<- toupper(colnames(q2_2020_park))
colnames(q3_2020_park)<- toupper(colnames(q3_2020_park))
colnames(q4_2020_park)<- toupper(colnames(q4_2020_park))

Putting the dataframes together:

park_data <- do.call("rbind", list(
  q1_2015_park, q2_2015_park, q3_2015_park, q4_2015_park,
  q1_2016_park, q2_2016_park, q3_2016_park, q4_2016_park,
  q1_2017_park, q2_2017_park, q3_2017_park, q4_2017_park,
  q1_2018_park, q2_2018_park, q3_2018_park, q4_2018_park,
  q1_2019_park, q2_2019_park, q3_2019_park, q4_2019_park,
  q1_2020_park, q2_2020_park, q3_2020_park, q4_2020_park))
library(dplyr)
park_data <- park_data %>% filter(PARK != 'TOTAL')
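The per-file blocks above could be collapsed into a loop; a sketch, assuming the file-naming scheme is uniform (the handful of files that need skip = 4 instead of 3 would still require special-casing):

library(readxl)
library(dplyr)

park_list <- list()
for (yr in 2015:2020) {
  for (qt in paste0('q', 1:4)) {
    path <- sprintf('datasets/parks/nyc-park-crime-stats-%s-%d.xlsx', qt, yr)
    df <- read_excel(path, skip = 3)  # a few files need skip = 4
    df$year <- yr
    df$quarter <- qt
    colnames(df) <- toupper(colnames(df))
    park_list[[paste(qt, yr, sep = '_')]] <- df
  }
}
park_data <- bind_rows(park_list) %>% filter(PARK != 'TOTAL')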

Writing the combined data out to a new .csv file.

write.csv(park_data, './datasets/parks/finalpark.csv', row.names = FALSE)

Testing Dataset - Sanity Checks

nrow(park_data)
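Two further checks could confirm the concatenation worked; a sketch (note the columns are uppercase after the renaming above):

# every year/quarter combination should be present, and no TOTAL rows should remain
table(park_data$YEAR, park_data$QUARTER)
stopifnot(!any(park_data$PARK == 'TOTAL'))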

3.5 Hate Crime Data

This dataset identifies boroughs by their county names (Kings, New York, Richmond), which have to be mapped to the corresponding borough names.

df_hate <- read.csv("datasets/NYPD_Hate_Crimes.csv")
df_hate$County[df_hate$County == "KINGS"] <- "BROOKLYN"
df_hate$County[df_hate$County == "NEW YORK"] <- "MANHATTAN"
df_hate$County[df_hate$County == "RICHMOND"] <- "STATEN ISLAND"