Chapter 3 Data Transformation
3.1 NYPD Arrests
To work with the arrest dates, we convert this feature to the Date class.
df_arrest_ytd$ARREST_DATE <- as.Date(df_arrest_ytd$ARREST_DATE, format = "%m/%d/%Y")
As per the mapping given by the NYPD - B (Bronx), S (Staten Island), K (Brooklyn), M (Manhattan), Q (Queens) - we replace the single-letter codes with full borough names for clearer understanding.
df_arrest_ytd$ARREST_BORO[df_arrest_ytd$ARREST_BORO == "B"] <- "Bronx"
df_arrest_ytd$ARREST_BORO[df_arrest_ytd$ARREST_BORO == "S"] <- "Staten Island"
df_arrest_ytd$ARREST_BORO[df_arrest_ytd$ARREST_BORO == "K"] <- "Brooklyn"
df_arrest_ytd$ARREST_BORO[df_arrest_ytd$ARREST_BORO == "M"] <- "Manhattan"
df_arrest_ytd$ARREST_BORO[df_arrest_ytd$ARREST_BORO == "Q"] <- "Queens"
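The same mapping can be written more compactly with a named lookup vector; a minimal sketch, equivalent to the assignments above assuming every row uses one of the five codes:

# named vector mapping NYPD borough codes to full names
boro_map <- c(B = "Bronx", S = "Staten Island", K = "Brooklyn",
              M = "Manhattan", Q = "Queens")
df_arrest_ytd$ARREST_BORO <- unname(boro_map[df_arrest_ytd$ARREST_BORO])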
This data also contains offenses that are duplicated under slightly different labels, which we consolidate into single categories.
df_arrest_ytd$OFNS_DESC[df_arrest_ytd$OFNS_DESC == "KIDNAPPING AND RELATED OFFENSES"] <- "KIDNAPPING & RELATED OFFENSES"
df_arrest_ytd$OFNS_DESC[df_arrest_ytd$OFNS_DESC == "KIDNAPPING"] <- "KIDNAPPING & RELATED OFFENSES"
df_arrest_ytd$OFNS_DESC[df_arrest_ytd$OFNS_DESC == "ADMINISTRATIVE CODES"] <- "ADMINISTRATIVE CODE"
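Equivalently, dplyr::recode() can consolidate the duplicate labels in one step; a sketch:

df_arrest_ytd <- df_arrest_ytd %>%
  mutate(OFNS_DESC = recode(OFNS_DESC,
    "KIDNAPPING AND RELATED OFFENSES" = "KIDNAPPING & RELATED OFFENSES",
    "KIDNAPPING" = "KIDNAPPING & RELATED OFFENSES",
    "ADMINISTRATIVE CODES" = "ADMINISTRATIVE CODE"))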
arrests_loc_boro <- df_arrest_ytd %>%
  filter(!OFNS_DESC %in% "") %>%
  group_by(OFNS_DESC, ARREST_BORO, Latitude, Longitude) %>%
  summarise(Total = n())

arrests_boro <- df_arrest_ytd %>%
  filter(!OFNS_DESC %in% "") %>%
  group_by(OFNS_DESC, ARREST_BORO) %>%
  summarise(Total = n())
write.csv(arrests_boro, './datasets/filtered/arrests_by_offenses_and_borough.csv', row.names = FALSE)
To create a filtered dataset for the interactive component, we summarise the offenses committed in each borough. This summary was then unstacked to produce the final version of the data; a sketch of that step follows.
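A minimal sketch of the unstacking step, assuming it was done with tidyr::pivot_wider() (the exact code may differ):

library(tidyr)

# one row per offense, one column of arrest totals per borough;
# borough/offense combinations with no arrests are filled with zero
arrests_boro_wide <- arrests_boro %>%
  pivot_wider(names_from = ARREST_BORO, values_from = Total, values_fill = 0)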
3.2 Human Trafficking
We aggregate the FBI human-trafficking incidents into totals per state.
df_fbi_ht_by_states <- df_fbi_ht %>%
  group_by(STATE_NAME) %>%
  summarise(Total = n())
The states below have no incidents recorded in the data, so zero-count rows are appended to keep them in the state-level summary.
new_info <- data.frame(c("New York", "District of Columbia", "Pennsylvania", "California", "Alabama", "Iowa"),
                       c(0, 0, 0, 0, 0, 0))
names(new_info) <- c("STATE_NAME", "Total")
df_fbi_ht_by_states <- rbind(df_fbi_ht_by_states, new_info)
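Alternatively, the same fill-in can be done with tidyr::complete(); a minimal sketch, where all_states is a hypothetical vector of every state name to report on (not defined in this project):

library(tidyr)

# hypothetical: all_states holds the full set of state names to report on;
# states absent from the incident data get Total = 0
df_fbi_ht_by_states_alt <- df_fbi_ht %>%
  group_by(STATE_NAME) %>%
  summarise(Total = n()) %>%
  complete(STATE_NAME = all_states, fill = list(Total = 0))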
3.3 Shootings in NYC
3.3.1 Pre-processing Data
colnames(df_shooting_old) <- toupper(colnames(df_shooting_old))
colnames(df_shooting_ytd) <- toupper(colnames(df_shooting_ytd))
- Column names in the old and year-to-date data differ in case, so we convert both to upper case before merging the two datasets later.
df_shooting_ytd$OCCUR_DATE <- as.Date(df_shooting_ytd$OCCUR_DATE, format = "%m/%d/%Y")
df_shooting_old$OCCUR_DATE <- as.Date(df_shooting_old$OCCUR_DATE, format = "%m/%d/%Y")

df_shooting_total <- bind_rows(df_shooting_ytd, df_shooting_old)
df_shooting_total$OCCUR_DATE <- as.Date(df_shooting_total$OCCUR_DATE, format = "%m/%d/%Y")
- Converting the date columns in both datasets to the Date class and then merging the two into one.
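Two quick checks (a sketch) that the merge behaved as expected:

# row counts should add up, and the date range should span both files
nrow(df_shooting_total) == nrow(df_shooting_ytd) + nrow(df_shooting_old)
range(df_shooting_total$OCCUR_DATE)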
3.3.2 Feature Engineering
- Creating more features from the existing columns to help with the analysis.
# extract year and month from date - #TODO: hour and dayofweek
df_shooting_total <- df_shooting_total %>%
  mutate(year = year(OCCUR_DATE),
         month = lubridate::month(OCCUR_DATE, label = TRUE),
         day = lubridate::wday(OCCUR_DATE, label = TRUE),
         murder = ifelse(STATISTICAL_MURDER_FLAG == TRUE, 1, 0))
- Extracted the year, month, and day of the week.
- Calculated the percentage change in the number of shootings every month; a sketch of this step follows the list.
- The patterns are very similar across years, so to keep the analysis current we filter the data to the last 5 years.
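A minimal sketch of the percentage-change step, assuming shootings are counted per month first; the exact code may differ:

# count shootings per month over the last 5 years, then compare each
# month with the previous one via dplyr::lag()
shootings_monthly <- df_shooting_total %>%
  filter(year >= max(year) - 4) %>%
  group_by(year, month) %>%
  summarise(shootings = n(), .groups = "drop") %>%
  mutate(pct_change = 100 * (shootings - lag(shootings)) / lag(shootings))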
3.4 Park Crime Data
The raw data we obtained comes as quarterly files covering 2015 to 2020. To enable effective analysis, we combine the individual datasets into one large .csv file.
library(readxl)
library(httr)
q1_2015_park <- read_excel('datasets/parks/nyc-park-crime-stats-q1-2015.xlsx', skip=4)
q2_2015_park <- read_excel('datasets/parks/nyc-park-crime-stats-q2-2015.xlsx', skip=4)
q3_2015_park <- read_excel('datasets/parks/nyc-park-crime-stats-q3-2015.xlsx', skip=4)
q4_2015_park <- read_excel('datasets/parks/nyc-park-crime-stats-q4-2015.xlsx', skip=3)

q1_2016_park <- read_excel('datasets/parks/nyc-park-crime-stats-q1-2016.xlsx', skip=3)
q2_2016_park <- read_excel('datasets/parks/nyc-park-crime-stats-q2-2016.xlsx', skip=3)
q3_2016_park <- read_excel('datasets/parks/nyc-park-crime-stats-q3-2016.xlsx', skip=3)
q4_2016_park <- read_excel('datasets/parks/nyc-park-crime-stats-q4-2016.xlsx', skip=3)

q1_2017_park <- read_excel('datasets/parks/nyc-park-crime-stats-q1-2017.xlsx', skip=3)
q2_2017_park <- read_excel('datasets/parks/nyc-park-crime-stats-q2-2017.xlsx', skip=3)
q3_2017_park <- read_excel('datasets/parks/nyc-park-crime-stats-q3-2017.xlsx', skip=3)
q4_2017_park <- read_excel('datasets/parks/nyc-park-crime-stats-q4-2017.xlsx', skip=3)

q1_2018_park <- read_excel('datasets/parks/nyc-park-crime-stats-q1-2018.xlsx', skip=3)
q2_2018_park <- read_excel('datasets/parks/nyc-park-crime-stats-q2-2018.xlsx', skip=4)
q3_2018_park <- read_excel('datasets/parks/nyc-park-crime-stats-q3-2018.xlsx', skip=3)
q4_2018_park <- read_excel('datasets/parks/nyc-park-crime-stats-q4-2018.xlsx', skip=3)

q1_2019_park <- read_excel('datasets/parks/nyc-park-crime-stats-q1-2019.xlsx', skip=3)
q2_2019_park <- read_excel('datasets/parks/nyc-park-crime-stats-q2-2019.xlsx', skip=3)
q3_2019_park <- read_excel('datasets/parks/nyc-park-crime-stats-q3-2019.xlsx', skip=3)
q4_2019_park <- read_excel('datasets/parks/nyc-park-crime-stats-q4-2019.xlsx', skip=3)

q1_2020_park <- read_excel('datasets/parks/nyc-park-crime-stats-q1-2020.xlsx', skip=3)
q2_2020_park <- read_excel('datasets/parks/nyc-park-crime-stats-q2-2020.xlsx', skip=3)
q3_2020_park <- read_excel('datasets/parks/nyc-park-crime-stats-q3-2020.xlsx', skip=3)
q4_2020_park <- read_excel('datasets/parks/nyc-park-crime-stats-q4-2020.xlsx', skip=3)

# sanity check: the column names should match across quarters
names(q1_2020_park)
names(q2_2020_park)
names(q3_2020_park)
names(q4_2020_park)
The data has been read in and now has to be combined into one large dataset: a concatenation of the rows, with year and quarter columns added.
q1_2015_park$year <- c(rep(2015, nrow(q1_2015_park)))
q1_2016_park$year <- c(rep(2016, nrow(q1_2016_park)))
q1_2017_park$year <- c(rep(2017, nrow(q1_2017_park)))
q1_2018_park$year <- c(rep(2018, nrow(q1_2018_park)))
q1_2019_park$year <- c(rep(2019, nrow(q1_2019_park)))
q1_2020_park$year <- c(rep(2020, nrow(q1_2020_park)))

q2_2015_park$year <- c(rep(2015, nrow(q2_2015_park)))
q2_2016_park$year <- c(rep(2016, nrow(q2_2016_park)))
q2_2017_park$year <- c(rep(2017, nrow(q2_2017_park)))
q2_2018_park$year <- c(rep(2018, nrow(q2_2018_park)))
q2_2019_park$year <- c(rep(2019, nrow(q2_2019_park)))
q2_2020_park$year <- c(rep(2020, nrow(q2_2020_park)))

q3_2015_park$year <- c(rep(2015, nrow(q3_2015_park)))
q3_2016_park$year <- c(rep(2016, nrow(q3_2016_park)))
q3_2017_park$year <- c(rep(2017, nrow(q3_2017_park)))
q3_2018_park$year <- c(rep(2018, nrow(q3_2018_park)))
q3_2019_park$year <- c(rep(2019, nrow(q3_2019_park)))
q3_2020_park$year <- c(rep(2020, nrow(q3_2020_park)))

q4_2015_park$year <- c(rep(2015, nrow(q4_2015_park)))
q4_2016_park$year <- c(rep(2016, nrow(q4_2016_park)))
q4_2017_park$year <- c(rep(2017, nrow(q4_2017_park)))
q4_2018_park$year <- c(rep(2018, nrow(q4_2018_park)))
q4_2019_park$year <- c(rep(2019, nrow(q4_2019_park)))
q4_2020_park$year <- c(rep(2020, nrow(q4_2020_park)))
q1_2015_park$quarter <- c(rep('q1', nrow(q1_2015_park)))
q1_2016_park$quarter <- c(rep('q1', nrow(q1_2016_park)))
q1_2017_park$quarter <- c(rep('q1', nrow(q1_2017_park)))
q1_2018_park$quarter <- c(rep('q1', nrow(q1_2018_park)))
q1_2019_park$quarter <- c(rep('q1', nrow(q1_2019_park)))
q1_2020_park$quarter <- c(rep('q1', nrow(q1_2020_park)))

q2_2015_park$quarter <- c(rep('q2', nrow(q2_2015_park)))
q2_2016_park$quarter <- c(rep('q2', nrow(q2_2016_park)))
q2_2017_park$quarter <- c(rep('q2', nrow(q2_2017_park)))
q2_2018_park$quarter <- c(rep('q2', nrow(q2_2018_park)))
q2_2019_park$quarter <- c(rep('q2', nrow(q2_2019_park)))
q2_2020_park$quarter <- c(rep('q2', nrow(q2_2020_park)))

q3_2015_park$quarter <- c(rep('q3', nrow(q3_2015_park)))
q3_2016_park$quarter <- c(rep('q3', nrow(q3_2016_park)))
q3_2017_park$quarter <- c(rep('q3', nrow(q3_2017_park)))
q3_2018_park$quarter <- c(rep('q3', nrow(q3_2018_park)))
q3_2019_park$quarter <- c(rep('q3', nrow(q3_2019_park)))
q3_2020_park$quarter <- c(rep('q3', nrow(q3_2020_park)))

q4_2015_park$quarter <- c(rep('q4', nrow(q4_2015_park)))
q4_2016_park$quarter <- c(rep('q4', nrow(q4_2016_park)))
q4_2017_park$quarter <- c(rep('q4', nrow(q4_2017_park)))
q4_2018_park$quarter <- c(rep('q4', nrow(q4_2018_park)))
q4_2019_park$quarter <- c(rep('q4', nrow(q4_2019_park)))
q4_2020_park$quarter <- c(rep('q4', nrow(q4_2020_park)))
colnames(q1_2015_park) <- toupper(colnames(q1_2015_park))
colnames(q2_2015_park) <- toupper(colnames(q2_2015_park))
colnames(q3_2015_park) <- toupper(colnames(q3_2015_park))
colnames(q4_2015_park) <- toupper(colnames(q4_2015_park))
colnames(q1_2016_park) <- toupper(colnames(q1_2016_park))
colnames(q2_2016_park) <- toupper(colnames(q2_2016_park))
colnames(q3_2016_park) <- toupper(colnames(q3_2016_park))
colnames(q4_2016_park) <- toupper(colnames(q4_2016_park))
colnames(q1_2017_park) <- toupper(colnames(q1_2017_park))
colnames(q2_2017_park) <- toupper(colnames(q2_2017_park))
colnames(q3_2017_park) <- toupper(colnames(q3_2017_park))
colnames(q4_2017_park) <- toupper(colnames(q4_2017_park))
colnames(q1_2018_park) <- toupper(colnames(q1_2018_park))
colnames(q2_2018_park) <- toupper(colnames(q2_2018_park))
colnames(q3_2018_park) <- toupper(colnames(q3_2018_park))
colnames(q4_2018_park) <- toupper(colnames(q4_2018_park))
colnames(q1_2019_park) <- toupper(colnames(q1_2019_park))
colnames(q2_2019_park) <- toupper(colnames(q2_2019_park))
colnames(q3_2019_park) <- toupper(colnames(q3_2019_park))
colnames(q4_2019_park) <- toupper(colnames(q4_2019_park))
colnames(q1_2020_park) <- toupper(colnames(q1_2020_park))
colnames(q2_2020_park) <- toupper(colnames(q2_2020_park))
colnames(q3_2020_park) <- toupper(colnames(q3_2020_park))
colnames(q4_2020_park) <- toupper(colnames(q4_2020_park))
Putting the dataframes together
<- do.call("rbind", list(q1_2015_park, q2_2015_park ,q3_2015_park, q4_2015_park, q1_2016_park,q2_2016_park, q3_2016_park, q4_2016_park, q1_2017_park, q2_2017_park, q3_2017_park, q4_2017_park, q1_2018_park, q2_2018_park, q3_2018_park, q4_2018_park, q1_2019_park, q2_2019_park, q3_2019_park, q4_2019_park, q1_2020_park, q2_2020_park, q3_2020_park, q4_2020_park)) park_data
library(dplyr)

# drop the summary 'TOTAL' rows so only individual parks remain
park_data <- park_data %>% filter(PARK != 'TOTAL')
Creating a new CSV file for the combined output.
write.csv(park_data, './datasets/parks/finalpark.csv', row.names = FALSE)
Testing Dataset - Sanity Checks
nrow(park_data)
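The repeated read/label steps above could also be written as a loop; a minimal sketch, assuming the same file-naming scheme and a fixed header offset (in practice the skip value varies between 3 and 4 across files):

library(readxl)
library(dplyr)
library(purrr)

# read every quarter of every year, upper-case the column names, and tag
# each chunk with its year and quarter before row-binding everything
park_data_alt <- expand.grid(q = 1:4, y = 2015:2020) %>%
  pmap_dfr(function(q, y) {
    path <- sprintf('datasets/parks/nyc-park-crime-stats-q%d-%d.xlsx', q, y)
    df <- read_excel(path, skip = 3)  # simplifying assumption: skip = 3 everywhere
    colnames(df) <- toupper(colnames(df))
    df %>% mutate(year = y, quarter = paste0('q', q))
  })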
3.5 Hate Crime Data
This dataset records boroughs by their county names, so we map them to the familiar borough names. (Bronx and Queens counties already match their borough names.)
<- read.csv("datasets/NYPD_Hate_Crimes.csv")
df_hate #View(df_hate)
$County[df_hate$County == "KINGS"] <- "BROOKLYN"
df_hate$County[df_hate$County == "NEW YORK"] <- "MANHATTAN"
df_hate$County[df_hate$County == "RICHMOND"] <- "STATEN ISLAND" df_hate
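A quick check (a sketch) that only the five borough names remain after the remapping:

# County should now contain only BRONX, BROOKLYN, MANHATTAN, QUEENS, STATEN ISLAND
table(df_hate$County)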