This Notebook is an attempt to show how R can be used for simple exploratory analysis on a Dataset with Categorical Variables as well as datetime data
#Load Required Libraries
library(readr)
library(dplyr)
library(magrittr)
library(ggplot2)
#Import the Data, you can use the 'stringsAsFactors = TRUE' argument if you wish
raw_data <- read.csv("C:/Users/Steve/Desktop/Portfolio/Datasets/Industrial Safety and Health Analytics Data/IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv")
#Preview the first 6 observations of the dataset
head(raw_data)
#Get summary statistics of the dataset
summary(raw_data)
#Convert the dataset to a data table to allow for enhanced operations
raw_data <- data.table::as.data.table(raw_data)
#Confirm data type of the dataset and it's variables
str(raw_data)
Classes 'data.table' and 'data.frame': 425 obs. of 11 variables: $ X : int 0 1 2 3 4 5 6 7 8 9 ... $ Data : chr "2016-01-01 00:00:00" "2016-01-02 00:00:00" "2016-01-06 00:00:00" "2016-01-08 00:00:00" ... $ Countries : chr "Country_01" "Country_02" "Country_01" "Country_01" ... $ Local : chr "Local_01" "Local_02" "Local_03" "Local_04" ... $ Industry.Sector : chr "Mining" "Mining" "Mining" "Mining" ... $ Accident.Level : chr "I" "I" "I" "I" ... $ Potential.Accident.Level: chr "IV" "IV" "III" "I" ... $ Genre : chr "Male" "Male" "Male" "Male" ... $ Employee.or.Third.Party : chr "Third Party" "Employee" "Third Party (Remote)" "Third Party" ... $ Critical.Risk : chr "Pressed" "Pressurized Systems" "Manual Tools" "Others" ... $ Description : chr "While removing the drill rod of the Jumbo 08 for maintenance, the supervisor proceeds to loosen the support of "| __truncated__ "During the activation of a sodium sulphide pump, the piping was uncoupled and the sulfide solution was designed"| __truncated__ "In the sub-station MILPO located at level +170 when the collaborator was doing the excavation work with a pick "| __truncated__ "Being 9:45 am. approximately in the Nv. 1880 CX-695 OB7, the personnel begins the task of unlocking the Soquet "| __truncated__ ... - attr(*, ".internal.selfref")=<externalptr>
#Drop the X variable
raw_data <- select(raw_data, - X)
#Convert grouped Variables to Correct Data type
raw_data %<>% mutate_if(is.character,as.factor)
#Convert Description Variable to character
raw_data$Description <- as.character(raw_data$Description, as.character)
#Convert Data Variable to Date, and change the column name to Date
raw_data$Data <- strptime(raw_data$Data, "%Y-%m-%d %H:%M")
#Rename Data and Genre columns
library(data.table)
setnames(raw_data, c("Data", "Genre"), c("Date", "Gender"))
Attaching package: 'data.table'
The following objects are masked from 'package:dplyr':
between, first, last
#Inspect Data for missing values
vis_miss(raw_data[,2:9])
Error in vis_miss(raw_data[, 2:9]): could not find function "vis_miss" Traceback:
#Relations between Country and Accident Level
table(raw_data[,2], raw_data[,5])
I II III IV V
Country_01 180 19 21 23 8
Country_02 99 19 7 5 0
Country_03 37 2 3 2 0
#Relations between Industry Sector and Accident Level
table(raw_data[,4], raw_data[,5])
I II III IV V
Metals 107 12 7 7 1
Mining 167 26 20 21 7
Others 42 2 4 2 0
#Aggregating Data
ct <- raw_data %>%
group_by(Countries, Industry.Sector) %>%
count(Gender)
head(ct)
| Countries | Industry.Sector | Gender | n |
|---|---|---|---|
| <fct> | <fct> | <fct> | <int> |
| Country_01 | Metals | Female | 2 |
| Country_01 | Metals | Male | 44 |
| Country_01 | Mining | Female | 4 |
| Country_01 | Mining | Male | 199 |
| Country_01 | Others | Female | 1 |
| Country_01 | Others | Male | 1 |
ecols <- c(Mining = "brown2", Metals = "blue2",
Others = "green4")
ggplot(raw_data) +
geom_bar(aes(x = Accident.Level, fill = Industry.Sector)) +
scale_fill_manual(values = ecols) +
facet_wrap(~ Gender)
#Line Graph for Accident Frequencies by Date
#Convert Date Variable from POSIXlt to POSIXct
raw_data$Date <- as.POSIXct(raw_data$Date, format="%Y-%m-%d")
#Create a data frame
short.date.agg <- raw_data %>%
mutate(ym = format(Date, "%Y-%m")) %>%
group_by(ym) %>%
count()
ggplot(short.date.agg, aes(ym, n, group = 1)) +
geom_point() +
geom_line(color = "#00AFBB", size = 1) +
ggtitle("Accident Frequency by Month-Year ") +
xlab("Date") + ylab("Count") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size =8))
#Load Libraries
library(tidytext)
#Split Variable into words
desc <- strsplit(raw_data$Description, split = " ")
desc <- unlist(desc)
#Remove punctuations
desc <- gsub("[[:punct:][:blank:]]+", "", desc)
#convert text to lowercase
desc <- tolower(desc)
#Remove stop words
desc_df <- data.frame(word = desc)
mb <- anti_join(desc_df, stop_words, by = 'word' )
#Get Word Frequencies
mb %>% count(word, sort = T)
| word | n |
|---|---|
| <chr> | <int> |
| causing | 166 |
| hand | 166 |
| left | 160 |
| employee | 154 |
| 143 | |
| operator | 122 |
| injury | 105 |
| time | 101 |
| activity | 94 |
| moment | 79 |
| equipment | 77 |
| accident | 73 |
| level | 71 |
| collaborator | 68 |
| finger | 68 |
| pipe | 67 |
| floor | 65 |
| assistant | 62 |
| worker | 60 |
| mesh | 58 |
| support | 58 |
| rock | 55 |
| safety | 53 |
| approximately | 48 |
| meters | 48 |
| height | 47 |
| metal | 42 |
| team | 42 |
| injured | 41 |
| kg | 41 |
| ... | ... |
| wila | 1 |
| wilber | 1 |
| wilder | 1 |
| wilmer | 1 |
| winche | 1 |
| winemaker | 1 |
| winery | 1 |
| wires | 1 |
| withdrawing | 1 |
| withdrew | 1 |
| woman | 1 |
| workermechanic | 1 |
| workplace | 1 |
| wounding | 1 |
| x3 | 1 |
| x32 | 1 |
| x40cm | 1 |
| x6 | 1 |
| xray | 1 |
| xrd | 1 |
| xxx | 1 |
| yaranga | 1 |
| ydrs | 1 |
| yolk | 1 |
| z014 | 1 |
| z132 | 1 |
| z332 | 1 |
| zaf | 1 |
| zamac | 1 |
| zinco | 1 |
#Get +ve words
positive <- get_sentiments("bing") %>%
filter(sentiment == "positive")
semi_join(mb, positive, by = "word") %>%
count(word, sort = TRUE)
| word | n |
|---|---|
| <chr> | <int> |
| support | 58 |
| top | 16 |
| master | 12 |
| hot | 8 |
| clean | 7 |
| protection | 7 |
| striking | 7 |
| positive | 6 |
| protective | 5 |
| divine | 4 |
| free | 4 |
| recovery | 4 |
| supported | 4 |
| supporting | 4 |
| supports | 4 |
| correct | 3 |
| fine | 3 |
| safe | 3 |
| boom | 2 |
| excited | 2 |
| fans | 2 |
| solid | 2 |
| strong | 2 |
| superior | 2 |
| comfort | 1 |
| correctly | 1 |
| effective | 1 |
| energize | 1 |
| facilitate | 1 |
| favor | 1 |
| flexible | 1 |
| freed | 1 |
| improve | 1 |
| leads | 1 |
| led | 1 |
| portable | 1 |
| progress | 1 |
| progressive | 1 |
| promptly | 1 |
| properly | 1 |
| rapid | 1 |
| remedy | 1 |
| sensation | 1 |
| soft | 1 |
| stability | 1 |
| success | 1 |
| thinner | 1 |
| timely | 1 |
#Get -ve words
negative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
semi_join(mb, negative, by = "word") %>%
count(word, sort = T)
| word | n |
|---|---|
| <chr> | <int> |
| injury | 105 |
| falls | 31 |
| fall | 25 |
| falling | 19 |
| allergic | 16 |
| wound | 16 |
| fell | 14 |
| pain | 14 |
| sting | 14 |
| tank | 14 |
| burn | 13 |
| grating | 13 |
| struck | 11 |
| suffered | 11 |
| swelling | 11 |
| split | 10 |
| superficial | 10 |
| noise | 9 |
| untimely | 9 |
| broken | 8 |
| discomfort | 8 |
| suffering | 8 |
| blow | 7 |
| gutter | 7 |
| irritation | 7 |
| suffers | 7 |
| breaks | 6 |
| emergency | 6 |
| loose | 6 |
| loses | 6 |
| ... | ... |
| neglected | 1 |
| object | 1 |
| obstruct | 1 |
| obstructing | 1 |
| obstruction | 1 |
| pig | 1 |
| pique | 1 |
| resistance | 1 |
| retreat | 1 |
| risk | 1 |
| scare | 1 |
| shake | 1 |
| slaughter | 1 |
| slow | 1 |
| steep | 1 |
| strained | 1 |
| stumbles | 1 |
| stun | 1 |
| subjection | 1 |
| symptoms | 1 |
| taut | 1 |
| trapped | 1 |
| traumatic | 1 |
| twisted | 1 |
| uneven | 1 |
| unstable | 1 |
| violent | 1 |
| violently | 1 |
| wedge | 1 |
| weed | 1 |
#Alternative methods for getting +ve & -ve sentiments
bing <- get_sentiments("bing")
my_sentiment <- mb %>%
inner_join(bing) %>%
count(word, sentiment, sort = T)
Joining, by = "word"
#Word CLoud Visualization
library(ggwordcloud)
my_sentiment %>%
filter(n>2) %>%
ggplot(aes(size = n, label = word, color = sentiment)) +
geom_text_wordcloud() +
scale_size_area(max_size = 20) +
ggtitle("Contribution to Sentiment")