This Notebook is an attempt to show how R can be used for simple exploratory analysis on a Dataset with Categorical Variables as well as datetime data
#Load Required Libraries
library(readr)
library(dplyr)
library(magrittr)
library(ggplot2)
#Import the Data, you can use the 'stringsAsFactors = TRUE' argument if you wish
raw_data <- read.csv("C:/Users/Steve/Desktop/Portfolio/Datasets/Industrial Safety and Health Analytics Data/IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv")
#Preview the first 6 observations of the dataset
head(raw_data)
#Get summary statistics of the dataset
summary(raw_data)
#Convert the dataset to a data table to allow for enhanced operations
raw_data <- data.table::as.data.table(raw_data)
#Confirm data type of the dataset and it's variables
str(raw_data)
Classes 'data.table' and 'data.frame': 425 obs. of 11 variables: $ X : int 0 1 2 3 4 5 6 7 8 9 ... $ Data : chr "2016-01-01 00:00:00" "2016-01-02 00:00:00" "2016-01-06 00:00:00" "2016-01-08 00:00:00" ... $ Countries : chr "Country_01" "Country_02" "Country_01" "Country_01" ... $ Local : chr "Local_01" "Local_02" "Local_03" "Local_04" ... $ Industry.Sector : chr "Mining" "Mining" "Mining" "Mining" ... $ Accident.Level : chr "I" "I" "I" "I" ... $ Potential.Accident.Level: chr "IV" "IV" "III" "I" ... $ Genre : chr "Male" "Male" "Male" "Male" ... $ Employee.or.Third.Party : chr "Third Party" "Employee" "Third Party (Remote)" "Third Party" ... $ Critical.Risk : chr "Pressed" "Pressurized Systems" "Manual Tools" "Others" ... $ Description : chr "While removing the drill rod of the Jumbo 08 for maintenance, the supervisor proceeds to loosen the support of "| __truncated__ "During the activation of a sodium sulphide pump, the piping was uncoupled and the sulfide solution was designed"| __truncated__ "In the sub-station MILPO located at level +170 when the collaborator was doing the excavation work with a pick "| __truncated__ "Being 9:45 am. approximately in the Nv. 1880 CX-695 OB7, the personnel begins the task of unlocking the Soquet "| __truncated__ ... - attr(*, ".internal.selfref")=<externalptr>
#Drop the X variable
raw_data <- select(raw_data, - X)
#Convert grouped Variables to Correct Data type
raw_data %<>% mutate_if(is.character,as.factor)
#Convert Description Variable to character
raw_data$Description <- as.character(raw_data$Description, as.character)
#Convert Data Variable to Date, and change the column name to Date
raw_data$Data <- strptime(raw_data$Data, "%Y-%m-%d %H:%M")
#Rename Data and Genre columns
library(data.table)
setnames(raw_data, c("Data", "Genre"), c("Date", "Gender"))
Attaching package: 'data.table' The following objects are masked from 'package:dplyr': between, first, last
#Inspect Data for missing values
vis_miss(raw_data[,2:9])
Error in vis_miss(raw_data[, 2:9]): could not find function "vis_miss" Traceback:
#Relations between Country and Accident Level
table(raw_data[,2], raw_data[,5])
I II III IV V Country_01 180 19 21 23 8 Country_02 99 19 7 5 0 Country_03 37 2 3 2 0
#Relations between Industry Sector and Accident Level
table(raw_data[,4], raw_data[,5])
I II III IV V Metals 107 12 7 7 1 Mining 167 26 20 21 7 Others 42 2 4 2 0
#Aggregating Data
ct <- raw_data %>%
group_by(Countries, Industry.Sector) %>%
count(Gender)
head(ct)
Countries | Industry.Sector | Gender | n |
---|---|---|---|
<fct> | <fct> | <fct> | <int> |
Country_01 | Metals | Female | 2 |
Country_01 | Metals | Male | 44 |
Country_01 | Mining | Female | 4 |
Country_01 | Mining | Male | 199 |
Country_01 | Others | Female | 1 |
Country_01 | Others | Male | 1 |
ecols <- c(Mining = "brown2", Metals = "blue2",
Others = "green4")
ggplot(raw_data) +
geom_bar(aes(x = Accident.Level, fill = Industry.Sector)) +
scale_fill_manual(values = ecols) +
facet_wrap(~ Gender)
#Line Graph for Accident Frequencies by Date
#Convert Date Variable from POSIXlt to POSIXct
raw_data$Date <- as.POSIXct(raw_data$Date, format="%Y-%m-%d")
#Create a data frame
short.date.agg <- raw_data %>%
mutate(ym = format(Date, "%Y-%m")) %>%
group_by(ym) %>%
count()
ggplot(short.date.agg, aes(ym, n, group = 1)) +
geom_point() +
geom_line(color = "#00AFBB", size = 1) +
ggtitle("Accident Frequency by Month-Year ") +
xlab("Date") + ylab("Count") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size =8))
#Load Libraries
library(tidytext)
#Split Variable into words
desc <- strsplit(raw_data$Description, split = " ")
desc <- unlist(desc)
#Remove punctuations
desc <- gsub("[[:punct:][:blank:]]+", "", desc)
#convert text to lowercase
desc <- tolower(desc)
#Remove stop words
desc_df <- data.frame(word = desc)
mb <- anti_join(desc_df, stop_words, by = 'word' )
#Get Word Frequencies
mb %>% count(word, sort = T)
word | n |
---|---|
<chr> | <int> |
causing | 166 |
hand | 166 |
left | 160 |
employee | 154 |
143 | |
operator | 122 |
injury | 105 |
time | 101 |
activity | 94 |
moment | 79 |
equipment | 77 |
accident | 73 |
level | 71 |
collaborator | 68 |
finger | 68 |
pipe | 67 |
floor | 65 |
assistant | 62 |
worker | 60 |
mesh | 58 |
support | 58 |
rock | 55 |
safety | 53 |
approximately | 48 |
meters | 48 |
height | 47 |
metal | 42 |
team | 42 |
injured | 41 |
kg | 41 |
... | ... |
wila | 1 |
wilber | 1 |
wilder | 1 |
wilmer | 1 |
winche | 1 |
winemaker | 1 |
winery | 1 |
wires | 1 |
withdrawing | 1 |
withdrew | 1 |
woman | 1 |
workermechanic | 1 |
workplace | 1 |
wounding | 1 |
x3 | 1 |
x32 | 1 |
x40cm | 1 |
x6 | 1 |
xray | 1 |
xrd | 1 |
xxx | 1 |
yaranga | 1 |
ydrs | 1 |
yolk | 1 |
z014 | 1 |
z132 | 1 |
z332 | 1 |
zaf | 1 |
zamac | 1 |
zinco | 1 |
#Get +ve words
positive <- get_sentiments("bing") %>%
filter(sentiment == "positive")
semi_join(mb, positive, by = "word") %>%
count(word, sort = TRUE)
word | n |
---|---|
<chr> | <int> |
support | 58 |
top | 16 |
master | 12 |
hot | 8 |
clean | 7 |
protection | 7 |
striking | 7 |
positive | 6 |
protective | 5 |
divine | 4 |
free | 4 |
recovery | 4 |
supported | 4 |
supporting | 4 |
supports | 4 |
correct | 3 |
fine | 3 |
safe | 3 |
boom | 2 |
excited | 2 |
fans | 2 |
solid | 2 |
strong | 2 |
superior | 2 |
comfort | 1 |
correctly | 1 |
effective | 1 |
energize | 1 |
facilitate | 1 |
favor | 1 |
flexible | 1 |
freed | 1 |
improve | 1 |
leads | 1 |
led | 1 |
portable | 1 |
progress | 1 |
progressive | 1 |
promptly | 1 |
properly | 1 |
rapid | 1 |
remedy | 1 |
sensation | 1 |
soft | 1 |
stability | 1 |
success | 1 |
thinner | 1 |
timely | 1 |
#Get -ve words
negative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
semi_join(mb, negative, by = "word") %>%
count(word, sort = T)
word | n |
---|---|
<chr> | <int> |
injury | 105 |
falls | 31 |
fall | 25 |
falling | 19 |
allergic | 16 |
wound | 16 |
fell | 14 |
pain | 14 |
sting | 14 |
tank | 14 |
burn | 13 |
grating | 13 |
struck | 11 |
suffered | 11 |
swelling | 11 |
split | 10 |
superficial | 10 |
noise | 9 |
untimely | 9 |
broken | 8 |
discomfort | 8 |
suffering | 8 |
blow | 7 |
gutter | 7 |
irritation | 7 |
suffers | 7 |
breaks | 6 |
emergency | 6 |
loose | 6 |
loses | 6 |
... | ... |
neglected | 1 |
object | 1 |
obstruct | 1 |
obstructing | 1 |
obstruction | 1 |
pig | 1 |
pique | 1 |
resistance | 1 |
retreat | 1 |
risk | 1 |
scare | 1 |
shake | 1 |
slaughter | 1 |
slow | 1 |
steep | 1 |
strained | 1 |
stumbles | 1 |
stun | 1 |
subjection | 1 |
symptoms | 1 |
taut | 1 |
trapped | 1 |
traumatic | 1 |
twisted | 1 |
uneven | 1 |
unstable | 1 |
violent | 1 |
violently | 1 |
wedge | 1 |
weed | 1 |
#Alternative methods for getting +ve & -ve sentiments
bing <- get_sentiments("bing")
my_sentiment <- mb %>%
inner_join(bing) %>%
count(word, sentiment, sort = T)
Joining, by = "word"
#Word CLoud Visualization
library(ggwordcloud)
my_sentiment %>%
filter(n>2) %>%
ggplot(aes(size = n, label = word, color = sentiment)) +
geom_text_wordcloud() +
scale_size_area(max_size = 20) +
ggtitle("Contribution to Sentiment")