#Load Required Libraries
library(readr)
library(dplyr)
library(magrittr)
library(ggplot2)


#Read the accident database from its CSV export
#(add 'stringsAsFactors = TRUE' to the call if factors are wanted straight away)
raw_data <- read.csv(
  file = "C:/Users/Steve/Desktop/Portfolio/Datasets/Industrial Safety and Health Analytics Data/IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv"
)


#Show the first 6 rows of the dataset
head(raw_data, n = 6L)


#Per-column summary statistics
summary(raw_data)


#Turn the data frame into a data.table for the operations that follow
raw_data <- data.table::as.data.table(raw_data)


#Check the class of the dataset and the type of each of its variables
str(raw_data)

Classes 'data.table' and 'data.frame':	425 obs. of  11 variables:
 $ X                       : int  0 1 2 3 4 5 6 7 8 9 ...
 $ Data                    : chr  "2016-01-01 00:00:00" "2016-01-02 00:00:00" "2016-01-06 00:00:00" "2016-01-08 00:00:00" ...
 $ Countries               : chr  "Country_01" "Country_02" "Country_01" "Country_01" ...
 $ Local                   : chr  "Local_01" "Local_02" "Local_03" "Local_04" ...
 $ Industry.Sector         : chr  "Mining" "Mining" "Mining" "Mining" ...
 $ Accident.Level          : chr  "I" "I" "I" "I" ...
 $ Potential.Accident.Level: chr  "IV" "IV" "III" "I" ...
 $ Genre                   : chr  "Male" "Male" "Male" "Male" ...
 $ Employee.or.Third.Party : chr  "Third Party" "Employee" "Third Party (Remote)" "Third Party" ...
 $ Critical.Risk           : chr  "Pressed" "Pressurized Systems" "Manual Tools" "Others" ...
 $ Description             : chr  "While removing the drill rod of the Jumbo 08 for maintenance, the supervisor proceeds to loosen the support of "| __truncated__ "During the activation of a sodium sulphide pump, the piping was uncoupled and the sulfide solution was designed"| __truncated__ "In the sub-station MILPO located at level +170 when the collaborator was doing the excavation work with a pick "| __truncated__ "Being 9:45 am. approximately in the Nv. 1880 CX-695 OB7, the personnel begins the task of unlocking the Soquet "| __truncated__ ...
 - attr(*, ".internal.selfref")=<externalptr>


#Drop the X variable
raw_data <- select(raw_data, -X)


#Convert every character variable to a factor
raw_data %<>% mutate_if(is.character, as.factor)


#Description is free text rather than a category, so turn it back into
#character (as.character() takes the vector only; no second argument)
raw_data$Description <- as.character(raw_data$Description)


#Parse the Data variable as a date-time.
#strptime() returns POSIXlt, a list-based class that data.table does not
#support as a column type, so the parsed value is stored as POSIXct instead.
#as.character() is needed because the step above turned the column into a factor.
raw_data$Data <- as.POSIXct(
  strptime(as.character(raw_data$Data), "%Y-%m-%d %H:%M")
)


#Rename the Data and Genre columns to Date and Gender
library(data.table)
setnames(raw_data, c("Data", "Genre"), c("Date", "Gender"))

Attaching package: 'data.table'


The following objects are masked from 'package:dplyr':

    between, first, last


#Inspect Data for missing values (columns 2 to 9)
#vis_miss() is provided by the visdat package, which this script never
#attaches, so it is called through its namespace. When visdat is not
#installed, fall back to a plain count of missing values per column.
if (requireNamespace("visdat", quietly = TRUE)) {
  visdat::vis_miss(raw_data[, 2:9])
} else {
  colSums(is.na(raw_data[, 2:9]))
}

Error in vis_miss(raw_data[, 2:9]): could not find function "vis_miss"
Traceback:


#Relations between Country and Accident Level
#Columns are addressed by name so the table does not depend on column order
table(raw_data$Countries, raw_data$Accident.Level)

            
               I  II III  IV   V
  Country_01 180  19  21  23   8
  Country_02  99  19   7   5   0
  Country_03  37   2   3   2   0


#Relations between Industry Sector and Accident Level
#Columns are addressed by name so the table does not depend on column order
table(raw_data$Industry.Sector, raw_data$Accident.Level)

        
           I  II III  IV   V
  Metals 107  12   7   7   1
  Mining 167  26  20  21   7
  Others  42   2   4   2   0


#Aggregating Data: number of accidents per country, industry sector and gender
#A single count() call returns an ungrouped result, so later verbs applied
#to ct are not silently evaluated per group
ct <- raw_data %>%
  count(Countries, Industry.Sector, Gender)
head(ct)


#Fill colour assigned to each industry sector
ecols <- c(
  Mining = "brown2",
  Metals = "blue2",
  Others = "green4"
)

#Accident counts per accident level, stacked by sector, one panel per gender
ggplot(
  data = raw_data,
  mapping = aes(x = Accident.Level, fill = Industry.Sector)
) +
  geom_bar() +
  facet_wrap(~ Gender) +
  scale_fill_manual(values = ecols)


#Line Graph for Accident Frequencies by Date
#Make sure the Date variable is POSIXct (strptime() produces POSIXlt).
#No 'format' is given: it is only used when parsing text, not when
#converting an existing date-time object.
raw_data$Date <- as.POSIXct(raw_data$Date)


#Count the accidents per calendar month ("YYYY-MM" label)
#count(ym) returns an ungrouped data frame
short.date.agg <- raw_data %>%
  mutate(ym = format(Date, "%Y-%m")) %>%
  count(ym)

#group = 1 joins all months into one line, since ym is a discrete text label
ggplot(short.date.agg, aes(ym, n, group = 1)) +
  geom_point() +
  geom_line(color = "#00AFBB", size = 1) +
  ggtitle("Accident Frequency by Month-Year ") +
  xlab("Date") + ylab("Count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))


#Load Libraries
library(tidytext)


#Split the Description variable into words
#Splitting on any run of whitespace also separates words divided by tabs
#or line breaks, not only by a single space
desc <- unlist(strsplit(raw_data$Description, split = "[[:space:]]+"))


#Remove punctuation
desc <- gsub("[[:punct:][:blank:]]+", "", desc)


#Convert text to lowercase
desc <- tolower(desc)


#Drop empty tokens: pieces that consisted only of punctuation are now ""
#and would otherwise be counted as a word
desc <- desc[nzchar(desc)]


#Remove stop words
#stringsAsFactors = FALSE keeps 'word' as character so it can be joined to
#the stop_words lexicon regardless of the R version's default
desc_df <- data.frame(word = desc, stringsAsFactors = FALSE)
mb <- anti_join(desc_df, stop_words, by = "word")


#Get Word Frequencies
mb %>% count(word, sort = TRUE)


#Get +ve words: entries of the bing lexicon labelled "positive"
positive <- get_sentiments("bing") %>%
  filter(sentiment == "positive")

#Frequency of the description words that appear in the positive list
semi_join(mb, positive, by = "word") %>%
  count(word, sort = TRUE)


#Get -ve words: entries of the bing lexicon labelled "negative"
negative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

#Frequency of the description words that appear in the negative list
#(TRUE is spelled out: T is an ordinary variable that can be reassigned)
semi_join(mb, negative, by = "word") %>%
  count(word, sort = TRUE)


#Alternative method for getting +ve & -ve sentiments in one table
bing <- get_sentiments("bing")

#Keep only the words found in the lexicon and count them per sentiment.
#The join key is stated explicitly so the join does not depend on whichever
#column names the two tables happen to share.
my_sentiment <- mb %>%
  inner_join(bing, by = "word") %>%
  count(word, sentiment, sort = TRUE)

Joining, by = "word"


#Word cloud of the sentiment words that occur more than twice,
#sized by frequency and coloured by sentiment
library(ggwordcloud)
ggplot(
  data = filter(my_sentiment, n > 2),
  mapping = aes(label = word, size = n, color = sentiment)
) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 20) +
  ggtitle("Contribution to Sentiment")

word	n
<chr>	<int>
causing	166
hand	166
left	160
employee	154
	143
operator	122
injury	105
time	101
activity	94
moment	79
equipment	77
accident	73
level	71
collaborator	68
finger	68
pipe	67
floor	65
assistant	62
worker	60
mesh	58
support	58
rock	55
safety	53
approximately	48
meters	48
height	47
metal	42
team	42
injured	41
kg	41
...	...
wila	1
wilber	1
wilder	1
wilmer	1
winche	1
winemaker	1
winery	1
wires	1
withdrawing	1
withdrew	1
woman	1
workermechanic	1
workplace	1
wounding	1
x3	1
x32	1
x40cm	1
x6	1
xray	1
xrd	1
xxx	1
yaranga	1
ydrs	1
yolk	1
z014	1
z132	1
z332	1
zaf	1
zamac	1
zinco	1

word	n
<chr>	<int>
support	58
top	16
master	12
hot	8
clean	7
protection	7
striking	7
positive	6
protective	5
divine	4
free	4
recovery	4
supported	4
supporting	4
supports	4
correct	3
fine	3
safe	3
boom	2
excited	2
fans	2
solid	2
strong	2
superior	2
comfort	1
correctly	1
effective	1
energize	1
facilitate	1
favor	1
flexible	1
freed	1
improve	1
leads	1
led	1
portable	1
progress	1
progressive	1
promptly	1
properly	1
rapid	1
remedy	1
sensation	1
soft	1
stability	1
success	1
thinner	1
timely	1

word	n
<chr>	<int>
injury	105
falls	31
fall	25
falling	19
allergic	16
wound	16
fell	14
pain	14
sting	14
tank	14
burn	13
grating	13
struck	11
suffered	11
swelling	11
split	10
superficial	10
noise	9
untimely	9
broken	8
discomfort	8
suffering	8
blow	7
gutter	7
irritation	7
suffers	7
breaks	6
emergency	6
loose	6
loses	6
...	...
neglected	1
object	1
obstruct	1
obstructing	1
obstruction	1
pig	1
pique	1
resistance	1
retreat	1
risk	1
scare	1
shake	1
slaughter	1
slow	1
steep	1
strained	1
stumbles	1
stun	1
subjection	1
symptoms	1
taut	1
trapped	1
traumatic	1
twisted	1
uneven	1
unstable	1
violent	1
violently	1
wedge	1
weed	1

Sentiment Analysis

Introduction

Data exploration to help us understand the dataset

After inspecting the dataset, remove the variables that aren't needed and convert the remaining variables to the correct data types

Test for relations between the variables using contingency tables

Visual Exploratory Analysis

Sentiment Analysis of the Description Variable

Countries	Industry.Sector	Gender	n
<fct>	<fct>	<fct>	<int>
Country_01	Metals	Female	2
Country_01	Metals	Male	44
Country_01	Mining	Female	4
Country_01	Mining	Male	199
Country_01	Others	Female	1
Country_01	Others	Male	1