-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathAnimals.Rmd
143 lines (107 loc) · 3.59 KB
/
Animals.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
---
title: 'Tidy Tuesday: Animals Data'
author: "Lucy Njoki"
date: "5/11/2020"
output: html_document
---
```{r setup, include=FALSE}
# Make `echo = TRUE` the default for all chunks, so each chunk's source code
# is shown alongside its output in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
```
```{r package load}
# Package load ----
# NOTE: the original `rm(list = ls(all = TRUE))` has been removed. In a fresh
# knitting session it does nothing, and when the chunk is run interactively it
# silently wipes the user's workspace.
library(tidyverse)
library(ggthemes)
library(extrafont)
library(fcuk)
library(RColorBrewer)

# `device = "win"` registers fonts with the Windows graphics device, which is
# only available on Windows; skip the call on other platforms rather than
# letting the whole document fail to render there.
if (.Platform$OS.type == "windows") {
  extrafont::loadfonts(device = "win")
}

# Print the table of fonts known to extrafont.
extrafont::fonttable()
```
```{r getting data}
# Download the villagers table from the TidyTuesday 2020-05-05 release.
villagers_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/villagers.csv"
villagers <- readr::read_csv(villagers_url)

# Preview the first ten rows.
head(villagers, n = 10)
```
```{r exploring the data variables}
# Structure and column names of the raw data.
glimpse(villagers)
names(villagers)

# Number of missing values in each column. vapply() pins the result to exactly
# one integer per column, whereas sapply() lets the return type depend on the
# input.
vapply(villagers, function(x) sum(is.na(x)), integer(1))

# Convert the categorical columns to factors in place; `villagers` itself is
# modified because the cross-tabulation chunk reads it directly.
factor_variables <- c("gender", "species", "personality", "phrase")
villagers[factor_variables] <- lapply(villagers[factor_variables], factor)
str(villagers)

# Working copy of the data without the `song` column.
new_data <- villagers %>%
  select(-song)
head(new_data)

# Cross-tabulate personality (rows) against gender (columns).
table(new_data$personality, new_data$gender)
```
```{r data vizualization}
theme_set(theme_tufte())

# Distribution of personality by gender ----
# Count villagers for each gender/personality pair, then express every count
# as a percentage of its gender's total. The grouping by gender is stated
# explicitly (the original relied on summarise() leaving it behind) and is
# dropped again once the percentages are computed.
Persn <- new_data %>%
  count(gender, personality, name = "Count") %>%
  group_by(gender) %>%
  mutate(Percent = round(Count * 100 / sum(Count), 2)) %>%
  ungroup()
Persn

# Bars and labels share a single dodge width so that each label sits centred
# over its own bar (the original dodged the text by 0.5 and the bars by 0.6).
dodge <- position_dodge(width = 0.6)
axis_text_font <- element_text(
  family = "Calibri Light", size = rel(1.0), hjust = 0.5
)

P1 <- ggplot(Persn, aes(x = gender, y = Percent, fill = personality)) +
  geom_bar(stat = "identity", position = dodge, width = 0.6) +
  geom_text(
    aes(label = paste0(Percent, "%")),
    position = dodge, hjust = 0.5, vjust = -0.25,
    size = 2.3, colour = "black"
  ) +
  # The x aesthetic is gender, so the axis is labelled "Gender"; personality
  # is shown through the fill legend.
  labs(
    x = "Gender", y = "Percent",
    title = "Distribution of Personalities of the Villagers by Gender"
  ) +
  theme(
    plot.title = element_text(
      family = "Calibri Light", size = rel(1.2), hjust = 0.5
    ),
    axis.line = element_line(colour = "black", size = 0.5),
    axis.text.x = axis_text_font,
    axis.text.y = axis_text_font
  ) +
  scale_fill_brewer(palette = "Set1")
P1
```
```{r crosstabulation}
# Is there a relationship between personality and species?
# NOTE(review): xtabs() and fisher.test() come from the stats package, so
# Rcmdr and abind do not appear to be needed by this chunk. They are kept in
# case something else relies on them being attached -- confirm and remove.
library(Rcmdr)
library(abind)

# fisher.test(simulate.p.value = TRUE) estimates the p-value by Monte Carlo
# simulation; fixing the seed makes the reported value the same on every knit.
set.seed(20200505)
local({
  # A descriptive name is used because fisher.test() prints the name of its
  # argument in the "data:" line of its output.
  personality_by_species <- xtabs(~ personality + species, data = villagers)
  cat("\nFrequency table:\n")
  print(personality_by_species)
  print(fisher.test(personality_by_species, simulate.p.value = TRUE))
})
```
__There is no significant relationship between a villager's species and their personality.__
# Sentiment Analysis
How negative or positive is a comment?
```{r}
# Download the tab-separated user reviews from the TidyTuesday 2020-05-05
# release and show the first rows.
reviews_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/user_reviews.tsv"
user_reviews <- readr::read_tsv(reviews_url)
head(user_reviews)
```
```{r exploring the user-reviews data}
# Column overview of the reviews table.
glimpse(user_reviews)

# Number of reviews given at each grade.
grades <- count(user_reviews, grade, name = "Count")

# Bar chart of the review counts by grade.
ggplot(grades, aes(x = grade, y = Count)) +
  geom_col()
```
```{r}
# Read a handful of review texts from each side of the good/bad cut-off.
# The cut-off (grade > 7 is "good") matches the `rating` column derived from
# these reviews; the original filters (`> 8` and `< 8`) disagreed with it and
# never sampled a review with grade 8.

# Five randomly chosen high-grade reviews.
user_reviews %>%
  filter(grade > 7) %>%
  sample_n(5) %>%
  pull(text)

# Five randomly chosen low-grade reviews.
user_reviews %>%
  filter(grade <= 7) %>%
  sample_n(5) %>%
  pull(text)
```
```{r}
# Strip the trailing "Expand" marker from each review text and label every
# review "good" (grade above 7) or "bad" (everything else).
reviews_parsed <- user_reviews %>%
  mutate(
    text = str_remove(text, "Expand$"),
    rating = case_when(
      grade > 7 ~ "good",
      TRUE ~ "bad"
    )
  )
reviews_parsed
```
# What is the distribution of words per review?
```{r}
library(tidytext)

# Split each review into one row per word, then tally the words for each
# user name.
review_tokens <- unnest_tokens(reviews_parsed, word, text)
words_per_review <- count(review_tokens, user_name, name = "total_words")

# Histogram of the per-user word totals.
ggplot(words_per_review, aes(total_words)) +
  geom_histogram(fill = "midnightblue", alpha = 0.8)
```