# bible.R
# devtools::install_github("JohnCoene/sacred") # nolint
pacman::p_load(magrittr, ggwordcloud, tm, sacred, RColorBrewer, tidyverse, tidytext) # nolint
httpgd::hgd()        # start the httpgd graphics device (handy in VS Code)
httpgd::hgd_browse() # open the plot viewer in the browser
# https://semba-blog.netlify.app/11/06/2019/looking-at-the-bible-through-wordcloud-in-r/ # nolint
scripture <- sacred::king_james_version
bible <- scripture %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup()
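## Optional sanity check (a sketch, not part of the original recipe): if the
## sacred KJV text has no "CHAPTER n" header lines, the regex above never
## matches and every chapter count below stays at zero.
bible %>%
  dplyr::group_by(book) %>%
  dplyr::summarise(chapters = max(chapter), .groups = "drop") %>%
  head()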
tidy_books <- bible %>%
  tidytext::unnest_tokens(word, text)
## Notice the data format ##
tidy_books
## Count Up Most Common Words ##
tidy_books %>%
  dplyr::count(word, sort = TRUE)
## Anti-join with the stop word data frame ##
data("middle_english_stopwords")
middle_english_stopwords[1:5, 1]
tidy_books %>%
  dplyr::anti_join(middle_english_stopwords, by = "word") %>%
  dplyr::count(word, sort = TRUE)
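## Optional sketch: combine tidytext's stop_words with the Middle English list
## for a stricter filter. This assumes only that both data frames have a
## `word` column, which the anti-joins above and below already rely on.
data("stop_words")
combined_stops <- dplyr::bind_rows(dplyr::select(stop_words, word),
                                   dplyr::select(middle_english_stopwords, word))
tidy_books %>%
  dplyr::anti_join(combined_stops, by = "word") %>%
  dplyr::count(word, sort = TRUE)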
#### Word cloud ####
# Matthew 13 (the kingdom-of-heaven parables); the sacred datasets store the
# chapter number in a column named "psalm".
heaven <- scripture %>%
  filter(book == "mat", psalm == 13) %>%
  select(text)
heaven_corpus <- heaven %>%
  tm::VectorSource() %>%
  tm::VCorpus()
# Replace separator characters with a space so words do not run together
to_space <- content_transformer(function(x, pattern) gsub(pattern, " ", x)) # nolint
heaven_corpus <- heaven_corpus %>%
  tm_map(to_space, "/") %>%
  tm_map(to_space, "@") %>%
  tm_map(to_space, "\\|")
heaven_corpus <- heaven_corpus %>%
  tm_map(content_transformer(tolower)) %>% # convert the text to lower case
  tm_map(removeNumbers) %>% # remove numbers
  tm_map(removeWords, stopwords("english")) %>% # remove common English stopwords # nolint
  tm_map(removeWords, c("ye", "o", "unto", "yet", "thee", "wherein", "neither", "shall", # nolint
                        "saith", "host", "will", "offer", "say")) %>% # drop archaic/filler words (lower case, since the text is already lowered) # nolint
  tm_map(removePunctuation) %>% # remove punctuation
  tm_map(stripWhitespace) # collapse runs of whitespace
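## Optional sketch: peek at the cleaned text before building the
## term-document matrix, to confirm the transformations did what we expect.
tm::inspect(heaven_corpus[1])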
heaven_corpus_tb <- heaven_corpus %>%
  tm::TermDocumentMatrix(control = list(removeNumbers = TRUE,
                                        stopwords = TRUE,
                                        stemming = TRUE)) %>%
  as.matrix() %>%
  as.data.frame() %>%
  tibble::rownames_to_column() %>%
  dplyr::rename(word = 1, freq = 2) %>%
  dplyr::arrange(desc(freq))
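## Optional sketch: a quick bar chart of the top stemmed terms as a sanity
## check on the frequencies before committing to a word cloud layout.
heaven_corpus_tb %>%
  dplyr::slice_max(freq, n = 15) %>%
  ggplot(aes(x = reorder(word, freq), y = freq)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Frequency")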
ggplot(data = heaven_corpus_tb,
       aes(label = word, size = freq, col = as.character(freq))) +
  geom_text_wordcloud(rm_outside = TRUE, max_steps = 1,
                      grid_size = 1, eccentricity = .9,
                      shape = "triangle-upright") +
  scale_size_area(max_size = 12) +
  scale_color_brewer(palette = "Paired", direction = -1) +
  theme_void()
ggsave(file = "original_word_cloud.png", width = 15, height = 6)
heaven_corpus_tb <- heaven_corpus_tb %>%
  dplyr::mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(0.6, 0.4))) # nolint
## Rotated-word version of the cloud ##
ggplot(data = heaven_corpus_tb,
       aes(label = word, size = freq, angle = angle, col = as.character(freq))) + # nolint
  geom_text_wordcloud(rm_outside = TRUE, max_steps = 1,
                      grid_size = 1, eccentricity = .9,
                      shape = "triangle-upright") +
  scale_size_area(max_size = 12) +
  scale_color_brewer(palette = "Paired", direction = -1) +
  theme_void()
ggsave(file = "word_cloud.png", width = 15, height = 6)
# Reference example (credit: Shelby Taylor, 4030 class)
# library(tidyverse)
# library(janeaustenr)
# library(tidytext)
# ## Don't worry about this part too much right now; it just puts the text in a tidy format ## # nolint
# original_books <- austen_books() %>%
#   group_by(book) %>%
#   mutate(linenumber = row_number(),
#          chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
#                                                  ignore_case = TRUE)))) %>%
#   ungroup()
# tidy_books <- original_books %>%
#   tidytext::unnest_tokens(word, text)
# ## Notice the data format ##
# tidy_books
# ## Count up the most common words ##
# tidy_books %>%
#   dplyr::count(word, sort = TRUE)
# ## Anti-join with the stop_words data frame ##
# data("stop_words")
# stop_words[1:5, 1]
# tidy_books %>%
#   dplyr::anti_join(stop_words, by = "word") %>%
#   dplyr::count(word, sort = TRUE)