-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRscript2_Meetu.R
52 lines (38 loc) · 1.45 KB
/
Rscript2_Meetu.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Text analysis using tidytext and dplyr in R using JEOPARDY_CSV.csv dataset
# Install the following packages and load the libraries:
# tidytext
# dplyr
# ggplot2
# Read the JEOPARDY_CSV.csv data, making sure to set stringsAsFactors = FALSE
# This dataset contains the Answer column which we will mine
library(tidytext)
library(dplyr)
library(ggplot2)
library(data.table)
# Word-frequency analysis of the Jeopardy "Answer" column: read the CSV,
# tokenize the answers, drop stop words, count the remaining words, and plot
# the most frequent ones as a horizontal bar chart.

# ---- Load data ----
# Location of the Jeopardy CSV. The path is absolute, so the script neither
# depends on nor changes the working directory (no setwd()); edit this one
# line to run the analysis against a copy stored elsewhere.
data_path <- "C:/Meetu/Fall2018/BigData/Project/JEOPARDY_CSV.csv"

# read the data
jeoData <- read.csv(data_path, stringsAsFactors = FALSE)
# Explicit print() so the value is shown under source() as well as Rscript.
print(colnames(jeoData))

# count number of rows
ncount <- nrow(jeoData)
print(ncount)

# ---- Build the text table ----
# Extract only the Answer column. `[[` matches the name exactly, whereas `$`
# partial-matches and returns NULL without complaint when nothing matches,
# so check for the column up front and fail with a clear message.
# NOTE(review): read.csv() rewrites headers that are not syntactic names
# (e.g. " Answer" becomes "X.Answer") -- confirm the header of the CSV in use.
if (!"Answer" %in% colnames(jeoData)) {
  stop("Column 'Answer' not found in ", data_path, call. = FALSE)
}
answerData <- jeoData[["Answer"]]

# One row per answer, keyed by its row number. tibble() replaces the
# deprecated data_frame(); seq_len() gives an empty index for a 0-row file,
# where 1:ncount would give c(1, 0).
text_df <- tibble(line = seq_len(ncount), text = answerData)
print(head(text_df))

# ---- Tokenize and count ----
# tokenize with standard tokenization using unnest_tokens from tidytext
token_data <- unnest_tokens(text_df, word, text)

# remove stop-words using anti_join from dplyr; stop_words comes from the
# tidytext package. The join key is spelled out instead of being guessed
# from the shared column names.
token_data <- anti_join(token_data, stop_words, by = "word")

# use the count() function of dplyr to view most common words
wordcount <- count(token_data, word, sort = TRUE)

# keep only words that occur more than `min_count` times
min_count <- 500
wordcountfiltered <- filter(wordcount, n > min_count)

# ---- Visualize ----
# Bars are ordered by frequency; coord_flip() puts the words on the vertical
# axis. geom_col() is the direct form of geom_bar(stat = "identity").
word_plot <- ggplot(wordcountfiltered, aes(reorder(word, n), n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
# A ggplot object is only drawn when printed; print explicitly so the chart
# also appears when the script is run through source().
print(word_plot)