-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraping.R
92 lines (53 loc) · 1.87 KB
/
scraping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
## ---- scraping.R
library(tidyverse)
library(rvest)
# Run url_list.R inside knitr's global environment (presumably this is where
# `url_list`, used further down, gets defined -- confirm against url_list.R).
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
source("url_list.R", echo = FALSE, local = knitr::knit_global())
df_name_correction <- function(df, url) {
  # Stores `df` in the global environment under a name derived from `url`.
  #
  # Args:
  #   df:  the object to store (here: a scraped data frame).
  #   url: the page address the data came from,
  #        e.g. "https://kworb.net/spotify/country/us_daily.html".
  #
  # The name is built by dropping the "https://kworb.net" prefix and the
  # ".html" extension, turning every "/" into "_" and removing one leading
  # "_", so the example above becomes `spotify_country_us_daily`.
  #
  # Returns the value of assign(); called for its side effect.

  # fixed = TRUE: match the text literally. As a regex, the "." in ".html"
  # matches any character and would also eat e.g. the "xhtml" in "xhtml_top".
  df_name <- gsub("https://kworb.net", "", url, fixed = TRUE)
  df_name <- gsub(".html", "", df_name, fixed = TRUE)
  df_name <- gsub("/", "_", df_name, fixed = TRUE)
  df_name <- sub("^_", "", df_name)
  assign(df_name, df, envir = .GlobalEnv)
}
auto_scraping <- function(url) {
  # Automated web scraping of a page that holds tabular data.
  #
  # Args:
  #   url: address of the page to scrape (passed straight to read_html()).
  #
  # Returns:
  #   A data frame of character columns. Column names are the texts of the
  #   page's "th" elements; the texts of the "td" elements are laid out row
  #   by row, length(columns) cells per row. Trailing cells that do not fill
  #   a complete row are dropped. A page without "td" cells gives a zero-row
  #   data frame; a page without "th" cells gives an empty data frame and a
  #   warning.

  # Download and parse the page once, then query it for both element types
  page <- read_html(url)
  ## column header class: "th"
  columns <- html_text2(html_elements(page, "th"))
  ## data row class: "td"
  cells <- html_text2(html_elements(page, "td"))

  n_cols <- length(columns)
  if (n_cols == 0L) {
    # Without headers the row width is unknown, so nothing can be reshaped
    warning(
      "No 'th' header cells found at ", url,
      "; returning an empty data frame",
      call. = FALSE
    )
    return(data.frame())
  }

  ## Converting scraped data into a data frame
  # Integer division keeps only complete rows; seq_len() copes with zero rows
  n_rows <- length(cells) %/% n_cols
  cell_matrix <- matrix(
    cells[seq_len(n_rows * n_cols)],
    nrow = n_rows,
    ncol = n_cols,
    byrow = TRUE
  )
  rows_df <- as.data.frame(cell_matrix, stringsAsFactors = FALSE)
  colnames(rows_df) <- columns
  rows_df
}
# Iterating over the list of urls to create all of the needed data frames
# from scraped data. Each page is scraped and the result is stored in the
# global environment under a name derived from its url.
# NOTE(review): `url_list` is not defined in this file; presumably it comes
# from url_list.R -- confirm.
# seq_along() skips the loop when url_list is empty (1:0 would run twice),
# and [[i]] hands over a single url for both character vectors and lists.
for (i in seq_along(url_list)) {
  data <- auto_scraping(url_list[[i]])
  df_name_correction(data, url_list[[i]])
}
# Spotify most streamed songs from Wikipedia
# NOTE(review): `page` is not defined in the visible part of this file;
# presumably it is the parsed Wikipedia page created elsewhere -- confirm.
# html_elements() is used instead of the superseded html_nodes() to match the
# rest of this file.
tables <- html_elements(page, ".wikitable")
if (length(tables) == 0) {
  stop("No '.wikitable' element found in `page`", call. = FALSE)
}
# The first table with class "wikitable" is taken as the songs table
most_streamed_songs_table <- tables[[1]]
# The deprecated `fill` argument is omitted: html_table() always fills
# missing cells with NA
spotify_most_streamed_songs <- html_table(most_streamed_songs_table)