1
1
use crate :: core;
2
2
use chrono:: { NaiveDate , TimeZone , Utc } ;
3
3
use chrono_tz:: Europe :: London ;
4
+ use regex:: Regex ;
4
5
use scraper:: { ElementRef , Html , Selector } ;
6
+ use std:: cmp:: min;
5
7
6
8
// Scrapes concerts from the BBC Proms website
7
9
pub async fn scrape ( url : & str , client : & reqwest:: Client ) -> Vec < core:: Concert > {
10
+ println ! ( "----------------------------------------" ) ;
11
+ println ! ( "Scraping BBC Proms from URL: {}" , url) ;
12
+ println ! ( "----------------------------------------" ) ;
13
+
8
14
let html: String = client
9
15
. get ( url)
10
16
// it 500's with default user-agent
@@ -47,6 +53,8 @@ struct PromsConcertMetadata {
47
53
url : String ,
48
54
pieces : Vec < core:: Piece > ,
49
55
performers : Vec < core:: Performer > ,
56
+ min_price : Option < u32 > ,
57
+ max_price : Option < u32 > ,
50
58
}
51
59
52
60
/// Scrapes a single date's worth of concerts from the BBC Proms website
@@ -62,7 +70,6 @@ async fn scrape_one_date(date_fragment: ElementRef<'_>) -> (NaiveDate, Vec<Proms
62
70
. trim ( ) ;
63
71
// BBC's website reports dates as e.g. "Fri 23 Aug 2024"
64
72
let date = NaiveDate :: parse_from_str ( date_str, "%a %e %b %Y" ) . unwrap ( ) ;
65
- println ! ( "parsed {date_str} into date: {:?}" , date) ;
66
73
67
74
// Get the concerts themselves
68
75
let mut intermediate_concerts: Vec < PromsConcertMetadata > = vec ! [ ] ;
@@ -113,7 +120,31 @@ fn parse_single_concert(elem: ElementRef<'_>) -> PromsConcertMetadata {
113
120
. map ( |performer_elem| parse_performer ( performer_elem) )
114
121
. collect ( ) ;
115
122
116
- let concert = PromsConcertMetadata {
123
+ let price_selector =
124
+ Selector :: parse ( "div.ev-event-calendar__ticket-link-subtitle--desktop" ) . unwrap ( ) ;
125
+ let price_text = elem
126
+ . select ( & price_selector)
127
+ . next ( )
128
+ . unwrap ( )
129
+ . text ( )
130
+ . next ( )
131
+ . unwrap ( )
132
+ . trim ( ) ;
133
+ // Using a regex is hacky, but it works fine for now: the website's price text is very
134
+ // inconsistent and hard to parse in any more structured way.
135
+ let price_re = Regex :: new ( r"£(\d+)" ) . unwrap ( ) ;
136
+ let prices: Vec < u32 > = price_re
137
+ . captures_iter ( price_text)
138
+ . map ( |cap| cap. get ( 1 ) . unwrap ( ) . as_str ( ) . parse ( ) . unwrap ( ) )
139
+ . collect ( ) ;
140
+ let ( min_price, max_price) = match prices[ ..] {
141
+ [ ] => ( None , None ) ,
142
+ [ price] => ( Some ( price * 100 ) , Some ( price * 100 ) ) ,
143
+ [ min_price, max_price] => ( Some ( min_price * 100 ) , Some ( max_price * 100 ) ) ,
144
+ _ => panic ! ( "couldn't parse prices from {:?}" , price_text) ,
145
+ } ;
146
+
147
+ PromsConcertMetadata {
117
148
london_time : parsed_time,
118
149
title : elem
119
150
. select ( & Selector :: parse ( "div.ev-event-calendar__name" ) . unwrap ( ) )
@@ -149,11 +180,9 @@ fn parse_single_concert(elem: ElementRef<'_>) -> PromsConcertMetadata {
149
180
. to_string ( ) ,
150
181
pieces,
151
182
performers,
152
- } ;
153
-
154
- println ! ( "found concert: {:?}" , concert) ;
155
-
156
- concert
183
+ min_price,
184
+ max_price,
185
+ }
157
186
}
158
187
159
188
/// Combines the date and the concert metadata to form a full core::Concert
@@ -163,7 +192,11 @@ fn make_full_concert(date: NaiveDate, metadata: PromsConcertMetadata) -> core::C
163
192
. unwrap ( ) ;
164
193
let tz_datetime = London . from_local_datetime ( & naive_datetime) . unwrap ( ) ;
165
194
166
- core:: Concert {
195
+ let is_rah_prom = metadata. venue == "Royal Albert Hall"
196
+ && ( metadata. title . starts_with ( "Prom" ) || metadata. title . starts_with ( "First Night" ) ) ;
197
+ let promming_price = 800 ;
198
+
199
+ let concert = core:: Concert {
167
200
datetime : tz_datetime. with_timezone ( & Utc ) ,
168
201
url : metadata. url ,
169
202
venue : metadata. venue ,
@@ -172,22 +205,34 @@ fn make_full_concert(date: NaiveDate, metadata: PromsConcertMetadata) -> core::C
172
205
pieces : metadata. pieces ,
173
206
performers : metadata. performers ,
174
207
208
+ // Proms concerts don't have subtitles or programme PDFs available
175
209
subtitle : None ,
176
210
programme_pdf_url : None ,
177
- min_price : Some ( 800 ) ,
178
- max_price : Some ( 800 ) ,
179
211
212
+ // Day promming prices aren't shown on the website so we add them in here
213
+ min_price : if is_rah_prom {
214
+ match metadata. min_price {
215
+ Some ( price) => Some ( min ( price, promming_price) ) ,
216
+ None => Some ( promming_price) ,
217
+ }
218
+ } else {
219
+ metadata. min_price
220
+ } ,
221
+ max_price : metadata. max_price ,
222
+
223
+ // By definition
180
224
is_wigmore_u35 : false ,
181
225
is_prom : true ,
182
- }
226
+ } ;
227
+
228
+ core:: report_concert ( & concert) ;
229
+ concert
183
230
}
184
231
185
232
/// Helper function to parse a piece from a concert
186
233
fn parse_piece ( piece_elem : ElementRef < ' _ > ) -> Option < core:: Piece > {
187
234
// This is kind of hacky but it works
188
235
let all_texts = piece_elem. text ( ) . collect :: < Vec < & str > > ( ) ;
189
- println ! ( "all_texts: {:?}" , all_texts) ;
190
-
191
236
match all_texts[ ..] {
192
237
[ "interval" ] => None ,
193
238
_ => Some ( core:: Piece {
@@ -212,9 +257,6 @@ fn parse_performer(performer_elem: ElementRef<'_>) -> core::Performer {
212
257
. unwrap ( )
213
258
. text ( )
214
259
. collect :: < Vec < & str > > ( ) ;
215
-
216
- println ! ( "found performer: {} ({:?})" , name, role_texts) ;
217
-
218
260
core:: Performer {
219
261
name : name. to_string ( ) ,
220
262
instrument : match & role_texts[ ..] {
0 commit comments