Skip to content

Commit e97adfb

Browse files
authored
feat!: data_warehouse migrating to TheLook Ecommerce dataset (#257)
1 parent 096ca4e commit e97adfb

20 files changed

+976
-450
lines changed

modules/data_warehouse/README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ The resources/services/activations/deletions that this module will create/trigge
1212
- Creates a BigQuery Dataset
1313
- Creates a BigQuery Table
1414
- Creates a Google Cloud Storage bucket
15-
- Loads the Google Cloud Storage bucket with data from https://console.cloud.google.com/marketplace/product/city-of-new-york/nyc-tlc-trips
15+
- Loads the Google Cloud Storage bucket with data from [TheLook eCommerce Public Dataset](https://console.cloud.google.com/marketplace/product/bigquery-public-data/thelook-ecommerce)
1616
- Provides SQL examples
1717
- Creates and inferences with a BigQuery ML model
1818
- Creates a Looker Studio report
@@ -47,7 +47,7 @@ Functional examples are included in the
4747
|------|-------------|
4848
| bigquery\_editor\_url | The URL to launch the BigQuery editor with the sample query procedure opened |
4949
| ds\_friendly\_name | Dataset name |
50-
| lookerstudio\_report\_url | The URL to create a new Looker Studio report displays a sample dashboard for the taxi data analysis |
50+
| lookerstudio\_report\_url | The URL to create a new Looker Studio report that displays a sample dashboard for the e-commerce data analysis |
5151
| neos\_tutorial\_url | The URL to launch the in-console tutorial for the EDW solution |
5252
| raw\_bucket | Raw bucket name |
5353

modules/data_warehouse/assets/data-warehouse-architecture.svg

+2-2
Loading

modules/data_warehouse/bigquery.tf

+124-33
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@
1818
# # Create the BigQuery dataset
1919
resource "google_bigquery_dataset" "ds_edw" {
2020
project = module.project-services.project_id
21-
dataset_id = "ds_edw"
21+
dataset_id = "thelook"
2222
friendly_name = "My EDW Dataset"
2323
description = "My EDW Dataset with tables"
2424
location = var.region
2525
labels = var.labels
2626
delete_contents_on_destroy = var.force_destroy
27+
28+
depends_on = [time_sleep.wait_after_apis]
2729
}
2830

2931
# # Create a BigQuery connection
@@ -33,6 +35,7 @@ resource "google_bigquery_connection" "ds_connection" {
3335
location = var.region
3436
friendly_name = "Storage Bucket Connection"
3537
cloud_resource {}
38+
depends_on = [time_sleep.wait_after_apis]
3639
}
3740

3841
# # Grant IAM access to the BigQuery Connection account for Cloud Storage
@@ -42,64 +45,146 @@ resource "google_storage_bucket_iam_binding" "bq_connection_iam_object_viewer" {
4245
members = [
4346
"serviceAccount:${google_bigquery_connection.ds_connection.cloud_resource[0].service_account_id}",
4447
]
48+
}
4549

46-
depends_on = [
47-
google_bigquery_connection.ds_connection,
48-
]
50+
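# # Create a Biglake table for events with metadata caching (NOTE: no metadata_cache_mode / max_staleness is set in this resource - confirm whether caching is actually intended)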
51+
resource "google_bigquery_table" "tbl_edw_events" {
52+
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
53+
table_id = "events"
54+
project = module.project-services.project_id
55+
deletion_protection = var.deletion_protection
56+
57+
schema = file("${path.module}/src/schema/events_schema.json")
58+
59+
external_data_configuration {
60+
autodetect = true
61+
connection_id = google_bigquery_connection.ds_connection.name
62+
source_format = "PARQUET"
63+
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/events.parquet"]
64+
}
65+
66+
labels = var.labels
4967
}
5068

51-
# # Create a BigQuery external table
52-
resource "google_bigquery_table" "tbl_edw_taxi" {
69+
# # Create a Biglake table for inventory_items
70+
resource "google_bigquery_table" "tbl_edw_inventory_items" {
5371
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
54-
table_id = "taxi_trips"
72+
table_id = "inventory_items"
5573
project = module.project-services.project_id
5674
deletion_protection = var.deletion_protection
5775

76+
schema = file("${path.module}/src/schema/inventory_items_schema.json")
77+
5878
external_data_configuration {
5979
autodetect = true
60-
connection_id = "${module.project-services.project_id}.${var.region}.ds_connection"
80+
connection_id = google_bigquery_connection.ds_connection.name
6181
source_format = "PARQUET"
62-
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/new-york-taxi-trips/tlc-yellow-trips-2022/taxi-*.Parquet"]
82+
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/inventory_items.parquet"]
83+
}
84+
85+
labels = var.labels
86+
}
87+
88+
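# # Create a Biglake table with metadata caching for order_items (NOTE: no metadata_cache_mode / max_staleness is set in this resource - confirm whether caching is actually intended)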
89+
resource "google_bigquery_table" "tbl_edw_order_items" {
90+
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
91+
table_id = "order_items"
92+
project = module.project-services.project_id
93+
deletion_protection = var.deletion_protection
6394

95+
schema = file("${path.module}/src/schema/order_items_schema.json")
96+
97+
external_data_configuration {
98+
autodetect = true
99+
connection_id = google_bigquery_connection.ds_connection.name
100+
source_format = "PARQUET"
101+
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/order_items.parquet"]
64102
}
65103

66-
schema = file("${path.module}/src/taxi_trips_schema.json")
104+
labels = var.labels
105+
}
106+
107+
# # Create a Biglake table for orders
108+
resource "google_bigquery_table" "tbl_edw_orders" {
109+
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
110+
table_id = "orders"
111+
project = module.project-services.project_id
112+
deletion_protection = var.deletion_protection
113+
114+
schema = file("${path.module}/src/schema/orders_schema.json")
115+
116+
external_data_configuration {
117+
autodetect = true
118+
connection_id = google_bigquery_connection.ds_connection.name
119+
source_format = "PARQUET"
120+
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/orders.parquet"]
121+
}
67122

68123
labels = var.labels
124+
}
69125

70-
depends_on = [
71-
google_bigquery_connection.ds_connection,
72-
google_storage_bucket.raw_bucket,
73-
]
126+
# # Create a Biglake table for products
127+
resource "google_bigquery_table" "tbl_edw_products" {
128+
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
129+
table_id = "products"
130+
project = module.project-services.project_id
131+
deletion_protection = var.deletion_protection
132+
133+
schema = file("${path.module}/src/schema/products_schema.json")
134+
135+
external_data_configuration {
136+
autodetect = true
137+
connection_id = google_bigquery_connection.ds_connection.name
138+
source_format = "PARQUET"
139+
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/products.parquet"]
140+
}
141+
142+
labels = var.labels
143+
}
144+
145+
# # Create a Biglake table for users
146+
resource "google_bigquery_table" "tbl_edw_users" {
147+
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
148+
table_id = "users"
149+
project = module.project-services.project_id
150+
deletion_protection = var.deletion_protection
151+
152+
schema = file("${path.module}/src/schema/users_schema.json")
153+
154+
external_data_configuration {
155+
autodetect = true
156+
connection_id = google_bigquery_connection.ds_connection.name
157+
source_format = "PARQUET"
158+
source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/users.parquet"]
159+
}
160+
161+
labels = var.labels
74162
}
75163

76164
# Load Queries for Stored Procedure Execution
77-
# # Load Lookup Data Tables
165+
# # Load Distribution Center Lookup Data Tables
78166
resource "google_bigquery_routine" "sp_provision_lookup_tables" {
79167
project = module.project-services.project_id
80168
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
81169
routine_id = "sp_provision_lookup_tables"
82170
routine_type = "PROCEDURE"
83171
language = "SQL"
84-
definition_body = templatefile("${path.module}/src/sql/sp_provision_lookup_tables.sql", { project_id = module.project-services.project_id })
85-
86-
depends_on = [
87-
google_bigquery_dataset.ds_edw,
88-
]
172+
definition_body = templatefile("${path.module}/src/sql/sp_provision_lookup_tables.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })
89173
}
90174

91-
92-
# # Add Looker Studio Data Report Procedure
93-
resource "google_bigquery_routine" "sproc_sp_demo_datastudio_report" {
175+
# Add Looker Studio Data Report Procedure
176+
resource "google_bigquery_routine" "sproc_sp_demo_lookerstudio_report" {
94177
project = module.project-services.project_id
95178
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
96179
routine_id = "sp_lookerstudio_report"
97180
routine_type = "PROCEDURE"
98181
language = "SQL"
99-
definition_body = templatefile("${path.module}/src/sql/sp_lookerstudio_report.sql", { project_id = module.project-services.project_id })
182+
definition_body = templatefile("${path.module}/src/sql/sp_lookerstudio_report.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })
100183

101184
depends_on = [
102-
google_bigquery_table.tbl_edw_taxi,
185+
google_bigquery_table.tbl_edw_inventory_items,
186+
google_bigquery_table.tbl_edw_order_items,
187+
google_bigquery_routine.sp_provision_lookup_tables,
103188
]
104189
}
105190

@@ -110,24 +195,26 @@ resource "google_bigquery_routine" "sp_sample_queries" {
110195
routine_id = "sp_sample_queries"
111196
routine_type = "PROCEDURE"
112197
language = "SQL"
113-
definition_body = templatefile("${path.module}/src/sql/sp_sample_queries.sql", { project_id = module.project-services.project_id })
198+
definition_body = templatefile("${path.module}/src/sql/sp_sample_queries.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })
114199

115200
depends_on = [
116-
google_bigquery_table.tbl_edw_taxi,
201+
google_bigquery_table.tbl_edw_inventory_items,
202+
google_bigquery_table.tbl_edw_order_items,
117203
]
118204
}
119205

120-
# # Add Bigquery ML Model
206+
207+
# Add Bigquery ML Model
121208
resource "google_bigquery_routine" "sp_bigqueryml_model" {
122209
project = module.project-services.project_id
123210
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
124211
routine_id = "sp_bigqueryml_model"
125212
routine_type = "PROCEDURE"
126213
language = "SQL"
127-
definition_body = templatefile("${path.module}/src/sql/sp_bigqueryml_model.sql", { project_id = module.project-services.project_id })
214+
definition_body = templatefile("${path.module}/src/sql/sp_bigqueryml_model.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })
128215

129216
depends_on = [
130-
google_bigquery_table.tbl_edw_taxi,
217+
google_bigquery_table.tbl_edw_order_items,
131218
]
132219
}
133220

@@ -138,10 +225,10 @@ resource "google_bigquery_routine" "sp_sample_translation_queries" {
138225
routine_id = "sp_sample_translation_queries"
139226
routine_type = "PROCEDURE"
140227
language = "SQL"
141-
definition_body = templatefile("${path.module}/src/sql/sp_sample_translation_queries.sql", { project_id = module.project-services.project_id })
228+
definition_body = templatefile("${path.module}/src/sql/sp_sample_translation_queries.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })
142229

143230
depends_on = [
144-
google_bigquery_table.tbl_edw_taxi,
231+
google_bigquery_table.tbl_edw_inventory_items,
145232
]
146233
}
147234

@@ -151,6 +238,8 @@ resource "google_project_service_identity" "bigquery_data_transfer_sa" {
151238
provider = google-beta
152239
project = module.project-services.project_id
153240
service = "bigquerydatatransfer.googleapis.com"
241+
242+
depends_on = [time_sleep.wait_after_apis]
154243
}
155244

156245
# # Grant the DTS service account access
@@ -162,6 +251,8 @@ resource "google_project_iam_member" "dts_service_account_roles" {
162251
project = module.project-services.project_id
163252
role = each.key
164253
member = "serviceAccount:${google_project_service_identity.bigquery_data_transfer_sa.email}"
254+
255+
depends_on = [time_sleep.wait_after_apis]
165256
}
166257

167258
# Create specific service account for DTS Run
@@ -206,7 +297,7 @@ resource "google_bigquery_data_transfer_config" "dts_config" {
206297
data_source_id = "scheduled_query"
207298
schedule = "every day 00:00"
208299
params = {
209-
query = "CALL `${module.project-services.project_id}.ds_edw.sp_bigqueryml_model`()"
300+
query = "CALL `${module.project-services.project_id}.${google_bigquery_dataset.ds_edw.dataset_id}.sp_bigqueryml_model`()"
210301
}
211302
service_account_name = google_service_account.dts.email
212303

0 commit comments

Comments
 (0)