Skip to content

Commit 675b35c

Browse files
authored
fix: workflows and remove hardcoding
1 parent d1ecc06 commit 675b35c

17 files changed

Lines changed: 435 additions & 444 deletions

README.md

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,21 @@ The resources/services/activations/deletions that this module will create/trigge
1515
To deploy this blueprint you must have an active billing account and billing permissions.
1616

1717
## Documentation
18-
- [Hosting a Static Website](https://cloud.google.com/storage/docs/hosting-static-website)
18+
- [Create an Analytics Lakehouse](https://cloud.google.com/architecture/big-data-analytics/analytics-lakehouse)
1919

2020
## Usage
2121

2222
Basic usage of this module is as follows:
2323

2424
```hcl
25-
module "" {
26-
source = "terraform-google-modules//google"
27-
version = "~> 0.1"
25+
module "analytics_lakehouse" {
26+
source = "../.."
27+
28+
project_id = var.project_id
29+
region = "us-central1"
30+
deletion_protection = false
31+
force_destroy = true
2832
29-
project_id = "<PROJECT ID>"
30-
bucket_name = "gcs-test-bucket"
3133
}
3234
```
3335

@@ -41,8 +43,8 @@ Functional examples are included in the
4143
|------|-------------|------|---------|:--------:|
4244
| deletion\_protection | Whether or not to protect GCS resources from deletion when solution is modified or changed. | `string` | `true` | no |
4345
| enable\_apis | Whether or not to enable underlying APIs in this solution. | `string` | `true` | no |
44-
| force\_destroy | Whether or not to protect BigQuery resources from deletion when solution is modified or changed. | `string` | `true` | no |
45-
| labels | A map of labels to apply to contained resources. | `map(string)` | <pre>{<br> "edw-bigquery": true<br>}</pre> | no |
46+
| force\_destroy | Whether or not to protect BigQuery resources from deletion when solution is modified or changed. | `string` | `false` | no |
47+
| labels | A map of labels to apply to contained resources. | `map(string)` | <pre>{<br> "analytics-lakehouse": true<br>}</pre> | no |
4648
| project\_id | Google Cloud Project ID | `string` | n/a | yes |
4749
| public\_data\_bucket | Public Data bucket for access | `string` | `"data-analytics-demos"` | no |
4850
| region | Google Cloud Region | `string` | `"us-central1"` | no |
@@ -54,7 +56,6 @@ Functional examples are included in the
5456
|------|-------------|
5557
| bigquery\_editor\_url | The URL to launch the BigQuery editor |
5658
| call\_workflows\_create\_iceberg\_table | Output of the iceberg tables workflow |
57-
| call\_workflows\_create\_views\_and\_others | Output of the create view workflow |
5859
| lakehouse\_colab\_url | The URL to launch the in-console tutorial for the Analytics Lakehouse solution |
5960
| lookerstudio\_report\_url | The URL to create a new Looker Studio report that displays a sample dashboard for data analysis |
6061
| neos\_tutorial\_url | The URL to launch the in-console tutorial for the Analytics Lakehouse solution |

assets/bigquery.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838

3939
# Load data from BigQuery.
4040
events = spark.read.format("bigquery") \
41-
.option("table", "gcp_lakehouse_ds.events") \
41+
.option("table", "gcp_lakehouse_ds.gcp_tbl_events") \
4242
.load()
4343
events.createOrReplaceTempView("events")
4444

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"!pip install chart-studio\n",
10+
"!pip install google-cloud-bigquery-connection"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": null,
16+
"metadata": {},
17+
"outputs": [],
18+
"source": [
19+
"import pandas as pd\n",
20+
"import numpy as np\n",
21+
"import scipy.optimize\n",
22+
"\n",
23+
"# Import and setup for plotly in Colab\n",
24+
"import chart_studio\n",
25+
"import chart_studio.plotly as py\n",
26+
"import plotly.graph_objects as go\n",
27+
"import plotly.io as pio\n",
28+
"import plotly.express as px\n",
29+
"\n",
30+
"# Enable displaying pandas data frames as interactive tables by default\n",
31+
"from google.colab import data_table\n",
32+
"data_table.enable_dataframe_formatter()"
33+
]
34+
},
35+
{
36+
"cell_type": "code",
37+
"execution_count": null,
38+
"metadata": {},
39+
"outputs": [],
40+
"source": [
41+
"PROJECT_ID = 'change me'\n",
42+
"REGION = \"US\""
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": null,
48+
"metadata": {},
49+
"outputs": [],
50+
"source": [
51+
"from google.colab import auth\n",
52+
"auth.authenticate_user()"
53+
]
54+
},
55+
{
56+
"cell_type": "code",
57+
"execution_count": null,
58+
"metadata": {},
59+
"outputs": [],
60+
"source": [
61+
"!gcloud config set project {PROJECT_ID}\n",
62+
"!gcloud config get-value project"
63+
]
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": null,
68+
"metadata": {},
69+
"outputs": [],
70+
"source": [
71+
"%%bigquery --project {PROJECT_ID}\n",
72+
"SELECT\n",
73+
" o.order_id,\n",
74+
" o.user_id order_user_id,\n",
75+
" o.status order_status,\n",
76+
" o.created_at order_created_at,\n",
77+
" o.returned_at order_returned_at,\n",
78+
" o.shipped_at order_shipped_at,\n",
79+
" o.delivered_at order_delivered_at,\n",
80+
" o.num_of_item order_number_of_items,\n",
81+
" i.id AS order_items_id,\n",
82+
" i.product_id AS order_items_product_id,\n",
83+
" i.status order_items_status,\n",
84+
" i.sale_price order_items_sale_price,\n",
85+
" p.id AS product_id,\n",
86+
" p.cost product_cost,\n",
87+
" p.category product_category,\n",
88+
" p.name product_name,\n",
89+
" p.brand product_brand,\n",
90+
" p.retail_price product_retail_price,\n",
91+
" p.department product_department,\n",
92+
" p.sku product_sku,\n",
93+
" p.distribution_center_id,\n",
94+
" d.name AS dist_center_name,\n",
95+
" d.latitude dist_center_lat,\n",
96+
" d.longitude dist_center_long,\n",
97+
" u.id AS user_id,\n",
98+
" u.first_name user_first_name,\n",
99+
" u.last_name user_last_name,\n",
100+
" u.age user_age,\n",
101+
" u.gender user_gender,\n",
102+
" u.state user_state,\n",
103+
" u.postal_code user_postal_code,\n",
104+
" u.city user_city,\n",
105+
" u.country user_country,\n",
106+
" u.latitude user_lat,\n",
107+
" u.longitude user_long,\n",
108+
" u.traffic_source user_traffic_source\n",
109+
"FROM\n",
110+
" gcp_lakehouse_ds.gcp_tbl_orders o\n",
111+
"INNER JOIN\n",
112+
" gcp_lakehouse_ds.gcp_tbl_order_items i\n",
113+
"ON\n",
114+
" o.order_id = i.order_id\n",
115+
"INNER JOIN\n",
116+
" gcp_lakehouse_ds.gcp_tbl_products p\n",
117+
"ON\n",
118+
" i.product_id = p.id\n",
119+
"INNER JOIN\n",
120+
" gcp_lakehouse_ds.gcp_tbl_distribution_centers d\n",
121+
"ON\n",
122+
" p.distribution_center_id = d.id\n",
123+
"INNER JOIN\n",
124+
" gcp_lakehouse_ds.gcp_tbl_users u\n",
125+
"ON\n",
126+
" o.user_id = u.id\n",
127+
"limit 100"
128+
]
129+
},
130+
{
131+
"cell_type": "code",
132+
"execution_count": null,
133+
"metadata": {},
134+
"outputs": [],
135+
"source": [
136+
"%%bigquery --project {PROJECT_ID}\n",
137+
"\n",
138+
"SELECT\n",
139+
"sum(order_id) as count,\n",
140+
" date(o.created_at) date\n",
141+
"FROM\n",
142+
" gcp_lakehouse_ds.gcp_tbl_orders o\n",
143+
" group by o.created_at\n",
144+
" order by date(o.created_at)\n",
145+
" limit 500"
146+
]
147+
},
148+
{
149+
"cell_type": "code",
150+
"execution_count": null,
151+
"metadata": {},
152+
"outputs": [],
153+
"source": [
154+
"%%bigquery data --project {PROJECT_ID}\n",
155+
"\n",
156+
"SELECT\n",
157+
"sum(order_id) as count,\n",
158+
" date(o.created_at) date\n",
159+
"FROM\n",
160+
" gcp_lakehouse_ds.gcp_tbl_orders o\n",
161+
" group by o.created_at\n",
162+
" order by date(o.created_at)\n",
163+
" limit 500"
164+
]
165+
},
166+
{
167+
"cell_type": "code",
168+
"execution_count": null,
169+
"metadata": {},
170+
"outputs": [],
171+
"source": [
172+
"data['date'] = pd.to_datetime(data['date'])\n",
173+
"data['date'] = data['date'].astype(np.int64) // 10**9\n",
174+
"data.head()"
175+
]
176+
},
177+
{
178+
"cell_type": "code",
179+
"execution_count": null,
180+
"metadata": {},
181+
"outputs": [],
182+
"source": [
183+
"from datetime import datetime\n",
184+
"\n",
185+
"fig, ax = plt.subplots(figsize=(20,12))\n",
186+
"data.plot(x='date', y='count', kind='scatter', ax=ax)\n",
187+
"ax.set_xticklabels([datetime.fromtimestamp(date).strftime('%Y/%m/%d') for date in ax.get_xticks()])"
188+
]
189+
}
190+
],
191+
"metadata": {
192+
"language_info": {
193+
"name": "python"
194+
},
195+
"orig_nbformat": 4
196+
},
197+
"nbformat": 4,
198+
"nbformat_minor": 2
199+
}

assets/yaml/bucket_copy.yaml renamed to assets/yaml/initial-workflow-copy-data.yaml

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,47 +17,44 @@ main:
1717
steps:
1818
- init:
1919
assign:
20-
- source_bucket: "data-analytics-demos" # "devrel-abc-def-ghi" # "bucket-copy-fths"
21-
- dest_bucket: ${"gcp-lakehouse-edw-export-" + sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
20+
- source_bucket: "data-analytics-demos"
21+
- dest_bucket: ${raw_bucket}
2222
- copied_objects: []
2323
- list_objects:
2424
call: googleapis.storage.v1.objects.list
2525
args:
26-
bucket: ${source_bucket}
27-
# delimiter: "/"s
28-
# prefix: "*.Parquet"
26+
bucket: $${source_bucket}
27+
# prefix: "thelook"
2928
result: list_result
30-
- objects:
31-
call: sys.log
32-
args:
33-
text: ${list_result.items}
29+
- start_counter:
30+
assign:
31+
- copied_objects: 0
3432
- copy_objects:
3533
for:
3634
value: object
3735
index: i
38-
in: ${list_result.items}
36+
in: $${list_result.items}
3937
steps:
4038
- step1:
4139
try:
4240
steps:
4341
- copy_object:
4442
call: googleapis.storage.v1.objects.copy
4543
args:
46-
sourceBucket: ${source_bucket}
47-
sourceObject: ${text.url_encode(object.name)}
48-
destinationBucket: ${dest_bucket}
49-
destinationObject: ${text.url_encode(object.name)}
44+
sourceBucket: $${source_bucket}
45+
sourceObject: $${text.url_encode(object.name)}
46+
destinationBucket: $${dest_bucket}
47+
destinationObject: $${text.url_encode(object.name)}
5048
result: copy_result
5149
- save_result:
5250
assign:
53-
- copied_objects: ${list.concat(copied_objects, copy_result)}
51+
- copied_objects: $${copied_objects + 1}
5452
except:
5553
as: e
5654
raise:
57-
exception: ${e}
58-
sourceBucket: ${source_bucket}
59-
sourceObject: ${object.name}
60-
destinationBucket: ${dest_bucket}
61-
# destinationObject: ${object.name}
55+
exception: $${e}
56+
sourceBucket: $${source_bucket}
57+
sourceObject: $${object.name}
58+
destinationBucket: $${dest_bucket}
6259
- finish:
63-
return: ${copied_objects}
60+
return: $${copied_objects + " objects copied"}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Create tables
16+
- assignStepTables:
17+
assign:
18+
- results: {}
19+
- location: $${sys.get_env("GOOGLE_CLOUD_LOCATION")}
20+
- bucket: ${raw_bucket}
21+
- project_id: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
22+
- map:
23+
gcp_tbl_order_items: $${"CREATE OR REPLACE EXTERNAL TABLE `gcp_lakehouse_ds.gcp_tbl_order_items` WITH CONNECTION `"+location+".gcp_lakehouse_connection` OPTIONS(format ='Parquet', uris = ['gs://" + bucket + "/thelook_ecommerce/order_items-0*.Parquet'], max_staleness = INTERVAL 30 MINUTE, metadata_cache_mode = 'AUTOMATIC');"}
24+
gcp_tbl_orders: $${"CREATE OR REPLACE EXTERNAL TABLE `gcp_lakehouse_ds.gcp_tbl_orders` WITH CONNECTION `"+location+".gcp_lakehouse_connection` OPTIONS(format ='Parquet', uris = ['gs://" + bucket + "/thelook_ecommerce/orders-*.Parquet'], max_staleness = INTERVAL 30 MINUTE, metadata_cache_mode = 'AUTOMATIC');"}
25+
gcp_tbl_users: $${"CREATE OR REPLACE EXTERNAL TABLE `gcp_lakehouse_ds.gcp_tbl_users` WITH CONNECTION `"+location+".gcp_lakehouse_connection` OPTIONS(format ='Parquet', uris = ['gs://" + bucket + "/thelook_ecommerce/users-*.Parquet'], max_staleness = INTERVAL 30 MINUTE, metadata_cache_mode = 'AUTOMATIC');"}
26+
gcp_tbl_distribution_centers: $${"CREATE OR REPLACE EXTERNAL TABLE `gcp_lakehouse_ds.gcp_tbl_distribution_centers` WITH CONNECTION `"+location+".gcp_lakehouse_connection` OPTIONS(format ='Parquet', uris = ['gs://" + bucket + "/thelook_ecommerce/distribution_centers-*.Parquet'], max_staleness = INTERVAL 30 MINUTE, metadata_cache_mode = 'AUTOMATIC');"}
27+
gcp_tbl_inventory_items: $${"CREATE OR REPLACE EXTERNAL TABLE `gcp_lakehouse_ds.gcp_tbl_inventory_items` WITH CONNECTION `"+location+".gcp_lakehouse_connection` OPTIONS(format ='Parquet', uris = ['gs://" + bucket + "/thelook_ecommerce/inventory_items-*.Parquet'], max_staleness = INTERVAL 30 MINUTE, metadata_cache_mode = 'AUTOMATIC');"}
28+
gcp_tbl_products: $${"CREATE OR REPLACE EXTERNAL TABLE `gcp_lakehouse_ds.gcp_tbl_products` WITH CONNECTION `"+location+".gcp_lakehouse_connection` OPTIONS(format ='Parquet', uris = ['gs://" + bucket + "/thelook_ecommerce/products-0*.Parquet'], max_staleness = INTERVAL 30 MINUTE, metadata_cache_mode = 'AUTOMATIC');"}
29+
- loopStepTables:
30+
for:
31+
value: key
32+
in: $${keys(map)}
33+
steps:
34+
- runQuery:
35+
call: googleapis.bigquery.v2.jobs.query
36+
args:
37+
projectId: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
38+
body:
39+
useLegacySql: false
40+
useQueryCache: false
41+
location: $${sys.get_env("GOOGLE_CLOUD_LOCATION")}
42+
timeoutMs: 600000
43+
query: $${map[key]}
44+
result: queryResult
45+
- sumStep:
46+
assign:
47+
- results[key]: $${queryResult}
48+
# Create and Assign Views
49+
- assignStepPolicies:
50+
assign:
51+
- marketing_user: ${marketing_user}
52+
- data_analyst_user: ${data_analyst_user}
53+
- policy_map:
54+
row_policy_usa_filter: $${"CREATE OR REPLACE ROW ACCESS POLICY usa_filter ON `" + sys.get_env("GOOGLE_CLOUD_PROJECT_ID") + ".gcp_lakehouse_ds.gcp_tbl_users` GRANT TO ('serviceAccount:" + data_analyst_user + "') FILTER USING (Country = 'United States')"}
55+
row_policy_product_category_filter: $${"CREATE OR REPLACE ROW ACCESS POLICY product_category_filter ON `" + sys.get_env("GOOGLE_CLOUD_PROJECT_ID") + ".gcp_lakehouse_ds.gcp_tbl_products` GRANT TO ('serviceAccount:" + marketing_user + "') FILTER USING (Category = 'Swim' or Category = 'Active' or Category = 'Fashion Hoodies & Sweatshirts')"}
56+
create_view_ecommerce: $${"call gcp_lakehouse_ds.create_view_ecommerce()"}
57+
- loopStepPolicies:
58+
for:
59+
value: key
60+
in: $${keys(policy_map)}
61+
steps:
62+
- runQueryPolicies:
63+
call: googleapis.bigquery.v2.jobs.query
64+
args:
65+
projectId: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
66+
body:
67+
useLegacySql: false
68+
useQueryCache: false
69+
location: $${sys.get_env("GOOGLE_CLOUD_LOCATION")}
70+
timeoutMs: 600000
71+
query: $${policy_map[key]}
72+
result: queryResult
73+
- sumStepPolicies:
74+
assign:
75+
- results[key]: $${queryResult}
76+
- returnStep:
77+
return: $${results}

0 commit comments

Comments
 (0)