From 838d14ddead8e1e115b2af7583ece0b4537edb6f Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 13 Mar 2024 06:55:34 +0000 Subject: [PATCH 1/3] test: include model.register test for BQML CMEK --- bigframes/ml/core.py | 4 ++-- bigframes/session/__init__.py | 5 +++-- tests/system/small/test_encryption.py | 18 ++++++++++++++++++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 43a882ecac..03d9b806b9 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -245,7 +245,7 @@ def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel: options={"vertex_ai_model_id": vertex_ai_model_id} ) # Register the model and wait it to finish - self._session._start_query_create_model(sql) + self._session._start_query_ml_ddl(sql) self._model = self._session.bqclient.get_model(self.model_name) return self @@ -264,7 +264,7 @@ def _create_model_ref( def _create_model_with_sql(self, session: bigframes.Session, sql: str) -> BqmlModel: # fit the model, synchronously - _, job = session._start_query_create_model(sql) + _, job = session._start_query_ml_ddl(sql) # real model path in the session specific hidden dataset and table prefix model_name_full = f"{job.destination.project}.{job.destination.dataset_id}.{job.destination.table_id}" diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 5266267a22..218531c3a7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1592,12 +1592,13 @@ def _start_query( self.bqclient, sql, job_config, max_results ) - def _start_query_create_model( + def _start_query_ml_ddl( self, sql: str, ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """ - Starts BigQuery ML CREATE MODEL query job and waits for results. + Starts BigQuery ML DDL query job (CREATE MODEL/ALTER MODEL/...) and + waits for results. """ job_config = self._prepare_query_job_config() diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 0ce9d881fd..b98c0acca2 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -254,3 +254,21 @@ def test_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): # Assert that model exists in BQ with intended encryption model_bq = session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) assert model_bq.encryption_configuration.kms_key_name == bq_cmek + + # Assert that model registration keeps the encryption + # Note that model registration only creates an entry (metadata) to be + # included in the Vertex AI Model Registry. See for more details + # https://cloud.google.com/bigquery/docs/update_vertex#add-existing. + # When use deploys the model to an endpoint from the Model Registry then + # they can specify an encryption key to further protect the artifacts at + # rest on the Vertex AI side. See for more details: + # https://cloud.google.com/vertex-ai/docs/general/deployment#deploy_a_model_to_an_endpoint, + # https://cloud.google.com/vertex-ai/docs/general/cmek#create_resources_with_the_kms_key. + # bigframes.ml does not provide any API for the model deployment. + model_registered = new_model.register() + assert ( + model_registered._bqml_model.model.encryption_configuration.kms_key_name + == bq_cmek + ) + model_bq = session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) + assert model_bq.encryption_configuration.kms_key_name == bq_cmek From ff95f523333a74cf270ff76a525880f5836a9d5a Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 13 Mar 2024 19:22:55 +0000 Subject: [PATCH 2/3] propagate _start_query_create_model renaming in unit test --- tests/unit/ml/test_golden_sql.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 25e12d87bf..d63bc7aaa1 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -43,7 +43,7 @@ def mock_session(): mock_session._anonymous_dataset, TEMP_MODEL_ID.model_id ) ) - mock_session._start_query_create_model.return_value = (None, query_job) + mock_session._start_query_ml_ddl.return_value = (None, query_job) return mock_session @@ -104,7 +104,7 @@ def test_linear_regression_default_fit( model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query_create_model.assert_called_once_with( + mock_session._start_query_ml_ddl.assert_called_once_with( 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -114,7 +114,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X, model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query_create_model.assert_called_once_with( + mock_session._start_query_ml_ddl.assert_called_once_with( 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -147,7 +147,7 @@ def test_logistic_regression_default_fit( model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query_create_model.assert_called_once_with( + mock_session._start_query_ml_ddl.assert_called_once_with( 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n auto_class_weights=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -161,7 +161,7 @@ def test_logistic_regression_params_fit( model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query_create_model.assert_called_once_with( + mock_session._start_query_ml_ddl.assert_called_once_with( 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=False,\n auto_class_weights=True,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) From b761d6d07ba5769fc3b993a89ac28081e8f1c230 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 13 Mar 2024 21:22:40 +0000 Subject: [PATCH 3/3] add a side assert in to_gbq tes --- tests/system/small/test_encryption.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index b98c0acca2..f13d2b9e1a 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -130,7 +130,7 @@ def test_df_apis(bq_cmek, session_with_bq_cmek, scalars_table_id): # Read a BQ table and assert encryption df = session_with_bq_cmek.read_gbq(scalars_table_id) - # Perform a few dataframe operations and assert assertion + # Perform a few dataframe operations and assert encryption df1 = df.dropna() _assert_bq_table_is_encrypted(df1, bq_cmek, session_with_bq_cmek) @@ -179,15 +179,32 @@ def test_to_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): df = session_with_bq_cmek.read_gbq(scalars_table_id) _assert_bq_table_is_encrypted(df, bq_cmek, session_with_bq_cmek) - # Modify the dataframe and assert assertion + # Modify the dataframe and assert encryption df = df.dropna().head() _assert_bq_table_is_encrypted(df, bq_cmek, session_with_bq_cmek) - # Write the result to BQ and assert assertion + # Write the result to BQ and assert encryption output_table_id = df.to_gbq() output_table = session_with_bq_cmek.bqclient.get_table(output_table_id) assert output_table.encryption_configuration.kms_key_name == bq_cmek + # Write the result to BQ custom table and assert encryption + session_with_bq_cmek.bqclient.get_table(output_table_id) + output_table_ref = bigframes.session._io.bigquery.random_table( + session_with_bq_cmek._anonymous_dataset + ) + output_table_id = str(output_table_ref) + df.to_gbq(output_table_id) + output_table = session_with_bq_cmek.bqclient.get_table(output_table_id) + assert output_table.encryption_configuration.kms_key_name == bq_cmek + + # Lastly, assert that the encryption is not because of any default set at + # the dataset level + output_table_dataset = session_with_bq_cmek.bqclient.get_dataset( + output_table.dataset_id + ) + assert output_table_dataset.default_encryption_configuration is None + @pytest.mark.skip( reason="Internal issue 327544164, cmek does not propagate to the dataframe."