From 854f48bb717f5e00a22a397bc08dace2cf779f32 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 7 Nov 2024 21:09:13 +0000 Subject: [PATCH 1/4] fix: exclude index columns from model fitting processes. --- bigframes/ml/core.py | 2 ++ tests/system/large/ml/test_cluster.py | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 4bc61c5015..4431feeb8c 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -306,6 +306,8 @@ def create_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot + + X_train = X_train.reset_index(drop=True) if y_train is None: input_data = X_train.cache() else: diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py index 152fd168be..39368f490b 100644 --- a/tests/system/large/ml/test_cluster.py +++ b/tests/system/large/ml/test_cluster.py @@ -154,3 +154,13 @@ def test_cluster_configure_fit_load_params(penguins_df_default_index, dataset_id assert reloaded_model.distance_type == "COSINE" assert reloaded_model.max_iter == 30 assert reloaded_model.tol == 0.001 + + +def test_model_centroids_with_custom_index(penguins_df_default_index): + model = cluster.KMeans(n_clusters=3) + penguins = penguins_df_default_index.set_index(["species", "island", "sex"]) + model.fit(penguins) + + assert ( + not model.cluster_centers_["feature"].isin(["species", "island", "sex"]).any() + ) From 5b06d4b9d49c31aae9688de8ebfe2b4369224fbc Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 7 Nov 2024 22:29:17 +0000 Subject: [PATCH 2/4] update logic --- bigframes/ml/core.py | 7 +++--- tests/system/large/ml/test_linear_model.py | 27 ++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 4431feeb8c..86132e6aa3 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -307,11 +307,12 @@ def create_model( # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - X_train = X_train.reset_index(drop=True) if y_train is None: - input_data = X_train.cache() + input_data = X_train.reset_index(drop=True).cache() else: - input_data = X_train.join(y_train, how="outer").cache() + input_data = ( + X_train.join(y_train, how="outer").reset_index(drop=True).cache() + ) options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index f6ca26e7e4..96215c5e47 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -425,3 +425,30 @@ def test_logistic_regression_customized_params_fit_score( assert reloaded_model.tol == 0.02 assert reloaded_model.learning_rate_strategy == "CONSTANT" assert reloaded_model.learning_rate == 0.2 + + +def test_model_centroids_with_custom_index(penguins_df_default_index): + model = bigframes.ml.linear_model.LogisticRegression( + fit_intercept=False, + class_weight="balanced", + l2_reg=0.2, + tol=0.02, + l1_reg=0.2, + max_iterations=30, + optimize_strategy="batch_gradient_descent", + learning_rate_strategy="constant", + learning_rate=0.2, + ) + df = penguins_df_default_index.dropna().set_index(["species", "island"]) + X_train = df[ + [ + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + ] + ] + y_train = df[["sex"]] + model.fit(X_train, y_train) + + # If this line executes without errors, the model has correctly ignored the custom index columns + model.predict(X_train.reset_index(drop=True)) From e76824cc5e6598f243f35aae44581c0cf9a05045 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 7 Nov 2024 23:01:20 +0000 Subject: [PATCH 3/4] fix unit test --- tests/unit/ml/test_golden_sql.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index ce05011546..b6409e9532 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -85,6 +85,17 @@ def mock_X(mock_y, mock_session): ["index_column_id"], ["index_column_label"], ) + + mock_X.join(mock_y).reset_index(drop=True).sql = "input_X_y_no_index_sql" + mock_X.join(mock_y).reset_index(drop=True).cache.return_value = mock_X.join( + mock_y + ).reset_index(drop=True) + mock_X.join(mock_y).reset_index(drop=True)._to_sql_query.return_value = ( + "input_X_y_no_index_sql", + ["index_column_id"], + ["index_column_label"], + ) + mock_X.cache.return_value = mock_X return mock_X @@ -107,7 +118,7 @@ def test_linear_regression_default_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LINEAR_REG',\n data_split_method='NO_SPLIT',\n optimize_strategy='auto_strategy',\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql" + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LINEAR_REG',\n data_split_method='NO_SPLIT',\n optimize_strategy='auto_strategy',\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_no_index_sql" ) @@ -117,7 +128,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X, model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LINEAR_REG',\n data_split_method='NO_SPLIT',\n optimize_strategy='auto_strategy',\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql" + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LINEAR_REG',\n data_split_method='NO_SPLIT',\n optimize_strategy='auto_strategy',\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_no_index_sql" ) @@ -150,7 +161,7 @@ def test_logistic_regression_default_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LOGISTIC_REG',\n data_split_method='NO_SPLIT',\n fit_intercept=True,\n auto_class_weights=False,\n optimize_strategy='auto_strategy',\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql" + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LOGISTIC_REG',\n data_split_method='NO_SPLIT',\n fit_intercept=True,\n auto_class_weights=False,\n optimize_strategy='auto_strategy',\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_no_index_sql" ) @@ -172,7 +183,7 @@ def test_logistic_regression_params_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LOGISTIC_REG',\n data_split_method='NO_SPLIT',\n fit_intercept=False,\n auto_class_weights=True,\n optimize_strategy='batch_gradient_descent',\n l2_reg=0.2,\n max_iterations=30,\n learn_rate_strategy='constant',\n min_rel_progress=0.02,\n calculate_p_values=False,\n enable_global_explain=False,\n l1_reg=0.2,\n learn_rate=0.2,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql" + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LOGISTIC_REG',\n data_split_method='NO_SPLIT',\n fit_intercept=False,\n auto_class_weights=True,\n optimize_strategy='batch_gradient_descent',\n l2_reg=0.2,\n max_iterations=30,\n learn_rate_strategy='constant',\n min_rel_progress=0.02,\n calculate_p_values=False,\n enable_global_explain=False,\n l1_reg=0.2,\n learn_rate=0.2,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_no_index_sql" ) From 73a816bbf997439a57938c4c6ebc2ca8662f612b Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 7 Nov 2024 23:02:23 +0000 Subject: [PATCH 4/4] remove empty line --- bigframes/ml/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 86132e6aa3..be67396fba 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -306,7 +306,6 @@ def create_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - if y_train is None: input_data = X_train.reset_index(drop=True).cache() else: