From 854f48bb717f5e00a22a397bc08dace2cf779f32 Mon Sep 17 00:00:00 2001
From: Huan Chen <huanc@google.com>
Date: Thu, 7 Nov 2024 21:09:13 +0000
Subject: [PATCH 1/4] fix: exclude index columns from model fitting processes.

---
 bigframes/ml/core.py                  |  2 ++
 tests/system/large/ml/test_cluster.py | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index 4bc61c5015..4431feeb8c 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -306,6 +306,8 @@ def create_model(
         options = dict(options)
         # Cache dataframes to make sure base table is not a snapshot
         # cached dataframe creates a full copy, never uses snapshot
+
+        X_train = X_train.reset_index(drop=True)
         if y_train is None:
             input_data = X_train.cache()
         else:
diff --git a/tests/system/large/ml/test_cluster.py b/tests/system/large/ml/test_cluster.py
index 152fd168be..39368f490b 100644
--- a/tests/system/large/ml/test_cluster.py
+++ b/tests/system/large/ml/test_cluster.py
@@ -154,3 +154,13 @@ def test_cluster_configure_fit_load_params(penguins_df_default_index, dataset_id
     assert reloaded_model.distance_type == "COSINE"
     assert reloaded_model.max_iter == 30
     assert reloaded_model.tol == 0.001
+
+
+def test_model_centroids_with_custom_index(penguins_df_default_index):
+    model = cluster.KMeans(n_clusters=3)
+    penguins = penguins_df_default_index.set_index(["species", "island", "sex"])
+    model.fit(penguins)
+
+    assert (
+        not model.cluster_centers_["feature"].isin(["species", "island", "sex"]).any()
+    )

From 5b06d4b9d49c31aae9688de8ebfe2b4369224fbc Mon Sep 17 00:00:00 2001
From: Huan Chen <huanc@google.com>
Date: Thu, 7 Nov 2024 22:29:17 +0000
Subject: [PATCH 2/4] update logic

---
 bigframes/ml/core.py                       |  7 +++---
 tests/system/large/ml/test_linear_model.py | 27 ++++++++++++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index 4431feeb8c..86132e6aa3 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -307,11 +307,12 @@ def create_model(
         # Cache dataframes to make sure base table is not a snapshot
         # cached dataframe creates a full copy, never uses snapshot
 
-        X_train = X_train.reset_index(drop=True)
         if y_train is None:
-            input_data = X_train.cache()
+            input_data = X_train.reset_index(drop=True).cache()
         else:
-            input_data = X_train.join(y_train, how="outer").cache()
+            input_data = (
+                X_train.join(y_train, how="outer").reset_index(drop=True).cache()
+            )
             options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()})
 
         session = X_train._session
diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py
index f6ca26e7e4..96215c5e47 100644
--- a/tests/system/large/ml/test_linear_model.py
+++ b/tests/system/large/ml/test_linear_model.py
@@ -425,3 +425,30 @@ def test_logistic_regression_customized_params_fit_score(
     assert reloaded_model.tol == 0.02
     assert reloaded_model.learning_rate_strategy == "CONSTANT"
     assert reloaded_model.learning_rate == 0.2
+
+
+def test_model_centroids_with_custom_index(penguins_df_default_index):
+    model = bigframes.ml.linear_model.LogisticRegression(
+        fit_intercept=False,
+        class_weight="balanced",
+        l2_reg=0.2,
+        tol=0.02,
+        l1_reg=0.2,
+        max_iterations=30,
+        optimize_strategy="batch_gradient_descent",
+        learning_rate_strategy="constant",
+        learning_rate=0.2,
+    )
+    df = penguins_df_default_index.dropna().set_index(["species", "island"])
+    X_train = df[
+        [
+            "culmen_length_mm",
+            "culmen_depth_mm",
+            "flipper_length_mm",
+        ]
+    ]
+    y_train = df[["sex"]]
+    model.fit(X_train, y_train)
+
+    # If this line executes without errors, the model has correctly ignored the custom index columns
+    model.predict(X_train.reset_index(drop=True))

From e76824cc5e6598f243f35aae44581c0cf9a05045 Mon Sep 17 00:00:00 2001
From: Huan Chen <huanc@google.com>
Date: Thu, 7 Nov 2024 23:01:20 +0000
Subject: [PATCH 3/4] fix unit test

---
 tests/unit/ml/test_golden_sql.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py
index ce05011546..b6409e9532 100644
--- a/tests/unit/ml/test_golden_sql.py
+++ b/tests/unit/ml/test_golden_sql.py
@@ -85,6 +85,17 @@ def mock_X(mock_y, mock_session):
         ["index_column_id"],
         ["index_column_label"],
     )
+
+    mock_X.join(mock_y).reset_index(drop=True).sql = "input_X_y_no_index_sql"
+    mock_X.join(mock_y).reset_index(drop=True).cache.return_value = mock_X.join(
+        mock_y
+    ).reset_index(drop=True)
+    mock_X.join(mock_y).reset_index(drop=True)._to_sql_query.return_value = (
+        "input_X_y_no_index_sql",
+        ["index_column_id"],
+        ["index_column_label"],
+    )
+
     mock_X.cache.return_value = mock_X
 
     return mock_X
@@ -107,7 +118,7 @@ def test_linear_regression_default_fit(
     model.fit(mock_X, mock_y)
 
     mock_session._start_query_ml_ddl.assert_called_once_with(
-        "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n  model_type='LINEAR_REG',\n  data_split_method='NO_SPLIT',\n  optimize_strategy='auto_strategy',\n  fit_intercept=True,\n  l2_reg=0.0,\n  max_iterations=20,\n  learn_rate_strategy='line_search',\n  min_rel_progress=0.01,\n  calculate_p_values=False,\n  enable_global_explain=False,\n  INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql"
+        "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n  model_type='LINEAR_REG',\n  data_split_method='NO_SPLIT',\n  optimize_strategy='auto_strategy',\n  fit_intercept=True,\n  l2_reg=0.0,\n  max_iterations=20,\n  learn_rate_strategy='line_search',\n  min_rel_progress=0.01,\n  calculate_p_values=False,\n  enable_global_explain=False,\n  INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_no_index_sql"
     )
 
 
@@ -117,7 +128,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X,
     model.fit(mock_X, mock_y)
 
     mock_session._start_query_ml_ddl.assert_called_once_with(
-        "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n  model_type='LINEAR_REG',\n  data_split_method='NO_SPLIT',\n  optimize_strategy='auto_strategy',\n  fit_intercept=False,\n  l2_reg=0.0,\n  max_iterations=20,\n  learn_rate_strategy='line_search',\n  min_rel_progress=0.01,\n  calculate_p_values=False,\n  enable_global_explain=False,\n  INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql"
+        "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n  model_type='LINEAR_REG',\n  data_split_method='NO_SPLIT',\n  optimize_strategy='auto_strategy',\n  fit_intercept=False,\n  l2_reg=0.0,\n  max_iterations=20,\n  learn_rate_strategy='line_search',\n  min_rel_progress=0.01,\n  calculate_p_values=False,\n  enable_global_explain=False,\n  INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_no_index_sql"
     )
 
 
@@ -150,7 +161,7 @@ def test_logistic_regression_default_fit(
     model.fit(mock_X, mock_y)
 
     mock_session._start_query_ml_ddl.assert_called_once_with(
-        "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n  model_type='LOGISTIC_REG',\n  data_split_method='NO_SPLIT',\n  fit_intercept=True,\n  auto_class_weights=False,\n  optimize_strategy='auto_strategy',\n  l2_reg=0.0,\n  max_iterations=20,\n  learn_rate_strategy='line_search',\n  min_rel_progress=0.01,\n  calculate_p_values=False,\n  enable_global_explain=False,\n  INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql"
+        "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n  model_type='LOGISTIC_REG',\n  data_split_method='NO_SPLIT',\n  fit_intercept=True,\n  auto_class_weights=False,\n  optimize_strategy='auto_strategy',\n  l2_reg=0.0,\n  max_iterations=20,\n  learn_rate_strategy='line_search',\n  min_rel_progress=0.01,\n  calculate_p_values=False,\n  enable_global_explain=False,\n  INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_no_index_sql"
     )
 
 
@@ -172,7 +183,7 @@ def test_logistic_regression_params_fit(
     model.fit(mock_X, mock_y)
 
     mock_session._start_query_ml_ddl.assert_called_once_with(
-        "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n  model_type='LOGISTIC_REG',\n  data_split_method='NO_SPLIT',\n  fit_intercept=False,\n  auto_class_weights=True,\n  optimize_strategy='batch_gradient_descent',\n  l2_reg=0.2,\n  max_iterations=30,\n  learn_rate_strategy='constant',\n  min_rel_progress=0.02,\n  calculate_p_values=False,\n  enable_global_explain=False,\n  l1_reg=0.2,\n  learn_rate=0.2,\n  INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql"
+        "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n  model_type='LOGISTIC_REG',\n  data_split_method='NO_SPLIT',\n  fit_intercept=False,\n  auto_class_weights=True,\n  optimize_strategy='batch_gradient_descent',\n  l2_reg=0.2,\n  max_iterations=30,\n  learn_rate_strategy='constant',\n  min_rel_progress=0.02,\n  calculate_p_values=False,\n  enable_global_explain=False,\n  l1_reg=0.2,\n  learn_rate=0.2,\n  INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_no_index_sql"
     )
 
 

From 73a816bbf997439a57938c4c6ebc2ca8662f612b Mon Sep 17 00:00:00 2001
From: Huan Chen <huanc@google.com>
Date: Thu, 7 Nov 2024 23:02:23 +0000
Subject: [PATCH 4/4] remove empty line

---
 bigframes/ml/core.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index 86132e6aa3..be67396fba 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -306,7 +306,6 @@ def create_model(
         options = dict(options)
         # Cache dataframes to make sure base table is not a snapshot
         # cached dataframe creates a full copy, never uses snapshot
-
         if y_train is None:
             input_data = X_train.reset_index(drop=True).cache()
         else: