From 28273d6c88e7644aaa4e02667530b8ec39d979a5 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 16 Apr 2024 17:28:12 +0000 Subject: [PATCH 1/3] feat: Add transpose support for small homogeneously typed DataFrames. --- bigframes/core/blocks.py | 54 +++++++++++++ bigframes/dataframe.py | 7 ++ tests/system/small/test_dataframe.py | 18 +++++ .../bigframes_vendored/pandas/core/frame.py | 76 +++++++++++++++++++ 4 files changed, 155 insertions(+) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 0f9cacd83d..9249923249 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1542,6 +1542,13 @@ def melt( var_names=typing.Sequence[typing.Hashable], value_name: typing.Hashable = "value", ): + """ + Unpivot columns to produce longer, narrower dataframe. + Arguments correspond to pandas.melt arguments. + id_vars: passthrough variables that will be repeated N times + value_vars: variables that will be melted together + var_names: + """ # TODO: Implement col_level and ignore_index unpivot_col_id = guid.generate_guid() var_col_ids = tuple([guid.generate_guid() for _ in var_names]) @@ -1570,6 +1577,53 @@ def melt( index_columns=[index_id], ) + def transpose(self) -> Block: + """Transpose the block. Will fail if dtypes incompatible or too many rows""" + original_col_index = self.column_labels + original_row_index = self.index.to_pandas() + original_row_count = len(original_row_index) + LIMIT = 9900 + if original_row_count > LIMIT: + raise NotImplementedError( + f"Object has {original_row_count} rows and is too large to transpose." + ) + + # Add row numbers to both axes to disambiguate, clean them up later + block = self + numbered_block = block.with_column_labels( + utils.combine_indices( + block.column_labels, pd.Index(range(len(block.column_labels))) + ) + ) + numbered_block, offsets = numbered_block.promote_offsets() + + stacked_block = numbered_block.melt( + id_vars=(offsets,), + var_names=( + *[name for name in original_col_index.names], + "col_offset", + ), + value_vars=block.value_columns, + ) + col_labels = stacked_block.value_columns[-2 - original_col_index.nlevels : -2] + col_offset = stacked_block.value_columns[-2] # disambiguator we created earlier + cell_values = stacked_block.value_columns[-1] + # Groupby source column + stacked_block = stacked_block.set_index( + [*col_labels, col_offset] + ) # col index is now row index + result = stacked_block.pivot( + columns=[offsets], + values=[cell_values], + columns_unique_values=tuple(range(original_row_count)), + ) + # Drop the offsets from both axes before returning + return ( + result.with_column_labels(original_row_index) + .order_by([ordering.ascending_over(result.index_columns[-1])]) + .drop_levels([result.index_columns[-1]]) + ) + def _create_stack_column( self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple] ): diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 11e592542c..bbb83662fe 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -311,6 +311,13 @@ def bqclient(self) -> bigframes.Session: def _session(self) -> bigframes.Session: return self._get_block().expr.session + @property + def T(self) -> DataFrame: + return DataFrame(self._get_block().transpose()) + + def transpose(self) -> DataFrame: + return self.T + def __len__(self): rows, _ = self.shape return rows diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 4c598a682d..1a2bb828c3 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2465,6 +2465,24 @@ def test_df_describe(scalars_dfs): ).all() +def test_df_transpose(): + # Include some floats to ensure type coercion + values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]] + # Test complex case of both axes being multi-indices with non-unique elements + columns = pd.Index(["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow")) + columns_multi = pd.MultiIndex.from_arrays([columns, columns], names=["c1", "c2"]) + index = pd.Index(["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow")) + rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) + + pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) + bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi) + + pd_result = pd_df.T + bf_result = bf_df.T.to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + @pytest.mark.parametrize( ("ordered"), [ diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index c692bdbfec..0b1b213a1a 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -93,6 +93,82 @@ def values(self) -> np.ndarray: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def T(self) -> DataFrame: + """ + The transpose of the DataFrame. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + [2 rows x 2 columns] + + >>> df.T + 0 1 + col1 1 2 + col2 3 4 + + [2 rows x 2 columns] + + Returns: + DataFrame: The transposed DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def transpose(self) -> DataFrame: + """ + Transpose index and columns. + + Reflect the DataFrame over its main diagonal by writing rows as columns + and vice-versa. The property :attr:`.T` is an accessor to the method + :meth:`transpose`. + + **Examples:** + **Square DataFrame with homogeneous dtype** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} + >>> df1 = bpd.DataFrame(data=d1) + >>> df1 + col1 col2 + 0 1 3 + 1 2 4 + + [2 rows x 2 columns] + + >>> df1_transposed = df1.T # or df1.transpose() + >>> df1_transposed + 0 1 + col1 1 2 + col2 3 4 + + [2 rows x 2 columns] + + When the dtype is homogeneous in the original DataFrame, we get a + transposed DataFrame with the same dtype: + + >>> df1.dtypes + col1 Int64 + col2 Int64 + dtype: object + >>> df1_transposed.dtypes + 0 Int64 + 1 Int64 + dtype: object + + Returns: + DataFrame: The transposed DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def info( self, verbose: bool | None = None, From aa43f87dac7e6a3795b345621814a86a0c47246f Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 18 Apr 2024 17:55:00 +0000 Subject: [PATCH 2/3] cleanup comments and column limit --- bigframes/core/blocks.py | 7 ++----- third_party/bigframes_vendored/pandas/core/frame.py | 2 ++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 9249923249..dd5fe49c31 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -35,6 +35,7 @@ import pyarrow as pa import bigframes._config.sampling_options as sampling_options +import bigframes.constants import bigframes.constants as constants import bigframes.core as core import bigframes.core.expression as ex @@ -1545,9 +1546,6 @@ def melt( """ Unpivot columns to produce longer, narrower dataframe. Arguments correspond to pandas.melt arguments. - id_vars: passthrough variables that will be repeated N times - value_vars: variables that will be melted together - var_names: """ # TODO: Implement col_level and ignore_index unpivot_col_id = guid.generate_guid() @@ -1582,8 +1580,7 @@ def transpose(self) -> Block: original_col_index = self.column_labels original_row_index = self.index.to_pandas() original_row_count = len(original_row_index) - LIMIT = 9900 - if original_row_count > LIMIT: + if original_row_count > bigframes.constants.MAX_COLUMNS: raise NotImplementedError( f"Object has {original_row_count} rows and is too large to transpose." ) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 0b1b213a1a..8666199043 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -99,6 +99,7 @@ def T(self) -> DataFrame: The transpose of the DataFrame. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) @@ -130,6 +131,7 @@ def transpose(self) -> DataFrame: :meth:`transpose`. **Examples:** + **Square DataFrame with homogeneous dtype** >>> import bigframes.pandas as bpd From 234da482bfbf861c5ba501317c54df1e7f0febc4 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 23 Apr 2024 00:06:59 +0000 Subject: [PATCH 3/3] address pr comments --- bigframes/core/blocks.py | 2 +- tests/system/small/test_dataframe.py | 5 +++++ third_party/bigframes_vendored/pandas/core/frame.py | 4 ++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index dd5fe49c31..2a888125f8 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1576,7 +1576,7 @@ def melt( ) def transpose(self) -> Block: - """Transpose the block. Will fail if dtypes incompatible or too many rows""" + """Transpose the block. Will fail if dtypes aren't coercible to a common type or too many rows""" original_col_index = self.column_labels original_row_index = self.index.to_pandas() original_row_count = len(original_row_index) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 1a2bb828c3..f41a21add0 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2483,6 +2483,11 @@ def test_df_transpose(): pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) +def test_df_transpose_error(): + with pytest.raises(TypeError, match="Cannot coerce.*to a common type."): + dataframe.DataFrame([[1, "hello"], [2, "world"]]).transpose() + + @pytest.mark.parametrize( ("ordered"), [ diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 8666199043..d639aa251e 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -98,6 +98,8 @@ def T(self) -> DataFrame: """ The transpose of the DataFrame. + All columns must be the same dtype (numerics can be coerced to a common supertype). + **Examples:** >>> import bigframes.pandas as bpd @@ -130,6 +132,8 @@ def transpose(self) -> DataFrame: and vice-versa. The property :attr:`.T` is an accessor to the method :meth:`transpose`. + All columns must be the same dtype (numerics can be coerced to a common supertype). + **Examples:** **Square DataFrame with homogeneous dtype**