From e985a4af7db5842feeec3b0289a5d05591ed4172 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 24 Oct 2023 18:48:49 +0000 Subject: [PATCH 1/5] docs: add code samples for df reshaping, function, merge, and join methods --- .../bigframes_vendored/pandas/core/frame.py | 152 +++++++++++++++++- 1 file changed, 151 insertions(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b35d0f3b2e..6d660205f4 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2121,6 +2121,29 @@ def groupby( used to group large amounts of data and compute operations on these groups. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'Animal': ['Falcon', 'Falcon', + ... 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.]}) + >>> df + Animal Max Speed + 0 Falcon 380.0 + 1 Falcon 370.0 + 2 Parrot 24.0 + 3 Parrot 26.0 + + [4 rows x 2 columns] + + >>> df.groupby(['Animal'])['Max Speed'].mean() + Animal + Falcon 375.0 + Parrot 25.0 + Name: Max Speed, dtype: Float64 + Args: by (str, Sequence[str]): A label or list of labels may be passed to group by the columns @@ -2224,7 +2247,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: Python function wrapped by ``remote_function`` decorator, returns a single value from a single value. na_action (Optional[str], default None): - ``{None, 'ignore'}``, default None. If ‘ignore’, propagate NaN + ``{None, 'ignore'}``, default None. If `ignore`, propagate NaN values, without passing them to func. 
Returns: @@ -2240,6 +2263,45 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: Join columns with `other` DataFrame on index + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Join two DataFrames by specifying how to handle the operation: + + >>> df1 = bpd.DataFrame({'col1': ['foo', 'bar'], 'col2': [1, 2]}) + >>> df1 + col1 col2 + 0 foo 1 + 1 bar 2 + + [2 rows x 2 columns] + + >>> df2 = bpd.DataFrame({'col3': ['foo', 'baz'], 'col4': [3, 4]}) + >>> df2 + col3 col4 + 0 foo 3 + 1 baz 4 + + [2 rows x 2 columns] + + >>> df1.join(df2, how="outer") + col1 col2 col3 col4 + 0 foo 1 foo 3 + 1 bar 2 baz 4 + + [2 rows x 4 columns] + + Another option to join using the key columns is to use the on parameter: + + >>> df1.join(df2, on="col1", how="right") + col1 col2 col3 col4 + 0 foo 3 + 1 baz 4 + + [2 rows x 4 columns] + Args: other: DataFrame with an Index similar to the Index of this one. @@ -2292,6 +2354,71 @@ def merge( rows will be matched against each other. This is different from usual SQL join behaviour and can lead to unexpected results. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Merge DataFrames df1 and df2 by specifying type of merge: + + >>> df1 = bpd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) + >>> df1 + a b + 0 foo 1 + 1 bar 2 + + [2 rows x 2 columns] + + >>> df2 = bpd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) + >>> df2 + a c + 0 foo 3 + 1 baz 4 + + [2 rows x 2 columns] + + >>> df1.merge(df2, how="inner", on="a") + a b c + 0 foo 1 3 + + [1 rows x 3 columns] + + Merge df1 and df2 on the lkey and rkey columns. The value columns have + the default suffixes, _x and _y, appended. + + >>> df1 = bpd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], + ...
'value': [1, 2, 3, 5]}) + >>> df1 + lkey value + 0 foo 1 + 1 bar 2 + 2 baz 3 + 3 foo 5 + + [4 rows x 2 columns] + + >>> df2 = bpd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], + ... 'value': [5, 6, 7, 8]}) + >>> df2 + rkey value + 0 foo 5 + 1 bar 6 + 2 baz 7 + 3 foo 8 + + [4 rows x 2 columns] + + >>> df1.merge(df2, left_on='lkey', right_on='rkey') + lkey value_x rkey value_y + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + [6 rows x 4 columns] + Args: right: Object to merge with. @@ -2342,6 +2469,29 @@ def apply(self, func, *, args=(), **kwargs): the DataFrame's index (``axis=0``) the final return type is inferred from the return type of the applied function. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + [2 rows x 2 columns] + + >>> def square(x): + ... return x * x + >>> df1 = df.apply(square) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + [2 rows x 2 columns] + Args: func (function): Function to apply to each column or row. From 7bfdf7095a080a6021969bd22edad37e0f0fb48f Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 24 Oct 2023 18:48:49 +0000 Subject: [PATCH 2/5] docs: add code samples for df reshaping, function, merge, and join methods --- .../bigframes_vendored/pandas/core/frame.py | 152 +++++++++++++++++- 1 file changed, 151 insertions(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b35d0f3b2e..6d660205f4 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2121,6 +2121,29 @@ def groupby( used to group large amounts of data and compute operations on these groups.
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'Animal': ['Falcon', 'Falcon', + ... 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.]}) + >>> df + Animal Max Speed + 0 Falcon 380.0 + 1 Falcon 370.0 + 2 Parrot 24.0 + 3 Parrot 26.0 + + [4 rows x 2 columns] + + >>> df.groupby(['Animal'])['Max Speed'].mean() + Animal + Falcon 375.0 + Parrot 25.0 + Name: Max Speed, dtype: Float64 + Args: by (str, Sequence[str]): A label or list of labels may be passed to group by the columns @@ -2224,7 +2247,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: Python function wrapped by ``remote_function`` decorator, returns a single value from a single value. na_action (Optional[str], default None): - ``{None, 'ignore'}``, default None. If ‘ignore’, propagate NaN + ``{None, 'ignore'}``, default None. If `ignore`, propagate NaN values, without passing them to func. Returns: @@ -2240,6 +2263,45 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: Join columns with `other` DataFrame on index + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Join two DataFrames by specifying how to handle the operation: + + >>> df1 = bpd.DataFrame({'col1': ['foo', 'bar'], 'col2': [1, 2]}) + >>> df1 + col1 col2 + 0 foo 1 + 1 bar 2 + + [2 rows x 2 columns] + + >>> df2 = bpd.DataFrame({'col3': ['foo', 'baz'], 'col4': [3, 4]}) + >>> df2 + col3 col4 + 0 foo 3 + 1 baz 4 + + [2 rows x 2 columns] + + >>> df1.join(df2, how="outer") + col1 col2 col3 col4 + 0 foo 1 foo 3 + 1 bar 2 baz 4 + + [2 rows x 4 columns] + + Another option to join using the key columns is to use the on parameter: + + >>> df1.join(df2, on="col1", how="right") + col1 col2 col3 col4 + 0 foo 3 + 1 baz 4 + + [2 rows x 4 columns] + Args: other: DataFrame with an Index similar to the Index of this one. 
@@ -2292,6 +2354,71 @@ def merge( rows will be matched against each other. This is different from usual SQL join behaviour and can lead to unexpected results. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Merge DataFrames df1 and df2 by specifying type of merge: + + >>> df1 = bpd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) + >>> df1 + a b + 0 foo 1 + 1 bar 2 + + [2 rows x 2 columns] + + >>> df2 = bpd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) + >>> df2 + a c + 0 foo 3 + 1 baz 4 + + [2 rows x 2 columns] + + >>> df1.merge(df2, how="inner", on="a") + a b c + 0 foo 1 3 + + [1 rows x 3 columns] + + Merge df1 and df2 on the lkey and rkey columns. The value columns have + the default suffixes, _x and _y, appended. + + >>> df1 = bpd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], + ... 'value': [1, 2, 3, 5]}) + >>> df1 + lkey value + 0 foo 1 + 1 bar 2 + 2 baz 3 + 3 foo 5 + + [4 rows x 2 columns] + + >>> df2 = bpd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], + ... 'value': [5, 6, 7, 8]}) + >>> df2 + rkey value + 0 foo 5 + 1 bar 6 + 2 baz 7 + 3 foo 8 + + [4 rows x 2 columns] + + >>> df1.merge(df2, left_on='lkey', right_on='rkey') + lkey value_x rkey value_y + 0 foo 1 foo 5 + 1 foo 1 foo 8 + 2 bar 2 bar 6 + 3 baz 3 baz 7 + 4 foo 5 foo 5 + 5 foo 5 foo 8 + + [6 rows x 4 columns] + Args: right: Object to merge with. @@ -2342,6 +2469,29 @@ def apply(self, func, *, args=(), **kwargs): the DataFrame's index (``axis=0``) the final return type is inferred from the return type of the applied function. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + [2 rows x 2 columns] + + >>> def square(x): + ... return x * x + >>> df1 = df.apply(square) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + [2 rows x 2 columns] + Args: func (function): Function to apply to each column or row.
From 0dce18813710099031c60c94b5c192bbe5ee8e16 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Thu, 16 Nov 2023 21:35:07 +0000 Subject: [PATCH 3/5] address comments --- .../bigframes_vendored/pandas/core/frame.py | 98 ++++++++++++++++--- 1 file changed, 85 insertions(+), 13 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 6d660205f4..571a5de458 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2144,6 +2144,40 @@ def groupby( Parrot 25.0 Name: Max Speed, dtype: Float64 + We can also choose to include NA in group keys or not by setting `dropna` + parameter, the default setting is `True`: + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame([[1, 2, 3],[1, None, 4], [2, 1, 3], [1, 2, 2]], + ... columns=["a", "b", "c"]) + >>> df.groupby(by=["b"]).sum() + a c + b + 1.0 2 3 + 2.0 2 5 + + [2 rows x 2 columns] + + >>> df.groupby(by=["b"], dropna=False).sum() + a c + b + 1.0 2 3 + 2.0 2 5 + 1 4 + + [3 rows x 2 columns] + + We can also choose to return object with group labels or not by setting `as_index`. 
+ + >>> df.groupby(by=["b"], as_index=False).sum() + b a c + 0 1.0 2 3 + 1 2.0 2 5 + + [2 rows x 3 columns] + Args: by (str, Sequence[str]): A label or list of labels may be passed to group by the columns @@ -2270,35 +2304,66 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: Join two DataFrames by specifying how to handle the operation: - >>> df1 = bpd.DataFrame({'col1': ['foo', 'bar'], 'col2': [1, 2]}) + >>> df1 = bpd.DataFrame({'col1': ['foo', 'bar'], 'col2': [1, 2]}, index=[10, 11]) >>> df1 col1 col2 - 0 foo 1 - 1 bar 2 + 10 foo 1 + 11 bar 2 [2 rows x 2 columns] - >>> df2 = bpd.DataFrame({'col3': ['foo', 'baz'], 'col4': [3, 4]}) + >>> df2 = bpd.DataFrame({'col3': ['foo', 'baz'], 'col4': [3, 4]}, index=[21, 22]) >>> df2 col3 col4 - 0 foo 3 - 1 baz 4 + 21 foo 3 + 22 baz 4 [2 rows x 2 columns] - >>> df1.join(df2, how="outer") - col1 col2 col3 col4 - 0 foo 1 foo 3 - 1 bar 2 baz 4 + >>> df1.join(df2) + col1 col2 col3 col4 + 10 foo 1 + 11 bar 2 [2 rows x 4 columns] + >>> df1.join(df2, how="left") + col1 col2 col3 col4 + 10 foo 1 + 11 bar 2 + + [2 rows x 4 columns] + + >>> df1.join(df2, how="right") + col1 col2 col3 col4 + 21 foo 3 + 22 baz 4 + + [2 rows x 4 columns] + + >>> df1.join(df2, how="outer") + col1 col2 col3 col4 + 10 foo 1 + 11 bar 2 + 21 foo 3 + 22 baz 4 + + [4 rows x 4 columns] + + >>> df1.join(df2, how="inner") + Empty DataFrame + Columns: [col1, col2, col3, col4] + Index: [] + + [0 rows x 4 columns] + + Another option to join using the key columns is to use the on parameter: >>> df1.join(df2, on="col1", how="right") - col1 col2 col3 col4 - 0 foo 3 - 1 baz 4 + col1 col2 col3 col4 + 21 foo 3 + 22 baz 4 [2 rows x 4 columns] @@ -2383,6 +2448,13 @@ def merge( [1 rows x 3 columns] + >>> df1.merge(df2, how='left', on='a') + a b c + 0 foo 1 3 + 1 bar 2 + + [2 rows x 3 columns] + Merge df1 and df2 on the lkey and rkey columns. The value columns have the default suffixes, _x and _y, appended. 
From 9ec02e23f785abdf2b6aa7b52595028de9fc2892 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 17 Nov 2023 19:48:41 +0000 Subject: [PATCH 4/5] address additional comments --- .../bigframes_vendored/pandas/core/frame.py | 31 +++++++++---------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 571a5de458..5dcf850568 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2144,8 +2144,7 @@ def groupby( Parrot 25.0 Name: Max Speed, dtype: Float64 - We can also choose to include NA in group keys or not by setting `dropna` - parameter, the default setting is `True`: + We can also choose to include NA in group keys or not by setting `dropna`: >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None @@ -2312,10 +2311,10 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: [2 rows x 2 columns] - >>> df2 = bpd.DataFrame({'col3': ['foo', 'baz'], 'col4': [3, 4]}, index=[21, 22]) + >>> df2 = bpd.DataFrame({'col3': ['foo', 'baz'], 'col4': [3, 4]}, index=[11, 22]) >>> df2 col3 col4 - 21 foo 3 + 11 foo 3 22 baz 4 [2 rows x 2 columns] @@ -2323,20 +2322,20 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: >>> df1.join(df2) col1 col2 col3 col4 10 foo 1 - 11 bar 2 + 11 bar 2 foo 3 [2 rows x 4 columns] >>> df1.join(df2, how="left") col1 col2 col3 col4 10 foo 1 - 11 bar 2 + 11 bar 2 foo 3 [2 rows x 4 columns] >>> df1.join(df2, how="right") col1 col2 col3 col4 - 21 foo 3 + 11 bar 2 foo 3 22 baz 4 [2 rows x 4 columns] @@ -2344,26 +2343,24 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: >>> df1.join(df2, how="outer") col1 col2 col3 col4 10 foo 1 - 11 bar 2 - 21 foo 3 + 11 bar 2 foo 3 22 baz 4 - [4 rows x 4 columns] + [3 rows x 4 columns] >>> df1.join(df2, how="inner") - Empty DataFrame - 
Columns: [col1, col2, col3, col4] - Index: [] + col1 col2 col3 col4 + 11 bar 2 foo 3 - [0 rows x 4 columns] + [1 rows x 4 columns] Another option to join using the key columns is to use the on parameter: >>> df1.join(df2, on="col1", how="right") - col1 col2 col3 col4 - 21 foo 3 - 22 baz 4 + col1 col2 col3 col4 + 11 foo 3 + 22 baz 4 [2 rows x 4 columns] From e939e3ee2d5d9d4f708f6f75e8d58da1295c8fab Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 17 Nov 2023 22:50:41 +0000 Subject: [PATCH 5/5] delete the extra import --- third_party/bigframes_vendored/pandas/core/frame.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 5dcf850568..8033c064d7 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2146,9 +2146,6 @@ def groupby( We can also choose to include NA in group keys or not by setting `dropna`: - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - >>> df = bpd.DataFrame([[1, 2, 3],[1, None, 4], [2, 1, 3], [1, 2, 2]], ... columns=["a", "b", "c"]) >>> df.groupby(by=["b"]).sum() @@ -2168,7 +2165,7 @@ def groupby( [3 rows x 2 columns] - We can also choose to return object with group labels or not by setting `as_index`. + We can also choose to return object with group labels or not by setting `as_index`: >>> df.groupby(by=["b"], as_index=False).sum() b a c