Implement multi-column groupby aggregation for pandas backend
Currently, Modin defaults to pandas for multi-column groupby; it should be implemented in a scalable, Modin-native way.
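For illustration, here is a minimal sketch of the operation this issue targets (the column names are arbitrary): grouping on a single column goes through Modin's partitioned implementation, while grouping on several columns currently triggers the default-to-pandas path.

```python
import modin.pandas as pd

df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 0, 1], "c": [10, 20, 30, 40]})

# Single-column groupby is handled by the partitioned backend.
df.groupby("a").sum()

# Multi-column groupby is the case in question: it currently falls back
# to pandas instead of running in a distributed way.
df.groupby(["a", "b"]).sum()
```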
Implementation proposal
The problem with multi-column groupby is that every column partition needs to contain the columns being grouped on. That problem was already solved by the commit that added the broadcast_apply_full_axis method to ModinFrame, so we can broadcast the 'by' columns to every column partition and group on them there; pivot_table already uses this approach for the same purpose. A short pandas-only sketch of the idea is given below.
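As a rough, pandas-only illustration of why this works (this is not Modin internals; the toy frame and column split below are made up): once every column partition also holds the 'by' columns, each partition can be grouped independently, and the per-partition results share the same group index, so they can simply be concatenated along the column axis.

```python
import numpy as np
import pandas as pd

# Toy frame and grouping columns (made-up data).
df = pd.DataFrame(np.arange(24).reshape(6, 4), columns=list("abcd"))
by = pd.DataFrame({"g1": [0, 0, 1, 1, 2, 2], "g2": [0, 1, 0, 1, 0, 1]})

# Pretend the frame is split into two column partitions.
partitions = [df[["a", "b"]], df[["c", "d"]]]

# Broadcast the 'by' frame to each partition and group inside the partition.
parts = [
    pd.concat([part, by], axis=1).groupby(["g1", "g2"]).sum() for part in partitions
]

# Concatenating the per-partition results column-wise gives the same answer
# as a groupby over the whole frame.
result = pd.concat(parts, axis=1)
expected = pd.concat([df, by], axis=1).groupby(["g1", "g2"]).sum()
assert result.equals(expected)
```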
I'm attaching a script that contains a sketch of the suggested implementation in the draft_groupby_agg_implementation function, along with code that measures its execution time against pandas and the old groupby_agg implementation.
Script
```python
import numpy as np
import os
import pandas

from timeit import default_timer as timer


def generate_data_file(
    nrows=200000,
    ncols=256,
    ngroups=100,
    filename=None,
    force=False,
    rand_low=-100,
    rand_high=100,
):
    if filename is None:
        filename = os.path.abspath(
            f"int_dataset-{nrows},{ncols},{rand_low},{rand_high},{ngroups}.csv"
        )
    print("generating", filename)
    if os.path.exists(filename) and not force:
        return filename
    data = {
        # Generates `ngroups` groups for groupby in each column
        f"col{i}": list(np.random.randint(rand_low, rand_high, ngroups))
        * (nrows // ngroups)
        for i in np.arange(ncols)
    }
    print("dict generated!")
    df = pandas.DataFrame(data)
    print("dataframe created!")
    df.to_csv(filename)
    print("csv ready!")
    return filename


def measure_time(df, by, groupby_implementation, **kwargs):
    print(f"\n======= Testing {groupby_implementation.__name__} =======")
    # Printing parameters:
    kwargs.update({"Frame shape": df.shape, "len(by)": len(by)})
    print("Parameters:")
    for k, v in kwargs.items():
        print(f"\t{k}: {v}")
    t1 = timer()
    md_res = groupby_implementation(df, by, "quantile")
    # repr to trigger materialization
    repr(md_res)
    md_res_t = timer() - t1
    pd_df = df._to_pandas()
    t1 = timer()
    pd_res = pd_df.groupby(by).agg("quantile")
    repr(pd_res)
    pd_res_t = timer() - t1
    print(f"Modin vs Pandas time: {md_res_t}s | {pd_res_t}s")
    # Asserting results equality
    df_equals(md_res, pd_res)


def draft_groupby_agg_implementation(df, by, fn):
    # Broadcast the 'by' columns to every column partition and group on them
    # within each partition.
    by_qc = df[by]._query_compiler
    df_qc = df._query_compiler.drop(columns=by)

    def applyier(df, other):
        concated = pandas.concat([df, other], axis=1, copy=False)
        return concated.groupby(by).agg(fn)

    return df.__constructor__(
        query_compiler=df_qc.__constructor__(
            df_qc._modin_frame.broadcast_apply_full_axis(
                axis=0, func=applyier, other=by_qc._modin_frame
            )
        )
    )


def old_implementation(df, by, fn):
    return df.groupby(by).agg(fn)


if __name__ == "__main__":
    import modin.pandas as pd
    from modin.pandas.test.utils import df_equals

    nrows = [200_000, 1_000_000]
    by_cols = [
        ["col0"],
        ["col0", "col1"],
        ["col0", "col1", "col2", "col3", "col4", "col5"],
    ]
    implementations = [draft_groupby_agg_implementation, old_implementation]
    ngroups = 100

    for nrows_ in nrows:
        fname = generate_data_file(nrows_, ngroups=ngroups)
        md_df = pd.read_csv(fname)
        for implementation in implementations:
            for by in by_cols:
                measure_time(md_df, by, implementation, ngroups=ngroups)
```
And here are the timings for groupby.agg("quantile") with 100 groups on a 112-core CPU.
| Shape | ‘by’ length | Suggested implementation | Current implementation | Pandas |
|---|---|---|---|---|
| (200_000, 257) | 1 | 2.45s | 3.26s | 5.65s |
| (200_000, 257) | 2 | 2.5s | 7.06s | 5.57s |
| (200_000, 257) | 6 | 2.75s | 7.16s | 5.65s |

| Shape | ‘by’ length | Suggested implementation | Current implementation | Pandas |
|---|---|---|---|---|
| (1_000_000, 257) | 1 | 7.56s | 8.54s | 34.68s |
| (1_000_000, 257) | 2 | 7.51s | 39.29s | 34.84s |
| (1_000_000, 257) | 6 | 7.41s | 38.25s | 34.34s |
cc @gshimansky @devin-petersohn, any comments/suggestions on the implementation?
Comments: 5 (5 by maintainers)
@anmyachev The H2O groupby queries are more about dictionary aggregation; the #2461 branch will not give much of a speed-up for dictionary aggregation (and so not for H2O). However, solving #2491 should.
This is one of the essential features for benchmarking H2O on Modin: https://github.com/h2oai/db-benchmark/issues/38
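For context, "dictionary aggregation" here means passing a dict to agg so that different columns get different aggregation functions, as the H2O groupby queries do. A minimal sketch with made-up column names (loosely following the H2O benchmark schema):

```python
import modin.pandas as pd

df = pd.DataFrame({"id1": [1, 1, 2], "v1": [1.0, 2.0, 3.0], "v3": [0.5, 1.5, 2.5]})

# Dictionary aggregation: each column is mapped to its own aggregation function.
df.groupby("id1").agg({"v1": "sum", "v3": "mean"})
```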