Multi GroupBy Aggregates and Enhancements
See original GitHub issueGroupBy Enhancements
Hi Pandas developers and enthusiasts! This is my first time posting, so if I did anything wrong, sorry!
I really really love Pandas, and I was always astonished at its super fast speed. Every dataset, including many millions of rows took only milliseconds-seconds to process!
But, when I wanted to use the GroupBy function, I noticed a problem. Since the Agg function uses a dictionary, that means referencing 1 column to do 2 things is not allowed. Also, I was shocked there was no mode function, nor range, IQR etc etc.
So, maybe I’m annoying you guys, I really wanted to make Pandas even easier to use, and more feature complete!
So I made a similar groupby function, but this time, you can write your own functions, and other methods are provided 😃
Thanks for reading!
groupby(data, by = "vendor_id", how = 'mean') #Get all the means
groupby(data, by = ["vendor_id","passenger_count"], how = 'mode') #Get all the modes
groupby(data, by = ["vendor_id","passenger_count"],
how = 'pickup_longitude = iqr') #Get IQR of one col
groupby(data, by = ["vendor_id","passenger_count"],
how = 'pickup_longitude = mean, pickup_longitude = percent_total')
#Same column get mean, percentage of total
groupby(data, by = ["vendor_id","passenger_count"], how = 'pickup_longitude = mean, pickup_longitude = percent_total')
groupby(data, by = ["vendor_id","store_and_fwd_flag"], how = 'passenger_count = f(x.kurt()), passenger_count = f(x.std())')
#Provide own functions with f(x) notation
Functions: Choose from: MODE/LEAST: 1. Mode 2. Mode_group 3. Least 4. Least_group FREQUENCY: 1. Freq_total 2. Freq_group 3. Mode Count 4. Least Count STATISTICS: 1. Range 2. Variance 3. Std 4. IQR FUNCTIONS: Use f(x.var()) notation
def groupby(data, by, how, cols = "all", decimals = 3):
if cols == 'all': cols = columns(data)
new = 0;
if type(by) is str: by = [by]
if "=" not in how:
try:
return eval('data.groupby(by = by)[diff(cols,by)].'+how+'()')
except: new = 1;
if new == 1:
how = [x+" = "+how for x in diff(cols,by)]
how = ', '.join(how)
if "=" in how:
to_do = []
for x in how.split(','):
a = tuple(y.lstrip().rstrip() for y in x.split('='))
to_do.append(a)
grouped = data.groupby(by = by)
if len(by) <= 2: second = by[0];
elif len(by) > 2: second = by[0:-1];
result = None; p = []
for key in to_do:
try:
new_name = key[0]+">"+key[1]
what = key[1].lower()
if what == 'mode':
right = grouped[key[0]].apply(lambda x: x.value_counts().index[0]).reset_index()[by+[key[0]]]
elif what == 'mode_count':
right = grouped[key[0]].apply(lambda x: x.value_counts().iloc[0]).reset_index()[by+[key[0]]]
elif what == 'least_count':
right = grouped[key[0]].apply(lambda x: x.value_counts().iloc[-1]).reset_index()[by+[key[0]]]
elif what == 'mode_group':
right = grouped[key[0]].apply(lambda x: (x.value_counts().iloc[0]/x.value_counts().sum()*100).round(decimals)).reset_index()[by+[key[0]]]
elif what == 'least_group':
right = grouped[key[0]].apply(lambda x: (x.value_counts().iloc[-1]/x.value_counts().sum()*100).round(decimals)).reset_index()[by+[key[0]]]
elif what == 'least':
right = grouped[key[0]].apply(lambda x: x.value_counts().index[-1]).reset_index()[by+[key[0]]]
elif what == 'std':
right = grouped[key[0]].apply(lambda x: x.std().round(decimals)).reset_index()[by+[key[0]]]
elif what == 'var':
right = grouped[key[0]].apply(lambda x: x.var().round(decimals)).reset_index()[by+[key[0]]]
elif what == 'iqr':
right = grouped[key[0]].apply(lambda x: x.quantile(0.75)-x.quantile(0.25)).reset_index()[by+[key[0]]]
elif what == 'range':
right = grouped[key[0]].apply(lambda x: x.max() - x.min()).reset_index()[by+[key[0]]]
elif what in ["freq_total", "percent_total"]:
if what == "freq_total":
right = pd.DataFrame(grouped[key[0]].count()).reset_index()
elif what == 'percent_total':
right = pd.DataFrame(grouped[key[0]].sum()).reset_index()
summed = right[key[0]].sum()
right[key[0]] = round((right[key[0]] / summed*100),decimals)
elif 'group' in what:
if what == "freq_group":
right = pd.DataFrame(grouped[key[0]].count()).reset_index()
summed = data.groupby(by = second).agg({key[0]:'count'}).reset_index()
elif what == 'percent_group':
right = pd.DataFrame(grouped[key[0]].sum()).reset_index()
summed = data.groupby(by = second).agg({key[0]:'sum'}).reset_index()
tot_name = key[0]+">total";
summed.columns = change(columns(summed), key[0], tot_name)
df = pd.merge(left = right, right = summed, on = second, how = 'left')
right[key[0]] = round((df[key[0]]/df[tot_name]*100),decimals)
elif 'f(' in what:
command = what.replace('f(','')[0:-1]
right = eval('grouped[key[0]].apply(lambda x: '+command+').reset_index()[by+[key[0]]]')
else:
right = grouped.agg({key[0]:key[1]}).reset_index()
right.columns = change(columns(right), key[0], new_name)
if result is None: result = exc(grouped.agg({key[0]:'count'}).reset_index(), key[0])
result[right.columns[-1]] = right[right.columns[-1]]
except: pass;
try:
return result.set_index(by, drop = True)
except:
print("Wrong method\n\nChoose from:\nMODE/LEAST: 1. Mode 2. Mode_group 3. Least 4. Least_group")
print("FREQUENCY: 1. Freq_total 2. Freq_group 3. Mode Count 4. Least Count")
print("STATISTICS: 1. Range 2. Variance 3. Std 4. IQR\n")
def columns(x):
return x.columns.tolist()
def diff(want, rem):
w = copy(want)
for j in w:
if j in rem: w.remove(j)
for j in rem:
if j in w: w.remove(j)
return w
def exc(data, x):
return data[diff(columns(data), x)]
def change(l, a, y):
return [x if x != a else y for x in l]
Issue Analytics
- State:
- Created 6 years ago
- Comments:10 (5 by maintainers)
Top GitHub Comments
This should be possible with a syntax like
df.groupby(..).agg({'col': ['mean', 'median']})
Yep sorry!