25 个例子学会Pandas Groupby 操作!
import pandas as pd
sales = pd.read_csv("sales_data.csv")
sales.head()
sales.groupby("store")["stock_qty"].mean()
#输出
store
Daisy 1811.861702
Rose 1677.680000
Violet 14622.406061
Name: stock_qty, dtype: float64
sales.groupby("store")[["stock_qty","price"]].mean()
sales.groupby("store")["stock_qty"].agg(["mean", "max"])
sales.groupby("store").agg(
avg_stock_qty = ("stock_qty", "mean"),
max_stock_qty = ("stock_qty", "max")
)
sales.groupby("store")[["stock_qty","price"]].agg(["mean", "max"])
sales.groupby("store").agg(
avg_stock_qty = ("stock_qty", "mean"),
avg_price = ("price", "mean")
)
sales.groupby("store", as_index=False).agg(
avg_stock_qty = ("stock_qty", "mean"),
avg_price = ("price", "mean")
)
sales.groupby(["store","product_group"], as_index=False).agg(
avg_sales = ("last_week_sales", "mean")
).head()
sales.groupby(["store","product_group"], as_index=False).agg( avg_sales = ("last_week_sales", "mean")
).sort_values(by="avg_sales", ascending=False).head()
sales.groupby("store")["last_week_sales"].nlargest(2)
store
Daisy 413 1883
231 947
Rose 948 883
263 623
Violet 991 3222
339 2690
Name: last_week_sales, dtype: int64
sales.groupby("store")["last_week_sales"].nsmallest(2)
sales_sorted = sales.sort_values(by=["store","last_month_sales"], ascending=False, ignore_index=True)
sales_sorted.groupby("store").nth(4)
sales_sorted.groupby("store").nth(-2)
sales.groupby("store", as_index=False).agg(
unique_values = ("product_code","unique")
)
sales.groupby("store", as_index=False).agg(
number_of_unique_values = ("product_code","nunique")
)
sales.groupby("store").agg(
total_sales_in_thousands = (
"last_month_sales",
lambda x: round(x.sum() / 1000, 1)
)
)
sales.groupby("store").apply(
lambda x: (x.last_week_sales - x.last_month_sales / 4).mean()
)
store
Daisy 5.094149
Rose 5.326250
Violet 8.965152
dtype: float64
sales.loc[1000] = [None, "PG2", 10000, 120, 64, 96, 15, 53]
sales.groupby("store")["price"].mean()
store
Daisy 69.327426
Rose 60.513700
Violet 67.808727
Name: price, dtype: float64
dropna=False)["price"].mean()
store
Daisy 69.327426
Rose 60.513700
Violet 67.808727
NaN 96.000000
Name: price, dtype: float64
sales.groupby(["store", "product_group"]).ngroups
18
aisy_pg1 = sales.groupby(
["Daisy","PG1") ]).get_group((
)
daisy_pg1.head()
sales["rank"] = sales.groupby("store"["price"].rank(
ascending=False, method="dense"
)
sales.head()
import numpy as npdf = pd.DataFrame(
{
"date": pd.date_range(start="2022-08-01", periods=8, freq="D"),
"category": list("AAAABBBB"),
"value": np.random.randint(10, 30, size=8)
}
)
df["cum_sum"] = df.groupby("category")["value"].cumsum()
df["cum_sum_2"] = df.groupby(
"category"
)["value"].expanding().sum().values
df["cum_mean"] = df.groupby(
"category"
)["value"].expanding().mean().values
df["current_highest"] = df.groupby(
"category"
)["value"].expanding().max().values
评论