I have a dataset (a CSV file) and I want to build a cohort analysis chart using the Plotly library. Is it possible? I couldn't find any tutorials on it.
import plotly.express as px
# Cohort analysis: bucket invoices by calendar month, assign every customer to
# the month of their first invoice, and chart each cohort's revenue relative to
# the revenue of its first month.

# Only the month matters. Going through a monthly Period maps every invoice to
# the first day of its own month; subtracting MonthBegin(1) instead would push
# invoices dated exactly on the 1st back into the previous month.
df["Date"] = pd.to_datetime(df["InvoiceDate"]).dt.to_period("M").dt.to_timestamp()

# A customer's cohort is the month of their earliest invoice.
df2 = df.merge(
    df.groupby(["CustomerID"], as_index=False).agg(Cohort=("Date", "min")),
    on="CustomerID",
)

# Whole months elapsed between the cohort month and the invoice month
# (0 for the cohort month itself). Plain year/month arithmetic avoids the
# deprecated Series.view() reinterpretation of period ordinals.
df2["Month"] = (df2["Date"].dt.year - df2["Cohort"].dt.year) * 12 + (
    df2["Date"].dt.month - df2["Cohort"].dt.month
)

# Revenue (Quantity * UnitPrice) per cohort and month offset, pivoted so that
# rows are cohorts and columns are month offsets.
df_cohort = (
    df2.assign(Revenue=df2["Quantity"] * df2["UnitPrice"])
    .groupby(["Cohort", "Month"])["Revenue"]
    .sum()
    .unstack("Month")
)

# Rebase every month as a fraction of the cohort's first-month revenue; the
# first month is pinned to exactly 1 (100%).
df_cohort = df_cohort.div(df_cohort[0], axis=0)
df_cohort[0] = 1

# Heatmap: month offsets along the top, one row per cohort month.
fig = (
    px.imshow(
        df_cohort, text_auto=".2%", color_continuous_scale="blues", range_color=[0, 1]
    )
    .update_xaxes(side="top", dtick=1)
    .update_yaxes(dtick="M1")
)
# Left as the final expression so a notebook renders it; call fig.show() when
# running as a plain script.
fig
import sys
import urllib
import urllib.parse
from pathlib import Path
from zipfile import ZipFile

import kaggle.cli
import pandas as pd
import plotly.graph_objects as go
# fmt: off
# Download the data set through the Kaggle CLI and load every file of the
# downloaded archive into a DataFrame.
url = "https://www.kaggle.com/datasets/carrie1/ecommerce-data"
# URL path without its leading slash, i.e. "datasets/carrie1/ecommerce-data".
ds = urllib.parse.urlparse(url).path[1:]
try:
    # main() is called without arguments, so the command line
    # "datasets download <ds>" is supplied through sys.argv.
    sys.argv = [sys.argv[0]] + f"datasets download {ds}".split(" ")
    kaggle.cli.main()
except NameError:
    # Retry with the first path segment ("datasets") dropped, leaving
    # "carrie1/ecommerce-data".
    # NOTE(review): presumably the CLI raises NameError for the three-part
    # form above - confirm against the installed kaggle version.
    ds = "/".join(ds.split("/")[1:])
    sys.argv = [sys.argv[0]] + f"datasets download {ds}".split(" ")
    kaggle.cli.main()

# Look for the archive in the current working directory, named after the last
# segment of the data set path. Sorting makes the choice deterministic when
# several archives match.
archives = sorted(Path.cwd().glob(f"{ds.split('/')[-1]}*.zip"))
if not archives:
    raise FileNotFoundError(
        f"no '{ds.split('/')[-1]}*.zip' archive found in {Path.cwd()}"
    )

# Read each archive member as CSV, keyed by its file name; the context
# managers make sure the archive and member handles are closed afterwards.
dfs = {}
with ZipFile(archives[0]) as zfile:
    for member in zfile.infolist():
        with zfile.open(member) as handle:
            dfs[member.filename] = pd.read_csv(handle, encoding='unicode_escape')
# fmt: on
df = dfs["data.csv"]