Show the code
---
title: FAS Account Cohort Analysis
description: Quarterly trends in Fedora account creation and return cohorts.
date: 2025-10-12
---
{'title': 'FAS Account Cohort Analysis', 'refresh': 'weekly'}
{'title': 'FAS Account Cohort Analysis', 'refresh': 'weekly'}
This notebook tracks community re‑engagement over time by analyzing Fedora account creation and return activity using weekly cohorts and visualizing user retention rates.
We define a cohort as the set of FAS accounts created in the same week, and we then follow each cohort's activity in the weeks after enrollment.
import os
import glob
from pathlib import Path
from datetime import datetime, timedelta
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
# Global plot styling for the figures below.
plt.style.use("seaborn-v0_8")
sns.set_theme(context="notebook", style="whitegrid")

# @replace DATA_SOURCES
DATA_SOURCES = {"datagrepper-parse-accounts": "/home/jovyan/work/bus2parquet/output_users"}
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]

# Only files dated on or after this day are loaded: the first day of the
# current month, minus 52 weeks.
cutoff_date = (pd.Timestamp.now().replace(day=1) - pd.DateOffset(weeks=52)).date()

# File names look like "fedora-YYYYMMDD[...].parquet"; an optional
# "_processed" marker is removed before the date part is parsed.
files = [
    str(path)
    for path in Path(parquet_dir).glob("fedora-*.parquet")
    if datetime.strptime(
        path.stem.replace("_processed", "").split("-")[1], "%Y%m%d"
    ).date() >= cutoff_date
]

dataset = ds.dataset(files, format="parquet")

# Stream the dataset in record batches; batches lacking either required
# column are dropped, the rest get "sent_at" parsed and floored to seconds.
chunks = []
for batch in dataset.to_batches(batch_size=50_000):
    df = batch.to_pandas()
    if {"sent_at", "username"}.issubset(df.columns):
        df["sent_at"] = pd.to_datetime(df["sent_at"], errors="coerce").dt.floor("s")
        chunks.append(df)

combined_df = pd.DataFrame() if not chunks else pd.concat(chunks, ignore_index=True)

if combined_df.empty:
    print("No data found in cutoff range")
else:
    print("Maximum date in data:", combined_df["sent_at"].max().date())
    print("Minimum date in data:", combined_df["sent_at"].min().date())

# Output captured in the rendered notebook:
#   Maximum date in data: 2025-09-10
#   Minimum date in data: 2024-09-02
# --- Cohort definition ------------------------------------------------------
# New accounts are the rows whose topic is the FAS "user.create" message.
new_users_df = combined_df[
    combined_df['topic'] == 'org.fedoraproject.prod.fas.user.create'
].copy()

# Each account is assigned to the weekly period containing its creation
# message; the cohort key is that period's start timestamp.
new_users_df['cohort_week'] = new_users_df['sent_at'].dt.to_period('W').dt.start_time
new_users_df['cohort_label'] = new_users_df['cohort_week'].dt.strftime('Week of %Y-%m-%d')

# Number of distinct accounts created per cohort week.
cohort_sizes = new_users_df.groupby('cohort_week')['username'].nunique()
user_cohorts = new_users_df[['username', 'cohort_week', 'cohort_label']].drop_duplicates()

# --- Retention matrix -------------------------------------------------------
# NOTE(review): activity_df is not built in this cell; it is assumed to carry
# 'username', 'cohort_week' and 'week_since_cohort' columns -- confirm against
# the cell that creates it.
weekly_activity = (
    activity_df
    .groupby(['cohort_week', 'week_since_cohort'])['username']
    .nunique()
    .reset_index()
)

# Rows = cohort week, columns = weeks since creation, values = distinct
# active users. fillna(0) must come AFTER reindex: reindexing onto every
# cohort adds all-NaN rows for cohorts with no recorded activity, and those
# NaNs would otherwise break int() in the annotation loop below.
retention_counts = (
    weekly_activity
    .pivot(index='cohort_week', columns='week_since_cohort', values='username')
    .reindex(cohort_sizes.index.sort_values())
    .fillna(0)
)
retention_rate = retention_counts.div(cohort_sizes, axis=0) * 100

# Last day covered by the data; later weeks cannot have been observed yet.
max_date = combined_df['sent_at'].max().date()

# --- Cell annotations ("returned/total") ------------------------------------
# Only the first 12 week columns are displayed.
retention_rate = retention_rate.iloc[:, :12]
annotated = retention_counts.iloc[:, :12].astype(object)

for cohort in annotated.index:
    # The index value is already the cohort's start Timestamp, so no string
    # parsing or positional lookup is needed.
    cohort_start = cohort.date()
    total = cohort_sizes[cohort]
    for week in annotated.columns:
        week_date = cohort_start + timedelta(weeks=int(week))
        if week_date > max_date:
            # Week lies beyond the data range: leave the cell blank.
            annotated.loc[cohort, week] = " "
        else:
            returned = retention_counts.loc[cohort, week]
            annotated.loc[cohort, week] = f"{int(returned)}/{int(total)}"
label_map = new_users_df.drop_duplicates('cohort_week').set_index('cohort_week')['cohort_label']
The heatmap below has one row per weekly cohort of newly created FAS accounts. Each cell shows “Number of Returned Users” / “Total Accounts Created in that Week”, and the X axis counts weeks forward from account creation. For example, if an account created in a given week is active again in what is that user’s Week 2, the Week 2 count for its cohort increases.
# Heatmap of weekly cohort retention: colour encodes the retention rate (%),
# the text in each cell is the "returned/total" annotation.
plt.figure(figsize=(16, 20))
sns.heatmap(
    retention_rate,
    annot=annotated,
    fmt="",  # annotations are pre-formatted strings
    cmap="Blues",
    cbar_kws={'label': 'Retention Rate (%)'},
    linewidths=0.5,
    linecolor='gray'
)
plt.title('FAS Weekly Cohort Retention')
plt.xlabel('Weeks Since Account Creation')
plt.ylabel('Cohort Week')

# Labels are derived from the plotted frame instead of a hard-coded 12, so
# they always match the number of columns and rows actually drawn.
week_labels = [f"Week {int(week)}" for week in retention_rate.columns]
cohort_labels = [
    label_map.get(cohort, cohort.strftime('%Y-%m-%d'))
    for cohort in retention_rate.index
]

# Heatmap cell i spans [i, i + 1] on each axis, so ticks go at i + 0.5 to
# sit in the middle of a cell rather than on its edge.
plt.xticks(ticks=np.arange(len(week_labels)) + 0.5, labels=week_labels)
# rotation=0 keeps the long cohort labels horizontal.
plt.yticks(ticks=np.arange(len(cohort_labels)) + 0.5, labels=cohort_labels, rotation=0)

plt.tight_layout()
plt.show()
# --- Retention matrix restricted to filtered activity -----------------------
# NOTE(review): filtered_activity is not built in this cell; it is assumed to
# carry 'username', 'cohort_week' and 'week_since_cohort' columns -- confirm
# against the cell that creates it.
filtered_weekly = (
    filtered_activity
    .groupby(['cohort_week', 'week_since_cohort'])['username']
    .nunique()
    .reset_index()
)

# fillna(0) must come AFTER reindex: a cohort with no qualifying users is
# absent from the pivot, reindex adds it as an all-NaN row, and int(NaN)
# would raise in the annotation loop below.
filtered_counts = (
    filtered_weekly
    .pivot(index='cohort_week', columns='week_since_cohort', values='username')
    .reindex(cohort_sizes.index.sort_values())
    .fillna(0)
)
filtered_rate = filtered_counts.div(cohort_sizes, axis=0) * 100

# Only the first 12 week columns are displayed.
filtered_rate = filtered_rate.iloc[:, :12]
filtered_annotated = filtered_counts.iloc[:, :12].astype(object)

for cohort in filtered_annotated.index:
    cohort_start = cohort.date()
    total = cohort_sizes.get(cohort, 0)
    for week in filtered_annotated.columns:
        current_week_date = cohort_start + timedelta(weeks=int(week))
        if current_week_date > max_date:
            # Week lies beyond the data range: leave the cell blank.
            filtered_annotated.loc[cohort, week] = " "
        else:
            returned = filtered_counts.loc[cohort, week]
            filtered_annotated.loc[cohort, week] = (
                f"{int(returned)}/{int(total)}" if total else "0/0"
            )
label_map = new_users_df.drop_duplicates('cohort_week').set_index('cohort_week')['cohort_label']
In this view, a user is counted as a returned contributor for a given week only if they performed 5 or more activities across the message bus in that week.
# Heatmap of the filtered retention matrix: colour encodes the retention
# rate (%), the text in each cell is the "returned/total" annotation.
plt.figure(figsize=(16, 20))
sns.heatmap(
    filtered_rate,
    annot=filtered_annotated,
    fmt="",  # annotations are pre-formatted strings
    cmap="Blues",
    cbar_kws={'label': 'Retention Rate (%)'},
    linewidths=0.5,
    linecolor='gray'
)
plt.title('Filtered Retention: Users with ≥5 Events')
plt.xlabel('Weeks Since Account Creation')
plt.ylabel('Cohort Week')

# Labels come from the frame being plotted (filtered_rate), not from the
# unfiltered retention_rate, and not from a hard-coded column count.
week_labels = [f"Week {int(week)}" for week in filtered_rate.columns]
cohort_labels = [
    label_map.get(cohort, cohort.strftime('%Y-%m-%d'))
    for cohort in filtered_rate.index
]

# Heatmap cell i spans [i, i + 1] on each axis, so ticks go at i + 0.5 to
# sit in the middle of a cell rather than on its edge.
plt.xticks(ticks=np.arange(len(week_labels)) + 0.5, labels=week_labels)
# rotation=0 keeps the long cohort labels horizontal.
plt.yticks(ticks=np.arange(len(cohort_labels)) + 0.5, labels=cohort_labels, rotation=0)

plt.tight_layout()
plt.show()