FAS Account Cohort Analysis
This notebook tracks community re-engagement over time by analyzing Fedora Account System (FAS) account creation and return activity in weekly cohorts, and visualizes the resulting retention rates.
We define a cohort as the set of FAS accounts created in a given week, and we follow each cohort's activity in the weeks that follow.
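As a quick illustration of that cohort assignment, here is a minimal sketch on made-up data (the usernames and timestamps are invented); it uses the same `to_period('W').start_time` conversion applied later in the notebook, which keys each account to the Monday of its creation week.
import pandas as pd

# Illustrative data only: two invented accounts created in different weeks.
demo = pd.DataFrame({
    'username': ['alice', 'bob'],
    'sent_at': pd.to_datetime(['2024-01-03 10:15:00', '2024-01-09 22:40:00']),
})
# A weekly period's start_time is the Monday of that week; it becomes the cohort key.
demo['cohort_week'] = demo['sent_at'].dt.to_period('W').dt.start_time
print(demo[['username', 'cohort_week']])
# alice -> 2024-01-01, bob -> 2024-01-08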
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
# @replace DATA_SOURCES
DATA_SOURCES = {'badges': '/data/badges', 'datagrepper-raw': '/data/datagrepper-raw', 'datagrepper-parse-accounts': '/data/datagrepper-parse-accounts'}
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]
parquet_files = sorted(glob.glob(f"{parquet_dir}/*.parquet"))
chunks = []
# Read the parquet files in batches of 50 to keep peak memory down.
for batch_start in range(0, len(parquet_files), 50):
    batch = parquet_files[batch_start:batch_start + 50]
    chunk_df = pd.concat(
        [pd.read_parquet(f) for f in batch],
        ignore_index=True
    )
    # Normalize timestamps to second resolution; unparseable values become NaT.
    chunk_df['sent_at'] = pd.to_datetime(chunk_df['sent_at'], errors='coerce').dt.floor('s')
    chunks.append(chunk_df)
combined_df = pd.concat(chunks, ignore_index=True)
combined_df.dropna(subset=['sent_at', 'username'], inplace=True)
max_date = combined_df['sent_at'].max().date()
min_date = combined_df['sent_at'].min().date()
print(f"Maximum date in data: {max_date}")
print(f"Minimum date in data: {min_date}")
Maximum date in data: 2025-06-28
Minimum date in data: 2024-01-01
new_users_df = combined_df[
combined_df['topic'] == 'org.fedoraproject.prod.fas.user.create'
].copy()
new_users_df['cohort_week'] = new_users_df['sent_at'].dt.to_period('W').dt.start_time
new_users_df['cohort_label'] = new_users_df['cohort_week'].dt.strftime('Week of %Y-%m-%d')
cohort_sizes = new_users_df.groupby('cohort_week')['username'].nunique()
user_cohorts = new_users_df[['username', 'cohort_week', 'cohort_label']].drop_duplicates()
activity_df = combined_df.merge(user_cohorts, on='username', how='inner')
activity_df['week_since_cohort'] = ((activity_df['sent_at'] - activity_df['cohort_week']).dt.days // 7).astype(int)
activity_df = activity_df[activity_df['week_since_cohort'] >= 0]
weekly_activity = (
    activity_df
    .groupby(['cohort_week', 'week_since_cohort'])['username']
    .nunique()
    .reset_index()
)
retention_counts = weekly_activity.pivot(
    index='cohort_week',
    columns='week_since_cohort',
    values='username'
).fillna(0)
retention_counts = retention_counts.reindex(cohort_sizes.index.sort_values())
retention_rate = retention_counts.div(cohort_sizes, axis=0) * 100
annotated = retention_counts.astype(object).copy()
for cohort in annotated.index:
    cohort_start = cohort.date()
    for week in annotated.columns:
        week_date = cohort_start + timedelta(weeks=int(week))
        if week_date > max_date:
            # This week lies beyond the end of the data, so it has not happened yet.
            annotated.loc[cohort, week] = "N/A"
        else:
            returned = retention_counts.loc[cohort, week]
            total = cohort_sizes[cohort]
            annotated.loc[cohort, week] = f"{int(returned)}/{int(total)}"
retention_rate = retention_rate.iloc[:, :12]
annotated = annotated.iloc[:, :12]
label_map = new_users_df.drop_duplicates('cohort_week').set_index('cohort_week')['cohort_label']
The heatmap below tracks the FAS accounts created in each week. Each row is one weekly cohort, each cell is annotated as "number of users who returned" / "total accounts created in that cohort week", and the columns count weeks since account creation. If an account created in a given cohort week shows activity again during what is, for that user, Week 2, it is counted in that cohort's Week 2 cell.
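For a concrete sense of how each cell is computed, here is a minimal, self-contained sketch on toy data (invented usernames, a single cohort of two accounts): the numerator is the number of distinct users active at a given week offset, and the denominator is the cohort size.
import pandas as pd

# Toy activity: one cohort of 2 accounts created the week of 2024-01-01.
toy = pd.DataFrame({
    'cohort_week': pd.to_datetime(['2024-01-01'] * 4),
    'week_since_cohort': [0, 0, 1, 2],
    'username': ['alice', 'bob', 'alice', 'alice'],
})
counts = (
    toy.groupby(['cohort_week', 'week_since_cohort'])['username']
    .nunique()
    .unstack(fill_value=0)
)
cohort_size = pd.Series({pd.Timestamp('2024-01-01'): 2})
rate = counts.div(cohort_size, axis=0) * 100
print(rate)  # 100% in Week 0 (2/2), 50% in Weeks 1 and 2 (1/2)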
plt.figure(figsize=(16, 20))
sns.heatmap(
    retention_rate,
    annot=annotated,
    fmt="",
    cmap="Blues",
    cbar_kws={'label': 'Retention Rate (%)'},
    linewidths=0.5,
    linecolor='gray'
)
plt.title('FAS Weekly Cohort Retention')
plt.xlabel('Weeks Since Account Creation')
plt.ylabel('Cohort Week')
# Offset the ticks by 0.5 so the labels sit on the cell centers of the heatmap.
plt.xticks(ticks=np.arange(12) + 0.5, labels=[f"Week {i}" for i in range(12)])
plt.yticks(
    ticks=np.arange(len(retention_rate.index)) + 0.5,
    labels=[label_map.get(cohort, cohort.strftime('%Y-%m-%d')) for cohort in retention_rate.index]
)
plt.tight_layout()
plt.show()
user_event_counts = activity_df.groupby('username').size()
valid_users = user_event_counts[user_event_counts >= 5].index
filtered_activity = activity_df[activity_df['username'].isin(valid_users)].copy()
filtered_weekly = (
    filtered_activity
    .groupby(['cohort_week', 'week_since_cohort'])['username']
    .nunique()
    .reset_index()
)
filtered_counts = filtered_weekly.pivot(
    index='cohort_week',
    columns='week_since_cohort',
    values='username'
).fillna(0).reindex(cohort_sizes.index.sort_values())
filtered_rate = filtered_counts.div(cohort_sizes, axis=0) * 100
filtered_annotated = filtered_counts.astype(object).copy()
for cohort in filtered_annotated.index:
    cohort_start = cohort.date()
    for week in filtered_annotated.columns:
        current_week_date = cohort_start + timedelta(weeks=int(week))
        if current_week_date > max_date:
            filtered_annotated.loc[cohort, week] = "N/A"
        else:
            returned = filtered_counts.loc[cohort, week]
            total = cohort_sizes.get(cohort, 0)
            filtered_annotated.loc[cohort, week] = f"{int(returned)}/{int(total)}" if total else "0/0"
filtered_rate = filtered_rate.iloc[:, :12]
filtered_annotated = filtered_annotated.iloc[:, :12]
label_map = new_users_df.drop_duplicates('cohort_week').set_index('cohort_week')['cohort_label']
This view applies a stricter definition of a returned contributor: only users who produced at least five messages on the message bus in total are counted in the weekly retention cells below.
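As a minimal sketch of that filter on invented data: users whose total event count falls below the threshold drop out of the retention counts entirely.
import pandas as pd

# Illustrative only: alice has 6 events in total, bob has 2.
toy_events = pd.DataFrame({'username': ['alice'] * 6 + ['bob'] * 2})
event_counts = toy_events.groupby('username').size()
active_users = event_counts[event_counts >= 5].index
print(list(active_users))  # ['alice'] -- bob is excluded with only 2 events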
plt.figure(figsize=(16, 20))
sns.heatmap(
    filtered_rate,
    annot=filtered_annotated,
    fmt="",
    cmap="Blues",
    cbar_kws={'label': 'Retention Rate (%)'},
    linewidths=0.5,
    linecolor='gray'
)
plt.title('Filtered Retention: Users with ≥5 Events')
plt.xlabel('Weeks Since Account Creation')
plt.ylabel('Cohort Week')
# Offset the ticks by 0.5 for cell-centered labels, and label rows from the filtered matrix's own index.
plt.xticks(ticks=np.arange(12) + 0.5, labels=[f"Week {i}" for i in range(12)])
plt.yticks(
    ticks=np.arange(len(filtered_rate.index)) + 0.5,
    labels=[label_map.get(cohort, cohort.strftime('%Y-%m-%d')) for cohort in filtered_rate.index]
)
plt.tight_layout()
plt.show()