FAS Account Inflow / Outflow by Month¶
In [1]:
# setup
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
from collections import defaultdict
from pathlib import Path
from datetime import datetime
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pyarrow as pa
In [2]:
# @replace DATA_SOURCES
DATA_SOURCES = {'badges': '/data/badges', 'datagrepper-raw': '/data/datagrepper-raw', 'datagrepper-parse-accounts': '/data/datagrepper-parse-accounts'}
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]
cutoff_date = (pd.Timestamp.now().replace(day=1) - pd.DateOffset(months=12)).date()
files = []
for p in Path(parquet_dir).glob("fedora-*.parquet"):
stem = p.stem.replace("_processed", "")
d = datetime.strptime(stem.split("-")[1], "%Y%m%d").date()
if d >= cutoff_date:
files.append(str(p))
dataset = ds.dataset(files, format="parquet")
table = dataset.to_table(columns=["username", "sent_at", "topic"])
combined_df = table.to_pandas().dropna(subset=["sent_at", "username"])
max_date = combined_df['sent_at'].max().date()
min_date = combined_df['sent_at'].min().date()
print(f"Maximum date in data: {max_date}")
print(f"Minimum date in data: {min_date}")
Maximum date in data: 2025-09-10 Minimum date in data: 2024-09-01
In [3]:
combined_df = combined_df[~combined_df['topic'].str.startswith("org.centos", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("io.pagure.prod", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.fedoraproject.prod.mailman.receive", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.fedoraproject.prod.bugzilla", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.release-monitoring", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.fedoraproject.prod.copr", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.fedoraproject.prod.discourse", na=False)]
combined_df['month'] = combined_df['sent_at'].dt.to_period('M')
While we try to deturmine the best count of contributors, the following topics are removed:
- org.centos*
- io.pagure.prod* (Commits on distgit are not counted here)
- org.fedoraproject.prod.mailman.receive* (these messages are not tied to FAS at this time)
- org.fedoraproject.prod.bugzilla* (these messages are not tied to FAS at this time)
- org.release-monitoring* (these messages are not user activity)
- org.fedoraproject.prod.copr* (Due to a processing issue, COPR messages need to be fixed in processing before included in counts as they may be double counted)
- org.fedoraproject.prod.discourse (Matthew has some ideas on how to extract Ask Fedora)
We also require a user to emit at least 10 messages in a month to be counted as Retained or Inflow. If they do not, they are counted as outflow for the next month. Month 0 is dropped from the visualization as Inflow wouldn't make sense (Jan '24 is skipped and Feb '24 is first month instead).
In [4]:
event_counts = (
combined_df.groupby(['username', 'month'])
.size()
.reset_index(name='event_count')
)
filtered_events = event_counts[event_counts['event_count'] > 10]
user_months = defaultdict(set)
for row in filtered_events.itertuples():
user_months[row.username].add(row.month)
valid_months = sorted(filtered_events['month'].unique())
month_to_users = {
month: set(filtered_events[filtered_events['month'] == month]['username'])
for month in valid_months
}
user_activity_rows = []
for username, active_months in user_months.items():
for month in sorted(active_months):
next_month_idx = valid_months.index(month) + 1
if next_month_idx < len(valid_months):
next_month = valid_months[next_month_idx]
active_next_month = username in month_to_users[next_month]
else:
active_next_month = False
user_activity_rows.append({
'user': username,
'month': str(month),
'active_this_month': True,
'active_next_month': active_next_month,
'status': 'returning' if active_next_month else 'churned'
})
user_activity_df = pd.DataFrame(user_activity_rows)
In [5]:
data = pd.DataFrame(user_activity_rows)
all_months = sorted(data['month'].unique())
month_to_users = {
month: set(data[data['month'] == month]['user']) for month in all_months
}
delta_rows = []
for i in range(1, len(all_months)-1):
prev_month = all_months[i - 1]
curr_month = all_months[i]
prev_users = month_to_users[prev_month]
curr_users = month_to_users[curr_month]
retained_users = curr_users & prev_users
outflow_users = prev_users - curr_users
inflow_users = curr_users - prev_users
delta_rows.append({
'month': str(curr_month),
'retained': len(retained_users),
'outflow': len(outflow_users),
'inflow': len(inflow_users),
'retained_users': sorted(retained_users),
'outflow_users': sorted(outflow_users),
'inflow_users': sorted(inflow_users),
})
delta_df = pd.DataFrame(delta_rows).set_index('month')
In [6]:
ax = delta_df[['inflow', 'retained', 'outflow']].plot(
kind='bar',
stacked=True,
color=['lightblue', 'blue', 'red'],
figsize=(12, 6)
)
for idx, row in enumerate(delta_df.itertuples()):
bottom = 0
for col, color in zip(['inflow', 'retained', 'outflow'], ['lightblue', 'blue', 'red']):
value = getattr(row, col)
if value > 0:
ax.text(
idx,
bottom + value / 2,
str(value),
ha='center',
va='center',
fontsize=9,
color='white' if color != 'yellow' else 'black',
fontweight='bold'
)
bottom += value
plt.title("Monthly Contributor Inflow, Retention, and Outflow")
plt.xlabel("Month")
plt.ylabel("Number of Users")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [ ]: