FAS Account Inflow / Outflow by Month¶
In [1]:
# setup
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
from collections import defaultdict
In [2]:
# @replace DATA_SOURCES
DATA_SOURCES = {'badges': '/data/badges', 'datagrepper-raw': '/data/datagrepper-raw', 'datagrepper-parse-accounts': '/data/datagrepper-parse-accounts'}
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pyarrow as pa
dataset = ds.dataset(parquet_dir, format="parquet")
table = dataset.to_table(columns=["username", "sent_at", "topic"])
combined_df = table.to_pandas()
combined_df.dropna(subset=['sent_at', 'username'], inplace=True)
max_date = combined_df['sent_at'].max().date()
min_date = combined_df['sent_at'].min().date()
print(f"Maximum date in data: {max_date}")
print(f"Minimum date in data: {min_date}")
Maximum date in data: 2025-06-28 Minimum date in data: 2024-01-01
In [3]:
combined_df = combined_df[~combined_df['topic'].str.startswith("org.centos", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("io.pagure.prod", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.fedoraproject.prod.mailman.receive", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.fedoraproject.prod.bugzilla", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.release-monitoring", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.fedoraproject.prod.copr", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.fedoraproject.prod.discourse", na=False)]
combined_df['month'] = combined_df['sent_at'].dt.to_period('M')
While we try to deturmine the best count of contributors, the following topics are removed:
- org.centos*
- io.pagure.prod* (Commits on distgit are not counted here)
- org.fedoraproject.prod.mailman.receive* (these messages are not tied to FAS at this time)
- org.fedoraproject.prod.bugzilla* (these messages are not tied to FAS at this time)
- org.release-monitoring* (these messages are not user activity)
- org.fedoraproject.prod.copr* (Due to a processing issue, COPR messages need to be fixed in processing before included in counts as they may be double counted)
- org.fedoraproject.prod.discourse (Matthew has some ideas on how to extract Ask Fedora)
We also require a user to emit at least 10 messages in a month to be counted as Retained or Inflow. If they do not, they are counted as outflow for the next month. Month 0 is dropped from the visualization as Inflow wouldn't make sense (Jan '24 is skipped and Feb '24 is first month instead).
In [4]:
event_counts = (
combined_df.groupby(['username', 'month'])
.size()
.reset_index(name='event_count')
)
filtered_events = event_counts[event_counts['event_count'] > 10]
user_months = defaultdict(set)
for row in filtered_events.itertuples():
user_months[row.username].add(row.month)
valid_months = sorted(filtered_events['month'].unique())
month_to_users = {
month: set(filtered_events[filtered_events['month'] == month]['username'])
for month in valid_months
}
user_activity_rows = []
for username, active_months in user_months.items():
for month in sorted(active_months):
next_month_idx = valid_months.index(month) + 1
if next_month_idx < len(valid_months):
next_month = valid_months[next_month_idx]
active_next_month = username in month_to_users[next_month]
else:
active_next_month = False
user_activity_rows.append({
'user': username,
'month': str(month),
'active_this_month': True,
'active_next_month': active_next_month,
'status': 'returning' if active_next_month else 'churned'
})
user_activity_df = pd.DataFrame(user_activity_rows)
In [5]:
data = pd.DataFrame(user_activity_rows)
all_months = sorted(data['month'].unique())
month_to_users = {
month: set(data[data['month'] == month]['user']) for month in all_months
}
delta_rows = []
for i in range(1, len(all_months)-1):
prev_month = all_months[i - 1]
curr_month = all_months[i]
prev_users = month_to_users[prev_month]
curr_users = month_to_users[curr_month]
retained_users = curr_users & prev_users
outflow_users = prev_users - curr_users
inflow_users = curr_users - prev_users
delta_rows.append({
'month': str(curr_month),
'retained': len(retained_users),
'outflow': len(outflow_users),
'inflow': len(inflow_users),
'retained_users': sorted(retained_users),
'outflow_users': sorted(outflow_users),
'inflow_users': sorted(inflow_users),
})
delta_df = pd.DataFrame(delta_rows).set_index('month')
In [6]:
ax = delta_df[['inflow', 'retained', 'outflow']].plot(
kind='bar',
stacked=True,
color=['lightblue', 'blue', 'red'],
figsize=(12, 6)
)
for idx, row in enumerate(delta_df.itertuples()):
bottom = 0
for col, color in zip(['inflow', 'retained', 'outflow'], ['lightblue', 'blue', 'red']):
value = getattr(row, col)
if value > 0:
ax.text(
idx,
bottom + value / 2,
str(value),
ha='center',
va='center',
fontsize=9,
color='white' if color != 'yellow' else 'black',
fontweight='bold'
)
bottom += value
plt.title("Monthly Contributor Inflow, Retention, and Outflow")
plt.xlabel("Month")
plt.ylabel("Number of Users")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()