Event Contributor Analysis¶
Each user is linked to an event based on their badge timestamp, and their activity is measured monthly from four months before to four months after the event (M-4 to M+4; the event month itself, M0, is excluded). Users are grouped as newcomers if they created their Fedora Account within 30 days prior to the event.
For events that recur annually, we also track whether a user who attended in one year returns to the same event the following year.
import os
import glob
from pathlib import Path
from datetime import datetime, timedelta
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
# Global plotting defaults: matplotlib's seaborn-v0_8 style plus a seaborn
# whitegrid theme sized for notebook output.
plt.style.use("seaborn-v0_8")
sns.set_theme(context="notebook", style="whitegrid")
# Maps Fedora Badges badge IDs to human-readable event names.
# Only badge IDs listed here are treated as event attendance; badge rows whose
# ID is not in this mapping are dropped later (via .map() + dropna).
event_badges = {
# Flock
"flock-2023-attendee": "Flock 2023",
"flock-2024-attendee": "Flock 2024",
"flock-2025-attendee": "Flock 2025",
# FOSDEM
"fosdem-2023-attendee": "FOSDEM 2023",
"fosdem-2024-attendee": "FOSDEM 2024",
"fosdem-2025-attendee": "FOSDEM 2025",
# DevConf CZ
"devconf.cz-2023-attendee": "DevConf CZ 2023",
"devconf.cz-2024-attendee": "DevConf CZ 2024",
"devconf.cz-2025-attendee": "DevConf CZ 2025",
# DevConf US
"devconf.us-2023-attendee": "DevConf US 2023",
"devconf.us-2024-attendee": "DevConf US 2024",
"devconf.us-2025-attendee": "DevConf US 2025",
# DevConf India
"devconf.in-2023-attendee": "DevConf India 2023",
"devconf.in-2024-attendee": "DevConf India 2024",
"devconf.in-2025-attendee": "DevConf India 2025",
# CentOS Connect
"centos-connect-2023-attendee": "CentOS Connect 2023",
"centos-connect-2024-attendee": "CentOS Connect 2024",
"centos-connect-2025-attendee": "CentOS Connect 2025",
# Fedora Mentor Summit
"fedora-mentor-summit-2024-attendee": "Mentor Summit 2024",
"fedora-mentor-summit-2025-attendee": "Mentor Summit 2025",
# Red Hat Summit
"redhat-summit-2023-attendee": "Red Hat Summit 2023",
"redhat-summit-2024-attendee": "Red Hat Summit 2024",
"redhat-summit-2025-attendee": "Red Hat Summit 2025",
}
# Maps each event to its next-year edition, used to flag attendees who
# returned the following year. Events without an entry (e.g. 2025 editions)
# have no successor and get no return flag.
event_successors = {
# Flock
"Flock 2023": "Flock 2024",
"Flock 2024": "Flock 2025",
# FOSDEM
"FOSDEM 2023": "FOSDEM 2024",
"FOSDEM 2024": "FOSDEM 2025",
# DevConf CZ
"DevConf CZ 2023": "DevConf CZ 2024",
"DevConf CZ 2024": "DevConf CZ 2025",
# DevConf US
"DevConf US 2023": "DevConf US 2024",
"DevConf US 2024": "DevConf US 2025",
# DevConf India
"DevConf India 2023": "DevConf India 2024",
"DevConf India 2024": "DevConf India 2025",
# CentOS Connect
"CentOS Connect 2023": "CentOS Connect 2024",
"CentOS Connect 2024": "CentOS Connect 2025",
# Fedora Mentor Summit
"Mentor Summit 2024": "Mentor Summit 2025",
# Red Hat Summit
"Red Hat Summit 2023": "Red Hat Summit 2024",
"Red Hat Summit 2024": "Red Hat Summit 2025"
}
# @replace DATA_SOURCES
DATA_SOURCES = {'badges': '/data/badges', 'datagrepper-raw': '/data/datagrepper-raw', 'datagrepper-parse-accounts': '/data/datagrepper-parse-accounts'}
# Directory of parquet exports of datagrepper messages with parsed accounts.
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]
# CSV of awarded badges; read later with columns "fas", "badge_id", "timestamp".
badge_data = DATA_SOURCES["badges"] + "/badge.csv"
# Keep only data from the last 52 weeks, anchored at the first of the current month.
cutoff_date = (pd.Timestamp.now().replace(day=1) - pd.DateOffset(weeks=52)).date()
# Collect parquet files whose embedded date (fedora-YYYYMMDD[_processed].parquet)
# falls inside the cutoff window.
files = []
for p in Path(parquet_dir).glob("fedora-*.parquet"):
    stem = p.stem.replace("_processed", "")
    try:
        d = datetime.strptime(stem.split("-")[1], "%Y%m%d").date()
    except (IndexError, ValueError):
        # Skip files that do not follow the expected naming scheme instead of
        # aborting the whole load.
        continue
    if d >= cutoff_date:
        files.append(str(p))
# Stream the dataset in batches to bound memory use. Guard against an empty
# file list: ds.dataset([]) would raise before we can report "no data".
chunks = []
if files:
    dataset = ds.dataset(files, format="parquet")
    for batch in dataset.to_batches(batch_size=50_000):
        df = batch.to_pandas()
        # Some exports may lack the expected columns; ignore those batches.
        if "sent_at" not in df.columns or "username" not in df.columns:
            continue
        # Coerce unparsable timestamps to NaT and drop sub-second precision.
        df["sent_at"] = pd.to_datetime(df["sent_at"], errors="coerce").dt.floor("s")
        chunks.append(df)
combined_df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
if not combined_df.empty:
    print("Maximum date in data:", combined_df["sent_at"].max().date())
    print("Minimum date in data:", combined_df["sent_at"].min().date())
else:
    print("No data found in cutoff range")
# The combined message stream doubles as the per-user activity record.
activity = combined_df
# Get account creation times
# NOTE(review): creation messages only exist inside the loaded 52-week window,
# so accounts created before the cutoff get no account_created value (NaN
# downstream) -- confirm this is acceptable for newcomer detection.
fas_df = activity[activity["topic"] == "org.fedoraproject.prod.fas.user.create"]
account_ages = fas_df.groupby("username")["sent_at"].min().reset_index()
account_ages.columns = ["username", "account_created"]
# Load badge CSV
badge_df = pd.read_csv(badge_data, parse_dates=["timestamp"])
print("Latest Badges data timestamp:", badge_df["timestamp"].max())
# Align the badge CSV's "fas" column with the activity data's "username".
badge_df.rename(columns={"fas": "username"}, inplace=True)
# Map badge IDs to event names; unmapped badges become NaN and are dropped.
badge_df["event"] = badge_df["badge_id"].map(event_badges)
badge_df.dropna(subset=["event", "timestamp"], inplace=True)
print("Latest Event Badges data timestamp:", badge_df["timestamp"].max())
Maximum date in data: 2025-09-12
Minimum date in data: 2024-09-02
Latest Badges data timestamp: 2025-09-11 23:39:38.138414
Latest Event Badges data timestamp: 2025-06-30 12:11:38.063997
# Per-event profile
# Build one record per (event, attendee): message counts in ~30-day buckets
# from M-4 to M+4 around the event anchor date, plus newcomer status.
records = []
for event, group in badge_df.groupby("event"):
    usernames = group["username"].unique()
    # Anchor the event at the earliest badge award for that event.
    event_time = pd.to_datetime(group["timestamp"].min())
    subset = activity[activity["username"].isin(usernames)].copy()
    # sent_at was parsed with errors="coerce", so NaT rows are possible and
    # would make the .astype(int) below raise -- drop them first.
    subset = subset.dropna(subset=["sent_at"])
    subset["month_offset"] = ((subset["sent_at"] - event_time) / pd.Timedelta(days=30)).round().astype(int)
    # Bucket labels M-4..M-1 and M+1..M+4; the event month (offset 0) is excluded.
    subset["bucket"] = subset["month_offset"].apply(
        lambda x: f"M{x:+d}" if -4 <= x <= 4 and x != 0 else None
    )
    msg_counts = subset[subset["bucket"].notnull()].groupby(["username", "bucket"]).size().unstack(fill_value=0)
    users = pd.DataFrame({"username": usernames})
    # Deduplicate badge rows so the username index is unique -- .map() raises
    # on a non-unique index if a user somehow has two rows for the same badge.
    badge_times = group.drop_duplicates("username").set_index("username")["timestamp"]
    users["badge_awarded_at"] = users["username"].map(badge_times)
    users["account_created"] = users["username"].map(account_ages.set_index("username")["account_created"])
    users["days_before_event"] = (event_time - users["account_created"]).dt.days
    # NOTE(review): negative values (account created after the event anchor)
    # also satisfy <= 30 and count as newcomers; NaN (creation outside the
    # data window) compares False and counts as existing -- confirm intended.
    users["newcomer_30d"] = users["days_before_event"] <= 30
    for row in users.itertuples():
        profile = {
            "event": event,
            "event_date": event_time.date(),
            "username": row.username,
            "badge_awarded_at": row.badge_awarded_at,
            "newcomer_30d": row.newcomer_30d
        }
        # Fill each bucket, defaulting to 0 when the user had no activity there.
        for m in [f"M{-i}" for i in range(4, 0, -1)] + [f"M+{i}" for i in range(1, 5)]:
            profile[m] = msg_counts.loc[row.username, m] if row.username in msg_counts.index and m in msg_counts.columns else 0
        records.append(profile)
df = pd.DataFrame(records)
assert "username" in df.columns
# Flag return
# Mark attendees who also hold the successor event's badge; events without a
# successor keep NaN in returned_next_year.
if event_successors:
    for src, succ in event_successors.items():
        future_users = set(badge_df[badge_df["event"] == succ]["username"])
        df.loc[df["event"] == src, "returned_next_year"] = df["username"].isin(future_users)
Newcomer Composition by Event¶
Number of newcomers (joined ≤30 days before event) per event.
# Label each attendee as Newcomer or Existing for grouping and legends.
df["contributor_type"] = df["newcomer_30d"].map({True: "Newcomer", False: "Existing"})
# Pull the four-digit year out of the event name so events sort chronologically.
df["event_year"] = df["event"].str.extract(r"(\d{4})").astype(int)
# Tally attendees per (event, year, type), then pivot to one column per type.
type_counts = (
    df.groupby(["event", "event_year", "contributor_type"])["username"]
    .count()
    .reset_index()
)
pivot = type_counts.pivot(
    index=["event", "event_year"], columns="contributor_type", values="username"
)
pivot = pivot.fillna(0).sort_values("event_year")
# Stacked bars: newcomers vs existing contributors for each event.
pivot.plot(kind="bar", stacked=True, figsize=(14, 6), color=["lightblue", "steelblue"])
plt.ylabel("Number of Contributors")
plt.title("Event Attendance: Newcomers vs Existing Contributors")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
Activity Change Pre vs Post Event¶
Plot average activity before (M-1 to M-4) vs after (M+1 to M+4) per event and by contributor type.
# Ordered month-offset labels: M-4..M-1 then M+1..M+4 (M0 is never produced).
activity_cols = [f"M{-i}" for i in range(4, 0, -1)] + [f"M+{i}" for i in range(1, 5)]
# Melt into long format for time series plotting
melted = df.melt(
    id_vars=["event", "event_year", "contributor_type"],
    value_vars=activity_cols,
    var_name="month",
    value_name="msg_count"
)
# Aggregate average msg_count per (event, contributor type, month offset)
activity_summary = (
    melted.groupby(["event", "contributor_type", "month"])["msg_count"]
    .mean()
    .reset_index()
)
# Sort months: an ordered categorical makes lines plot left-to-right M-4..M+4
activity_summary["month"] = pd.Categorical(
    activity_summary["month"],
    categories=activity_cols,
    ordered=True
)
# Plot: one facet row per event, one line per contributor type.
# sharey=False because absolute volumes vary widely across events.
g = sns.FacetGrid(
    activity_summary,
    row="event",
    hue="contributor_type",
    height=2.5,
    aspect=3,
    sharey=False,
    palette={"Newcomer": "lightblue", "Existing": "steelblue"}
)
g.map(sns.lineplot, "month", "msg_count")
g.add_legend(title="Contributor Type")
g.set_axis_labels("Month Offset from Event", "Avg. Messages")
g.set_titles("{row_name}")
# Leave room at the top for the figure-level title.
g.fig.subplots_adjust(top=0.95)
g.fig.suptitle("Activity Profiles: Newcomers vs Existing Contributors", fontsize=16)
plt.show()
Return Rate to Next-Year Event¶
Show % of attendees who returned to the next year’s event (from returned_next_year flag).
# Percentage of each event's attendees who hold the successor event's badge.
# Events with no successor have only NaN in returned_next_year, so their group
# mean is NaN -- drop those instead of rendering empty/NaN bars.
return_rate = (
    df.groupby("event")["returned_next_year"].mean().dropna().sort_values(ascending=False) * 100
)
plt.figure(figsize=(12, 6))
sns.barplot(x=return_rate.index, y=return_rate.values)
plt.xticks(rotation=45, ha="right")
plt.ylabel("Return Rate (%)")
plt.title("Event-to-Event Return Engagement")
plt.tight_layout()
plt.show()
Heatmap of Monthly Activity Offset¶
Average monthly message count from M-4 to M+4 per event.
# Month-offset columns in chronological order: M-4..M-1, then M+1..M+4.
activity_columns = [f"M{-i}" for i in range(4, 0, -1)] + [f"M+{i}" for i in range(1, 5)]
# One row per event: the mean message count in each month-offset bucket.
monthly_profile = df.groupby("event")[activity_columns].mean()
plt.figure(figsize=(14, 8))
# Annotated heatmap; rocket_r so darker cells mean more activity.
sns.heatmap(
    monthly_profile,
    annot=True,
    fmt=".1f",
    cmap="rocket_r",
    linewidths=0.5,
    linecolor="gray",
)
plt.title("Avg. Monthly Activity Offset by Event")
plt.xlabel("Month Offset from Event")
plt.ylabel("Event")
plt.tight_layout()
plt.show()