Event Contributor Analysis
Each user is linked to an event through their attendee badge; the event date is taken as the earliest badge timestamp recorded for that event. Activity is counted in 30-day buckets from four months before to four months after the event (M-4 to M+4, excluding the event month itself). Users are grouped as newcomers if they created their Fedora Account no more than 30 days before the event.
For events that recur annually, we also track whether a user who attended in one year returns to the same event the following year.
import os
import pandas as pd
import pyarrow.dataset as ds
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
# config
# @replace DATA_SOURCES
DATA_SOURCES = {'badges': '/data/badges', 'datagrepper-raw': '/data/datagrepper-raw', 'datagrepper-parse-accounts': '/data/datagrepper-parse-accounts'}

# Input locations derived from the data-source roots above.
badge_data = f"{DATA_SOURCES['badges']}/badge.csv"
user_parquets = DATA_SOURCES["datagrepper-parse-accounts"]

# One row per recurring event series:
# (badge-id prefix, display name, years that have an attendee badge).
_event_series = (
    ("flock", "Flock", (2023, 2024, 2025)),
    ("fosdem", "FOSDEM", (2023, 2024, 2025)),
    ("devconf.cz", "DevConf CZ", (2023, 2024, 2025)),
    ("devconf.us", "DevConf US", (2023, 2024, 2025)),
    ("devconf.in", "DevConf India", (2023, 2024, 2025)),
    ("centos-connect", "CentOS Connect", (2023, 2024, 2025)),
    ("fedora-mentor-summit", "Mentor Summit", (2024, 2025)),
    ("redhat-summit", "Red Hat Summit", (2023, 2024, 2025)),
)

# Badge id -> event label, e.g. "flock-2023-attendee" -> "Flock 2023".
event_badges = {
    f"{prefix}-{year}-attendee": f"{title} {year}"
    for prefix, title, years in _event_series
    for year in years
}

# Event label -> label of the following edition of the same series,
# e.g. "Flock 2023" -> "Flock 2024". The last edition of a series has no entry.
event_successors = {
    f"{title} {this_year}": f"{title} {next_year}"
    for _, title, years in _event_series
    for this_year, next_year in zip(years, years[1:])
}
# Load the badge awards and keep only rows that belong to a tracked event.
badge_df = pd.read_csv(badge_data, parse_dates=["timestamp"])
print("Latest Badges data timestamp:", badge_df["timestamp"].max())

# The CSV's "fas" column is exposed as "username" so it lines up with the
# activity data loaded below.
badge_df = badge_df.rename(columns={"fas": "username"})
# Badge ids that are not keys of event_badges map to NaN and are dropped.
badge_df["event"] = badge_df["badge_id"].map(event_badges)
badge_df = badge_df.dropna(subset=["event", "timestamp"])
print("Latest Event Badges data timestamp:", badge_df["timestamp"].max())

# Load the per-user message activity (only the three columns used here).
activity_ds = ds.dataset(user_parquets, format="parquet")
activity = activity_ds.to_table(columns=["username", "topic", "sent_at"]).to_pandas()
# Unparseable timestamps become NaT and are removed together with rows
# that have no username.
activity["sent_at"] = pd.to_datetime(activity["sent_at"], errors="coerce")
activity = activity.dropna(subset=["username", "sent_at"])

# Account creation time per user: the earliest "fas.user.create" message.
is_account_creation = activity["topic"] == "org.fedoraproject.prod.fas.user.create"
fas_df = activity[is_account_creation]
account_ages = (
    fas_df.groupby("username")["sent_at"]
    .min()
    .rename("account_created")
    .reset_index()
)
Latest Badges data timestamp: 2025-06-29 00:00:25.166107 Latest Event Badges data timestamp: 2025-06-24 18:57:01.789468
# Per-event profile
#
# Builds `df` with one row per (event, badge holder):
#   event, event_date   - event label and the date of its earliest badge award
#   username            - badge holder
#   badge_awarded_at    - the user's earliest badge timestamp for that event
#   newcomer_30d        - True when the account was created no more than
#                         30 days before the event date; False otherwise,
#                         including when no account-creation message is known
#   M-4 .. M+4          - messages sent in each 30-day bucket around the event
#                         (bucket 0, the event month itself, is not profiled)
#   returned_next_year  - True/False for events whose successor has badge
#                         data, NaN ("unknown") otherwise
month_columns = [f"M{offset:+d}" for offset in range(-4, 5) if offset != 0]

# Loop-invariant lookup, built once instead of once per event.
account_created_by_user = account_ages.set_index("username")["account_created"]

records = []
for event, group in badge_df.groupby("event"):
    usernames = group["username"].unique()
    # The earliest badge award stands in for the event date.
    event_time = pd.to_datetime(group["timestamp"].min())

    subset = activity[activity["username"].isin(usernames)].copy()
    # Distance from the event in 30-day "months", rounded to the nearest bucket.
    subset["month_offset"] = (
        (subset["sent_at"] - event_time) / pd.Timedelta(days=30)
    ).round().astype(int)
    subset["bucket"] = subset["month_offset"].apply(
        lambda x: f"M{x:+d}" if -4 <= x <= 4 and x != 0 else None
    )
    msg_counts = (
        subset[subset["bucket"].notnull()]
        .groupby(["username", "bucket"])
        .size()
        .unstack(fill_value=0)
    )
    # {username: {bucket: count}} gives constant-time lookups in the row loop.
    counts_by_user = msg_counts.to_dict(orient="index")

    # A user may have several badge rows for the same event. Series.map needs
    # a uniquely indexed lookup, so collapse to the earliest award per user.
    badge_time_by_user = group.groupby("username")["timestamp"].min()

    users = pd.DataFrame({"username": usernames})
    users["badge_awarded_at"] = users["username"].map(badge_time_by_user)
    users["account_created"] = users["username"].map(account_created_by_user)
    users["days_before_event"] = (event_time - users["account_created"]).dt.days
    # Missing creation dates compare as False, so those users are "existing".
    users["newcomer_30d"] = users["days_before_event"] <= 30

    for row in users.itertuples():
        profile = {
            "event": event,
            "event_date": event_time.date(),
            "username": row.username,
            "badge_awarded_at": row.badge_awarded_at,
            "newcomer_30d": row.newcomer_30d,
        }
        user_counts = counts_by_user.get(row.username, {})
        for month in month_columns:
            profile[month] = user_counts.get(month, 0)
        records.append(profile)

df = pd.DataFrame(records)
if "username" not in df.columns:
    raise ValueError("No event badge rows were found; cannot build per-event profiles.")

# Flag return
# Start from "unknown" so the column always exists and events without a
# usable successor stay NaN.
df["returned_next_year"] = pd.Series(float("nan"), index=df.index, dtype=object)
for src, succ in event_successors.items():
    future_users = set(badge_df.loc[badge_df["event"] == succ, "username"])
    if not future_users:
        # The successor has no badge rows in the data (e.g. it has not taken
        # place yet); marking everyone as "not returned" would report a
        # misleading 0% return rate.
        continue
    is_src = df["event"] == src
    df.loc[is_src, "returned_next_year"] = df.loc[is_src, "username"].isin(future_users)
Newcomer Composition by Event
Number of newcomers (account created no more than 30 days before the event) and existing contributors per event.
# Add helper column: human-readable label for the newcomer flag.
df["contributor_type"] = df["newcomer_30d"].map({True: "Newcomer", False: "Existing"})

# Extract the four-digit event year from the event label, used for sorting.
df["event_year"] = df["event"].str.extract(r"(\d{4})", expand=False).astype(int)

# Count newcomers and existing contributors by event.
counts = df.groupby(["event", "event_year", "contributor_type"])["username"].count().reset_index()
pivot = counts.pivot(index=["event", "event_year"], columns="contributor_type", values="username").fillna(0)
# Sort by year, then by event name so the bar order is deterministic.
pivot = pivot.sort_values(["event_year", "event"])

# Plot
# Colours are keyed by contributor type rather than by column position, so
# they match the palette of the activity-profile chart (Newcomer = lightblue,
# Existing = steelblue) whatever the column order is.
type_colors = {"Newcomer": "lightblue", "Existing": "steelblue"}
# Drop the year level for plotting only; otherwise tick labels are rendered
# as "(event, year)" tuples even though the year is already in the name.
pivot.droplevel("event_year").plot(kind="bar", stacked=True, figsize=(14, 6), color=type_colors)
plt.ylabel("Number of Contributors")
plt.title("Event Attendance: Newcomers vs Existing Contributors")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
Activity Change Pre vs Post Event
Plot average monthly activity before (M-4 to M-1) and after (M+1 to M+4) each event, split by contributor type.
# Month buckets in chronological order: M-4 .. M-1, then M+1 .. M+4.
activity_cols = [f"M-{i}" for i in (4, 3, 2, 1)] + [f"M+{i}" for i in (1, 2, 3, 4)]

# Reshape to long format: one row per (user profile, month bucket).
melted = pd.melt(
    df,
    id_vars=["event", "event_year", "contributor_type"],
    value_vars=activity_cols,
    var_name="month",
    value_name="msg_count",
)

# Mean message count per event, contributor type and month bucket.
activity_summary = melted.groupby(
    ["event", "contributor_type", "month"], as_index=False
)["msg_count"].mean()

# Make the month an ordered categorical so it follows activity_cols
# instead of plain string order.
month_dtype = pd.CategoricalDtype(categories=activity_cols, ordered=True)
activity_summary["month"] = activity_summary["month"].astype(month_dtype)

# One facet row per event, one line per contributor type.
type_palette = {"Newcomer": "lightblue", "Existing": "steelblue"}
g = sns.FacetGrid(
    activity_summary,
    row="event",
    hue="contributor_type",
    palette=type_palette,
    height=2.5,
    aspect=3,
    sharey=False,
)
g.map(sns.lineplot, "month", "msg_count")
g.add_legend(title="Contributor Type")
g.set_axis_labels("Month Offset from Event", "Avg. Messages")
g.set_titles("{row_name}")

# Leave headroom above the facets for the figure-level title.
g.fig.subplots_adjust(top=0.95)
g.fig.suptitle("Activity Profiles: Newcomers vs Existing Contributors", fontsize=16)
plt.show()
Return Rate to Next-Year Event
Show the percentage of attendees who returned to the following year’s event (from the returned_next_year flag).
# Share of each event's attendees who also hold the next edition's badge.
# The flag column mixes booleans with NaN ("unknown"), so cast it to float
# first: True/False become 1.0/0.0 and the mean is taken on a numeric
# column; NaN entries are ignored by the mean.
return_rate = (
    df["returned_next_year"]
    .astype(float)
    .groupby(df["event"])
    .mean()
    .sort_values(ascending=False)
    * 100
)

# Events with no known successor have an undefined (NaN) rate. Leave them out
# of the chart so an empty slot is not mistaken for a 0% return rate.
plotted_rate = return_rate.dropna()

if plotted_rate.empty:
    print("No event has successor attendance data; nothing to plot.")
else:
    plt.figure(figsize=(12, 6))
    sns.barplot(x=plotted_rate.index, y=plotted_rate.values)
    plt.xticks(rotation=45, ha="right")
    plt.ylabel("Return Rate (%)")
    plt.title("Event-to-Event Return Engagement")
    plt.tight_layout()
    plt.show()
Heatmap of Monthly Activity Offset
Average monthly message count per attendee from M-4 to M+4 for each event (the event month itself is excluded).
# Month buckets M-4 .. M-1 and M+1 .. M+4 (offset 0 is skipped).
activity_columns = [f"M{offset:+d}" for offset in range(-4, 5) if offset]

# Mean messages per attendee for every event (rows) and bucket (columns).
monthly_profile = df[["event", *activity_columns]].groupby("event").mean()

heatmap_style = {
    "annot": True,
    "fmt": ".1f",
    "cmap": "rocket_r",
    "linewidths": 0.5,
    "linecolor": "gray",
}
plt.figure(figsize=(14, 8))
sns.heatmap(monthly_profile, **heatmap_style)
plt.title("Avg. Monthly Activity Offset by Event")
plt.xlabel("Month Offset from Event")
plt.ylabel("Event")
plt.tight_layout()
plt.show()