A review of event attendees, their rates of return to later events, and how event attendance affects contribution activity.
Published
October 13, 2025
Show the code
# Standard library
import os
import glob
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path

# Third party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# Global plotting defaults for the whole report.
plt.style.use("seaborn-v0_8")
sns.set_theme(context="notebook", style="whitegrid")
Show the code
# Load one year of message-bus activity (parquet) plus the badge CSV.
# Produces: activity (per-message DataFrame), account_ages (username ->
# first fas.user.create timestamp), badge_df (badge awards).
# @replace DATA_SOURCES
DATA_SOURCES = {
    "badges": "/home/jovyan/work/badge/output",
    "datagrepper-parse-accounts": "/home/jovyan/work/bus2parquet/output_users",
}
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]
badge_data = DATA_SOURCES["badges"] + "/badge.csv"

# Start of the month, 52 weeks ago.
cutoff_date = (pd.Timestamp.now().replace(day=1) - pd.DateOffset(weeks=52)).date()

# Select parquet files whose embedded date (fedora-YYYYMMDD[_processed])
# falls on/after the cutoff.  Sorted so dataset row order is deterministic.
files = []
for p in Path(parquet_dir).glob("fedora-*.parquet"):
    stem = p.stem.replace("_processed", "")
    d = datetime.strptime(stem.split("-")[1], "%Y%m%d").date()
    if d >= cutoff_date:
        files.append(str(p))
files.sort()

dataset = ds.dataset(files, format="parquet")
chunks = []
for batch in dataset.to_batches(batch_size=50_000):
    df = batch.to_pandas()
    # Skip batches missing the columns the rest of the report needs.
    if "sent_at" not in df.columns or "username" not in df.columns:
        continue
    # Coerce bad timestamps to NaT; floor to seconds for compact grouping.
    df["sent_at"] = pd.to_datetime(df["sent_at"], errors="coerce").dt.floor("s")
    chunks.append(df)

combined_df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
if not combined_df.empty:
    print("Maximum date in data:", combined_df["sent_at"].max().date())
    print("Minimum date in data:", combined_df["sent_at"].min().date())
else:
    print("No data found in cutoff range")

activity = combined_df

# Get account creation times (first fas.user.create message per user).
# Guard: an empty combined_df has no "topic" column at all.
if "topic" in activity.columns:
    fas_df = activity[activity["topic"] == "org.fedoraproject.prod.fas.user.create"]
else:
    fas_df = pd.DataFrame(columns=["username", "sent_at"])
account_ages = fas_df.groupby("username")["sent_at"].min().reset_index()
account_ages.columns = ["username", "account_created"]

# Load badge CSV; "fas" column is the FAS username.
badge_df = pd.read_csv(badge_data, parse_dates=["timestamp"])
print("Latest Badges data timestamp:", badge_df["timestamp"].max())
badge_df.rename(columns={"fas": "username"}, inplace=True)
print("Latest Event Badges data timestamp:", badge_df["timestamp"].max())
Maximum date in data: 2025-10-02
Minimum date in data: 2024-10-02
Latest Badges data timestamp: 2025-10-13 01:10:11.515263
Latest Event Badges data timestamp: 2025-10-13 01:10:11.515263
Show the code
import re


def parse_badge_id(badge_id):
    """
    Parse a badge_id like 'flock-2024-attendee' into components.

    Returns:
        (base, year, role) — e.g. ('flock', 2024, 'attendee'); the
        triple (None, None, None) when the id is not a string or does
        not match the <base>-<4-digit-year>-<role> shape.
    """
    # Guard: badge ids coming from CSV can be NaN/non-string.
    if not isinstance(badge_id, str):
        return None, None, None
    m = re.match(r"([a-z0-9.\-]+)-(\d{4})-([a-z]+)", badge_id)
    if m:
        base, year, role = m.groups()
        return base, int(year), role
    return None, None, None


badge_df[["event_base", "event_year", "event_role"]] = badge_df["badge_id"].apply(
    lambda b: pd.Series(parse_badge_id(b))
)

# Map raw event bases to friendly display names.
event_display_map = {
    "flock": "Flock",
    "fosdem": "FOSDEM",
    "devconf.cz": "DevConf CZ",
    "devconf.us": "DevConf US",
    "devconf.in": "DevConf India",
    "centos-connect": "CentOS Connect",
    "fedora-mentor-summit": "Mentor Summit",
    "redhat-summit": "Red Hat Summit",
}

# Display label is "<Name> <Year>"; unmapped bases fall back to Title Case.
badge_df["event_display"] = (
    badge_df["event_base"].map(event_display_map).fillna(badge_df["event_base"].str.title())
    + " "
    + badge_df["event_year"].astype(str)
)
print("Example parsed events:")
print(badge_df[["badge_id", "event_base", "event_year", "event_role", "event_display"]].head())

# Build "this year's event -> next year's event" successor mapping.
# FIX: keys must use the same "<Name> <Year>" format (with a space) as
# event_display above, otherwise the returned_next_year lookup that
# compares against event_display can never match.
event_successors = {}
for base, g in badge_df.groupby("event_base"):
    years = sorted(g["event_year"].dropna().unique())
    for y1, y2 in zip(years[:-1], years[1:]):
        src = f"{event_display_map.get(base, base.title())} {y1}"
        dst = f"{event_display_map.get(base, base.title())} {y2}"
        event_successors[src] = dst

print("Sample successors:")
for k, v in list(event_successors.items())[:10]:
    print(f"{k} > {v}")
# Per-event profile (fixed to use event_display).
# For every (event, attendee) pair, record monthly activity counts in the
# four 30-day buckets before/after the event (M-4..M-1, M+1..M+4), whether
# the attendee is a newcomer (account <=30 days old at event time), and —
# where a successor event exists — whether they returned the next year.

# Month bucket labels, hoisted: "M-4".."M-1", "M+1".."M+4".
MONTH_COLS = [f"M{-i}" for i in range(4, 0, -1)] + [f"M+{i}" for i in range(1, 5)]
# Hoisted username -> account_created lookup (was rebuilt per event).
account_created_by_user = account_ages.set_index("username")["account_created"]

records = []
for event, group in badge_df.groupby("event_display"):
    usernames = group["username"].unique()
    # Event time approximated by earliest badge award for this event.
    event_time = pd.to_datetime(group["timestamp"].min())

    subset = activity[activity["username"].isin(usernames)].copy()
    # Drop NaT timestamps (errors="coerce" upstream) — astype(int) on NaN raises.
    subset = subset[subset["sent_at"].notna()]
    # Signed 30-day offset of each message relative to the event.
    subset["month_offset"] = ((subset["sent_at"] - event_time) / pd.Timedelta(days=30)).round().astype(int)
    # Keep only offsets in [-4, 4], excluding the event month itself (0).
    subset["bucket"] = subset["month_offset"].apply(
        lambda x: f"M{x:+d}" if -4 <= x <= 4 and x != 0 else None
    )
    msg_counts = (
        subset[subset["bucket"].notnull()]
        .groupby(["username", "bucket"])
        .size()
        .unstack(fill_value=0)
    )

    users = pd.DataFrame({"username": usernames})
    badge_times = group.groupby("username")["timestamp"].min()
    users["badge_awarded_at"] = users["username"].map(badge_times)
    users["account_created"] = users["username"].map(account_created_by_user)
    users["days_before_event"] = (event_time - users["account_created"]).dt.days
    # Newcomer: account created within 30 days of the event (NaN -> False).
    users["newcomer_30d"] = users["days_before_event"] <= 30

    for row in users.itertuples():
        profile = {
            "event_display": event,  # <-- updated
            "event_base": group["event_base"].iloc[0],  # useful for grouping later
            "event_year": group["event_year"].iloc[0],  # keep numeric year
            "event_date": event_time.date(),
            "username": row.username,
            "badge_awarded_at": row.badge_awarded_at,
            "newcomer_30d": row.newcomer_30d,
        }
        for m in MONTH_COLS:
            profile[m] = (
                msg_counts.loc[row.username, m]
                if row.username in msg_counts.index and m in msg_counts.columns
                else 0
            )
        records.append(profile)

df = pd.DataFrame(records)
assert "username" in df.columns

# Flag return using dynamic successors: an attendee of event `src`
# "returned" if they also hold a badge for its successor event.
if event_successors:
    for src, succ in event_successors.items():
        future_users = set(badge_df[badge_df["event_display"] == succ]["username"])
        df.loc[df["event_display"] == src, "returned_next_year"] = df["username"].isin(future_users)
Newcomer Composition by Event
Number of newcomers (joined ≤30 days before event) per event.
Show the code
# Restrict badge_df to the events we explicitly map, and rebuild their
# display labels from event_display_map (no Title Case fallback here).
badge_df = badge_df[badge_df["event_base"].isin(event_display_map.keys())].copy()
badge_df["event_display"] = (
    badge_df["event_base"].map(event_display_map)
    + " "
    + badge_df["event_year"].astype(str)
)

# Helper column: human-readable newcomer flag for plots.
df["contributor_type"] = df["newcomer_30d"].map({True: "Newcomer", False: "Existing"})
Show the code
from datetime import datetime

# Heatmap of attendance per event/year, annotated "newcomers/existing".
# --- Filter to the last 3 years (inclusive of the current year) ---
current_year = datetime.now().year
recent_df = df[df["event_year"] >= current_year - 3].copy()
recent_df = recent_df[recent_df["event_year"] <= current_year]

# Count newcomers and existing contributors by event/year.
counts = (
    recent_df.groupby(["event_base", "event_year", "contributor_type"])["username"]
    .count()
    .reset_index()
)

# Pivot to get newcomer + existing side by side.
pivot = counts.pivot_table(
    index=["event_base", "event_year"],
    columns="contributor_type",
    values="username",
    fill_value=0,
).reset_index()

# FIX: if one contributor type is entirely absent from the window, its
# column never exists and the Total sum below raises KeyError.
for col in ("Newcomer", "Existing"):
    if col not in pivot.columns:
        pivot[col] = 0
pivot["Total"] = pivot["Newcomer"] + pivot["Existing"]

# Heatmap matrix of totals, with "New/Existing" annotation strings.
heatmap_data = pivot.pivot(index="event_base", columns="event_year", values="Total")
anno_matrix = heatmap_data.copy().astype(str)
for idx, row in pivot.iterrows():
    anno_matrix.loc[row["event_base"], row["event_year"]] = f"{row['Newcomer']}/{row['Existing']}"

# --- Plot ---
plt.figure(figsize=(14, max(6, len(heatmap_data) * 0.6)))
sns.heatmap(
    heatmap_data,
    annot=anno_matrix,
    fmt="",
    cmap="Blues",
    linewidths=0.5,
    linecolor="gray",
    cbar_kws={"label": "Total Attendees"},
)
plt.title("Event Attendance (Newcomers/Existing) - Last 3 Years", fontsize=16)
plt.xlabel("Year")
plt.ylabel("Event")
plt.tight_layout()
plt.show()
Activity Change Pre vs Post Event
Plot average activity before (M-1 to M-4) vs after (M+1 to M+4) per event and by contributor type.
Show the code
activity_cols = [f"M{-i}"for i inrange(4, 0, -1)] + [f"M+{i}"for i inrange(1, 5)]filtered_df = df[df["event_base"].isin(event_display_map.keys())].copy()current_year = datetime.now().yearfiltered_df = filtered_df[filtered_df["event_year"] == current_year]# Melt into long format for time series plottingmelted = filtered_df.melt( id_vars=["event_display", "event_base", "event_year", "contributor_type"], value_vars=activity_cols, var_name="month", value_name="msg_count")# Aggregate average msg_countactivity_summary = ( melted.groupby(["event_display", "contributor_type", "month"])["msg_count"] .mean() .reset_index())# Sort monthsactivity_summary["month"] = pd.Categorical( activity_summary["month"], categories=activity_cols, ordered=True)# Plot — one row per event (using display names)g = sns.FacetGrid( activity_summary, row="event_display", hue="contributor_type", height=2.5, aspect=3, sharey=False, palette={"Newcomer": "lightblue", "Existing": "steelblue"})g.map(sns.lineplot, "month", "msg_count")g.add_legend(title="Contributor Type")g.set_axis_labels("Month Offset from Event", "Avg. Messages")g.set_titles("{row_name}")g.fig.subplots_adjust(top=.8)g.fig.suptitle("Activity Profiles: Newcomers vs Existing Contributors", fontsize=16)plt.show()
Return Rate to Next-Year Event
Show % of attendees who returned to the next year’s event (from returned_next_year flag).
Show the code
# Bar chart of return rates by event_display, base + year included.
# Events from the current year are excluded (their successor hasn't
# happened yet); window is the four preceding years.
current_year = datetime.now().year
filtered_df = df[df["event_base"].isin(event_display_map.keys())].copy()
year_window = (filtered_df["event_year"] >= current_year - 4) & (
    filtered_df["event_year"] < current_year
)
filtered_df = filtered_df[year_window]

# Fraction of attendees flagged returned_next_year, per event.
return_rate = (
    filtered_df.groupby(["event_display", "event_base", "event_year"])["returned_next_year"]
    .mean()
    .reset_index()
)
# Express as a percentage and order by base, then year.
return_rate["return_rate"] = return_rate["returned_next_year"] * 100
return_rate = return_rate.sort_values(["event_base", "event_year"])

# Plot
fig = plt.figure(figsize=(12, 6))
sns.barplot(data=return_rate, x="event_display", y="return_rate", color="steelblue")
plt.xticks(rotation=45, ha="right")
plt.ylabel("Return Rate (%)")
plt.title("Event-to-Event Return Engagement (Sorted by Base → Year)")
plt.tight_layout()
plt.show()
Heatmap of Monthly Activity Offset
Average monthly message count from M-4 to M+4 per event.
Show the code
# Heatmap of mean monthly activity (M-4..M+4) per current-year event.
current_year = datetime.now().year
mask = df["event_base"].isin(event_display_map.keys()) & (df["event_year"] == current_year)
filtered_df = df[mask].copy()

activity_columns = [f"M-{i}" for i in range(4, 0, -1)] + [f"M+{i}" for i in range(1, 5)]

# Mean activity per event, ordered by base then year, indexed by display name.
monthly_profile = (
    filtered_df.groupby(["event_display", "event_base", "event_year"])[activity_columns]
    .mean()
    .reset_index()
    .sort_values(["event_base", "event_year"])
    .set_index("event_display")
)

# Keep only the month-offset columns for the heatmap body.
heatmap_data = monthly_profile[activity_columns]

fig_height = max(6, len(heatmap_data) * 0.5)
plt.figure(figsize=(14, fig_height))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".1f",
    cmap="rocket_r",
    linewidths=0.5,
    linecolor="gray",
)
plt.title("Avg. Monthly Activity Offset by Event (Sorted by Base → Year)")
plt.xlabel("Month Offset from Event")
plt.ylabel("Event")
plt.tight_layout()
plt.show()