Event Attendee Analysis (v2)

A review of event attendees, their return rates, and how events impact contribution activity.
Author

Robert Wright (rwright@)

Published

November 16, 2025

Show the code
import os
import glob
from pathlib import Path
from datetime import datetime, timedelta
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# Global plot styling: the matplotlib-bundled "seaborn-v0_8" style plus a
# seaborn whitegrid theme, applied to every figure in this notebook.
plt.style.use("seaborn-v0_8")
sns.set_theme(context="notebook", style="whitegrid")
Show the code
# @replace DATA_SOURCES
DATA_SOURCES = {"badges": "/home/jovyan/work/badge/output", "datagrepper-parse-accounts": "/home/jovyan/work/bus2parquet/output_users"}
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]
badge_data = DATA_SOURCES["badges"] + "/badge.csv"

# Keep roughly the last 52 weeks of data, anchored to the first of the month.
cutoff_date = (pd.Timestamp.now().replace(day=1) - pd.DateOffset(weeks=52)).date()

# Collect parquet files whose filename encodes a date on/after the cutoff.
# Expected filename shape: fedora-YYYYMMDD[_processed].parquet
files = []
for p in Path(parquet_dir).glob("fedora-*.parquet"):
    stem = p.stem.replace("_processed", "")
    try:
        d = datetime.strptime(stem.split("-")[1], "%Y%m%d").date()
    except (IndexError, ValueError):
        # Skip files that don't follow the naming convention instead of
        # aborting the whole load on one stray file.
        continue
    if d >= cutoff_date:
        files.append(str(p))

# Fail with a clear message rather than letting ds.dataset([]) raise obscurely.
if not files:
    raise FileNotFoundError(f"No parquet files on/after {cutoff_date} in {parquet_dir}")

dataset = ds.dataset(files, format="parquet")

# Stream the dataset in batches so memory stays bounded.
chunks = []
for batch in dataset.to_batches(batch_size=50_000):
    df = batch.to_pandas()
    # Some daily files may be missing the expected columns; skip those batches.
    if "sent_at" not in df.columns or "username" not in df.columns:
        continue
    # Coerce unparseable timestamps to NaT and drop sub-second precision.
    df["sent_at"] = pd.to_datetime(df["sent_at"], errors="coerce").dt.floor("s")
    chunks.append(df)

combined_df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

if not combined_df.empty:
    print("Maximum date in data:", combined_df["sent_at"].max().date())
    print("Minimum date in data:", combined_df["sent_at"].min().date())
else:
    print("No data found in cutoff range")

activity = combined_df

# Get account creation times (first fas.user.create message per username).
# Robustness fix: on an empty frame, activity["topic"] would raise KeyError;
# fall back to empty, correctly-shaped frames instead.
if not activity.empty and "topic" in activity.columns:
    fas_df = activity[activity["topic"] == "org.fedoraproject.prod.fas.user.create"]
    account_ages = fas_df.groupby("username")["sent_at"].min().reset_index()
    account_ages.columns = ["username", "account_created"]
else:
    fas_df = pd.DataFrame(columns=["username", "sent_at"])
    account_ages = pd.DataFrame(columns=["username", "account_created"])


# Read the badge award export; timestamps are parsed up front so the
# min/max comparisons below operate on datetimes rather than strings.
badge_df = pd.read_csv(badge_data, parse_dates=["timestamp"])
latest_award = badge_df["timestamp"].max()
print("Latest Badges data timestamp:", latest_award)

# The CSV calls the account column "fas"; the rest of the notebook uses "username".
badge_df = badge_df.rename(columns={"fas": "username"})

print("Latest Event Badges data timestamp:", latest_award)
Maximum date in data: 2025-11-15
Minimum date in data: 2024-11-02
Latest Badges data timestamp: 2025-11-16 00:30:42.328752
Latest Event Badges data timestamp: 2025-11-16 00:30:42.328752
Show the code
import re

def parse_badge_id(badge_id: str):
    """
    Split a badge id such as 'flock-2024-attendee' into its parts.

    Returns a (base, year, role) tuple, with the year as an int, or
    (None, None, None) when the id does not follow the
    <base>-<year>-<role> convention.
    """
    parts = re.match(r"([a-z0-9.\-]+)-(\d{4})-([a-z]+)", badge_id)
    if parts is None:
        return None, None, None
    base = parts.group(1)
    year = int(parts.group(2))
    role = parts.group(3)
    return base, year, role

# Break each badge_id into base event name, year, and attendee role.
parsed = badge_df["badge_id"].map(parse_badge_id)
badge_df[["event_base", "event_year", "event_role"]] = pd.DataFrame(
    parsed.tolist(), index=badge_df.index
)

# Canonical display names for the event series we care about.
event_display_map = {
    "flock": "Flock",
    "fosdem": "FOSDEM",
    "devconf.cz": "DevConf CZ",
    "devconf.us": "DevConf US",
    "devconf.in": "DevConf India",
    "centos-connect": "CentOS Connect",
    "fedora-mentor-summit": "Mentor Summit",
    "redhat-summit": "Red Hat Summit",
}

# Display label: canonical name when mapped, Title Case fallback otherwise,
# followed by the event year (e.g. "Flock 2024").
pretty = badge_df["event_base"].map(event_display_map)
pretty = pretty.fillna(badge_df["event_base"].str.title())
badge_df["event_display"] = pretty + " " + badge_df["event_year"].astype(str)

# For every event series, map each year's display label to the following
# year's, e.g. "Flock 2023" -> "Flock 2024".
event_successors = {}

for base, series in badge_df.groupby("event_base"):
    label = event_display_map.get(base, base.title())
    years = sorted(series["event_year"].dropna().unique())
    event_successors.update(
        {f"{label} {prev}": f"{label} {nxt}" for prev, nxt in zip(years, years[1:])}
    )
Show the code
# Per-event profile: for every badge recipient, capture average monthly
# message counts in a window from four months before to four months after
# the event (keyed by event_display).
records = []

# Months tracked per user: M-4..M-1, the event month M0, and M+1..M+4.
month_cols = [f"M{-i}" for i in range(4, 0, -1)] + ["M0"] + [f"M+{i}" for i in range(1, 5)]

for event, group in badge_df.groupby("event_display"):
    usernames = group["username"].unique()
    # Earliest badge award is used as a proxy for the event date.
    event_time = pd.to_datetime(group["timestamp"].min())

    subset = activity[activity["username"].isin(usernames)].copy()
    # Drop rows whose timestamp failed to parse (NaT) before the integer
    # cast; otherwise .astype(int) raises on the resulting NaN offsets.
    subset = subset[subset["sent_at"].notna()]
    subset["month_offset"] = ((subset["sent_at"] - event_time) / pd.Timedelta(days=30)).round().astype(int)
    subset["bucket"] = subset["month_offset"].apply(
        lambda x: f"M{x:+d}" if -4 <= x <= 4 else None
    )
    # Normalise the event-month label ("M+0" -> "M0").
    subset.loc[subset["bucket"] == "M+0", "bucket"] = "M0"
    msg_counts = subset[subset["bucket"].notnull()].groupby(["username", "bucket"]).size().unstack(fill_value=0)

    users = pd.DataFrame({"username": usernames})
    badge_times = group.groupby("username")["timestamp"].min()
    users["badge_awarded_at"] = users["username"].map(badge_times)
    users["account_created"] = users["username"].map(account_ages.set_index("username")["account_created"])
    # NaT account_created (no creation event seen) gives NaN days, which
    # compares False below — unknown accounts count as "Existing".
    users["days_before_event"] = (event_time - users["account_created"]).dt.days
    users["newcomer_30d"] = users["days_before_event"] <= 30

    for row in users.itertuples():
        profile = {
            "event_display": event,
            "event_base": group["event_base"].iloc[0],   # useful for grouping later
            "event_year": group["event_year"].iloc[0],   # keep numeric year
            "event_date": event_time.date(),
            "username": row.username,
            "badge_awarded_at": row.badge_awarded_at,
            "newcomer_30d": row.newcomer_30d,
        }
        # Bug fix: the M0 bucket was computed above but never written to the
        # profile, silently dropping event-month activity and later making
        # melt(value_vars=...) raise KeyError: 'M0'.
        for m in month_cols:
            profile[m] = msg_counts.loc[row.username, m] if row.username in msg_counts.index and m in msg_counts.columns else 0
        records.append(profile)

df = pd.DataFrame(records)
assert "username" in df.columns

# Flag return using dynamic successors
if event_successors:
    for src, succ in event_successors.items():
        future_users = set(badge_df[badge_df["event_display"] == succ]["username"])
        df.loc[df["event_display"] == src, "returned_next_year"] = df["username"].isin(future_users)

Newcomer Composition by Event

Number of newcomers (joined ≤30 days before event) per event.

Show the code
# Keep only events that have an entry in event_display_map, then rebuild the
# display label from the canonical mapping (base + " " + year).
badge_df = badge_df[badge_df["event_base"].isin(event_display_map.keys())].copy()
badge_df["event_display"] = (
    badge_df["event_base"].map(event_display_map) + " " + badge_df["event_year"].astype(str)
)

# Human-readable label for the newcomer flag, used by the plots below.
df["contributor_type"] = np.where(df["newcomer_30d"], "Newcomer", "Existing")
Show the code
# Filter to the last 3 years (inclusive of the current year).
current_year = datetime.now().year
recent_df = df[(df["event_year"] >= current_year - 3) & (df["event_year"] <= current_year)].copy()

# Count newcomers and existing contributors by event/year.
counts = (
    recent_df.groupby(["event_base", "event_year", "contributor_type"])["username"]
    .count()
    .reset_index()
)

# Pivot to wide
pivot = counts.pivot_table(
    index=["event_base", "event_year"],
    columns="contributor_type",
    values="username",
    fill_value=0
).reset_index()

# Robustness fix: pivot_table only creates columns for contributor types
# that actually occur in the data; guarantee both exist so the "Total"
# computation below never raises KeyError.
for col in ("Newcomer", "Existing"):
    if col not in pivot.columns:
        pivot[col] = 0

# Add totals
pivot["Total"] = pivot["Newcomer"] + pivot["Existing"]

# Main heatmap values (rounded); event/year combinations with no data are NaN.
heatmap_data = pivot.pivot(index="event_base", columns="event_year", values="Total").round(0)

# Annotation matrix: blank out missing combinations instead of showing "nan".
anno = heatmap_data.copy().astype(str)
anno[heatmap_data.isna()] = ""

for idx, row in pivot.iterrows():
    new_val = int(round(row["Newcomer"]))
    exist_val = int(round(row["Existing"]))
    anno.loc[row["event_base"], row["event_year"]] = (
        f"< 30 Days: {new_val}\nExisting Community: {exist_val}"
    )

plt.figure(figsize=(14, max(6, len(heatmap_data) * 0.6)))

sns.heatmap(
    heatmap_data,
    annot=anno,
    fmt="",
    cmap="BuGn",
    linewidths=0.4,
    linecolor="gray",
    cbar_kws={"label": "Total Contributors"}
)

plt.title("Community Participation by Event and Year", fontsize=16)
plt.xlabel("Year")
plt.ylabel("Event")
plt.tight_layout()
plt.show()

Activity Change Pre vs Post Event

Plot average activity before (M-4 to M-1) vs after (M+1 to M+4) per event and by contributor type.

Show the code
# Month-offset columns for the x-axis. Bug fix: the profile frame may not
# contain an "M0" column (historically it never did), which made melt()
# raise KeyError: 'M0'. Keep only offset columns actually present in df so
# the plot works either way.
activity_cols = (
    [f"M{-i}" for i in range(4, 0, -1)] +
    ["M0"] +
    [f"M+{i}" for i in range(1, 5)]
)
activity_cols = [c for c in activity_cols if c in df.columns]

# Filter to mapped events in the current year.
filtered_df = df[df["event_base"].isin(event_display_map.keys())].copy()
current_year = datetime.now().year
filtered_df = filtered_df[filtered_df["event_year"] == current_year]

# Melt to long form
melted = filtered_df.melt(
    id_vars=["event_display", "event_base", "event_year", "contributor_type"],
    value_vars=activity_cols,
    var_name="month",
    value_name="msg_count"
)

# Aggregate averages
activity_summary = (
    melted.groupby(["event_display", "contributor_type", "month"])["msg_count"]
    .mean()
    .reset_index()
)

# Order months
activity_summary["month"] = pd.Categorical(
    activity_summary["month"],
    categories=activity_cols,
    ordered=True
)

# Plot
g = sns.FacetGrid(
    activity_summary,
    row="event_display",
    hue="contributor_type",
    height=2.5,
    aspect=3,
    sharey=False,
    palette={"Newcomer": "lightblue", "Existing": "steelblue"}
)

g.map(sns.lineplot, "month", "msg_count", linewidth=2)

# Event marker: at the "M0" category when present, otherwise midway between
# M-1 and M+1 (categorical axis positions 3 and 4).
event_x = activity_cols.index("M0") if "M0" in activity_cols else 3.5

def add_event_marker(*args, **kwargs):
    plt.axvline(x=event_x, color="red", linestyle="--", linewidth=1)

g.map(add_event_marker)

g.add_legend(title="Contributor Type")
g.set_axis_labels("Months Relative to Event", "Avg. Messages")
g.set_titles("{row_name}")

g.fig.subplots_adjust(top=.88)
g.fig.suptitle(
    "Event Participation Drives Activity: Contributors Increase Engagement Leading Into the Event",
    fontsize=15
)

plt.show()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[25], line 14
     11 filtered_df = filtered_df[filtered_df["event_year"] == current_year]
     13 # Melt to long form
---> 14 melted = filtered_df.melt(
     15     id_vars=["event_display", "event_base", "event_year", "contributor_type"],
     16     value_vars=activity_cols,
     17     var_name="month",
     18     value_name="msg_count"
     19 )
     21 # Aggregate averages
     22 activity_summary = (
     23     melted.groupby(["event_display", "contributor_type", "month"])["msg_count"]
     24     .mean()
     25     .reset_index()
     26 )

File /opt/conda/lib/python3.11/site-packages/pandas/core/frame.py:9969, in DataFrame.melt(self, id_vars, value_vars, var_name, value_name, col_level, ignore_index)
   9959 @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
   9960 def melt(
   9961     self,
   (...)
   9967     ignore_index: bool = True,
   9968 ) -> DataFrame:
-> 9969     return melt(
   9970         self,
   9971         id_vars=id_vars,
   9972         value_vars=value_vars,
   9973         var_name=var_name,
   9974         value_name=value_name,
   9975         col_level=col_level,
   9976         ignore_index=ignore_index,
   9977     ).__finalize__(self, method="melt")

File /opt/conda/lib/python3.11/site-packages/pandas/core/reshape/melt.py:74, in melt(frame, id_vars, value_vars, var_name, value_name, col_level, ignore_index)
     70 if missing.any():
     71     missing_labels = [
     72         lab for lab, not_found in zip(labels, missing) if not_found
     73     ]
---> 74     raise KeyError(
     75         "The following id_vars or value_vars are not present in "
     76         f"the DataFrame: {missing_labels}"
     77     )
     78 if value_vars_was_not_none:
     79     frame = frame.iloc[:, algos.unique(idx)]

KeyError: "The following id_vars or value_vars are not present in the DataFrame: ['M0']"

Return Rate to Next-Year Event

Show % of attendees who returned to the next year’s event (from returned_next_year flag).

Show the code
# Compute return rates by event_display with base + year included.
filtered_df = df[df["event_base"].isin(event_display_map.keys())].copy()
current_year = datetime.now().year
# Only past events (within the last 4 years) can have a known successor.
filtered_df = filtered_df[filtered_df["event_year"] >= current_year - 4]
filtered_df = filtered_df[filtered_df["event_year"] < current_year]

# Robustness fix: returned_next_year was populated via .loc only for events
# that have a successor, leaving other rows NaN in an object-dtype column.
# Coerce to float (True->1.0, False->0.0, NaN preserved) so groupby().mean()
# skips NaN rows instead of raising on the object dtype.
filtered_df["returned_next_year"] = filtered_df["returned_next_year"].astype(float)

return_rate = (
    filtered_df.groupby(["event_display", "event_base", "event_year"])["returned_next_year"]
    .mean()
    .reset_index()
)

# Convert to percentage
return_rate["return_rate"] = return_rate["returned_next_year"] * 100

# Sort by base, then year
return_rate = return_rate.sort_values(["event_base", "event_year"])

# Plot
plt.figure(figsize=(12, 6))
sns.barplot(
    data=return_rate,
    x="event_display",
    y="return_rate",
    color="steelblue"
)
plt.xticks(rotation=45, ha="right")
plt.ylabel("Return Rate (%)")
plt.title("Event-to-Event Return Engagement (Sorted by Base → Year)")
plt.tight_layout()
plt.show()

Heatmap of Monthly Activity Offset

Average monthly message count from M-4 to M+4 per event.

Show the code
# Restrict to mapped events from the current year.
current_year = datetime.now().year
mask = df["event_base"].isin(event_display_map.keys()) & (df["event_year"] == current_year)
filtered_df = df[mask].copy()

# Offset columns M-4..M-1 and M+1..M+4 (the event month itself is excluded).
activity_columns = [f"M{-i}" for i in range(4, 0, -1)] + [f"M+{i}" for i in range(1, 5)]

# Average activity per month offset for each event, ordered by series then year.
monthly_profile = (
    filtered_df.groupby(["event_display", "event_base", "event_year"])[activity_columns]
    .mean()
    .reset_index()
    .sort_values(["event_base", "event_year"])
    .set_index("event_display")
)

# Only the numeric offset columns feed the heatmap.
heatmap_data = monthly_profile[activity_columns]

fig_height = max(6, len(heatmap_data) * 0.5)
plt.figure(figsize=(14, fig_height))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".1f",
    cmap="rocket_r",
    linewidths=0.5,
    linecolor="gray"
)
plt.title("Avg. Monthly Activity Offset by Event (Sorted by Base → Year)")
plt.xlabel("Month Offset from Event")
plt.ylabel("Event")
plt.tight_layout()
plt.show()