Weekly Distinct Users by Topic (Top 20)

---
title: Weekly Distinct Users by Topic (Top 20)
description: This analysis visualizes distinct user activity by topic across weeks.
date: 2025-10-12
refresh: weekly
---

This analysis visualizes distinct user activity by topic across weeks, using event timestamps from Fedora Messaging logs.
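
For context, here is a minimal sketch of the three columns this notebook relies on. The actual bus2parquet output may carry more fields; the usernames and topics below are hypothetical examples.

Show the code
import pandas as pd

# Assumed shape of each daily parquet file: one row per message,
# with a timestamp, the acting account, and the message topic.
sample = pd.DataFrame({
    "sent_at": pd.to_datetime(["2025-09-01T08:15:00", "2025-09-01T09:30:00"]),
    "username": ["alice", "bob"],  # hypothetical account names
    "topic": [
        "org.fedoraproject.prod.bodhi.update.comment",
        "org.fedoraproject.prod.copr.build.end",
    ],
})
print(sample.dtypes)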

Show the code
from datetime import datetime
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pyarrow.dataset as ds

plt.style.use("seaborn-v0_8")
sns.set_theme(context="notebook", style="whitegrid")
Show the code
# @replace DATA_SOURCES
DATA_SOURCES = {"datagrepper-parse-accounts": "/home/jovyan/work/bus2parquet/output_users"}
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]

# Keep roughly the last 52 weeks of daily files, anchored to the first
# of the current month so the window is stable within a month
cutoff_date = (pd.Timestamp.now().replace(day=1) - pd.DateOffset(weeks=52)).date()

# File names look like fedora-YYYYMMDD[_processed].parquet; parse the
# date out of the stem and keep only files inside the cutoff window
files = []
for p in Path(parquet_dir).glob("fedora-*.parquet"):
    stem = p.stem.replace("_processed", "")
    d = datetime.strptime(stem.split("-")[1], "%Y%m%d").date()
    if d >= cutoff_date:
        files.append(str(p))

# Treat all matching daily files as one logical dataset
dataset = ds.dataset(files, format="parquet")

# Stream the dataset in manageable batches to bound memory use
chunks = []
for batch in dataset.to_batches(batch_size=50_000):
    df = batch.to_pandas()
    # Skip batches missing the columns this analysis needs
    if "sent_at" not in df.columns or "username" not in df.columns:
        continue
    # Coerce unparseable timestamps to NaT and drop sub-second precision
    df["sent_at"] = pd.to_datetime(df["sent_at"], errors="coerce").dt.floor("s")
    chunks.append(df)

combined_df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

if not combined_df.empty:
    print("Maximum date in data:", combined_df["sent_at"].max().date())
    print("Minimum date in data:", combined_df["sent_at"].min().date())
else:
    print("No data found in cutoff range")
Maximum date in data: 2025-09-10
Minimum date in data: 2024-09-02
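As an optional refinement, pyarrow's to_batches() accepts a column projection, so only the columns used downstream need to be materialized. A sketch, assuming the files share a compatible schema:

Show the code
# Project only the needed columns at scan time rather than converting
# full batches to pandas and filtering afterwards
wanted = ["sent_at", "username", "topic"]
columns = [c for c in wanted if c in dataset.schema.names]

chunks = []
for batch in dataset.to_batches(columns=columns, batch_size=50_000):
    # (timestamp parsing would proceed as in the cell above)
    chunks.append(batch.to_pandas())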
Show the code
# Drop rows whose timestamps failed to parse, and report how many
initial_count = len(combined_df)
combined_df.dropna(subset=['sent_at'], inplace=True)
cleaned_count = len(combined_df)
print(f"Dropped {initial_count - cleaned_count:,} rows with invalid timestamps")

# Assign week buckets ('W' periods run Monday through Sunday)
combined_df['week_start'] = combined_df['sent_at'].dt.to_period('W').dt.start_time
combined_df['week_label'] = combined_df['week_start'].dt.strftime('Week of %Y-%m-%d')
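
A quick illustration of the bucketing: pandas 'W' periods end on Sunday, so start_time always lands on a Monday. Toy dates, chosen only for illustration:

Show the code
import pandas as pd

ts = pd.to_datetime(pd.Series(["2025-09-07", "2025-09-08", "2025-09-14"]))
# Sunday 2025-09-07 -> week of Monday 2025-09-01;
# Monday 2025-09-08 and Sunday 2025-09-14 -> week of 2025-09-08
print(ts.dt.to_period("W").dt.start_time)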
Show the code
# Count distinct users per week per topic
aggregated_df = combined_df.groupby(
    ['week_start', 'week_label', 'topic']
)['username'].nunique().reset_index(name='distinct_user_count')

# Pivot to weeks-by-topics for the heatmap; week/topic pairs with no
# activity become 0
heatmap_df = aggregated_df.pivot(
    index='week_start', columns='topic', values='distinct_user_count'
).fillna(0).sort_index()
heatmap_df.index = heatmap_df.index.strftime('Week of %Y-%m-%d')

# Keep the 20 topics with the highest summed weekly distinct-user
# counts, ordered from most to least active
top_topics = aggregated_df.groupby('topic')['distinct_user_count'].sum().nlargest(20).index
heatmap_top = heatmap_df[top_topics]
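
To make the distinct-user semantics concrete: repeated events from the same account within a week/topic pair collapse to a single user. A toy check:

Show the code
import pandas as pd

toy = pd.DataFrame({
    "week_start": pd.to_datetime(["2025-09-08"] * 3),
    "topic": ["t1"] * 3,
    "username": ["alice", "alice", "bob"],  # alice appears twice
})
# Prints 2 (distinct users), not 3 (events)
print(toy.groupby(["week_start", "topic"])["username"].nunique())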
Show the code
# Plot heatmap
plt.figure(figsize=(20, 24))
sns.heatmap(
    heatmap_top,
    annot=True,
    fmt=".0f",
    cmap='rocket_r',
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={'label': 'Number of Distinct Users'}
)

plt.title('Weekly Distinct Users for Top 20 Topics', fontsize=18)
plt.xlabel('Topic', fontsize=14)
plt.ylabel('Week', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
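
Since the front matter suggests a weekly refresh, it may be worth persisting the figure alongside the rendered page. A sketch; the output path is an assumption, and the call belongs just before plt.show() above, since some backends clear the figure on show:

Show the code
# Hypothetical output path for the rendered artifact
plt.savefig("weekly_distinct_users_top20.png", dpi=150, bbox_inches="tight")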