Show the code
---
title: Weekly Distinct Users by Topic (Top 20)
description: This analysis visualizes distinct user activity by topic across weeks.
date: 2025-10-12
refresh: weekly
---
This analysis visualizes distinct user activity by topic across weeks, using event timestamps from Fedora Messaging logs.
import os
import glob
from pathlib import Path
from datetime import datetime, timedelta
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
# Configure plotting: seaborn-compatible matplotlib style, then the whitegrid theme on top.
plt.style.use("seaborn-v0_8")
sns.set_theme(style="whitegrid", context="notebook")

# @replace DATA_SOURCES
DATA_SOURCES = {"datagrepper-parse-accounts": "/home/jovyan/work/bus2parquet/output_users"}
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]

# Keep roughly the last year of data: 52 weeks back from the start of the current month.
cutoff_date = (pd.Timestamp.now().replace(day=1) - pd.DateOffset(weeks=52)).date()


def select_recent_files(directory, cutoff):
    """Return paths of ``fedora-YYYYMMDD[...].parquet`` files dated on/after *cutoff*.

    The date is taken from the second dash-separated segment of the stem,
    after stripping a ``_processed`` suffix. Files whose names do not carry a
    parseable YYYYMMDD segment are skipped; the previous inline loop raised
    ValueError/IndexError on any unexpected file name, aborting the whole run.
    """
    selected = []
    for p in Path(directory).glob("fedora-*.parquet"):
        stem = p.stem.replace("_processed", "")
        try:
            file_date = datetime.strptime(stem.split("-")[1], "%Y%m%d").date()
        except (IndexError, ValueError):
            # Unexpected naming — ignore this file rather than crash.
            continue
        if file_date >= cutoff:
            selected.append(str(p))
    return selected


files = select_recent_files(parquet_dir, cutoff_date)
# Stream the parquet files in batches so raw Arrow data is never held all at once.
dataset = ds.dataset(files, format="parquet")

frames = []
for record_batch in dataset.to_batches(batch_size=50_000):
    frame = record_batch.to_pandas()
    # Only keep batches carrying both columns the analysis needs.
    if {"sent_at", "username"}.issubset(frame.columns):
        # Parse timestamps (unparseable values become NaT) and truncate to seconds.
        frame["sent_at"] = pd.to_datetime(frame["sent_at"], errors="coerce").dt.floor("s")
        frames.append(frame)

combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

if combined_df.empty:
    print("No data found in cutoff range")
else:
    print("Maximum date in data:", combined_df["sent_at"].max().date())
    print("Minimum date in data:", combined_df["sent_at"].min().date())

# Output:
# Maximum date in data: 2025-09-10
# Minimum date in data: 2024-09-02
# Drop rows whose timestamp failed to parse (NaT from errors="coerce" above).
# Guard on emptiness: when the loader found no files, combined_df is a bare
# DataFrame() with no 'sent_at' column, and the unguarded dropna / column
# access raised KeyError instead of degrading gracefully.
initial_count = len(combined_df)
if not combined_df.empty:
    combined_df.dropna(subset=['sent_at'], inplace=True)
cleaned_count = len(combined_df)

# Assign week buckets: pandas 'W' periods, labelled by each week's start day.
if not combined_df.empty:
    combined_df['week_start'] = combined_df['sent_at'].dt.to_period('W').dt.start_time
    combined_df['week_label'] = combined_df['week_start'].dt.strftime('Week of %Y-%m-%d')

# Count distinct users per week per topic
# Distinct-user counts per (week, topic); week_label is redundant with
# week_start but kept in the grouping so it survives into the result.
aggregated_df = (
    combined_df
    .groupby(['week_start', 'week_label', 'topic'])['username']
    .nunique()
    .reset_index(name='distinct_user_count')
)

# Pivot into a (week x topic) matrix for the heatmap; absent cells become 0.
heatmap_df = aggregated_df.pivot(
    index='week_start', columns='topic', values='distinct_user_count'
)
heatmap_df = heatmap_df.fillna(0).sort_index()
heatmap_df.index = heatmap_df.index.strftime('Week of %Y-%m-%d')

# Restrict to the 20 topics with the largest total distinct-user counts.
top_topics = (
    aggregated_df
    .groupby('topic')['distinct_user_count']
    .sum()
    .nlargest(20)
    .index
)
heatmap_top = heatmap_df[top_topics]

# Plot heatmap
# Render the week-by-topic heatmap of distinct-user counts on an explicit Axes.
fig, ax = plt.subplots(figsize=(20, 24))
sns.heatmap(
    heatmap_top,
    ax=ax,
    annot=True,
    fmt=".0f",
    cmap='rocket_r',
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={'label': 'Number of Distinct Users'},
)
ax.set_title('Weekly Distinct Users for Top 20 Topics', fontsize=18)
ax.set_xlabel('Topic', fontsize=14)
ax.set_ylabel('Week', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()