Weekly Distinct Users by Topic (Top 20)
This analysis visualizes the number of distinct users per topic for each week, using event timestamps from Fedora Messaging logs. Only the 20 topics with the highest total of weekly distinct-user counts are shown.
In [1]:
# setup
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
import glob
In [2]:
# @replace DATA_SOURCES
DATA_SOURCES = {'badges': '/data/badges', 'datagrepper-raw': '/data/datagrepper-raw', 'datagrepper-parse-accounts': '/data/datagrepper-parse-accounts'}

# Load every parquet file from the "datagrepper-parse-accounts" source into a
# single DataFrame, normalise `sent_at` to second-resolution timestamps, and
# keep only rows that have both a timestamp and a username.
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]
parquet_files = sorted(glob.glob(f"{parquet_dir}/*.parquet"))

# Fail early and name the directory: with no files the loop below never runs
# and the final pd.concat would raise an opaque "No objects to concatenate".
if not parquet_files:
    raise FileNotFoundError(f"No parquet files found in {parquet_dir!r}")

# Number of parquet files read and concatenated per batch.
FILES_PER_BATCH = 50

chunks = []
for batch_start in range(0, len(parquet_files), FILES_PER_BATCH):
    batch = parquet_files[batch_start:batch_start + FILES_PER_BATCH]
    chunk_df = pd.concat(
        [pd.read_parquet(f) for f in batch],
        ignore_index=True
    )
    # errors='coerce' turns unparseable values into NaT instead of raising;
    # valid timestamps are truncated to whole seconds.
    chunk_df['sent_at'] = pd.to_datetime(chunk_df['sent_at'], errors='coerce').dt.floor('s')
    chunks.append(chunk_df)

combined_df = pd.concat(chunks, ignore_index=True)
# Discard rows with a missing/unparseable timestamp or a missing username.
combined_df.dropna(subset=['sent_at', 'username'], inplace=True)

# Without this guard an all-invalid dataset would surface as an error from
# calling .date() on NaT below rather than a readable message.
if combined_df.empty:
    raise ValueError(
        f"No rows with a valid 'sent_at' and 'username' in {parquet_dir!r}"
    )

max_date = combined_df['sent_at'].max().date()
min_date = combined_df['sent_at'].min().date()
print(f"Maximum date in data: {max_date}")
print(f"Minimum date in data: {min_date}")
Maximum date in data: 2025-06-28 Minimum date in data: 2024-01-01
In [3]:
# Drop rows with invalid timestamps and report how many were removed.
# The loading cell already drops rows without `sent_at`, so this is normally
# a no-op safety net; printing the count makes that visible instead of
# computing the before/after sizes and discarding them.
initial_count = len(combined_df)
combined_df.dropna(subset=['sent_at'], inplace=True)
cleaned_count = len(combined_df)
print(
    f"Dropped {initial_count - cleaned_count} rows with invalid timestamps "
    f"({cleaned_count} rows remain)"
)

# Assign week buckets.
# `week_start` is the first instant of the weekly period containing `sent_at`.
# NOTE(review): pandas' default 'W' alias is Sunday-anchored, so buckets
# presumably run Monday-Sunday - confirm this is the intended week definition.
combined_df['week_start'] = combined_df['sent_at'].dt.to_period('W').dt.start_time
# Human-readable label derived from `week_start`, e.g. "Week of 2024-01-01".
combined_df['week_label'] = combined_df['week_start'].dt.strftime('Week of %Y-%m-%d')
In [4]:
# Distinct usernames for every (week, topic) combination. `week_label` is
# grouped alongside `week_start` so it is carried through as a column.
users_per_week_topic = combined_df.groupby(
    ['week_start', 'week_label', 'topic']
)['username'].nunique()
aggregated_df = users_per_week_topic.reset_index(name='distinct_user_count')

# Reshape to a week x topic grid for the heatmap; combinations with no
# activity become 0, and rows are ordered chronologically.
week_by_topic = aggregated_df.pivot(
    index='week_start',
    columns='topic',
    values='distinct_user_count',
)
heatmap_df = week_by_topic.fillna(0).sort_index()
# Swap the timestamp index for readable row labels after sorting.
heatmap_df.index = heatmap_df.index.strftime('Week of %Y-%m-%d')

# The 20 topics with the largest sum of weekly distinct-user counts,
# in descending order of that sum.
topic_totals = aggregated_df.groupby('topic')['distinct_user_count'].sum()
top_topics = topic_totals.nlargest(20).index
heatmap_top = heatmap_df[top_topics]
In [5]:
# Render the week x topic grid as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(20, 24))

heatmap_style = dict(
    annot=True,          # write the count inside every cell
    fmt=".0f",           # counts are floats after fillna; show no decimals
    cmap='rocket_r',
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={'label': 'Number of Distinct Users'},
)
sns.heatmap(heatmap_top, ax=ax, **heatmap_style)

ax.set_title('Weekly Distinct Users for Top 20 Topics', fontsize=18)
ax.set_xlabel('Topic', fontsize=14)
ax.set_ylabel('Week', fontsize=14)

# Slant the topic names so they do not overlap; keep week labels horizontal.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)

fig.tight_layout()
plt.show()
In [ ]: