Weekly Distinct Users by Topic (Top 20)¶
This analysis visualizes distinct user activity by topic across weeks, using event timestamps from Fedora Messaging logs.
In [1]:
# setup
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
import glob
from pathlib import Path
In [2]:
# @replace DATA_SOURCES
DATA_SOURCES = {'badges': '/data/badges', 'datagrepper-raw': '/data/datagrepper-raw', 'datagrepper-parse-accounts': '/data/datagrepper-parse-accounts'}
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]
parquet_files = sorted(glob.glob(f"{parquet_dir}/*.parquet"))
cutoff_date = (pd.Timestamp.now().replace(day=1) - pd.DateOffset(weeks=52)).date()
files = []
for p in Path(parquet_dir).glob("fedora-*.parquet"):
stem = p.stem.replace("_processed", "")
d = datetime.strptime(stem.split("-")[1], "%Y%m%d").date()
if d >= cutoff_date:
files.append(str(p))
chunks = []
for batch_start in range(0, len(files), 50):
batch = parquet_files[batch_start:batch_start + 50]
chunk_df = pd.concat(
[pd.read_parquet(f) for f in batch],
ignore_index=True
)
chunk_df['sent_at'] = pd.to_datetime(chunk_df['sent_at'], errors='coerce').dt.floor('s')
chunks.append(chunk_df)
combined_df = pd.concat(chunks, ignore_index=True)
combined_df.dropna(subset=['sent_at', 'username'], inplace=True)
max_date = combined_df['sent_at'].max().date()
min_date = combined_df['sent_at'].min().date()
print(f"Maximum date in data: {max_date}")
print(f"Minimum date in data: {min_date}")
Maximum date in data: 2022-02-04 Minimum date in data: 2021-01-01
In [3]:
# Drop rows with invalid timestamps
initial_count = len(combined_df)
combined_df.dropna(subset=['sent_at'], inplace=True)
cleaned_count = len(combined_df)
# Assign week buckets
combined_df['week_start'] = combined_df['sent_at'].dt.to_period('W').dt.start_time
combined_df['week_label'] = combined_df['week_start'].dt.strftime('Week of %Y-%m-%d')
In [4]:
# Count distinct users per week per topic
aggregated_df = combined_df.groupby(
['week_start', 'week_label', 'topic']
)['username'].nunique().reset_index(name='distinct_user_count')
# Pivot for heatmap
heatmap_df = aggregated_df.pivot(
index='week_start', columns='topic', values='distinct_user_count'
).fillna(0).sort_index()
heatmap_df.index = heatmap_df.index.strftime('Week of %Y-%m-%d')
# Select top 20 topics
top_topics = aggregated_df.groupby('topic')['distinct_user_count'].sum().nlargest(20).index
heatmap_top = heatmap_df[top_topics]
In [5]:
# Plot heatmap
plt.figure(figsize=(20, 24))
sns.heatmap(
heatmap_top,
annot=True,
fmt=".0f",
cmap='rocket_r',
linewidths=0.5,
linecolor='gray',
cbar_kws={'label': 'Number of Distinct Users'}
)
plt.title('Weekly Distinct Users for Top 20 Topics', fontsize=18)
plt.xlabel('Topic', fontsize=14)
plt.ylabel('Week', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
In [ ]: