# setup
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
import glob

# Load every parquet file of the "datagrepper-parse-accounts" source into one
# DataFrame, normalise the `sent_at` timestamps, and print the date range
# covered by the data.
# NOTE(review): the "@replace" marker below looks like a placeholder consumed
# by an external templating step -- keep it directly above the assignment.
# @replace DATA_SOURCES
DATA_SOURCES = {'badges': '/data/badges', 'datagrepper-raw': '/data/datagrepper-raw', 'datagrepper-parse-accounts': '/data/datagrepper-parse-accounts'}
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]

# Sorted so the files are always read in the same, name-ordered sequence.
parquet_files = sorted(glob.glob(f"{parquet_dir}/*.parquet"))
if not parquet_files:
    # Fail early with an actionable message: with no files, `chunks` stays
    # empty and pd.concat() below would raise an opaque
    # "No objects to concatenate" ValueError instead.
    raise FileNotFoundError(f"No .parquet files found in {parquet_dir!r}")

# Read the files in batches of 50 and normalise timestamps per batch.
chunks = []
for batch_start in range(0, len(parquet_files), 50):
    batch = parquet_files[batch_start:batch_start + 50]
    chunk_df = pd.concat(
        [pd.read_parquet(f) for f in batch],
        ignore_index=True
    )
    # Unparseable timestamps become NaT (dropped below); valid ones are
    # truncated to whole seconds.
    chunk_df['sent_at'] = pd.to_datetime(chunk_df['sent_at'], errors='coerce').dt.floor('s')
    chunks.append(chunk_df)

combined_df = pd.concat(chunks, ignore_index=True)
# Rows without a usable timestamp or username cannot be attributed to a
# user/week, so discard them.
combined_df.dropna(subset=['sent_at', 'username'], inplace=True)

# Report the date range covered by the loaded data.
max_date = combined_df['sent_at'].max().date()
min_date = combined_df['sent_at'].min().date()
print(f"Maximum date in data: {max_date}")
print(f"Minimum date in data: {min_date}")

# Output of the cell above:
#   Maximum date in data: 2025-06-28
#   Minimum date in data: 2024-01-01

# Build the week-by-topic matrix of distinct users that feeds the heatmap.

# Discard rows whose timestamp is missing, recording the row count before and
# after. NOTE(review): the load step already drops NaT `sent_at` rows, so the
# two counts are expected to match; neither is used later in the visible code.
initial_count = combined_df.shape[0]
combined_df.dropna(subset=['sent_at'], inplace=True)
cleaned_count = combined_df.shape[0]

# Bucket every message into its calendar week. pandas' 'W' alias is the
# Sunday-anchored weekly frequency, so each period's start_time is a Monday.
weekly_period = combined_df['sent_at'].dt.to_period('W')
combined_df['week_start'] = weekly_period.dt.start_time
combined_df['week_label'] = combined_df['week_start'].dt.strftime('Week of %Y-%m-%d')

# Number of distinct usernames seen for each (week, topic) pair.
users_per_week_topic = combined_df.groupby(
    ['week_start', 'week_label', 'topic']
)['username'].nunique()
aggregated_df = users_per_week_topic.reset_index(name='distinct_user_count')

# Reshape to one row per week and one column per topic; a (week, topic) pair
# with no activity becomes 0. Rows are sorted chronologically before the index
# is replaced by its display label.
week_topic_matrix = aggregated_df.pivot(
    index='week_start', columns='topic', values='distinct_user_count'
)
heatmap_df = week_topic_matrix.fillna(0).sort_index()
heatmap_df.index = heatmap_df.index.strftime('Week of %Y-%m-%d')

# Keep the 20 topics with the largest sum of weekly distinct-user counts,
# ordered from largest to smallest.
topic_totals = aggregated_df.groupby('topic')['distinct_user_count'].sum()
top_topics = topic_totals.nlargest(20).index
heatmap_top = heatmap_df.loc[:, top_topics]

# Draw the annotated heatmap: one row per week, one column per top topic,
# each cell showing the number of distinct users.
fig, ax = plt.subplots(figsize=(20, 24))

heatmap_options = {
    'annot': True,           # print the count inside every cell
    'fmt': ".0f",            # counts are floats after fillna; show no decimals
    'cmap': 'rocket_r',
    'linewidths': 0.5,
    'linecolor': 'gray',
    'cbar_kws': {'label': 'Number of Distinct Users'},
}
sns.heatmap(heatmap_top, ax=ax, **heatmap_options)

ax.set_title('Weekly Distinct Users for Top 20 Topics', fontsize=18)
ax.set_xlabel('Topic', fontsize=14)
ax.set_ylabel('Week', fontsize=14)

# Slant the topic names so they do not overlap; keep the week labels level.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)

fig.tight_layout()
plt.show()

# Weekly Distinct Users by Topic (Top 20)