# setup
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
import glob

# @replace DATA_SOURCES
DATA_SOURCES = {'badges': '/data/badges', 'datagrepper-raw': '/data/datagrepper-raw', 'datagrepper-parse-accounts': '/data/datagrepper-parse-accounts'}
# Directory registered under the "datagrepper-parse-accounts" source key.
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]

# Every *.parquet file directly inside parquet_dir (non-recursive), sorted by
# path so the files are always read in the same order.
parquet_files = sorted(glob.glob(f"{parquet_dir}/*.parquet"))

# Load the parquet files 50 at a time. Each batch is concatenated and has its
# timestamps normalised right away; note that every batch frame is still kept
# in memory until the final concat below.
BATCH_SIZE = 50


def _load_batch(paths):
    """Read the given parquet files into one frame with normalised `sent_at`.

    Unparseable timestamps become NaT (errors='coerce') and valid ones are
    truncated down to whole seconds.
    """
    frame = pd.concat(
        [pd.read_parquet(path) for path in paths],
        ignore_index=True,
    )
    parsed = pd.to_datetime(frame['sent_at'], errors='coerce')
    frame['sent_at'] = parsed.dt.floor('s')
    return frame


chunks = [
    _load_batch(parquet_files[start:start + BATCH_SIZE])
    for start in range(0, len(parquet_files), BATCH_SIZE)
]

combined_df = pd.concat(chunks, ignore_index=True)
# Drop rows with no usable timestamp or no username.
combined_df.dropna(subset=['sent_at', 'username'], inplace=True)

# Report the calendar-date span covered by the loaded messages.
sent_at_col = combined_df['sent_at']
min_date = sent_at_col.min().date()
max_date = sent_at_col.max().date()
print(f"Maximum date in data: {max_date}")
print(f"Minimum date in data: {min_date}")

Maximum date in data: 2025-06-28
Minimum date in data: 2024-01-01

# Bucket every message into its calendar month (a monthly pandas Period).
combined_df['month'] = combined_df['sent_at'].dt.to_period(freq='M')

def classify_system(topic):
    """Map a message-bus topic to the communication system it belongs to.

    Args:
        topic: Topic string of a message. Non-string values (for example
            NaN or None from a missing column value) are accepted.

    Returns:
        'Discourse Post' when the topic starts with the Discourse
        topic-created or post-created prefix, 'Mailing List Post' when it
        is exactly the mailman receive topic, and 'Other' for everything
        else, including non-string input.
    """
    # Rows are not filtered on `topic`, so a missing value can reach this
    # function; treat it as unclassified instead of raising AttributeError.
    if not isinstance(topic, str):
        return 'Other'

    discourse_prefixes = (
        'org.fedoraproject.prod.discourse.topic.topic_created',
        'org.fedoraproject.prod.discourse.post.post_created',
    )
    if topic.startswith(discourse_prefixes):
        return 'Discourse Post'
    if topic == 'org.fedoraproject.prod.mailman.receive':
        return 'Mailing List Post'
    return 'Other'

# Label every message with its system, keep only the two posting systems,
# then count messages per (month, system) and pivot systems into columns.
post_systems = ['Discourse Post', 'Mailing List Post']

combined_df['system'] = combined_df['topic'].map(classify_system)

is_post = combined_df['system'].isin(post_systems)
filtered_df = combined_df[is_post]

monthly_counts = filtered_df.groupby(['month', 'system']).size()
# Months in which one system has no messages get 0 rather than NaN.
activity_summary = monthly_counts.unstack(fill_value=0).sort_index()

# Grouped bar chart: one group per month, one bar per system.
activity_summary.plot.bar(figsize=(12, 6))
plt.xlabel('Month')
plt.ylabel('Message Count')
plt.title('Discourse vs Mailing List Activity Over Time')
# Tilt the month labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Mailman Posts vs Discourse Posts by Month