# setup
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
from collections import defaultdict

# @replace DATA_SOURCES
DATA_SOURCES = {'badges': '/data/badges', 'datagrepper-raw': '/data/datagrepper-raw', 'datagrepper-parse-accounts': '/data/datagrepper-parse-accounts'}
parquet_dir = DATA_SOURCES["datagrepper-parse-accounts"]

import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pyarrow as pa

dataset = ds.dataset(parquet_dir, format="parquet")
table = dataset.to_table(columns=["username", "sent_at", "topic"])
combined_df = table.to_pandas()

combined_df.dropna(subset=['sent_at', 'username'], inplace=True)

max_date = combined_df['sent_at'].max().date()
min_date = combined_df['sent_at'].min().date()
print(f"Maximum date in data: {max_date}")
print(f"Minimum date in data: {min_date}")

Maximum date in data: 2025-06-28
Minimum date in data: 2024-01-01

combined_df = combined_df[~combined_df['topic'].str.startswith("org.centos", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("io.pagure.prod", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.fedoraproject.prod.mailman.receive", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.fedoraproject.prod.bugzilla", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.release-monitoring", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.fedoraproject.prod.copr", na=False)]
combined_df = combined_df[~combined_df['topic'].str.startswith("org.fedoraproject.prod.discourse", na=False)]

combined_df['month'] = combined_df['sent_at'].dt.to_period('M')

event_counts = (
    combined_df.groupby(['username', 'month'])
    .size()
    .reset_index(name='event_count')
)

filtered_events = event_counts[event_counts['event_count'] > 10]

user_months = defaultdict(set)
for row in filtered_events.itertuples():
    user_months[row.username].add(row.month)

valid_months = sorted(filtered_events['month'].unique())
month_to_users = {
    month: set(filtered_events[filtered_events['month'] == month]['username'])
    for month in valid_months
}

user_activity_rows = []

for username, active_months in user_months.items():
    for month in sorted(active_months):
        next_month_idx = valid_months.index(month) + 1
        if next_month_idx < len(valid_months):
            next_month = valid_months[next_month_idx]
            active_next_month = username in month_to_users[next_month]
        else:
            active_next_month = False
        user_activity_rows.append({
            'user': username,
            'month': str(month),
            'active_this_month': True,
            'active_next_month': active_next_month,
            'status': 'returning' if active_next_month else 'churned'
        })

user_activity_df = pd.DataFrame(user_activity_rows)

data = pd.DataFrame(user_activity_rows)
all_months = sorted(data['month'].unique())

month_to_users = {
    month: set(data[data['month'] == month]['user']) for month in all_months
}

delta_rows = []

for i in range(1, len(all_months)-1):
    prev_month = all_months[i - 1]
    curr_month = all_months[i]

    prev_users = month_to_users[prev_month]
    curr_users = month_to_users[curr_month]

    retained_users = curr_users & prev_users
    outflow_users = prev_users - curr_users
    inflow_users = curr_users - prev_users

    delta_rows.append({
        'month': str(curr_month),
        'retained': len(retained_users),
        'outflow': len(outflow_users),
        'inflow': len(inflow_users),
        'retained_users': sorted(retained_users),
        'outflow_users': sorted(outflow_users),
        'inflow_users': sorted(inflow_users),

    })

delta_df = pd.DataFrame(delta_rows).set_index('month')

ax = delta_df[['inflow', 'retained', 'outflow']].plot(
    kind='bar',
    stacked=True,
    color=['lightblue', 'blue', 'red'],
    figsize=(12, 6)
)
for idx, row in enumerate(delta_df.itertuples()):
    bottom = 0
    for col, color in zip(['inflow', 'retained', 'outflow'], ['lightblue', 'blue', 'red']):
        value = getattr(row, col)
        if value > 0:
            ax.text(
                idx,
                bottom + value / 2,
                str(value),
                ha='center',
                va='center',
                fontsize=9,
                color='white' if color != 'yellow' else 'black',
                fontweight='bold'
            )
        bottom += value

plt.title("Monthly Contributor Inflow, Retention, and Outflow")
plt.xlabel("Month")
plt.ylabel("Number of Users")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

FAS Account Inflow / Outflow by Month¶