Bugs Edited by Affiliation

This analysis visualizes which communities are editing bugs in Bugzilla, grouping editors by whether they use a redhat.com email address or a non-Red Hat address.
Published

October 12, 2025

Bugs Edited by Affiliation

This analysis visualizes which communities are editing bugs in Bugzilla, grouping editors by whether they use a redhat.com email address or a non-Red Hat address. This may not be accurate for some community members who work at Red Hat and use their Red Hat email for contributions that are not part of their role. It also might not be correct for "shadow" Red Hat users who use a personal email for community work. At best, this is an estimate.

Show the code
# common fedora commops analytics includes
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime
from collections import defaultdict
import os
from pyarrow import fs
import pyarrow.dataset as ds
from pathlib import Path
Show the code
# @replace DATA_SOURCES
DATA_SOURCES = {"datagrepper-topics": "/home/jovyan/work/bus2parquet/output_parquets"}

# Parquet files for a topic live under <parquet_dir>/<topic>/fedora-*.parquet.
parquet_dir = DATA_SOURCES["datagrepper-topics"]
topic = "org.fedoraproject.prod.bugzilla.bug.update"

# First day of the month, twelve months before the current month.
# NOTE: this cutoff is not applied when selecting files below; every
# non-empty file for the topic is loaded regardless of its date.
cutoff_date = (pd.Timestamp.now().replace(day=1) - pd.DateOffset(months=12)).date()

# Collect the non-empty parquet files for the topic. The glob result is sorted
# so the load order (and the row order of the combined table) is reproducible
# between runs. Zero-byte files are skipped because they cannot be read as
# parquet.
files = [
    str(p)
    for p in sorted(Path(f"{parquet_dir}/{topic}").glob("fedora-*.parquet"))
    if p.stat().st_size > 0
]

local_fs = fs.LocalFileSystem()
tables = []

# First pass: collect the column names of every file. Only the parquet schema
# is read here (not the row data), so each file's contents are loaded once,
# in the second pass, instead of twice.
all_fields = {}
readable_files = []
for f in files:
    try:
        file_schema = pq.read_schema(f)
    except Exception as e:
        print(f"[WARN] Skipping {f}: {e}")
        continue
    # Force everything to string so files with differing column types unify.
    all_fields.update(dict.fromkeys(file_schema.names, pa.string()))
    readable_files.append(f)

# Build unified schema: one string field per column name seen, sorted by name.
unified_schema = pa.schema([pa.field(name, pa.string()) for name in sorted(all_fields)])

# Second pass: load each readable file and conform it to the unified schema.
# Files that already failed in the first pass are not retried.
for f in readable_files:
    try:
        tbl = pq.read_table(f)
        present = set(tbl.schema.names)
        # Existing columns are cast to string; columns this file lacks are
        # filled with nulls of the same length.
        columns = [
            tbl[name].cast(pa.string())
            if name in present
            else pa.nulls(len(tbl), type=pa.string())
            for name in unified_schema.names
        ]
        tables.append(pa.table(columns, schema=unified_schema))
    except Exception as e:
        print(f"[WARN] Skipping {f}: {e}")

if tables:
    # Each table was built against the same unified schema, so a plain
    # concatenation is sufficient. The deprecated ``promote=True`` flag
    # (which triggers a FutureWarning) is not needed.
    table = pa.concat_tables(tables)
    df = table.to_pandas()
    print(f"Loaded {len(df)} bug update records from {len(tables)} tables.")
else:
    print("No valid parquet files found")
/opt/conda/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3526: FutureWarning: promote has been superseded by promote_options='default'.
  exec(code_obj, self.user_global_ns, self.user_ns)
Loaded 78224 bug update records from 1267 tables.
Show the code
records = []
skipped_rows = 0
first_error = None

# Flatten each raw message row into the handful of fields the analysis uses.
for row in df.itertuples(index=False):
    try:
        # event_time is interpreted as epoch seconds. Missing or unparseable
        # values become NaT so the timestamp column has one consistent
        # "missing" marker.
        ts = pd.NaT
        if row.event_time:
            try:
                ts = pd.to_datetime(int(row.event_time), unit="s")
            except (ValueError, TypeError, OverflowError):
                ts = pd.NaT

        records.append({
            "bug_id": row.bug_id,
            "priority": row.bug_priority or "unspecified",
            "status": row.bug_status_name,
            "email": row.event_who,
            "action": row.event_action,
            "timestamp": ts,
            "month": ts.strftime("%Y-%m") if pd.notnull(ts) else None,
            "event_category": row.event_routing_key
        })
    except Exception as exc:
        # Best effort: keep going, but remember that rows were dropped so the
        # loss is reported below instead of passing silently.
        skipped_rows += 1
        if first_error is None:
            first_error = exc

if skipped_rows:
    print(f"[WARN] Skipped {skipped_rows} rows that could not be structured (first error: {first_error!r})")

structured_df = pd.DataFrame.from_records(records)
print(f"Structured {len(structured_df)} records.")

if not structured_df.empty:
    print("Maximum date in data:", structured_df["timestamp"].max().date())
    print("Minimum date in data:", structured_df["timestamp"].min().date())
else:
    print("No data found in cutoff range")
Structured 78224 records.
Maximum date in data: 2025-09-30
Minimum date in data: 2022-01-01
Show the code
def _affiliation(email):
    """Classify an editor's email address as "Red Hat" or "Community".

    Addresses ending in "@redhat.com" (compared case-insensitively, since
    domain names are not case-sensitive) are "Red Hat". Everything else,
    including missing or non-string values, is "Community".
    """
    if isinstance(email, str) and email.lower().endswith("@redhat.com"):
        return "Red Hat"
    return "Community"


structured_df["group"] = structured_df["email"].apply(_affiliation)
Show the code
def group_bugs(dfimport, action_filter, status_filter=None):
    """Count distinct bugs per month and affiliation group.

    Args:
        dfimport: DataFrame with "action", "status", "month", "group" and
            "bug_id" columns.
        action_filter: Only rows whose "action" equals this value are counted.
        status_filter: Optional status name. When given, only rows whose
            "status" matches it are counted; the comparison ignores case on
            both sides.

    Returns:
        DataFrame indexed by month with one column per group, holding the
        number of unique bug ids; month/group pairs with no rows are 0.
    """
    selected = dfimport[dfimport["action"] == action_filter]
    if status_filter:
        # Upper-case both sides so a caller passing "closed" matches "CLOSED".
        wanted = status_filter.upper()
        selected = selected[selected["status"].str.upper() == wanted]
    return selected.groupby(["month", "group"])["bug_id"].nunique().unstack(fill_value=0)

# Monthly distinct-bug counts per affiliation group, one table per activity.
bugs_opened = group_bugs(structured_df, "create")
bugs_edited = group_bugs(structured_df, "modify")
bugs_closed = group_bugs(structured_df, "modify", status_filter="CLOSED")

# Distinct bugs touched by "modify" events, broken down by priority and group.
modify_events = structured_df[structured_df["action"] == "modify"]
bugs_by_priority = (
    modify_events.groupby(["priority", "group"])["bug_id"]
    .nunique()
    .unstack(fill_value=0)
)
Show the code
import matplotlib.pyplot as plt

def plot(df, title):
    """Show a stacked bar chart of bug counts split by affiliation group.

    Args:
        df: DataFrame whose index supplies the x-axis categories and whose
            columns are the stacked series; "Red Hat" is drawn in red and
            "Community" in blue.
        title: Text used as the chart title.
    """
    palette = {"Red Hat": "red", "Community": "blue"}
    ordered = df.sort_index()
    ax = ordered.plot(kind="bar", color=palette, stacked=True, figsize=(12, 6))
    ax.set_title(title)
    ax.set_xlabel("Month or Priority")
    ax.set_ylabel("Number of Bugs")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
Show the code
# Render one stacked chart per activity type: opened, closed, then edited.
for counts, heading in (
    (bugs_opened, "Bugs Opened: Red Hat vs Community"),
    (bugs_closed, "Bugs Closed: Red Hat vs Community"),
    (bugs_edited, "Bugs Edited: Red Hat vs Community"),
):
    plot(counts, heading)