Wiki Edits

Show the code
---
title: Wiki Edit History
description: Fedora Wiki edit history trends.
date: 2025-10-12
---
{'title': 'Wiki Edits', 'refresh': 'monthly'}
Show the code
# common fedora commops analytics includes
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime
from collections import defaultdict
import os
from pyarrow import fs
import pyarrow.dataset as ds
from pathlib import Path
Show the code
# @replace DATA_SOURCES
DATA_SOURCES = {"datagrepper-topics": "/home/jovyan/work/bus2parquet/output_parquets"}

parquet_dir = DATA_SOURCES["datagrepper-topics"]
topic = "org.fedoraproject.prod.wiki.article.edit"

# First day of the month 12 months back; kept for the (currently disabled)
# recency filter noted below.
cutoff_date = (pd.Timestamp.now().replace(day=1) - pd.DateOffset(months=12)).date()

# Collect every non-empty parquet file for the topic. The per-file date
# filter is intentionally disabled so the full edit history loads; to
# re-enable it, parse the YYYYMMDD portion of the stem and compare the
# result against cutoff_date before appending.
files = [
    str(p)
    for p in Path(f"{parquet_dir}/{topic}").glob("fedora-*.parquet")
    if os.path.getsize(p) > 0
]

local_fs = fs.LocalFileSystem()  # NOTE(review): unused below — kept in case later cells rely on it
tables = []

# First pass: discover the union of column names across all files.
# Only the parquet footer is needed for this, so read schemas rather
# than materializing whole tables (pq.read_schema reads metadata only).
all_fields = set()
for f in files:
    try:
        all_fields.update(pq.read_schema(f).names)
    except Exception as e:
        print(f"[WARN] Skipping {f}: {e}")

# Unified schema: every column forced to string so heterogeneous files
# can be concatenated without type conflicts.
unified_schema = pa.schema([pa.field(name, pa.string()) for name in sorted(all_fields)])

# Second pass: cast each table to unified schema
# Second pass: load each file and cast it onto the unified all-string schema.
for f in files:
    try:
        tbl = pq.read_table(f)
        # Cast the columns this file has to string.
        casted = {name: tbl[name].cast(pa.string()) for name in tbl.schema.names}
        # Fill columns this file lacks with typed nulls.
        n_rows = len(tbl)
        for name in unified_schema.names:
            if name not in casted:
                casted[name] = pa.array([None] * n_rows, type=pa.string())
        # Reassemble with columns in unified-schema order.
        tables.append(
            pa.table([casted[name] for name in unified_schema.names], schema=unified_schema)
        )
    except Exception as e:
        print(f"[WARN] Skipping {f}: {e}")

if tables:
    # promote_options="default" replaces the deprecated promote=True
    # (source of the FutureWarning in the original run); all tables already
    # share unified_schema, so no actual promotion occurs.
    table = pa.concat_tables(tables, promote_options="default")
    df = table.to_pandas()
    print(f"Loaded {len(df)} records from {len(tables)} tables.")
else:
    print("No valid parquet files found")
Loaded 5761 records from 1331 tables.
/opt/conda/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3526: FutureWarning: promote has been superseded by promote_options='default'.
  exec(code_obj, self.user_global_ns, self.user_ns)
Show the code
import json

# Make a working copy of the loaded messages.
wiki = df.copy()


def _extract_sent_at(headers):
    """Pull the 'sent-at' timestamp string out of a headers JSON blob.

    Returns None for missing/NaN values and for malformed JSON, so one
    bad row cannot abort the whole apply() (the original raised on any
    undecodable header).
    """
    if pd.isna(headers):
        return None
    try:
        return json.loads(headers).get("sent-at")
    except (json.JSONDecodeError, TypeError):
        return None


# utc=True keeps the column a single tz-aware datetime64 even when rows
# carry mixed UTC offsets; unparseable values become NaT.
# NOTE(review): assumes 'sent-at' strings are UTC/offset-stamped — confirm
# against the message schema before relying on wall-clock hours.
wiki["sent_at"] = pd.to_datetime(
    wiki["headers"].apply(_extract_sent_at),
    errors="coerce",
    utc=True,
)

# Drop rows with no timestamp
# Monthly edit counts. Aggregate on the filtered frame's own column so the
# groupby key and the rows are guaranteed aligned (the original grouped the
# dropna'd frame by a key series taken from the unfiltered frame).
valid = wiki.dropna(subset=["sent_at"])
sent = valid["sent_at"]
if sent.dt.tz is not None:
    # Drop tz info explicitly; to_period() would otherwise do it silently
    # and emit the "will drop timezone information" FutureWarning.
    sent = sent.dt.tz_localize(None)
monthly_edits = sent.dt.to_period("M").value_counts().sort_index()

if monthly_edits.empty:
    print("⚠️ No valid sent_at timestamps found for monthly aggregation.")
else:
    plt.figure(figsize=(12, 5))
    monthly_edits.index = monthly_edits.index.astype(str)  # string labels for x-axis
    monthly_edits.plot(kind="line", marker="o", color="teal")
    plt.title("Wiki Edits per Month")
    plt.ylabel("Edits")
    plt.xlabel("Month")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
/tmp/ipykernel_933279/3040856889.py:15: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.
  monthly_edits = wiki.dropna(subset=["sent_at"]).groupby(wiki["sent_at"].dt.to_period("M")).size()

Show the code
# Ten most active editors by raw edit count.
top_editors = wiki["user"].value_counts().head(10)

plt.figure(figsize=(10, 5))
# Assign hue=y and legend=False: passing palette without hue is deprecated
# in seaborn and slated for removal in v0.14 (source of the FutureWarning).
sns.barplot(
    x=top_editors.values,
    y=top_editors.index,
    hue=top_editors.index,
    palette="viridis",
    legend=False,
)
plt.title("Top 10 Editors by Edit Count")
plt.xlabel("Edits")
plt.ylabel("User")
plt.tight_layout()
plt.show()
/tmp/ipykernel_933279/1153112511.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_editors.values, y=top_editors.index, palette="viridis")

Show the code

# Normalize titles: collapse every Test_Results* page into one bucket so
# per-release QA result pages do not each occupy a ranking slot.
wiki["title_grouped"] = wiki["title"].apply(
    lambda t: "Test Results Pages" if isinstance(t, str) and t.startswith("Test_Results") else t
)

top_pages = wiki["title_grouped"].value_counts().head(10)

plt.figure(figsize=(10, 5))
# Assign hue=y and legend=False: passing palette without hue is deprecated
# in seaborn and slated for removal in v0.14 (source of the FutureWarning).
sns.barplot(
    x=top_pages.values,
    y=top_pages.index,
    hue=top_pages.index,
    palette="magma",
    legend=False,
)
plt.title("Top 10 Wiki Pages by Edits (Grouped)")
plt.xlabel("Edits")
plt.ylabel("Page Title")
plt.tight_layout()
plt.show()
/tmp/ipykernel_933279/1856106439.py:9: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_pages.values, y=top_pages.index, palette="magma")

Show the code
# Drop rows without a valid timestamp before deriving calendar features.
wiki = wiki.dropna(subset=["sent_at"]).copy()

wiki["day_of_week"] = wiki["sent_at"].dt.day_name()
wiki["hour"] = wiki["sent_at"].dt.hour

# Edits per (weekday, hour) cell.
heatmap_data = wiki.groupby(["day_of_week", "hour"]).size().unstack(fill_value=0)

# Fixed Monday-first row order plus a full 0-23 hour column range;
# fill_value=0 renders days/hours with no edits as zero cells instead of
# the NaN gaps a bare reindex would leave.
days_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
heatmap_data = heatmap_data.reindex(index=days_order, columns=range(24), fill_value=0)

plt.figure(figsize=(14, 6))
sns.heatmap(heatmap_data, cmap="YlGnBu")
plt.title("Wiki Edits by Day of Week and Hour")
plt.xlabel("Hour of Day")
plt.ylabel("Day of Week")
plt.show()