Show the code
---
title: Wiki Edit History
description: Fedora Wiki edit history trends.
date: 2025-10-12
---
{'title': 'Wiki Edits', 'refresh': 'monthly'}
{'title': 'Wiki Edits', 'refresh': 'monthly'}
# common fedora commops analytics includes
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime
from collections import defaultdict
import os
from pyarrow import fs
import pyarrow.dataset as ds
from pathlib import Path# @replace DATA_SOURCES
DATA_SOURCES = {"datagrepper-topics": "/home/jovyan/work/bus2parquet/output_parquets"}
# Load every non-empty parquet export of the wiki-edit topic into a single
# all-string DataFrame named `df`.
parquet_dir = DATA_SOURCES["datagrepper-topics"]
topic = "org.fedoraproject.prod.wiki.article.edit"

# First day of the month twelve months back. The date filter is currently
# disabled, so this value is computed but not applied to the file list.
cutoff_date = (pd.Timestamp.now().replace(day=1) - pd.DateOffset(months=12)).date()

# Sorted so the row order of `df` does not depend on directory listing order.
# NOTE: to re-enable the 12-month cutoff, parse the YYYYMMDD stamp that follows
# "fedora-" in each file name and keep only files dated >= cutoff_date.
files = sorted(
    str(p)
    for p in Path(f"{parquet_dir}/{topic}").glob("fedora-*.parquet")
    if os.path.getsize(p) > 0
)

# Not used in this cell; retained in case other cells rely on the name.
local_fs = fs.LocalFileSystem()

# Pass 1: read each file exactly once, remembering the table and the union of
# all column names. Unreadable files are reported and skipped.
all_fields = {}
loaded = []
for f in files:
    try:
        tbl = pq.read_table(f)
    except Exception as e:
        print(f"[WARN] Skipping {f}: {e}")
        continue
    loaded.append((f, tbl))
    for name in tbl.schema.names:
        all_fields[name] = pa.string()  # force everything to string

# Build unified schema: every column seen in any file, typed as string.
unified_schema = pa.schema([pa.field(name, pa.string()) for name in sorted(all_fields)])

# Pass 2: cast each table to the unified schema.
tables = []
for f, tbl in loaded:
    try:
        # Cast existing columns to string.
        casted = {name: tbl[name].cast(pa.string()) for name in tbl.schema.names}
        # Add missing columns as null strings.
        for name in unified_schema.names:
            if name not in casted:
                casted[name] = pa.array([None] * len(tbl), type=pa.string())
        tables.append(
            pa.table(
                [casted[name] for name in unified_schema.names],
                schema=unified_schema,
            )
        )
    except Exception as e:
        print(f"[WARN] Skipping {f}: {e}")
del loaded  # the raw tables are no longer needed once cast

if tables:
    # Every table already carries `unified_schema`, so no schema promotion is
    # required; this also avoids the deprecated `promote=True` argument.
    table = pa.concat_tables(tables)
    df = table.to_pandas()
    print(f"Loaded {len(df)} records from {len(tables)} tables.")
else:
    print("No valid parquet files found")
# Recorded output of the last run: Loaded 5761 records from 1331 tables.
/opt/conda/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3526: FutureWarning: promote has been superseded by promote_options='default'.
exec(code_obj, self.user_global_ns, self.user_ns)
import json


def _sent_at_from_headers(raw):
    """Return the "sent-at" value from a JSON-encoded headers string, or None.

    Missing values, undecodable JSON and non-object JSON all yield None so the
    timestamp conversion below can coerce them to NaT instead of raising.
    """
    if pd.isna(raw):
        return None
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):
        return None
    return parsed.get("sent-at") if isinstance(parsed, dict) else None


# Make a working copy
wiki = df.copy()

# Extract 'sent-at' from headers JSON
wiki["sent_at"] = pd.to_datetime(
    wiki["headers"].apply(_sent_at_from_headers),
    errors="coerce",
)

# Drop rows with no timestamp, then count edits per calendar month.
# The month key is derived from the same filtered frame that is grouped, and
# the timezone is removed explicitly first: calling to_period() on tz-aware
# data emits "UserWarning: Converting to PeriodArray/Index representation will
# drop timezone information."
dated = wiki.dropna(subset=["sent_at"])
month_key = dated["sent_at"].dt.tz_localize(None).dt.to_period("M")
monthly_edits = dated.groupby(month_key).size()

if monthly_edits.empty:
    print("⚠️ No valid sent_at timestamps found for monthly aggregation.")
else:
    plt.figure(figsize=(12, 5))
    monthly_edits.index = monthly_edits.index.astype(str)  # string labels for x-axis
    monthly_edits.plot(kind="line", marker="o", color="teal")
    plt.title("Wiki Edits per Month")
    plt.ylabel("Edits")
    plt.xlabel("Month")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
monthly_edits = wiki.dropna(subset=["sent_at"]).groupby(wiki["sent_at"].dt.to_period("M")).size()

/tmp/ipykernel_933279/1153112511.py:4: FutureWarning:
Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
sns.barplot(x=top_editors.values, y=top_editors.index, palette="viridis")

def _group_title(title):
    """Collapse every page whose title starts with "Test_Results" into one label."""
    if isinstance(title, str) and title.startswith("Test_Results"):
        return "Test Results Pages"
    return title


# Normalize titles: group Test_Results pages
wiki["title_grouped"] = wiki["title"].apply(_group_title)
top_pages = wiki["title_grouped"].value_counts().head(10)

plt.figure(figsize=(10, 5))
# seaborn deprecates `palette` without `hue` (FutureWarning, removal announced
# for v0.14.0); assigning the y variable to `hue` with legend=False keeps the
# same per-bar colouring.
sns.barplot(
    x=top_pages.values,
    y=top_pages.index,
    hue=top_pages.index,
    palette="magma",
    legend=False,
)
plt.title("Top 10 Wiki Pages by Edits (Grouped)")
plt.xlabel("Edits")
plt.ylabel("Page Title")
plt.tight_layout()
plt.show()
Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
sns.barplot(x=top_pages.values, y=top_pages.index, palette="magma")

# Drop invalid timestamps
wiki = wiki.dropna(subset=["sent_at"]).copy()
wiki["day_of_week"] = wiki["sent_at"].dt.day_name()
wiki["hour"] = wiki["sent_at"].dt.hour

# Edit counts with one row per weekday and one column per hour.
heatmap_data = wiki.groupby(["day_of_week", "hour"]).size().unstack(fill_value=0)

# Order weekdays and force the full 7 x 24 grid. Weekdays or hours without any
# edit are filled with 0 rather than appearing as NaN rows or being dropped
# from the x-axis.
days_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
heatmap_data = heatmap_data.reindex(index=days_order, columns=range(24), fill_value=0)

plt.figure(figsize=(14, 6))
sns.heatmap(heatmap_data, cmap="YlGnBu")
plt.title("Wiki Edits by Day of Week and Hour")
plt.xlabel("Hour of Day")
plt.ylabel("Day of Week")
plt.tight_layout()
plt.show()