KVM Forum 2025 QEMU Zeitgeist Report

QEMU Status Report

Setup

This is the setup preamble for the diagram and graph generation. The actual presentation will be done in the usual corporate Google Slides, so we just need to export graphics that can easily be embedded.

Generic Helpers

import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.colors as mcolors
from datetime import datetime
import re

years = mdates.YearLocator()   # every year
twoyears = mdates.YearLocator(2)   # every other year
threeyears = mdates.YearLocator(3)   # every 3rd year
months = mdates.MonthLocator()  # every month
threemonths = mdates.MonthLocator(interval=3)  # every 3rd month
yearsFmt = mdates.DateFormatter('%Y')
git shortlog -sn --all --since "last year" | wc -l
git log --oneline --no-merges --since "last year" | wc -l 
git log --oneline --merges --since "last year" | wc -l 
(concat default-directory "generated/kvm25/" file)
(concat default-directory "data/" file)
png=${svg/.svg/.png}
inkscape --export-background-opacity=0 --export-filename="$png" "$svg"
echo "Converted $svg to $png"
(concat default-directory file)

Python Setup

Because I want to use newer Python modules that are not packaged by my distro, let’s create a pyvenv for the project:

# Check if the venv directory exists. If not, create it.
if [ ! -d "$pyvenv_dir" ]; then
  python3 -m venv "$pyvenv_dir"
fi
# Install required packages
"$pyvenv_dir"/bin/pip install pandas plotly plotly-express squarify matplotlib

Treemap Helpers

This function is responsible for extracting the data for a given branch against a given base tag. cherry_filter is used to restrict the final data just to areas touched by non-upstream code.

We do a fair bit of data massaging to get something useful. We skip imported headers and also most binary files. We use an external tool (cloc) to measure lines of code. We also have to take care when issuing git commands against an empty file list - but we do use a file list to make sure we get everything in the leaf nodes.

import os
import re
import git
import csv
from subprocess import check_output, STDOUT

# list of arches
arch = []
repo = git.Repo(tree)

# ss_re = re.compile(r'(\d+) file?s changed, (\d+) insertion?s\(\+\), (\d+) deletion?s\(-\)')
# ss_re = re.compile(r'(\d+) file?s changed(?:, (\d+) insertion?s\(\+\))?(?:, (\d+) deletion?s\(-\))?')
# ss_re = re.compile(r'(\d+) file(?:s)? changed, (\d+) insertion(?:s)?\(\+\), (\d+) deletion(?:s)?\(-\)')
ss_re = re.compile(r'(\d+) file(?:s)? changed(?:, (\d+) insertion(?:s)?\(\+\))?(?:, (\d+) deletion(?:s)?\(-\))?')
arch_re = re.compile(r'^configs\/targets\/([^-\/]+)')

os.makedirs(os.path.dirname(cdata), exist_ok=True)
csv_fd = open(cdata, 'w')
csv_out = csv.writer(csv_fd)

# write the header
csv_out.writerow(["first", "second", "third",               # calc
                  "files", "inc_subdir",
                  "lines", "blank", "comment", "code",      # via loc
                  "commits", "changed"])                    # diff

cherry_commits = set()
if cherry_filter == "yes":
    added = repo.git.cherry(base, branch).split("\n")
    cherry_commits = set([c.lstrip('+ ')[:11] for c in added])

# this only really matters for arches that have their own subdirs
def contains_arch(text):
    return any(a in text for a in arch)

def filter_dirs(dirs):
    # remove extraneous stuff
    minus_headers = [d for d in dirs if
                     "roms" not in d and
                     "pc-bios" not in d and
                     "standard-headers" not in d and
                     "linux-headers" not in d]
    # don't go too deep
    max_depth = [d for d in minus_headers if d.count('/') < 3]

    final_dirs = []
    for d in max_depth:
        last_dir = d.split("/")[-1]
        if contains_arch(d) and not contains_arch(last_dir):
            continue
        else:
            final_dirs.append(d)

    return final_dirs


# get list of files, usually we ignore subdirs unless we are too deep
# in which case the files get rolled into the tree.
def get_files(branch, subdir, inc_subdirs=False):
    file_tree = repo.git.ls_tree("--format", "%(objecttype) %(path)",
                                 f"{branch}",
                                 subdir + "/").rstrip().split("\n")
    files = []
    for item in file_tree:
        (thing, path) = item.split(maxsplit=1)
        if thing == 'blob':
            files.append(path)
        elif inc_subdirs and thing == 'tree':
            more_files=get_files(branch, path)
            files.extend(more_files)
    return files

# Count total lines of code for files on tip of branch
def count_loc(branch, files):
    file_counts = {"lines": 0, "blank": 0, "comment": 0, "code": 0}
    if len(files) > 0:
        for f in files:
            blank = 0
            comment = 0
            lines = 0
            code = 0

            blob = repo.rev_parse(f"{branch}:{f}")

            # skip non-text blobs
            if blob.name.endswith("bz2") or blob.name.endswith("bin"):
                print(f"skipping binary {f}")
                continue
            if blob.name.endswith("whl"):
                print(f"skipping python wheel file {f}")
                continue
            if blob.mime_type == "application/octet-stream":
                print(f"skipping binary {f}")
                continue

            # count lines using cloc
            stats = check_output(["cloc", "--csv", "--hide-rate", "--quiet",
                                  f"--stdin-name={blob.name}", "-"],
                                 input=blob.data_stream.read(),
                                 stderr=STDOUT).decode().strip()

            # fallback on cloc failure
            if not stats or "error" in stats:
                wc = check_output(["wc", "-l",], input=blob.data_stream.read()).decode()
                lines = int(wc.strip())
                comment = lines
            else:
                totals = stats.split("\n")[-1].split(",")
                blank = int(totals[2])
                comment = int(totals[3])
                code = int(totals[4])
                lines = blank + comment + code

            file_counts["lines"] += lines
            file_counts["blank"] += blank
            file_counts["comment"] += comment
            file_counts["code"] += code

    return file_counts

def get_log(branch, base, directory_or_files):
    if directory_or_files:
        log = repo.git.log("--format=%h", f"{base}..{branch}", "--", directory_or_files)
        if log:
            return log.split("\n")
    return []

# get changes from base..branch for a directory or set of files
def get_changes(branch, base, directory, files, inc_subdirs):
    changed = 0
    if inc_subdirs:
        log = get_log(branch, base, directory)
    else:
        log = get_log(branch, base, files)

    if not log:
        return { "commits": 0, "changed": 0 }

    if cherry_filter == "yes":
        log_commits = set(log)
        in_both = log_commits & cherry_commits
        commits = len(in_both)
    else:
        commits = len(log)

    if inc_subdirs:
       diffs = repo.git.diff("--shortstat", f"{base}..{branch}", "--", directory)
    else:
       diffs = repo.git.diff("--shortstat", f"{base}..{branch}", "--", files)

    if diffs:
        ss = ss_re.search(diffs)
        if ss:
            changed += int(ss.group(2)) if ss.group(2) else 0
            changed += int(ss.group(3)) if ss.group(3) else 0
        else:
            print(f"failed to parse: {diffs} for {base}..{branch} -- {files}")

    return { "commits": commits, "changed": changed }

# get a list of arches
for config in get_files(branch, 'configs/targets'):
    m = arch_re.match(config)
    if m and m.group(1) not in arch:
        arch.append(m.group(1))
# these get missed because of numeric prefixes
arch.extend(["riscv", "loongarch"])

all_dirs = repo.git.ls_tree("-r", "-d", "--name-only", f"{branch}").rstrip().split("\n")
all_dirs = filter_dirs(all_dirs)
all_dirs.insert(0, ".")

rows = 0

for d in all_dirs:
    subdirs = d.split("/")
    first_dir = subdirs[0]
    second_dir = ""
    last_dir = ""
    has_subdirs = False

    if len(subdirs) > 1:
        second_dir = subdirs[1]
    if len(subdirs) > 2:
        last_dir = subdirs[2]

    if last_dir != "" or contains_arch(second_dir):
        files_without_subdirs = get_files(branch, d, False)
        files = get_files(branch, d, True)
        has_subdirs = len(files) > len(files_without_subdirs)
    else:
        files = get_files(branch, d, False)

    file_counts = count_loc(branch, files)
    changes = get_changes(branch, base, d, files, has_subdirs)

    out = [ first_dir, second_dir, last_dir,
            len(files), has_subdirs,
            file_counts["lines"],
            file_counts["blank"],
            file_counts["comment"],
            file_counts["code"],
            changes["commits"],
            changes["changed"] ]
    # print(out)
    csv_out.writerow(out)
    csv_fd.flush()
    rows += 1

csv_fd.flush()
csv_fd.close()
print(f"Wrote {rows} rows to {cdata}")

Clean up the dataframe. For treemap and sunburst to work properly leaf nodes should have None set. This involves loading the dataframe and then detecting which rows are parents and selectively setting None for rows which are not parents.

This all assumes df is the pandas dataframe.
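
As a tiny illustration of what we are aiming for (the rows are made up): a directory that has children keeps its empty second/third columns as '', while a true leaf gets None so plotly treats it as a leaf of the path:

import pandas as pd

# made-up example: "docs" is a leaf at the top level, "hw" is a parent
df = pd.DataFrame({"first":  ["docs", "hw", "hw"],
                   "second": ["",     "",   "arm"],
                   "third":  ["",     "",   ""]})

# after the clean-up below we expect:
#   docs   -> second=None, third=None   (leaf)
#   hw     -> second='',   third=''     (parent row, kept as '')
#   hw/arm -> third=None                (leaf under hw)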

#
# Calculate "parent" nodes
#
# We need to do this so we can cleanly set the hierarchy for the treemap where
# we set "None" for leaf nodes that don't have a second or third level.
#
df['is_parent'] = False

# Identify first-level parents (rows where first has children)
parents_l1 = set(df.loc[df['second'] != '', 'first'])
df.loc[(df['first'].isin(parents_l1)) & (df['second'] == ''), 'is_parent'] = True

# Identify second-level parents (rows where (first, second) has children)
parents_l2 = set(df.loc[df['third'] != '', ['first', 'second']].itertuples(index=False, name=None))
df.loc[(df.set_index(['first', 'second']).index.isin(parents_l2)) & (df['third'] == ''), 'is_parent'] = True

# Set empty strings to None, but only for rows that are not parents
df.loc[(df['is_parent'] == False) & (df['second'] == ''), 'second'] = None
df.loc[(df['is_parent'] == False) & (df['third'] == ''), 'third'] = None

Plot the treemap of directory sizes - old style, as an HTML map with plotly

import plotly.express as px
import numpy as np
import pandas as pd
import csv

cols = ['first', 'second', 'third', 'files', 'inc_subdir', 'lines', 'blank', 'comment', 'code', 'commits', 'changed']
# use keep_default_na so we don't interpret empty columns as NaN, breaking treemap
df = pd.read_csv(cdata, usecols=cols, keep_default_na = False)

<<setup-parents>>

df["root"] = "QEMU"

# create treemap figure
fig = px.treemap(df, path=['root', 'first', 'second', 'third'], values='code',
                 hover_data=['files', 'lines', 'code', 'comment'])

fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))

# Add figure title
fig.update_layout(
     title_text="QEMU by Lines of Code"
)

fig.write_html(graph)
print (f"written to {graph}")

Plot the treemap of commit and change activity with plotly:

import plotly.express as px
import numpy as np
import pandas as pd
import csv
from matplotlib.colors import ListedColormap

cols = ['first', 'second', 'third', 'files', 'inc_subdir', 'lines', 'blank', 'comment', 'code', 'commits', 'changed']
# use keep_default_na so we don't interpret empty columns as NaN, breaking treemap
df = pd.read_csv(cdata, usecols=cols, keep_default_na = False)

# sanitise the commit data to prevent zeros
# df = df[df["changed"]!=0]
# df = df[df["lines"]!=0]
# df = df[df["commits"]!=0]
df['changed'] = np.where(df['changed'] == 0, 1, df['changed'])
df['commits'] = np.where(df['commits'] == 0, 1, df['commits'])

<<setup-parents>>

# rename root
df.loc[df['first'] == '.', 'is_parent'] = True
df.loc[df['first'] == '.', 'first'] = ''

# calc commits
avg_commits = np.average(df['commits'])
max_commits = np.max(df['commits'])
median_commits = np.median(df["commits"])

linaro_colors = ["#6715e8", "#ffcf00"]
linaro_cmap = ListedColormap(linaro_colors, name="linaro_cmap")

df["root"] = "QEMU"
# for some reason we can't just do first/second/third
df["directory"] = np.where(df["third"], df["second"] + "/" + df["third"], df["second"])

# Create a clean label column for display
df["label"] = np.where(df["third"] != '', df["third"], df["second"])
df["label"] = np.where(df["label"] == '', df["first"], df["label"])

# create treemap figure
fig = px.treemap(df, labels = df['label'],
                 path=['root', 'first', 'directory'],
                 values='changed',
                 hover_data=['lines', 'commits'],
                 range_color=[0, max_commits],
                 color='commits')

# fig = px.treemap(df, path=['first', 'second', 'third'], values='changed',
#                  hover_data=['lines', 'commits'],
#                  range_color=[0, max_commits],
#                  color='commits')

fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))

# Add figure title
fig.update_layout(
    title_text="Commit and change activity"
)

# show treemap
#fig.show()
fig.write_html(graph)
print (f"written to {graph}")

Plot the sunburst chart of QEMU code size:

import plotly.express as px
import pandas as pd
import numpy as np
import os

# make sure we have somewhere to put it
os.makedirs(os.path.dirname(graph), exist_ok=True)

cols = ['first', 'second', 'third', 'files', 'inc_subdir', 'lines', 'blank', 'comment', 'code', 'commits', 'changed']
df = pd.read_csv(cdata, usecols=cols, keep_default_na=False)

<<setup-parents>>

df["root"] = "QEMU"

# Now plot the sunburst chart
fig = px.sunburst(df,
                  path=['root', 'first', 'second', 'third'],
                  values='code',
                  hover_data=['files', 'lines'])

fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.update_layout(title_text="Size of QEMU (loc) - Sunburst")

fig.write_html(graph)
print(f"written to {graph}")

Ideas and Brainstorming

The talk length is 15 minutes, half the usual slot, as it’s a quick keynote. I doubt there will be questions built in as it’s a state-of-the-nation type talk about the project.

So useful things we can graph:

  • a plot of mailing list iteration before merge?

    we have Message-id tags in a lot of commits. If we extract them we can grab the original patch it was pulled from and then look at the subject header and grab the version.

  • another interesting piece could be the time it takes a first-time contributor to get their patch merged. We could then expand to a general graph of average time to merge vs experience of developer.

    (side note: check b4’s ability to cache so we don’t spam lore.kernel.org with too many requests).

  • a track of total build objects

    now that we have the script ./.gitlab-ci.d/check-units.py we can track the total source and build units over time. It would probably be best just to iterate over the merge commits, trigger a build and then record the two values. This can tie in with the single binary talk.

  • it might be worth investigating the popular treemap visualisation from my talk at KVM Forum 2023. I may need to tweak the scripts though to handle a deeper depth. I should also investigate alternative renderings and think about the portability into traditional slides (whereas the other approach required a web host).
  • we will need to summarise some key features over the last year or so. Maybe we can do some sort of commit activity graph over the treemap?
  • we can fall back on traditional stats or a developer survey but I’m not sure what it would add. (drop unless we are running short)

Message-Ids

We need to think about how to gather this data; the first step should be to count the total number of Message-Ids in the commits for the last year.

git log -i --since "last year" --grep="Message-Id" | grep -i "message-id" | wc -l

Collecting the data

We need to collect the data. b4 is a useful tool for this but maybe lei is better suited to it. Trouble is there don’t seem to be many examples of using lei on the web, save a few blog posts.

We also have my own mail index so maybe we can just query via mu?
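
A minimal sketch of what the mu lookup could look like (the message-id here is a made-up placeholder; the -u and -f d,s flags are the same ones the collection script below uses):

from subprocess import check_output, CalledProcessError

# hypothetical message-id - real ones come from the commit trailers
mid = "20250101120000.1234-1-someone@example.com"

try:
    # -u skips duplicates, -f d,s prints the date and subject
    out = check_output(["mu", "find", "-u", f"i:{mid}", "-f", "d,s"])
    date, subject = out.decode().rstrip().split(",", 1)
    print(f"{date}: {subject}")
except CalledProcessError:
    print("not found in the local mu index")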

Steps

  • extract message id from qemu log
  • extract the author, subject line and count the r-b and s-o-b tags
  • TODO: track the author first commit and calculate an experience level for author
  • append to a data table we will return to org-mode
  • repeat for the remaining message ids
  • return data table

As this is a long-term process we should gather as much data as we can while iterating. This would include fetching the author’s first commit so we can calculate their experience level.
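
For the experience level we could ask git for the author’s oldest commit. This is just a sketch of the idea (the author name is only an example), not something the script below does yet:

import git

repo = git.Repo(".")  # assumes we are inside a QEMU checkout

def first_commit_date(author):
    # --reverse lists oldest first, %as is the short author date (git 2.21+)
    dates = repo.git.log("--reverse", "--format=%as",
                         f"--author={author}").splitlines()
    return dates[0] if dates else None

print(first_commit_date("Alex Bennée"))

The full collection loop follows: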

import re
import git
import os
import csv
from subprocess import check_output, CalledProcessError

os.makedirs(os.path.dirname(cdata), exist_ok=True)
csv_writter = csv.writer(open(cdata, 'w'))

# write the header
csv_writter.writerow(["commit", "msgids", "in_mu", "version", "author", "sob-count", "rb-count", "ab-count", "tb-count"])

vpattern = r"\[PATCH v(\d+)"
repo = git.Repo(tree)
commit_count = 0

for commit in repo.iter_commits(since=since):
      # A commit is a merge commit if it has more than one parent
      if len(commit.parents) == 1:
            msg = commit.message.lower()

            # we could use commit.trailers_list here!
            # extract all "Message-Id" (case insentive) from
            msg_ids = re.findall(r"message-id: <(.*?)>", msg)
            sob_count = msg.count("signed-off-by:")
            rb_count = msg.count("reviewed-by:")
            ab_count = msg.count("acked-by:")
            tb_count = msg.count("tested-by:")

            # we should see if mu can find the msg_ids
            in_mu = False
            version = "1"
            if len(msg_ids) > 0:
                  for mid in msg_ids:
                        cmd = f"mu find -u i:{mid} -f d,s"
                        try:
                              (date, subject) = check_output(cmd, shell=True).decode().rstrip().split(",", 1)
                              in_mu = True
                        except CalledProcessError:
                              # not in mu, maybe try lei? skip version
                              # detection as we have no subject to parse
                              continue

                        ver_str = re.search(vpattern, subject)
                        if ver_str:
                              if version != "1":
                                    version = f"{version}+{ver_str.group(1)}"
                              else:
                                    version = ver_str.group(1)
                        
            csv_writter.writerow([commit.hexsha[:7], len(msg_ids), in_mu, version, commit.author.name, sob_count, rb_count, ab_count, tb_count])
            commit_count += 1

print(f"Wrote {commit_count} rows to {cdata}")

Last three years of author and commit data

:header-args+: :var SINCE="2022-09-01" :header-args+: :var UNTIL="2025-09-01"

d=$SINCE
while [ "$(date -d "$d" +%Y%m%d)" -lt "$(date -d "$UNTIL" +%Y%m%d)" ]; do
  n=$(date -I -d "$d + $inc")
  commits=$(git log --no-merges --pretty=oneline --since="$d" --until="$n" "$branch" | wc -l)
  authors=$(git shortlog -sn --all --since="$d" --until="$n" "$branch" | wc -l)
  last=$(git rev-list -1 --before="$n" origin/master)
  echo "$d, $n, $commits, $authors, $last"
  d=$n
done

Plot the commits and authors per month:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
from datetime import datetime

# make sure we have somewhere to put it
os.makedirs(os.path.dirname(graph), exist_ok=True)

column_names = ['start_date', 'end_date', 'commits', 'authors', 'hash']
df = pd.DataFrame(data, columns=column_names)
df['start_date'] = pd.to_datetime(df['start_date'])
df.set_index('start_date', inplace=True)

# the data is quite noisy so lets do a rolling average over the release cycle (4 months)
df['rolling_commits'] = df['commits'].rolling(window=4).mean()
df['rolling_authors'] = df['authors'].rolling(window=4).mean()

# XKCD Style
plt.xkcd(scale=0.5, length=400, randomness=2)
plt.rcParams['font.family'] = [ 'Humor Sans' ]

fig, ax = plt.subplots(figsize=(12,6))

# Thin the x-axis ticks to every 3rd month and format as MM/YY
xticks = df.index[::3]
ax.set_xticks(xticks)
ax.set_xticklabels([x.strftime('%m/%y') for x in xticks])
ax.set_ylim(0, 1500)

# bar plot of commits per-month
bars = ax.bar(df.index, df['rolling_commits'], align='center',
              width=20, label='Commits')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Commits', color='blue')
ax.tick_params(axis='y', labelcolor='blue')

# Line plot of contributors
ax2 = ax.twinx()
line = ax2.plot(df.index, df['rolling_authors'], color='red',  marker='o', linewidth=2, label='Authors')
ax2.set_ylabel('Number of Authors', color='red')
ax2.set_ylim(0, 200)
ax2.tick_params(axis='y', labelcolor='red')

ax.set_title('Commits and Authors')
plt.savefig(graph, transparent=True) # save graph
plt.close(fig)
print(f"Plotted graph to {graph}")

HackerNews mentions

As an intro into the section on AI code I want to present mentions from Hacker News. There is an API I can use to search HN and get the stories:

import re
import json
import requests
from subprocess import check_output

# convert since into unix time (equivalent to date --date "last year" +'%s')
# egregious hack to avoid major stories dropping out
since_unix = check_output(["date", "--date", "last year - 5 days", "+%s"]).decode().rstrip()

# Search HackerNews API for QEMU mentions
# url like: GET http://hn.algolia.com/api/v1/search?query=QEMU&tags=story&numericFilters=created_at_i>1719827845
base_url =  "http://hn.algolia.com/api/v1/search"
params = {
    "query": "QEMU",
    "tags": "story",
    "numericFilters": f"created_at_i>{since_unix}"
}

# Construct the URL with parameters
response = requests.get(base_url, params=params)
response.raise_for_status() # Raise an exception for HTTP errors

# Get stories in json form
data = response.json()

stories = []
for hit in data.get('hits', []):
    story_id = hit.get('objectID')
    points = hit.get('points')
    num_comments = hit.get('num_comments')
    title = hit.get('title')

    if all([story_id, points is not None, num_comments is not None, title]):
        stories.append((story_id, points, num_comments, title))

# We are missing one story we have to add manually
stories.append((44382752, 551, 413, "define policy forbidding use of AI code generators"))
stories.sort(key=lambda x: x[1])  
return stories

Gah, I should not have re-run this after putting the talk to bed. Oh well…

import matplotlib.pyplot as plt
import numpy as np
import os

# make sure we have somewhere to put it
os.makedirs(os.path.dirname(graph), exist_ok=True)

# Sort by total points and keep the top six stories
sorted_data = sorted(data, key=lambda item: item[1], reverse=True)
top_stories = sorted_data[:6]
points = [item[1] for item in top_stories]
titles = [item[3] for item in top_stories]
keys = ['a', 'b', 'c', 'd', 'e', 'f']

# XKCD Style
plt.xkcd(scale=0.5, length=200, randomness=2)
plt.rcParams['font.family'] = ['Humor Sans']

fig, ax = plt.subplots()
ax.barh(keys, points, align='center')
ax.set_yticks(keys)
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel('Points')
ax.set_title('Hacker News QEMU Stories')

plt.savefig(graph, transparent=True) # save graph
plt.close(fig)
print(f"Plotted graph to {graph}")
for k, t in zip(keys, titles):
    print(f"{k} -> {t}")

Replay Activity

Thinking about how to track the activity in the replay subsystem is complicated by the fact it interacts with the rest of the hardware emulation. We could look for commits where “replay_” appears in the diff, but there are other changes where “foo_replay_something” appears which are unrelated.
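
To illustrate the anchoring, the pattern used below only matches when the symbol starts directly after whitespace, so a prefixed name is skipped (the diff lines here are made up):

import re

replay_api_pattern = re.compile(r'\s(replay_\w+)[\s*=|\(]')

hit  = "+    replay_add_event(REPLAY_ASYNC_EVENT_BH, bh, NULL, 0);"
miss = "+    foo_replay_something(dev);"

print(replay_api_pattern.search(hit))    # matches replay_add_event
print(replay_api_pattern.search(miss))   # None - prefixed symbol skipped

The scan over the last year of commits: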

import re
import git
from subprocess import check_output, CalledProcessError

replay_api_pattern = re.compile(r'\s(replay_\w+)[\s*=|\(]')
repo = git.Repo(tree)

# results data
data = []

for commit in repo.iter_commits(since=since):
      # A commit is a merge commit if it has more than one parent
      if len(commit.parents) == 1:
            diff = repo.git.diff(commit, commit.parents[0], unified=0)
            hits = replay_api_pattern.findall(diff)
            if hits:
                  data.append((commit.hexsha[:8],
                               commit.committed_date,
                               commit.author.name,
                               commit.summary,
                               ",".join(hits)))

return data

Plot the replay commits binned by month:

import matplotlib.pyplot as plt
import numpy as np
import os
from datetime import datetime

# make sure we have somewhere to put it
os.makedirs(os.path.dirname(graph), exist_ok=True)

# 1. Convert timestamps to "MM YY" strings
month_year_strings = []
dates = [row[1] for row in data]
# the data is in order, but needs reversing so we get oldest first
dates.reverse()

for timestamp in dates:
    dt_object = datetime.fromtimestamp(timestamp)
    month_str = dt_object.strftime("%b") # Short month name (e.g., "Oct")
    year_str = dt_object.strftime("%y")  # Last two digits of the year (e.g., "23")
    month_year_str = f"{month_str} {year_str}"
    month_year_strings.append(month_year_str)

# 2. Bin by month
commit_counts = {}
for month_year_str in month_year_strings:
    commit_counts[month_year_str] = commit_counts.get(month_year_str, 0) + 1

# 3. Prepare data for plotting
months = commit_counts.keys()
counts = [commit_counts[month] for month in months]

# XKCD Style
plt.xkcd(scale=0.5, length=400, randomness=2)
plt.rcParams['font.family'] = [ 'Humor Sans' ]

fig, ax = plt.subplots()

ax.bar(months, counts, align='center')

ax.set_xlabel('Month')
ax.set_title('Replay Commits')
plt.savefig(graph, transparent=True) # save graph
plt.close(fig)
print(f"Plotted graph to {graph}")

Rust Updates

git log --no-merges --oneline --date=format:%m/%y --pretty=format:"%H, %cd, %f" origin/master rust

Plot the Rust commits and contributors per month:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import git
import re
from datetime import datetime

# make sure we have somewhere to put it
os.makedirs(os.path.dirname(graph), exist_ok=True)

name_email_re = re.compile(r"^(?P<name>[^<]+)\s*<(?P<email>[^>]+)>$")

repo = git.Repo(tree)

# reverse the order of the data, oldest to newest
data.reverse()

# We will create a dict indexed by MM/YY and in each of those buckets add
# the commit objects

months = {}
for githash, month, _ in data:
    commit = repo.commit(githash)

    if month in months:
        months[month].append(commit)
    else:
        months[month] = [ commit ]

# Collect the data for plotting
keys = months.keys()
counts = []
contribs = []

for month in months.values():
    counts.append(len(month))
    contrib_month = []
    for c in month:
        for k, v in c.trailers_list:
            if name_email_re.match(v):
                contrib_month.append(v)
    contribs.append(len(set(contrib_month)))

# Normalise into a panda dataframe
df = pd.DataFrame({'counts': counts, 'contribs': contribs},
                  index=pd.to_datetime(list(keys), format='%m/%y'))
    
# XKCD Style
plt.xkcd(scale=0.5, length=400, randomness=2)
plt.rcParams['font.family'] = [ 'Humor Sans' ]

fig, ax = plt.subplots()

# Thin the x-axis ticks to quarterly (every 3 months)
xticks = df.index[::3]
ax.set_xticks(xticks)
ax.set_xticklabels([x.strftime('%m/%y') for x in xticks])

# bar plot of commits per-month
bars = ax.bar(df.index, df['counts'], align='center',
              width=20, label='Commits')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Commits', color='blue')
ax.tick_params(axis='y', labelcolor='blue')

# Line plot of contributors
ax2 = ax.twinx()
line = ax2.plot(df.index, df['contribs'], color='red',  marker='o', linewidth=2, label='Active Contributors')
ax2.set_ylabel('Active Contributors', color='red')
ax2.set_ylim(0, 10)
ax2.tick_params(axis='y', labelcolor='red')

ax.set_title('Rust Commits and Contributors')
plt.savefig(graph, transparent=True) # save graph
plt.close(fig)
print(f"Plotted graph to {graph}")

Downstream Forks

Analyse iPhone Fork

Fork Data

We want to analyse the downstream forks of QEMU. I propose we use the github API to query the forks.

  • get the list of forks
  • for each fork
    • find the branch most adrift from upstream
    • save that data
  • then sort and pick the top N forks
  • plot some sort of diagram with the delta indicated somehow

from git import Repo
from github import Github
from time import sleep
from dateutil import parser
import requests
import csv
import os

QEMU_OWNER = "qemu"
QEMU_REPO = "qemu"
TOP_N_REPOS = 50

# local copy
repo = Repo(tree)
# API access
g = Github(apikey)

# upstream repo
upstream = g.get_user(QEMU_OWNER).get_repo(QEMU_REPO)

# Fetch all the forks, get the most popular by stargazers and watchers
forks = list(upstream.get_forks())
sorted_forks = sorted(forks, key=lambda fork: fork.stargazers_count,  reverse=True)
top_stargazers = sorted_forks[:TOP_N_REPOS]
# sorted_forks = sorted(sorted_forks[TOP_N_REPOS:], key=lambda fork: fork.watchers_count,  reverse=True)
# top_watched = sorted_forks[:TOP_N_REPOS]
# remaining = sorted_forks[TOP_N_REPOS:]

# forks_of_interest = top_stargazers + top_watched
forks_of_interest = top_stargazers

os.makedirs(os.path.dirname(fork_data), exist_ok=True)
csv_fd = open(fork_data, 'w')
csv_writter = csv.writer(csv_fd)

# write the header
csv_writter.writerow(["repo name", "branch", "base tag", "base date", "commits ahead", "head date", "lines added", "lines deleted"])

forks_processed = 0

# For the sake of brevity we will only look at the default branch and
# assume that is the main branch of interest. We then:
#  - generate a compare object between master..DEFAULT_HEAD
#  - record how ahead the branch is and total diff
#  - record the base tag (i.e. version)
#  - then write this out
for f in forks_of_interest:
    branch = f.get_branch(f.default_branch)
    compare = upstream.compare(upstream.default_branch, f"{branch.commit.sha}")

    ahead = compare.ahead_by

    # skip empty branches
    if ahead == 0:
        continue

    # ignore stuff likely to be a maintainer branch
    if branch.name == "staging" or branch.name.endswith("-next"):
        continue

    # ensure we have the up-to-date info on the commit
    compare.merge_base_commit.update()

    merge_base = compare.merge_base_commit.sha
    tag = repo.git.describe("--tag", merge_base)
    tag_date = parser.parse(compare.merge_base_commit.last_modified)
    diff = requests.get(compare.diff_url)

    # trigger load of rest of details

    branch.commit.complete()
    head_date = parser.parse(branch.commit.last_modified)

    lines = diff.text.splitlines()
    plus = sum(1 for l in lines if l.startswith("+"))
    minus = sum(1 for l in lines if l.startswith("-"))

    csv_writter.writerow([f.full_name, branch.name,
                          tag, tag_date.strftime("%d/%m/%Y"),
                          ahead,
                          head_date.strftime("%d/%m/%Y"),
                          plus, minus])
    csv_fd.flush()
    forks_processed += 1
    # avoid spamming too much
    sleep(0.5)

csv_fd.flush()
csv_fd.close()
print(f"Wrote {forks_processed} rows to {fork_data}")

Unfortunately github doesn’t track all forks unless they are generated by pressing the Fork button. As a result there are a number of well known repos that don’t appear to be based off QEMU despite having a common base. There are also repos hosted elsewhere.

To solve this I’m tracking a number of the well-known forks in my local repository (all named forks/project-name) which have their remotes correctly set. So we will gather that data and append it to the CSV, synthesising the repo name as we do.

from git import Repo
import csv
import os
import re

# local copy
repo = Repo(tree)

# We are appending to the github data
csv_fd = open(fork_data, 'a')
csv_writter = csv.writer(csv_fd)

fork_branches = [branch.name
                 for branch in repo.branches
                 if branch.name.startswith("forks/")]

for bname in fork_branches:

    rname = bname.split("/")[-1]
    head = repo.heads[bname]
    head_dt = head.commit.committed_datetime

    # work out merge_base and its meta-data
    merge_base = repo.merge_base("origin/master", head)
    base_tag = repo.git.describe("--abbrev=0", "--tags", merge_base)
    base_tag_dt = repo.tags[base_tag].commit.committed_datetime

    # calculate diffs against the merge base
    diffstat = repo.git.diff('--shortstat', merge_base[0], head)
    m = re.search(r'(\d+)\s+insertions', diffstat)
    plus = int(m.group(1)) if m else 0
    m = re.search(r'(\d+)\s+deletions', diffstat)
    minus = int(m.group(1)) if m else 0

    # calculate number of non-upstream commits
    # git cherry will tell us
    ahead = 0
    cherry_commits = repo.git.cherry("origin/master", bname).splitlines()
    unpicked_commits = [commit for commit in cherry_commits if commit.startswith('+')]
    ahead = len(unpicked_commits)

    print(f"{rname} {bname} {base_tag} {base_tag_dt.strftime("%d/%m/%Y")} {ahead} {head_dt.strftime("%d/%m/%Y")} {plus} {minus}")

    csv_writter.writerow([rname, bname,
                          base_tag, base_tag_dt.strftime("%d/%m/%Y"),
                          ahead,
                          head_dt.strftime("%d/%m/%Y"),
                          plus, minus])
    csv_fd.flush()

csv_fd.flush()
csv_fd.close()

To visualise this we are going to plot some sort of network diagram. To define one you need a number of nodes with x positions (which we will scale by date) and y widths (based on the total size of the repo).

The CSV data holds the dates of the base tag and the head commit, which we will use for the “x” co-ordinates. We will set the width of the main trunk based on the size of the tree at that tag using:

git diff-tree --shortstat 4b825dc642cb6eb9a060e54bf8d69288fbee4904 TAG
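
For reference, a stand-alone sketch of that measurement with GitPython (the tag is just an example; diffing against git’s well-known empty tree object counts every line in the tree as an insertion):

from git import Repo

EMPTY_TREE = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

repo = Repo(".")  # assumes we are inside a QEMU checkout
out = repo.git.diff_tree("--shortstat", EMPTY_TREE, "v9.0.0")
print(out)  # e.g. "NNNN files changed, NNNNNNN insertions(+)"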

So we need to:

  • read in the CSV file fork_data
  • extract the base tags/base times and create a sorted list to process
  • create the nodes in time order for the main trunk
  • add the nodes for the forks with edges routed from the base node to their head - size based on the sum of added/subtracted lines

import matplotlib.pyplot as plt
import numpy as np
import csv
import re
from datetime import datetime
from git import Repo
import networkx as nx
import matplotlib.cm as cm

repo = Repo(tree)
empty_tree = '4b825dc642cb6eb9a060e54bf8d69288fbee4904'
version_re = re.compile(r'^v\d+\.\d+\.\d+$')

nodes = []
root_nodes = []
root_tags = set()
min_root_time, max_root_time = None, None
min_head_date, max_head_date = None, None
max_lines = 0
max_ahead = 0

# Create the root nodes based on the version tags
# rts = [n.name for n in repo.tags if version_re.fullmatch(n.name)]

with open(fork_data, 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        base_date = datetime.strptime(row['base date'], '%d/%m/%Y')
        head_date = datetime.strptime(row['head date'], '%d/%m/%Y')

        # Update min/max times
        if min_root_time is None or base_date < min_root_time:
            min_root_time = base_date
        if max_root_time is None or base_date > max_root_time:
            max_root_time = base_date
        if min_head_date is None or head_date < min_head_date:
            min_head_date = head_date
        if max_head_date is None or head_date > max_head_date:
            max_head_date = head_date

        tag = row['base tag']
        lines = int(row['lines added']) - int(row['lines deleted'])
        ahead = int(row['commits ahead'])

        # determine a unique enough repo name
        reponame = row['repo name']
        if "/" in reponame:
            user, reponame = row['repo name'].split("/")
            if reponame == "qemu":
                reponame = user

        if ahead > max_ahead:
            max_ahead = ahead

        data = {
            'repo': reponame,
            'root': tag,
            'root_time': base_date,
            'head_time': head_date,
            'lines' : lines,
            'ahead' : ahead
        }
        nodes.append(data)

        # for things not based directly off a release find the base release
        if not version_re.fullmatch(tag):
            try:
                tag = repo.git.describe(
                    '--abbrev=0',
                    '--tags',
                    '--contains', f"{tag}",
                    '--match=v*.?.0')
                tag = re.split('[-^~]', tag)[0]
            except:
                # this is mostly bailing on the 10.1.0-rc's
                pass

        if version_re.fullmatch(tag) and tag not in root_tags:
            tag_commit = repo.tags[tag].commit
            diff_out = repo.git.diff_tree('--shortstat', empty_tree, tag)
            m = re.search(r'(\d+)\s+insertions', diff_out)
            total_lines = int(m.group(1)) if m else 0

            if total_lines > max_lines:
                max_lines = total_lines

            tag_dt = tag_commit.committed_datetime

            root = {
                "root" : tag,
                "root_time" : tag_dt,
                "head_time" : tag_dt,
                "repo" : "upstream",
                "total_lines" : total_lines,
                "debug" : f"{reponame}/{tag}"
            }
            root_nodes.append(root)
            root_tags.add(tag)

root_nodes.sort(key=lambda x: x['root_time'])

G = nx.Graph()

last_node = None
max_diff = 0

# process the root nodes first
for i, data in enumerate(root_nodes):
    G.add_node(data['root'], **data)
    if last_node is not None:
        line_diff = abs(data['total_lines'] -
                        last_node['total_lines'])
        data["diffstat"] = line_diff
        if line_diff > max_diff:
            max_diff = line_diff

    last_node = data

# now we know max diff we can weight the edges
last_node = None
for i, data in enumerate(root_nodes):
    if last_node is not None:
        weight = max(0.1, data["diffstat"] / max_diff)
        G.add_edge(data['root'], last_node["root"], weight = weight)

    last_node = data

# now go through the whole node list linking them to the root nodes
for i, data in enumerate(nodes):
    changed_lines = data['lines']
    ahead = data['ahead']
    
    fork_point = data['root']
    if fork_point not in root_tags:
        # find the closest root tag
        try:
            root_tag = repo.git.describe(
                '--abbrev=0',
                '--tags',
                '--contains', f"{fork_point}",
                '--match=v*.?.0')
            fork_point = re.split('[-^~]', root_tag)[0]
        except:
            pass

    if fork_point in root_tags:
        # set size based on root tag
        total_lines = G.nodes()[fork_point]["total_lines"]
        data["diffstat"] = changed_lines
        data["total_lines"] = total_lines + changed_lines

        G.add_node(data["repo"], **data)
        weight = max(0.1, ahead / max_ahead)
        G.add_edge(data["repo"], fork_point, weight = weight)

# the graphviz layout engine seems to do a pretty good job
# , args='-Gsep="+1.5"'
pos = nx.nx_agraph.graphviz_layout(G, prog="neato")

# Normalise node size based on total lines
all_total_lines = [G.nodes[n]['total_lines'] for n in G.nodes()]
node_sizes = [1600 * (lines / max_lines) for lines in all_total_lines]

# Normalise colour based on head commit ts
all_head_dates_ts = [G.nodes[n]['head_time'].timestamp() for n in G.nodes()]
min_head_date_ts = min_head_date.timestamp()
max_head_date_ts = max_head_date.timestamp()
norm_head_dates = [(ts - min_head_date_ts) / (max_head_date_ts - min_head_date_ts) for ts in all_head_dates_ts]
cmap = cm.viridis
node_colors = [cmap(val) for val in norm_head_dates]

# Normalise edge width based on size of diff
edge_widths = [G.edges[u, v]['weight'] * 20 for u, v in G.edges()]

node_labels = {n: n for n in G.nodes()}

fig, ax = plt.subplots(figsize=(14, 10))
nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color=node_colors, alpha=0.8, ax=ax)
nx.draw_networkx_edges(G, pos, width=edge_widths, edge_color='gray', arrows=True, arrowsize=20, ax=ax)
nx.draw_networkx_labels(G, pos, node_labels, font_size=8, font_color='black', ax=ax)

ax.set_title("QEMU Major Forks", size=15)

# plot the color map
if all_head_dates_ts:
    sm = cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=min_head_date_ts, vmax=max_head_date_ts))
    sm.set_array(np.array(all_head_dates_ts))
    cbar = plt.colorbar(sm, ax=ax, orientation='vertical', fraction=0.02, pad=0.05)
    cbar.set_label('Head Commit Timestamp (Recency)', rotation=270, labelpad=15)
    
    tick_locs = cbar.get_ticks()
    date_labels = [datetime.fromtimestamp(ts).strftime('%Y-%m') for ts in tick_locs]
    cbar.ax.set_yticklabels(date_labels)

ax.axis('off')
plt.tight_layout()
plt.savefig(graph)
plt.close(fig)
print(f"Plotted graph to {graph} @ {datetime.now().strftime("%H:%m on %d/%m")}")

VirtIO GPU

# Linaro colours
# Primary
p1col = 0x6715E8 # Linaro purple
p2col = 0xFFCF00 # Linaro gold
# Secondary
s1col = 0xff0099 # pink-ish
s2col = 0x00b2ff # cyan-ish
s3col = 0xff9900 # orange-ish
s4col = 0x6aff00 # light green
s5col = 0xc800ff # purple-ish

Diagram: [

remotew = 3
remoteh = 3

# address space width
asw = remotew / 2

# Guest OS
Remote: box ht remoteh width remotew fill s4col

box "Application" \
  fill p1col \
  with .n at Remote.n + -0.25, -0.5

down
arrow 0.25 <->

MESA: box "Mesa 3D" fill p2col

down
arrow 0.50 <->

GPU: box "virtio-gpu" "driver" \
  fill p2col

right
arrow 0.25 <->

GEM: box "GEM/TTM" "memory" \
  fill p2col


# Draw bounding dotted lines after
# mark the kernel/userspace boundary
box with s at s of Remote width remotew ht remoteh/3 dotted
text with w at w of last box "Kernel"

box with n at n of Remote width remotew ht remoteh - remoteh/3 dotted
text with w at w of last box "Userspace"

# Now draw the guest physical address space

right

GPA: box ht remoteh * 2 width asw dotted  \
  with .n at Remote.n + 3, 1.5 \

line invisible \
  from GPA.nw + 0, 0.25 to GPA.ne \
  "Guest Physical Address Space" above

down

PCIB: box thin \
  ht 1.25 width asw dotted \
  with .n at GPA.s + 0, 2

PCIBTOP: line thin color gray \
  right from 0.25 e of PCIB.ne

PCIBBOT: line thin color gray \
  right from 0.25 e of PCIB.se

arrow <-> thin \
  from 1/2<PCIBTOP.start, PCIBTOP.end> \
  down until even with PCIBBOT.s \
  "PCI" aligned "BAR" aligned color gray

GRAM: box thin "Guest RAM" \
  ht 1 width asw \
  with .n at GPA.n + 0, -1 \
  fill s4col

arrow -> dotted \
  from GRAM.w to Remote.e

# blob memory in PCI Bar, accesses by GEM

GVRAM: box thin \
  ht 0.30 width asw \
  with .w at PCIB.w + 0, 0.25 \
  fill s4col \
  "virtio-gpu"

BLOB: box  \
  ht 0.2 width asw / 4 thin \
  with .w at GVRAM.w + 0.05, 0 \
  fill p2col


# Host Physical Address Space

HPA: box ht remoteh * 2 width asw dotted  \
  with .n at GPA.n + 2.5, 0 \

line invisible \
  from HPA.nw + 0, 0.25 to HPA.ne \
  "Host Physical Address Space" above

HRAM: box thin "Host RAM" \
  ht 2 width asw \
  with .n at HPA.n + 0, -1 \
  fill s2col

GHRAM: box thin width asw  \
  with nw at HRAM.nw + 0, -0.25 \
  fill s4col \
  "Guest"

# show guest RAM as portion of host ram
arrow <- dotted \
  from GRAM.e to GHRAM.w 


HPCIB: box thin \
  ht 1.25 width asw dotted \
  with .n at HPA.s + 0, 2

HVRAM: box thin \
  ht 0.75 width asw \
  with .w at HPCIB.w \
  fill s2col \
  "GPU" "VRAM"

HBLOB: box thin ht 0.25 width asw / 4 \
  with .w at HVRAM.w + 0.1, 0 \
  fill p2col

# arrows to show blob memory
arrow -> dotted \
  from HBLOB.s \
  down 0.1 \
  then left until even with BLOB.s \
  then to BLOB.s

arrow -> dotted \
  from BLOB.w to GEM.e

]

QEMU 10.1.0 Wordcloud

git log --no-merges --pretty="format:%s" v10.0.0..v10.1.0 > $titles
cat $titles |
    tr '[:upper:]' '[:lower:]' |
    tr ' :/-' '\n' |
    grep -v -e '^$' | # blanks
    grep -v "[^ a-z]" | # non-alpha
    grep -v -x -F -f $filter > $cloud
cat $cloud | sort | uniq -c | sort -rn | head -n 20

Generate the wordcloud with R:

library(wordcloud)
library(readr)
library(RColorBrewer)
words <- read_file(cloud)
par(bg=NA)
palette <- brewer.pal(8, "Dark2")
wordcloud(words, min.freq=20, scale=c(8,.5), colors=palette, random.order=FALSE)