From June 26, 2016 (python 3.5.2 release) to Aug. 31, 2016.
Python versions from 2.6 to 3.5
Without 2.7
From June 26, 2016 (python 3.5.2 release) to Aug. 31, 2016.
Python versions from 2.6 to 3.5
Without 2.7
-- https://bigquery.cloud.google.com/dataset/the-psf:pypi | |
SELECT concat( | |
date(timestamp), '_', REGEXP_EXTRACT(details.python, r'^([2-3]).[0-9].') | |
) as date_python, count(details.python) as downloads | |
FROM (TABLE_DATE_RANGE([the-psf:pypi.downloads], | |
TIMESTAMP('2016-06-26'), | |
TIMESTAMP('2016-08-31'))) | |
group by date_python |
-- https://bigquery.cloud.google.com/dataset/the-psf:pypi | |
-- https://bigquery.cloud.google.com/table/the-psf:pypi.downloads20160903 | |
SELECT concat( | |
date(timestamp), '_', REGEXP_EXTRACT(details.python, r'^([2-3].[0-9]).') | |
) as date_python, count(details.python) as downloads | |
FROM (TABLE_DATE_RANGE([the-psf:pypi.downloads], | |
TIMESTAMP('2016-06-26'), | |
TIMESTAMP('2016-08-31'))) | |
group by date_python |
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# To plot chart from csv generated by bigquery | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
plt.figure() | |
ts = pd.read_csv('download_python_version_by_day.csv') | |
ts['date'] = pd.to_datetime(ts['date']) | |
df = ts.pivot(index='date', columns='python', values='downloads') | |
#df.plot() | |
#df[[2.6, 2.7, 3.1, 3.2, 3.3, 3.4, 3.5]].plot() | |
df[[2.6, 3.1, 3.2, 3.3, 3.4, 3.5]].plot() | |
plt.show() |
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
ts = pd.read_csv( | |
'download_python_major_version_by_day.csv', parse_dates=True, | |
) | |
ts['date'] = pd.to_datetime(ts['date']) | |
df = ts.pivot(index='date', columns='python', values='downloads') | |
ax = df[[2, 3]].plot(logy=True, figsize=(12, 9)) | |
ax.set_ylabel('log(downloads)') | |
ax.set_title('Python packages downloads') | |
plt.show() |
I also think it's important for this analysis to filter down to pip
installs:
SELECT
CONCAT( DATE(timestamp), '_', REGEXP_EXTRACT(details.python, r'^([2-3].[0-9]).') ) AS date_python,
COUNT(details.python) AS downloads
FROM (TABLE_DATE_RANGE([the-psf:pypi.downloads], TIMESTAMP('2016-06-26'), TIMESTAMP('2016-08-31')))
WHERE
details.installer.name LIKE 'pip'
GROUP BY
date_python
although it doesn't have a huge impact on the results.
Also, if you just want a relative comparison (rather than absolute), it might also be better to add WHERE ... AND details.cpu IS NOT NULL
; more chance to get "actual" people installs rather than bots that are just mirroring (which might have more chance to be 2.7)
Try a log scale on the y-axis, it's as simple as
df.plot(logy=True)
(docs)