Skip to content

Instantly share code, notes, and snippets.

@marvin-roesch
Created October 21, 2017 19:05
Show Gist options
  • Save marvin-roesch/126291638fa3b1a2b55a7878b7ee8b2c to your computer and use it in GitHub Desktop.
Save marvin-roesch/126291638fa3b1a2b55a7878b7ee8b2c to your computer and use it in GitHub Desktop.
A simple scraping and analysis script to get programming language data from CurseForge projects. Includes the latest dump.

Curse Language Statistics

These are two simple Python scripts that gather some basic data about Minecraft mods on CurseForge. curselangs.py scrapes the project listing, while curselangs-stats.py evaluates the scraped data and outputs some aggregated results. Mind you, I'm far from a Python expert, so the scripts might not be the best written.

In order to save you the effort of scraping, I've dumped the latest 1.12 data (as of 2017-10-21) in a separate Gist.

The evaluated data is visualized in a Google Spreadsheet.

import heapq
import json
from collections import defaultdict
# Load the project records from the JSON dump.
with open('data-1.12.json') as data_file:
    data = json.load(data_file)

stats = defaultdict(int)
grouped_lang = defaultdict(list)
grouped_license = defaultdict(int)

# Bucket every project under its language and tally each license.
for project in data:
    grouped_lang[project["language"]].append(project)
    grouped_license[project["license"]] += 1

# Per language: record the project count and print the URLs of the three
# projects with the most downloads, highest first.
for language, members in grouped_lang.items():
    stats[language] = len(members)
    top_three = heapq.nlargest(3, members, key=lambda entry: entry['downloads'])
    top_three.sort(key=lambda entry: -entry['downloads'])
    print(language + ": " + str([entry['url'] for entry in top_three]))

# List every project recorded under the 'kotlin' language.
for project in grouped_lang['kotlin']:
    print(project['url'])

print(stats)

# Licenses, least used first, as tab-separated "name<TAB>count" lines.
print("Licenses:")
license_counts = sorted(grouped_license.items(), key=lambda pair: pair[1])
for license_name, count in license_counts:
    print(license_name + "\t" + str(count))
import http.client
import json
import operator
from collections import defaultdict
from itertools import groupby
import heapq
from bs4 import BeautifulSoup
from github import Github
from pybitbucket.repository import Repository
# --- Remote endpoints -------------------------------------------------------
# GitHub API client; the token string looks like a placeholder that has to be
# replaced with a real one before running (TODO confirm).
gh = Github('{MUH SECRIT}')
# One HTTPS connection, shared by every CurseForge page request in the script.
connection = http.client.HTTPSConnection('minecraft.curseforge.com')

# --- Listing pagination -----------------------------------------------------
# Mod listing filtered by one game version and sorted by downloads.
baseUrl = '/mc-mods?filter-game-version=2020709689%3A6580&filter-sort=downloads'
page = 1
nextUrl = baseUrl + "&page=" + str(page)

# --- Accumulators -----------------------------------------------------------
projectData = []           # one dict per inspected project
stats = defaultdict(int)   # language -> number of projects
repos = 0                  # repositories whose language was determined
def inspect_github(repo_link):
    """Look up the main language of a GitHub repository.

    The ``https://github.com/`` prefix is removed from *repo_link* and the
    remainder is passed to ``gh.get_repo``. Of the languages reported by
    ``get_languages()``, the one with the largest value wins.

    Returns the winning language name in lower case, ``"unknown"`` when no
    languages are reported, or ``"failure"`` when the lookup raises. The
    global ``repos`` counter is incremented only when a language is found.
    """
    global repos
    repo_name = repo_link.replace('https://github.com/', '')
    try:
        per_language = gh.get_repo(repo_name).get_languages()
        if len(per_language) == 0:
            print('No language found')
            return "unknown"
        dominant, _ = max(per_language.items(), key=operator.itemgetter(1))
        repos += 1
        return dominant.lower()
    except Exception:
        # Best effort: print the offending link and let the caller record it.
        print(repo_link)
        return "failure"
def inspect_bitbucket(repo_link):
    """Look up the language of a Bitbucket repository.

    The ``https://bitbucket.org/`` prefix is removed from *repo_link* and the
    remainder is passed to ``Repository.find_repository_by_full_name``. The
    language is read from the ``'language'`` entry of the result's ``data``.

    Returns the language name in lower case, ``"unknown"`` when the language
    is empty or missing, or ``"failure"`` when the lookup raises. The global
    ``repos`` counter is incremented only when a language is found.
    """
    global repos
    repo = repo_link.replace('https://bitbucket.org/', '')
    try:
        result = Repository.find_repository_by_full_name(repo)
        language = result.data['language']
        # Truthiness covers both an empty string and None; a None language
        # used to raise inside len() and was misreported as a failure.
        if not language:
            print('No language found')
            return "unknown"
        repos += 1
        # Return the language that was just fetched. The previous code
        # returned `lang.lower()`, which read an unrelated module-level name.
        return language.lower()
    except Exception:
        # Best effort: print the offending link and let the caller record it.
        print(repo_link)
        return "failure"
def _fetch_soup(path):
    """Request *path* over the shared ``connection`` and parse the HTML body."""
    connection.request('GET', path)
    response = connection.getresponse()
    # Read the whole body here: http.client needs the previous response fully
    # consumed before the same connection can send another request.
    return BeautifulSoup(response.read().decode(), 'html.parser')


def _parse_details(project_soup):
    """Return ``(downloads, licence)`` read from a project page's detail list.

    Entries that are missing or lack the expected elements leave the defaults
    (``0`` and ``'unknown'``) in place instead of raising.
    """
    downloads = 0
    licence = 'unknown'
    for detail in project_soup.select('.cf-details.project-details li'):
        label = detail.select_one('.info-label')
        if label is None:
            continue
        label_text = label.get_text().strip()
        if label_text == 'Total Downloads':
            value = detail.select_one('.info-data')
            if value is not None:
                downloads = int(value.get_text().replace(',', ''))
        elif label_text == 'License':
            licence_link = detail.select_one('.info-data a')
            if licence_link is None:
                continue
            licence_text = licence_link.string
            if licence_text is None:
                # No direct text on the anchor: fall back to the title
                # attribute of the nested <span>.
                span = licence_link.select_one('span')
                licence_text = span.get('title') if span is not None else None
            if licence_text is not None:
                licence = licence_text.strip()
    return downloads, licence


# Walk the listing page by page and inspect every project linked from it,
# until a page has no "next" link.
while nextUrl is not None:
    soup = _fetch_soup(nextUrl)
    for project in soup.select('ul.project-listing .name-wrapper a'):
        projectUrl = project['href']
        print('Inspecting ' + projectUrl)
        projectSoup = _fetch_soup(projectUrl)

        lang = "unknown"
        repoLink = None
        for link in projectSoup.select('.e-header-nav .e-menu a'):
            # get_text() rather than .string: .string is None for an anchor
            # with nested markup, and `"Source" not in None` raises TypeError.
            if "Source" not in link.get_text():
                continue
            href = link.get('href', '')
            # Remember only links that were actually inspected, so a failure
            # notice always names the repository that failed.
            if "github" in href:
                repoLink = href
                lang = inspect_github(href)
            elif "bitbucket" in href:
                repoLink = href
                lang = inspect_bitbucket(href)

        downloads, licence = _parse_details(projectSoup)

        notice = 'none'
        if lang == 'failure':
            notice = 'Failed: ' + repoLink
            lang = 'unknown'
        projectData.append({
            'url': 'https://minecraft.curseforge.com' + projectUrl,
            'language': lang,
            'downloads': downloads,
            'license': licence,
            'notice': notice
        })

    print("Done inspecting page " + str(page))
    next_button = soup.select_one('.listing-header .paging-list a[rel="next"]')
    nextUrl = None if next_button is None else next_button["href"]
    page += 1
# Group the scraped projects by language. itertools.groupby only merges
# *consecutive* items with equal keys, and projectData is not ordered by
# language, so it kept just the last run of each language. Collect every
# project into a per-language list instead.
grouped = {}
for entry in projectData:
    grouped.setdefault(entry['language'], []).append(entry)

# Per language: record the project count and print the URLs of the three
# most-downloaded projects, highest first.
for l, ps in grouped.items():
    stats[l] = len(ps)
    top_three = heapq.nlargest(3, ps, key=lambda p: p['downloads'])
    print(l + ": " + str([p['url'] for p in top_three]))

# Write the raw per-project records to the JSON dump.
with open('data-1.12.json', 'w') as outfile:
    json.dump(projectData, outfile, indent=4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment