Last active
August 19, 2020 12:47
-
-
Save imankulov/9b7ea86059aa2c1048d003e90541d807 to your computer and use it in GitHub Desktop.
Analyze the percentage of the topic-specific StackOverflow questions for each programming language
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
An ad-hoc script to analyze the percentage of the topic-specific questions for | |
each programming language. | |
For each language, the script downloads two pages and looks for the number of questions | |
in the title, like "NNN,NNN questions". For example, for "python" and | |
an extra tag "architecture", the script downloads | |
https://stackoverflow.com/questions/tagged/python and | |
https://stackoverflow.com/questions/tagged/python+architecture. | |
The "Tag Popularity" value is the number of tag-specific questions per 1000
questions asked about the language.
The script saves the output in a CSV file with the name of the tag. | |
The output for "architecture.csv" is available here: https://chart-studio.plotly.com/~RomanImankulov/55 | |
""" | |
import re | |
import sys | |
import requests | |
from lxml.etree import HTML | |
import pandas as pd | |
from urllib.parse import quote | |
# Stack Overflow tags of the programming languages compared by the script.
language_tags = [
    "c#",
    "java",
    "c++",
    "python",
    "ruby",
    "javascript",
    "php",
    "go",
    "rust",
    "kotlin",
    "swift",
]
def analyze_languages(languages, secondary_tag):
    """Collect the tag statistics of every language into one DataFrame.

    Each language contributes one row, built by ``analyze_language`` for the
    given ``secondary_tag``.
    """
    records = [
        analyze_language(language, secondary_tag) for language in languages
    ]
    return pd.DataFrame.from_records(records)
def analyze_language(language, secondary_tag):
    """Return the question statistics for one language and a secondary tag.

    The result is a dict holding the language, the number of questions tagged
    with it, the number of those that also carry ``secondary_tag``, and the
    "tag popularity": tagged questions per 1000 questions of the language.
    The popularity is reported as 0.0 when the language has no questions.
    """
    questions = get_question_count([language])
    questions_with_tag = get_question_count([language, secondary_tag])
    # A language tag without any questions would otherwise raise
    # ZeroDivisionError and abort the whole run.
    if questions:
        tag_popularity = questions_with_tag * 1000 / questions
    else:
        tag_popularity = 0.0
    return {
        "language": language,
        "questions": questions,
        "question_with_tag": questions_with_tag,
        "tag_popularity": tag_popularity,
    }
def get_question_count(tags):
    """Return the number of questions, tagged with provided tags.

    Downloads the Stack Overflow listing page for the combination of ``tags``
    and parses the question counter found in its main bar.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        IndexError: if the counter element is missing from the page.
    """
    formatted_tags = quote(" ".join(tags))
    url = f"https://stackoverflow.com/questions/tagged/{formatted_tags}"
    # Without a timeout a stalled connection would hang the script forever.
    resp = requests.get(url, timeout=30)
    # Fail loudly on error responses instead of trying to parse them.
    resp.raise_for_status()
    tree = HTML(resp.text).getroottree()
    xpath_selector = '//div[@id="mainbar"]//div[contains(@class, "mr12")]'
    matches = tree.xpath(xpath_selector)
    if not matches:
        raise IndexError(f"question counter not found on {url}")
    count_text = matches[0].text.strip()
    # The counter looks like "NNN,NNN questions"; join the digit groups.
    return int("".join(re.findall(r"\d+", count_text)))
if __name__ == "__main__":
    # The secondary tag is the only required command-line argument; exit with
    # a usage hint instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} SECONDARY_TAG")
    secondary_tag = sys.argv[1]
    df = analyze_languages(language_tags, secondary_tag)
    # The output file is named after the tag, e.g. "architecture.csv".
    df.to_csv(f"{secondary_tag}.csv", index=False)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
language | questions | question_with_tag | tag_popularity | |
---|---|---|---|---|
c# | 1429283 | 4751 | 3.324044293537389 | |
java | 1705257 | 13349 | 7.828145552254 | |
c++ | 687722 | 10857 | 15.786902265741098 | |
python | 1514427 | 9523 | 6.288186885204767 | |
ruby | 214977 | 813 | 3.7817999134791163 | |
javascript | 2070667 | 4616 | 2.2292333822869637 | |
php | 1366946 | 2128 | 1.5567549852005858 | |
go | 48037 | 174 | 3.622207881424735 | |
rust | 17761 | 35 | 1.970609762963797 | |
kotlin | 44392 | 48 | 1.081275905568571 | |
swift | 275312 | 288 | 1.0460858952751788 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
language | questions | question_with_tag | tag_popularity | |
---|---|---|---|---|
c# | 1429343 | 1943 | 1.3593658065278942 | |
java | 1705327 | 1420 | 0.8326848751002007 | |
c++ | 687757 | 407 | 0.5917787823315502 | |
python | 1514603 | 315 | 0.20797529121492564 | |
ruby | 214981 | 96 | 0.4465510905614915 | |
javascript | 2070781 | 519 | 0.25063007628522765 | |
php | 1367002 | 519 | 0.3796629412392959 | |
go | 48045 | 47 | 0.9782495577063169 | |
rust | 17763 | 7 | 0.3940775769858695 | |
kotlin | 44399 | 39 | 0.8783981621207685 | |
swift | 275332 | 98 | 0.3559339270408089 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
language | questions | question_with_tag | tag_popularity | |
---|---|---|---|---|
c# | 1429280 | 5090 | 3.561233628120452 | |
java | 1705254 | 6490 | 3.805884636540949 | |
c++ | 687722 | 2237 | 3.2527678335141235 | |
python | 1514423 | 884 | 0.5837206645699385 | |
ruby | 214978 | 226 | 1.0512703625487259 | |
javascript | 2070668 | 1591 | 0.7683510828389679 | |
php | 1366947 | 1566 | 1.1456186670002568 | |
go | 48037 | 64 | 1.332306347190707 | |
rust | 17761 | 20 | 1.1260627216935983 | |
kotlin | 44391 | 60 | 1.351625329458674 | |
swift | 275312 | 216 | 0.784564421456384 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
language | questions | question_with_tag | tag_popularity | |
---|---|---|---|---|
c# | 1429280 | 3391 | 2.3725232284786744 | |
java | 1705254 | 5722 | 3.355511847501897 | |
c++ | 687723 | 474 | 0.6892309839862851 | |
python | 1514424 | 1049 | 0.6926725936725778 | |
ruby | 214978 | 277 | 1.2885039399380402 | |
javascript | 2070668 | 3328 | 1.6072108131289033 | |
php | 1366947 | 7497 | 5.48448476788054 | |
go | 48037 | 69 | 1.4363927805649812 | |
rust | 17761 | 8 | 0.45042508867743936 | |
kotlin | 44392 | 22 | 0.4955847900522617 | |
swift | 275312 | 178 | 0.6465391991631313 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment