@grobbie
Created August 10, 2023 13:51
import requests
from bs4 import BeautifulSoup
import pandas as pd


def get_stackoverflow_data(num_pages=3):
    """Scrape question titles and vote counts for several Linux distro tags on Stack Overflow."""
    base_url = "https://stackoverflow.com/questions/tagged/"
    data = []
    tags = ["ubuntu", "rhel", "arch", "suse"]
    for tag in tags:
        for page in range(1, num_pages + 1):
            url = f"{base_url}{tag}?tab=votes&page={page}"
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "html.parser")
                # Each question on the listing page is wrapped in a div with this class
                questions = soup.find_all("div", class_="s-post-summary")
                for question in questions:
                    title = question.find("h3").get_text().strip()
                    # The stats block holds votes, answers and views; pick out the "votes" entry
                    vote_count = 0
                    stats = question.find_all("div", class_="s-post-summary--stats-item")
                    for stat in stats:
                        stat_name = stat.find("span", class_="s-post-summary--stats-item-unit").get_text()
                        if stat_name == "votes":
                            vote_count = int(stat.find("span", class_="s-post-summary--stats-item-number").get_text())
                            break
                    data.append({"distro": tag, "title": title, "vote_count": vote_count})
            else:
                print(f"status_code: {response.status_code}")
    return data


if __name__ == "__main__":
    scraped_data = get_stackoverflow_data(num_pages=3)
    df = pd.DataFrame(scraped_data)
    df.to_csv("stack_overflow_data.csv", index=False)
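
Once the script has run, one quick way to sanity-check the output is to load the CSV back into pandas and summarise votes per distro tag. This is only an illustrative sketch; the file name and column names are assumed to match the script above.

import pandas as pd

# Load the CSV written by the scraper and aggregate vote counts per distro tag
df = pd.read_csv("stack_overflow_data.csv")
summary = df.groupby("distro")["vote_count"].agg(["count", "sum", "mean"])
print(summary.sort_values("sum", ascending=False))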