Last active
August 28, 2023 18:01
-
-
Save sash13/ec660294732f7cce92ccb5db426ffdf5 to your computer and use it in GitHub Desktop.
Mining names from petition.president.gov.ua
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from bs4 import BeautifulSoup | |
import sys | |
import re | |
import requests | |
# URL templates for the petition site; filled in with str.format().
# url_main takes the petition number.
url_main = 'https://petition.president.gov.ua/petition/{}'
# url_json takes the petition number and then the votes page number.
url_json = 'https://petition.president.gov.ua/petition/{}/votes/{}/json'
def get_pages_count(data):
    """Return the highest voters-page number referenced in *data*.

    Every ``get_voters_page('N')`` occurrence in the text is collected and
    the largest ``N`` is returned.

    Args:
        data: Text (the petition page HTML) to scan.

    Returns:
        The largest page number found, or 1 when the text contains no
        ``get_voters_page('N')`` occurrence.
    """
    page_numbers = re.findall(r"get_voters_page\('(\d+)'\)", data)
    # ``default`` handles the no-match case explicitly, instead of a bare
    # ``except`` that would also swallow unrelated errors.
    return max(map(int, page_numbers), default=1)
def get_names(data):
    """Extract the name cells from a fragment of petition HTML.

    Args:
        data: HTML markup to parse.

    Returns:
        A list with the text of every ``div`` whose class is
        ``"table_cell name"`` inside the first ``div`` with class
        ``"table"``, in document order. An empty list is returned when the
        markup contains no such table.
    """
    soup = BeautifulSoup(data, 'html.parser')
    table = soup.find("div", {"class": "table"})
    if table is None:
        # find() returns None when nothing matches; without this guard the
        # lookup below would raise AttributeError.
        return []
    return [
        row.text
        for row in table.find_all("div", {"class": "table_cell name"})
    ]
def get_vote_page(number, page=1, timeout=30):
    """Fetch one page of votes for a petition and return the names on it.

    Args:
        number: Petition number, substituted into ``url_json``.
        page: Votes page number to fetch (defaults to 1).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list produced by ``get_names`` from the ``table_html`` field of
        the JSON response.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
        KeyError: If the JSON response has no ``table_html`` field.
    """
    # Keep the URL in its own name rather than overwriting the ``page``
    # argument.
    url = url_json.format(number, page)
    # Without a timeout, requests can block indefinitely on a stalled server.
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error body as JSON.
    res.raise_for_status()
    data = res.json()
    return get_names(data['table_html'])
def get_main_page(number, timeout=30):
    """Collect the names from every votes page of a petition.

    The main petition page is downloaded first; it supplies the first batch
    of names and the total page count. The remaining pages (2..N) are then
    fetched through ``get_vote_page``.

    Args:
        number: Petition number, substituted into ``url_main``.
        timeout: Seconds to wait for the main-page request before giving up.

    Returns:
        A list of all collected names, in page order. The number of names is
        also printed to stdout.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
    """
    url = url_main.format(number)
    # Without a timeout, requests can block indefinitely on a stalled server.
    res = requests.get(url, timeout=timeout)
    # Do not scrape an error page as if it were the petition.
    res.raise_for_status()
    data = res.content.decode('utf-8')

    peoples = get_names(data)
    pages_count = get_pages_count(data)
    # Page 1 is the main page itself; the range is empty when there is only
    # one page, so no separate guard is needed.
    for page in range(2, pages_count + 1):
        peoples += get_vote_page(number, page)
    print(len(peoples))
    return peoples
if __name__ == '__main__':
    # The petition number is the single required command-line argument.
    try:
        number = sys.argv[1]
    except IndexError:
        # Only a missing argument is expected here; anything else should
        # surface normally rather than being masked by a bare except.
        sys.exit('Using: python script.py <number>')
    itemlist = get_main_page(number)
    # Write one name per line to "<number>_petition.txt" in UTF-8.
    with open(number + '_petition.txt', 'w', encoding='utf-8') as outfile:
        outfile.write("\n".join(itemlist))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment