Created
January 10, 2020 06:52
-
-
Save eliask/3e932469bd29c14dc61d87b1cd0defb8 to your computer and use it in GitHub Desktop.
Scrape Vesla measurements: https://wwwp2.ymparisto.fi/vesla/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, glob, sys | |
import time | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.common.keys import Keys | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support.expected_conditions import presence_of_element_located | |
from selenium.webdriver.support import expected_conditions as EC | |
# --- Command line -----------------------------------------------------------
# Both credentials are required positional arguments.
# NOTE(review): passing the password on the command line exposes it in the
# process list and shell history; kept because it is the script's interface.
if not sys.argv[2:]:
    print(f'Usage {sys.argv[0]} <username> <password>')
    sys.exit(1)  # a usage error is a failure, so exit non-zero
username, password = sys.argv[1:3]

# --- Firefox profile --------------------------------------------------------
profile = webdriver.FirefoxProfile()
# An earlier attempt pointed the download directory at './vesla-out' and did
# not work: Firefox only honours "browser.download.dir" when
# "browser.download.folderList" is 2, and it expects an absolute path.
# The download location stays at ~/Downloads, but it is now set explicitly so
# the directory polled by dl_stuff() is the one Firefox actually writes to.
dir_ = os.path.join(os.path.expanduser('~'), 'Downloads')
os.makedirs(dir_, exist_ok=True)
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.download.dir", dir_)
# Save CSV responses straight to disk instead of showing a dialog.
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")
# Created by the original script as well; downloads do not currently go here.
os.makedirs('vesla-out', exist_ok=True)

# --- Log in -----------------------------------------------------------------
# `driver` and `browser` are two names for the same WebDriver instance.
driver = browser = webdriver.Firefox(firefox_profile=profile)
browser.get('https://wwwp2.ymparisto.fi/scripts/kirjaudu.asp')
browser.find_element(By.NAME, "userName").send_keys(username)
browser.find_element(By.NAME, "password").send_keys(password + Keys.RETURN)
time.sleep(1)  # Required
browser.get('https://wwwp2.ymparisto.fi/scripts/hearts/welcome.asp')
# Codes of the municipalities to download, kept as strings because each one is
# interpolated into an `option[value="..."]` CSS selector by dl_stuff().
# The order below is the order in which they are processed.
kunnat = """
20 5 9 10 16 18 19 35 43 46 47 49 50 51 52 60 61 62 65 69 71 72 74 75
76 77 78 79 81 82 86 111 90 91 97 98 99 102 103 105 106 108 109 139 140
142 143 145 146 153 148 149 151 152 165 167 169 170 171 172 176 177 178
179 181 182 186 202 204 205 208 211 213 214 216 217 218 224 226 230 231
232 233 235 236 239 240 320 241 322 244 245 249 250 256 257 260 261 263
265 271 272 273 275 276 280 284 285 286 287 288 290 291 295 297 300 301
304 305 312 316 317 318 398 399 400 407 402 403 405 408 410 416 417 418
420 421 422 423 425 426 444 430 433 434 435 436 438 440 441 475 478 480
481 483 484 489 491 494 495 498 499 500 503 504 505 508 507 529 531 535
536 538 541 543 545 560 561 562 563 564 309 576 577 578 445 580 581 599
583 854 584 588 592 593 595 598 601 604 607 608 609 611 638 614 615 616
619 620 623 624 625 626 630 631 635 636 678 710 680 681 683 684 686 687
689 691 694 697 698 700 702 704 707 729 732 734 736 790 738 739 740 742
743 746 747 748 791 749 751 753 755 758 759 761 762 765 766 768 771 777
778 781 783 831 832 833 834 837 844 845 846 848 849 850 851 853 857 858
859 886 887 889 890 892 893 895 785 905 908 911 92 915 918 921 922 924
925 927 931 934 935 936 941 946 976 977 980 981 989 992
""".split()
# Shared explicit wait: poll for up to 10 seconds before giving up.
wait = WebDriverWait(driver, 10)


def wait_find_elem(selector):
    """Wait until an element matching *selector* is in the DOM and return it.

    Args:
        selector: CSS selector of the element to wait for.

    Returns:
        The first WebElement matching the selector.

    Raises:
        selenium.common.exceptions.TimeoutException: if no matching element
            appears within the timeout configured on ``wait``.
    """
    # `until` hands back whatever the condition returned, i.e. the located
    # element itself. Returning it directly avoids a second lookup, which
    # cost an extra round-trip and could fail if the DOM changed in between.
    return wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, selector))
    )
def dl_stuff(kunta):
    """Run a Vesla search for one municipality and export the result as CSV.

    Drives the search-rule page in the shared ``browser``: clears any previous
    selection, selects the ``<option>`` whose value is *kunta*, opens the
    result view, selects all rows and triggers the "sas" export with every
    period, season, layer and quantity included. It then waits for a new
    ``Stats_*.csv`` file to show up in the download directory ``dir_``.

    Args:
        kunta: Municipality code as a string; it is matched against the
            ``value`` attribute of an ``<option>`` element.

    Returns:
        True if a new export file appeared, False if the search returned no
        places or no file appeared within 600 seconds.

    Raises:
        selenium.common.exceptions.TimeoutException: if an expected page
            element does not appear in time.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of the file.
    from selenium.common.exceptions import WebDriverException

    browser.get('https://wwwp2.ymparisto.fi/vesla/Common/rules/SearchRules.aspx')

    # Reset selection, if any. The button is absent when nothing is selected,
    # so a WebDriver failure here is expected and ignored. Unlike a bare
    # `except`, this does not swallow KeyboardInterrupt / SystemExit.
    try:
        browser.find_element(By.CSS_SELECTOR, '#ContentPlaceHolder1_btnRemove1').click()
    except WebDriverException:
        pass

    browser.find_element(By.CSS_SELECTOR, '#ContentPlaceHolder1_rptSelect_btnRule_0').click()
    wait_find_elem('#ContentPlaceHolder1_rptSelect_pnlRules_0 :nth-child(5)').click()

    # Reset selection
    wait_find_elem('#ContentPlaceHolder1_SelectControl1_btnRemoveAll').click()
    time.sleep(0.5)

    # Select it
    wait_find_elem(f'option[value="{kunta}"]').click()
    time.sleep(0.5)
    wait_find_elem('#ContentPlaceHolder1_SelectControl1_btnAdd').click()
    # Block until the chosen entry shows up in the "selected" list.
    wait_find_elem('#ContentPlaceHolder1_SelectControl1_lstSelected option')

    # Accept the filter
    browser.find_element(By.CSS_SELECTOR, '#ContentPlaceHolder1_btnSelect').click()

    # Main view -> go to results
    wait_find_elem('#ContentPlaceHolder1_Button0server').click()

    # Select all rows?
    time.sleep(1.0)
    select_all = wait_find_elem('#ContentPlaceHolder1_chkSelectAll')
    if not select_all.is_selected():
        select_all.click()
    time.sleep(1.5)

    # No data: skip this.
    if wait_find_elem('#ContentPlaceHolder1_lblTitle').text == 'Hakutulos: 0 paikkaa':
        return False

    # Export to Excel
    wait_find_elem('[value="sas"]').click()

    # Move every available entry to the "selected" side, waiting after each
    # click until the target list has at least one option.
    wait_find_elem('#ContentPlaceHolder1_SendRightAllPeriod').click()
    wait_find_elem('#ContentPlaceHolder1_SelectedDatePeriods option')
    wait_find_elem('#ContentPlaceHolder1_SendRightAllSeason').click()
    wait_find_elem('#ContentPlaceHolder1_SelectedSeasons option')
    wait_find_elem('#ContentPlaceHolder1_SendRightAllLayer').click()
    wait_find_elem('#ContentPlaceHolder1_SelectedLayers option')
    wait_find_elem('#ContentPlaceHolder1_SendRightAllQuantity').click()
    wait_find_elem('#ContentPlaceHolder1_SelectedQuantities option')

    # Snapshot existing exports so the new one can be told apart.
    pattern = f'{dir_}/Stats_*.csv'
    files_before = set(glob.glob(pattern))
    wait_find_elem('#ContentPlaceHolder1_btnSave').click()

    # Poll once per second, for at most 600 seconds.
    for _ in range(600):
        time.sleep(1)
        new_files = set(glob.glob(pattern)) - files_before
        # NOTE(review): Firefox may create the target file before the data
        # has finished arriving, alongside a temporary "<name>.part" file;
        # treat the download as complete only once no such file remains.
        # Confirm the temp-file naming for the Firefox version in use.
        if new_files and not glob.glob(f'{pattern}.part'):
            return True

    # Previously a timeout was silent and indistinguishable from success.
    print(f'Timed out waiting for download for municipality: {kunta}',
          file=sys.stderr)
    return False
# Process each municipality code once, in the order listed in `kunnat`.
# A set gives constant-time membership tests and in-place insertion, instead
# of scanning a list and building a new one-element list on every iteration.
# NOTE(review): the browser is never quit after the loop; consider calling
# browser.quit() once the last download is known to be complete.
visited = set()
for kunta in kunnat:
    if kunta in visited:
        continue
    dl_stuff(kunta)
    visited.add(kunta)
    print(f'Downloaded data for municipality: {kunta}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment