Last active
May 23, 2017 01:42
-
-
Save santiago-salas-v/0135726a66bda416fb3a4845eb12a5fb to your computer and use it in GitHub Desktop.
Modulhandbuecher mit selenium, urllib2, pandas, xlwings exportieren.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
import webbrowser | |
import urllib | |
import urllib2 | |
from bs4 import BeautifulSoup | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.common.keys import Keys | |
from selenium import webdriver | |
from selenium.common.exceptions import TimeoutException | |
import xlwings as xw | |
from xlwings.constants import Direction | |
import pandas as pd | |
# TUMonline entry URL (not referenced anywhere in the visible part of the script).
tum_page = 'https://campus.tum.de/tumonline/'
# Search page that series_pairs() loads before typing a module code into the
# 'pSuchbegriff' field.
tum_mh = 'https://campus.tum.de/tumonline/wbsuche.durchfuehren?pSuchTyp=23&pOrgNr='

# Module codes to export, one per line. The context manager closes the file
# even if reading fails; strip() also removes '\r' and stray blanks, and blank
# lines are skipped so that no empty module code is ever searched.
with open('f_codes.txt') as codes_file:
    f_codes = [line.strip() for line in codes_file if line.strip()]

# One shared Chrome session and one fresh Excel workbook for the whole run.
wd = webdriver.Chrome()
sht = xw.Book().sheets[0]
# Handle of the initial browser window; the script switches back to it after
# each module's detail window has been closed.
main_window = wd.current_window_handle
def series_pairs(f_code, verbose=True):
    """Scrape the detail page of one module and return its label/value pairs.

    The function loads the search page ``tum_mh``, submits ``f_code`` in the
    ``pSuchbegriff`` field, takes the first result link whose target is
    ``w_mhb_beschr``, opens that link in a NEW browser window and reads every
    label cell together with the cell next to it.

    Side effect: on return the shared driver ``wd`` is focused on the newly
    opened window. The caller is responsible for closing it and switching
    back to the main window.

    :param f_code: module code to search for (e.g. ``'MW2206'``).
    :param verbose: when true (the default) print the prettified first
        ``<fieldset>`` of the detail page and every extracted tuple, as a
        debugging aid.
    :return: ``pandas.Series`` of the cell texts, indexed by a two-level
        ``MultiIndex`` of (first line of the fieldset's text, label text).
    :raises selenium.common.exceptions.TimeoutException: if any of the three
        awaited elements does not become visible within 10 seconds.
    """
    # --- 1. search for the module code ------------------------------------
    wd.get(tum_mh)
    # until() hands back the element once it is visible, so no second lookup
    # is needed.
    search_box = WebDriverWait(wd, 10).until(
        EC.visibility_of_element_located((By.NAME, 'pSuchbegriff')))
    search_box.send_keys(f_code)
    search_box.send_keys(Keys.RETURN)

    module_link = WebDriverWait(wd, 10).until(
        EC.visibility_of_element_located(
            (By.XPATH, "//a[@target='w_mhb_beschr']")))
    link = module_link.get_attribute('href')

    # --- 2. open the module description in its own window ------------------
    wd.execute_script('window.open()')
    wd.switch_to.window(wd.window_handles[-1])
    wd.get(link)
    WebDriverWait(wd, 10).until(
        EC.visibility_of_element_located(
            (By.XPATH,
             '//fieldset//' +
             'td[contains(@class," MaskRenderer MaskLabel")]')))

    if verbose:
        # Structure dump for inspection. The parser is named explicitly so
        # the result does not depend on which parsers happen to be installed.
        soup = BeautifulSoup(wd.page_source, 'html.parser')
        print(soup.find_all('fieldset')[0].prettify())

    # Shape of the detail page (abridged from a real dump):
    #
    #   <fieldset class=" MaskS MaskSCT ...">
    #     <legend class="Mask "><span class="Mask">Moduldetails</span></legend>
    #     <table class=" MaskSpacing"> ... nested tables ...
    #       <tr>
    #         <td class=" MaskRenderer MaskLabel top">
    #           <label class="Mask " for="ST2163641387">ECTS-Credits</label>
    #         </td>
    #         <td class=" MaskRenderer">
    #           <span class="Mask " id="ST2163641387">8</span>
    #         </td>
    #       </tr>
    #       ...
    #   </fieldset>
    #
    # i.e. every "MaskRenderer MaskLabel" cell and the sibling cell that
    # follows it form one label/value pair.
    all_relevant_fieldsets = wd.find_elements(
        By.XPATH,
        '//fieldset//td[contains(@class, " MaskRenderer MaskLabel")]' +
        '/ancestor::fieldset')

    result = []
    for fieldset in all_relevant_fieldsets:
        # Fetched once per fieldset: every .text access is a round trip to
        # the browser.
        heading = fieldset.text.split('\n')[0]
        labels = fieldset.find_elements(
            By.XPATH, './/td[contains(@class, "MaskRenderer MaskLabel")]')
        for k, label in enumerate(labels):
            label_text = label.text
            if not label_text:
                continue
            # Look the value up relative to its own label. Pairing two
            # independently collected lists by position goes out of step as
            # soon as one label cell has no sibling or more than one.
            siblings = label.find_elements(By.XPATH, './following-sibling::*')
            value_text = siblings[0].text if siblings else ''
            added_tuple = (str(k), heading, label_text, value_text)
            result.append(added_tuple)
            if verbose:
                print(added_tuple)

    index = pd.MultiIndex.from_tuples(
        [(item[1], item[2]) for item in result])
    return pd.Series([item[-1] for item in result], index=index)
def _return_to_main_window():
    """Close every browser window except ``main_window`` and focus it again."""
    for handle in wd.window_handles:
        if handle != main_window:
            wd.switch_to.window(handle)
            wd.close()
    wd.switch_to.window(main_window)


def _log_timeout(code):
    """Append ``code`` and the text 'Timed out' to the workbook's 'err' sheet."""
    book = sht.book
    if 'err' not in [sheet.name for sheet in book.sheets]:
        book.sheets.add('err')
    err_sheet = book.sheets['err']
    # Last used row of column A, found by jumping upwards from the bottom cell.
    last_row = err_sheet.range('A:A').last_cell.end(Direction.xlUp).row
    err_sheet.range('A' + str(last_row + 1)).value = code
    err_sheet.range('B' + str(last_row + 1)).value = 'Timed out'


try:
    # The first module defines the columns of the result table.
    s = series_pairs(f_codes[0])
    df = pd.DataFrame(
        index=f_codes,
        columns=s.index
    )
    df.loc[f_codes[0]] = s
    _return_to_main_window()
    sht.range('A1').value = df
    sht.book.save('output.xlsx')

    for code in f_codes[1:]:
        try:
            # NOTE(review): the assignment aligns on the columns taken from
            # the first module; labels that only occur on later modules are
            # presumably dropped here -- confirm whether that is intended.
            df.loc[code] = pd.Series(series_pairs(code))
        except TimeoutException:
            _log_timeout(code)
        # series_pairs() may have opened a detail window before it returned
        # or timed out; close it in both cases so that windows do not pile
        # up over the run.
        _return_to_main_window()
        # Write and save after every module so partial results survive a crash.
        sht.range('A1').value = df
        sht.book.save()

    # Debug aid: list the windows that are still open at the end.
    for handle in wd.window_handles:
        print(handle)
finally:
    # Always shut down the browser and the driver process, also on errors.
    wd.quit()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
MA9301 | |
MW1937 | |
CH0857 | |
MW2205 | |
MW2206 | |
MA9302 | |
MW1938 | |
CH0575 | |
CH4104 | |
MA9305 | |
MW1984 | |
MW2015 | |
CH4110 | |
CH2108 | |
CH2109 | |
MW2023 | |
MW2102 | |
CH4114 | |
CH0864 | |
CH0861 | |
CH5401 | |
CH0862 | |
CH0603 | |
CH0604 | |
MW1903 | |
MW1930 | |
MW2021 | |
MW0992 | |
SZ0402 | |
SZ0413 | |
SZ0429 | |
SZ0423 | |
SZ0424 | |
SZ0426 | |
SZ0438 | |
SZ0428 | |
SZ0431 | |
SZ0432 | |
MW0219 | |
WI000728 | |
MW0250 | |
MW0730 | |
MW0244 | |
CH0136 | |
CH0213 | |
CH0124 | |
WI000027 | |
WI000030 | |
WI000775 | |
MW1535 | |
WI000810 | |
WI000664 | |
CH0610 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment