Skip to content

Instantly share code, notes, and snippets.

@santiago-salas-v
Last active May 23, 2017 01:42
Show Gist options
  • Save santiago-salas-v/0135726a66bda416fb3a4845eb12a5fb to your computer and use it in GitHub Desktop.
Save santiago-salas-v/0135726a66bda416fb3a4845eb12a5fb to your computer and use it in GitHub Desktop.
Modulhandbuecher mit selenium, urllib2, pandas, xlwings exportieren.
# coding=utf-8
import webbrowser
import urllib
import urllib2
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
import xlwings as xw
from xlwings.constants import Direction
import pandas as pd
# TUMonline entry page (not referenced by the rest of the visible script).
tum_page = 'https://campus.tum.de/tumonline/'
# Search page that series_pairs() loads before typing a module code.
tum_mh = 'https://campus.tum.de/tumonline/wbsuche.durchfuehren?pSuchTyp=23&pOrgNr='

# f_codes.txt holds one module code per line. The context manager closes the
# file even if reading fails; surrounding whitespace (including a stray '\r')
# is stripped and blank lines are skipped so they never become search terms.
with open('f_codes.txt') as codes_file:
    f_codes = [line.strip() for line in codes_file if line.strip()]

# Browser session shared by the whole script.
wd = webdriver.Chrome()
# First sheet of a freshly created workbook; receives the result table.
sht = xw.Book().sheets[0]
# Handle of the initial browser window, used to return after each detail page.
main_window = wd.current_window_handle
# Structure of a module detail page, abridged from a prettified dump of the
# first <fieldset> (module MW2206):
#
#   <fieldset class=" MaskS MaskSCT KnotenDetailsLabelWidth ...">
#     <legend class="Mask "><span class="Mask">Moduldetails</span></legend>
#     <table class=" MaskSpacing"> ... nested tables ...
#       <tr>
#         <td class=" MaskRenderer MaskLabel top">
#           <label class="Mask " for="ST2163641387">ECTS-Credits</label>
#         </td>
#         <td class=" MaskRenderer">
#           <span class="Mask " id="ST2163641387">8</span>
#         </td>
#       </tr>
#     ...
#   </fieldset>
#
# The dump shows that every "MaskRenderer MaskLabel" cell and the cell that
# follows it in the same row form one label/value pair.


def _dump_first_fieldset(page_source):
    """Print the prettified first <fieldset> of the page (structure aid)."""
    # Name the parser explicitly so the output does not depend on which
    # optional parsers happen to be installed (and bs4 emits no warning).
    soup = BeautifulSoup(page_source, 'html.parser')
    print(soup.find_all('fieldset')[0].prettify())


def _label_value_rows(fieldset):
    """Return (position, heading, label, value) tuples for one fieldset.

    Labels whose visible text is empty are skipped; ``position`` is the
    label's index among all label cells of the fieldset, as a string.
    """
    # First line of the fieldset's visible text (the legend in the sample
    # page). Fetched once: every ``.text`` access is a WebDriver round trip.
    heading = fieldset.text.split('\n')[0]
    label_cells = fieldset.find_elements(
        By.XPATH, './/td[contains(@class, "MaskRenderer MaskLabel")]')
    rows = []
    for k, label_cell in enumerate(label_cells):
        label_text = label_cell.text
        if not label_text:
            continue
        # Take the value from this label's own next sibling instead of
        # indexing a separately collected list, so a label with zero or
        # several following siblings cannot shift the pairing.
        value_cells = label_cell.find_elements(
            By.XPATH, './following-sibling::*[1]')
        value_text = value_cells[0].text if value_cells else ''
        row = (str(k), heading, label_text, value_text)
        rows.append(row)
        print(row)
    return rows


def series_pairs(f_code):
    """Scrape the detail page of one module into a pandas Series.

    Loads the search page ``tum_mh`` in the module-level WebDriver ``wd``,
    submits ``f_code``, opens the first result's link in a new browser
    window and collects every non-empty label/value pair found inside the
    page's fieldsets. The first fieldset's HTML and each collected tuple
    are printed along the way.

    On success the new window stays open and focused; the caller is
    expected to close it and switch back. If anything fails after the
    window was opened, it is closed and focus returns to the window the
    search ran in before the exception propagates.

    Args:
        f_code: Module code typed into the search box, e.g. ``'MW2206'``.

    Returns:
        pandas.Series of the value texts, indexed by a two-level
        MultiIndex ``(fieldset heading, label text)``.

    Raises:
        TimeoutException: if an expected element does not become visible
            within 10 seconds.
    """
    wd.get(tum_mh)
    search_box = WebDriverWait(wd, 10).until(
        EC.visibility_of_element_located(
            (By.NAME, 'pSuchbegriff')
        ))
    search_box.send_keys(f_code)
    search_box.send_keys(Keys.RETURN)
    module_link = WebDriverWait(wd, 10).until(
        EC.visibility_of_element_located(
            (By.XPATH, "//a[@target='w_mhb_beschr']")
        ))
    link = module_link.get_attribute('href')

    origin_window = wd.current_window_handle
    wd.execute_script('window.open()')
    wd.switch_to.window(wd.window_handles[-1])

    result = []
    scraped = False
    try:
        wd.get(link)
        WebDriverWait(wd, 10).until(
            EC.visibility_of_element_located(
                (By.XPATH,
                 '//fieldset//' +
                 'td[contains(@class," MaskRenderer MaskLabel")]')
            ))
        _dump_first_fieldset(wd.page_source)
        # Every fieldset that contains at least one label cell.
        all_relevant_fieldsets = wd.find_elements(
            By.XPATH,
            '//fieldset//td[contains(@class, " MaskRenderer MaskLabel")]' +
            '/ancestor::fieldset'
        )
        for fieldset in all_relevant_fieldsets:
            result.extend(_label_value_rows(fieldset))
        scraped = True
    finally:
        if not scraped:
            # Do not leak the extra window when the page never loads.
            wd.close()
            wd.switch_to.window(origin_window)

    tuples = [(item[1], item[2]) for item in result]
    values = [item[-1] for item in result]
    index = pd.MultiIndex.from_tuples(tuples)
    return pd.Series(values, index=index)
def _return_to_main_window():
    """Close every browser window except ``main_window`` and focus it."""
    # Also cleans up a detail window left behind by a timed-out scrape.
    for handle in wd.window_handles:
        if handle != main_window:
            wd.switch_to.window(handle)
            wd.close()
    wd.switch_to.window(main_window)


def _log_timeout(book, code):
    """Append ``code`` and the note 'Timed out' to the book's 'err' sheet."""
    if 'err' not in [sheet.name for sheet in book.sheets]:
        book.sheets.add('err')
    err_sheet = book.sheets['err']
    # Row of the last used cell in column A, found by jumping up from the
    # bottom of the column; the new entry goes on the row below it.
    last_row = err_sheet.range('A:A').last_cell.end(Direction.xlUp).row
    err_sheet.range('A' + str(last_row + 1)).value = code
    err_sheet.range('B' + str(last_row + 1)).value = 'Timed out'


try:
    if not f_codes:
        raise SystemExit('f_codes.txt contains no module codes')

    # The first module is scraped up front: its fields define the columns
    # of the result table, one row per module code.
    s = series_pairs(f_codes[0])
    df = pd.DataFrame(
        index=f_codes,
        columns=s.index
    )
    df.loc[f_codes[0]] = s
    _return_to_main_window()
    sht.range('A1').value = df
    sht.book.save('output.xlsx')

    for code in f_codes[1:]:
        try:
            df.loc[code] = series_pairs(code)
        except TimeoutException:
            # Record the failure and carry on with the next module.
            _log_timeout(sht.book, code)
        _return_to_main_window()
        # Rewrite and save after every module so progress survives a crash.
        sht.range('A1').value = df
        sht.book.save()

    # Show which browser windows are still open at the end of the run.
    for handle in wd.window_handles:
        print(handle)
finally:
    # Shut down the browser and its driver process, even after an error.
    wd.quit()
MA9301
MW1937
CH0857
MW2205
MW2206
MA9302
MW1938
CH0575
CH4104
MA9305
MW1984
MW2015
CH4110
CH2108
CH2109
MW2023
MW2102
CH4114
CH0864
CH0861
CH5401
CH0862
CH0603
CH0604
MW1903
MW1930
MW2021
MW0992
SZ0402
SZ0413
SZ0429
SZ0423
SZ0424
SZ0426
SZ0438
SZ0428
SZ0431
SZ0432
MW0219
WI000728
MW0250
MW0730
MW0244
CH0136
CH0213
CH0124
WI000027
WI000030
WI000775
MW1535
WI000810
WI000664
CH0610
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment