tyndyll · September 25, 2015 14:41
diff --git a/translink_extract.py b/translink_extract.py
 #!/usr/bin/env python
 #
 # Copyright (c) 2012 Tyndyll
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met: 
 #
 # 1. Redistributions of source code must retain the above copyright notice, this
 #    list of conditions and the following disclaimer. 
 # 2. Redistributions in binary form must reproduce the above copyright notice,
 #    this list of conditions and the following disclaimer in the documentation
 #    and/or other materials provided with the distribution. 
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 # The views and conclusions contained in the software and documentation are those
 # of the authors and should not be interpreted as representing official policies, 
 # either expressed or implied, of the FreeBSD Project.
 #

 from selenium import webdriver
 from selenium.webdriver.common.by import By

 import sys
 import time


 class Extraction:
     """Scrape Translink timetable listings and timetable pages via Selenium.

     One WebDriver is created lazily on first use and shared by every call;
     it is quit when the instance is destroyed.
     """

     def __init__(self):
         # Created on demand by _get_driver(), so building the object does not
         # start a browser.
         self._driver = None
         # Listing page to crawl for each supported service type.
         self.travel_pages = {
             "metro": "http://www.translink.co.uk/Services/Metro-Service-Page/",
             "ulsterbus": "http://www.translink.co.uk/Services/Ulsterbus-Service-Page/Timetables/",
             "goldline": "http://www.translink.co.uk/Services/Goldline/Timetables/",
             "rail": "http://www.translink.co.uk/Services/NI-Railways/",
             "enterprise": "http://www.translink.co.uk/Services/Enterprise/"
         }

     def __del__(self):
         """Quit the browser if one was started."""
         # getattr: __init__ may not have run if construction failed early
         if getattr(self, "_driver", None) is not None:
             self._driver.quit()

     def _get_driver(self, driver="Firefox"):
         """
         Return the shared WebDriver, creating it on the first call.

         driver - name of the attribute of selenium's `webdriver` module to
                  instantiate; only used when no driver exists yet
         """
         if self._driver is None:
             self._driver = getattr(webdriver, driver)()
         return self._driver

     def _return_stripped_element(self, driver, element):
         """
         Return the text of `element`, UTF-8 encoded and stripped.

         Elements that are not displayed are read through JavaScript
         (innerText / textContent) instead of `element.text`. Non-breaking
         spaces are replaced by ordinary spaces before stripping.
         """
         if element.is_displayed():
             return_string = element.text
         else:
             return_string = driver.execute_script("return arguments[0].innerText || arguments[0].textContent;", element)
         if return_string is None:
             # The script yields null when the node has neither property set
             return_string = u""
         return return_string.replace(u'\xa0', ' ').encode('utf-8').strip()

     @staticmethod
     def _is_route_link(element_id):
         """Return True if an anchor id contains "hypRoute" or "hypMetro"."""
         return "hypRoute" in element_id or "hypMetro" in element_id

     @staticmethod
     def _add_stop_time(journey, stop_name, when):
         """
         Record time `when` for `stop_name` on `journey`.

         The first stop of a journey only gets "departs". A second time for
         the stop that was recorded last overwrites its "departs". Any other
         stop is appended with "arrives" and "departs" both set to `when`.
         """
         stops = journey["stops"]
         if len(stops) == 0:
             stops.append({"departs": when, "stop": stop_name})
         elif stops[-1]["stop"] == stop_name:
             stops[-1]["departs"] = when
         else:
             stops.append({"arrives": when, "departs": when, "stop": stop_name})

     @staticmethod
     def _strip_final_departures(journeys):
         """Remove "departs" from the last stop of every journey that has stops."""
         for journey in journeys:
             if journey["stops"]:
                 journey["stops"][-1].pop("departs", None)

     def get_pages(self, travel):
         """
         Get list of page urls from which timetable data could be extracted.
         Pages are gathered by type

              enterprise - NI Railways Cross Border rail service
              goldline - Ulsterbus Goldline Service
              metro - Belfast Metro Bus Service
              rail - NI Railways Service
              ulsterbus - Standard Ulsterbus Service

         Throws:
             KeyError if `travel` is not one of the types listed above
         """
         driver = self._get_driver()
         driver.get(self.travel_pages[travel])
         pagination = driver.find_elements(By.XPATH, "//div[contains(@class,'rgWrap rgInfoPart')]/strong")
         if len(pagination) < 2:
             # No usable pager: everything is on the page already loaded
             return self._get_pages_from_icons(driver, travel)

         pages = []
         # The second <strong> of the pager is used as the number of pages
         page_count = int(pagination[1].text)
         for page_number in range(page_count):
             pages.extend(self._get_pages_from_icons(driver, travel))
             if page_number + 1 < page_count:
                 driver.find_elements(By.XPATH, "//input[contains(@class,'rgPageNext')]")[0].click()
                 # Wait 5 seconds before going to the next page out of politeness
                 time.sleep(5)
         return pages

     def _get_pages_from_icons(self, driver, travel):
         """
         Collect timetable urls from the listing page currently loaded in `driver`.

         For "rail" and "enterprise" every anchor with the 'timetable-ico'
         class is taken as is. For the other types only anchors whose id marks
         a route link are kept; their url is switched from outputFormat=1 to
         outputFormat=0, must point at journeyplanner.translink.co.uk and is
         listed once.
         """
         pages = []
         if travel in ("rail", "enterprise"):
             for link in driver.find_elements(By.XPATH, "//a[contains(@class,'timetable-ico')]"):
                 href = link.get_attribute("href")
                 if href is not None:
                     pages.append(href)
             return pages

         if travel in ("ulsterbus", "goldline"):
             idn = 'ctl00_MainRegion_rptPageListCurrent_'
         else:
             idn = 'ctl00_MainRegion_ContentArea_ctl01_'
         for link in driver.find_elements(By.XPATH, "//td/a[contains(@id,'" + idn + "')]"):
             href = link.get_attribute("href")
             if href is None:
                 continue
             if not self._is_route_link(str(link.get_attribute("id"))):
                 continue
             href = str(href).replace("outputFormat=1", "outputFormat=0")
             if href.startswith("http://journeyplanner.translink.co.uk") and href not in pages:
                 pages.append(href)
         return pages

     def get_new_timetables(self, driver, operator):
         """
         Extract journeys from the journeyplanner timetable page loaded in `driver`.

         Both directions ("H" and "R") and timetables 1 to 3 of each are
         visited; a timetable whose tab, day label or stop list is missing is
         skipped.

         Returns a list of dicts with the keys "days", "service", "operator"
         and "stops". Each stop is a dict with "stop" plus "arrives" and/or
         "departs"; times are the integer value of the first four characters
         of the cell (presumably HHMM - confirm against the site).

         Throws:
             IndexError if the page-level service label is missing
         """
         page_service = driver.find_elements(By.XPATH, ".//div[@id='ttbH_H']/div/table/tbody/tr/td[2]/span")[0].text
         total_journeys = []
         for direction in ["H", "R"]:
             for timetable in range(1, 4):
                 tabs = driver.find_elements(By.ID, "weekday%s_%d" % (direction, timetable))
                 if not tabs:
                     continue
                 # Select the tab before reading the timetable that belongs to it
                 tabs[0].click()

                 eid = "ttbTable_%sday_%d" % (direction, timetable)
                 value = driver.find_elements(By.XPATH, ".//*[@id='" + eid + "']/table/tbody/tr/td[1]")
                 if not value:
                     continue
                 days = self._return_stripped_element(driver, value[0])

                 labels = driver.find_elements(By.XPATH, ".//*[@id='ttbCa_%s_%s']" % (direction, timetable))
                 row_label = self._return_stripped_element(driver, labels[0]) if labels else ""

                 # One journey per non-empty cell of the first header row
                 all_journeys = []
                 eid = "ttbH_%s_%d" % (direction, timetable)
                 for td in driver.find_elements(By.XPATH, ".//*[@id='" + eid + "']/table/tbody/tr[1]/td/span"):
                     service = self._return_stripped_element(driver, td)
                     if service == "":
                         continue
                     if service.startswith("_") or row_label.startswith("Service Restriction"):
                         # The header row does not hold a service name here,
                         # fall back to the page-level one
                         service = page_service
                     all_journeys.append({"days": days, "service": service, "operator": operator, "stops": []})

                 # Stop names come from the title attribute of the row labels
                 eid = "ttbM_%s_%d" % (direction, timetable)
                 locations = []
                 for td in driver.find_elements(By.XPATH, ".//*[@id='" + eid + "']/table/tbody/tr/td/span"):
                     title = td.get_attribute('title').strip()
                     if title:
                         locations.append(title.replace("$ ", ""))
                 if not locations:
                     continue

                 # Row index selects the stop, cell position selects the journey
                 eid = "ttbCo_%s_%d" % (direction, timetable)
                 for location, row in enumerate(driver.find_elements(By.XPATH, ".//*[@id='" + eid + "']/table/tbody/tr")):
                     journey = 0
                     for td in row.find_elements(By.XPATH, ".//td[@class='ttbCo']/span[@class='ttbCo']"):
                         data = td.text
                         if data == "":
                             continue
                         if data == "-" or data == "|":
                             # Placeholder cell: no time to record, but it
                             # still occupies this journey's column
                             journey += 1
                             continue
                         data = int(data[:4])
                         try:
                             target = all_journeys[journey]
                             stop_name = locations[location]
                         except IndexError:
                             # More cells or rows than journeys or stops
                             continue
                         self._add_stop_time(target, stop_name, data)
                         journey += 1
                 self._strip_final_departures(all_journeys)
                 total_journeys.extend(all_journeys)
         return total_journeys

     def get_old_timetables(self, driver, operator):
         """
         Extract journeys from a www.translink.co.uk timetable page loaded in `driver`.

         Every results table is read row by row. Rows before "Calling points:"
         fill in the operator, service and days of each journey; rows after it
         give one time per journey for the stop named in the row header.

         Returns a list of dicts with the keys "days", "service", "operator"
         and "stops", in the same shape as get_new_timetables.

         Throws:
             Exception if the page does not have exactly one title heading
         """
         all_journeys = []
         if len(driver.find_elements(By.XPATH, "//div[contains(@id,'top')]/h2")) != 1:
             raise Exception( "Cannot determine title" )
         for table in driver.find_elements(By.XPATH, "//table[contains(@class,'lower_whitebox_timetables_results_table')]"):
             # Process all the timetable tables on a page
             journeys = []
             first_row = True
             in_data = False
             for row in table.find_elements(By.TAG_NAME, "tr"):
                 headers = row.find_elements(By.TAG_NAME, "th")
                 if first_row:
                     # The first row only tells how many journeys the table holds
                     journeys = [{"days": None, "service": None, "operator": operator, "stops": []} for _ in headers]
                     first_row = False
                     continue
                 if len(headers) != 1:
                     continue
                 key = self._return_stripped_element(driver, headers[0])

                 if key == "Calling points:":
                     # Now moving into the timetable data. All headers are parsed
                     in_data = True
                     continue
                 if key == "":
                     continue

                 for counter, cell in enumerate(row.find_elements(By.TAG_NAME, "td")):
                     text = self._return_stripped_element(driver, cell)
                     if in_data:
                         if text != "..." and text != "":
                             self._add_stop_time(journeys[counter], key, int(text[:4]))
                     elif key == "Operator:":
                         journeys[counter]["operator"] = text
                     elif key == "Service:":
                         journeys[counter]["service"] = text
                     elif key == "Days of operation:":
                         journeys[counter]["days"] = text
             # Trim every journey of the table, independent of how the last
             # row happened to be processed
             self._strip_final_departures(journeys)
             all_journeys.extend(journeys)
             # Table complete
         return all_journeys

     def get_journeys(self, page, operator):
         """
         Load `page` and return the journeys found on it.

         The parser is picked by url prefix: www.translink.co.uk pages use
         get_old_timetables, journeyplanner.translink.co.uk pages use
         get_new_timetables. Any other url gives an empty list.

         Throws Webdriver creator failure
         """
         driver = self._get_driver()
         driver.get(page)

         if page.startswith("http://www.translink.co.uk"):
             return self.get_old_timetables(driver, operator)
         if page.startswith("http://journeyplanner.translink.co.uk"):
             return self.get_new_timetables(driver, operator)
         return []




 if __name__ == "__main__":
     # Command-line entry point: either list the timetable urls of one
     # service type (--list) or dump the journeys of one timetable page as
     # JSON (--url together with --operator).
     import json
     import optparse

     operator_options = ["enterprise", "metro", "ulsterbus", "rail", "goldline"]

     o = optparse.OptionParser()
     o.add_option("--list", action="store", choices=operator_options, dest="list",
                  help="Display selected timetable URLs (enterprise, goldline, metro, rail, ulsterbus)")
     o.add_option("--list-options", action="store_true", dest="list_options", help="List avaialable options for --list")
     o.add_option("--url", action="store", dest="url", help="URL from which to gather timetable data")
     o.add_option("--operator", action="store", dest="operator", choices=operator_options)
     o.add_option("-o", "--output", dest="output_file")

     (opts, args) = o.parse_args()

     if opts.list_options:
         print("\n".join(o.get_option("--list").choices))
         sys.exit(0)

     # Exactly one of --list / --url is required; stop here instead of
     # carrying on with an unusable combination
     if (opts.list and opts.url) or not (opts.list or opts.url):
         o.print_help()
         sys.exit(2)
     if opts.url and opts.operator is None:
         o.error("--url also requires an --operator")

     ext = Extraction()

     # Scrape first, so that a failure does not leave a truncated output file
     if opts.url:
         text = json.dumps(ext.get_journeys(opts.url, opts.operator), indent=4)
     else:
         text = "%s\n" % "\n".join(ext.get_pages(opts.list))

     if opts.output_file is None:
         sys.stdout.write(text)
     else:
         with open(opts.output_file, "w") as output:
             output.write(text)
	#!/usr/bin/env python
	#
	# Copyright (c) 2012 Tyndyll
	# All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions are met:
	#
	# 1. Redistributions of source code must retain the above copyright notice, this
	# list of conditions and the following disclaimer.
	# 2. Redistributions in binary form must reproduce the above copyright notice,
	# this list of conditions and the following disclaimer in the documentation
	# and/or other materials provided with the distribution.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
	# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	#
	# The views and conclusions contained in the software and documentation are those
	# of the authors and should not be interpreted as representing official policies,
	# either expressed or implied, of the FreeBSD Project.
	#

	from selenium import webdriver
	from selenium.webdriver.common.by import By

	import sys
	import time


	class Extraction:
	    """Scrape Translink timetable listings and timetable pages via Selenium.

	    One WebDriver is created lazily on first use and shared by every call;
	    it is quit when the instance is destroyed.
	    """

	    def __init__(self):
	        # Created on demand by _get_driver(), so building the object does not
	        # start a browser.
	        self._driver = None
	        # Listing page to crawl for each supported service type.
	        self.travel_pages = {
	            "metro": "http://www.translink.co.uk/Services/Metro-Service-Page/",
	            "ulsterbus": "http://www.translink.co.uk/Services/Ulsterbus-Service-Page/Timetables/",
	            "goldline": "http://www.translink.co.uk/Services/Goldline/Timetables/",
	            "rail": "http://www.translink.co.uk/Services/NI-Railways/",
	            "enterprise": "http://www.translink.co.uk/Services/Enterprise/"
	        }

	    def __del__(self):
	        """Quit the browser if one was started."""
	        # getattr: __init__ may not have run if construction failed early
	        if getattr(self, "_driver", None) is not None:
	            self._driver.quit()

	    def _get_driver(self, driver="Firefox"):
	        """
	        Return the shared WebDriver, creating it on the first call.

	        driver - name of the attribute of selenium's `webdriver` module to
	                 instantiate; only used when no driver exists yet
	        """
	        if self._driver is None:
	            self._driver = getattr(webdriver, driver)()
	        return self._driver

	    def _return_stripped_element(self, driver, element):
	        """
	        Return the text of `element`, UTF-8 encoded and stripped.

	        Elements that are not displayed are read through JavaScript
	        (innerText / textContent) instead of `element.text`. Non-breaking
	        spaces are replaced by ordinary spaces before stripping.
	        """
	        if element.is_displayed():
	            return_string = element.text
	        else:
	            return_string = driver.execute_script("return arguments[0].innerText || arguments[0].textContent;", element)
	        if return_string is None:
	            # The script yields null when the node has neither property set
	            return_string = u""
	        return return_string.replace(u'\xa0', ' ').encode('utf-8').strip()

	    @staticmethod
	    def _is_route_link(element_id):
	        """Return True if an anchor id contains "hypRoute" or "hypMetro"."""
	        return "hypRoute" in element_id or "hypMetro" in element_id

	    @staticmethod
	    def _add_stop_time(journey, stop_name, when):
	        """
	        Record time `when` for `stop_name` on `journey`.

	        The first stop of a journey only gets "departs". A second time for
	        the stop that was recorded last overwrites its "departs". Any other
	        stop is appended with "arrives" and "departs" both set to `when`.
	        """
	        stops = journey["stops"]
	        if len(stops) == 0:
	            stops.append({"departs": when, "stop": stop_name})
	        elif stops[-1]["stop"] == stop_name:
	            stops[-1]["departs"] = when
	        else:
	            stops.append({"arrives": when, "departs": when, "stop": stop_name})

	    @staticmethod
	    def _strip_final_departures(journeys):
	        """Remove "departs" from the last stop of every journey that has stops."""
	        for journey in journeys:
	            if journey["stops"]:
	                journey["stops"][-1].pop("departs", None)

	    def get_pages(self, travel):
	        """
	        Get list of page urls from which timetable data could be extracted.
	        Pages are gathered by type

	             enterprise - NI Railways Cross Border rail service
	             goldline - Ulsterbus Goldline Service
	             metro - Belfast Metro Bus Service
	             rail - NI Railways Service
	             ulsterbus - Standard Ulsterbus Service

	        Throws:
	            KeyError if `travel` is not one of the types listed above
	        """
	        driver = self._get_driver()
	        driver.get(self.travel_pages[travel])
	        pagination = driver.find_elements(By.XPATH, "//div[contains(@class,'rgWrap rgInfoPart')]/strong")
	        if len(pagination) < 2:
	            # No usable pager: everything is on the page already loaded
	            return self._get_pages_from_icons(driver, travel)

	        pages = []
	        # The second <strong> of the pager is used as the number of pages
	        page_count = int(pagination[1].text)
	        for page_number in range(page_count):
	            pages.extend(self._get_pages_from_icons(driver, travel))
	            if page_number + 1 < page_count:
	                driver.find_elements(By.XPATH, "//input[contains(@class,'rgPageNext')]")[0].click()
	                # Wait 5 seconds before going to the next page out of politeness
	                time.sleep(5)
	        return pages

	    def _get_pages_from_icons(self, driver, travel):
	        """
	        Collect timetable urls from the listing page currently loaded in `driver`.

	        For "rail" and "enterprise" every anchor with the 'timetable-ico'
	        class is taken as is. For the other types only anchors whose id marks
	        a route link are kept; their url is switched from outputFormat=1 to
	        outputFormat=0, must point at journeyplanner.translink.co.uk and is
	        listed once.
	        """
	        pages = []
	        if travel in ("rail", "enterprise"):
	            for link in driver.find_elements(By.XPATH, "//a[contains(@class,'timetable-ico')]"):
	                href = link.get_attribute("href")
	                if href is not None:
	                    pages.append(href)
	            return pages

	        if travel in ("ulsterbus", "goldline"):
	            idn = 'ctl00_MainRegion_rptPageListCurrent_'
	        else:
	            idn = 'ctl00_MainRegion_ContentArea_ctl01_'
	        for link in driver.find_elements(By.XPATH, "//td/a[contains(@id,'" + idn + "')]"):
	            href = link.get_attribute("href")
	            if href is None:
	                continue
	            if not self._is_route_link(str(link.get_attribute("id"))):
	                continue
	            href = str(href).replace("outputFormat=1", "outputFormat=0")
	            if href.startswith("http://journeyplanner.translink.co.uk") and href not in pages:
	                pages.append(href)
	        return pages

	    def get_new_timetables(self, driver, operator):
	        """
	        Extract journeys from the journeyplanner timetable page loaded in `driver`.

	        Both directions ("H" and "R") and timetables 1 to 3 of each are
	        visited; a timetable whose tab, day label or stop list is missing is
	        skipped.

	        Returns a list of dicts with the keys "days", "service", "operator"
	        and "stops". Each stop is a dict with "stop" plus "arrives" and/or
	        "departs"; times are the integer value of the first four characters
	        of the cell (presumably HHMM - confirm against the site).

	        Throws:
	            IndexError if the page-level service label is missing
	        """
	        page_service = driver.find_elements(By.XPATH, ".//div[@id='ttbH_H']/div/table/tbody/tr/td[2]/span")[0].text
	        total_journeys = []
	        for direction in ["H", "R"]:
	            for timetable in range(1, 4):
	                tabs = driver.find_elements(By.ID, "weekday%s_%d" % (direction, timetable))
	                if not tabs:
	                    continue
	                # Select the tab before reading the timetable that belongs to it
	                tabs[0].click()

	                eid = "ttbTable_%sday_%d" % (direction, timetable)
	                value = driver.find_elements(By.XPATH, ".//*[@id='" + eid + "']/table/tbody/tr/td[1]")
	                if not value:
	                    continue
	                days = self._return_stripped_element(driver, value[0])

	                labels = driver.find_elements(By.XPATH, ".//*[@id='ttbCa_%s_%s']" % (direction, timetable))
	                row_label = self._return_stripped_element(driver, labels[0]) if labels else ""

	                # One journey per non-empty cell of the first header row
	                all_journeys = []
	                eid = "ttbH_%s_%d" % (direction, timetable)
	                for td in driver.find_elements(By.XPATH, ".//*[@id='" + eid + "']/table/tbody/tr[1]/td/span"):
	                    service = self._return_stripped_element(driver, td)
	                    if service == "":
	                        continue
	                    if service.startswith("_") or row_label.startswith("Service Restriction"):
	                        # The header row does not hold a service name here,
	                        # fall back to the page-level one
	                        service = page_service
	                    all_journeys.append({"days": days, "service": service, "operator": operator, "stops": []})

	                # Stop names come from the title attribute of the row labels
	                eid = "ttbM_%s_%d" % (direction, timetable)
	                locations = []
	                for td in driver.find_elements(By.XPATH, ".//*[@id='" + eid + "']/table/tbody/tr/td/span"):
	                    title = td.get_attribute('title').strip()
	                    if title:
	                        locations.append(title.replace("$ ", ""))
	                if not locations:
	                    continue

	                # Row index selects the stop, cell position selects the journey
	                eid = "ttbCo_%s_%d" % (direction, timetable)
	                for location, row in enumerate(driver.find_elements(By.XPATH, ".//*[@id='" + eid + "']/table/tbody/tr")):
	                    journey = 0
	                    for td in row.find_elements(By.XPATH, ".//td[@class='ttbCo']/span[@class='ttbCo']"):
	                        data = td.text
	                        if data == "":
	                            continue
	                        if data == "-" or data == "|":
	                            # Placeholder cell: no time to record, but it
	                            # still occupies this journey's column
	                            journey += 1
	                            continue
	                        data = int(data[:4])
	                        try:
	                            target = all_journeys[journey]
	                            stop_name = locations[location]
	                        except IndexError:
	                            # More cells or rows than journeys or stops
	                            continue
	                        self._add_stop_time(target, stop_name, data)
	                        journey += 1
	                self._strip_final_departures(all_journeys)
	                total_journeys.extend(all_journeys)
	        return total_journeys

	    def get_old_timetables(self, driver, operator):
	        """
	        Extract journeys from a www.translink.co.uk timetable page loaded in `driver`.

	        Every results table is read row by row. Rows before "Calling points:"
	        fill in the operator, service and days of each journey; rows after it
	        give one time per journey for the stop named in the row header.

	        Returns a list of dicts with the keys "days", "service", "operator"
	        and "stops", in the same shape as get_new_timetables.

	        Throws:
	            Exception if the page does not have exactly one title heading
	        """
	        all_journeys = []
	        if len(driver.find_elements(By.XPATH, "//div[contains(@id,'top')]/h2")) != 1:
	            raise Exception( "Cannot determine title" )
	        for table in driver.find_elements(By.XPATH, "//table[contains(@class,'lower_whitebox_timetables_results_table')]"):
	            # Process all the timetable tables on a page
	            journeys = []
	            first_row = True
	            in_data = False
	            for row in table.find_elements(By.TAG_NAME, "tr"):
	                headers = row.find_elements(By.TAG_NAME, "th")
	                if first_row:
	                    # The first row only tells how many journeys the table holds
	                    journeys = [{"days": None, "service": None, "operator": operator, "stops": []} for _ in headers]
	                    first_row = False
	                    continue
	                if len(headers) != 1:
	                    continue
	                key = self._return_stripped_element(driver, headers[0])

	                if key == "Calling points:":
	                    # Now moving into the timetable data. All headers are parsed
	                    in_data = True
	                    continue
	                if key == "":
	                    continue

	                for counter, cell in enumerate(row.find_elements(By.TAG_NAME, "td")):
	                    text = self._return_stripped_element(driver, cell)
	                    if in_data:
	                        if text != "..." and text != "":
	                            self._add_stop_time(journeys[counter], key, int(text[:4]))
	                    elif key == "Operator:":
	                        journeys[counter]["operator"] = text
	                    elif key == "Service:":
	                        journeys[counter]["service"] = text
	                    elif key == "Days of operation:":
	                        journeys[counter]["days"] = text
	            # Trim every journey of the table, independent of how the last
	            # row happened to be processed
	            self._strip_final_departures(journeys)
	            all_journeys.extend(journeys)
	            # Table complete
	        return all_journeys

	    def get_journeys(self, page, operator):
	        """
	        Load `page` and return the journeys found on it.

	        The parser is picked by url prefix: www.translink.co.uk pages use
	        get_old_timetables, journeyplanner.translink.co.uk pages use
	        get_new_timetables. Any other url gives an empty list.

	        Throws Webdriver creator failure
	        """
	        driver = self._get_driver()
	        driver.get(page)

	        if page.startswith("http://www.translink.co.uk"):
	            return self.get_old_timetables(driver, operator)
	        if page.startswith("http://journeyplanner.translink.co.uk"):
	            return self.get_new_timetables(driver, operator)
	        return []




	if __name__ == "__main__":
	    # Command-line entry point: either list the timetable urls of one
	    # service type (--list) or dump the journeys of one timetable page as
	    # JSON (--url together with --operator).
	    import json
	    import optparse

	    operator_options = ["enterprise", "metro", "ulsterbus", "rail", "goldline"]

	    o = optparse.OptionParser()
	    o.add_option("--list", action="store", choices=operator_options, dest="list",
	                 help="Display selected timetable URLs (enterprise, goldline, metro, rail, ulsterbus)")
	    o.add_option("--list-options", action="store_true", dest="list_options", help="List avaialable options for --list")
	    o.add_option("--url", action="store", dest="url", help="URL from which to gather timetable data")
	    o.add_option("--operator", action="store", dest="operator", choices=operator_options)
	    o.add_option("-o", "--output", dest="output_file")

	    (opts, args) = o.parse_args()

	    if opts.list_options:
	        print("\n".join(o.get_option("--list").choices))
	        sys.exit(0)

	    # Exactly one of --list / --url is required; stop here instead of
	    # carrying on with an unusable combination
	    if (opts.list and opts.url) or not (opts.list or opts.url):
	        o.print_help()
	        sys.exit(2)
	    if opts.url and opts.operator is None:
	        o.error("--url also requires an --operator")

	    ext = Extraction()

	    # Scrape first, so that a failure does not leave a truncated output file
	    if opts.url:
	        text = json.dumps(ext.get_journeys(opts.url, opts.operator), indent=4)
	    else:
	        text = "%s\n" % "\n".join(ext.get_pages(opts.list))

	    if opts.output_file is None:
	        sys.stdout.write(text)
	    else:
	        with open(opts.output_file, "w") as output:
	            output.write(text)
No results found