mbafford · January 1, 2022 20:11
diff --git a/dcu_opentimetable.py b/dcu_opentimetable.py
 import scrapy
 import json
 import scrapy.http
 import re
 import datetime

 def build_weeks(first_week: int = 16, last_week: int = 29,
                 week1: datetime.date = datetime.date(2021, 9, 20)) -> list:
    """Build the ``Weeks`` entries used in an events-filter request payload.

    Args:
        first_week: First week number to include (inclusive).
        last_week: Last week number to include (inclusive).
        week1: Date on which week number 1 starts.

    Returns:
        One dict per week with the keys ``WeekNumber``, ``WeekLabel`` and
        ``FirstDayInWeek``. The list is empty when ``last_week`` is smaller
        than ``first_week``. With the default arguments the result covers
        weeks 16 through 29 counted from 2021-09-20.
    """
    weeks = []
    for week_no in range(first_week, last_week + 1):
        # Week N starts N-1 whole weeks after the first day of week 1.
        first_day = week1 + datetime.timedelta(weeks=week_no - 1)
        weeks.append({
            "WeekNumber": week_no,
            "WeekLabel": str(week_no),
            "FirstDayInWeek": f"{first_day.isoformat()}T00:00:00+00:00",
        })
    return weeks

 class TimetableSpider(scrapy.Spider):
    """Scrape the category index and per-category events from DCU OpenTimetable.

    Flow: load the site's HTML, find the main JavaScript bundle, read the API
    authorization value out of it, page through the category index, and
    request the events of every category found. Index results and event
    records are yielded unchanged as items.
    """

    name = 'timetable'
    allowed_domains = ['opentimetable.dcu.ie']

    def _api_headers(self) -> dict:
        """Return the headers shared by every broker API request.

        Reads ``self.auth_token``, which is set by ``extract_auth_token``.
        """
        return {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": self.auth_token
        }

    def build_index_request_page(self, pageNo: int) -> scrapy.Request:
        """Build the POST request for one page of the category index.

        Args:
            pageNo: Value sent as the ``pageNumber`` query parameter.

        Returns:
            A request whose response is handled by ``parse_index_page``.
        """
        # The filter is sent as a JSON body so that it matches the declared
        # Content-Type. scrapy.FormRequest would url-encode it instead.
        filters = [{"Identity": "241e4d36-60e0-49f8-b27e-99416745d98d", "Values": ["null"]}]
        return scrapy.Request(
            f"https://opentimetable.dcu.ie/broker/api/CategoryTypes/525fe79b-73c3-4b5c-8186-83c652b3adcc/Categories/Filter?pageNumber={pageNo}&query=",
            method='POST',
            body=json.dumps(filters),
            headers=self._api_headers(),
            callback=self.parse_index_page
        )

    def build_course_events_fetch(self, courseId: str):
        """Build the POST request that fetches the events of one category.

        Args:
            courseId: Category identity placed in ``CategoryIdentities``.

        Returns:
            A request whose response is handled by ``parse_course_details``.
        """
        payload = {
            "ViewOptions": {
                "Weeks": build_weeks()
            },
            "CategoryIdentities": [
                courseId
            ]
        }

        return scrapy.Request(
            "https://opentimetable.dcu.ie/broker/api/categoryTypes/525fe79b-73c3-4b5c-8186-83c652b3adcc/categories/events/filter",
            body=json.dumps(payload),
            method='POST',
            headers=self._api_headers(),
            callback=self.parse_course_details
        )

    def start_requests(self):
        """Start the crawl at the site's landing page."""
        yield scrapy.Request("https://opentimetable.dcu.ie", callback=self.parse_index_html)

    def parse_index_html(self, response: scrapy.http.HtmlResponse):
        """Request every script whose URL contains 'main' and '.bundle.js'."""
        for src in response.css("script::attr(src)").getall():
            if 'main' in src and '.bundle.js' in src:
                yield scrapy.Request(response.urljoin(src), callback=self.extract_auth_token)

    def extract_auth_token(self, response: scrapy.http.Response):
        """Store the API authorization value and request index page 1.

        Raises:
            RuntimeError: If the bundle text has no ``apiAuthentication``
                entry of the expected form.
        """
        m = re.search(r'apiAuthentication:["](basic [^"]+)"', response.text)
        if m is None:
            # Raised explicitly: an assert would be stripped under ``python -O``.
            raise RuntimeError(f"apiAuthentication token not found in {response.url}")
        self.auth_token = m.group(1)
        yield self.build_index_request_page(1)

    def parse_index_page(self, response: scrapy.http.Response):
        """Yield follow-up page requests, index items and event requests.

        For each entry in ``Results`` the entry itself is yielded as an item,
        followed by a request for that entry's events.
        """
        data = response.json()
        # Request every page after the current one, up to and including the
        # last. Each page repeats this, so the same requests are yielded many
        # times; Scrapy's default duplicate filter is relied on to drop them.
        # NOTE(review): assumes pages are numbered 1..TotalPages - confirm.
        for page in range(data['CurrentPage'] + 1, data['TotalPages'] + 1):
            yield self.build_index_request_page(page)

        for result in data["Results"]:
            yield result
            yield self.build_course_events_fetch(result["Identity"])

    def parse_course_details(self, response: scrapy.http.Response):
        """Yield every top-level record of the events response as an item."""
        data = response.json()
        for record in data:
            yield record
diff --git a/extract_courses_list b/extract_courses_list
 # Turn the scraped timetables.jl into tab-separated event rows and pipe them
 # to pbcopy (the macOS clipboard utility).
 #
 # Steps:
 #   1. grep keeps only the lines of timetables.jl that contain "Extra"
 #      (presumably the event records, which carry ExtraProperties).
 #   2. jq walks .CategoryEvents (an empty list when the key is missing or
 #      null) and emits one row per event with these columns: Name with
 #      everything from the first "[" removed, Name, HostKey, EventType,
 #      Location, Description, the "Module Name" extra property, the "Weeks"
 #      extra property, StartDateTime, EndDateTime.
 #   3. @tsv with -r prints each row as a raw tab-separated line.
 #
 # NOTE(review): when an event has no matching "Module Name"/"Weeks" entry, or
 # more than one, the select emits zero or several values, so the number of
 # columns in that row changes. Confirm downstream consumers tolerate this.
 grep Extra timetables.jl | jq '.CategoryEvents//[] | .[] | [(.Name | sub("[[].*"; "") ), .Name, .HostKey, .EventType, .Location, .Description, (.ExtraProperties//[] | .[] | select(.DisplayName == "Module Name") | .Value ), (.ExtraProperties//[] | .[] | select(.DisplayName == "Weeks") | .Value), .StartDateTime, .EndDateTime ] | @tsv' -r | pbcopy
diff --git a/scrapy.cfg b/scrapy.cfg
 # Automatically created by: scrapy startproject
 #
 # For more information about the [deploy] section see:
 # https://scrapyd.readthedocs.io/en/latest/deploy.html

 [settings]
 default = settings

diff --git a/settings.py b/settings.py
 # --- Crawler identity and politeness ---------------------------------------
 BOT_NAME = "dcu_opentimetable"
 USER_AGENT = "opentimetable scraper - please add bookmarks support"
 ROBOTSTXT_OBEY = True
 CONCURRENT_REQUESTS = 10

 # --- AutoThrottle -----------------------------------------------------------
 AUTOTHROTTLE_ENABLED = True
 AUTOTHROTTLE_DEBUG = False
 AUTOTHROTTLE_TARGET_CONCURRENCY = 4

 # --- HTTP cache (Scrapy ships with it disabled) -----------------------------
 # Reference:
 # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 HTTPCACHE_ENABLED = True
 HTTPCACHE_DIR = "httpcache"
 HTTPCACHE_EXPIRATION_SECS = 60 * 60  # one hour
 HTTPCACHE_IGNORE_HTTP_CODES = [
     503, 504, 505, 500,
     400, 401, 402, 403, 404,
 ]
 # HTTPCACHE_STORAGE is deliberately not set here; the candidate value was
 # 'scrapy.extensions.httpcache.FilesystemCacheStorage'.
	import scrapy
	import json
	import scrapy.http
	import re
	import datetime

	def build_weeks(first_week: int = 16, last_week: int = 29,
	                week1: datetime.date = datetime.date(2021, 9, 20)) -> list:
	    """Build the ``Weeks`` entries used in an events-filter request payload.

	    Args:
	        first_week: First week number to include (inclusive).
	        last_week: Last week number to include (inclusive).
	        week1: Date on which week number 1 starts.

	    Returns:
	        One dict per week with the keys ``WeekNumber``, ``WeekLabel`` and
	        ``FirstDayInWeek``. The list is empty when ``last_week`` is smaller
	        than ``first_week``. With the default arguments the result covers
	        weeks 16 through 29 counted from 2021-09-20.
	    """
	    weeks = []
	    for week_no in range(first_week, last_week + 1):
	        # Week N starts N-1 whole weeks after the first day of week 1.
	        first_day = week1 + datetime.timedelta(weeks=week_no - 1)
	        weeks.append({
	            "WeekNumber": week_no,
	            "WeekLabel": str(week_no),
	            "FirstDayInWeek": f"{first_day.isoformat()}T00:00:00+00:00",
	        })
	    return weeks

	class TimetableSpider(scrapy.Spider):
	    """Scrape the category index and per-category events from DCU OpenTimetable.

	    Flow: load the site's HTML, find the main JavaScript bundle, read the API
	    authorization value out of it, page through the category index, and
	    request the events of every category found. Index results and event
	    records are yielded unchanged as items.
	    """

	    name = 'timetable'
	    allowed_domains = ['opentimetable.dcu.ie']

	    def _api_headers(self) -> dict:
	        """Return the headers shared by every broker API request.

	        Reads ``self.auth_token``, which is set by ``extract_auth_token``.
	        """
	        return {
	            "Content-Type": "application/json",
	            "Accept": "application/json",
	            "Authorization": self.auth_token
	        }

	    def build_index_request_page(self, pageNo: int) -> scrapy.Request:
	        """Build the POST request for one page of the category index.

	        Args:
	            pageNo: Value sent as the ``pageNumber`` query parameter.

	        Returns:
	            A request whose response is handled by ``parse_index_page``.
	        """
	        # The filter is sent as a JSON body so that it matches the declared
	        # Content-Type. scrapy.FormRequest would url-encode it instead.
	        filters = [{"Identity": "241e4d36-60e0-49f8-b27e-99416745d98d", "Values": ["null"]}]
	        return scrapy.Request(
	            f"https://opentimetable.dcu.ie/broker/api/CategoryTypes/525fe79b-73c3-4b5c-8186-83c652b3adcc/Categories/Filter?pageNumber={pageNo}&query=",
	            method='POST',
	            body=json.dumps(filters),
	            headers=self._api_headers(),
	            callback=self.parse_index_page
	        )

	    def build_course_events_fetch(self, courseId: str):
	        """Build the POST request that fetches the events of one category.

	        Args:
	            courseId: Category identity placed in ``CategoryIdentities``.

	        Returns:
	            A request whose response is handled by ``parse_course_details``.
	        """
	        payload = {
	            "ViewOptions": {
	                "Weeks": build_weeks()
	            },
	            "CategoryIdentities": [
	                courseId
	            ]
	        }

	        return scrapy.Request(
	            "https://opentimetable.dcu.ie/broker/api/categoryTypes/525fe79b-73c3-4b5c-8186-83c652b3adcc/categories/events/filter",
	            body=json.dumps(payload),
	            method='POST',
	            headers=self._api_headers(),
	            callback=self.parse_course_details
	        )

	    def start_requests(self):
	        """Start the crawl at the site's landing page."""
	        yield scrapy.Request("https://opentimetable.dcu.ie", callback=self.parse_index_html)

	    def parse_index_html(self, response: scrapy.http.HtmlResponse):
	        """Request every script whose URL contains 'main' and '.bundle.js'."""
	        for src in response.css("script::attr(src)").getall():
	            if 'main' in src and '.bundle.js' in src:
	                yield scrapy.Request(response.urljoin(src), callback=self.extract_auth_token)

	    def extract_auth_token(self, response: scrapy.http.Response):
	        """Store the API authorization value and request index page 1.

	        Raises:
	            RuntimeError: If the bundle text has no ``apiAuthentication``
	                entry of the expected form.
	        """
	        m = re.search(r'apiAuthentication:["](basic [^"]+)"', response.text)
	        if m is None:
	            # Raised explicitly: an assert would be stripped under ``python -O``.
	            raise RuntimeError(f"apiAuthentication token not found in {response.url}")
	        self.auth_token = m.group(1)
	        yield self.build_index_request_page(1)

	    def parse_index_page(self, response: scrapy.http.Response):
	        """Yield follow-up page requests, index items and event requests.

	        For each entry in ``Results`` the entry itself is yielded as an item,
	        followed by a request for that entry's events.
	        """
	        data = response.json()
	        # Request every page after the current one, up to and including the
	        # last. Each page repeats this, so the same requests are yielded many
	        # times; Scrapy's default duplicate filter is relied on to drop them.
	        # NOTE(review): assumes pages are numbered 1..TotalPages - confirm.
	        for page in range(data['CurrentPage'] + 1, data['TotalPages'] + 1):
	            yield self.build_index_request_page(page)

	        for result in data["Results"]:
	            yield result
	            yield self.build_course_events_fetch(result["Identity"])

	    def parse_course_details(self, response: scrapy.http.Response):
	        """Yield every top-level record of the events response as an item."""
	        data = response.json()
	        for record in data:
	            yield record
	# Automatically created by: scrapy startproject
	#
	# For more information about the [deploy] section see:
	# https://scrapyd.readthedocs.io/en/latest/deploy.html

	[settings]
	default = settings
	# --- Crawler identity and politeness ---------------------------------------
	BOT_NAME = "dcu_opentimetable"
	USER_AGENT = "opentimetable scraper - please add bookmarks support"
	ROBOTSTXT_OBEY = True
	CONCURRENT_REQUESTS = 10

	# --- AutoThrottle -----------------------------------------------------------
	AUTOTHROTTLE_ENABLED = True
	AUTOTHROTTLE_DEBUG = False
	AUTOTHROTTLE_TARGET_CONCURRENCY = 4

	# --- HTTP cache (Scrapy ships with it disabled) -----------------------------
	# Reference:
	# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
	HTTPCACHE_ENABLED = True
	HTTPCACHE_DIR = "httpcache"
	HTTPCACHE_EXPIRATION_SECS = 60 * 60  # one hour
	HTTPCACHE_IGNORE_HTTP_CODES = [
	    503, 504, 505, 500,
	    400, 401, 402, 403, 404,
	]
	# HTTPCACHE_STORAGE is deliberately not set here; the candidate value was
	# 'scrapy.extensions.httpcache.FilesystemCacheStorage'.