Last active
March 11, 2025 13:49
-
-
Save jborean93/8bd09a3314311c78fe2939b88bb82f4f to your computer and use it in GitHub Desktop.
Cross platform way to search for and download updates listed in the Microsoft Update catalog
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Copyright: (c) 2019, Jordan Borean (@jborean93) <[email protected]> | |
# MIT License (see LICENSE or https://opensource.org/licenses/MIT) | |
# Script to search for updates in the Microsoft Update Catalog. Works on both Python 2 and 3 but requires BeautifulSoup | |
# to be installed - https://www.crummy.com/software/BeautifulSoup/#Download | |
import contextlib | |
import datetime | |
import json | |
import re | |
import uuid | |
from bs4 import BeautifulSoup | |
try: | |
from urllib.parse import quote, urlencode, urlparse | |
from urllib.request import Request, urlopen, urlretrieve | |
except ImportError: # Python 2 | |
from urllib import quote, urlencode, urlretrieve | |
from urllib2 import Request, urlopen | |
from urlparse import urlparse | |
# Base URL for all Microsoft Update catalog requests (search, details, downloads).
CATALOG_URL = 'https://www.catalog.update.microsoft.com/'

# Extracts "[<id>].url = '<url>'" assignments from the DownloadDialog.aspx response.
# The host part accepts any subdomain chain ending in download.windowsupdate.com
# (e.g. www.download.windowsupdate.com, catalog.s.download.windowsupdate.com).
# The previous pattern ('w{0,3}.?download...') missed hosts such as
# 'catalog.s.download.windowsupdate.com', which driver updates use, causing
# get_download_urls() to raise "Failed to find any download links".
DOWNLOAD_PATTERN = re.compile(
    r'\[(\d*)\]\.url = [\"\'](http[s]?://(?:[\w\-]+\.)*download\.windowsupdate\.com/[^\'\"]*)')

# Splits a product cell on commas that are NOT followed by whitespace; the
# catalog joins product names with ',' but product names themselves contain ', '.
PRODUCT_SPLIT_PATTERN = re.compile(r',(?=[^\s])')
@contextlib.contextmanager
def fetch_url(url, data=None, headers=None):
    """
    Open *url* as a context manager, yielding the HTTP response object and
    guaranteeing it is closed on exit. Python 2's urlopen() is not usable in a
    ``with`` statement directly, so this wrapper works on both versions.

    :param url: The URL to request.
    :param data: Optional bytes to POST with the request.
    :param headers: Optional dict of request headers to send.
    """
    response = urlopen(Request(url, data=data, headers=headers))
    try:
        yield response
    finally:
        response.close()
class WUDownloadInfo:

    def __init__(self, download_id, url, raw):
        """
        Metadata for a single download link of an update. One update may expose
        several download links; each link gets its own instance.

        :param download_id: The ID that relates to the download URL.
        :param url: The download URL for this entry.
        :param raw: The raw response text of the downloads page.
        """
        self.url = url
        self.digest = None
        self.architectures = None
        self.languages = None
        self.long_languages = None
        self.file_name = None

        # The downloads page embeds each field as a JS assignment like
        # "[<id>].fileName = '...';" - scrape every field we care about.
        for attr, js_name in [
            ('digest', 'digest'),
            ('architectures', 'architectures'),
            ('languages', 'languages'),
            ('long_languages', 'longLanguages'),
            ('file_name', 'fileName'),
        ]:
            pattern = r"\[%s]\.%s = ['\"]([\w\-\.=+\/\(\) ]*)['\"];" % (
                re.escape(download_id), re.escape(js_name))
            match = re.search(pattern, raw)
            if match:
                setattr(self, attr, match.group(1))

    def __str__(self):
        name = self.file_name or "unknown"
        language = self.long_languages or "unknown language"
        return "%s - %s" % (name, language)
class WindowsUpdate:

    def __init__(self, raw_element):
        """
        Stores information about a Windows Update entry scraped from one row of
        the Microsoft Update catalog search results table.

        :param raw_element: The raw XHTML <tr> element that has been parsed by BeautifulSoup4.
        """
        # Fixed column layout of the catalog results table: title, products,
        # classification, last updated, version, size, select checkbox.
        cells = raw_element.find_all('td')

        self.title = cells[1].get_text().strip()

        # Split , if there is no space ahead.
        products = cells[2].get_text().strip()
        self.products = list(filter(None, re.split(PRODUCT_SPLIT_PATTERN, products)))

        self.classification = cells[3].get_text().strip()
        self.last_updated = datetime.datetime.strptime(cells[4].get_text().strip(), '%m/%d/%Y')
        self.version = cells[5].get_text().strip()
        # NOTE(review): the 2nd <span> of the size cell looks like the raw
        # numeric size (the 1st being the display text) - confirm against the page.
        self.size = int(cells[6].find_all('span')[1].get_text().strip())
        # The checkbox input's id attribute carries the update's GUID.
        self.id = uuid.UUID(cells[7].find('input').attrs['id'])

        # Caches for values that require an extra request to the details page
        # (see _get_details()); each is populated on first property access.
        self._details = None
        self._architecture = None
        self._description = None
        self._download_urls = None
        self._kb_numbers = None
        self._more_information = None
        self._msrc_number = None
        self._msrc_severity = None
        self._support_url = None

    @property
    def architecture(self):
        """ The architecture of the update. """
        if not self._architecture:
            details = self._get_details()
            # The value is the bare text node directly after the label separator.
            raw_arch = details.find(id='ScopedViewHandler_labelArchitecture_Separator')
            self._architecture = raw_arch.next_sibling.strip()

        return self._architecture

    @property
    def description(self):
        """ The description of the update. """
        if not self._description:
            details = self._get_details()
            self._description = details.find(id='ScopedViewHandler_desc').get_text()

        return self._description

    @property
    def download_url(self):
        """
        The download URL of the update, will fail if the update contains multiple packages.

        :raises ValueError: When the update does not have exactly one download
            link; use get_download_urls() and filter manually in that case.
        """
        download_urls = self.get_download_urls()

        if len(download_urls) != 1:
            raise ValueError("Expecting only 1 download link for '%s', received %d. Use get_download_urls() and "
                             "filter it based on your criteria." % (str(self), len(download_urls)))

        return download_urls[0].url

    @property
    def kb_numbers(self):
        """ A list of KB article numbers that apply to the update. """
        if self._kb_numbers is None:
            details = self._get_details()
            raw_kb = details.find(id='ScopedViewHandler_labelKBArticle_Separator')

            # If no KB's apply then the value will be n/a. Technically an update can have multiple KBs but I have
            # not been able to find an example of this so cannot test that scenario.
            self._kb_numbers = [int(n.strip()) for n in list(raw_kb.next_siblings) if n.strip().lower() != 'n/a']

        return self._kb_numbers

    @property
    def more_information(self):
        """ Typically the URL of the KB article for the update but it can be anything. """
        if self._more_information is None:
            details = self._get_details()
            raw_info = details.find(id='ScopedViewHandler_labelMoreInfo_Separator')
            self._more_information = list(raw_info.next_siblings)[1].get_text().strip()

        return self._more_information

    @property
    def msrc_number(self):
        """ The MSRC Number for the update, set to n/a if not defined. """
        if self._msrc_number is None:
            details = self._get_details()
            raw_info = details.find(id='ScopedViewHandler_labelSecurityBulliten_Separator')
            self._msrc_number = list(raw_info.next_siblings)[0].strip()

        return self._msrc_number

    @property
    def msrc_severity(self):
        """ The MSRC severity level for the update, set to Unspecified if not defined. """
        if self._msrc_severity is None:
            details = self._get_details()
            self._msrc_severity = details.find(id='ScopedViewHandler_msrcSeverity').get_text().strip()

        return self._msrc_severity

    @property
    def support_url(self):
        """ The support URL for the update. """
        if self._support_url is None:
            details = self._get_details()
            raw_info = details.find(id='ScopedViewHandler_labelSupportUrl_Separator')
            self._support_url = list(raw_info.next_siblings)[1].get_text().strip()

        return self._support_url

    def get_download_urls(self):
        """
        Get a list of WUDownloadInfo objects for the current update. These objects contain the download URL for all the
        packages inside the update.

        :raises ValueError: When no download links could be scraped from the response.
        """
        if self._download_urls is None:
            # DownloadDialog.aspx expects a JSON encoded update id list in the
            # POST body and responds with JS that embeds the download links.
            update_ids = json.dumps({
                'size': 0,
                'updateID': str(self.id),
                'uidInfo': str(self.id),
            })
            data = urlencode({'updateIDs': '[%s]' % update_ids}).encode('utf-8')

            headers = {
                'Content-Type': 'application/x-www-form-urlencoded',
            }
            with fetch_url('%s/DownloadDialog.aspx' % CATALOG_URL, data=data, headers=headers) as resp:
                resp_text = resp.read().decode('utf-8').strip()

            link_matches = re.findall(DOWNLOAD_PATTERN, resp_text)
            if len(link_matches) == 0:
                raise ValueError("Failed to find any download links for '%s'" % str(self))

            download_urls = []
            for download_id, url in link_matches:
                download_urls.append(WUDownloadInfo(download_id, url, resp_text))

            self._download_urls = download_urls

        return self._download_urls

    def _get_details(self):
        # Fetch the update's details page once and cache the parsed document;
        # every detail property above reads from this cache.
        if not self._details:
            headers = {
                'Content-Type': 'application/x-www-form-urlencoded',
            }
            with fetch_url('%s/ScopedViewInline.aspx?updateid=%s' % (CATALOG_URL, str(self.id)),
                           headers=headers) as resp:
                resp_text = resp.read().decode('utf-8').strip()
            self._details = BeautifulSoup(resp_text, 'html.parser')

        return self._details

    def __str__(self):
        return self.title
def find_updates(search, all_updates=False, sort=None, sort_reverse=False, data=None):
    """
    Generator function that yields WindowsUpdate objects for each update found on the Microsoft Update catalog.
    Yields a list of updates from the Microsoft Update catalog. These updates can then be downloaded locally using the
    .download(path) function.

    :param search: The search string used when searching the update catalog.
    :param all_updates: Set to True to continue to search on all pages and not just the first 25. This can dramatically
        increase the runtime of the script so use with caution.
    :param sort: The field name as seen in the update catalog GUI to sort by. Setting this will result in 1 more call
        to the catalog URL.
    :param sort_reverse: Reverse the sort after initially sorting it. Setting this will result in 1 more call after
        the sort call to the catalog URL.
    :param data: Data to post to the request, used when getting all pages
    :return: Yields the WindowsUpdate objects found.
    """
    post_headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    if data:
        data = urlencode(data).encode('utf-8')

    search_url = '%s/Search.aspx?q=%s' % (CATALOG_URL, quote(search))
    with fetch_url(search_url, data=data, headers=post_headers) as resp:
        catalog = BeautifulSoup(resp.read().decode('utf-8').strip(), 'html.parser')

    def build_action_data(action):
        # Actions like sorting or paging are ASP.NET postbacks; they must echo
        # back these hidden form fields from the response we just received.
        action_data = {
            '__EVENTTARGET': action,
        }
        for field in ['__EVENTARGUMENT', '__EVENTVALIDATION', '__VIEWSTATE', '__VIEWSTATEGENERATOR']:
            element = catalog.find(id=field)
            if element:
                action_data[field] = element.attrs['value']

        return action_data

    rows = catalog.find(id='ctl00_catalogBody_updateMatches').find_all('tr')
    header_row = rows[0]  # First table row holds the column headers used for sorting.

    if sort:
        # Map each visible header label to its JS postback target, then issue
        # the sort request by re-calling ourselves with the postback data.
        event_targets = {}
        for link in header_row.find_all('a'):
            event_targets[link.find('span').get_text()] = link.attrs['id'].replace('_', '$')

        sort_data = build_action_data(event_targets[sort])

        # Sorting twice flips the order, so only keep the sort field when a
        # descending sort was requested.
        next_sort = sort if sort_reverse else None
        for update in find_updates(search, all_updates, sort=next_sort, data=sort_data):
            yield update
        return

    for row in rows[1:]:
        yield WindowsUpdate(row)

    # ctl00_catalogBody_nextPage is set when there are no more updates to retrieve.
    if all_updates and not catalog.find(id='ctl00_catalogBody_nextPage'):
        page_data = build_action_data('ctl00$catalogBody$nextPageLinkText')
        for update in find_updates(search, True, data=page_data):
            yield update
DOWNLOAD_PATTERN = re.compile(r'\[(\d*)\]\.url = [\"\'](https://catalog\.s\.download\.windowsupdate\.com/[^\'\"\\]*)')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The
DOWNLOAD_PATTERN = re.compile(r'\[(\d*)\]\.url = [\"\'](http[s]?://w{0,3}.?download\.windowsupdate\.com/[^\'\"]*)')
doesn't work for all updates. For example, the update
93cc4b1f-82f5-43ab-a1a6-ccda3b7ef829
has the following download url:https://catalog.s.download.windowsupdate.com/d/msdownload/update/driver/drvs/2022/10/3ebe973e-0017-4a82-9886-a18b74521bcf_3bc69bbf9676bbe4380c6bf2ee73ed51ce0d98c0.cab
.