Created
April 26, 2023 07:35
-
-
Save jfmaes/e95a4ee60dbad163654b8e72d863240c to your computer and use it in GitHub Desktop.
MSDN function definition scraper. Requires the Chromium driver.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import time
from urllib.parse import quote_plus

import selenium as se
from bs4 import BeautifulSoup
from selenium import webdriver
def main():
    """Print the MSDN function definition for each requested method name.

    Reads a comma-separated list of names from the required ``--methods``
    command-line option. For every name it loads the Microsoft Docs search
    page in a headless Chrome session, follows the top search result, and
    prints the text of the first ``<code>`` element found on that page.

    If there is no top search result, or the result page has no non-empty
    ``<code>`` element, ``Function "<name>" not found`` is printed instead.

    Raises:
        SystemExit: via ``argparse`` when ``--methods`` is missing or
            contains no non-empty name.
    """
    parser = argparse.ArgumentParser(description = 'extract function definitions from MSDN')
    parser.add_argument('--methods', help='list of methods',required=True)
    args = parser.parse_args()

    # Drop surrounding whitespace and empty entries so that input such as
    # "CreateFileA, VirtualAlloc," does not trigger searches for blank terms.
    methods_list = [name.strip() for name in args.methods.split(',') if name.strip()]
    if not methods_list:
        parser.error('--methods must contain at least one non-empty name')

    # A real browser is needed because the search results are rendered by
    # JavaScript; a plain HTTP request would not contain them.
    options = se.webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    browser = se.webdriver.Chrome(options=options)
    try:
        for method in methods_list:
            # URL-encode the term so characters such as '&', '#' or spaces
            # cannot corrupt the query string.
            url = f'https://docs.microsoft.com/en-us/search/?terms={quote_plus(method)}'
            browser.get(url)
            # Short pause to give the page's JavaScript time to render results.
            time.sleep(0.1)

            search_soup = BeautifulSoup(browser.page_source, 'html.parser')
            # Only the top search result is considered. Tweak this selector
            # if more robust searching is needed.
            top_result = search_soup.find('a', {"data-bi-name" : "searchItem.0"})
            target = top_result.get('href') if top_result else None
            if not target:
                print(f'Function "{method}" not found')
                continue

            browser.get(target)
            page_soup = BeautifulSoup(browser.page_source, 'html.parser')
            # The first <code> element is taken to be the syntax section;
            # find() returns None when the page has no such element.
            code_tag = page_soup.find('code')
            function_definition = code_tag.text if code_tag else ''
            if function_definition:
                print(function_definition)
            else:
                print(f'Function "{method}" not found')
    finally:
        # Always shut the driver down so no headless Chrome process is left
        # running, even when a lookup raises.
        browser.quit()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment