lanky · May 20, 2021 20:29
diff --git a/scrapehits.py b/scrapehits.py
 #!/usr/bin/env python3
 #!/usr/bin/env python3
 # pattern matching
 import re
 # HTML fetching
 import requests
 # HTML parsing
 from bs4 import BeautifulSoup
 # for CSV output
 from csv import DictWriter

 if __name__ == "__main__":
    # Fetch the chart page, keep every table row whose first cell starts
    # with a DD/MM/YYYY date, and write those rows to numberones.csv.

    # field titles for the output; assigned to each row's cells by position
    fields = ['date', 'artist', 'title', 'wks']
    # wanted rows start with a date in DD/MM/YYYY format; compiled once so
    # the pattern is not looked up again for every row
    date_pattern = re.compile(r'\d{2}/\d{2}/\d{4}')
    # a list to hold our matched table rows
    records = []

    # try and fetch the URL; the timeout stops a stalled server from
    # hanging the script forever
    req = requests.get(
        "https://www.officialcharts.com/chart-news/all-the-number-1-singles__7931/",
        timeout=30,
    )

    if not req.ok:
        # report the failure instead of finishing silently with no output
        raise SystemExit(
            "failed to fetch chart page: HTTP {}".format(req.status_code)
        )

    # we managed to fetch the webpage, now parse it
    soup = BeautifulSoup(req.content, features="lxml")
    # find all the tables in the webpage
    for table in soup.find_all('table'):
        for tr in table.find_all('tr'):
            # take the first text node of each 'td' element, converted to
            # 'title case'; a cell with no text at all becomes ''
            data = [
                (td.find(string=True) or '').title()
                for td in tr.find_all("td")
            ]

            # skip rows that have no 'td' cells (for example rows made only
            # of 'th' cells) and rows that don't start with a date
            if data and date_pattern.match(data[0]):
                records.append(dict(zip(fields, data)))

    # open a new CSV file to write to. newline='' is what the csv module
    # asks for so rows are not double-spaced on Windows; the explicit
    # encoding keeps non-ASCII names from failing on non-UTF-8 locales
    with open("numberones.csv", "w", newline='', encoding="utf-8") as out:
        writer = DictWriter(out, fieldnames=fields, restval='')
        writer.writeheader()
        writer.writerows(records)
	#!/usr/bin/env python3
	#!/usr/bin/env python3
	# pattern matching
	import re
	# HTML fetching
	import requests
	# HTML parsing
	from bs4 import BeautifulSoup
	# for CSV output
	from csv import DictWriter

	if __name__ == "__main__":
	    # Fetch the chart page, keep every table row whose first cell starts
	    # with a DD/MM/YYYY date, and write those rows to numberones.csv.

	    # field titles for the output; assigned to each row's cells by position
	    fields = ['date', 'artist', 'title', 'wks']
	    # wanted rows start with a date in DD/MM/YYYY format; compiled once so
	    # the pattern is not looked up again for every row
	    date_pattern = re.compile(r'\d{2}/\d{2}/\d{4}')
	    # a list to hold our matched table rows
	    records = []

	    # try and fetch the URL; the timeout stops a stalled server from
	    # hanging the script forever
	    req = requests.get(
	        "https://www.officialcharts.com/chart-news/all-the-number-1-singles__7931/",
	        timeout=30,
	    )

	    if not req.ok:
	        # report the failure instead of finishing silently with no output
	        raise SystemExit(
	            "failed to fetch chart page: HTTP {}".format(req.status_code)
	        )

	    # we managed to fetch the webpage, now parse it
	    soup = BeautifulSoup(req.content, features="lxml")
	    # find all the tables in the webpage
	    for table in soup.find_all('table'):
	        for tr in table.find_all('tr'):
	            # take the first text node of each 'td' element, converted to
	            # 'title case'; a cell with no text at all becomes ''
	            data = [
	                (td.find(string=True) or '').title()
	                for td in tr.find_all("td")
	            ]

	            # skip rows that have no 'td' cells (for example rows made only
	            # of 'th' cells) and rows that don't start with a date
	            if data and date_pattern.match(data[0]):
	                records.append(dict(zip(fields, data)))

	    # open a new CSV file to write to. newline='' is what the csv module
	    # asks for so rows are not double-spaced on Windows; the explicit
	    # encoding keeps non-ASCII names from failing on non-UTF-8 locales
	    with open("numberones.csv", "w", newline='', encoding="utf-8") as out:
	        writer = DictWriter(out, fieldnames=fields, restval='')
	        writer.writeheader()
	        writer.writerows(records)