Extracting BIC addresses from http://www.bankswiftcode.org using a web scraper
import requests
import sys
from bs4 import BeautifulSoup
from tqdm import tqdm
import re
import time
import traceback
import os
import pickle
from fake_useragent import UserAgent
#from multiprocessing.pool import ThreadPool

ua = UserAgent()
pageurls = []
fileurls = {}     # raw HTML of each BIC list page, keyed by its URL
parsedurls = []   # URLs already fetched, so an interrupted run can resume
pfname1 = "html_parsed_bic_urls.pickel"
pfname2 = "html_parsed_bics.pickel"
def save_pickle(pobj, pfname):
    with open(pfname, "wb") as pf:
        pickle.dump(pobj, pf)

def load_pickle(pfname):
    with open(pfname, "rb") as pf:
        pobj = pickle.load(pf)
    return pobj

# Resume from a previous run if the pickle files already exist
if os.path.exists(pfname1):
    parsedurls = load_pickle(pfname1)
if os.path.exists(pfname2):
    fileurls = load_pickle(pfname2)

pheaders = {'User-Agent': ua.random}
base_url = "http://www.bankswiftcode.org"
# Fetch a BIC list page for a country, save its HTML to the pickle keyed by URL,
# and follow the "Page ..." pagination links recursively
def parse_countr_pgs(url):
    try:
        r = requests.get(url, headers=pheaders)
        time.sleep(10)
        content = r.content
        print("parsing bic list page")
        fileurls[url] = content
        save_pickle(fileurls, pfname2)
        parsedurls.append(url)
        save_pickle(parsedurls, pfname1)
        soup = BeautifulSoup(content, 'html.parser')
        for tag in soup.find_all("a"):
            #print(tag)
            if "Page" in tag.getText():
                print(tag["href"])
                turl = base_url + tag["href"]
                if turl not in parsedurls:
                    print(turl)
                    parse_countr_pgs(turl)
    except Exception:
        traceback.print_exc()
# Fetch a country page and follow the table links to its BIC list pages
def parse_country(url):
    try:
        r = requests.get(url, headers=pheaders)
        time.sleep(10)
        content = r.content
        print("parsing country page")
        soup = BeautifulSoup(content, 'html.parser')
        countries = soup.find_all("table")
        for tbl in countries:
            for tag in tbl.find_all("a"):
                print(tag["href"])
                turl = base_url + tag["href"]
                print(turl)
                parse_countr_pgs(turl)
    except Exception:
        traceback.print_exc()
# Find all country index pages (the "letter" links) on the main page
def parse_spages(url):
    r = requests.get(url, stream=True, headers=pheaders)
    #time.sleep(10)
    print("fetching main page " + url)
    content = b""
    for data in tqdm(r.iter_content()):
        content += data
    print("parsing main page")
    #print(content)
    soup = BeautifulSoup(content, 'html.parser')
    for tag in soup.find_all("a"):
        #print(tag)
        if "letter" in tag.get("href", ""):
            print(tag["href"])
            curl = base_url + tag["href"]
            print(curl)
            parse_country(curl)
try:
    print("url: " + base_url)
    parse_spages(base_url)
except Exception:
    traceback.print_exc()
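
The scraper above only stores the raw HTML of each BIC list page in html_parsed_bics.pickel; it does not extract the codes themselves. Below is a minimal sketch of that follow-up step, assuming the saved pages list the codes inside table cells and that the codes follow the standard 8- or 11-character SWIFT/BIC format; the regex and the cell-scanning loop are assumptions for illustration, not part of the original script.

import pickle
import re
from bs4 import BeautifulSoup

# Standard SWIFT/BIC shape: 4-letter bank code, 2-letter country code,
# 2-character location code, optional 3-character branch code
# (assumption about how the codes appear in the saved pages)
BIC_RE = re.compile(r"\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\b")

with open("html_parsed_bics.pickel", "rb") as pf:
    fileurls = pickle.load(pf)

bics = set()
for url, html in fileurls.items():
    soup = BeautifulSoup(html, "html.parser")
    # Scan the text of every table cell for strings shaped like a BIC
    for cell in soup.find_all("td"):
        bics.update(BIC_RE.findall(cell.get_text()))

print("found %d candidate BIC codes" % len(bics))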