@bobquest33
Last active May 12, 2017 10:58
Extracting BIC addresses from http://www.bankswiftcode.org using a web scraper
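The script below crawls the site in three stages: parse_spages collects the alphabetical "letter" links from the main page, parse_country follows each country's table links, and parse_countr_pgs fetches every paginated BIC listing. Each fetched page's raw HTML is cached in pickle files keyed by URL, so an interrupted crawl can resume where it left off. It runs under Python 2 and depends on the requests, beautifulsoup4, tqdm, and fake-useragent packages.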
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import time
import traceback
import os
import pickle
from fake_useragent import UserAgent

ua = UserAgent()

# Crawl state: raw HTML keyed by URL, plus the list of URLs already fetched,
# so an interrupted run can resume from the pickle files below.
fileurls = {}
parsedurls = []
pfname1 = "html_parsed_bic_urls.pickel"
pfname2 = "html_parsed_bics.pickel"
def save_pickle(pobj, pfname):
    with open(pfname, "wb") as pf:
        pickle.dump(pobj, pf)

def load_pickle(pfname):
    with open(pfname, "rb") as pf:
        pobj = pickle.load(pf)
    return pobj

# Resume from a previous run if the pickle files exist
if os.path.exists(pfname1):
    parsedurls = load_pickle(pfname1)
if os.path.exists(pfname2):
    fileurls = load_pickle(pfname2)

# Send a randomised User-Agent with every request
pheaders = {'User-Agent': ua.random}
base_url = "http://www.bankswiftcode.org"
# Fetch one BIC listing page for a country, cache the raw HTML in the pickle
# keyed by URL, then recurse into any further "Page N" pagination links.
def parse_countr_pgs(url):
    try:
        r = requests.get(url, headers=pheaders)
        time.sleep(10)  # throttle so we don't hammer the site
        content = r.content
        print "parsing bic list page"
        fileurls[url] = content
        save_pickle(fileurls, pfname2)
        parsedurls.append(url)
        save_pickle(parsedurls, pfname1)
        soup = BeautifulSoup(content, 'html.parser')
        for tag in soup.find_all("a"):
            # guard against anchors without an href attribute
            if "Page" in tag.getText() and tag.get("href"):
                print tag["href"]
                turl = base_url + tag["href"]
                if turl not in parsedurls:
                    print turl
                    parse_countr_pgs(turl)
    except Exception:
        traceback.print_exc()
# Fetch a country index page and follow every table link to its BIC listing pages
def parse_country(url):
    try:
        r = requests.get(url, headers=pheaders)
        time.sleep(10)  # throttle between requests
        content = r.content
        print "parsing country page"
        soup = BeautifulSoup(content, 'html.parser')
        countries = soup.find_all("table")
        for tbl in countries:
            for tag in tbl.find_all("a"):
                if not tag.get("href"):
                    continue
                print tag["href"]
                curl = base_url + tag["href"]
                print curl
                parse_countr_pgs(curl)
    except Exception:
        traceback.print_exc()
# Fetch the main page and follow every alphabetical "letter" link to a country index
def parse_spages(url):
    r = requests.get(url, stream=True, headers=pheaders)
    print "fetching main page " + url
    content = ""
    for data in tqdm(r.iter_content()):  # stream the body with a progress bar
        content += data
    print "parsing main page"
    soup = BeautifulSoup(content, 'html.parser')
    for tag in soup.find_all("a"):
        href = tag.get("href", "")
        if "letter" in href:
            print href
            curl = base_url + href
            print curl
            parse_country(curl)
# Kick off the crawl from the site root
try:
    print "url:" + base_url
    parse_spages(base_url)
except Exception:
    traceback.print_exc()
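The crawl only caches raw HTML; pulling the actual SWIFT/BIC codes out is a separate pass over the pickle. Below is a minimal sketch of that second pass, assuming the codes sit in table cells and match the standard 8- or 11-character BIC shape — the regex and the table layout are assumptions about the site's markup, not something the script above guarantees.

import pickle
import re
from bs4 import BeautifulSoup

# Load the URL -> raw HTML cache written by the crawler above
with open("html_parsed_bics.pickel", "rb") as pf:
    fileurls = pickle.load(pf)

# ISO 9362 BIC shape: 4 bank letters, 2 country letters, 2 location
# characters, optional 3-character branch code. That BICs appear in <td>
# cells is an assumption about bankswiftcode.org's markup; adjust the
# cell selection if the cached pages are structured differently.
bic_re = re.compile(r"^[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}([A-Z0-9]{3})?$")

bics = set()
for url, content in fileurls.items():
    soup = BeautifulSoup(content, 'html.parser')
    for cell in soup.find_all("td"):
        text = cell.getText().strip()
        if bic_re.match(text):
            bics.add(text)

print "found %d unique BICs" % len(bics)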