Skip to content

Instantly share code, notes, and snippets.

@bobquest33
Created May 12, 2017 11:03
Show Gist options
  • Select an option

  • Save bobquest33/8a0c45048498a2ba28c883639a065b09 to your computer and use it in GitHub Desktop.

Select an option

Save bobquest33/8a0c45048498a2ba28c883639a065b09 to your computer and use it in GitHub Desktop.
The following script reads BIC data from a pickle file, filters it, and saves the refined data into another pickle file.
import requests
import sys
from bs4 import BeautifulSoup
from tqdm import tqdm
import re
import time
import traceback
import os
import pickle
from fake_useragent import UserAgent
#from multiprocessing.pool import ThreadPool
# fake_useragent helper; presumably meant to supply User-Agent headers for
# requests -- it is not referenced again in the visible part of this script.
ua = UserAgent()
# Not referenced again in the visible part of this script.
pageurls = []
# Cached pages: each key is later exported in the "url" column and slugified
# into a file name; each value is the page's HTML.
fileurls = {}
# URLs recorded by earlier runs (de-duplicated and re-saved below).
parsedurls = []
# Pickle cache files for ``parsedurls`` and ``fileurls`` respectively.  The
# "pickel" spelling is part of the on-disk file names and must stay as-is for
# existing caches to be found.
pfname1 = "html_parsed_bic_urls.pickel"
pfname2 = "html_parsed_bics.pickel"
def save_pickle(pobj, pfname):
    """Serialize ``pobj`` to the file ``pfname`` with pickle.

    Args:
        pobj: Any picklable object.
        pfname: Path of the file to write; an existing file is overwritten.
    """
    with open(pfname, "wb") as pf:
        pickle.dump(pobj, pf)
def load_pickle(pfname):
    """Load and return the object pickled in the file ``pfname``.

    Args:
        pfname: Path of an existing pickle file.

    Returns:
        The unpickled object.

    Raises:
        IOError/OSError: If the file cannot be opened.
    """
    # SECURITY: unpickling can execute arbitrary code.  Only load cache files
    # that this script wrote itself, never files from an untrusted source.
    with open(pfname, "rb") as pf:
        return pickle.load(pf)
# Restore state cached by earlier runs, when the cache files exist.
if os.path.exists(pfname1):
    parsedurls = load_pickle(pfname1)
if os.path.exists(pfname2):
    fileurls = load_pickle(pfname2)

# Drop duplicate URLs and persist the cleaned list.  The set round-trip does
# not preserve the original ordering.
# NOTE(review): the original indentation was lost; these two statements are
# assumed to run unconditionally rather than inside the second ``if``.
parsedurls = list(set(parsedurls))
save_pickle(parsedurls, pfname1)
def slugify(value):
    """Turn ``value`` into a lowercase, hyphen-separated ASCII slug.

    The text is NFKD-normalized and stripped of everything that cannot be
    represented in ASCII; characters other than word characters (letters,
    digits, underscore), whitespace and hyphens are then removed, and every
    run of whitespace/hyphens collapses into a single hyphen.

    Args:
        value: Text to convert.  Byte strings are accepted and decoded as
            UTF-8 (undecodable bytes are dropped).

    Returns:
        The slug as a text string (``unicode`` on Python 2, ``str`` on
        Python 3).
    """
    import unicodedata

    if isinstance(value, bytes):
        value = value.decode("utf-8", "ignore")
    # encode() drops the non-ASCII remainder; decode straight back to text so
    # the str regexes below work on Python 3 as well as Python 2.
    ascii_text = (
        unicodedata.normalize("NFKD", value)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    cleaned = re.sub(r"[^\w\s-]", "", ascii_text).strip().lower()
    return re.sub(r"[-\s]+", "-", cleaned)
# Write every cached page to html_files/<slug>.html.
html_dir = "html_files"
if not os.path.isdir(html_dir):
    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(html_dir)
for val in fileurls:
    print(val)
    filename = slugify(val) + ".html"
    page = fileurls[val]
    # The file is opened in binary mode, so text has to be encoded first.
    if not isinstance(page, bytes):
        page = page.encode("utf-8")
    # os.path.join instead of a hard-coded "\\" keeps the path valid on
    # non-Windows systems too.
    with open(os.path.join(html_dir, filename), "wb") as fp:
        fp.write(page)
import html2text
import re
import pandas as pd
# Line-level pattern for something shaped like a BIC/SWIFT code: 4 letters,
# 2 letters, 2 alphanumerics and an optional 3-alphanumeric suffix.  There are
# no word boundaries, so any run of eight letters matches as well.
# Earlier, word-bounded variant kept for reference:
#pattern = "^.*(\b[a-zA-Z]){4}([a-zA-Z]){2}([0-9a-zA-Z]){2}([0-9a-zA-Z]{3}\b)?.*$"
pattern = "^.*([a-zA-Z]){4}([a-zA-Z]){2}([0-9a-zA-Z]){2}([0-9a-zA-Z]{3})?.*$"
# Compiled once; it is applied to every line of every page below.
bic_re = re.compile(pattern)
h = html2text.HTML2Text()
recs = []
for val in fileurls:
    print(val)
    page = fileurls[val]
    # html2text needs text.  Non-ASCII characters are dropped, matching the
    # original Python 2 ``str(...).decode('ascii', 'ignore')``.
    if isinstance(page, bytes):
        page = page.decode('ascii', 'ignore')
    else:
        page = page.encode('ascii', 'ignore').decode('ascii')
    txt = h.handle(page)
    # Keep table rows (they contain "|") plus any line that looks like it
    # carries a BIC code.
    lines = [x for x in txt.split("\n") if "|" in x or bic_re.search(x)]
    for i, x in enumerate(lines):
        # A data row has exactly four "|": an ignored leading cell followed
        # by institution, city, branch and swift.
        # NOTE(review): the original indentation was lost; rows are recorded
        # only for such 4-pipe lines -- confirm against the original gist.
        if x.count("|") != 4:
            continue
        v = x.split("|")[1:] + [val]
        # When the swift cell is empty the code may have wrapped onto the
        # next kept line; the bounds check avoids an IndexError on the last
        # line.
        if (
            v[3].strip() == ""
            and i + 1 < len(lines)
            and bic_re.search(lines[i + 1])
        ):
            v[3] = lines[i + 1].strip().replace("|", "")
        recs.append(v)
headers = ["institution","city_heading", "branch_name", "swift","url"]
df = pd.DataFrame(recs, columns=headers)
df.to_csv("total_bic.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment