Created
May 12, 2017 11:03
-
-
Save bobquest33/8a0c45048498a2ba28c883639a065b09 to your computer and use it in GitHub Desktop.
The following script reads BIC data from a pickle file, filters it, and saves the refined data to a CSV file (`total_bic.csv`).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| import sys | |
| from bs4 import BeautifulSoup | |
| from tqdm import tqdm | |
| import re | |
| import time | |
| import traceback | |
| import os | |
| import pickle | |
| from fake_useragent import UserAgent | |
| #from multiprocessing.pool import ThreadPool | |
# Random User-Agent generator from fake_useragent; not referenced again in the
# code shown here.
ua = UserAgent()
# Not referenced again in the code shown here.
pageurls = []
# Maps each key to the stored page content for that key; the keys are later
# slugified into file names and the values are written out and parsed.
# Replaced by the content of pfname2 when that file exists.
fileurls = {}
# List that is replaced by the content of pfname1 when that file exists, then
# de-duplicated and written back.
parsedurls = []
# Pickle cache file names (the ".pickel" spelling is part of the on-disk name).
pfname1 = "html_parsed_bic_urls.pickel"
pfname2 = "html_parsed_bics.pickel"
def save_pickle(pobj, pfname):
    """Pickle *pobj* into the file named *pfname*.

    The file is opened in binary write mode, so any previous content is
    replaced. Returns None.
    """
    with open(pfname, "wb") as sink:
        pickle.dump(pobj, sink)
def load_pickle(pfname):
    """Return the object unpickled from the file named *pfname*.

    The file is opened in binary read mode; errors from open() or
    pickle.load() are not caught here.
    """
    with open(pfname, "rb") as source:
        return pickle.load(source)
# Restore state from the pickle cache files when they exist; otherwise the
# empty defaults assigned above stay in place.
if os.path.exists(pfname1):
    parsedurls = load_pickle(pfname1)
if os.path.exists(pfname2):
    fileurls = load_pickle(pfname2)
# Drop duplicate entries (going through set() does not preserve the original
# order) and write the de-duplicated list back to pfname1.
parsedurls = list(set(parsedurls))
save_pickle(parsedurls,pfname1)
def slugify(value):
    """
    Turn *value* into a lowercase ASCII slug.

    Accented characters are reduced to their ASCII base letter, every
    character that is not a letter, digit, underscore, whitespace or hyphen
    is removed, leading/trailing whitespace is stripped, and each run of
    whitespace and/or hyphens is collapsed into a single hyphen.

    *value* may be text or a byte string (bytes are decoded as UTF-8 and
    undecodable bytes are dropped). The result is a text string and may be
    empty when *value* contains no usable characters.
    """
    import unicodedata

    if isinstance(value, bytes):
        # unicodedata.normalize() only accepts text, so byte strings (plain
        # ``str`` on Python 2) have to be decoded first.
        value = value.decode('utf-8', 'ignore')
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then drops the marks and every other non-ASCII
    # character while leaving a text string for the regexes below.
    value = unicodedata.normalize('NFKD', value)
    value = value.encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)
# Write every stored page to html_files/<slug-of-key>.html.
html_dir = "html_files"
# Create the output directory on first use; open() below would otherwise fail.
if not os.path.isdir(html_dir):
    os.makedirs(html_dir)
for val in fileurls:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(val)
    filename = slugify(val)+".html"
    # os.path.join picks the right separator for the platform; a hard-coded
    # backslash only works as a directory separator on Windows.
    with open(os.path.join(html_dir, filename),"wb") as fp:
        fp.write(fileurls[val])
| import html2text | |
| import re | |
| import pandas as pd | |
# Loose matcher for a BIC/SWIFT-looking token anywhere in a line: four
# letters, two letters, two alphanumerics and an optional three-character
# alphanumeric suffix.
# Earlier, word-bounded attempt kept for reference. NOTE(review): in a non-raw
# string literal "\b" is the backspace character, not a regex word boundary,
# which is presumably why this variant was dropped.
#pattern = "^.*(\b[a-zA-Z]){4}([a-zA-Z]){2}([0-9a-zA-Z]){2}([0-9a-zA-Z]{3}\b)?.*$"
pattern = "^.*([a-zA-Z]){4}([a-zA-Z]){2}([0-9a-zA-Z]){2}([0-9a-zA-Z]{3})?.*$"
# Compiled once instead of being re-looked-up for every line.
_bic_re = re.compile(pattern)


def _table_rows(lines, url, bic_re):
    """Return one 5-item record per line of *lines* that has exactly four "|".

    Each record is the line's cells after the first "|" plus *url* as the
    last item. When the fourth cell (the "swift" column) is blank and the
    following line matches *bic_re*, that following line (stripped, with
    "|" removed) is used as the cell value.

    NOTE(review): the original indentation was lost; this assumes records
    are only collected for four-pipe lines, not for every kept line.
    """
    rows = []
    for i, line in enumerate(lines):
        if line.count("|") != 4:
            continue
        row = line.split("|")[1:] + [url]
        # Guard the look-ahead: the last kept line has no successor.
        if row[3].strip() == "" and i + 1 < len(lines):
            follow = lines[i + 1]
            if bic_re.search(follow):
                row[3] = follow.strip().replace("|", "")
        rows.append(row)
    return rows


h = html2text.HTML2Text()
recs = []
for val in fileurls:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(val)
    raw = fileurls[val]
    # Reduce the page to pure-ASCII text whether it was stored as bytes or as
    # text, dropping every non-ASCII character.
    if not isinstance(raw, bytes):
        raw = raw.encode('ascii', 'ignore')
    txt = h.handle(raw.decode('ascii', 'ignore'))
    # Keep table rows and any line that contains a BIC-looking token.
    lines = [x for x in txt.split("\n") if "|" in x or _bic_re.search(x)]
    recs.extend(_table_rows(lines, val, _bic_re))
headers = ["institution","city_heading", "branch_name", "swift","url"]
df = pd.DataFrame(recs, columns=headers)
df.to_csv("total_bic.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment