Created
September 18, 2020 08:18
-
-
Save kzinmr/0a4c974a469ad3393c9115494c26c341 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%%time | |
import xmltodict | |
import unicodedata | |
import requests | |
import datetime | |
def download_leidata():
    """Download today's JPX LEI full file and save it as XML in the working directory.

    The file stem embeds the current local date (``YYYYMMDD``), so the request
    targets the file named after today.

    Returns:
        The path of the saved ``.xml`` file, or ``None`` when the server did
        not answer with a success status.

    Raises:
        requests.exceptions.RequestException: on connection errors or when the
            server does not respond within the timeout.
    """
    dt = datetime.datetime.today().strftime('%Y%m%d')
    filestem = f'JPX-LEI-JPXFullFile_v2_{dt}'
    filepath = filestem + '.xml'
    url = (
        'https://www.lei.jpx.co.jp/lei/en/full_file.html'
        f'?FileIDLatest2={filestem}&RRFullHistory=&RRExceptionHistory='
    )
    # requests never times out by default; bound the wait so a stalled
    # server cannot hang the caller forever.
    response = requests.get(url, timeout=60)
    if not response.ok:
        return None
    # Store the payload byte-for-byte instead of decoding it and re-encoding
    # with the platform default encoding, which can fail on non-UTF-8 locales
    # and rewrites line endings on Windows.
    with open(filepath, 'wb') as fp:
        fp.write(response.content)
    return filepath
# Markers looked for in the last five characters (lower-cased) of a name.
_COMPANY_SUFFIXES = (
    '株式会社', '合同会社', '有限会社', '目的会社', '相互会社', '投資法人',
    '信用金庫', '中央金庫', '連合会', 'ltd', 'mited', 'inc', ' llc',
)
# Markers looked for in the first five characters (lower-cased) of a name.
_COMPANY_PREFIXES = (
    '株式会社', '合同会社', '有限会社', '医療法人', '一般社団', '一般財団',
    '公益財団', '独立行政', '学校法人', '国立大学',
)


def _is_company_name(name):
    """Return True when *name* is not fund-like and carries a known company marker."""
    if 'ファンド' in name or 'fund' in name.lower() or '為替ヘッジ' in name:
        return False
    tail = name[-5:].lower()
    head = name[:5].lower()
    return (
        any(s in tail for s in _COMPANY_SUFFIXES)
        or any(p in head for p in _COMPANY_PREFIXES)
    )


def extract_lei_companies(filepath):
    """Return the sorted, de-duplicated company names found in an LEI XML file.

    For every ``lei:LEIRecord`` the legal name is cut at the first ``/``,
    NFKC-normalized, and kept only when it passes the fund filter and matches
    one of the company prefix/suffix markers. A summary line with the number
    of names kept and the header's ``lei:RecordCount`` is printed.

    Args:
        filepath: Path of the XML file to parse.

    Returns:
        A sorted list of unique company names; empty when nothing matches.
    """
    # Read bytes so the XML parser applies the encoding declared by the
    # document rather than the platform default, and close the file promptly.
    with open(filepath, 'rb') as fp:
        od = xmltodict.parse(fp.read())

    lei_data = od['lei:LEIData']
    totcount = int(lei_data['lei:LEIHeader']['lei:RecordCount'])
    records = (lei_data['lei:LEIRecords'] or {}).get('lei:LEIRecord', [])
    # xmltodict yields a dict instead of a list when there is a single child.
    if isinstance(records, dict):
        records = [records]

    names = set()
    for r in records:
        legal_name = r['lei:Entity']['lei:LegalName']
        # An element without attributes is parsed to a plain string.
        text = legal_name['#text'] if isinstance(legal_name, dict) else legal_name
        text_stem = unicodedata.normalize('NFKC', text.split('/')[0])
        if _is_company_name(text_stem):
            names.add(text_stem)

    companies_with_lei = sorted(names)
    n_companies = len(companies_with_lei)
    print('#COMPANIES: {} / {}'.format(n_companies, totcount))
    return companies_with_lei
# Fetch today's LEI file; parse it only when the download produced a file.
filepath = download_leidata()
if filepath is not None:
    companies_with_lei = extract_lei_companies(filepath)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment