kzinmr · September 18, 2020 08:18
diff --git a/companies_with_lei.py b/companies_with_lei.py
 %%time

 import xmltodict
 import unicodedata
 import requests
 import datetime 

 def download_leidata():
    """Download today's JPX LEI full file into the current working directory.

    The file stem is built from the current local date as
    ``JPX-LEI-JPXFullFile_v2_YYYYMMDD`` and is used both as the query
    parameter of the download URL and as the local file name (``.xml``).

    Returns:
        The relative path of the saved XML file, or ``None`` when the
        server response is not OK (``response.ok`` is false).
    """
    dt = datetime.datetime.today().strftime('%Y%m%d')
    filestem = f'JPX-LEI-JPXFullFile_v2_{dt}'
    filepath = filestem + '.xml'
    url = f'https://www.lei.jpx.co.jp/lei/en/full_file.html?FileIDLatest2={filestem}&RRFullHistory=&RRExceptionHistory='
    # NOTE(review): no timeout is passed, so a stalled connection blocks indefinitely.
    response = requests.get(url)
    if not response.ok:
        return None
    # Save the payload byte-for-byte. Decoding it and writing in text mode
    # would re-encode with the platform's default encoding and translate
    # newlines, so the file on disk could differ from what was served.
    with open(filepath, "wb") as fp:
        fp.write(response.content)
    return filepath


 def extract_lei_companies(filepath):
    """Return the sorted, de-duplicated company names found in a LEI XML file.

    For every ``lei:LEIRecord`` the legal name is cut at the first ``/`` and
    NFKC-normalized. A name is kept when it does not mention a fund
    (``ファンド``, ``fund`` in any case, ``為替ヘッジ``) and when one of the
    corporate-form markers appears in its last five characters (``suffix``)
    or its first five characters (``prefix``).

    Args:
        filepath: Path of the XML file to parse; it is read as UTF-8.

    Returns:
        A sorted list of the unique names that passed the filter. The list
        is empty when nothing matches.

    Side effects:
        Prints ``#COMPANIES: <kept> / <RecordCount from the file header>``.
    """
    # Read inside a context manager so the handle is closed, with an explicit
    # encoding so the result does not depend on the locale.
    with open(filepath, encoding='utf-8') as fp:
        od = xmltodict.parse(fp.read())
    totcount = int(od['lei:LEIData']['lei:LEIHeader']['lei:RecordCount'])

    records = od['lei:LEIData']['lei:LEIRecords']['lei:LEIRecord']
    # xmltodict returns a single mapping instead of a list when the element
    # occurs only once; iterating it directly would walk its keys.
    if isinstance(records, dict):
        records = [records]

    suffix = ['株式会社', '合同会社', '有限会社','目的会社','相互会社','投資法人','信用金庫','中央金庫','連合会','ltd','mited','inc',' llc',]
    prefix = ['株式会社', '合同会社', '有限会社','医療法人', '一般社団', '一般財団', '公益財団', '独立行政', '学校法人', '国立大学',]
    names = set()
    for r in records:
        legal_name = r['lei:Entity']['lei:LegalName']
        # An element without attributes is parsed to a plain string rather
        # than a mapping carrying '#text'.
        text = legal_name['#text'] if isinstance(legal_name, dict) else legal_name
        text_stem = unicodedata.normalize('NFKC', text.split('/')[0])
        if 'ファンド' in text_stem or 'fund' in text_stem.lower() or '為替ヘッジ' in text_stem:
            continue
        tail = text_stem[-5:].lower()
        head = text_stem[:5].lower()
        if any(s in tail for s in suffix) or any(p in head for p in prefix):
            names.add(text_stem)
    # A set plus sorted() gives the unique, ordered names without needing
    # pandas, and also works when no record matched.
    companies_with_lei = sorted(names)
    n_companies = len(companies_with_lei)
    print('#COMPANIES: {} / {}'.format(n_companies, totcount))
    return companies_with_lei

 # Fetch today's LEI file, then extract the company names from it.
 # download_leidata() returns None when the HTTP response is not OK; in that
 # case companies_with_lei is never assigned.
 filepath = download_leidata()
 if filepath is not None:
    companies_with_lei = extract_lei_companies(filepath)
	%%time

	import xmltodict
	import unicodedata
	import requests
	import datetime

	def download_leidata():
	    """Download today's JPX LEI full file into the current working directory.

	    The file stem is built from the current local date as
	    ``JPX-LEI-JPXFullFile_v2_YYYYMMDD`` and is used both as the query
	    parameter of the download URL and as the local file name (``.xml``).

	    Returns:
	        The relative path of the saved XML file, or ``None`` when the
	        server response is not OK (``response.ok`` is false).
	    """
	    dt = datetime.datetime.today().strftime('%Y%m%d')
	    filestem = f'JPX-LEI-JPXFullFile_v2_{dt}'
	    filepath = filestem + '.xml'
	    url = f'https://www.lei.jpx.co.jp/lei/en/full_file.html?FileIDLatest2={filestem}&RRFullHistory=&RRExceptionHistory='
	    # NOTE(review): no timeout is passed, so a stalled connection blocks indefinitely.
	    response = requests.get(url)
	    if not response.ok:
	        return None
	    # Save the payload byte-for-byte. Decoding it and writing in text mode
	    # would re-encode with the platform's default encoding and translate
	    # newlines, so the file on disk could differ from what was served.
	    with open(filepath, "wb") as fp:
	        fp.write(response.content)
	    return filepath


	def extract_lei_companies(filepath):
	    """Return the sorted, de-duplicated company names found in a LEI XML file.

	    For every ``lei:LEIRecord`` the legal name is cut at the first ``/`` and
	    NFKC-normalized. A name is kept when it does not mention a fund
	    (``ファンド``, ``fund`` in any case, ``為替ヘッジ``) and when one of the
	    corporate-form markers appears in its last five characters (``suffix``)
	    or its first five characters (``prefix``).

	    Args:
	        filepath: Path of the XML file to parse; it is read as UTF-8.

	    Returns:
	        A sorted list of the unique names that passed the filter. The list
	        is empty when nothing matches.

	    Side effects:
	        Prints ``#COMPANIES: <kept> / <RecordCount from the file header>``.
	    """
	    # Read inside a context manager so the handle is closed, with an explicit
	    # encoding so the result does not depend on the locale.
	    with open(filepath, encoding='utf-8') as fp:
	        od = xmltodict.parse(fp.read())
	    totcount = int(od['lei:LEIData']['lei:LEIHeader']['lei:RecordCount'])

	    records = od['lei:LEIData']['lei:LEIRecords']['lei:LEIRecord']
	    # xmltodict returns a single mapping instead of a list when the element
	    # occurs only once; iterating it directly would walk its keys.
	    if isinstance(records, dict):
	        records = [records]

	    suffix = ['株式会社', '合同会社', '有限会社','目的会社','相互会社','投資法人','信用金庫','中央金庫','連合会','ltd','mited','inc',' llc',]
	    prefix = ['株式会社', '合同会社', '有限会社','医療法人', '一般社団', '一般財団', '公益財団', '独立行政', '学校法人', '国立大学',]
	    names = set()
	    for r in records:
	        legal_name = r['lei:Entity']['lei:LegalName']
	        # An element without attributes is parsed to a plain string rather
	        # than a mapping carrying '#text'.
	        text = legal_name['#text'] if isinstance(legal_name, dict) else legal_name
	        text_stem = unicodedata.normalize('NFKC', text.split('/')[0])
	        if 'ファンド' in text_stem or 'fund' in text_stem.lower() or '為替ヘッジ' in text_stem:
	            continue
	        tail = text_stem[-5:].lower()
	        head = text_stem[:5].lower()
	        if any(s in tail for s in suffix) or any(p in head for p in prefix):
	            names.add(text_stem)
	    # A set plus sorted() gives the unique, ordered names without needing
	    # pandas, and also works when no record matched.
	    companies_with_lei = sorted(names)
	    n_companies = len(companies_with_lei)
	    print('#COMPANIES: {} / {}'.format(n_companies, totcount))
	    return companies_with_lei

	# Fetch today's LEI file, then extract the company names from it.
	# companies_with_lei is only assigned when download_leidata() returned a path.
	filepath = download_leidata()
	if filepath is not None:
	    # The body must be nested under the `if`; without the indentation the
	    # statement is a syntax error.
	    companies_with_lei = extract_lei_companies(filepath)