Created
January 16, 2021 03:59
-
-
Save sagorbrur/b6bb7d8192829efa494707d0718a9279 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import glob | |
import json | |
from tqdm import tqdm | |
def cleanhtml(raw_html):
    """Return *raw_html* with every ``<...>`` tag-like span removed.

    The match is non-greedy and ``.`` does not cross newlines, so a tag
    that is split over several lines is left in place. Text between tags
    is returned untouched.
    """
    return re.sub('<.*?>', '', raw_html)
def process_wiki_file(file):
    """Read one extracted wiki file and split it into raw document chunks.

    Args:
        file: Path of the text file to read.

    Returns:
        The pieces of the file's text separated by the ``</doc>`` closing
        tag. The opening ``<doc ...>`` tag stays in each piece, and the
        text after the last ``</doc>`` (often just a newline) is returned
        as the final element.
    """
    # Decode explicitly as UTF-8: the platform default encoding is not
    # UTF-8 everywhere and would corrupt or reject non-ASCII article text.
    with open(file, encoding="utf-8") as f:
        text = f.read()
    return text.split('</doc>')
def main(files):
    """Assemble a corpus dictionary from the given extracted wiki files.

    Each file is split into documents, tags are stripped, and empty lines
    are dropped. A document is kept only if at least five non-empty lines
    remain; its first line becomes the title and the rest the body.

    Args:
        files: Iterable of file paths to process.

    Returns:
        A dict with an ``'info'`` metadata section and an ``'articles'``
        list of ``{'article_title', 'article_body'}`` dicts.
    """
    corpus = {
        'info': {
            'download_date': "15/01/2021",
            'process_date': "15/01/2021",
            'processed_by': 'Sagor Sarker',
        },
        'articles': [],
    }
    articles = corpus['articles']

    for path in tqdm(files):
        for raw_doc in process_wiki_file(path):
            kept = [ln for ln in cleanhtml(raw_doc).split("\n") if ln]
            # Skip fragments too short to be a real article.
            if len(kept) < 5:
                continue
            heading, *body_lines = kept
            articles.append({
                'article_title': heading,
                'article_body': "\n".join(body_lines),
            })

    return corpus
if __name__ == "__main__":
    # Collect every extracted file under the per-shard subdirectories.
    files = glob.glob('/content/bn_wikiextractor/text/*/*')
    wiki_corpus = main(files)
    # ensure_ascii=False writes non-ASCII characters verbatim, so the output
    # file must be opened as UTF-8; the platform default encoding may be
    # unable to encode them and would raise UnicodeEncodeError.
    with open('wiki_corpus.json', 'w', encoding='utf-8') as fp:
        json.dump(wiki_corpus, fp, ensure_ascii=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment