Skip to content

Instantly share code, notes, and snippets.

@sagorbrur
Created January 16, 2021 03:59
Show Gist options
  • Save sagorbrur/b6bb7d8192829efa494707d0718a9279 to your computer and use it in GitHub Desktop.
Save sagorbrur/b6bb7d8192829efa494707d0718a9279 to your computer and use it in GitHub Desktop.
import re
import glob
import json
from tqdm import tqdm
# Matches one markup tag such as ``<doc id="1" ...>`` or ``</doc>``.  Compiled
# once at import time instead of on every call.  ``.`` does not match a
# newline here, so a tag is only recognised when it sits on a single line.
_TAG_PATTERN = re.compile(r'<.*?>')


def cleanhtml(raw_html):
    """Return ``raw_html`` with every single-line ``<...>`` tag removed.

    Only the tags themselves are dropped; the text between them (including
    newlines) is kept unchanged.  A ``<`` with no closing ``>`` on the same
    line is left as-is.

    Args:
        raw_html: Text that may contain markup tags.

    Returns:
        The text with all matched tags replaced by the empty string.
    """
    return _TAG_PATTERN.sub('', raw_html)
def process_wiki_file(file):
    """Read one extracted wiki text file and split it into raw document chunks.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with the pieces of the file's text separated by ``</doc>``,
        in file order.  Opening tags are left in place, and the last element
        is whatever follows the final ``</doc>`` (for an empty file the list
        is ``['']``).
    """
    # Decode explicitly as UTF-8: the default encoding of open() depends on
    # the platform locale, and non-ASCII article text would otherwise be
    # mis-decoded or raise on systems whose default is not UTF-8.
    # NOTE(review): assumes the extractor wrote its output as UTF-8 - confirm.
    with open(file, encoding='utf-8') as f:
        text = f.read()
    return text.split('</doc>')
def main(files, min_lines=5):
    """Build the corpus dictionary from a collection of extracted wiki files.

    Each file is split into documents, markup tags are stripped, and empty
    lines are dropped.  A document is kept only when it still has at least
    ``min_lines`` non-empty lines; its first line becomes the title and the
    remaining lines, joined with newlines, become the body.

    Args:
        files: Iterable of file paths to process.
        min_lines: Minimum number of non-empty lines a document needs in
            order to be kept.  Defaults to 5.  Values below 1 are treated as
            1, since a document needs at least a title line.

    Returns:
        A dict with an ``'info'`` entry (fixed metadata) and an
        ``'articles'`` list of ``{'article_title', 'article_body'}`` dicts,
        in processing order.
    """
    wiki_corpus = {
        'info': {
            'download_date': "15/01/2021",
            'process_date': "15/01/2021",
            'processed_by': 'Sagor Sarker',
        },
        'articles': [],
    }
    articles = wiki_corpus['articles']
    # A document with no lines has no title, so never accept fewer than one.
    required = max(min_lines, 1)

    for file in tqdm(files):
        for doc in process_wiki_file(file):
            lines = [x for x in cleanhtml(doc).split("\n") if x]
            # Skip stubs and the leftover text after the last closing tag.
            if len(lines) < required:
                continue
            articles.append({
                'article_title': lines[0],
                'article_body': "\n".join(lines[1:]),
            })
    return wiki_corpus
if __name__ == "__main__":
    # glob returns paths in arbitrary order; sort them so the article order
    # in the output file is reproducible between runs.
    files = sorted(glob.glob('/content/bn_wikiextractor/text/*/*'))
    wiki_corpus = main(files)
    # ensure_ascii=False writes non-ASCII characters verbatim, so the output
    # file must be opened as UTF-8 explicitly instead of relying on the
    # platform default encoding.
    with open('wiki_corpus.json', 'w', encoding='utf-8') as fp:
        json.dump(wiki_corpus, fp, ensure_ascii=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment