Last active
September 25, 2021 09:01
-
-
Save miladfa7/af6ba713ca29435d8810319133a17ba3 to your computer and use it in GitHub Desktop.
Download Wikipedia articles with python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tensorflow as tf | |
from gensim.corpora import WikiCorpus | |
import os | |
import argparse | |
# Example: lang = 'fa' selects the Farsi (Persian) Wikipedia.
def store(corpus, lang):
    """Write every article of *corpus* to its own UTF-8 text file.

    Files are created as ``<cwd>/<lang>_corpus/article_<n>.txt`` with ``n``
    starting at 1. Each file holds the article's tokens joined by single
    spaces. Existing files with the same name are overwritten.

    Args:
        corpus: Object whose ``get_texts()`` yields one iterable of string
            tokens per article.
        lang: Language code used to name the output directory.
    """
    store_path = os.path.join(os.getcwd(), '{}_corpus'.format(lang))
    # exist_ok replaces the check-then-create sequence, so there is no race
    # window and re-running into an existing directory is safe.
    os.makedirs(store_path, exist_ok=True)
    for file_idx, text in enumerate(corpus.get_texts(), start=1):
        current_file_path = os.path.join(store_path, 'article_{}.txt'.format(file_idx))
        with open(current_file_path, 'w', encoding='utf-8') as file:
            # The joined tokens are already a str; the file handle performs
            # the UTF-8 encoding, so no manual encode/decode is needed.
            file.write(' '.join(text))
def tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list:
    """Split *text* on whitespace and keep tokens within the length bounds.

    The signature matches the ``tokenizer_func`` hook this file passes to
    ``WikiCorpus``.

    Args:
        text: Raw article text.
        token_min_len: Minimum token length (inclusive).
        token_max_len: Maximum token length (inclusive).
        lower: When true, lowercase the text before splitting.

    Returns:
        The surviving tokens, in their original order.
    """
    # Previously `lower` was accepted but silently ignored.
    if lower:
        text = text.lower()
    return [token for token in text.split() if token_min_len <= len(token) <= token_max_len]
def run(lang):
    """Download the latest Wikipedia dump for *lang* and store it as text files.

    The ``pages-articles`` archive is fetched from dumps.wikimedia.org,
    parsed with ``WikiCorpus`` using this file's whitespace tokenizer
    (case preserved), and handed to ``store``.

    Args:
        lang: Wikipedia language code used in the dump URL and file name,
            e.g. ``'fa'``.
    """
    origin = 'https://dumps.wikimedia.org/{}wiki/latest/{}wiki-latest-pages-articles.xml.bz2'.format(lang, lang)
    fname = '{}wiki-latest-pages-articles.xml.bz2'.format(lang)
    # NOTE(review): get_file appears to reuse a previously downloaded file; an
    # interrupted download would then surface later as
    # "EOFError: Compressed file ended before the end-of-stream marker was
    # reached". Deleting the cached archive and re-running is the likely
    # remedy - confirm against the Keras documentation.
    file_path = tf.keras.utils.get_file(origin=origin, fname=fname, untar=False, extract=False)

    corpus_kwargs = dict(
        # An explicit (empty) dictionary stops WikiCorpus from scanning the
        # whole dump once just to build a vocabulary that is never used here.
        dictionary={},
        lower=False,
        tokenizer_func=tokenizer_func,
    )
    try:
        # Older gensim releases need lemmatize=False to keep lemmatization off.
        corpus = WikiCorpus(file_path, lemmatize=False, **corpus_kwargs)
    except (NotImplementedError, TypeError):
        # Newer gensim releases reject the `lemmatize` argument outright.
        corpus = WikiCorpus(file_path, **corpus_kwargs)
    store(corpus, lang)
if __name__ == '__main__':
    # Command-line entry point: read the language code and start the download.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--lang',
        type=str,
        default='fa',
        help='language code to download from wikipedia corpus',
    )
    options = cli.parse_args()
    run(lang=options.lang)
# Usage: python3 WikiText_Download.py --lang fa
Hello, this program fails at `raise EOFError("Compressed file ended before the "` with the following error: `EOFError: Compressed file ended before the end-of-stream marker was reached`.
Any suggestions on how to fix it?
Hi,
which version of gensim are you using?
I have version 4.1.2.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello, this program fails at `raise EOFError("Compressed file ended before the "` with the following error: `EOFError: Compressed file ended before the end-of-stream marker was reached`.
Any suggestions on how to fix it?