Created
June 26, 2017 21:47
-
-
Save cuevasclemente/14bcac0662325808ea9e36fb868c98eb to your computer and use it in GitHub Desktop.
Parse Wikipedia XML Extracted Files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
from os import path | |
def parse_wiki_file(output_location, filename):
    """Split an extracted Wikipedia file into one text file per article.

    The input is expected to consist of ``<doc ...> ... </doc>`` sections in
    which the line right after the opening tag is the article title. Each
    article is written to ``output_location/<title>``.

    Args:
        output_location: Existing directory that receives the article files.
        filename: Path of the extracted file to split.
    """
    with open(filename, encoding="utf-8") as source:
        chunks = source.read().split("</doc>")

    for position, chunk in enumerate(chunks):
        lines = chunk.split("\n")
        # The first chunk starts directly with the opening <doc ...> tag, so
        # its title is on line 1. Every later chunk starts with the newline
        # that followed the previous </doc>, which pushes the title to line 2.
        title_index = 1 if position == 0 else 2
        if len(lines) <= title_index:
            # Too short to hold a title, e.g. the remainder after the last
            # </doc> or a completely empty file.
            continue
        title = lines[title_index]
        if not title:
            # An empty name would resolve to the output directory itself.
            continue
        # A path separator inside a title would otherwise point into a
        # sub-directory that does not exist.
        safe_name = title.replace(path.sep, "_").replace("/", "_")
        target_path = path.join(output_location, safe_name)
        with open(target_path, "w", encoding="utf-8") as target:
            target.write("\n".join(lines[3:]))
if __name__ == "__main__":
    # Command-line entry point: split every given extracted file into
    # per-article files inside the chosen output directory.
    cli = argparse.ArgumentParser(
        description=(
            "Write the articles in a wikipedia "
            "parsed file where all the text is in a "
            "file named after the article title"
        ),
    )
    cli.add_argument(
        "files",
        nargs="+",
        help="The files to run the parsing procedure on",
    )
    cli.add_argument(
        "--output_location",
        default="./",
        help="The directory to output articles to",
    )
    options = cli.parse_args()

    # Process the inputs in the order they were given on the command line.
    for wiki_path in options.files:
        parse_wiki_file(options.output_location, wiki_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment