Skip to content

Instantly share code, notes, and snippets.

@cuevasclemente
Created June 26, 2017 21:47
Show Gist options
  • Save cuevasclemente/14bcac0662325808ea9e36fb868c98eb to your computer and use it in GitHub Desktop.
Save cuevasclemente/14bcac0662325808ea9e36fb868c98eb to your computer and use it in GitHub Desktop.
Parse Wikipedia XML Extracted Files
import argparse
from os import path
def parse_wiki_file(output_location, filename):
    """Write each article of an extracted Wikipedia file to its own file.

    The input text is split on ``</doc>`` and every resulting chunk is
    treated as one article.  The line layout assumed for a chunk is
    (presumably WikiExtractor-style output -- confirm against whatever
    produced the file):

    * first chunk: line 0 is the ``<doc ...>`` header, line 1 the title;
    * later chunks: the title is on line 2, because each of them starts
      with the remainder of the line that held the previous ``</doc>``.

    In both cases the article body is everything from line 3 onwards.

    Args:
        output_location: Directory that receives one file per article,
            named after the article title.  It is not created here, so
            it must already exist.
        filename: Path of the extracted file to split.

    Raises:
        OSError: If the input file cannot be read or an article file
            cannot be written.
    """
    with open(filename) as source:
        txt = source.read()

    chunks = [chunk.split("\n") for chunk in txt.split("</doc>")]
    for index, lines in enumerate(chunks):
        # The first chunk has no leading empty line, so its title sits one
        # line earlier than in every later chunk.
        title_at = 1 if index == 0 else 2
        if len(lines) <= title_at:
            # Too short to hold a title: empty input or the tail that
            # follows the final "</doc>".
            continue

        title = lines[title_at]
        # A path separator inside a title (e.g. "AC/DC") would make
        # path.join() point into a sub-directory, or outside
        # output_location altogether, so neutralise it.
        for separator in (path.sep, path.altsep):
            if separator:
                title = title.replace(separator, "_")
        if not title:
            # An empty name would resolve to output_location itself.
            continue

        body = "\n".join(lines[3:])
        with open(path.join(output_location, title), "w") as target:
            target.write(body)
if __name__ == '__main__':
    # Command-line entry point: every file named on the command line is
    # split into per-article files inside the chosen output directory.
    cli = argparse.ArgumentParser(
        description=(
            "Write the articles in a wikipedia "
            "parsed file where all the text is in a "
            "file named after the article title"
        ),
    )
    cli.add_argument(
        "files",
        nargs="+",
        help="The files to run the parsing procedure on",
    )
    cli.add_argument(
        "--output_location",
        default="./",
        help="The directory to output articles to",
    )
    options = cli.parse_args()

    # Each input file is processed independently, in the order given.
    for wiki_file in options.files:
        parse_wiki_file(options.output_location, wiki_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment