cuevasclemente · June 26, 2017 21:47
diff --git a/parse_wiki_files.py b/parse_wiki_files.py
 import argparse                                      
 from os import path                                  


 def parse_wiki_file(output_location, filename):
     """Write every article found in ``filename`` to its own file.

     The file is read whole and cut at each ``</doc>`` marker.  Every piece
     is split into lines: one line supplies the article title and the lines
     from index 3 onward, re-joined with newlines, form the body.  Each body
     is written to ``output_location/<title>``, replacing any existing file.

     Args:
         output_location: Directory that receives the article files; it is
             not created here.
         filename: Path of the parsed Wikipedia file to split.

     Raises:
         OSError: If ``filename`` cannot be read or an article file cannot
             be written.
     """
     with open(filename) as source:
         text = source.read()
     chunks = [chunk.split("\n") for chunk in text.split("</doc>")]

     for position, lines in enumerate(chunks):
         # The title sits on line 1 of the first chunk and on line 2 of every
         # later one (presumably because the text after a "</doc>" starts with
         # a newline -- confirm against the input format).
         title_line = 1 if position == 0 else 2
         if len(lines) <= title_line:
             continue  # too short to hold a title, e.g. the trailing remainder
         title = lines[title_line]
         # A path separator inside a title would send the write outside
         # output_location (or into a missing sub-directory), so flatten it.
         for separator in (path.sep, path.altsep):
             if separator:
                 title = title.replace(separator, "_")
         if not title:
             continue  # an empty name would make the target the directory itself
         with open(path.join(output_location, title), "w") as target:
             target.write("\n".join(lines[3:]))

 if __name__ == "__main__":
     # Command-line entry point: split every file named on the command line
     # into per-article files inside the chosen output directory.
     cli = argparse.ArgumentParser(
         description=(
             "Write the articles in a wikipedia "
             "parsed file where all the text is in a "
             "file named after the article title"
         )
     )
     cli.add_argument(
         "files",
         nargs="+",
         help="The files to run the parsing procedure on",
     )
     cli.add_argument(
         "--output_location",
         default="./",
         help="The directory to output articles to",
     )
     options = cli.parse_args()
     for wiki_file in options.files:
         parse_wiki_file(options.output_location, wiki_file)
	import argparse
	from os import path


	def parse_wiki_file(output_location, filename):
	    """Write every article found in ``filename`` to its own file.

	    The file is read whole and cut at each ``</doc>`` marker.  Every piece
	    is split into lines: one line supplies the article title and the lines
	    from index 3 onward, re-joined with newlines, form the body.  Each body
	    is written to ``output_location/<title>``, replacing any existing file.

	    Args:
	        output_location: Directory that receives the article files; it is
	            not created here.
	        filename: Path of the parsed Wikipedia file to split.

	    Raises:
	        OSError: If ``filename`` cannot be read or an article file cannot
	            be written.
	    """
	    with open(filename) as source:
	        text = source.read()
	    chunks = [chunk.split("\n") for chunk in text.split("</doc>")]

	    for position, lines in enumerate(chunks):
	        # The title sits on line 1 of the first chunk and on line 2 of every
	        # later one (presumably because the text after a "</doc>" starts with
	        # a newline -- confirm against the input format).
	        title_line = 1 if position == 0 else 2
	        if len(lines) <= title_line:
	            continue  # too short to hold a title, e.g. the trailing remainder
	        title = lines[title_line]
	        # A path separator inside a title would send the write outside
	        # output_location (or into a missing sub-directory), so flatten it.
	        for separator in (path.sep, path.altsep):
	            if separator:
	                title = title.replace(separator, "_")
	        if not title:
	            continue  # an empty name would make the target the directory itself
	        with open(path.join(output_location, title), "w") as target:
	            target.write("\n".join(lines[3:]))

	if __name__ == "__main__":
	    # Command-line entry point: split every file named on the command line
	    # into per-article files inside the chosen output directory.
	    cli = argparse.ArgumentParser(
	        description=(
	            "Write the articles in a wikipedia "
	            "parsed file where all the text is in a "
	            "file named after the article title"
	        )
	    )
	    cli.add_argument(
	        "files",
	        nargs="+",
	        help="The files to run the parsing procedure on",
	    )
	    cli.add_argument(
	        "--output_location",
	        default="./",
	        help="The directory to output articles to",
	    )
	    options = cli.parse_args()
	    for wiki_file in options.files:
	        parse_wiki_file(options.output_location, wiki_file)