Created
March 8, 2017 03:44
-
-
Save bstriner/7062dbefd54bd66955a4aa67f8f0cdc4 to your computer and use it in GitHub Desktop.
Read output from wikiextractor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
import io
import json
import os
class WikiDoc(object):
    """A single extracted document: its URL, body text, id and title.

    Instances are plain value holders; the four constructor arguments are
    stored unchanged as attributes of the same name.
    """

    def __init__(self, url, text, id, title):
        """Store the document fields.

        Args:
            url: Value kept as ``self.url``.
            text: Value kept as ``self.text``.
            id: Value kept as ``self.id``. The name shadows the builtin but
                is part of the public signature, so it is left as is.
            title: Value kept as ``self.title``.
        """
        self.url = url
        self.text = text
        self.id = id
        self.title = title

    def __repr__(self):
        # The body text is deliberately omitted so the repr stays short.
        return "%s(id=%r, title=%r, url=%r)" % (
            type(self).__name__,
            self.id,
            self.title,
            self.url,
        )
class WikiModel(object):
    """Iterate over documents stored as JSON lines under a data directory.

    Files are expected one directory level below ``data_dir`` and named
    ``wiki_*``; each non-blank line of a file is one JSON object with the
    keys ``url``, ``text``, ``id`` and ``title``.
    """

    def __init__(self, data_dir):
        """Remember the root directory that holds the extracted files.

        Args:
            data_dir: Path of the directory whose subdirectories contain
                the ``wiki_*`` files.
        """
        self.data_dir = data_dir

    def files(self):
        """Return the paths of all ``wiki_*`` files, in sorted order.

        ``glob.glob`` is called without ``recursive=True``, so the ``**``
        component matches exactly one directory level, like ``*``.
        Sorting makes the iteration order reproducible across runs.
        """
        pattern = os.path.join(self.data_dir, "**", "wiki_*")
        return sorted(glob.glob(pattern))

    def file_docs(self, path):
        """Yield a ``WikiDoc`` for every JSON line in the file at ``path``.

        Blank or whitespace-only lines are skipped. Malformed JSON and
        missing keys are not handled here; the resulting exceptions
        propagate to the caller.

        Args:
            path: Path of one JSON-lines file.
        """
        # Decode explicitly as UTF-8 rather than the platform default so
        # non-ASCII article text is read the same way everywhere.
        with io.open(path, encoding="utf-8") as f:
            for line in f:
                # Lines keep their trailing newline, so strip before the
                # emptiness check; otherwise a blank line reaches json.loads.
                line = line.strip()
                if not line:
                    continue
                doc = json.loads(line)
                yield WikiDoc(doc["url"], doc["text"], doc["id"], doc["title"])

    def docs(self):
        """Yield every document from every file returned by ``files()``."""
        for path in self.files():
            for doc in self.file_docs(path):
                yield doc
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Run WikiExtractor in JSON mode; it creates several directories of files, each file containing one JSON object per line.
WikiExtractor
This class will iterate through the extracted documents.