Created
September 23, 2018 01:20
-
-
Save WillKoehrsen/8aee0f61b4a44d0e8bfb99c05b5c3137 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def process_article(title, text, timestamp, template='Infobox book'):
    """Extract infobox properties and links from one Wikipedia article.

    The article is only processed when it contains at least one template
    matching ``template``; otherwise nothing is extracted.

    Args:
        title: Article title; passed through unchanged to the result.
        text: Raw wikitext of the article, parsed with ``mwparserfromhell``.
        timestamp: Article timestamp; passed through unchanged to the result.
        template: Value forwarded as the ``matches`` filter of
            ``filter_templates``.
            NOTE(review): mwparserfromhell appears to treat a string filter
            as a regex searched against the template's text, so this is not
            an exact-name comparison -- confirm against the library docs.

    Returns:
        A tuple ``(title, properties, wikilinks, exlinks, timestamp)`` when
        a matching template is found, where ``properties`` maps parameter
        names to values (markup and surrounding whitespace stripped, empty
        values omitted) taken from the first matching template, ``wikilinks``
        lists the titles of the article's internal links and ``exlinks``
        lists the URLs of its external links. Returns ``None`` when no
        template matches.
    """
    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)

    # Search through templates for the template
    matches = wikicode.filter_templates(matches=template)
    if not matches:
        # No matching template: callers receive None, as before.
        return None

    # Extract information from the first matching infobox. The cleaned value
    # is computed once and reused for both the emptiness check and the entry.
    properties = {}
    for param in matches[0].params:
        value = param.value.strip_code().strip()
        if value:
            properties[param.name.strip_code().strip()] = value

    # Extract internal wikilinks (from the whole article, not just the infobox)
    wikilinks = [link.title.strip_code().strip()
                 for link in wikicode.filter_wikilinks()]

    # Extract external links (from the whole article, not just the infobox)
    exlinks = [link.url.strip_code().strip()
               for link in wikicode.filter_external_links()]

    return (title, properties, wikilinks, exlinks, timestamp)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment