Last active
January 2, 2016 19:19
-
-
Save ljos/8349786 to your computer and use it in GitHub Desktop.
Program that transforms the output from atekst into CSV, keeping only the articles that mention NAV in the body text but not in the lead or the headline.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
import codecs | |
import csv | |
import re | |
import sys | |
import time | |
def parse_date(s):
    """Return the first date-shaped substring of *s*, or 'NA' if none.

    A date is recognised purely by shape: two digits, a dot, two digits,
    a dot and four digits, as in '10.01.2014'. No check is made that the
    digits form a real calendar date.
    """
    found = re.search(r'\d{2}\.\d{2}\.\d{4}', s)
    return found.group() if found is not None else 'NA'
def parse_meta(raw_meta):
    """Extract ``(date, paper, author)`` from a block of meta-data text.

    The lines of *raw_meta* are scanned from the top for the first line in
    which ``parse_date`` finds a date. The paper name is whatever precedes
    the first ',' on that line, and the line that follows is taken to be
    the author unless it looks like a section, page or publication line.

    Returns:
        A tuple ``(date, paper, author)`` of strings. ``paper`` is 'NA'
        when the text before the ',' is empty, and ``author`` is 'NA' when
        the following line is not an author line.

    Raises:
        IndexError: if no line contains a date, or if the date is on the
            last line so there is no following line to inspect. The
            ``parse`` function in this file catches IndexError to decide
            that a chunk is not meta-data, so that exception type is kept.
    """
    lines = raw_meta.split('\n')

    # Find the first line carrying a date. We assume the date sits on the
    # same line as the newspaper name.
    for position, line in enumerate(lines):
        date = parse_date(line)
        # Compare by value: identity checks against string literals only
        # work by accident of interning.
        if date != 'NA':
            break
    else:
        raise IndexError("There was no date in the meta-data.")

    # The paper name should be on the date line, before the ','.
    paper = line.split(',')[0].strip() or 'NA'

    # Deliberately unguarded: a missing next line raises IndexError, which
    # signals "this chunk is not usable meta-data" to the caller.
    following = lines[position + 1]

    # If the next line starts with 'Seksjon:', 'Side' or 'Publisert', or
    # with an ISO-style timestamp, there is no author; otherwise the line
    # is assumed to name the author(s).
    author = 'NA'
    if not re.match(r'^Seksjon:.*|^Side.*|^Publisert.*|\d{4}-\d\d-\d\dT', following, re.I):
        author = following
    return (date, paper, author)
# Pattern used to detect mentions of NAV: the inline (?i) flag makes it
# case-insensitive, and the \b word boundaries on both sides keep it from
# matching inside longer words such as 'navn'.
nav = r'(?i)\bnav\b'
def parse(filename):
    """Append the relevant articles of an atekst dump to ``filename + '.csv'``.

    The input file is read as UTF-8 and split into articles at lines of at
    least 20 '=' characters. Within each article the headline is separated
    from the rest by a line of at least 20 '-' characters. The rest is
    split into paragraphs at blank lines: the first paragraph that
    ``parse_meta`` accepts is the meta-data, the next one is the lead, and
    everything after that is the article text.

    An article is written out only when the text mentions NAV while neither
    the lead nor the headline does. Each row is tab-separated and holds the
    number of mentions, paper, date, author, headline, lead and text.

    Articles that cannot be parsed are reported on stdout and skipped. The
    output file is opened in append mode, and only if there is at least one
    row to write.
    """
    with codecs.open(filename, 'r', 'utf-8') as f:
        # Split the file at every ===+ line, which denotes a new article.
        articles = re.split(r'={20,}\n', f.read())

    rows = []
    for raw_article in articles:
        # Split at the first ---+ line; what is above it is the title.
        # maxsplit=1 keeps later dash lines inside the body, so they do not
        # break the two-way unpacking below.
        parts = re.split(r'-{20,}\n', raw_article, maxsplit=1)
        if len(parts) != 2:
            # Typically the empty chunk after the final ===+ line. Only
            # report chunks that actually contain something.
            if raw_article.strip():
                print('Could not find headline separator in: %s'
                      % raw_article.strip()[:60])
            continue
        headline, body = parts

        # The first chunk of the file starts with a large document header.
        # Splitting at triple newlines and taking the last piece leaves
        # the title of the first article.
        headline = re.sub(r'\n+', ' ', headline.split('\n\n\n')[-1].strip())

        # The meta-data is assumed to come before the article body, but
        # there can be stray text or empty paragraphs ahead of it. Drop
        # leading paragraphs until one parses as meta-data.
        paragraphs = body.split('\n\n')
        while paragraphs:
            try:
                raw_meta, lead, *article = paragraphs
                date, paper, author = parse_meta(raw_meta)
                break
            except (IndexError, ValueError):
                # IndexError: parse_meta found no usable meta-data here.
                # ValueError: fewer than two paragraphs are left to unpack,
                # or parse_meta reported a missing date that way.
                paragraphs = paragraphs[1:]
        if not paragraphs:
            # Nothing is left, so no meta-data could be found.
            print('Could not find meta data in: %s' % headline)
            continue

        # Collapse the newlines inside each remaining paragraph and join
        # the paragraphs into one long string.
        text = ' '.join(re.sub(r'\n+', ' ', piece).strip() for piece in article)

        # Keep the article only when NAV is mentioned in the text but in
        # neither the lead nor the headline.
        mentions = re.findall(nav, text)
        if mentions and not re.search(nav, lead) and not re.search(nav, headline):
            rows.append([len(mentions), paper, date, author, headline, lead, text])

    # Open the output once instead of once per article, and only when
    # there is something to write, so no empty file is created.
    if rows:
        with codecs.open(filename + '.csv', 'a', 'utf-8') as csv_out:
            csv.writer(csv_out, dialect='excel-tab').writerows(rows)
if __name__ == '__main__':
    # Every command-line argument is treated as an input file and is
    # converted on its own, in the order given.
    input_paths = sys.argv[1:]
    for input_path in input_paths:
        parse(input_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment