ljos · January 2, 2016 19:19
diff --git a/atekst.py b/atekst.py
 #! /usr/bin/env python3

 import codecs
 import csv
 import re
 import sys
 import time

 def parse_date(s):
    """Return the first 'DD.MM.YYYY'-shaped substring of *s*, or 'NA'.

    Only the digit layout is checked (two digits, a dot, two digits, a
    dot, four digits); the value is not validated as a calendar date.
    """
    found = re.search(r'\d{2}\.\d{2}\.\d{4}', s)
    return found.group() if found is not None else 'NA'

 def parse_meta(raw_meta):
    """Extract ``(date, paper, author)`` from a raw meta-data chunk.

    The chunk is split into lines and leading lines are skipped until one
    containing a date (as recognised by ``parse_date``) is found. The
    paper name is whatever precedes the first ',' on that line. The line
    after it is taken to be the author unless it starts with 'Seksjon:',
    'Side' or 'Publisert', or starts with an ISO-style timestamp.

    Returns:
        A ``(date, paper, author)`` tuple of strings; ``paper`` and
        ``author`` are 'NA' when they cannot be determined.

    Raises:
        IndexError: If no line contains a date, or if the date line is
            the last line of the chunk. ``parse`` catches IndexError and
            treats it as "this chunk is not the meta-data".
    """
    lines = raw_meta.split('\n')

    # Find the first line with a date. We hope that the date is always on
    # the same line as the newspaper name.
    for position, line in enumerate(lines):
        date = parse_date(line)
        # Compare by value: identity checks against string literals only
        # work by accident of interning.
        if date != 'NA':
            break
    else:
        # Running off the end of the lines has always surfaced as an
        # IndexError here; raise it explicitly instead of by accident.
        raise IndexError("There was no date in the meta-data.")

    # The paper name should be at the same line as the date, before
    # the ','.
    paper = line.split(',')[0].strip() or 'NA'

    # Deliberately unguarded: a date on the very last line raises
    # IndexError, which callers use to reject the chunk.
    next_line = lines[position + 1]

    # If we see 'Seksjon:', 'Side' or 'Publisert' in the next line
    # there is no author, but otherwise we assume that the next line
    # is name of the author(s).
    author = 'NA'
    if not re.match(r'^Seksjon:.*|^Side.*|^Publisert.*|\d{4}-\d\d-\d\dT', next_line, re.I):
        author = next_line

    return (date, paper, author)

 # To look for nav, we define a regex that ignores case and looks for
 # the string 'nav' with a word boundary on each side.
 nav = r'(?i)\bnav\b'

 def parse(filename):
    """Append articles that mention NAV only in their body to a CSV file.

    The file is read as UTF-8 and split into articles at lines of 20 or
    more '='. Within each article the first line of 20 or more '-'
    separates the headline from the rest, which is split at blank lines
    into meta-data, lead and body paragraphs. For every article whose
    body matches the module-level ``nav`` pattern while its lead and
    headline do not, one tab-separated row (mention count, paper, date,
    author, headline, lead, text) is appended to ``filename + '.csv'``.

    Chunks without a headline separator and articles in which no
    meta-data can be located are reported on stdout and skipped.
    """
    with codecs.open(filename, 'r', 'utf-8') as f:
        # Split the file at every ===+ which denotes new article.
        articles = re.split(r'={20,}\n', f.read())

    for raw_article in articles:
        # Split every raw article at the first ---+. What is above is the
        # title. maxsplit keeps a later dash line in the body from
        # breaking the two-way unpacking below.
        parts = re.split(r'-{20,}\n', raw_article, maxsplit=1)
        if len(parts) != 2:
            # No separator at all, e.g. an empty remainder after a
            # trailing ===+ line. Only report chunks that have content.
            if raw_article.strip():
                print('Could not find a headline in: %s' % raw_article.strip()[:60])
            continue
        headline, raw_article = parts

        # Because the first split will leave a large header containing
        # some information for the document we split at all triple
        # lines and take the last part, which should be the title.
        headline = re.sub(r'\n+', ' ', headline.split('\n\n\n')[-1].strip())

        # We split the article at double newline and assume that the
        # meta-data is before and not after the article body. However,
        # sometimes there can be a long string before the meta-data, so
        # leading chunks are dropped until one parses as meta-data.
        chunks = raw_article.split('\n\n')
        while chunks:
            try:
                raw_meta, lead, *article = chunks
                date, paper, author = parse_meta(raw_meta)
                break
            except (IndexError, ValueError):
                # IndexError: parse_meta rejected the chunk.
                # ValueError: fewer than two chunks are left to unpack.
                chunks = chunks[1:]

        if not chunks:
            # Every chunk was rejected, so there was no meta-data that
            # we could find.
            print('Could not find meta data in: %s' % headline)
            continue

        # Collapse the newlines inside each body paragraph and join the
        # paragraphs with ' ' so that we get one long string.
        text = ' '.join([re.sub(r'\n+', ' ', line).strip() for line in article])

        # We are only interested in when NAV is mentioned in the text,
        # but not in either the lead or the headline.
        mentions = re.findall(nav, text)
        if mentions and not re.search(nav, lead) and not re.search(nav, headline):
            with codecs.open(filename+'.csv', 'a', 'utf-8') as csv_out:
                writer = csv.writer(csv_out, dialect='excel-tab')
                writer.writerow([len(mentions), paper, date, author, headline, lead, text])

 if __name__ == '__main__':
    # Every command-line argument is treated as an input file.
    for path in sys.argv[1:]:
        parse(path)
	#! /usr/bin/env python3

	import codecs
	import csv
	import re
	import sys
	import time

	def parse_date(s):
	    """Return the first 'DD.MM.YYYY'-shaped substring of *s*, or 'NA'.

	    Only the digit layout is checked (two digits, a dot, two digits, a
	    dot, four digits); the value is not validated as a calendar date.
	    """
	    # We assume that the date is always in the format '10.01.2014'.
	    match = re.search(r'\d{2}\.\d{2}\.\d{4}', s)
	    if match is None:
	        return 'NA'
	    return match.group()

	def parse_meta(raw_meta):
	    """Extract ``(date, paper, author)`` from a raw meta-data chunk.

	    The chunk is split into lines and leading lines are skipped until
	    one containing a date (as recognised by ``parse_date``) is found.
	    The paper name is whatever precedes the first ',' on that line. The
	    line after it is taken to be the author unless it starts with
	    'Seksjon:', 'Side' or 'Publisert', or starts with an ISO-style
	    timestamp.

	    Returns:
	        A ``(date, paper, author)`` tuple of strings; ``paper`` and
	        ``author`` are 'NA' when they cannot be determined.

	    Raises:
	        IndexError: If no line contains a date, or if the date line is
	            the last line of the chunk.
	    """
	    lines = raw_meta.split('\n')

	    # Find the first line with a date. We hope that the date is always
	    # on the same line as the newspaper name.
	    for position, line in enumerate(lines):
	        date = parse_date(line)
	        # Compare by value, not identity, against the 'NA' marker.
	        if date != 'NA':
	            break
	    else:
	        # Running off the end of the lines used to surface as an
	        # IndexError from indexing an empty list; raise it explicitly.
	        raise IndexError("There was no date in the meta-data.")

	    # The paper name should be at the same line as the date, before
	    # the ','.
	    paper = line.split(',')[0].strip() or 'NA'

	    # Deliberately unguarded: a date on the very last line raises
	    # IndexError, as indexing the following line always did.
	    next_line = lines[position + 1]

	    # If we see 'Seksjon:', 'Side' or 'Publisert' in the next line
	    # there is no author, but otherwise we assume that the next line
	    # is name of the author(s). The alternation uses plain '|'; an
	    # escaped '\|' would only match a literal pipe character.
	    author = 'NA'
	    if not re.match(r'^Seksjon:.*|^Side.*|^Publisert.*|\d{4}-\d\d-\d\dT', next_line, re.I):
	        author = next_line

	    return (date, paper, author)

	# To look for nav, we define a regex that ignores case and looks for
	# the string 'nav' with a word boundary on each side.
	nav = r'(?i)\bnav\b'

	def parse(filename):
	    """Append articles that mention NAV only in their body to a CSV file.

	    The file is read as UTF-8 and split into articles at lines of 20 or
	    more '='. Within each article the first line of 20 or more '-'
	    separates the headline from the rest, which is split at blank lines
	    into meta-data, lead and body paragraphs. For every article whose
	    body matches the module-level ``nav`` pattern while its lead and
	    headline do not, one tab-separated row (mention count, paper, date,
	    author, headline, lead, text) is appended to ``filename + '.csv'``.

	    Chunks without a headline separator and articles in which no
	    meta-data can be located are reported on stdout and skipped.
	    """
	    with codecs.open(filename, 'r', 'utf-8') as f:
	        # Split the file at every ===+ which denotes new article.
	        articles = re.split(r'={20,}\n', f.read())

	    for raw_article in articles:
	        # Split every raw article at the first ---+. What is above is
	        # the title. maxsplit keeps a later dash line in the body from
	        # breaking the two-way unpacking below.
	        parts = re.split(r'-{20,}\n', raw_article, maxsplit=1)
	        if len(parts) != 2:
	            # No separator at all, e.g. an empty remainder after a
	            # trailing ===+ line. Only report chunks that have content.
	            if raw_article.strip():
	                print('Could not find a headline in: %s' % raw_article.strip()[:60])
	            continue
	        headline, raw_article = parts

	        # Because the first split will leave a large header containing
	        # some information for the document we split at all triple
	        # lines and take the last part, which should be the title.
	        headline = re.sub(r'\n+', ' ', headline.split('\n\n\n')[-1].strip())

	        # We split the article at double newline and assume that the
	        # meta-data is before and not after the article body. However,
	        # sometimes there can be a long string before the meta-data, so
	        # leading chunks are dropped until one parses as meta-data.
	        chunks = raw_article.split('\n\n')
	        while chunks:
	            try:
	                raw_meta, lead, *article = chunks
	                date, paper, author = parse_meta(raw_meta)
	                break
	            except (IndexError, ValueError):
	                # IndexError: parse_meta rejected the chunk.
	                # ValueError: fewer than two chunks are left to unpack.
	                chunks = chunks[1:]

	        if not chunks:
	            # Every chunk was rejected, so there was no meta-data that
	            # we could find.
	            print('Could not find meta data in: %s' % headline)
	            continue

	        # Collapse the newlines inside each body paragraph and join the
	        # paragraphs with ' ' so that we get one long string.
	        text = ' '.join([re.sub(r'\n+', ' ', line).strip() for line in article])

	        # We are only interested in when NAV is mentioned in the text,
	        # but not in either the lead or the headline.
	        mentions = re.findall(nav, text)
	        if mentions and not re.search(nav, lead) and not re.search(nav, headline):
	            with codecs.open(filename+'.csv', 'a', 'utf-8') as csv_out:
	                writer = csv.writer(csv_out, dialect='excel-tab')
	                writer.writerow([len(mentions), paper, date, author, headline, lead, text])

	if __name__ == '__main__':
	    # Every command-line argument is treated as an input file.
	    for arg in sys.argv[1:]:
	        parse(arg)