svartalf · May 10, 2012 23:35
diff --git a/pr_sort.py b/pr_sort.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

 """Page Rank CSV sorter

 Usage:
    pr_sort.py /path/to/csv-file.csv [/path/to/output-file.csv]

    The output filename is optional; if it is omitted, output goes to stdout.
 """

 import sys
 import os.path
 import urlparse

 def parse(input, output):
    """Deduplicate www/non-www hosts from a Page Rank CSV and emit the result.

    Each non-blank line of ``input`` must look like ``<url>,<rank>``.  Only
    the host part (netloc) of the URL is kept.  When a host occurs both with
    and without the ``www.`` prefix, the variant with the higher rank is
    kept; on a tie the ``www.`` variant wins.  Rows are emitted as
    ``http://<host>,<rank>`` in dict iteration order (they are not sorted).

    Args:
        input: Path of the CSV file to read.  (The name shadows the builtin
            but is kept so existing keyword callers keep working.)
        output: Path of the CSV file to write, or a false value (``None``,
            ``''``) to print the rows to stdout instead.

    Raises:
        ValueError: If a non-blank line has no comma or its rank is not an
            integer.
        IOError: If a file cannot be opened (``OSError`` on Python 3).
    """
    # Local import with a fallback so the function runs on Python 2 and 3.
    try:
        from urlparse import urlsplit
    except ImportError:
        from urllib.parse import urlsplit

    urls = {}

    # ``with`` closes the input even when a malformed line raises.
    with open(input) as source:
        for line in source:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. an empty line at end of file).
                continue
            # Split on the last comma only: the URL itself may contain commas.
            url, rank = line.rsplit(',', 1)
            urls[urlsplit(url).netloc] = int(rank)

    results = {}

    for url in urls:
        # Work out both spellings of the host.
        if url.startswith('www.'):
            www_url, non_www_url = url, url[4:]
        else:
            www_url, non_www_url = 'www.%s' % url, url

        www_rank = urls.get(www_url)
        non_www_rank = urls.get(non_www_url)

        # At least one rank is set because ``url`` itself is a key.  None is
        # tested explicitly: ordering None against an int only works on
        # Python 2.  ``>=`` keeps the "www wins on a tie" rule.
        if non_www_rank is None or (
                www_rank is not None and www_rank >= non_www_rank):
            results[www_url] = www_rank
        else:
            results[non_www_url] = non_www_rank

    rows = ['http://%s,%s' % (url, rank) for url, rank in results.items()]

    if output:
        # ``with`` guarantees the file is flushed and closed on any exit.
        with open(output, 'w') as result_file:
            for row in rows:
                result_file.write(row + '\n')
    else:
        for row in rows:
            print(row)

 if __name__ == '__main__':
    # Command-line entry point: check the arguments, then hand over to parse().

    # The input path is mandatory; without it show the usage line and stop.
    if not len(sys.argv) >= 2:
        print('Usage: %s /path/to/csv-file.csv [/path/to/output-file.csv]' % sys.argv[0])
        sys.exit(-1)

    # Refuse to continue when the input path does not exist.
    if not os.path.exists(sys.argv[1]):
        print('File `%s` doesnt exists' % sys.argv[1])
        sys.exit(-2)

    # The output path is optional; None is passed on when it is absent.
    output = sys.argv[2] if len(sys.argv) > 2 else None

    parse(sys.argv[1], output)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	"""Page Rank CSV sorter

	Usage:
	pr_sort.py /path/to/csv-file.csv [/path/to/output-file.csv]

	The output filename is optional; if it is omitted, output goes to stdout.
	"""

	import sys
	import os.path
	import urlparse

	def parse(input, output):
	    """Deduplicate www/non-www hosts from a Page Rank CSV and emit the result.

	    Each non-blank line of ``input`` must look like ``<url>,<rank>``.  Only
	    the host part (netloc) of the URL is kept.  When a host occurs both
	    with and without the ``www.`` prefix, the variant with the higher rank
	    is kept; on a tie the ``www.`` variant wins.  Rows are emitted as
	    ``http://<host>,<rank>`` in dict iteration order (they are not sorted).

	    Args:
	        input: Path of the CSV file to read.  (The name shadows the builtin
	            but is kept so existing keyword callers keep working.)
	        output: Path of the CSV file to write, or a false value (``None``,
	            ``''``) to print the rows to stdout instead.

	    Raises:
	        ValueError: If a non-blank line has no comma or its rank is not an
	            integer.
	        IOError: If a file cannot be opened (``OSError`` on Python 3).
	    """
	    # Local import with a fallback so the function runs on Python 2 and 3.
	    try:
	        from urlparse import urlsplit
	    except ImportError:
	        from urllib.parse import urlsplit

	    urls = {}

	    # ``with`` closes the input even when a malformed line raises.
	    with open(input) as source:
	        for line in source:
	            line = line.strip()
	            if not line:
	                # Tolerate blank lines (e.g. an empty line at end of file).
	                continue
	            # Split on the last comma only: the URL may contain commas.
	            url, rank = line.rsplit(',', 1)
	            urls[urlsplit(url).netloc] = int(rank)

	    results = {}

	    for url in urls:
	        # Work out both spellings of the host.
	        if url.startswith('www.'):
	            www_url, non_www_url = url, url[4:]
	        else:
	            www_url, non_www_url = 'www.%s' % url, url

	        www_rank = urls.get(www_url)
	        non_www_rank = urls.get(non_www_url)

	        # At least one rank is set because ``url`` itself is a key.  None
	        # is tested explicitly: ordering None against an int only works on
	        # Python 2.  ``>=`` keeps the "www wins on a tie" rule.
	        if non_www_rank is None or (
	                www_rank is not None and www_rank >= non_www_rank):
	            results[www_url] = www_rank
	        else:
	            results[non_www_url] = non_www_rank

	    rows = ['http://%s,%s' % (url, rank) for url, rank in results.items()]

	    if output:
	        # ``with`` guarantees the file is flushed and closed on any exit.
	        with open(output, 'w') as result_file:
	            for row in rows:
	                result_file.write(row + '\n')
	    else:
	        for row in rows:
	            print(row)

	if __name__ == '__main__':
	    # Command-line entry point: check the arguments, then call parse().

	    # The input path is mandatory; without it show the usage line and stop.
	    if len(sys.argv) < 2:
	        print('Usage: %s /path/to/csv-file.csv [/path/to/output-file.csv]' % sys.argv[0])
	        sys.exit(-1)

	    # Refuse to continue when the input path does not exist.
	    if not os.path.exists(sys.argv[1]):
	        print('File `%s` doesnt exists' % sys.argv[1])
	        sys.exit(-2)

	    # The output path is optional; None is passed on when it is absent.
	    try:
	        output = sys.argv[2]
	    except IndexError:
	        output = None

	    parse(sys.argv[1], output)