nandajavarma · August 29, 2015 13:57
diff --git a/parsecsv.py b/parsecsv.py
 # -*- coding: utf-8 -*-
 import urllib2
 import re
 import csv
 from bs4 import BeautifulSoup
 import sys
 from sys import argv

 def getlinks(csvfile):
    """Collect the second column of every non-empty row of *csvfile*.

    Spaces in each cell are replaced with underscores and the result is
    split on newlines, so every entry of the returned list is itself a
    list of strings.  No row is skipped other than empty ones, so a header
    row (if any) is included in the result.

    Raises IndexError if a non-empty row has fewer than two columns.
    """
    # 'rb' is the mode the Python 2 csv module expects for its input file.
    with open(csvfile, 'rb') as handle:
        return [
            record[1].replace(' ', '_').split('\n')
            for record in csv.reader(handle)
            if record
        ]

 def create_csv(counts):
    """Write *counts* to ``csvfile.csv`` as a single-column CSV.

    The file is created (or overwritten) relative to the current working
    directory.  Its first line is the header ``character_count`` and each
    following line holds one value from *counts*.  Returns None.
    """
    with open("csvfile.csv", "w") as sink:
        out = csv.writer(sink, lineterminator='\n')
        out.writerow(["character_count"])
        # One single-cell row per count.
        out.writerows([count] for count in counts)

 def getcontent(links):
    """Return one character count per page listed in *links*.

    links -- list in the shape produced by getlinks(): each entry is a
             list whose first element is a page title.  The first entry
             is treated as the header row and skipped.

    For every remaining title the page is fetched from ml.wikisource.org,
    the markup of its <div class="pagetext"> element is stripped of tags
    and spaces, and the length of what is left is recorded.  A page without
    such an element contributes a count of 0.

    The caller's list is left untouched, and an empty list yields [].
    Errors raised by urllib2.urlopen propagate to the caller.
    """
    # Imported locally so this block needs no extra file-level import.
    from contextlib import closing

    urlhead = 'http://ml.wikisource.org/wiki/'
    tag_pattern = re.compile('<[^>]*>')
    counts = []
    # Slice instead of pop(0): skips the header without mutating the
    # argument and without raising on an empty list.
    for title in links[1:]:
        url = urlhead + title[0]
        print("counting characters in " + url)
        # urllib2 responses are not context managers on Python 2, so use
        # closing() to release the connection once the page is parsed.
        with closing(urllib2.urlopen(url)) as page:
            soup = BeautifulSoup(page)
        pagetext = soup.find("div", {"class": "pagetext"})
        if pagetext is None:
            # str(None) would otherwise be counted as the 4 characters "None".
            counts.append(0)
            continue
        text_parts = str(pagetext).decode("UTF-8")
        text = tag_pattern.sub('', text_parts).replace(' ', '')
        counts.append(len(text))
    return counts

 def getoutput():
    """Join thal_creation.csv and csvfile.csv row by row.

    Row i of ``charactercount_output.csv`` is row i of
    ``thal_creation.csv`` followed by the columns of row i of
    ``csvfile.csv``.  Output stops at the end of the shorter input.
    All three paths are relative to the current working directory.
    Returns None.
    """
    # 'rb'/'wb' are the modes the Python 2 csv module expects; the with
    # block guarantees the output is flushed and every file is closed.
    with open('thal_creation.csv', 'rb') as source, \
            open('csvfile.csv', 'rb') as extra, \
            open('charactercount_output.csv', 'wb') as target:
        readerop = csv.reader(source)
        reader1 = csv.reader(extra)
        writer1 = csv.writer(target)
        for row in readerop:
            try:
                row1 = next(reader1)
            except StopIteration:
                # No more counts to attach; remaining rows would be
                # dropped anyway, so stop reading.
                break
            # Must be writer1: the name `writer` is not defined in this
            # function and raised NameError on the first row.
            writer1.writerow(row + row1)

 if __name__=='__main__':
    # Entry point: python parsecsv.py <csvfilename>
    if len(argv) < 2:
        # A string argument makes sys.exit() print it to stderr and exit
        # with status 1, so a usage error is not reported as success.
        sys.exit("USAGE: python parsecsv.py <csvfilename>")
    # Pipeline: titles from the CSV -> per-page character counts ->
    # csvfile.csv -> merged charactercount_output.csv.
    links = getlinks(argv[1])
    counts = getcontent(links)
    create_csv(counts)
    getoutput()
	# -*- coding: utf-8 -*-
	import urllib2
	import re
	import csv
	from bs4 import BeautifulSoup
	import sys
	from sys import argv

	def getlinks(csvfile):
	    """Collect the second column of every non-empty row of *csvfile*.

	    Spaces in each cell are replaced with underscores and the result is
	    split on newlines, so every entry of the returned list is itself a
	    list of strings.  No row is skipped other than empty ones, so a
	    header row (if any) is included in the result.

	    Raises IndexError if a non-empty row has fewer than two columns.
	    """
	    # 'rb' is the mode the Python 2 csv module expects for its input file.
	    with open(csvfile, 'rb') as handle:
	        return [
	            record[1].replace(' ', '_').split('\n')
	            for record in csv.reader(handle)
	            if record
	        ]

	def create_csv(counts):
	    """Write *counts* to ``csvfile.csv`` as a single-column CSV.

	    The file is created (or overwritten) relative to the current working
	    directory.  Its first line is the header ``character_count`` and each
	    following line holds one value from *counts*.  Returns None.
	    """
	    with open("csvfile.csv", "w") as sink:
	        out = csv.writer(sink, lineterminator='\n')
	        out.writerow(["character_count"])
	        # One single-cell row per count.
	        out.writerows([count] for count in counts)

	def getcontent(links):
	    """Return one character count per page listed in *links*.

	    links -- list in the shape produced by getlinks(): each entry is a
	             list whose first element is a page title.  The first entry
	             is treated as the header row and skipped.

	    For every remaining title the page is fetched from ml.wikisource.org,
	    the markup of its <div class="pagetext"> element is stripped of tags
	    and spaces, and the length of what is left is recorded.  A page
	    without such an element contributes a count of 0.

	    The caller's list is left untouched, and an empty list yields [].
	    Errors raised by urllib2.urlopen propagate to the caller.
	    """
	    # Imported locally so this block needs no extra file-level import.
	    from contextlib import closing

	    urlhead = 'http://ml.wikisource.org/wiki/'
	    tag_pattern = re.compile('<[^>]*>')
	    counts = []
	    # Slice instead of pop(0): skips the header without mutating the
	    # argument and without raising on an empty list.
	    for title in links[1:]:
	        url = urlhead + title[0]
	        print("counting characters in " + url)
	        # urllib2 responses are not context managers on Python 2, so use
	        # closing() to release the connection once the page is parsed.
	        with closing(urllib2.urlopen(url)) as page:
	            soup = BeautifulSoup(page)
	        pagetext = soup.find("div", {"class": "pagetext"})
	        if pagetext is None:
	            # str(None) would otherwise be counted as the 4 characters "None".
	            counts.append(0)
	            continue
	        text_parts = str(pagetext).decode("UTF-8")
	        text = tag_pattern.sub('', text_parts).replace(' ', '')
	        counts.append(len(text))
	    return counts

	def getoutput():
	    """Join thal_creation.csv and csvfile.csv row by row.

	    Row i of ``charactercount_output.csv`` is row i of
	    ``thal_creation.csv`` followed by the columns of row i of
	    ``csvfile.csv``.  Output stops at the end of the shorter input.
	    All three paths are relative to the current working directory.
	    Returns None.
	    """
	    # 'rb'/'wb' are the modes the Python 2 csv module expects; the with
	    # block guarantees the output is flushed and every file is closed.
	    with open('thal_creation.csv', 'rb') as source, \
	            open('csvfile.csv', 'rb') as extra, \
	            open('charactercount_output.csv', 'wb') as target:
	        readerop = csv.reader(source)
	        reader1 = csv.reader(extra)
	        writer1 = csv.writer(target)
	        for row in readerop:
	            try:
	                row1 = next(reader1)
	            except StopIteration:
	                # No more counts to attach; remaining rows would be
	                # dropped anyway, so stop reading.
	                break
	            # Must be writer1: the name `writer` is not defined in this
	            # function and raised NameError on the first row.
	            writer1.writerow(row + row1)

	if __name__=='__main__':
	    # Entry point: python parsecsv.py <csvfilename>
	    if len(argv) < 2:
	        # A string argument makes sys.exit() print it to stderr and exit
	        # with status 1, so a usage error is not reported as success.
	        sys.exit("USAGE: python parsecsv.py <csvfilename>")
	    # Pipeline: titles from the CSV -> per-page character counts ->
	    # csvfile.csv -> merged charactercount_output.csv.
	    links = getlinks(argv[1])
	    counts = getcontent(links)
	    create_csv(counts)
	    getoutput()