joohee · December 27, 2015 18:19
diff --git a/1_download.py b/1_download.py
 #!/usr/env/python
 # coding: utf8
 import urllib
 import urllib2
 import requests
 from datetime import date, datetime

 ## urllib: creates the output file first, then downloads and writes into it.
 ## urllib2: starts writing the output file only after the download has been received.
 ## Feed address fetched by both download() and download2().
 url = 'http://www.poombuy.com/sk_styletag/sk_stt_all.xml'
 def download():
 	"""Save the feed at `url` to 'poombuy-<YYYYMMDD>.xml' via urllib.urlretrieve."""
 	print('download poombuy')
 	stamp = date.today().strftime('%Y%m%d')
 	target = 'poombuy-%s.xml' % stamp
 	urllib.urlretrieve(url, target)

 def download2():
 	"""Fetch the feed at `url` with urllib2 and write it to 'poombuy-<YYYYMMDD>.xml'.

 	The whole response is read into memory before the output file is opened,
 	so the output file is not touched when the download itself fails.
 	"""
 	print('download poombuy2')
 	filename = 'poombuy-' + datetime.now().strftime('%Y%m%d') + '.xml'
 	response = urllib2.urlopen(url)
 	try:
 		data = response.read()
 	finally:
 		# Release the HTTP connection even if reading fails.
 		response.close()
 	with open(filename, 'wb') as out:
 		out.write(data)

 if __name__ == '__main__':
 	import timeit
 	# Time one run of each download variant, in the same order as before.
 	for func_name in ('download', 'download2'):
 		elapsed = timeit.timeit(
 			stmt=func_name + "()",
 			setup="from __main__ import " + func_name,
 			number=1,
 		)
 		print(elapsed)

diff --git a/2_xml_to_csv.py b/2_xml_to_csv.py
 #!/usr/env/python
 # coding: utf8

 from lxml import etree
 import sys
 from diff import diff

 def _format_product(prd):
 	"""Return (prdcode, row) for one <product> element, row being its comma-joined fields."""
 	product = ""
 	prdcode = ""
 	for child in prd:
 		if child.tag == 'prdcode':
 			# Put the product code at the very front of the row.
 			product = "%s%s%s" % (child.text, ",", product)
 			prdcode = child.text
 		try:
 			if isinstance(child.text, unicode):
 				text = "%s%s%s" % ("\"", child.text, "\"")
 			elif child.text is None:
 				text = ''
 			else:
 				text = child.text
 			product = "%s%s%s" % (product, ',', text)
 		except Exception:
 			# Best effort: keep the column but leave it empty. The original
 			# built a tuple here ("..." , (...)) instead of formatting a string.
 			product = "%s%s" % (product, ',')
 	return prdcode, product

 def read(args):
 	"""Convert the product XML feed into a sorted, line-per-product text file.

 	args follows the sys.argv layout: args[1] is the input XML path, args[2]
 	the output path, and the optional args[3] is a second file that is handed
 	to diff() together with the new output.

 	Writes one line per product to args[2] and the sorted, de-duplicated
 	product codes to args[2] + '.filtered'. Re-raises the parse error when the
 	input cannot be parsed.
 	"""
 	print(sys.stdout.encoding)
 	error_count = 0
 	infile = args[1]
 	outfile = args[2]
 	yesterdayfile = args[3] if len(args) > 3 else None

 	try:
 		input_xml = etree.parse(infile)
 	except Exception as e:
 		# Not every exception raised here carries an lxml error_log.
 		error_log = getattr(e, 'error_log', None)
 		if error_log is not None:
 			print(error_log.filter_from_level(etree.ErrorLevels.FATAL))
 		else:
 			print(e)
 		error_count += 1
 		print("error_count %s" % error_count)
 		# Without a parsed tree there is nothing to convert; surface the real
 		# error instead of failing later on a placeholder value.
 		raise

 	print("error_count %s" % error_count)
 	prdlist = input_xml.getroot().findall('products/product')

 	products = []
 	prdcodes = []
 	for prd in prdlist:
 		prdcode, product = _format_product(prd)
 		products.append(product)
 		prdcodes.append(prdcode)

 	# sorted() returns a new list; the original discarded it, so nothing was
 	# actually sorted or de-duplicated.
 	products.sort()
 	prdcodes = sorted(set(prdcodes))

 	with open(outfile, 'w') as f:
 		f.writelines("%s\n" % item.encode('utf-8') for item in products)
 	with open(outfile + '.filtered', 'w') as f_filtered:
 		f_filtered.writelines("%s\n" % item.encode('utf-8') for item in prdcodes)

 	if yesterdayfile is not None:
 		diff(outfile, yesterdayfile)

 if __name__ == '__main__':
 	print(sys.argv)
 	if len(sys.argv) < 3:
 		print("usage: python xml_to_csv.sh [input] [output] [yesterday file]?")
 	else:
 		import timeit
 		# The timed statement uses `sys`, so import it in the setup rather than
 		# relying on whatever names timeit's own namespace happens to provide.
 		print(timeit.timeit(
 			stmt="read(sys.argv)",
 			setup="import sys; from __main__ import read",
 			number=1,
 		))
diff --git a/3_diff.py b/3_diff.py
 #!/usr/env/python
 # coding: utf-8

 import sys
 import difflib
 import re

 def diff(file1, file2):
 	"""Write lines that a unified diff marks as removed from file1 to a dated file.

 	Both files are read fully and compared with difflib.unified_diff. Every
 	diff line that starts with '-' followed by a digit is written, without the
 	'-' marker, to 'deleted-<today>.txt' in the current directory.

 	NOTE(review): '-' lines are those present in file1 but absent from file2;
 	confirm the argument order used by callers matches the 'deleted' naming.
 	"""
 	# The imports of 3_diff.py do not bring in `date`, so import it here.
 	from datetime import date

 	print("%s %s" % (file1, file2))

 	with open(file1, 'U') as f1:
 		fromlines = f1.readlines()
 	with open(file2, 'U') as f2:
 		tolines = f2.readlines()

 	filename = 'deleted-' + str(date.today()) + '.txt'
 	print('write diff to  %s' % filename)

 	removed = re.compile(r'-\d', re.M | re.I)
 	with open(filename, 'w') as deletefile:
 		# fromfile/tofile are header labels, so pass the paths rather than the
 		# file objects the original handed in.
 		for line in difflib.unified_diff(fromlines, tolines, file1, file2):
 			if removed.match(line):
 				deletefile.write(line[1:])

 if __name__ == '__main__':
 	# Two positional arguments are required: the two files to compare.
 	if len(sys.argv) >= 3:
 		diff(sys.argv[1], sys.argv[2])
 	else:
 		print("usage: python diff.py input output")
diff --git a/4_upload.py b/4_upload.py
 #!/usr/env/python
 #coding: utf8

 from boto.s3.connection import S3Connection
 from boto.s3.key import Key
 import timeit
 from datetime import date

 import os
 import sys
 import optparse
 import progressbar
 import time

 # Credentials passed to S3Connection in the __main__ block; empty in this source.
 aws_key = ''
 aws_secret_key = ''
 # Progress bar shared by s3(), which creates it, and progress_callback(), which updates it.
 pbar = None

 def sizeof_fmt(num):
     """Format a byte count as a string with one decimal and a unit suffix.

     The value is divided by 1024 until it drops below 1024, stepping through
     bytes, KB, MB, GB and TB. Anything of 1024 TB or more is reported in PB
     instead of falling off the loop and returning None.
     """
     for unit in ('bytes', 'KB', 'MB', 'GB', 'TB'):
         if num < 1024.0:
             return "%3.1f%s" % (num, unit)
         num /= 1024.0
     return "%3.1f%s" % (num, 'PB')

 def progress_callback(current, total):
     """Move the module-level progress bar to `current`; `total` is not used.

     An AssertionError raised by the update is printed rather than propagated.
     """
     try:
         pbar.update(current)
     except AssertionError as err:
         print(err)

 def s3(bucket, filename, reduced_redundancy=False):
 	"""Upload `filename` to `bucket` under the same key name, with a progress bar.

 	Returns the file size in bytes on success, or 0 when the file is empty or
 	the upload raises IOError.
 	"""
 	global pbar

 	k = Key(bucket)
 	k.key = filename

 	size = os.stat(filename).st_size
 	if size == 0:
 		print('Bad filesize for "%s"' % (filename))
 		return 0

 	# unicode(x, errors=...) raises TypeError when x is already unicode, so
 	# only byte strings go through the lossy decode.
 	if isinstance(filename, unicode):
 		label = filename.encode('utf-8')
 	else:
 		label = unicode(filename, errors='ignore').encode('utf-8')

 	widgets = [
 		label, ' ',
 		progressbar.FileTransferSpeed(),
 		' <<<', progressbar.Bar(), '>>> ',
 		progressbar.Percentage(), ' ', progressbar.ETA(),
 	]
 	pbar = progressbar.ProgressBar(widgets=widgets, maxval=size)
 	pbar.start()

 	# The original mixed tabs and spaces across this try/except; it is
 	# re-indented consistently here.
 	try:
 		k.set_contents_from_filename(
 			filename,
 			cb=progress_callback,
 			num_cb=100,
 			reduced_redundancy=reduced_redundancy,
 		)
 	except IOError as e:
 		print(e)
 		return 0

 	pbar.finish()
 	return size

 if __name__ == '__main__':
 	conn = S3Connection(aws_key, aws_secret_key)
 	print(conn)

 	# Upload the file named after today's date.
 	today = date.today().strftime('%Y%m%d')
 	upload_name = 'poombuy-%s.xml' % today
 	target_bucket = conn.get_bucket('styletag-batch-dev')

 	s3(target_bucket, upload_name)
	#!/usr/env/python
	# coding: utf8
	import urllib
	import urllib2
	import requests
	from datetime import date, datetime

	## urllib: creates the output file first, then downloads and writes into it.
	## urllib2: starts writing the output file only after the download has been received.
	## Feed address fetched by both download() and download2().
	url = 'http://www.poombuy.com/sk_styletag/sk_stt_all.xml'
	def download():
		"""Save the feed at `url` to 'poombuy-<YYYYMMDD>.xml' via urllib.urlretrieve."""
		print('download poombuy')
		stamp = date.today().strftime('%Y%m%d')
		target = 'poombuy-%s.xml' % stamp
		urllib.urlretrieve(url, target)

	def download2():
		"""Fetch the feed at `url` with urllib2 and write it to 'poombuy-<YYYYMMDD>.xml'.

		The whole response is read into memory before the output file is opened,
		so the output file is not touched when the download itself fails.
		"""
		print('download poombuy2')
		filename = 'poombuy-' + datetime.now().strftime('%Y%m%d') + '.xml'
		response = urllib2.urlopen(url)
		try:
			data = response.read()
		finally:
			# Release the HTTP connection even if reading fails.
			response.close()
		with open(filename, 'wb') as out:
			out.write(data)

	if __name__ == '__main__':
		import timeit
		# Time one run of each download variant, in the same order as before.
		for func_name in ('download', 'download2'):
			elapsed = timeit.timeit(
				stmt=func_name + "()",
				setup="from __main__ import " + func_name,
				number=1,
			)
			print(elapsed)
	#!/usr/env/python
	# coding: utf8

	from lxml import etree
	import sys
	from diff import diff

	def _format_product(prd):
		"""Return (prdcode, row) for one <product> element, row being its comma-joined fields."""
		product = ""
		prdcode = ""
		for child in prd:
			if child.tag == 'prdcode':
				# Put the product code at the very front of the row.
				product = "%s%s%s" % (child.text, ",", product)
				prdcode = child.text
			try:
				if isinstance(child.text, unicode):
					text = "%s%s%s" % ("\"", child.text, "\"")
				elif child.text is None:
					text = ''
				else:
					text = child.text
				product = "%s%s%s" % (product, ',', text)
			except Exception:
				# Best effort: keep the column but leave it empty. The original
				# built a tuple here ("..." , (...)) instead of formatting a string.
				product = "%s%s" % (product, ',')
		return prdcode, product

	def read(args):
		"""Convert the product XML feed into a sorted, line-per-product text file.

		args follows the sys.argv layout: args[1] is the input XML path, args[2]
		the output path, and the optional args[3] is a second file that is handed
		to diff() together with the new output.

		Writes one line per product to args[2] and the sorted, de-duplicated
		product codes to args[2] + '.filtered'. Re-raises the parse error when the
		input cannot be parsed.
		"""
		print(sys.stdout.encoding)
		error_count = 0
		infile = args[1]
		outfile = args[2]
		yesterdayfile = args[3] if len(args) > 3 else None

		try:
			input_xml = etree.parse(infile)
		except Exception as e:
			# Not every exception raised here carries an lxml error_log.
			error_log = getattr(e, 'error_log', None)
			if error_log is not None:
				print(error_log.filter_from_level(etree.ErrorLevels.FATAL))
			else:
				print(e)
			error_count += 1
			print("error_count %s" % error_count)
			# Without a parsed tree there is nothing to convert; surface the real
			# error instead of failing later on a placeholder value.
			raise

		print("error_count %s" % error_count)
		prdlist = input_xml.getroot().findall('products/product')

		products = []
		prdcodes = []
		for prd in prdlist:
			prdcode, product = _format_product(prd)
			products.append(product)
			prdcodes.append(prdcode)

		# sorted() returns a new list; the original discarded it, so nothing was
		# actually sorted or de-duplicated.
		products.sort()
		prdcodes = sorted(set(prdcodes))

		with open(outfile, 'w') as f:
			f.writelines("%s\n" % item.encode('utf-8') for item in products)
		with open(outfile + '.filtered', 'w') as f_filtered:
			f_filtered.writelines("%s\n" % item.encode('utf-8') for item in prdcodes)

		if yesterdayfile is not None:
			diff(outfile, yesterdayfile)

	if __name__ == '__main__':
		print(sys.argv)
		if len(sys.argv) < 3:
			print("usage: python xml_to_csv.sh [input] [output] [yesterday file]?")
		else:
			import timeit
			# The timed statement uses `sys`, so import it in the setup rather than
			# relying on whatever names timeit's own namespace happens to provide.
			print(timeit.timeit(
				stmt="read(sys.argv)",
				setup="import sys; from __main__ import read",
				number=1,
			))
	#!/usr/env/python
	# coding: utf-8

	import sys
	import difflib
	import re

	def diff(file1, file2):
		"""Write lines that a unified diff marks as removed from file1 to a dated file.

		Both files are read fully and compared with difflib.unified_diff. Every
		diff line that starts with '-' followed by a digit is written, without the
		'-' marker, to 'deleted-<today>.txt' in the current directory.

		NOTE(review): '-' lines are those present in file1 but absent from file2;
		confirm the argument order used by callers matches the 'deleted' naming.
		"""
		# The imports of this script do not bring in `date`, so import it here.
		from datetime import date

		print("%s %s" % (file1, file2))

		with open(file1, 'U') as f1:
			fromlines = f1.readlines()
		with open(file2, 'U') as f2:
			tolines = f2.readlines()

		filename = 'deleted-' + str(date.today()) + '.txt'
		print('write diff to  %s' % filename)

		removed = re.compile(r'-\d', re.M | re.I)
		with open(filename, 'w') as deletefile:
			# fromfile/tofile are header labels, so pass the paths rather than the
			# file objects the original handed in.
			for line in difflib.unified_diff(fromlines, tolines, file1, file2):
				if removed.match(line):
					deletefile.write(line[1:])

	if __name__ == '__main__':
		# Two positional arguments are required: the two files to compare.
		if len(sys.argv) >= 3:
			diff(sys.argv[1], sys.argv[2])
		else:
			print("usage: python diff.py input output")
	#!/usr/env/python
	#coding: utf8

	from boto.s3.connection import S3Connection
	from boto.s3.key import Key
	import timeit
	from datetime import date

	import os
	import sys
	import optparse
	import progressbar
	import time

	# Credentials passed to S3Connection in the __main__ block; empty in this source.
	aws_key = ''
	aws_secret_key = ''
	# Progress bar shared by s3(), which creates it, and progress_callback(), which updates it.
	pbar = None

	def sizeof_fmt(num):
		"""Format a byte count as a string with one decimal and a unit suffix.

		The value is divided by 1024 until it drops below 1024, stepping through
		bytes, KB, MB, GB and TB. Anything of 1024 TB or more is reported in PB
		instead of falling off the loop and returning None.
		"""
		for unit in ('bytes', 'KB', 'MB', 'GB', 'TB'):
			if num < 1024.0:
				return "%3.1f%s" % (num, unit)
			num /= 1024.0
		return "%3.1f%s" % (num, 'PB')

	def progress_callback(current, total):
		"""Move the module-level progress bar to `current`; `total` is not used.

		An AssertionError raised by the update is printed rather than propagated.
		"""
		try:
			pbar.update(current)
		except AssertionError as err:
			print(err)

	def s3(bucket, filename, reduced_redundancy=False):
		"""Upload `filename` to `bucket` under the same key name, with a progress bar.

		Returns the file size in bytes on success, or 0 when the file is empty or
		the upload raises IOError.
		"""
		global pbar

		k = Key(bucket)
		k.key = filename

		size = os.stat(filename).st_size
		if size == 0:
			print('Bad filesize for "%s"' % (filename))
			return 0

		# unicode(x, errors=...) raises TypeError when x is already unicode, so
		# only byte strings go through the lossy decode.
		if isinstance(filename, unicode):
			label = filename.encode('utf-8')
		else:
			label = unicode(filename, errors='ignore').encode('utf-8')

		widgets = [
			label, ' ',
			progressbar.FileTransferSpeed(),
			' <<<', progressbar.Bar(), '>>> ',
			progressbar.Percentage(), ' ', progressbar.ETA(),
		]
		pbar = progressbar.ProgressBar(widgets=widgets, maxval=size)
		pbar.start()

		try:
			k.set_contents_from_filename(
				filename,
				cb=progress_callback,
				num_cb=100,
				reduced_redundancy=reduced_redundancy,
			)
		except IOError as e:
			print(e)
			return 0

		pbar.finish()
		return size

	if __name__ == '__main__':
		conn = S3Connection(aws_key, aws_secret_key)
		print(conn)

		# Upload the file named after today's date.
		today = date.today().strftime('%Y%m%d')
		upload_name = 'poombuy-%s.xml' % today
		target_bucket = conn.get_bucket('styletag-batch-dev')

		s3(target_bucket, upload_name)