Skip to content

Instantly share code, notes, and snippets.

@joohee
Last active December 27, 2015 18:19
Show Gist options
  • Select an option

  • Save joohee/7369230 to your computer and use it in GitHub Desktop.

Select an option

Save joohee/7369230 to your computer and use it in GitHub Desktop.
1. download 2. xml -> csv 변환 및 prdcode값만 따로 추출 - 정렬을 위해 prdcode값을 맨 앞으로 붙입니다. 3. diff 후 파일 저장 - 어제 있었는데 오늘 없는 파일은 삭제 대상이므로 deleted-%Y-%m-%d.txt 파일을 생성합니다. (샘플은 일단 삭제 ^_^)
#!/usr/bin/env python
# coding: utf8
import urllib
import urllib2
import requests
from datetime import date, datetime
## urllib: creates the file first, then downloads and writes as it goes
## urllib2: starts writing the file only after the download has finished
# Feed URL fetched by both download() and download2().
url = 'http://www.poombuy.com/sk_styletag/sk_stt_all.xml'
def download():
    """Fetch the feed at the module-level ``url`` with ``urllib.urlretrieve``.

    The data is saved to ``poombuy-YYYYMMDD.xml`` (today's date) using a
    relative path.
    """
    print('download poombuy')
    stamp = date.today().strftime('%Y%m%d')
    target = 'poombuy-' + stamp + '.xml'
    urllib.urlretrieve(url, target)
def download2():
    """Fetch the feed at the module-level ``url`` with ``urllib2.urlopen``.

    The whole response body is read into memory first and only then written
    to ``poombuy-YYYYMMDD.xml`` (today's date, relative path).
    """
    print('download poombuy2')
    filename = 'poombuy-' + datetime.now().strftime('%Y%m%d') + '.xml'
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    with open(filename, 'wb') as out:
        out.write(data)
if __name__ == '__main__':
    import timeit

    # Run each download variant exactly once and print what timeit reports.
    urlretrieve_time = timeit.timeit(stmt="download()", setup="from __main__ import download", number=1)
    print (urlretrieve_time)
    urlopen_time = timeit.timeit(stmt="download2()", setup="from __main__ import download2", number=1)
    print (urlopen_time)
#!/usr/bin/env python
# coding: utf8
from lxml import etree
import sys
from diff import diff
def _format_product(prd):
    """Flatten one ``product`` element into a ``(csv_line, prdcode)`` pair.

    Every child's text is appended to the line behind a comma; unicode text
    is wrapped in double quotes and missing text becomes an empty field.
    When the ``prdcode`` child is met, its text is additionally placed at
    the very front of the line so that lines sort by product code.
    """
    product = ""
    prdcode = ""
    for child in prd:
        if child.tag == 'prdcode':
            # An empty <prdcode/> has text None; treat it as an empty code
            # so the later .encode() call does not blow up.
            prdcode = child.text or ""
            # Put the code at the very front of the line.
            product = "%s%s%s" % (prdcode, ",", product)
        try:
            if isinstance(child.text, unicode):
                text = "%s%s%s" % ("\"", child.text, "\"")
            elif child.text is None:
                text = ''
            else:
                text = child.text
            product = "%s%s%s" % (product, ',', text)
        except UnicodeError:
            # Mixing byte and unicode strings can fail to decode; keep the
            # column position by emitting an empty field instead.
            product = "%s%s" % (product, ',')
    return product, prdcode


def read(args):
    """Convert a product XML feed to CSV and optionally diff it.

    Args:
        args: argv-style list. ``args[1]`` is the input XML path,
            ``args[2]`` the output CSV path, and the optional ``args[3]``
            a file to compare the new CSV against via ``diff``.

    Writes the sorted product lines to ``args[2]`` and the sorted, unique
    product codes to ``args[2] + '.filtered'``.

    Raises:
        Whatever ``etree.parse`` raised, after the error has been printed.
    """
    print(sys.stdout.encoding)
    infile = args[1]
    outfile = args[2]
    yesterdayfile = args[3] if len(args) > 3 else None

    error_count = 0
    try:
        input_xml = etree.parse(infile)
    except Exception as e:
        error_count += 1
        # Only lxml parse errors carry an error_log; I/O errors do not.
        error_log = getattr(e, 'error_log', None)
        if error_log is not None:
            print(error_log.filter_from_level(etree.ErrorLevels.FATAL))
        else:
            print(e)
        print("error_count %d" % error_count)
        # Nothing can be converted without a parsed tree, so surface the
        # real failure instead of tripping over a placeholder later.
        raise
    print("error_count %d" % error_count)

    products = []
    prdcodes = []
    for prd in input_xml.getroot().findall('products/product'):
        product, prdcode = _format_product(prd)
        products.append(product)
        prdcodes.append(prdcode)

    # sorted() returns a new list; the results must be kept to take effect.
    products.sort()
    prdcodes = sorted(set(prdcodes))

    with open(outfile, 'w') as f:
        f.writelines("%s\n" % item.encode('utf-8') for item in products)
    with open(outfile + '.filtered', 'w') as f_filtered:
        f_filtered.writelines("%s\n" % item.encode('utf-8') for item in prdcodes)

    if yesterdayfile is not None:
        diff(outfile, yesterdayfile)
if __name__ == '__main__':
    print(sys.argv)
    if len(sys.argv) < 3:
        print("usage: python xml_to_csv.sh [input] [output] [yesterday file]?")
    else:
        import timeit
        # The timed statement uses ``sys``, so the setup imports it
        # explicitly rather than relying on the timeit module happening to
        # have ``sys`` in its own globals.
        print (timeit.timeit(stmt="read(sys.argv)", setup="import sys; from __main__ import read", number=1))
#!/usr/bin/env python
# coding: utf-8
import sys
import difflib
import re
def diff(file1, file2):
    """Write rows that are in ``file1`` but not in ``file2`` to a dated file.

    Both files are read line by line and compared with
    ``difflib.unified_diff``. Every removed line (``-`` prefix) whose
    content starts with a digit is written, without the prefix, to
    ``deleted-YYYY-MM-DD.txt`` (today's date, relative path).

    Args:
        file1: Path of the "from" file.
        file2: Path of the "to" file.

    NOTE(review): a ``-`` line is one that exists in file1 but not in
    file2; confirm callers pass the files in the order that makes these
    the rows to delete.
    """
    # Imported here because this script's import block never brings in date.
    from datetime import date

    print('%s %s' % (file1, file2))
    # 'U' (universal newlines) is the Python 2 spelling; Python 3 text mode
    # already translates newlines and rejects 'U' from 3.11 on.
    mode = 'U' if sys.version_info[0] < 3 else 'r'
    with open(file1, mode) as f1:
        fromlines = f1.readlines()
    with open(file2, mode) as f2:
        tolines = f2.readlines()

    filename = 'deleted-' + str(date.today()) + '.txt'
    print('write diff to  %s' % filename)
    with open(filename, 'w') as deletefile:
        # unified_diff expects file *names* for its header lines.
        for line in difflib.unified_diff(fromlines, tolines, file1, file2):
            # The '---' header never matches: its second character is '-'.
            if re.match(r'-\d', line):
                deletefile.write(line[1:])
if __name__ == '__main__':
    # Two paths are required after the script name.
    if len(sys.argv) >= 3:
        diff(sys.argv[1], sys.argv[2])
    else:
        print("usage: python diff.py input output")
#!/usr/bin/env python
#coding: utf8
from boto.s3.connection import S3Connection
from boto.s3.key import Key
import timeit
from datetime import date
import os
import sys
import optparse
import progressbar
import time
# AWS credentials passed to S3Connection by the script entry point; left empty here.
aws_key = ''
aws_secret_key = ''
# Progress bar of the upload in flight: assigned by s3(), updated by progress_callback().
pbar = None
def sizeof_fmt(num):
    """Format a byte count as a string with one decimal and a unit suffix.

    The value is divided by 1024 until it drops below 1024, stepping
    through bytes, KB, MB, GB and TB. Anything of 1024 TB or more is
    reported in PB instead of falling off the loop and returning ``None``.
    """
    for unit in ('bytes', 'KB', 'MB', 'GB', 'TB'):
        if num < 1024.0:
            return "%3.1f%s" % (num, unit)
        num /= 1024.0
    # Five divisions have happened by now, so num is expressed in PB.
    return "%3.1f%s" % (num, 'PB')
def progress_callback(current, total):
    """Move the module-level progress bar to ``current``.

    ``total`` is accepted but not used. An ``AssertionError`` raised by the
    bar's ``update`` is printed rather than propagated.
    """
    try:
        pbar.update(current)
    except AssertionError as err:
        print(err)
def s3(bucket, filename, reduced_redundancy=False):
    """Upload a local file to ``bucket`` under the key ``filename``.

    A progress bar is stored in the module-level ``pbar`` so that
    ``progress_callback`` can update it during the transfer.

    Args:
        bucket: Bucket object handed to ``Key``.
        filename: Path of the local file; also used verbatim as the key name.
        reduced_redundancy: Forwarded to ``set_contents_from_filename``.

    Returns:
        The file size in bytes on success, or 0 when the file cannot be
        stat'ed, is empty, or the upload raises ``IOError``.
    """
    global pbar
    try:
        size = os.stat(filename).st_size
    except OSError as e:
        # A missing or unreadable file is reported like the other failures
        # in this function (print and return 0) instead of raising.
        print(e)
        return 0
    if size == 0:
        print('Bad filesize for "%s"' % (filename))
        return 0

    k = Key(bucket)
    k.key = filename

    # unicode() refuses to decode a value that is already unicode, so only
    # byte strings go through the lossy decode.
    if isinstance(filename, unicode):
        label = filename
    else:
        label = unicode(filename, errors='ignore')
    widgets = [
        label.encode('utf-8'), ' ',
        progressbar.FileTransferSpeed(),
        ' <<<', progressbar.Bar(), '>>> ',
        progressbar.Percentage(), ' ', progressbar.ETA()
    ]
    pbar = progressbar.ProgressBar(widgets=widgets, maxval=size)
    pbar.start()
    try:
        k.set_contents_from_filename(
            filename,
            cb=progress_callback,
            num_cb=100,
            reduced_redundancy=reduced_redundancy,
        )
    except IOError as e:
        print(e)
        return 0
    pbar.finish()
    return size
if __name__ == '__main__':
    # Connect, then push today's dated XML file to the bucket.
    conn = S3Connection(aws_key, aws_secret_key)
    print(conn)
    stamp = date.today().strftime('%Y%m%d')
    filename = 'poombuy-' + stamp + '.xml'
    bucket = conn.get_bucket('styletag-batch-dev')
    s3(bucket, filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment