Last active
December 27, 2015 18:19
-
-
Save joohee/7369230 to your computer and use it in GitHub Desktop.
1. download 2. xml -> csv 변환 및 prdcode값만 따로 추출 - 정렬을 위해 prdcode값을 맨 앞으로 붙입니다. 3. diff 후 파일 저장 - 어제 있었는데 오늘 없는 파일은 삭제 대상이므로 deleted-%Y-%m-%d.txt 파일을 생성합니다. 샘플은 일단 삭제 ^_^)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # coding: utf8 | |
| import urllib | |
| import urllib2 | |
| import requests | |
| from datetime import date, datetime | |
## urllib: creates the target file first, then downloads and writes into it.
## urllib2: downloads the whole body first, then starts writing the file.
url = 'http://www.poombuy.com/sk_styletag/sk_stt_all.xml'


def download():
    """Fetch ``url`` with ``urllib.urlretrieve`` into a date-stamped XML file.

    The file is named ``poombuy-YYYYMMDD.xml`` (today's date) and is created
    in the current working directory.
    """
    print('download poombuy')
    stamp = date.today().strftime('%Y%m%d')
    target = 'poombuy-' + stamp + '.xml'
    urllib.urlretrieve(url, target)
def download2():
    """Fetch ``url`` with ``urllib2`` and write it to a date-stamped XML file.

    The whole response body is read into memory and then written in binary
    mode to ``poombuy-YYYYMMDD.xml`` in the current working directory.
    """
    # Local import so this block does not depend on a file-level import.
    from contextlib import closing

    print('download poombuy2')
    filename = 'poombuy-' + datetime.now().strftime('%Y%m%d') + '.xml'
    # closing() guarantees the HTTP response is closed even if read() fails,
    # without relying on the response object being a context manager itself.
    with closing(urllib2.urlopen(url)) as response:
        data = response.read()
    with open(filename, 'wb') as out:
        out.write(data)
if __name__ == '__main__':
    import timeit

    # Time a single run of each download strategy, in the same order as before.
    for fn_name in ('download', 'download2'):
        elapsed = timeit.timeit(
            stmt="%s()" % fn_name,
            setup="from __main__ import %s" % fn_name,
            number=1,
        )
        print(elapsed)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # coding: utf8 | |
| from lxml import etree | |
| import sys | |
| from diff import diff | |
def _format_product(prd):
    """Return ``(row, prdcode)`` for one ``product`` element.

    Every child contributes one ``,text`` field to the row, in document order.
    Text that is a ``unicode`` object is wrapped in double quotes, and missing
    text becomes an empty field. When the ``prdcode`` child is met, its value
    is also placed in front of the row built so far, so rows can be sorted and
    diffed by prdcode.
    """
    row = ""
    prdcode = ""
    for child in prd:
        text = child.text
        if child.tag == 'prdcode':
            # Put the prdcode at the very front of the row.
            prdcode = text or ''
            row = "%s,%s" % (prdcode, row)
        if text is None:
            field = ''
        elif isinstance(text, unicode):
            field = '"%s"' % text
        else:
            field = text
        try:
            row = "%s,%s" % (row, field)
        except UnicodeError:
            # Text that cannot be combined with the row becomes an empty field.
            row = "%s," % row
    return row, prdcode


def read(args):
    """Convert a product XML feed into a sorted CSV file plus a prdcode list.

    Args:
        args: argv-style sequence. ``args[1]`` is the input XML path,
            ``args[2]`` the output path and the optional ``args[3]`` the
            previous day's output file.

    One row per ``products/product`` element is written, sorted, to
    ``args[2]``. The sorted, de-duplicated prdcode values are written to
    ``args[2] + '.filtered'``. Both files are UTF-8 encoded. When ``args[3]``
    is given, ``diff`` is called to record the rows that were present
    yesterday but are missing today.

    Raises:
        Any exception raised by ``etree.parse``; it is re-raised after being
        reported, because nothing can be converted without a parsed tree.
    """
    print(sys.stdout.encoding)
    infile = args[1]
    outfile = args[2]
    yesterdayfile = args[3] if len(args) > 3 else None

    try:
        input_xml = etree.parse(infile)
    except Exception as e:
        # Only lxml parse errors carry an error_log; other failures
        # (e.g. an unreadable file) are printed as-is.
        error_log = getattr(e, 'error_log', None)
        if error_log is not None:
            print(error_log.filter_from_level(etree.ErrorLevels.FATAL))
        else:
            print(e)
        print("error_count 1")
        raise
    print("error_count 0")

    products = []
    prdcodes = []
    for prd in input_xml.getroot().findall('products/product'):
        product, prdcode = _format_product(prd)
        products.append(product)
        prdcodes.append(prdcode)

    # sorted() returns a new list, so the result has to be kept; otherwise the
    # output order follows the feed and the day-to-day diff is unstable.
    products.sort()
    prdcodes = sorted(set(prdcodes))

    with open(outfile, 'w') as f:
        f.writelines("%s\n" % item.encode('utf-8') for item in products)
    with open(outfile + '.filtered', 'w') as f_filtered:
        f_filtered.writelines(
            "%s\n" % item.encode('utf-8') for item in prdcodes)

    if yesterdayfile is not None:
        # diff() records lines of its first file that are missing from its
        # second one, so yesterday's file goes first to get the deleted rows.
        diff(yesterdayfile, outfile)
if __name__ == '__main__':
    print(sys.argv)
    if len(sys.argv) >= 3:
        import timeit

        # Run the conversion once and report how long it took.
        elapsed = timeit.timeit(
            stmt="read(sys.argv)",
            setup="from __main__ import read",
            number=1,
        )
        print(elapsed)
    else:
        print("usage: python xml_to_csv.sh [input] [output] [yesterday file]?")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| import sys | |
| import difflib | |
| import re | |
def diff(file1, file2):
    """Record the lines of ``file1`` that are missing from ``file2``.

    A unified diff from ``file1`` to ``file2`` is computed. Every removed line
    whose content starts with a digit is written, without the leading ``-``,
    to ``deleted-YYYY-MM-DD.txt`` (today's date) in the current directory.

    Args:
        file1: path of the "from" file.
        file2: path of the "to" file.

    NOTE(review): the digit check presumably relies on every data row starting
    with a numeric prdcode -- confirm against the feed.
    """
    # Local imports: ``date`` was used here without being imported anywhere in
    # this script, and io.open gives the same text handling on Python 2 and 3.
    import io
    from datetime import date

    print("%s %s" % (file1, file2))
    # Text mode with the default newline handling normalises \r\n and \r to
    # \n, as the former 'U' mode did. UTF-8 matches what the CSV writer emits.
    with io.open(file1, 'r', encoding='utf-8') as f1:
        fromlines = f1.readlines()
    with io.open(file2, 'r', encoding='utf-8') as f2:
        tolines = f2.readlines()

    filename = 'deleted-' + str(date.today()) + '.txt'
    print('write diff to  %s' % filename)

    # A removed line looks like "-<digit>..."; the "---" header never matches.
    removed = re.compile(r'-\d')
    with io.open(filename, 'w', encoding='utf-8') as deletefile:
        # The 3rd/4th arguments are header labels: pass the file names, not
        # the file objects.
        for line in difflib.unified_diff(fromlines, tolines, file1, file2):
            if removed.match(line):
                deletefile.write(line[1:])
if __name__ == '__main__':
    # Both file paths are required on the command line.
    if len(sys.argv) >= 3:
        diff(sys.argv[1], sys.argv[2])
    else:
        print("usage: python diff.py input output")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| #coding: utf8 | |
| from boto.s3.connection import S3Connection | |
| from boto.s3.key import Key | |
| import timeit | |
| from datetime import date | |
| import os | |
| import sys | |
| import optparse | |
| import progressbar | |
| import time | |
# AWS credentials handed to S3Connection in the __main__ block; left empty here.
aws_key = ''
aws_secret_key = ''
# Progress bar for the upload in flight; set by s3(), read by progress_callback().
pbar = None


def sizeof_fmt(num):
    """Format a byte count as a human-readable string such as ``'1.5KB'``.

    Args:
        num: size in bytes (int or float).

    Returns:
        The value scaled by powers of 1024 with one decimal place, followed by
        one of ``bytes``, ``KB``, ``MB``, ``GB`` or ``TB``. Sizes of 1024 TB
        and above are still expressed in TB instead of returning ``None``.
    """
    units = ('bytes', 'KB', 'MB', 'GB', 'TB')
    value = float(num)
    for unit in units[:-1]:
        if value < 1024.0:
            return "%3.1f%s" % (value, unit)
        value /= 1024.0
    # The largest unit absorbs everything that is left.
    return "%3.1f%s" % (value, units[-1])
def progress_callback(current, total):
    """Advance the module-level progress bar to ``current``.

    ``total`` is accepted but not used. An ``AssertionError`` raised by the
    progress bar is printed rather than propagated.
    """
    try:
        pbar.update(current)
    except AssertionError as err:
        print(err)
def s3(bucket, filename, reduced_redundancy=False):
    """Upload a local file to ``bucket`` while showing a progress bar.

    The key name is the same as ``filename``.

    Args:
        bucket: bucket object handed to ``Key``.
        filename: path of the local file to upload; also used as the key name.
        reduced_redundancy: forwarded to ``set_contents_from_filename``.

    Returns:
        The file size in bytes after a successful upload, or 0 when the file
        is empty or the upload raises ``IOError``.
    """
    global pbar

    key_obj = Key(bucket)
    key_obj.key = filename

    nbytes = os.stat(filename).st_size
    if not nbytes:
        print('Bad filesize for "%s"' % (filename))
        return 0

    # The file name labels the bar; undecodable bytes in it are dropped.
    label = unicode(filename, errors='ignore').encode('utf-8')
    widgets = [
        label, ' ',
        progressbar.FileTransferSpeed(),
        ' <<<', progressbar.Bar(), '>>> ',
        progressbar.Percentage(), ' ', progressbar.ETA(),
    ]
    pbar = progressbar.ProgressBar(widgets=widgets, maxval=nbytes)
    pbar.start()

    try:
        key_obj.set_contents_from_filename(
            filename,
            cb=progress_callback,
            num_cb=100,
            reduced_redundancy=reduced_redundancy,
        )
    except IOError as err:
        print(err)
        return 0
    else:
        pbar.finish()
        return nbytes
if __name__ == '__main__':
    # Connect, then upload today's downloaded feed to the dev batch bucket.
    conn = S3Connection(aws_key, aws_secret_key)
    print(conn)
    filename = 'poombuy-%s.xml' % date.today().strftime('%Y%m%d')
    bucket = conn.get_bucket('styletag-batch-dev')
    s3(bucket, filename)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment