Created
July 2, 2013 05:27
-
-
Save thisiswei/5906955 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import webapp2 | |
import jinja2 | |
import os | |
import sys | |
import random | |
sys.path.insert(0, 'libs') | |
import re | |
import time | |
import jinja2 | |
from bs4 import BeautifulSoup | |
from google.appengine.ext import db | |
from google.appengine.api import urlfetch | |
from google.appengine.api import memcache | |
# Jinja2 environment: templates are loaded from the ``templates`` directory
# located next to this module, with autoescaping switched on.
template_dir = os.path.join(os.path.dirname(__file__), 'templates')
jinja_env = jinja2.Environment(
    loader=jinja2.FileSystemLoader(template_dir),
    autoescape=True,
)

# Category ids. The request handlers take the id from the URL path; this
# list is not referenced by the rest of the visible code.
categories = [
    'W_APP_DRESSES', 'WOMENS_SHOES', 'W_NEWARRIVALS', 'W_CAMP',
    'M_TOPS', 'MENS_SHOES', 'M_NEWARRIVALS', 'M_PATTERN',
    'A_DEC_BEDDING', 'A_MEDIA_GADGETS', 'A_NEWARRIVALS', 'W_WORKAHOLIC',
    'SALE_W', 'SALE_M', 'SALE_APT',
]

# URL building blocks for the scraped site.
DETAIL_PAGE = 'http://www.urbanoutfitters.com/urban/catalog/productdetail.jsp?id='
HOST = 'http://www.urbanoutfitters.com/'
CATEGORY_URL = 'http://www.urbanoutfitters.com/urban/catalog/category.jsp?id='
# Filled with (product id, color code); an angle letter and one of the two
# suffixes below are appended by compose_img_link.
IMG_URL = 'http://images.urbanoutfitters.com/is/image/UrbanOutfitters/%s_%s_'
DETAIL = '?$detailMain$'
THUMB = '?$detailThumb$'
def timedcall(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` and measure how long the call takes.

    Args:
        fn: the callable to time.
        *args, **kwargs: forwarded to ``fn`` unchanged.

    Returns:
        A ``(elapsed_seconds, result)`` tuple, where ``result`` is whatever
        ``fn`` returned.
    """
    # time.clock() reports CPU time on Unix and no longer exists on
    # Python 3.8+; use the high-resolution wall clock when the runtime has
    # one and fall back to time.time() otherwise.
    timer = getattr(time, 'perf_counter', time.time)
    started = timer()
    result = fn(*args, **kwargs)
    return timer() - started, result
def make_soup(url, max_retries=3):
    """Fetch ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: address of the page to download.
        max_retries: how many fetch attempts to make before giving up.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when no attempt
        returned HTTP 200 or the body could not be parsed.
    """
    # Imported locally so this function does not depend on the module's
    # import block.
    import logging

    response = None
    # Bounded retry: the previous ``while True`` loop never terminated for
    # a URL that keeps answering with a non-200 status (e.g. a 404).
    for attempt in range(1, max_retries + 1):
        try:
            candidate = urlfetch.fetch(url)
        except urlfetch.Error:
            logging.exception('fetch failed (attempt %d/%d): %s',
                              attempt, max_retries, url)
            continue
        if candidate.status_code == 200:
            response = candidate
            break
        logging.warning('HTTP %s (attempt %d/%d): %s',
                        candidate.status_code, attempt, max_retries, url)

    if response is None:
        return None

    try:
        return BeautifulSoup(response.content)
    except Exception:
        # Parsing stays best-effort (callers test the result for None),
        # but the failure is logged instead of being printed.
        logging.exception('could not parse %s', url)
        return None
#--------------- main function ---------------------- | |
def get_all_prods(category):
    """Collect the product nodes from every listing page of ``category``.

    The category landing page is fetched first; its pagination block is
    used to build the URLs of the listing pages, which are then fetched one
    by one.

    Args:
        category: category id appended to ``CATEGORY_URL``.

    Returns:
        A list of product nodes (as returned by ``get_page_products``);
        empty when the landing page could not be loaded.
    """
    landing = make_soup(CATEGORY_URL + category)
    if landing is None:
        # make_soup returns None on failure; the later pages were already
        # guarded, the first one was not.
        return []

    products = get_page_products(landing)
    pages, items = get_pages_and_items(landing, category)
    # NOTE(review): get_pages_urls starts at startValue=1, which looks like
    # the landing page again, so its products may be collected twice --
    # confirm against the site before de-duplicating.
    for page_url in get_pages_urls(pages, items, category):
        page = make_soup(page_url)
        if page:
            products += get_page_products(page)
    return products
def get_page_products(s):
    """Return the product nodes found on one parsed listing page.

    Args:
        s: parsed page; must support ``find(id=...)``.

    Returns:
        A plain list of the ``div.category-product`` nodes inside the
        element with id ``category-products``; an empty list when that
        element is missing.
    """
    container = s.find(id='category-products')
    if container is None:
        # A page without the product container has no products to offer.
        return []
    return list(container.findAll('div', 'category-product'))
def get_pages_and_items(s, category):
    """Read the pagination block of a parsed category page.

    Args:
        s: parsed page; must support ``find(class_=...)``.
        category: unused; kept for interface compatibility.

    Returns:
        ``(pages, items)`` where ``pages`` is the last number appearing in
        the text of the ``category-pagination-pages`` element and ``items``
        is the integer after the last ``=`` in the ``href`` of the first
        link inside that element.

    NOTE(review): a page without a pagination block makes this raise
    (``find`` returns None) -- confirm whether single-page categories occur.
    """
    pagination = s.find(class_='category-pagination-pages')
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    numbers = re.findall(r'\d+', pagination.text)
    pages = int(numbers[-1])
    first_link = pagination.find('a')
    items = int(first_link['href'].split('=')[-1])
    return pages, items
def get_pages_urls(pages, items, catg):
    """Build the listing-page URLs for category ``catg``.

    The ``startValue`` query parameter begins at 1 and grows in steps of
    ``items - 1``, stopping before ``pages * (items - 1)``.
    """
    template = CATEGORY_URL + catg + '&startValue=%d'
    step = items - 1
    urls = []
    for start in range(1, pages * step, step):
        urls.append(template % start)
    return urls
#---------------functions parsing products from one page--------------- | |
def get_swatches(p):
    """Return the color codes listed in a product's swatch list.

    Each code is the last three characters of the ``href`` of the link in
    an ``li`` of the ``ul.category-product-swatches`` element. An empty
    list is returned when the product has no such element.
    """
    swatch_list = p.find('ul', 'category-product-swatches')
    if not swatch_list:
        return []
    codes = []
    for item in swatch_list.findAll('li'):
        codes.append(item.a['href'][-3:])
    return codes
def get_id_and_colors(p):
    """Return ``(product_id, colors)`` for a product node.

    The id and a default color come from the file name of the product
    image, which must consist of exactly three ``_``-separated parts.
    ``colors`` is the swatch list when there is one, otherwise a
    one-element list holding the default color.
    """
    image_src = p.find('p', 'category-product-image').img['src']
    file_name = image_src.split('/')[-1]
    product_id, default_color, _ = file_name.split('_')
    return product_id, (get_swatches(p) or [default_color])
def get_detail(p):
    """Return ``(price, name, instore)`` for a product node.

    ``price`` comes from ``get_price`` applied to the description element,
    ``name`` is the string of the first link in that element, and
    ``instore`` is False exactly when the product carries an
    ``li.category-product-online-only`` marker.
    """
    description = p.find(class_='category-product-description')
    online_only = p.find('li', "category-product-online-only")
    instore = not online_only
    price = get_price(description)
    name = description.find('a').string
    return price, name, instore
def get_price(des):
    """Extract a numeric price from a product description node.

    The price text is the first non-empty of: the ``.string`` of the
    ``h3.price`` element, its ``.text``, or the ``.string`` of the
    ``span.price-sale`` inside the ``price``-classed element.

    * Text of at most 8 characters: characters 1-5 are parsed as a float
      (the first character is skipped -- presumably a currency symbol).
    * Longer text: every number is extracted and the last one is returned;
      when exactly two numbers are present it is divided by the first.
      NOTE(review): that looks like a per-item price for "N for $X"
      offers -- confirm against real listings.

    Returns:
        The price as a float.
    """
    heading = des.find('h3', 'price')  # looked up once instead of twice
    price = (heading.string or
             heading.text or
             des.find(class_='price').find('span', 'price-sale').string)
    if len(price) > 8:
        # A real list (not a lazy ``map`` object), so indexing works on
        # every Python version; raw string keeps the regex escapes valid.
        numbers = [float(n) for n in re.findall(r'\d+(?:\.\d+)?', price)]
        divisor = numbers[-2] if len(numbers) == 2 else 1
        return numbers[-1] / divisor
    return float(price[1:6])
def compose_img_link(pid, colors, form):
    """Build an image URL for product ``pid`` in a randomly chosen color.

    ``form`` is the suffix appended after the angle letter (the callers in
    this file pass ``THUMB`` or ``DETAIL``).
    """
    chosen_color = random.choice(colors)
    # TODO: later pick the angle at random as well: random.choice('bdf')
    angle = 'b'
    base = IMG_URL % (pid, chosen_color)
    return base + angle + form
#---------------end--------------- | |
class Product(db.Expando):
    """Datastore entity describing one scraped product."""

    colors = db.ListProperty(str)    # color codes used in image URLs
    price = db.FloatProperty()
    pid = db.StringProperty()        # product id used in image/detail URLs
    instore = db.BooleanProperty()
    category = db.StringProperty()

    def thumb(self):
        """Return a thumbnail image URL in a randomly chosen color."""
        return compose_img_link(self.pid, self.colors, THUMB)

    def detail(self):
        """Return a detail-size image URL in a randomly chosen color."""
        return compose_img_link(self.pid, self.colors, DETAIL)

    def detail_page(self):
        """Return the URL of the product's detail page."""
        return DETAIL_PAGE + self.pid
def get_or_update(category, update=False):
    """Return the products of ``category``, scraping them when necessary.

    The cached/stored list is used when it is non-empty and ``update`` is
    falsy; otherwise the category is re-scraped via ``update_category``.
    """
    cached = get_or_set_mem(category, update=0, val=None, catg=1) or []
    if cached and not update:
        return cached
    return update_category(category)
def update_category(category):
    """Scrape ``category``, store its products and refresh the caches.

    Every scraped product is written through ``update_product``; afterwards
    up to 1000 stored products of the category are cached under the
    category key and the ``'allprods'`` cache entry is rebuilt.

    Returns:
        The value cached for the category.
    """
    for node in get_all_prods(category):
        pid, colors = get_id_and_colors(node)
        price, name, instore = get_detail(node)
        update_product(name, pid, colors, price, instore, category)

    stored = Product.all().filter('category =', category).fetch(1000)
    refreshed = get_or_set_mem(category, update=1, val=stored)
    get_or_set_mem('allprods', update=1)
    return refreshed
def filter_and_fetch(k, v, keys_only=False):
    """Fetch up to 1000 products whose property ``k`` equals ``v``.

    Args:
        k: name of the property to filter on.
        v: value the property must equal.
        keys_only: when true, return datastore keys instead of entities.
            (Previously accepted but ignored.)

    Returns:
        A list of ``Product`` entities, or of keys when ``keys_only`` is
        set.
    """
    query = Product.all(keys_only=keys_only).filter(k + ' =', v)
    return list(query.fetch(1000))
def update_product(name, pid, colors, price, instore, category):
    """Create the product keyed by ``name`` or refresh its stored price.

    ``Product.get_or_insert`` creates the entity with the given values when
    it does not exist yet. For an existing entity only a changed price is
    written back; the other arguments are left as stored.
    """
    entity = Product.get_or_insert(
        name,
        pid=pid,
        colors=colors,
        price=price,
        instore=instore,
        category=category,
    )
    if entity.price == price:
        return
    entity.price = price
    entity.put()
def get_or_set_mem(key, update=False, val=None, catg=False):
    """Return the product list cached under ``key``, rebuilding it if needed.

    The cache entry is rebuilt when ``update`` is truthy or when nothing
    (or an empty list) is cached. The fresh value is chosen as follows:

    * ``catg`` truthy -- ``key`` is treated as a category id and the
      matching products are fetched (``val`` is ignored);
    * ``val`` given -- ``val`` is cached as is, even when it is empty;
    * otherwise -- every ``Product`` entity is loaded.

    Returns:
        The cached or freshly stored list.
    """
    cached = memcache.get(key) or []
    if cached and not update:
        return cached

    if catg:
        fresh = filter_and_fetch('category', key)
    elif val is not None:
        fresh = val
    else:
        fresh = list(Product.all())
    # An empty category must stay empty. The former ``val or list(prods)``
    # fallback stored the whole catalogue under the category key, which
    # also kept get_or_update from ever scraping that category.
    memcache.set(key, fresh)
    return fresh
class BaseHandler(webapp2.RequestHandler):
    """Request handler base class with Jinja2 rendering helpers."""

    def render(self, template, **params):
        """Render ``template`` with ``params`` and write it to the response."""
        html = jinja_env.get_template(template).render(params)
        self.response.write(html)

    def render_front(self, prods):
        """Render ``front.html`` with ``prods`` exposed as ``products``."""
        self.render('front.html', products=prods)
class CategoryHandler(BaseHandler):
    """Shows the products of one category in random order."""

    def get(self, category):
        """Render the front page with the shuffled products of ``category``."""
        products = get_or_update(category)
        random.shuffle(products)  # in place
        self.render_front(products)
class SearchHandler(BaseHandler):
    """Case-insensitive substring search over product key names."""

    def get(self, k):
        """Render the products whose key name contains ``k``."""
        needle = k.upper()
        matching_keys = []
        # Keys-only query: only the key names are needed for matching.
        for key in Product.all(keys_only=True):
            if needle in key.name().upper():
                matching_keys.append(key)
        self.render_front(Product.get(matching_keys))
class UpdateHandler(BaseHandler):
    """Forces a re-scrape of one category and shows the result."""

    def get(self, catg):
        """Re-scrape ``catg`` and render the refreshed product list."""
        self.render_front(update_category(catg))
class FrontPage(BaseHandler):
    """Landing page: every product, in random order."""

    def get(self):
        """Render the front page with the shuffled ``'allprods'`` list."""
        everything = get_or_set_mem('allprods')
        random.shuffle(everything)  # in place
        self.render_front(everything)
#------------------- let's parse end --------------------------------------------- | |
# WSGI entry point. Route patterns are raw strings so that ``\w`` reaches
# the regex engine as written instead of being an invalid string escape.
# NOTE(review): debug=True is kept as in the original -- confirm it is
# wanted outside development.
app = webapp2.WSGIApplication([
    ('/', FrontPage),
    (r'/search/(\w+)', SearchHandler),
    (r'/update/(\w+)', UpdateHandler),
    (r'/categories/(\w+)', CategoryHandler),
], debug=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment