Skip to content

Instantly share code, notes, and snippets.

@thisiswei
Created July 2, 2013 05:27
Show Gist options
  • Save thisiswei/5906955 to your computer and use it in GitHub Desktop.
Save thisiswei/5906955 to your computer and use it in GitHub Desktop.
import webapp2
import jinja2
import os
import sys
import random
sys.path.insert(0, 'libs')
import re
import time
import jinja2
from bs4 import BeautifulSoup
from google.appengine.ext import db
from google.appengine.api import urlfetch
from google.appengine.api import memcache
# Jinja2 templates are loaded from a "templates" directory next to this module.
template_dir = os.path.join(os.path.dirname(__file__), 'templates')
jinja_env = jinja2.Environment(
    loader=jinja2.FileSystemLoader(template_dir),
    autoescape=True,
)

# NOTE(review): not referenced anywhere else in this file; presumably the
# category ids that can be appended to CATEGORY_URL -- confirm.
categories = [
    'W_APP_DRESSES', 'WOMENS_SHOES', 'W_NEWARRIVALS', 'W_CAMP',
    'M_TOPS', 'MENS_SHOES', 'M_NEWARRIVALS', 'M_PATTERN',
    'A_DEC_BEDDING', 'A_MEDIA_GADGETS', 'A_NEWARRIVALS', 'W_WORKAHOLIC',
    'SALE_W', 'SALE_M', 'SALE_APT',
]

# Product detail page; the product id is appended (see Product.detail_page).
DETAIL_PAGE = 'http://www.urbanoutfitters.com/urban/catalog/productdetail.jsp?id='
HOST = 'http://www.urbanoutfitters.com/'
# Category listing page; the category id is appended (see get_all_prods).
CATEGORY_URL = 'http://www.urbanoutfitters.com/urban/catalog/category.jsp?id='
# Image URL prefix; the two placeholders take (product id, color code).
IMG_URL = 'http://images.urbanoutfitters.com/is/image/UrbanOutfitters/%s_%s_'
# Suffixes appended to an image URL by compose_img_link.
DETAIL = '?$detailMain$'
THUMB = '?$detailThumb$'
def timedcall(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` and measure how long the call takes.

    Returns a ``(elapsed_seconds, result)`` tuple, where ``result`` is
    whatever ``fn`` returned.  Exceptions raised by ``fn`` propagate.
    """
    # time.clock() measures CPU time on Unix and no longer exists in
    # Python 3.8+; timeit.default_timer is available on both Python 2
    # and Python 3.
    from timeit import default_timer

    started = default_timer()
    result = fn(*args, **kwargs)
    return default_timer() - started, result
def make_soup(url, max_attempts=3):
    """Fetch ``url`` and parse the response body with BeautifulSoup.

    The fetch is attempted up to ``max_attempts`` times; an attempt counts
    as failed when urlfetch raises or the status code is not 200.  The
    previous implementation looped forever on a URL that never answered 200.

    Returns the parsed soup, or None when every attempt failed or the body
    could not be parsed.
    """
    import logging

    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            candidate = urlfetch.fetch(url)
        except urlfetch.Error:
            logging.warning('fetch of %s raised (attempt %d of %d)',
                            url, attempt, max_attempts)
            continue
        if candidate.status_code == 200:
            response = candidate
            break
        logging.warning('fetch of %s returned HTTP %s (attempt %d of %d)',
                        url, candidate.status_code, attempt, max_attempts)

    if response is None:
        return None

    try:
        return BeautifulSoup(response.content)
    except Exception:
        # Best effort, as before: a page that cannot be parsed yields None.
        logging.exception('could not parse %s', url)
        return None
#--------------- main function ----------------------
def get_all_prods(category):
    """Collect the product tags of every listing page of ``category``.

    Fetches ``CATEGORY_URL + category``, reads the pagination info from that
    first page, then fetches each page URL produced by ``get_pages_urls``.

    Returns a list of product tags; the list is empty when the first page
    could not be fetched or parsed.  Pages that fail later are skipped.
    """
    first_page = make_soup(CATEGORY_URL + category)
    if first_page is None:
        # make_soup returns None on failure; without this guard the calls
        # below would raise AttributeError.
        return []

    prods = get_page_products(first_page)
    pages, items = get_pages_and_items(first_page, category)
    # NOTE(review): get_pages_urls starts at startValue=1, which presumably
    # is the first page again -- confirm whether it is fetched twice.
    for page_uri in get_pages_urls(pages, items, category):
        page = make_soup(page_uri)
        if page is None:
            continue
        prods += get_page_products(page)
    return prods
def get_page_products(s):
    """Return the product tags found on one parsed listing page.

    ``s`` is the parsed page.  The products are the ``div`` elements with
    class ``category-product`` inside the element whose id is
    ``category-products``.  Returns a new list, which is empty when the
    page has no such container.
    """
    container = s.find(id='category-products')
    if container is None:
        # No product container on this page: report "no products" rather
        # than failing with AttributeError on None.
        return []
    return list(container.findAll('div', 'category-product'))
def get_pages_and_items(s, category):
    """Read the pagination info from a parsed listing page.

    Looks at the element with class ``category-pagination-pages``.  The first
    returned value is the last integer that appears in its text; the second
    is the integer after the final ``=`` in the href of its first link.
    ``category`` is accepted but not used.

    Returns a ``(pages, items)`` tuple of ints.
    """
    pagination = s.find(class_='category-pagination-pages')
    numbers = re.findall(r'\d+', pagination.text)
    last_page = int(numbers[-1])
    href = pagination.find('a')['href']
    start_value = int(href.split('=')[-1])
    return last_page, start_value
def get_pages_urls(pages, items, catg):
    """Build the listing-page URLs for category ``catg``.

    Each URL is ``CATEGORY_URL + catg`` followed by ``&startValue=<n>``,
    where ``n`` runs over ``range(1, pages * step, step)`` with
    ``step = items - 1``.

    Returns the URLs as a list of strings.
    """
    template = CATEGORY_URL + catg + '&startValue=%d'
    step = items - 1
    urls = []
    for start in range(1, pages * step, step):
        urls.append(template % start)
    return urls
#---------------functions parsing products from one page---------------
def get_swatches(p):
    """Return the swatch codes listed for product tag ``p``.

    A code is the last three characters of the link href of each ``li``
    inside the ``ul`` with class ``category-product-swatches``.  Returns an
    empty list when the product has no such ``ul``.
    """
    swatch_list = p.find('ul', 'category-product-swatches')
    if not swatch_list:
        return []
    codes = []
    for item in swatch_list.findAll('li'):
        codes.append(item.a['href'][-3:])
    return codes
def get_id_and_colors(p):
    """Extract the product id and color codes from product tag ``p``.

    The file name of the product image is split on ``_`` into exactly three
    parts; the first is the id and the second a color code.  The swatch
    codes from ``get_swatches`` are preferred; the image's own color code is
    used only when there are no swatches.

    Returns ``(product_id, list_of_color_codes)``.
    """
    src = p.find('p', 'category-product-image').img['src']
    filename = src.split('/')[-1]
    p_id, color, _ = filename.split('_')
    swatches = get_swatches(p)
    if swatches:
        return p_id, swatches
    return p_id, [color]
def get_detail(p):
    """Extract price, name and in-store flag from product tag ``p``.

    The price comes from ``get_price`` applied to the element with class
    ``category-product-description``; the name is the string of that
    element's first link.  ``instore`` is False when the product carries a
    ``category-product-online-only`` list item, True otherwise.

    Returns ``(price, name, instore)``.
    """
    description = p.find(class_='category-product-description')
    online_only = p.find('li', "category-product-online-only")
    instore = not online_only
    price = get_price(description)
    name = description.find('a').string
    return price, name, instore
def get_price(des):
    """Parse the price shown in a product description element.

    The price text is taken from the ``h3.price`` element (its ``string``,
    else its ``text``), falling back to the ``span.price-sale`` string.
    Thousands separators are removed before parsing, so "$1,299.00" yields
    1299.0 (it used to be read as the two numbers 1 and 299.00).

    Text longer than eight characters is scanned for numbers: with exactly
    two numbers the result is the last divided by the first, otherwise it is
    the last number.  Shorter text is parsed as characters 1-5, i.e. the
    part after a one-character prefix.

    Returns the price as a float.
    """
    heading = des.find('h3', 'price')
    price = (heading.string or
             heading.text or
             des.find(class_='price').find('span', 'price-sale').string)
    price = price.replace(',', '')
    if len(price) > 8:
        # A list (not map()) so that indexing also works on Python 3.
        numbers = [float(n) for n in re.findall(r'\d+(?:\.\d+)?', price)]
        # NOTE(review): the two-number case presumably targets multi-buy
        # text such as "2 for $30.00" -- confirm against the site markup.
        return numbers[-1] / (numbers[-2] if len(numbers) == 2 else 1)
    return float(price[1:6])
def compose_img_link(pid, colors, form):
    """Build an image URL for product ``pid``.

    One entry of ``colors`` is picked at random, inserted into ``IMG_URL``
    together with ``pid``, and followed by the angle letter and ``form``
    (callers in this file pass ``THUMB`` or ``DETAIL``).
    """
    # TODO: later pick the angle with random.choice('bdf').
    angle = 'b'
    color = random.choice(colors)
    prefix = IMG_URL % (pid, color)
    return prefix + angle + form
#---------------end---------------
class Product(db.Expando):
    """Datastore entity describing one scraped product."""

    colors = db.ListProperty(str)
    price = db.FloatProperty()
    pid = db.StringProperty()
    instore = db.BooleanProperty()
    category = db.StringProperty()

    def thumb(self):
        """Return a thumbnail image URL in a randomly chosen color."""
        return compose_img_link(self.pid, self.colors, THUMB)

    def detail(self):
        """Return a detail image URL in a randomly chosen color."""
        return compose_img_link(self.pid, self.colors, DETAIL)

    def detail_page(self):
        """Return the URL of the product's detail page."""
        return DETAIL_PAGE + self.pid
def get_or_update(category, update=False):
    """Return the products of ``category``.

    The list comes from ``get_or_set_mem``; ``update_category`` is run
    instead when ``update`` is true or that list is empty.
    """
    cached = get_or_set_mem(category, update=False, val=None, catg=True) or []
    if cached and not update:
        return cached
    return update_category(category)
def update_category(category):
    """Re-scrape ``category``, store its products and refresh memcache.

    Every product tag from ``get_all_prods`` is passed to ``update_product``.
    Afterwards up to 1000 stored products of the category are fetched and
    written to memcache under the category key, and the 'allprods' entry is
    refreshed as well.

    Returns the value ``get_or_set_mem`` returned for the category.
    """
    for prod in get_all_prods(category):
        pid, colors = get_id_and_colors(prod)
        price, name, instore = get_detail(prod)
        update_product(name, pid, colors, price, instore, category)

    stored = Product.all().filter('category =', category).fetch(1000)
    refreshed = get_or_set_mem(category, 1, stored)
    get_or_set_mem('allprods', 1)
    return refreshed
def filter_and_fetch(k, v, keys_only=False):
    """Fetch up to 1000 products whose property ``k`` equals ``v``.

    ``keys_only`` is forwarded to ``Product.all`` (it used to be accepted
    but ignored).  With the default ``False`` the behavior is unchanged.

    Returns the fetched results as a list.
    """
    query = Product.all(keys_only=keys_only).filter(k + ' =', v)
    return list(query.fetch(1000))
def update_product(name, pid, colors, price, instore, category):
    """Create the product keyed by ``name`` or refresh its stored price.

    ``Product.get_or_insert`` is called with ``name`` as the key name and
    the remaining arguments as property values.  If the entity's price
    differs from ``price``, the price is updated and the entity saved.
    Returns None.
    """
    entity = Product.get_or_insert(
        name,
        pid=pid,
        colors=colors,
        price=price,
        instore=instore,
        category=category,
    )
    if price == entity.price:
        return
    entity.price = price
    entity.put()
def get_or_set_mem(key, update=False, val=None, catg=False):
    """Return the memcache entry for ``key``, rebuilding it when needed.

    The entry is rebuilt when ``update`` is true or the cached value is
    missing or empty.  The new value is chosen as follows:

    * ``catg`` true: the products whose category equals ``key``;
    * otherwise ``val``, when one was passed (an empty list included);
    * otherwise every product.

    An empty category or an empty ``val`` used to fall through to "every
    product", so a category without stored products was cached and returned
    as the full product list.

    Returns the cached or freshly stored value.
    """
    res = memcache.get(key) or []
    if update or not res:
        if catg:
            res = filter_and_fetch('category', key)
        elif val is not None:
            res = val
        else:
            res = list(Product.all())
        memcache.set(key, res)
    return res
class BaseHandler(webapp2.RequestHandler):
    """Request handler base class with template rendering helpers."""

    def render(self, template, **params):
        """Render ``template`` with ``params`` and write it to the response."""
        rendered = jinja_env.get_template(template).render(params)
        self.response.write(rendered)

    def render_front(self, prods):
        """Render 'front.html' with ``prods`` as its ``products`` variable."""
        self.render('front.html', products=prods)
class CategoryHandler(BaseHandler):
    """Shows the products of one category in random order."""

    def get(self, category):
        """Load the category's products, shuffle them and render the page."""
        products = get_or_update(category)
        random.shuffle(products)
        self.render_front(products)
class SearchHandler(BaseHandler):
    """Shows the products whose key name contains the search term."""

    def get(self, k):
        """Render all products whose key name contains ``k``, ignoring case."""
        needle = k.upper()
        matching_keys = []
        # Only keys are iterated; matching entities are loaded afterwards.
        for key in Product.all(keys_only=True):
            if needle in key.name().upper():
                matching_keys.append(key)
        self.render_front(Product.get(matching_keys))
class UpdateHandler(BaseHandler):
    """Re-scrapes one category and shows the result."""

    def get(self, catg):
        """Run ``update_category`` for ``catg`` and render what it returns."""
        self.render_front(update_category(catg))
class FrontPage(BaseHandler):
    """Shows the 'allprods' product list in random order."""

    def get(self):
        """Load the 'allprods' list, shuffle it and render the front page."""
        everything = get_or_set_mem('allprods')
        random.shuffle(everything)
        self.render_front(everything)
#------------------- let's parse end ---------------------------------------------
# WSGI entry point: maps URL patterns to handlers; each captured group is
# passed to the handler's get() as a positional argument.
app = webapp2.WSGIApplication(
    [
        ('/', FrontPage),
        (r'/search/(\w+)', SearchHandler),
        (r'/update/(\w+)', UpdateHandler),
        (r'/categories/(\w+)', CategoryHandler),
    ],
    debug=True,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment