# Source: GitHub Gist thisiswei/5931331 (by @thisiswei, last active December 19, 2015).
import webapp2
import jinja2
import os
import sys
import random
sys.path.insert(0, 'libs')
import re
import time
import jinja2
from bs4 import BeautifulSoup
from google.appengine.ext import db
from google.appengine.api import urlfetch
from google.appengine.api import memcache
# Jinja2 environment: templates are loaded from the ``templates`` directory
# that sits next to this module, with autoescaping switched on.
template_dir = os.path.join(os.path.dirname(__file__), 'templates')
jinja_env = jinja2.Environment(
    loader=jinja2.FileSystemLoader(template_dir),
    autoescape=True,
)

# Category identifiers (not referenced by the code visible in this module).
categories = [
    'W_APP_DRESSES',
    'WOMENS_SHOES',
    'W_NEWARRIVALS',
    'W_CAMP',
    'M_TOPS',
    'MENS_SHOES',
    'M_NEWARRIVALS',
    'M_PATTERN',
    'A_DEC_BEDDING',
    'A_MEDIA_GADGETS',
    'A_NEWARRIVALS',
    'W_WORKAHOLIC',
    'SALE_W',
    'SALE_M',
    'SALE_APT',
]

# URL building blocks for the scraped site.
HOST = 'http://www.urbanoutfitters.com/'
DETAIL_PAGE = HOST + 'urban/catalog/productdetail.jsp?id='
CATEGORY_URL = HOST + 'urban/catalog/category.jsp?id='
# Filled with (product id, colour code); an angle letter and one of the
# query-string suffixes below are appended by compose_img_link().
IMG_URL = 'http://images.urbanoutfitters.com/is/image/UrbanOutfitters/%s_%s_'
DETAIL = '?$detailMain$'
THUMB = '?$detailThumb$'
def make_soup(url, max_attempts=5):
    """Fetch *url* and parse the response body with BeautifulSoup.

    The fetch is repeated while the server answers with a non-200 status,
    but at most ``max_attempts`` times, so a URL that never succeeds cannot
    keep the caller spinning forever.

    Args:
        url: URL to download with ``urlfetch.fetch``.
        max_attempts: Upper bound on fetch attempts; values below 1 are
            treated as 1.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when no attempt
        returned HTTP 200 or the body could not be parsed.

    Exceptions raised by ``urlfetch.fetch`` itself are not caught here.
    """
    response = None
    for _ in range(max(1, max_attempts)):
        response = urlfetch.fetch(url)
        if response.status_code == 200:
            break
        # Report the failing URL before the next attempt.
        print(url)
    else:
        # Every attempt came back with a non-200 status.
        return None
    try:
        return BeautifulSoup(response.content)
    except Exception:
        # Parsing is best-effort: report the URL and let the caller skip it.
        print(url)
        return None
#--------------- main function ----------------------
def get_all_prods(category):
    """Collect the product nodes of every listing page of *category*.

    The first listing page is fetched from ``CATEGORY_URL + category``; its
    pagination block determines the further page URLs, which are fetched one
    by one. Pages that ``make_soup`` cannot deliver are skipped.

    Args:
        category: Category id appended to ``CATEGORY_URL``.

    Returns:
        A list of product nodes as returned by ``get_page_products``; an
        empty list when the first page itself could not be fetched/parsed.
    """
    first_page = make_soup(CATEGORY_URL + category)
    if first_page is None:
        # make_soup() returns None on failure; nothing to scrape then.
        return []
    prods = get_page_products(first_page)
    pages, items = get_pages_and_items(first_page, category)
    for page_uri in get_pages_urls(pages, items, category):
        page_soup = make_soup(page_uri)
        if page_soup is not None:
            prods += get_page_products(page_soup)
    return prods
def get_page_products(s):
    """Return the product nodes found on one parsed listing page.

    Args:
        s: Parsed page exposing ``find``; the element with id
            ``category-products`` is searched for ``div`` elements with
            class ``category-product``.

    Returns:
        A plain list of the matching nodes; an empty list when the page has
        no ``category-products`` container.
    """
    container = s.find(id='category-products')
    if container is None:
        return []
    return list(container.findAll('div', 'category-product'))
def get_pages_and_items(s, category):
    """Read the pagination block of a parsed listing page.

    Args:
        s: Parsed page; its element with class ``category-pagination-pages``
            is inspected.
        category: Unused; kept for the existing call signature.

    Returns:
        A ``(pages, items)`` tuple of ints: ``pages`` is the last run of
        digits in the pagination text, ``items`` is whatever follows the
        final ``=`` in the first link's ``href`` (presumably the start
        offset of the following page -- TODO confirm).
    """
    pagination = s.find(class_='category-pagination-pages')
    digit_runs = re.findall(r'\d+', pagination.text)
    pages = int(digit_runs[-1])
    href = pagination.find('a')['href']
    items = int(href.rsplit('=', 1)[-1])
    return pages, items
def get_pages_urls(pages, items, catg):
    """Build the listing-page URLs of a category.

    Args:
        pages: Page count as returned by ``get_pages_and_items``.
        items: Second value returned by ``get_pages_and_items``; the stride
            between ``startValue`` offsets is ``items - 1``.
        catg: Category id appended to ``CATEGORY_URL``.

    Returns:
        A list of URLs whose ``startValue`` runs over
        ``range(1, pages * (items - 1), items - 1)``.
    """
    template = CATEGORY_URL + catg + '&startValue=%d'
    stride = items - 1
    urls = []
    for start in range(1, pages * stride, stride):
        urls.append(template % start)
    return urls
#---------------functions parsing products from one page---------------
def get_swatches(p):
    """Return the colour codes listed in a product node's swatch list.

    Args:
        p: Product node; its ``ul`` with class ``category-product-swatches``
            is inspected.

    Returns:
        The last three characters of each swatch link's ``href``, in
        document order; an empty list when the node has no swatch list.
    """
    swatch_list = p.find('ul', 'category-product-swatches')
    if not swatch_list:
        return []
    codes = []
    for item in swatch_list.findAll('li'):
        codes.append(item.a['href'][-3:])
    return codes
def get_id_and_colors(p):
    """Extract the product id and its colour codes from a product node.

    The id and a default colour come from the file name of the product
    image, which must consist of exactly three ``_``-separated parts
    (a ``ValueError`` is raised otherwise).

    Args:
        p: Product node containing a ``p.category-product-image`` element
            with an ``img``.

    Returns:
        ``(product_id, colors)`` where ``colors`` is the swatch list from
        ``get_swatches`` or, when that is empty, a one-element list holding
        the colour taken from the image name.
    """
    image_src = p.find('p', 'category-product-image').img['src']
    file_name = image_src.split('/')[-1]
    p_id, color, _ = file_name.split('_')
    return p_id, (get_swatches(p) or [color])
def get_detail(p):
    """Extract price, name and in-store availability from a product node.

    Args:
        p: Product node containing an element with class
            ``category-product-description``.

    Returns:
        ``(price, name, instore)``: the price from ``get_price``, the string
        of the description's first link, and ``False`` exactly when the node
        has an ``li`` with class ``category-product-online-only``.
    """
    description = p.find(class_='category-product-description')
    online_only = p.find('li', "category-product-online-only")
    instore = not online_only
    price = get_price(description)
    name = description.find('a').string
    return price, name, instore
def get_price(des):
    """Parse the price shown in a product description node.

    The price text is taken from the ``h3.price`` element (its ``string``,
    else its ``text``), falling back to the ``span.price-sale`` inside the
    element with class ``price``.

    Args:
        des: Product description node.

    Returns:
        The price as a float. For texts longer than 8 characters that hold
        exactly two numbers the result is ``last / first`` (presumably a
        "2 for $30" style offer -- TODO confirm); otherwise the last number
        for long texts and the first number for short ones.

    Raises:
        ValueError: If the text contains no number.
    """
    heading = des.find('h3', 'price')
    text = (heading.string or
            heading.text or
            des.find(class_='price').find('span', 'price-sale').string)
    # Drop thousands separators so "1,299.00" is read as a single number.
    amounts = [float(number)
               for number in re.findall(r'\d+(?:\.\d+)?', text.replace(',', ''))]
    if not amounts:
        raise ValueError('no price found in %r' % (text,))
    if len(text) > 8:
        divisor = amounts[-2] if len(amounts) == 2 else 1
        return amounts[-1] / divisor
    # Parse the whole number instead of slicing a fixed window, which used
    # to cut the last digit off prices such as "$129.99".
    return amounts[0]
def compose_img_link(pid, colors, form):
    """Build an image URL for a product in a randomly chosen colour.

    Args:
        pid: Product id inserted into ``IMG_URL``.
        colors: Non-empty sequence of colour codes; one is picked at random.
        form: Suffix appended to the URL (``THUMB`` or ``DETAIL``).

    Returns:
        ``IMG_URL`` filled with the id and colour, followed by the angle
        letter and *form*.
    """
    # TODO: later pick the angle with random.choice('bdf').
    angle = 'b'
    chosen_color = random.choice(colors)
    return (IMG_URL % (pid, chosen_color)) + angle + form
#------------------------------
def get_or_update(category, update=False):
    """Return the products of *category*, scraping them when necessary.

    Args:
        category: Category id, also used as the memcache key.
        update: When true, always re-scrape via ``update_category``.

    Returns:
        The list from ``get_or_set_mem`` when it is non-empty and no update
        was requested, otherwise the result of ``update_category``.
    """
    cached = get_or_set_mem(category, 0, None, 1) or []
    if cached and not update:
        return cached
    return update_category(category)
def get_or_set_mem(key, update=False, val=None, catg=False):
    """Return the product list cached under *key*, refreshing it if needed.

    The cache is refreshed when *update* is true or nothing (or an empty
    list) is cached. The fresh value is chosen as follows:

    * ``catg`` true: the products whose ``category`` equals *key*;
    * otherwise *val* when one was supplied (even an empty list);
    * otherwise every ``Product`` in the datastore.

    An empty category result or an empty *val* is kept as-is rather than
    being replaced by the full catalogue, so a category key never ends up
    holding unrelated products.

    Args:
        key: Memcache key; doubles as the category id when *catg* is true.
        update: Force a refresh even if a value is cached.
        val: Value to store on refresh (ignored when *catg* is true).
        catg: Treat *key* as a category and query the datastore for it.

    Returns:
        The cached or freshly stored list.
    """
    cached = memcache.get(key) or []
    if cached and not update:
        return cached
    if catg:
        fresh = filter_and_fetch('category', key)
    elif val is not None:
        fresh = val
    else:
        fresh = list(Product.all())
    memcache.set(key, fresh)
    return fresh
def update_category(category):
    """Scrape *category*, store its products and refresh the caches.

    Every product node from ``get_all_prods`` is written through
    ``update_product``; afterwards up to 1000 stored products of the
    category are fetched and cached under the category key, and the
    ``'allprods'`` cache entry is refreshed as well.

    Args:
        category: Category id to scrape.

    Returns:
        The value returned by ``get_or_set_mem`` for the category key.
    """
    for node in get_all_prods(category):
        pid, colors = get_id_and_colors(node)
        price, name, instore = get_detail(node)
        update_product(name, pid, colors, price, instore, category)
    stored = Product.all().filter('category =', category).fetch(1000)
    refreshed = get_or_set_mem(category, 1, stored)
    get_or_set_mem('allprods', 1)
    return refreshed
def filter_and_fetch(k, v, keys_only=False):
    """Fetch up to 1000 products whose property *k* equals *v*.

    Args:
        k: Property name; ``' ='`` is appended to form the filter.
        v: Value the property must equal.
        keys_only: Unused; kept for the existing call signature.

    Returns:
        A list of matching ``Product`` entities.
    """
    query = Product.all().filter(k + ' =', v)
    return list(query.fetch(1000))
def update_product(name, pid, colors, price, instore, category):
    """Create the product keyed by *name*, or update its stored price.

    ``Product.get_or_insert`` is called with *name* as the key name and the
    remaining arguments as properties. When the entity it returns carries a
    different price, only the price is overwritten and saved.

    Returns:
        None.
    """
    properties = dict(pid=pid, colors=colors, price=price,
                      instore=instore, category=category)
    prod = Product.get_or_insert(name, **properties)
    if prod.price == price:
        return
    prod.price = price
    prod.put()
class Product(db.Expando):
    """Datastore entity for one scraped product."""

    # Colour codes used when building image URLs.
    colors = db.ListProperty(str)
    price = db.FloatProperty()
    # Product id used in image and detail-page URLs.
    pid = db.StringProperty()
    instore = db.BooleanProperty()
    category = db.StringProperty()

    def thumb(self):
        """Return a thumbnail image URL in a randomly chosen colour."""
        thumb_url = compose_img_link(self.pid, self.colors, THUMB)
        return thumb_url

    def detail(self):
        """Return a detail image URL in a randomly chosen colour."""
        detail_url = compose_img_link(self.pid, self.colors, DETAIL)
        return detail_url

    def detail_page(self):
        """Return the URL of the product's detail page."""
        page_url = DETAIL_PAGE + self.pid
        return page_url
class BaseHandler(webapp2.RequestHandler):
    """Request handler base class with template rendering helpers."""

    def render(self, template, **params):
        """Render *template* with *params* and write it to the response."""
        page = jinja_env.get_template(template)
        html = page.render(params)
        self.response.write(html)

    def render_front(self, prods):
        """Render ``front.html`` with *prods* as its ``products`` value."""
        self.render('front.html', products=prods)
class CategoryHandler(BaseHandler):
    """Shows the products of one category in random order."""

    def get(self, category):
        """Load (or scrape) *category*, shuffle it and render the page."""
        category_products = get_or_update(category)
        # Shuffles the list in place before it is handed to the template.
        random.shuffle(category_products)
        self.render_front(category_products)
class SearchHandler(BaseHandler):
    """Looks products up by name (case-insensitive exact match)."""

    def get(self, key):
        """Render the products whose key name equals *key*, ignoring case.

        Products are stored with their name as the datastore key name (see
        ``update_product``), so a keys-only query is enough to find the
        matches; note that it walks over every product key.

        Args:
            key: Search term captured from the URL.
        """
        wanted = key.upper()
        # Keys-only queries yield db.Key objects; name() is a method and
        # returns None for keys that use a numeric id.
        matching = [k for k in Product.all(keys_only=True)
                    if (k.name() or '').upper() == wanted]
        found = Product.get(matching) if matching else []
        self.render_front(found)
class UpdateHandler(BaseHandler):
    """Re-scrapes one category on request."""

    def get(self, catg):
        """Run ``update_category`` for *catg* and render its result."""
        refreshed = update_category(catg)
        self.render_front(refreshed)
class FrontPage(BaseHandler):
    """Landing page listing the ``'allprods'`` cache entry in random order."""

    def get(self):
        """Load the ``'allprods'`` list, shuffle it and render the page."""
        everything = get_or_set_mem('allprods')
        # Shuffles the list in place before it is handed to the template.
        random.shuffle(everything)
        self.render_front(everything)
# URL routing table; each captured ``\w+`` group is passed to the
# handler's get() method. The application is created with debug=True.
app = webapp2.WSGIApplication(
    [
        ('/', FrontPage),
        (r'/search/(\w+)', SearchHandler),
        (r'/update/(\w+)', UpdateHandler),
        (r'/categories/(\w+)', CategoryHandler),
    ],
    debug=True,
)
# End of gist thisiswei/5931331.