Last active
December 19, 2015 09:09
-
-
Save thisiswei/5931331 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
import os
import random
import re
import sys
import time

import jinja2
import webapp2

# NOTE(review): presumably bs4 is vendored under ./libs, so the search path
# has to be extended before the imports below -- confirm against the deployment.
sys.path.insert(0, 'libs')

from bs4 import BeautifulSoup
from google.appengine.api import memcache
from google.appengine.api import urlfetch
from google.appengine.ext import db
# Templates are loaded from the "templates" directory that sits next to this
# module; autoescaping is switched on for everything rendered through it.
template_dir = os.path.join(os.path.dirname(__file__), 'templates')
jinja_env = jinja2.Environment(
    loader=jinja2.FileSystemLoader(template_dir),
    autoescape=True,
)
# Category identifiers that get appended to CATEGORY_URL.
categories = [
    'W_APP_DRESSES',
    'WOMENS_SHOES',
    'W_NEWARRIVALS',
    'W_CAMP',
    'M_TOPS',
    'MENS_SHOES',
    'M_NEWARRIVALS',
    'M_PATTERN',
    'A_DEC_BEDDING',
    'A_MEDIA_GADGETS',
    'A_NEWARRIVALS',
    'W_WORKAHOLIC',
    'SALE_W',
    'SALE_M',
    'SALE_APT',
]

# Site root and the catalogue endpoints built on top of it.
HOST = 'http://www.urbanoutfitters.com/'
DETAIL_PAGE = HOST + 'urban/catalog/productdetail.jsp?id='
CATEGORY_URL = HOST + 'urban/catalog/category.jsp?id='

# Image URL template filled with (product id, colour code); an angle letter
# and one of the two size suffixes below are appended by compose_img_link().
IMG_URL = 'http://images.urbanoutfitters.com/is/image/UrbanOutfitters/%s_%s_'
DETAIL = '?$detailMain$'
THUMB = '?$detailThumb$'
def make_soup(url, max_retries=3):
    """Download *url* and parse the response body with BeautifulSoup.

    Args:
        url: Absolute URL fetched through ``urlfetch``.
        max_retries: Maximum number of fetch attempts. The original code
            retried forever on any non-200 answer, which never terminates
            for a permanently failing URL.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when no attempt
        returned HTTP 200 or the body could not be parsed.
    """
    response = None
    for attempt in range(1, max_retries + 1):
        try:
            candidate = urlfetch.fetch(url)
        except urlfetch.Error:
            logging.warning('fetch failed (attempt %d/%d): %s',
                            attempt, max_retries, url)
            continue
        if candidate.status_code == 200:
            response = candidate
            break
        logging.warning('HTTP %s (attempt %d/%d): %s',
                        candidate.status_code, attempt, max_retries, url)

    if response is None:
        return None

    try:
        return BeautifulSoup(response.content)
    except Exception:
        # Best effort, as before: a page that cannot be parsed yields None
        # instead of aborting the caller, but the failure is now logged.
        logging.exception('could not parse %s', url)
        return None
#--------------- main function ---------------------- | |
def get_all_prods(category):
    """Collect the product elements of every listing page of *category*.

    Args:
        category: Category identifier appended to ``CATEGORY_URL``.

    Returns:
        A list of product elements as returned by ``get_page_products``;
        empty when the first listing page could not be fetched or parsed.
    """
    first_page = make_soup(CATEGORY_URL + category)
    if first_page is None:
        # make_soup() signals failure with None; nothing to scrape.
        return []

    prods = get_page_products(first_page)
    pages, items = get_pages_and_items(first_page, category)
    if pages <= 1:
        # A single page has already been consumed above.
        return prods

    for page_uri in get_pages_urls(pages, items, category):
        page = make_soup(page_uri)
        if page is not None:
            prods += get_page_products(page)
    return prods
def get_page_products(s):
    """Return the product elements found on one parsed listing page.

    Args:
        s: Parsed page; must offer ``find(id=...)``.

    Returns:
        A plain list of the ``div.category-product`` elements inside the
        ``#category-products`` container, or ``[]`` when the page has no
        such container.
    """
    container = s.find(id='category-products')
    if container is None:
        return []
    return list(container.find_all('div', 'category-product'))
def get_pages_and_items(s, category):
    """Read the pagination block of a parsed listing page.

    Args:
        s: Parsed page; must offer ``find(class_=...)``.
        category: Unused; kept so existing callers keep working.

    Returns:
        A ``(pages, items)`` tuple where ``pages`` is the last number that
        appears in the pagination text and ``items`` is the integer after the
        final ``=`` in the first pagination link. ``(1, 0)`` is returned when
        the page carries no usable pagination markup.
    """
    span = s.find(class_='category-pagination-pages')
    if span is None:
        return 1, 0

    numbers = re.findall(r'\d+', span.text)
    link = span.find('a')
    if not numbers or link is None:
        return 1, 0

    pages = int(numbers[-1])
    items = int(link['href'].split('=')[-1])
    return pages, items
def get_pages_urls(pages, items, catg, base_url=None):
    """Build the listing URLs for every page of a category.

    Args:
        pages: Number of listing pages.
        items: Value read from the pagination link; the offset step between
            two pages is ``items - 1``.
        catg: Category identifier inserted after *base_url*.
        base_url: URL prefix; defaults to ``CATEGORY_URL``.

    Returns:
        URLs whose ``startValue`` runs 1, 1 + step, 1 + 2*step, ... below
        ``pages * step``. An empty list is returned when the step is not
        positive (the original raised ``ValueError`` for a zero step and
        produced descending offsets for a negative one).
    """
    # NOTE(review): startValue=1 looks like the first listing page, which
    # get_all_prods() has already fetched -- confirm against the site.
    if base_url is None:
        base_url = CATEGORY_URL
    step = items - 1
    if step <= 0:
        return []
    template = base_url + catg + '&startValue=%d'
    return [template % start for start in range(1, pages * step, step)]
#---------------functions parsing products from one page--------------- | |
def get_swatches(p):
    """Return the colour codes listed in a product's swatch list.

    Args:
        p: Product element; must offer ``find(name, css_class)``.

    Returns:
        The last three characters of every swatch link's ``href``, in
        document order. Entries without a link or without an ``href`` are
        skipped; ``[]`` is returned when the product has no swatch list.
    """
    swatch_list = p.find('ul', 'category-product-swatches')
    if swatch_list is None:
        return []

    codes = []
    for item in swatch_list.find_all('li'):
        link = item.a
        href = link.get('href') if link is not None else None
        if href:
            codes.append(href[-3:])
    return codes
def get_id_and_colors(p):
    """Return ``(product id, colour codes)`` for one product element.

    The id and a default colour are the first two of the three
    underscore-separated parts of the listing image's file name. The swatch
    codes are returned when there are any, otherwise a one-element list with
    that default colour.
    """
    image = p.find('p', 'category-product-image').img
    filename = image['src'].split('/')[-1]
    p_id, color, _ = filename.split('_')
    return p_id, (get_swatches(p) or [color])
def get_detail(p):
    """Return ``(price, name, instore)`` for one product element.

    ``price`` comes from ``get_price`` on the description block, ``name`` is
    the string of the description's first link, and ``instore`` is false
    exactly when the element carries a ``category-product-online-only`` item.
    """
    des = p.find(class_='category-product-description')
    online_only = p.find('li', "category-product-online-only")
    instore = not online_only
    return get_price(des), des.find('a').string, instore
def get_price(des):
    """Extract a numeric price from a product description block.

    The text is taken from the ``h3.price`` element (its ``string``, else its
    ``text``), falling back to the ``span.price-sale`` inside the first
    ``.price`` element.

    * Text of at most eight characters is converted from characters 1-5,
      i.e. the leading character (assumed to be a currency symbol) is
      dropped.
    * Longer text is scanned for numbers. With exactly two numbers the last
      is divided by the first (presumably "N for $X" multi-buy labels --
      TODO confirm); otherwise the last number is returned as is.

    Args:
        des: Description element; must offer ``find``.

    Returns:
        The price as a float.

    Raises:
        ValueError: If the text is long but contains no number, or the short
            form is not numeric.
    """
    heading = des.find('h3', 'price')
    price = heading.string or heading.text
    if not price:
        price = des.find(class_='price').find('span', 'price-sale').string

    if len(price) > 8:
        # A list (not map()) so indexing also works where map() is lazy.
        amounts = [float(n) for n in re.findall(r'\d+(?:\.\d+)?', price)]
        if not amounts:
            raise ValueError('no price found in %r' % (price,))
        divisor = amounts[-2] if len(amounts) == 2 else 1
        return amounts[-1] / divisor
    return float(price[1:6])
def compose_img_link(pid, colors, form):
    """Build an image URL for product *pid* in a randomly chosen colour.

    Args:
        pid: Product id inserted into ``IMG_URL``.
        colors: Non-empty sequence of colour codes; one is picked at random.
        form: Suffix appended after the angle letter.
    """
    # TODO: later pick the angle with random.choice('bdf').
    angle = 'b'
    stem = IMG_URL % (pid, random.choice(colors))
    return stem + angle + form
#------------------------------ | |
def get_or_update(category, update=False):
    """Return the products of *category*, scraping when necessary.

    The cached/stored products are used unless *update* is true or nothing
    is available, in which case the result of ``update_category`` is
    returned instead.
    """
    cached = get_or_set_mem(category, update=0, val=None, catg=1) or []
    if cached and not update:
        return cached
    return update_category(category)
def get_or_set_mem(key, update=False, val=None, catg=False):
    """Read-through memcache helper for product lists.

    Args:
        key: Memcache key; also the category name when *catg* is true.
        update: When true, ignore whatever is cached and store a new value.
        val: Value to store. ``None`` means "every Product in the datastore".
        catg: When true, the stored value is the datastore result for
            ``category == key`` (any *val* passed in is replaced).

    Returns:
        The cached list, or the list that was just stored.
    """
    cached = memcache.get(key) or []
    if cached and not update:
        return cached

    if catg:
        val = filter_and_fetch('category', key)
    if val is None:
        # Only a missing value falls back to all products. The previous
        # truthiness test also did so for an *empty* category result, which
        # cached every product under the category key and kept
        # get_or_update() from ever scraping that category.
        val = list(Product.all())

    memcache.set(key, val)
    return val
def update_category(category):
    """Scrape *category*, upsert its products and refresh the caches.

    Args:
        category: Category identifier to scrape.

    Returns:
        The list now cached for *category*, as returned by
        ``get_or_set_mem``.
    """
    for tile in get_all_prods(category):
        try:
            pid, colors = get_id_and_colors(tile)
            price, name, instore = get_detail(tile)
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            # One tile with unexpected markup should not abort the refresh
            # of the whole category; record it and move on.
            logging.exception('skipping unparseable product in %s', category)
            continue
        update_product(name, pid, colors, price, instore, category)

    fresh = filter_and_fetch('category', category)
    cached = get_or_set_mem(category, update=True, val=fresh)
    # The combined front-page list has to be rebuilt as well.
    get_or_set_mem('allprods', update=True)
    return cached
def filter_and_fetch(k, v, keys_only=False):
    """Fetch up to 1000 products whose property *k* equals *v*.

    Args:
        k: Property name used in the equality filter.
        v: Value the property must equal.
        keys_only: When true, run a keys-only query so that keys instead of
            entities are returned. The flag used to be accepted but ignored.

    Returns:
        A list with the query results.
    """
    query = Product.all(keys_only=keys_only).filter(k + ' =', v)
    return list(query.fetch(1000))
def update_product(name, pid, colors, price, instore, category):
    """Create the product keyed by *name*, or refresh the stored one.

    A new entity is inserted with all given values. For an existing entity
    ``price``, ``colors`` and ``instore`` are brought up to date (previously
    only ``price`` was) and the entity is written back once if anything
    changed. ``pid`` and ``category`` of an existing entity are left alone.

    Returns:
        The stored ``Product`` entity. The function used to return ``None``
        even though its caller binds the result.
    """
    prod = Product.get_or_insert(name, pid=pid, colors=colors, price=price,
                                 instore=instore, category=category)
    changed = False
    for attr, value in (('price', price), ('colors', colors),
                        ('instore', instore)):
        if getattr(prod, attr) != value:
            setattr(prod, attr, value)
            changed = True
    if changed:
        prod.put()
    return prod
class Product(db.Expando):
    """Datastore entity describing one scraped product."""

    colors = db.ListProperty(str)
    price = db.FloatProperty()
    pid = db.StringProperty()
    instore = db.BooleanProperty()
    category = db.StringProperty()

    def thumb(self):
        """Return a thumbnail-sized image URL for this product."""
        return compose_img_link(self.pid, self.colors, form=THUMB)

    def detail(self):
        """Return a detail-sized image URL for this product."""
        return compose_img_link(self.pid, self.colors, form=DETAIL)

    def detail_page(self):
        """Return the URL of the product's detail page."""
        return DETAIL_PAGE + self.pid
class BaseHandler(webapp2.RequestHandler):
    """Request handler base class with template rendering helpers."""

    def render(self, template, **params):
        """Render *template* with *params* and write it to the response."""
        html = jinja_env.get_template(template).render(params)
        self.response.write(html)

    def render_front(self, prods):
        """Render ``front.html`` with *prods* exposed as ``products``."""
        self.render('front.html', products=prods)
class CategoryHandler(BaseHandler):
    """Shows the products of a single category in random order."""

    def get(self, category):
        """Load (scraping if necessary) and render *category*."""
        listing = get_or_update(category)
        random.shuffle(listing)
        self.render_front(listing)
class SearchHandler(BaseHandler):
    """Shows the products whose key name matches the search term."""

    def get(self, key):
        """Render every product whose key name equals *key*, ignoring case.

        The original passed ``key. ps`` (a period instead of a comma) to
        ``filter`` and read ``name`` as an attribute; a keys-only query
        yields ``db.Key`` objects, whose name is obtained by calling
        ``name()``.
        """
        wanted = key.upper()
        matches = [k for k in Product.all(keys_only=True)
                   if (k.name() or '').upper() == wanted]
        self.render_front(Product.get(matches))
class UpdateHandler(BaseHandler):
    """Forces a re-scrape of one category and shows the result."""

    def get(self, catg):
        """Refresh category *catg* and render its products."""
        refreshed = update_category(catg)
        self.render_front(refreshed)
class FrontPage(BaseHandler):
    """Landing page: all known products in random order."""

    def get(self):
        """Render the list stored under the ``allprods`` cache key."""
        everything = get_or_set_mem('allprods')
        random.shuffle(everything)
        self.render_front(everything)
# URL routing table. The patterns are raw strings so that ``\w`` is handed to
# the regex engine untouched instead of relying on an unrecognised string
# escape.
# NOTE(review): debug=True is kept as in the original -- confirm it is wanted
# outside development.
app = webapp2.WSGIApplication([
    (r'/', FrontPage),
    (r'/search/(\w+)', SearchHandler),
    (r'/update/(\w+)', UpdateHandler),
    (r'/categories/(\w+)', CategoryHandler),
], debug=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment