Newegg System Memory analysis script: queries Newegg's OWS search API for memory listings, pulls each item's specifications, and prints a tab-separated table of brand, price, speed, capacity, and timings for analysis.
import threading
import urllib, urllib2
import json, re
from Queue import Queue

class GetURL(threading.Thread):
    def __init__(self, urlQueue, jsonQueue):
        threading.Thread.__init__(self)
        self.urlQueue = urlQueue
        self.jsonQueue = jsonQueue

    def run(self):
        # Run until thread is terminated
        while True:
            # Get an item number and url for the item
            itemNumber, url = self.urlQueue.get()
            # Get item specifications
            raw = urllib2.urlopen(url).read()
            self.jsonQueue.put((itemNumber, json.loads(raw)))
            # Indicate that the item pulled last from the queue is finished
            self.urlQueue.task_done()
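
# The worker above is not fault tolerant: an uncaught exception kills its
# thread before task_done() is called (see the note where the workers are
# started below). A hedged sketch of a more forgiving variant, not used by
# this script; it drops items whose spec call fails so urlQueue.join() can
# still return.
class GetURLSafe(GetURL):
    def run(self):
        while True:
            itemNumber, url = self.urlQueue.get()
            try:
                raw = urllib2.urlopen(url).read()
                self.jsonQueue.put((itemNumber, json.loads(raw)))
            except (urllib2.URLError, ValueError):
                # A real fix might retry or log here instead of skipping
                pass
            finally:
                # Always mark the work item done so join() does not hang
                self.urlQueue.task_done()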
def getItems(pageNumber = 1):
    # Parameters retrieved from the query builder
    params = {
        "SubCategoryId": 147,
        "NValue": "100007611 600006050",
        "StoreDepaId": 1,
        "NodeId": 7611,
        "BrandId": -1,
        # Be sure to fill in the pageNumber variable for pagination
        "PageNumber": pageNumber,
        "CategoryId": 17
    }

    # Pass search parameters to newegg's api and get the response
    request = urllib2.Request(searchURL, json.dumps(params))
    response = urllib2.urlopen(request)
    raw = response.read()
    data = json.loads(raw)

    # Check how many pages of results there are
    pagination = data['PaginationInfo']
    pages = pagination['TotalCount'] / pagination['PageSize']

    # Extend the items list with the current result set
    items = []
    items.extend(data['ProductListItems'])

    # If there are still more pages of data, get the next result set
    if pages >= pageNumber:
        # Recursively call getItems for the next page and extend items with its result
        items.extend(getItems(pageNumber + 1))

    return items
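
# Note: getItems() returns the flattened ProductListItems from every result
# page; each entry is a dict from the search API, of which only "ItemNumber"
# and "FinalPrice" are used by the rest of this script.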

# API urls we need to pull data for this analysis
itemSpecURL = "http://www.ows.newegg.com/Products.egg/{}/Specification"
searchURL = "http://www.ows.newegg.com/Search.egg/Advanced"

# Get the item list (one api call per page of results)
itemList = getItems()

# Define queues for item urls and the specs result for each item
urlQueue = Queue()
jsonQueue = Queue()

items = {}

# Push each item number and its spec url onto the urlQueue,
# and map each item number to its item structure
for item in itemList:
    specURL = itemSpecURL.format(item["ItemNumber"])
    urlQueue.put((item["ItemNumber"], specURL))
    items[item["ItemNumber"]] = item

# Start two threads for grabbing the specifications of each item.
# This is not fault tolerant, so any more than 4 threads tends to choke
# on an exception when a call fails. Obviously this needs fixing
# (one possible approach is sketched in GetURLSafe above).
for worker in xrange(2):
    t = GetURL(urlQueue, jsonQueue)
    t.setDaemon(True)
    t.start()

# Block until all requests have been processed
urlQueue.join()

# Regexes used for parsing specification data for analysis
speed_re = re.compile(r'DDR\d\s(\d+).*')
capacity_re = re.compile(r"(\d+)GB\s\((\d+)\sx\s(\d+)GB\)")
timing_re = re.compile(r'(\d+-\d+-\d+-\d+)')
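
# For reference, the spec strings these regexes expect look roughly like
# (wording taken from typical Newegg listings, so treat as illustrative):
#   Speed:    "DDR3 1600 (PC3 12800)" -> speed_re captures "1600"
#   Capacity: "8GB (2 x 4GB)"         -> capacity_re captures total, module count, size per module
#   Timing:   "9-9-9-24"              -> timing_re captures the whole timing string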

# List of features for each item we're concerned with
features = ['Brand', 'Model', 'ItemNumber', 'Price', 'Speed', 'Capacity', 'Dimms', 'Timing', 'Voltage']
required = ['Price', 'Speed', 'Capacity', 'Dimms', 'Timing']

# For every item in the specification queue
while not jsonQueue.empty():
    itemNumber, specs = jsonQueue.get()

    # Make a new item dictionary and populate its feature pairs
    item = {}
    for group in specs['SpecificationGroupList']:
        for pair in group['SpecificationPairList']:
            if pair['Key'] in features:
                item[pair['Key']] = pair['Value'].encode('ascii', 'ignore')

    # If capacity exists and is parsable, parse it, and likewise for speed and timing
    if 'Capacity' in item:
        capacity = capacity_re.match(item['Capacity'])
        if capacity:
            item['Capacity'] = capacity.group(1)
            item['Dimms'] = capacity.group(2)

    if 'Speed' in item:
        speed = speed_re.match(item['Speed'])
        if speed:
            item['Speed'] = speed.group(1)

    if 'Timing' in item:
        timing = timing_re.match(item['Timing'])
        if timing:
            item['Timing'] = timing.group(1).replace('-', '\t')

    # Get price and item number from the items dictionary we made earlier
    item['Price'] = items[itemNumber]['FinalPrice']
    item['ItemNumber'] = specs['NeweggItemNumber']

    # Skip the item if it's missing a required feature
    if not all(feature in item for feature in required):
        jsonQueue.task_done()
        continue

    # Fill in blank values for any remaining features
    for feature in features:
        if feature not in item:
            item[feature] = ""

    # Print all item features as one tab-separated row
    print '\t'.join(map(lambda x: str(item[x]), features))

    jsonQueue.task_done()
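
Each printed row is tab-separated with the columns Brand, Model, ItemNumber, Price, Speed, Capacity, Dimms, Timing, Voltage; because Timing is expanded into four tab-separated figures, a fully parsed row carries twelve fields. Below is a minimal sketch of how that output could be loaded back for analysis, assuming it was redirected to a file named memory.tsv; the filename, the timing column labels, and the price-per-GB example are illustrative, not part of the gist.

import csv

# Column labels for a fully parsed row; the four timing figures are given
# the usual CL-tRCD-tRP-tRAS names here purely for convenience.
columns = ['Brand', 'Model', 'ItemNumber', 'Price', 'Speed', 'Capacity',
           'Dimms', 'tCL', 'tRCD', 'tRP', 'tRAS', 'Voltage']

modules = []
with open('memory.tsv', 'rb') as tsv:
    for row in csv.reader(tsv, delimiter='\t'):
        # Rows whose Timing string didn't match timing_re have fewer fields
        if len(row) != len(columns):
            continue
        modules.append(dict(zip(columns, row)))

# Example analysis: dollars per GB for each kit
for module in modules:
    try:
        print module['ItemNumber'], float(module['Price']) / float(module['Capacity'])
    except ValueError:
        # Skip rows whose price or capacity didn't parse as a number
        pass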