#!/usr/bin/env python
"""Script to build an image classification dataset from commons.wikimedia.org

Author: [email protected]
License: MIT - http://www.opensource.org/licenses/mit-license.php
"""

import os
import sys
import cgi
import random
from gzip import GzipFile

from PIL import Image
from cStringIO import StringIO
from restkit import Resource
from restkit import ConnectionPool
from lxml import etree
import numpy as np
import leargist

CONNECTION_POOL = ConnectionPool(max_connections=4)

BASE_SERVER_URL = "http://commons.wikimedia.org"
BASE_CATEGORY_URL = "/wiki/Category:"

# the magic single ImageMagick command to do the gray level & crop / extent
# transforms in one pass
CONVERT_GRAY_CMD = ("convert {input} -colorspace Gray"
                    " -resize {w}x{h}^ -gravity center"
                    " -extent {w}x{h} {output}")

CONVERT_COLOR_CMD = ("convert {input}"
                     " -resize {w}x{h}^ -gravity center"
                     " -extent {w}x{h} {output}")

COLLECTIONS = (
    ("portraits", [
        "Portrait_photographs_of_men",
        "Portrait_photographs_of_women",
    ]),
    ("landscapes", [
        "Landscapes",
        "Coasts",
        "Landscape_of_France",
        "Snowy_landscapes",
    ]),
    ("diagrams", [
        "Diagrams",
    ]),
    ("paintings", [
        "Impressionist_paintings",
        "Post-Impressionist_paintings",
        "Realist_paintings",
    ]),
    ("drawings", [
        "Sepia_drawings",
        "Pencil_drawings",
        "Charcoal_drawings",
        "Pastel_drawings",
    ]),
    ("buildings", [
        "Columns",
        "Building_construction",
        "Building_interiors",
    ]),
)


def microthumbs(filenames, sizes=(2, 3, 4)):
    """Aggregate pixel values of thumbnails of various micro sizes

    This can be used as a poor man's replacement for GIST features for
    image clustering.
    """
    result = np.zeros((len(filenames), 3 * sum(s ** 2 for s in sizes)))
    for i, f in enumerate(filenames):
        vectors = []
        for s in sizes:
            img = Image.open(f).convert('RGB').resize((s, s))
            vectors.append(np.asarray(img, dtype=np.double).flatten() / 128)
        result[i][:] = np.concatenate(vectors)
    return result
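
# With the default sizes, microthumbs(['a.jpg', 'b.jpg']) (hypothetical
# files) returns an array of shape (2, 87): 3 color channels times
# 2 * 2 + 3 * 3 + 4 * 4 = 29 pixels per image.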


# TODO: drop this and replace it with scipy.ndimage instead
def img_to_array(image, mode='L', w=32, h=32, dtype=np.float32):
    """Convert a PIL Image into a numpy array"""
    if image.mode != mode:
        image = image.convert(mode=mode)
    if image.size != (w, h):
        image = image.resize((w, h), Image.ANTIALIAS)

    data = np.array(image.getdata(), dtype)
    rescaled_data = data / 128. - 1
    return rescaled_data
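
# img_to_array rescales pixel values from [0, 255] to roughly [-1, 1);
# array_to_img below applies the inverse mapping.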


def array_to_img(data, w=32, h=32):
    """Convert brain output to an Image with the same format as the input"""
    rescaled = ((data + 1.) * 128).clip(0, 255)
    image = Image.new('L', (w, h))
    image.putdata(list(rescaled.flat))
    return image
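
# Round-trip sketch: array_to_img(img_to_array(img)) yields a 32x32
# gray-level version of img, up to clipping and quantization.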


def urldecode(url):
    """Split a URL into its path and a dict of its query parameters"""
    if "?" in url:
        path, query = url.split("?", 1)
        return path, dict(cgi.parse_qsl(query))
    else:
        return url, {}
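
# e.g. urldecode('/w/index.php?title=Category:Foo&filefrom=Bar') returns
# ('/w/index.php', {'title': 'Category:Foo', 'filefrom': 'Bar'})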


def find_image_urls(category, server_url=BASE_SERVER_URL,
                    category_prefix=BASE_CATEGORY_URL,
                    resource=None, pool=CONNECTION_POOL):
    """Scrape the mediawiki category pages to identify thumbs to download"""
    parser = etree.HTMLParser()
    if resource is None:
        resource = Resource(server_url, pool_instance=pool)
    collected_urls = []
    page = category_prefix + category
    parameters = {}
    while page is not None:
        response = resource.get(page, **parameters)
        tree = etree.parse(StringIO(response.body), parser)
        thumb_tags = tree.xpath(
            '//div[@class="thumb"]//img[contains(@src, "/thumb/")]')
        collected_urls.extend(tag.get('src') for tag in thumb_tags)
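        # category listings are paginated: follow the "next 200" link
        # until the last page is reached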
        links = tree.xpath('//a[contains(text(), "next 200")]')
        if links:
            page, parameters = urldecode(links[0].get('href'))
        else:
            page = None
    return collected_urls


def collect(collections, folder):
    """Collect categorized thumbnails from commons.wikimedia.org"""
    resource = Resource(BASE_SERVER_URL, pool_instance=CONNECTION_POOL)
    for category_name, subcategories in collections:
        category_folder = os.path.join(folder, category_name)
        if not os.path.exists(category_folder):
            os.makedirs(category_folder)
        for subcategory in subcategories:
            for url in find_image_urls(subcategory, resource=resource):
                _, filename = url.rsplit("/", 1)
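                # skip overlong names that could hit filesystem limits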
                if len(filename) > 200:
                    print "skipping file: ", filename
                    continue
                filepath = os.path.join(category_folder, filename)
                if os.path.exists(filepath):
                    print "skipping", url
                    continue
                print "downloading", url
                with open(filepath, 'wb') as f:
                    payload = Resource(
                        url, pool_instance=CONNECTION_POOL).get().body
                    f.write(payload)


def preprocess(input_folder, output_folder, w=32, h=32, gray_levels=True):
    """Convert images to fixed-size square JPEGs (gray levels by default)"""
    for category in os.listdir(input_folder):
        category_input_folder = os.path.join(input_folder, category)
        category_output_folder = os.path.join(output_folder, category)
        if not os.path.isdir(category_input_folder):
            continue
        if not os.path.exists(category_output_folder):
            os.makedirs(category_output_folder)

        for filename in os.listdir(category_input_folder):
            input_path = os.path.join(category_input_folder, filename)
            output_path = os.path.join(
                category_output_folder, filename + '-preprocessed.jpeg')
            if os.path.exists(output_path):
                print "skipping", filename
                continue
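            # shell out to ImageMagick's convert to do the actual image
            # transform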
            if gray_levels:
                cmd = CONVERT_GRAY_CMD.format(
                    input=input_path, output=output_path, w=w, h=h)
            else:
                cmd = CONVERT_COLOR_CMD.format(
                    input=input_path, output=output_path, w=w, h=h)

            print "processing", filename
            os.system(cmd)


def pack(input_folder, index_file, data_output_file, label_output_file=None,
         seed=42, dtype=np.float32, transform=None):
    """Pack picture files as numpy arrays with the category folder as label"""
    all_filenames = [(category, os.path.join(input_folder, category, filename))
                     for category in os.listdir(input_folder)
                     for filename in os.listdir(
                         os.path.join(input_folder, category))]
    random.seed(seed)
    random.shuffle(all_filenames)

    # count the number of pictures per category and drop the excess so that
    # all categories have an equal number of samples
    counts = dict()
    for c, _ in all_filenames:
        counts[c] = counts.get(c, 0) + 1
    limit = min(counts.values())
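
    # e.g. with counts {'portraits': 120, 'diagrams': 80} (hypothetical
    # values) the limit is 80 and only the first 80 shuffled portraits
    # are kept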

    resampled_filenames = []
    counts = dict()
    for c, fn in all_filenames:
        counts[c] = counts.get(c, 0) + 1
        if counts[c] <= limit:
            resampled_filenames.append(fn)

    with open(index_file, "wb") as f:
        f.write("\n".join(resampled_filenames))
        f.write('\n')

    reference = Image.open(resampled_filenames[0])
    w, h = reference.size

    dim = w * h
    if transform == 'gist':
        dim = 960  # hardcoded for now
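        # 960 matches the length of the color GIST descriptor returned
        # by leargist.color_gist below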

    # TODO: add microthumb transform as baseline too
    data_array = np.zeros((len(resampled_filenames), dim), dtype=dtype)
    for i, filepath in enumerate(resampled_filenames):
        im = Image.open(filepath)
        if transform == 'gist':
            data_array[i, :] = leargist.color_gist(im)
        else:
            data_array[i, :] = img_to_array(im, w=w, h=h,
                                            dtype=dtype).flatten()

    if data_output_file.endswith('.gz'):
        np.save(GzipFile(data_output_file, 'wb'), data_array)
    else:
        np.save(open(data_output_file, 'wb'), data_array)
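
    # the packed array can be reloaded later with something like
    # np.load(GzipFile('images.npy.gz')) or np.load('images.npy')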

    # TODO: deal with label data at some point
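

# Typical pipeline, in order: "collect" downloads the thumbnails,
# "preprocess" normalizes them with ImageMagick and "pack" serializes them
# as a numpy array; "crop-color" and "gist" build 200x200 color crops and
# pack their GIST descriptors instead.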

if __name__ == "__main__":
    # TODO: use optparse to create a real CLI handler
    command = sys.argv[1] if len(sys.argv) > 1 else None
    if command == "collect":
        collect(COLLECTIONS, "collected-images")
    elif command == "preprocess":
        preprocess("collected-images", "preprocessed-images")
    elif command == "pack":
        pack("preprocessed-images", "image_filenames.txt", "images.npy.gz")
    elif command == "crop-color":
        preprocess("collected-images", "cropped-images", h=200, w=200,
                   gray_levels=False)
    elif command == "gist":
        pack("cropped-images", "gist_image_filenames.txt", "gist.npy.gz",
             transform='gist')
    else:
        print >> sys.stderr, (
            "usage: %s collect|preprocess|pack|crop-color|gist" % sys.argv[0])
        sys.exit(1)