cheekybastard

from scrapy import log
from scrapy.item import Item
from scrapy.http import Request
from scrapy.contrib.spiders import XMLFeedSpider

def NextURL():
    """
    Generate a list of URLs to crawl. You can query a database or come up
    with some other means. Note that if you generate URLs to crawl from a
    scraped URL then you're better off yielding those Requests from a
    spider callback instead.
    """
    # Stand-in generator: replace with a database query or similar.
    for i in range(10):
        yield 'http://example.com/feed/%d.xml' % i
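A minimal sketch of a spider seeded from NextURL(); the class name, itertag and parse_node body are illustrative assumptions, not part of the original snippet:

class SeriesSpider(XMLFeedSpider):
    name = 'seriesspider'
    itertag = 'Series'  # assumed: iterate over <Series> nodes

    def start_requests(self):
        # Seed the crawl from the generator above.
        for url in NextURL():
            yield Request(url, dont_filter=True)

    def parse_node(self, response, node):
        log.msg('parsed a node from %s' % response.url)
        return Item()  # a real spider would populate item fields here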
<?xml version="1.0" encoding="UTF-8" ?>
<Data>
  <Series>
    <id>83462</id>
    <Actors>|Nathan Fillion|Stana Katic|Molly C. Quinn|Jon Huertas|Seamus Dever|Tamala Jones|Susan Sullivan|Ruben Santiago-Hudson|Monet Mazur|</Actors>
    <Airs_DayOfWeek>Monday</Airs_DayOfWeek>
    <Airs_Time>10:00 PM</Airs_Time>
    <ContentRating>TV-PG</ContentRating>
    <FirstAired>2009-03-09</FirstAired>
    <Genre>|Drama|</Genre>
  </Series>
</Data>
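A hedged sketch of extracting fields from one such <Series> node with the old-style scrapy selector API; parse_series and the field choices are illustrative only:

from scrapy.selector import XmlXPathSelector

def parse_series(xml_text):
    xxs = XmlXPathSelector(text=xml_text)
    series_id = xxs.select('//Series/id/text()').extract()[0]
    # Actor names are pipe-delimited with leading and trailing pipes.
    raw_actors = xxs.select('//Series/Actors/text()').extract()[0]
    actors = [name for name in raw_actors.split('|') if name]
    return series_id, actors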
section {
  padding-top: 60px;
}

.subnav {
  margin-bottom: 60px;
  width: 100%;
  height: 36px;
  background-color: #eeeeee; /* Old browsers */
  background-repeat: repeat-x; /* Repeat the gradient */
}
import grequests
from collections import deque

class RequestQueue(object):
    """
    This is a lame imitation of a Typhoeus Hydra using GRequests.
    The main thing this allows is building up a queue of requests and then
    executing them, and potentially adding requests to the queue in a callback
    so that you can build requests that depend on other requests more naturally.
    """
    def __init__(self):
        self.queue = deque()

    def add(self, request):
        self.queue.append(request)  # an unsent grequests request

    def run(self):
        while self.queue:  # callbacks may enqueue more requests as we drain
            batch, self.queue = list(self.queue), deque()
            grequests.map(batch)
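A hedged usage sketch: the add/run behaviour relies on the reconstruction above, the URL is made up, and the callback uses the plain requests hooks API:

def on_index(response, **kwargs):
    # Dependent requests: enqueue a fetch for every URL the index lists.
    for url in response.json():
        q.add(grequests.get(url))

q = RequestQueue()
q.add(grequests.get('http://example.com/urls.json',
                    hooks={'response': on_index}))
q.run()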
#!/usr/bin/env python
import redis
import random
import pylibmc
import sys

# Note: both clients point at non-default ports here (redis normally
# listens on 6379 and memcached on 11211).
r = redis.Redis(host='localhost', port=6389)
mc = pylibmc.Client(['localhost:11222'])
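A small assumed smoke test against both stores (it only works if servers are actually listening on those ports; the key name is arbitrary):

key = 'smoke:%d' % random.randint(0, 2 ** 31)
r.set(key, 'redis-value')
mc.set(key, 'memcached-value')
print r.get(key)
print mc.get(key)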
cheekybastard / python_exception_examples (created February 26, 2013 01:37)
# Python 2.x has an ambiguous except syntax, Python 3.x is stricter so the
# following examples help to identify the right way to handle Py2/3 compatible
# exceptions
# Background: http://www.python.org/dev/peps/pep-3110/
# Note that 'as' and ',' are both accepted in Python 2.x but only 'as' in Python 3.x:
# http://docs.python.org/reference/compound_stmts.html#try
# There are longer notes on re-raising, stack traces and tracebacks here:
# http://www.doughellmann.com/articles/how-tos/python-exception-handling/index.html
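A minimal sketch of the two spellings; only the 'as' form parses on Python 3:

# Py2-only (SyntaxError on Python 3.x):
#     except ValueError, e:
# Portable across Python 2.6+ and 3.x:
try:
    int('not a number')
except ValueError as e:
    print(e)

# Catching several exception types portably needs a parenthesized tuple:
try:
    {}['missing']
except (KeyError, TypeError) as e:
    print(repr(e))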
cheekybastard / datetime_parse_examples (created February 26, 2013 01:33)
datetime parse snippets
# NOTE these are code snippets
import datetime
import time
from dateutil import parser as dt_parser  # python-dateutil package

# Make a date 30 days ago, convert to a truncated string in a custom
# format, then convert back to a datetime.
filter_from = datetime.datetime.now() - datetime.timedelta(days=30)
print filter_from, type(filter_from)
filter_from_str = time.strftime("%Y-%m-%dT%H:%M", filter_from.timetuple())
print filter_from_str, type(filter_from_str)
filter_from_dt = dt_parser.parse(filter_from_str)
print filter_from_dt, type(filter_from_dt)
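dateutil's parser also copes with many formats without an explicit pattern; two assumed examples:

print dt_parser.parse("2013-02-26T01:33")      # ISO-ish string
print dt_parser.parse("Feb 26, 2013 1:33 AM")  # loose English date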

Icon resources collection

import os
from PIL import Image

def extractFrames(inGif, outFolder):
    frame = Image.open(inGif)
    nframes = 0
    while True:
        frame.save('%s/%s-%s.gif' % (outFolder, os.path.basename(inGif), nframes), 'GIF')
        nframes += 1
        try:
            frame.seek(nframes)  # advance to the next frame in the GIF
        except EOFError:
            break  # no more frames
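Assumed usage with hypothetical paths; one numbered GIF per frame lands in the output folder:

extractFrames('animation.gif', '/tmp/frames')  # /tmp/frames/animation.gif-0.gif, ...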
cheekybastard / craigslist_scrapy (created February 17, 2013 00:48)
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from myspider.items import CraigslistSampleItem

class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/"]
    rules = (Rule(SgmlLinkExtractor(allow=(r"index\d+\.html",)),
                  callback="parse_items", follow=True),)  # assumed rule

    def parse_items(self, response):
        for row in HtmlXPathSelector(response).select("//p"):
            item = CraigslistSampleItem()
            item["title"] = row.select("a/text()").extract()
            item["link"] = row.select("a/@href").extract()
            yield item
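With the project's settings on the path, the spider runs via the standard scrapy crawl craigs command. Note that scrapy.contrib and SgmlLinkExtractor are old pre-1.0 Scrapy paths; modern releases moved these under scrapy.spiders and scrapy.linkextractors.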