Skip to content

Instantly share code, notes, and snippets.

import simplejson as json
import lxml
class objectJSONEncoder(json.JSONEncoder):
"""A specialized JSON encoder that can handle simple lxml objectify types
>>> from lxml import objectify
>>> obj = objectify.fromstring("<Book><price>1.50</price><author>W. Shakespeare</author></Book>")
>>> objectJSONEncoder().encode(obj)
'{"price": 1.5, "author": "W. Shakespeare"}'
"""
import csv, httplib, json
from string import ascii_lowercase
con = httplib.HTTPSConnection('api.github.com')
languages = ['java', 'c', 'ruby', 'python', 'javascript']
for lang in languages:
with open(lang + '.csv', 'wb') as csvfile:
csvwriter = csv.writer(csvfile, delimiter = ',', quotechar='"', quoting = csv.QUOTE_MINIMAL)
for ch in ascii_lowercase:
print("Processing repos with " + ch + " for language " + lang)
from gcrawler import GCrawler, Downloader
import unittest
import urllib2
import logging
import traceback
from datetime import datetime
import re
logging.basicConfig(level=logging.DEBUG)
@arowser
arowser / Converting HTML to Plain Text
Created December 7, 2012 08:18
Converting HTML to Plain Text
import re
##
# Removes HTML markup from a text string.
#
# @param text The HTML source.
# @return The plain text. If the HTML source contains non-ASCII
# entities or character references, this is a Unicode string.
def strip_html(text):
@arowser
arowser / weibo_nooauth.py
Created November 24, 2012 02:44 — forked from stephenLee/weibo_nooauth.py
weibo_login script
#!/usr/bin/env python
#coding=utf8
import urllib
import urllib2
import cookielib
import base64
import re
import json
import hashlib
import os
@arowser
arowser / gist:4133773
Created November 23, 2012 02:33 — forked from observerss/gist:3798922
Google Keyword Tool Scraper (CasperJS version)
// requires
var utils = require('utils');
var casper = require('casper').create()
var casper = require('casper').create({
verbose: true,
logLevel: "debug"
});
// setup globals
var email = casper.cli.options['email'] || 'REPLACE THIS EMAIL';
@arowser
arowser / gist:4133771
Created November 23, 2012 02:32 — forked from observerss/gist:3798896
Google Keyword Tool Scraper (Selenium + Python version)
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
import selenium.webdriver.support.wait
selenium.webdriver.support.wait.POLL_FREQUENCY = 0.05
import re
import random
import collections
class AdwordsAutomater(object):
#coding=utf-8
from datetime import datetime
import os
import re
import urllib.request
from html.parser import HTMLParser
from time import sleep
import socket
socket.setdefaulttimeout(60)
@arowser
arowser / pytesser
Created November 22, 2012 02:58
pytesser example
#!/usr/bin/env python
#-*- coding: UTF-8 -*-
# filename: AutoLogin.py
from __future__ import unicode_literals
import urllib2
import cookielib
import urllib
import Image
from cStringIO import StringIO
@arowser
arowser / guokr.json
Created November 20, 2012 07:21 — forked from leecade/guokr.json
json:guokr
{"hotsite":{"name":"热门网站","links":[{"name":"科学松鼠会","url":"http://songshuhui.net/","ico_url":"http://img1.guokr.com/gkimage/hf/9z/cy/hf9zcy.png","style":"","ico":""},{"name":"科学网","url":"http://www.sciencenet.cn/","ico_url":"http://img1.guokr.com/gkimage/ix/ep/rw/ixeprw.png","style":"","ico":""},{"name":"丁香园","url":"http://www.dxy.cn/","ico_url":"http://img1.guokr.com/gkimage/cu/m5/vf/cum5vf.png","style":"","ico":""},{"name":"PubMed","url":"http://www.ncbi.nlm.nih.gov/pubmed","ico_url":"http://img1.guokr.com/gkimage/2o/wv/mt/2owvmt.png","style":"","ico":""},{"name":"趣玩网","url":"http://www.quwan.com/","ico_url":"http://img1.guokr.com/gkimage/rr/50/0c/rr500c.png","style":"","ico":""},{"name":"穷游网","url":"http://www.qyer.com/","ico_url":"http://img1.guokr.com/gkimage/ym/6h/ne/ym6hne.png","style":"","ico":""},{"name":"糗事百科","url":"http://www.qiushibaike.com/","ico_url":"http://img1.guokr.com/gkimage/iz/cv/jx/izcvjx.png","style":"","ico":""},{"name":"36氪","url":"http://www.36kr.com/","ico_url":"http://img1.guokr.co