Skip to content

Instantly share code, notes, and snippets.

@doggan
Last active July 28, 2016 23:06
Show Gist options
  • Save doggan/4ad4c704d623d5507607d5f397a5b27a to your computer and use it in GitHub Desktop.
Save doggan/4ad4c704d623d5507607d5f397a5b27a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
- Parse the raw HTML dump from http://emojitracker.com, extracting a list of
  the most frequently used emoji Unicode code points.
    - Ideally, this script would fetch and parse the HTML directly, but since
      the site uses AJAX to render its contents, it's easier to just copy the
      DOM (in Chrome) to a file and parse that file.
- Scrape/download the emojis from
  http://apps.timwhitlock.info/emoji/tables/unicode#emoji-modal
  using the emoji code points, and save each image to a local file that uses
  the code point as its filename.
- Protip:
    - For resizing the images after download, try this:
        mogrify -path images_out/ -resize 32x32 ./images/*.png
"""
import argparse
from bs4 import BeautifulSoup
import os
import urllib
def parse_args(argv=None):
    """Parse the command-line arguments of the emoji scraper.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process command line
            (``sys.argv[1:]``), so zero-argument calls keep working.

    Returns:
        An ``argparse.Namespace`` with two attributes:
        ``source`` (the input raw html dump) and ``output`` (the path
        the images are written to).

    Raises:
        SystemExit: Raised by argparse when a required argument is missing
            or ``-h``/``--help`` is given.
    """
    parser = argparse.ArgumentParser(
        description='scrap emoji textures')
    parser.add_argument(
        'source',
        help='the input raw html dump (from http://emojitracker.com)')
    parser.add_argument(
        'output',
        help='the output path to write the resultant images')
    return parser.parse_args(argv)
def parse_html(contents):
    """Extract emoji code points from a saved emojitracker.com DOM dump.

    Args:
        contents: Path of the HTML file to read. Despite its name, this is
            a file path (it is passed to ``open``), not the markup itself.

    Returns:
        A list of the lower-cased ``data-id`` attribute values of the
        ``<a>`` elements, in document order. Links without a ``data-id``
        (or with an empty one) are skipped.
    """
    # A single parenthesised argument prints the same on Python 2 and 3.
    print("### Parsing HTML...")
    with open(contents, 'r') as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    # Extract all the codes, preserving ranking order.
    # Links are in the form:
    #   <a href="/details/1F52B" title="PISTOL" data-id="1F52B">
    #
    # Search only inside the rankings section when the dump has one, so that
    # data-id links elsewhere on the page cannot leak into the ranking list.
    # Fall back to the whole document if the section is missing.
    rankings = soup.find("section", {"id": "rankings"})
    scope = soup if rankings is None else rankings

    codes = []
    for link in scope.find_all('a'):
        data = link.get('data-id')
        if data:
            codes.append(data.lower())

    print("### %s codes found..." % len(codes))
    return codes
def main():
    """Download the images of the top-ranked emoji into the output directory.

    Reads the ``source`` and ``output`` command-line arguments, extracts the
    ranked emoji codes from the HTML dump, and saves ``<code>.png`` for each
    of the first ``COUNT`` codes into ``output``. Files that already exist
    are not downloaded again. A download that fails is reported and skipped
    instead of aborting the whole run.

    NOTE(review): the limit is applied to the ranking list (the first
    ``COUNT`` codes), so files already on disk count toward it. Confirm this
    matches the intended behaviour.
    """
    args = parse_args()
    source = args.source
    output = args.output

    # Prepare the output directory.
    if not os.path.isdir(output):
        os.makedirs(output)

    codes = parse_html(source)

    # Example URL:
    # "http://apps.timwhitlock.info/static/images/emoji/emoji-apple/1f601.png"
    base_url = "http://apps.timwhitlock.info/static/images/emoji/emoji-apple/"

    # Max # of results to download (rankings).
    COUNT = 500
    print("### Attempting to download %s files..." % COUNT)

    # One opener is enough for every request; no need to rebuild it per file.
    opener = urllib.URLopener()

    done_count = 0
    for code in codes[:COUNT]:
        filename = code + '.png'
        url = base_url + filename
        outpath = os.path.join(output, filename)

        # Only download if the file hasn't been downloaded yet.
        if not os.path.isfile(outpath):
            try:
                opener.retrieve(url, outpath)
            except IOError as err:
                # Drop a partially written file so that a later run does not
                # mistake it for a finished download.
                if os.path.isfile(outpath):
                    os.remove(outpath)
                print("### Failed to download %s: %s" % (url, err))
                continue

        done_count += 1

    print("### Finished writing %s files." % done_count)
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment