#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
- Parse the raw HTML dump from http://emojitracker.com, extracting a list of
  the most frequently used emoji unicode points.
- Ideally, this script would parse the HTML directly, but since the site
  renders its contents via AJAX, it's easier to copy the DOM (in Chrome)
  to a file and parse that file.
- Scrape the emoji images from http://apps.timwhitlock.info/emoji/tables/unicode#emoji-modal
  using the emoji unicode points, downloading each to a local file named
  after its unicode point.
- Protip:
  - For resizing the images after download, try this:
      mogrify -path images_out/ -resize 32x32 ./images/*.png
    (see also the resize_images() sketch at the bottom of this file)
"""
import argparse
import os
import urllib.request

from bs4 import BeautifulSoup


def parse_args():
    parser = argparse.ArgumentParser(
        description='scrape emoji textures')
    parser.add_argument(
        'source', help='the input raw HTML dump (from http://emojitracker.com)')
    parser.add_argument(
        'output', help='the output path to write the resulting images')
    return parser.parse_args()


def parse_html(path):
    print("### Parsing HTML...")
    with open(path, 'r') as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    # Extract all the codes, preserving ranking order.
    # Links are in the form:
    #   <a href="/details/1F52B" title="PISTOL" data-id="1F52B">
    codes = []
    # Only consider links inside the rankings section.
    rankings = soup.find("section", {"id": "rankings"})
    for link in rankings.find_all('a'):
        data = link.get('data-id')
        if data:
            codes.append(data.lower())

    print("### %s codes found..." % len(codes))
    return codes
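
# Illustrative only: given the rankings markup shown above, e.g.
#   <a href="/details/1F52B" title="PISTOL" data-id="1F52B">
# parse_html() returns codes like ["1f52b", ...] in ranking order.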


def main():
    args = parse_args()
    source = args.source
    output = args.output

    # Prepare the output directory.
    if not os.path.exists(output):
        os.makedirs(output)

    codes = parse_html(source)

    # Example URL:
    #   "http://apps.timwhitlock.info/static/images/emoji/emoji-apple/1f601.png"
    base_url = "http://apps.timwhitlock.info/static/images/emoji/emoji-apple/"

    # Max # of results to process (by ranking).
    COUNT = 500

    print("### Attempting to download %s files..." % COUNT)
    done_count = 0
    for code in codes:
        filename = code + '.png'
        url = base_url + filename
        outpath = os.path.join(output, filename)
        # Only download if the file hasn't been downloaded yet.
        if not os.path.isfile(outpath):
            urllib.request.urlretrieve(url, outpath)
        done_count += 1
        if done_count == COUNT:
            break

    print("### Finished processing %s files." % done_count)


if __name__ == "__main__":
    main()