This script parses the emoji list published by the Unicode Consortium and saves each emoji as an image file.
Created
October 26, 2022 12:42
-
-
Save jinyu121/3140ade03fff8f555c703a20f403e870 to your computer and use it in GitHub Desktop.
Emoji Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64 | |
import mimetypes | |
from pathlib import Path | |
import httpx | |
import magic | |
from bs4 import BeautifulSoup | |
from tqdm import tqdm | |
# Page that is downloaded (and then cached locally) to obtain the emoji images.
PAGE_URL = "https://unicode.org/emoji/charts-14.0/full-emoji-list.html"

# Vendor names.  Each one becomes a sub-directory of BASE_PATH, and the list is
# paired positionally with the icon cells of every table row, so the order
# here is assumed to match the order of the image columns on the page.
EMOJI_TYPES = ["Apple", "Google", "Facebook", "Windows", "Twitter", "JoyPixels", "Samsung",
               "GMail", "SoftBank", "DoCoMo", "KDDI"]

# Root output directory; also holds the cached copy of the downloaded page.
BASE_PATH = Path("emoji")
def data2bytes(text: str) -> "tuple[str, bytes]":
    """Decode a base64 ``data:`` URI into a file extension and its raw bytes.

    Args:
        text: A data URI of the form ``data:<mime>;base64,<payload>``.

    Returns:
        A ``(extension, data)`` pair.  ``extension`` includes the leading dot
        (e.g. ``".png"``) and is an empty string when no type could be
        determined; ``data`` is the decoded payload.

    Raises:
        ValueError: If ``text`` contains no ``,`` separating header and payload.
        binascii.Error: If the payload is not valid base64.
    """
    header, separator, payload = text.partition(",")
    if not separator:
        raise ValueError("not a data URI (missing ','): %r" % text[:40])
    data = base64.b64decode(payload)

    # Prefer sniffing the actual bytes over trusting the declared MIME type.
    filetype = mimetypes.guess_extension(magic.from_buffer(data, mime=True))
    if filetype is None:
        # guess_extension() returns None for MIME types it does not know.
        # Fall back to the subtype declared in the header
        # ("data:image/png;base64" -> ".png") so callers always get a str.
        declared_mime = header.partition(":")[2].partition(";")[0]
        subtype = declared_mime.partition("/")[2]
        filetype = "." + subtype if subtype else ""
    return filetype, data
# If you do not want to install `python-magic` and libmagic, this alternative, which takes the extension from the data-URI header instead, also **works**:
#def data2bytes(text: str) -> (str, bytes): | |
# text, data = text.split(",") | |
# data = base64.b64decode(data) | |
# filetype = text.split(";")[0].split(":")[1].split("/")[1] | |
# return "." + filetype, data | |
def main() -> None:
    """Fetch the emoji chart, extract every embedded image and save it to disk.

    The page at ``PAGE_URL`` is downloaded once and cached as
    ``BASE_PATH / "page_cache.html"``; later runs parse the cached copy.
    Each image is written to ``BASE_PATH / <vendor> / <code><ext>``, where
    ``<code>`` is the text of the row's second cell.
    """
    mimetypes.init()

    for vendor in EMOJI_TYPES:
        (BASE_PATH / vendor).mkdir(parents=True, exist_ok=True)

    # Get HTML.  Both branches yield bytes so the parser sees the same input
    # whether the page came from the cache or from the network.
    page_cache = BASE_PATH / "page_cache.html"
    if page_cache.is_file():
        html_doc = page_cache.read_bytes()
    else:
        response = httpx.get(PAGE_URL)
        # Fail loudly rather than caching an error page forever.
        response.raise_for_status()
        html_doc = response.content
        page_cache.write_bytes(html_doc)

    soup = BeautifulSoup(html_doc, 'html.parser')
    for row in tqdm(soup.find_all('tr')):
        cells = row.find_all("td")
        if len(cells) <= 1:
            continue
        if "№" == cells[0].text.strip():
            continue

        code = cells[1].text.strip()
        # Cells from the fourth onward are treated as the icon cells.
        icons = cells[3:]
        if not icons:
            # Row too short to carry any image; nothing to save.
            continue

        if len(icons) < len(EMOJI_TYPES):
            # Fewer cells than vendors: the first image is shared by all
            # vendors, so the same bytes are written into every directory.
            shared = icons[0].img
            if shared is None:
                continue
            ext, img = data2bytes(shared["src"])
            for vendor in EMOJI_TYPES:
                # `ext or ""` keeps the write from crashing if no extension
                # could be determined for this image.
                filename = BASE_PATH / vendor / (code + (ext or ""))
                filename.write_bytes(img)
        else:
            for vendor, icon in zip(EMOJI_TYPES, icons):
                # "—" marks a vendor without an image; also skip any cell
                # that simply has no <img> element.
                if "—" == icon.text.strip() or icon.img is None:
                    continue
                ext, img = data2bytes(icon.img["src"])
                filename = BASE_PATH / vendor / (code + (ext or ""))
                filename.write_bytes(img)

    print("Done")


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment