Last active
January 31, 2017 23:50
-
-
Save maxmarchuk/44f34500b31d4aaf1808bc8af73357f2 to your computer and use it in GitHub Desktop.
A Python script for getting the descriptions of special characters (such as Greek letters or math symbols) by parsing http://www.dionysia.org/html/entities/symbols.html. The output is in the format used by CKEditor's specialChars configuration, which uses the description as the tooltip for each character.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python3 | |
from lxml import html | |
import requests | |
# Whitespace-separated list of symbols to look up; optional.
SYMBOLS_FILE = "./symbols.txt"
# Page whose table rows are searched for each symbol.
SYMBOLS_URL = 'http://www.dionysia.org/html/entities/symbols.html'
# Symbols used when SYMBOLS_FILE cannot be opened.
DEFAULT_SYMBOLS = 'Α α Β β Γ γ Δ δ Ε ε Ζ ζ Η η Θ θ Ι'.split()
# Output is wrapped at this many characters (the conventional line length).
MAX_LINE_LENGTH = 80
# Index of the cell, within a matching table row, that is read as the
# description.
DESCRIPTION_COLUMN = 4


def load_symbols(path=SYMBOLS_FILE):
    """Return the symbols to look up as a list of strings.

    The file at *path* is read as UTF-8 and split on whitespace. If it cannot
    be opened (missing, unreadable, ...) a copy of DEFAULT_SYMBOLS is returned
    instead. Only OSError triggers the fallback, so unrelated failures are not
    silently hidden.
    """
    try:
        with open(path, encoding="utf-8") as symbols_file:
            return symbols_file.read().split()
    except OSError:
        return list(DEFAULT_SYMBOLS)


def fetch_entity_tree(url=SYMBOLS_URL):
    """Download *url* and return it parsed into an lxml tree.

    Raises a requests exception when the request times out or the server
    answers with an HTTP error status, rather than parsing an error page.
    """
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    return html.fromstring(page.content)


def clean_description(raw):
    """Reduce the raw text of a description cell to a short tooltip.

    Everything after the first ',' is dropped (the unicode value), then
    everything after the first '=' (alternative definitions), surrounding
    whitespace is stripped and the first letter is capitalized. Returns ''
    when *raw* is None or nothing is left after trimming.
    """
    if not raw:
        return ""
    text = raw.split(',')[0].split('=')[0].strip()
    # Slicing instead of indexing keeps this safe for an empty string.
    return text[:1].capitalize() + text[1:]


def find_description(tree, symbol):
    """Return the cleaned description for *symbol*, or '' if none is found.

    Finds table cells whose text contains the symbol and collects all cells
    of the enclosing row(s); the cell at DESCRIPTION_COLUMN is the description.
    """
    # The symbol is bound as an XPath variable rather than concatenated into
    # the expression, so quote characters in it cannot break the query.
    cells = tree.xpath('//tr/td[contains(text(), $symbol)]/../*',
                       symbol=symbol)
    if len(cells) <= DESCRIPTION_COLUMN:
        return ""
    return clean_description(cells[DESCRIPTION_COLUMN].text)


def format_entry(symbol, description):
    """Format one "['symbol', 'description'], " entry for the output.

    Backslashes and single quotes are escaped so the entry stays a valid
    single-quoted string literal pair.
    """
    def escape(text):
        return text.replace("\\", "\\\\").replace("'", "\\'")

    return "['{0}', '{1}'], ".format(escape(symbol), escape(description))


def wrap_entries(entries, width=MAX_LINE_LENGTH):
    """Pack *entries* into lines of at most *width* characters.

    Entries are concatenated in order; a new line is started whenever adding
    the next entry would push the current line past *width*. An entry that is
    longer than *width* on its own gets a line to itself. Empty lines are
    never emitted. Returns the list of lines.
    """
    lines = []
    current = []
    current_length = 0
    for entry in entries:
        if current and current_length + len(entry) > width:
            lines.append(''.join(current))
            current = []
            current_length = 0
        current.append(entry)
        current_length += len(entry)
    if current:
        lines.append(''.join(current))
    return lines


def main():
    """Look up every symbol and print the wrapped list of entries."""
    symbols = load_symbols()
    tree = fetch_entity_tree()
    entries = []
    for symbol in symbols:
        description = find_description(tree, symbol)
        # Symbols without a usable description are skipped.
        if description:
            entries.append(format_entry(symbol, description))
    for line in wrap_entries(entries):
        print(line)


# TODO: add the ability to use symbols passed in through command line args
# TODO: Use https://www.w3.org/TR/REC-html40/sgml/entities.html instead, as
# it's probably more reliable and has more characters. Only downside of this
# is that you won't be able to search by symbol, only by code
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment