Created
November 17, 2015 21:00
-
-
Save jwhitlock/33cfce605f91104ad8fb to your computer and use it in GitHub Desktop.
Generate a CSV of proposed new slugs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| """Generate a CSV of new slugs.""" | |
| from collections import namedtuple, OrderedDict | |
| from csv import writer as csv_writer | |
| from hashlib import md5 as hashlib_md5 | |
| from json import load as json_load | |
| from string import ascii_lowercase, digits | |
| from urllib import unquote_plus | |
| from urlparse import urlparse | |
def main(feature_json_path, output_path):
    """Produce the slug report for one features.json file.

    Loads the features from ``feature_json_path``, proposes a new slug for
    each one, writes the comparison CSV to ``output_path`` and prints the
    summary statistics, right-aligning the labels in a common column.

    Returns 0, which the script entry point passes to ``sys.exit``.
    """
    features = extract_features(feature_json_path)
    proposed = create_new_slugs(features)

    # Binary mode is what the Python 2 csv module expects for its output file.
    with open(output_path, 'wb') as out_file:
        stats = analyze_and_write_csv(out_file, features, proposed)

    label_width = max(map(len, stats))
    line_format = "%*s: %s"
    for label in stats:
        print(line_format % (label_width, label, stats[label]))
    return 0
def slug_import_tool(
        word, attempt=0, reducer=None, replacer='_', max_slug=50, hash_len=5):
    """Slugger from browsercompat project, tools/common.py

    Modified with more arguments:
    * reducer: dict of strings to search/replace, {'__': '_'} in common.py.
      ``None`` (the default) selects that common.py mapping.
    * replacer: string to replace bad content with, '_' in common.py
    * max_slug: maximum slug size, 50 in common.py
    * hash_len: length of hashed bit, 5 in common.py

    The word is lower-cased and UTF-8 encoded; every byte that is not an
    ASCII lowercase letter, a digit, '_' or '-' is replaced by ``replacer``.
    Each reducer entry is then applied until its pattern no longer occurs.
    A slug longer than ``max_slug`` is cut and finished with the first
    ``hash_len`` hex digits of the MD5 of the original word. A non-zero
    ``attempt`` is appended as a decimal suffix, overwriting the tail of the
    slug if needed so the result still fits in ``max_slug`` characters.
    """
    if reducer is None:
        reducer = {'__': '_'}

    acceptable = set(ascii_lowercase + digits + '_-')
    pieces = []
    # A bytearray yields integers on both Python 2 and Python 3, so every
    # encoded byte is classified the same way on either interpreter.
    for byte in bytearray(word.lower().encode('utf-8')):
        char = chr(byte)
        pieces.append(char if char in acceptable else replacer)
    slugged = ''.join(pieces)

    for long_string, short_string in reducer.items():
        if not long_string:
            # The empty string is "in" every string; looping would never end.
            continue
        if long_string in short_string:
            # The replacement re-creates the pattern, so repeating until it
            # disappears cannot terminate. Apply it once instead.
            slugged = slugged.replace(long_string, short_string)
            continue
        while long_string in slugged:
            slugged = slugged.replace(long_string, short_string)

    suffix = str(attempt) if attempt else ""
    if len(slugged) > max_slug:
        digest = hashlib_md5(word.encode('utf-8')).hexdigest()
        slugged = slugged[:max_slug - hash_len] + digest[:hash_len]
    return slugged[:max_slug - len(suffix)] + suffix
# Every slug handed out so far, shared by all calls to unique_slugify().
known_slugs = set()


def unique_slugify(feature, reducer, replacer, max_slug, hash_len):
    """Keep slugging until a unique slug is derived.

    When the feature has a path, the URL-unquoted path is run through
    ``slug_import_tool`` with an increasing ``attempt`` number until the
    result is not already in ``known_slugs``.

    When the feature has no path, its existing slug is reused after
    replacing '-' and '_' with ``replacer`` and applying each reducer entry
    once; no uniqueness retry is performed on that branch.

    The chosen slug is recorded in ``known_slugs`` and returned.
    """
    raw_path = feature.path
    if raw_path:
        path = unquote_plus(raw_path)
        attempt = 0
        # replacer is forwarded so the caller's choice is honoured here too,
        # not only in the no-path branch below.
        slug = slug_import_tool(
            word=path, reducer=reducer, replacer=replacer,
            max_slug=max_slug, hash_len=hash_len)
        while slug in known_slugs:
            attempt += 1
            slug = slug_import_tool(
                word=path, attempt=attempt, reducer=reducer,
                replacer=replacer, max_slug=max_slug, hash_len=hash_len)
    else:
        slug = feature.slug.replace('-', replacer).replace('_', replacer)
        for long_string, short_string in reducer.items():
            slug = slug.replace(long_string, short_string)
    known_slugs.add(slug)
    return slug
def suggest_slug_import_tool_50(feature):
    """Propose a slug the way the current slugger does for new pages.

    Uses '_' as the replacement character, collapses '__' to '_', and
    limits slugs to 50 characters with a 5 character hash.
    """
    return unique_slugify(
        feature, reducer={'__': '_'}, replacer='_', max_slug=50, hash_len=5)
def suggest_slug_john(feature):
    """Propose a slug with John's scheme: abbreviate common path prefixes.

    The substitutions are kept in an OrderedDict so they are applied in the
    order listed: dashes become underscores first, the longer
    'web_javascript_reference_global_objects_' prefix is shortened before
    the shorter 'web_javascript_reference_' one, and doubled underscores
    are collapsed last.
    """
    substitutions = [
        ('-', '_'),
        ('aria_aria_techniques_', 'aria_tech_'),
        ('web_accessibility_', 'web_a11y_'),
        ('web_javascript_reference_global_objects_', 'web_js_ref_globals_'),
        ('web_javascript_reference_', 'web_js_ref_'),
        ('web_security_do_not_track_', 'web_sec_dnt_'),
        ('__', '_'),
    ]
    return unique_slugify(
        feature, reducer=OrderedDict(substitutions), replacer='_',
        max_slug=50, hash_len=5)
def suggest_slug_no_limit(feature):
    """Propose a slug that is allowed to grow instead of being hashed.

    Same settings as the 50-character slugger, except the size limit is
    raised to 10000 characters.
    """
    return unique_slugify(
        feature, reducer={'__': '_'}, replacer='_', max_slug=10000,
        hash_len=5)
def is_hashed(slug, maxlen=50, hash_len=5):
    """Guess whether a slug was truncated and finished with a hash.

    This is a heuristic: the slug is reported as hashed when it is exactly
    ``maxlen`` characters long and at least one of its last ``hash_len``
    characters is a decimal digit. A hash tail made only of the hex letters
    a-f is therefore not detected, and a full-length slug that merely ends
    in a digit is reported as hashed.

    * slug: the slug to inspect
    * maxlen: the slug size limit that triggers hashing (default 50)
    * hash_len: how many trailing characters to inspect (default 5)
    """
    if len(slug) != maxlen or hash_len <= 0:
        return False
    tail = slug[-hash_len:]
    return any(char in digits for char in tail)
# One record per entry of features.json, as built by extract_features():
#   id, parent - integer feature ID and parent ID (0 when there is no parent)
#   url, path  - English MDN URL and its path below '/en-US/docs/', or None
#   name, slug - feature name and current slug
#   hashed     - is_hashed() verdict for the current slug
Feature = namedtuple("Feature", "id parent url path name slug hashed")
def extract_features(feature_json_path):
    """Extract data from features.json

    Reads the JSON file at ``feature_json_path`` and returns an OrderedDict
    mapping each integer feature ID to a ``Feature`` record, in file order.

    For each entry:
    * a missing/empty parent link is stored as parent ID 0
    * a name given as a dict is reduced to its 'en' value
    * when 'mdn_uri' is None, both url and path are None; otherwise url is
      the 'en' URI and path is its URL path without the '/en-US/docs/' prefix

    Raises ValueError when an MDN URL path does not start with
    '/en-US/docs/'.
    """
    # Close the file as soon as it is parsed instead of leaking the handle.
    with open(feature_json_path) as json_file:
        feature_json = json_load(json_file)

    prefix = '/en-US/docs/'
    features = OrderedDict()
    for data in feature_json['features']:
        feature_id = int(data['id'])
        parent_id = int(data['links']['parent'] or 0)
        slug = data['slug']

        raw_name = data['name']
        name = raw_name['en'] if isinstance(raw_name, dict) else raw_name

        raw_uri = data['mdn_uri']
        if raw_uri is None:
            url = None
            path = None
        else:
            url = raw_uri['en']
            raw_path = urlparse(url).path
            # Explicit check rather than assert, which is stripped under -O
            # and would let a wrongly sliced path through.
            if not raw_path.startswith(prefix):
                raise ValueError(
                    "MDN URL %r for feature %d does not start with %s"
                    % (url, feature_id, prefix))
            path = raw_path[len(prefix):]

        features[feature_id] = Feature(
            feature_id, parent_id, url, path, name, slug, is_hashed(slug))
    return features
def create_new_slugs(features):
    """Map every feature ID to the slug proposed by ``suggest_slug``.

    Features are processed in the iteration order of ``features`` and the
    returned OrderedDict keeps that order.
    """
    return OrderedDict(
        (feature_id, suggest_slug(feature))
        for feature_id, feature in features.items())
def analyze_and_write_csv(csvfile, features, new_slugs):
    """Write results to a CSV and gather stats as we go.

    * csvfile: open file object the CSV rows are written to
    * features: mapping of feature ID to a 7-item feature record
      (id, parent, url, path, name, slug, hashed)
    * new_slugs: mapping of feature ID to the proposed slug

    A header row is always written, followed by one row per feature.
    Returns an OrderedDict of summary statistics: how many slugs were
    hashed, how many proposed slugs still look hashed, how many slugs
    changed, the longest proposed slug length, and the last proposed slug
    of that length (None when there are no features).
    """
    writer = csv_writer(csvfile)
    writer.writerow((
        "ID", "URL", "Path", "Name", "Slug", "IsHashed", "NewSlug",
        "IsDifferent", "NewIsHashed"))

    # max() raises on an empty sequence, so an empty input reports length 0.
    slug_lengths = [len(slug) for slug in new_slugs.values()]
    max_length = max(slug_lengths) if slug_lengths else 0

    was_hashed_count = 0
    is_hashed_count = 0
    changed_slug_count = 0
    biggest_slug = None
    for feature_id, feature in features.items():
        new_slug = new_slugs[feature_id]
        if len(new_slug) == max_length:
            biggest_slug = new_slug

        # Unpack into fresh names so the loop variable is not rebound.
        row_id, _parent_id, url, path, name, slug, ishashed = feature
        ishashed_csv = 1 if ishashed else 0
        was_hashed_count += ishashed_csv
        diff_slug = 1 if (slug != new_slug) else 0
        changed_slug_count += diff_slug
        # The longest proposed slug length stands in for the size limit here.
        still_hashed = 1 if is_hashed(new_slug, max_length) else 0
        is_hashed_count += still_hashed

        writer.writerow((
            row_id, url, path, name, slug, ishashed_csv, new_slug,
            diff_slug, still_hashed))

    return OrderedDict((
        ('slug was hashed', was_hashed_count),
        ('slug still hashed', is_hashed_count),
        ('slug changed', changed_slug_count),
        ('biggest slug length', max_length),
        ('biggest slug', biggest_slug),
    ))
# Change this to change slugger
suggest_slug = suggest_slug_john

if __name__ == "__main__":
    import sys

    if len(sys.argv) < 3:
        # Parenthesised print matches main() and parses on Python 2 and 3;
        # the %s placeholder is filled with the script name.
        print("""\
Create suggested slugs CSV. Usage:
%s [path/to/features.json] [path/to/output.csv]
""" % sys.argv[0])
        sys.exit(0)
    feature_json_path = sys.argv[1]
    output_path = sys.argv[2]
    ret = main(feature_json_path, output_path)
    sys.exit(ret)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment