Skip to content

Instantly share code, notes, and snippets.

@jwhitlock
Created November 17, 2015 21:00
Show Gist options
  • Select an option

  • Save jwhitlock/33cfce605f91104ad8fb to your computer and use it in GitHub Desktop.

Select an option

Save jwhitlock/33cfce605f91104ad8fb to your computer and use it in GitHub Desktop.
Generate a CSV of proposed new slugs
#!/usr/bin/env python
"""Generate a CSV of new slugs."""
from collections import namedtuple, OrderedDict
from csv import writer as csv_writer
from hashlib import md5 as hashlib_md5
from json import load as json_load
from string import ascii_lowercase, digits
from urllib import unquote_plus
from urlparse import urlparse
def main(feature_json_path, output_path):
    """Build the slug comparison CSV and print a summary of the results.

    Loads the features from feature_json_path, asks the active slugger for
    a proposed slug per feature, writes the comparison rows to output_path,
    and then prints one right-aligned "label: value" line per statistic.

    Returns 0, which the script entry point passes to sys.exit().
    """
    features = extract_features(feature_json_path)
    proposed = create_new_slugs(features)

    # NOTE(review): 'wb' is the mode the Python 2 csv module asks for.
    with open(output_path, 'wb') as out_file:
        stats = analyze_and_write_csv(out_file, features, proposed)

    # Pad every label to the width of the longest one so the colons line up.
    label_width = max(len(label) for label in stats)
    for label, value in stats.items():
        print("%*s: %s" % (label_width, label, value))
    return 0
def slug_import_tool(
        word, attempt=0, reducer=None, replacer='_', max_slug=50, hash_len=5):
    """Turn word into a slug, hashing the tail when it would be too long.

    Slugger from browsercompat project, tools/common.py.
    Modified with more arguments:

    * word: the text to slugify
    * attempt: retry counter; when non-zero it is appended as a suffix
    * reducer: dict of strings to search/replace, {'__': '_'} in common.py;
      None selects that same mapping
    * replacer: string to replace bad content with, '_' in common.py
    * max_slug: maximum slug size, 50 in common.py
    * hash_len: length of hashed bit, 5 in common.py

    The word is lowercased and UTF-8 encoded, and every byte that is not an
    ASCII lowercase letter, digit, '_' or '-' becomes replacer (so one
    multi-byte character yields several replacers). The reducer pairs are
    then applied in the dict's iteration order. If the result is longer
    than max_slug, it is cut to max_slug - hash_len characters and the
    first hash_len hex digits of the MD5 of word are appended. Finally the
    slug is trimmed so that slug + suffix fits in max_slug.

    Returns the slug as a string.
    """
    if reducer is None:
        # The default used to crash on reducer.items(); fall back to the
        # common.py mapping that the docstring advertises.
        reducer = {'__': '_'}

    acceptable = ascii_lowercase + digits + '_-'
    pieces = []
    # bytearray yields integer byte values on both Python 2 and Python 3,
    # keeping the original byte-by-byte replacement of non-ASCII text.
    for byte in bytearray(word.lower().encode('utf-8')):
        if byte < 128 and chr(byte) in acceptable:
            pieces.append(chr(byte))
        else:
            pieces.append(replacer)
    slugged = ''.join(pieces)

    for long_string, short_string in reducer.items():
        if not long_string:
            # An empty pattern is "in" every string; repeating it never ends.
            continue
        if long_string in short_string:
            # The replacement re-creates the pattern, so repeating until it
            # disappears would never terminate; a single pass is applied.
            slugged = slugged.replace(long_string, short_string)
            continue
        # Repeat so that runs such as '____' collapse all the way down.
        while long_string in slugged:
            slugged = slugged.replace(long_string, short_string)

    suffix = str(attempt) if attempt else ""

    if len(slugged) > max_slug:
        # The hash is taken from the original word, not the lowercased one.
        md5 = hashlib_md5(word.encode('utf-8')).hexdigest()
        slugged = slugged[:(max_slug - hash_len)] + md5[:hash_len]

    # Leave room for the retry suffix inside the max_slug budget.
    return slugged[:max_slug - len(suffix)] + suffix
# Every slug handed out so far. It is shared across calls so that a later
# feature can detect a collision with an earlier one.
known_slugs = set()


def unique_slugify(feature, reducer, replacer, max_slug, hash_len):
    """Derive a slug for feature and record it in known_slugs.

    * feature: object with path and slug attributes
    * reducer: dict of strings to search/replace
    * replacer: string substituted for unacceptable characters
    * max_slug: maximum slug size
    * hash_len: length of the hashed tail of an over-long slug

    When the feature has a path, the URL-unquoted path is slugged with
    slug_import_tool, retrying with an increasing attempt number until the
    result is not already in known_slugs.

    When the feature has no path, its existing slug is reused with '-' and
    '_' replaced by replacer and each reducer pair applied once. No
    uniqueness check is made in this branch.

    Returns the slug, after adding it to known_slugs.
    """
    raw_path = feature.path
    if raw_path:
        path = unquote_plus(raw_path)
        attempt = 0
        while True:
            # replacer is forwarded here; it used to be silently dropped so
            # that slug_import_tool always fell back to its own default.
            slug = slug_import_tool(
                word=path, attempt=attempt, reducer=reducer,
                replacer=replacer, max_slug=max_slug, hash_len=hash_len)
            if slug not in known_slugs:
                break
            attempt += 1
    else:
        slug = feature.slug.replace('-', replacer).replace('_', replacer)
        for long_string, short_string in reducer.items():
            slug = slug.replace(long_string, short_string)
    known_slugs.add(slug)
    return slug
def suggest_slug_import_tool_50(feature):
    """Propose a slug the way the current slugger would for a new page.

    Doubled underscores are collapsed, '_' is the replacement string, the
    size limit is 50 and the hash length is 5.
    """
    return unique_slugify(
        feature, reducer={'__': '_'}, replacer='_', max_slug=50, hash_len=5)
def suggest_slug_john(feature):
    """John's proposed slugger: abbreviate common path prefixes.

    Same limits as the current slugger (size 50, hash length 5), but with
    extra search/replace rules that shorten frequently seen prefixes.
    """
    # The rules are kept in an OrderedDict so they are applied in exactly
    # this order: dashes first, then the prefix abbreviations (the longer
    # "global_objects" rule ahead of the shorter "reference" rule it
    # overlaps), and doubled underscores last.
    abbreviations = [
        ('-', '_'),
        ('aria_aria_techniques_', 'aria_tech_'),
        ('web_accessibility_', 'web_a11y_'),
        ('web_javascript_reference_global_objects_', 'web_js_ref_globals_'),
        ('web_javascript_reference_', 'web_js_ref_'),
        ('web_security_do_not_track_', 'web_sec_dnt_'),
        ('__', '_'),
    ]
    return unique_slugify(feature, OrderedDict(abbreviations), '_', 50, 5)
def suggest_slug_no_limit(feature):
    """Propose a slug with a size limit of 10000 instead of 50.

    Otherwise the same settings as the current slugger: doubled underscores
    are collapsed, '_' is the replacement string and the hash length is 5.
    """
    collapse_doubles = {'__': '_'}
    return unique_slugify(
        feature, collapse_doubles, '_', max_slug=10000, hash_len=5)
def is_hashed(slug, maxlen=50, hash_len=5):
    """Guess whether slug ends in a hash.

    * slug: the slug to inspect
    * maxlen: the slug size limit; only slugs of exactly this length
      are considered
    * hash_len: how many trailing characters to inspect (previously
      hard-coded to 5)

    This is a heuristic: it returns True when the slug is exactly maxlen
    characters long and at least one of its last hash_len characters is a
    digit. A hash tail made only of the letters a-f is therefore missed,
    and any maxlen-long slug with a digit near the end is reported.
    """
    if hash_len <= 0 or len(slug) != maxlen:
        # slug[-0:] would be the whole slug, so a non-positive hash_len is
        # treated as "no tail to inspect".
        return False
    return any(char in digits for char in slug[-hash_len:])
# One feature read from features.json, as built by extract_features():
# id, parent id (0 when there is none), the 'en' MDN URL and its path below
# /en-US/docs/ (both None without an MDN page), the name, the current slug,
# and whether that slug looks hashed.
Feature = namedtuple("Feature", "id parent url path name slug hashed")
def extract_features(feature_json_path):
    """Extract data from features.json.

    Reads the JSON document at feature_json_path and returns an OrderedDict
    mapping each integer feature id to a Feature tuple, in file order.

    For each entry of the top-level 'features' list:

    * the parent id comes from links.parent, with 0 used when it is empty
    * the name is the 'en' entry when the name is a dict, otherwise the
      name itself
    * url is the 'en' entry of mdn_uri and path is the URL path with the
      '/en-US/docs/' prefix removed; both are None when mdn_uri is null
    * hashed is is_hashed() applied to the current slug

    Raises ValueError if an MDN URL path does not start with
    '/en-US/docs/'.
    """
    # Close the file as soon as it is parsed instead of leaking the handle.
    with open(feature_json_path) as json_file:
        feature_json = json_load(json_file)

    prefix = '/en-US/docs/'
    features = OrderedDict()
    for data in feature_json['features']:
        feature_id = int(data['id'])
        parent_id = int(data['links']['parent'] or 0)
        slug = data['slug']

        raw_name = data['name']
        name = raw_name['en'] if isinstance(raw_name, dict) else raw_name

        raw_uri = data['mdn_uri']
        if raw_uri is None:
            url = None
            path = None
        else:
            url = raw_uri['en']
            raw_path = urlparse(url).path
            # An explicit raise, because an assert disappears under -O.
            if not raw_path.startswith(prefix):
                raise ValueError(
                    "Feature %d: MDN path %r does not start with %r"
                    % (feature_id, raw_path, prefix))
            path = raw_path[len(prefix):]

        features[feature_id] = Feature(
            feature_id, parent_id, url, path, name, slug, is_hashed(slug))
    return features
def create_new_slugs(features):
    """Ask the active slugger for a new slug for every feature.

    Walks features in its own order, calling the module-level suggest_slug
    on each value, and returns an OrderedDict of feature id to new slug in
    that same order.
    """
    return OrderedDict(
        (feature_id, suggest_slug(feature))
        for feature_id, feature in features.items())
def analyze_and_write_csv(csvfile, features, new_slugs):
    """Write results to a CSV and gather stats as we go.

    * csvfile: open file object that the csv writer writes to
    * features: mapping of feature id to Feature
    * new_slugs: mapping of feature id to the proposed slug

    A header row is written first, then one row per feature with its
    current slug, proposed slug and 0/1 flags for "current slug hashed",
    "slug differs" and "proposed slug hashed".

    Returns an OrderedDict of summary statistics keyed by label: how many
    slugs were hashed, how many still are, how many changed, the longest
    proposed slug length, and the last proposed slug of that length (None
    when there are no features).
    """
    writer = csv_writer(csvfile)
    writer.writerow((
        "ID", "URL", "Path", "Name", "Slug", "IsHashed", "NewSlug",
        "IsDifferent", "NewIsHashed"))

    was_hashed_count = 0
    is_hashed_count = 0
    changed_slug_count = 0

    # max() of an empty sequence raises, so an empty feature set reports 0.
    lengths = [len(slug) for slug in new_slugs.values()]
    max_length = max(lengths) if lengths else 0
    biggest_slug = None

    for feature_id, feature in features.items():
        new_slug = new_slugs[feature_id]
        if len(new_slug) == max_length:
            biggest_slug = new_slug

        ishashed_csv = 1 if feature.hashed else 0
        was_hashed_count += ishashed_csv

        diff_slug = 1 if feature.slug != new_slug else 0
        changed_slug_count += diff_slug

        # The longest proposed slug stands in for the slugger's size limit.
        # NOTE(review): if no slug reached the real limit, the longest
        # slugs with a digit near the end are still counted as hashed.
        still_hashed = 1 if is_hashed(new_slug, max_length) else 0
        is_hashed_count += still_hashed

        writer.writerow((
            feature.id, feature.url, feature.path, feature.name,
            feature.slug, ishashed_csv, new_slug, diff_slug, still_hashed))

    return OrderedDict((
        ('slug was hashed', was_hashed_count),
        ('slug still hashed', is_hashed_count),
        ('slug changed', changed_slug_count),
        ('biggest slug length', max_length),
        ('biggest slug', biggest_slug),
    ))
# The slugger that create_new_slugs() calls for every feature. Change this
# to another suggest_slug_* function to change slugger.
suggest_slug = suggest_slug_john
if __name__ == "__main__":
    import sys

    # Both the features.json path and the output CSV path are required.
    if len(sys.argv) < 3:
        # Substitute the script name into the usage text; the %s used to be
        # printed literally. The call form of print matches main().
        print("""\
Create suggested slugs CSV. Usage:
%s [path/to/features.json] [path/to/output.csv]
""" % sys.argv[0])
        # Missing arguments are a usage error, so report failure.
        sys.exit(2)

    feature_json_path = sys.argv[1]
    output_path = sys.argv[2]
    ret = main(feature_json_path, output_path)
    sys.exit(ret)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment