jwhitlock · November 17, 2015 21:00
diff --git a/suggest_slugs.py b/suggest_slugs.py
 #!/usr/bin/env python
 """Generate a CSV of new slugs."""

 from collections import namedtuple, OrderedDict
 from csv import writer as csv_writer
 from hashlib import md5 as hashlib_md5
 from json import load as json_load
 from string import ascii_lowercase, digits
 from urllib import unquote_plus
 from urlparse import urlparse


 def main(feature_json_path, output_path):
    """Build the slug report: load features, derive slugs, write the CSV.

    The statistics gathered while writing the CSV are printed one per line,
    with the names right-aligned on the longest name.  Always returns 0.
    """
    feature_map = extract_features(feature_json_path)
    slug_map = create_new_slugs(feature_map)
    with open(output_path, 'wb') as out_file:
        summary = analyze_and_write_csv(out_file, feature_map, slug_map)
    width = max(map(len, summary))
    for label in summary:
        print("%*s: %s" % (width, label, summary[label]))
    return 0


 def slug_import_tool(
        word, attempt=0, reducer=None, replacer='_', max_slug=50, hash_len=5):
    """Slugger from browsercompat project, tools/common.py

    The lower-cased word is encoded as UTF-8 and every byte outside
    a-z, 0-9, '_' and '-' is swapped for replacer.  The reducer
    replacements are then applied in the dict's iteration order.  A slug
    longer than max_slug is cut and ends with the first hash_len hex digits
    of the MD5 of the word.  A non-zero attempt is appended as a decimal
    suffix, trimming the slug so the result still fits in max_slug.

    Modified with more arguments:
    * reducer: dict of strings to search/replace; None selects
      {'__': '_'}, the reduction used in common.py
    * replacer: string to replace bad content with, '_' in common.py
    * max_slug: maximum slug size, 50 in common.py
    * hash_len: length of hashed bit, 5 in common.py

    Returns the slug as a string.
    """
    if reducer is None:
        reducer = {'__': '_'}

    # Compare byte values, so the loop behaves the same whether iterating
    # the encoded word yields 1-character strings or integers.
    acceptable = frozenset(
        bytearray((ascii_lowercase + digits + '_-').encode('ascii')))
    raw = bytearray(word.lower().encode('utf-8'))
    slugged = ''.join(
        chr(byte) if byte in acceptable else replacer for byte in raw)

    for long_string, short_string in reducer.items():
        if not long_string:
            # An empty search string matches everywhere; nothing to reduce.
            continue
        if long_string in short_string:
            # Repeating this replacement would never finish, so do it once.
            slugged = slugged.replace(long_string, short_string)
            continue
        while long_string in slugged:
            slugged = slugged.replace(long_string, short_string)

    suffix = str(attempt) if attempt else ""

    if len(slugged) > max_slug:
        md5 = hashlib_md5(word.encode('utf-8')).hexdigest()
        slugged = slugged[:(max_slug - hash_len)] + md5[:hash_len]
    return slugged[:max_slug - len(suffix)] + suffix


 # Every slug handed out so far by unique_slugify.
 known_slugs = set()


 def unique_slugify(feature, reducer, replacer, max_slug, hash_len):
    """Derive a slug for the feature and record it in known_slugs.

    Arguments:
    * feature: object with path and slug attributes
    * reducer: dict of strings to search/replace
    * replacer: string that replaces unacceptable content
    * max_slug: maximum slug size
    * hash_len: length of the hash used on over-long slugs

    When the feature has a path, the URL-unquoted path is slugged with
    slug_import_tool, raising the attempt number until the slug is not in
    known_slugs.  Otherwise the existing slug is reused, with '-' and '_'
    replaced by replacer and each reducer replacement applied once.
    """
    raw_path = feature.path
    if raw_path:
        path = unquote_plus(raw_path)
        attempt = 0
        while True:
            # replacer is forwarded so a non-default value takes effect.
            slug = slug_import_tool(
                word=path, attempt=attempt, reducer=reducer,
                replacer=replacer, max_slug=max_slug, hash_len=hash_len)
            if slug not in known_slugs:
                break
            attempt += 1
    else:
        # NOTE(review): this branch does not check known_slugs before
        # returning - confirm reused slugs cannot collide.
        slug = feature.slug.replace('-', replacer).replace('_', replacer)
        for long_string, short_string in reducer.items():
            slug = slug.replace(long_string, short_string)

    known_slugs.add(slug)
    return slug


 def suggest_slug_import_tool_50(feature):
    """Slug the feature the way the current slugger does for new pages.

    Replaces with '_', collapses '__' to '_', and limits the slug to 50
    characters with a 5 character hash.
    """
    return unique_slugify(
        feature, reducer={'__': '_'}, replacer='_', max_slug=50, hash_len=5)


 def suggest_slug_john(feature):
    """Slug the feature with John's proposed abbreviations.

    Keeps the 50 character limit and 5 character hash, but first turns '-'
    into '_' and shortens several long path prefixes, then collapses '__'.
    The replacements are applied in the order they are added below.
    """
    abbreviations = OrderedDict()
    abbreviations['-'] = '_'
    abbreviations['aria_aria_techniques_'] = 'aria_tech_'
    abbreviations['web_accessibility_'] = 'web_a11y_'
    abbreviations['web_javascript_reference_global_objects_'] = (
        'web_js_ref_globals_')
    abbreviations['web_javascript_reference_'] = 'web_js_ref_'
    abbreviations['web_security_do_not_track_'] = 'web_sec_dnt_'
    abbreviations['__'] = '_'
    return unique_slugify(
        feature, reducer=abbreviations, replacer='_', max_slug=50,
        hash_len=5)


 def suggest_slug_no_limit(feature):
    """Slug the feature with a 10000 character limit.

    Collapses '__' to '_' like the current slugger; only a slug longer than
    10000 characters would be hashed.
    """
    return unique_slugify(
        feature, reducer={'__': '_'}, replacer='_', max_slug=10000,
        hash_len=5)


 def is_hashed(slug, maxlen=50, hash_len=5):
    """Guess whether a slug ends in an MD5 hash fragment.

    Arguments:
    * slug: the slug to inspect
    * maxlen: length a slug has when it was cut and hashed
    * hash_len: number of trailing characters to treat as the hash

    Returns True when the slug is exactly maxlen long and its last hash_len
    characters are all lowercase hex digits with at least one decimal
    digit.  Requiring hex digits throughout keeps endings such as 'html5'
    from being counted as hashes.
    """
    if hash_len <= 0 or len(slug) != maxlen:
        return False
    tail = slug[-hash_len:]
    hex_digits = digits + 'abcdef'
    if any(c not in hex_digits for c in tail):
        return False
    return any(c in digits for c in tail)


 # One entry of features.json as built by extract_features: numeric id,
 # parent id (0 when there is none), English MDN URL and its path below
 # /en-US/docs/ (both None without an MDN URI), name, current slug, and
 # whether that slug looks hashed.
 Feature = namedtuple("Feature", "id parent url path name slug hashed")


 def extract_features(feature_json_path):
    """Extract data from features.json

    Reads the 'features' list of the JSON file and returns an OrderedDict
    mapping each integer feature ID to a Feature tuple, in file order.

    For each feature, the name is the 'en' entry when the name is a dict,
    the parent ID is 0 when there is no parent link, and the URL and path
    are None when there is no MDN URI.  Otherwise the path is the URL path
    with the leading '/en-US/docs/' removed.

    Raises ValueError if an MDN URL path does not start with '/en-US/docs/'.
    """
    prefix = '/en-US/docs/'
    # Close the file as soon as it is parsed.
    with open(feature_json_path) as json_file:
        feature_json = json_load(json_file)

    features = OrderedDict()
    for data in feature_json['features']:
        feature_id = int(data['id'])
        parent_id = int(data['links']['parent'] or 0)
        slug = data['slug']
        raw_name = data['name']
        name = raw_name['en'] if isinstance(raw_name, dict) else raw_name
        raw_uri = data['mdn_uri']
        if raw_uri is None:
            url = None
            path = None
        else:
            url = raw_uri['en']
            raw_path = urlparse(url).path
            # An explicit check, because assert is skipped under python -O.
            if not raw_path.startswith(prefix):
                raise ValueError(
                    "MDN URL %r does not start with %s" % (url, prefix))
            path = raw_path[len(prefix):]
        features[feature_id] = Feature(
            feature_id, parent_id, url, path, name, slug, is_hashed(slug))
    return features


 def create_new_slugs(features):
    """Map every feature ID to the slug suggest_slug proposes for it.

    Features are processed in the iteration order of the mapping, and the
    returned OrderedDict keeps that order.
    """
    return OrderedDict(
        (feature_id, suggest_slug(feature))
        for feature_id, feature in features.items())


 def analyze_and_write_csv(csvfile, features, new_slugs):
    """Write results to a CSV and gather stats as we go.

    Arguments:
    * csvfile: open file object handed to the CSV writer
    * features: mapping of feature ID to a 7-item Feature tuple
    * new_slugs: mapping of feature ID to the proposed slug

    A header row is always written, followed by one row per feature with
    the columns ID, URL, Path, Name, Slug, IsHashed, NewSlug, IsDifferent
    and NewIsHashed; the three flag columns hold 1 or 0.

    Returns an OrderedDict of statistics: how many old slugs were hashed,
    how many new slugs are still hashed, how many slugs changed, the length
    of the longest new slug, and the last new slug of that length.  With no
    new slugs the length is 0 and the biggest slug is None.
    """
    writer = csv_writer(csvfile)
    writer.writerow((
        "ID", "URL", "Path", "Name", "Slug", "IsHashed", "NewSlug",
        "IsDifferent", "NewIsHashed"))

    # max() raises ValueError on an empty sequence, so guard that case.
    slug_lengths = [len(slug) for slug in new_slugs.values()]
    max_length = max(slug_lengths) if slug_lengths else 0

    was_hashed_count = 0
    is_hashed_count = 0
    changed_slug_count = 0
    biggest_slug = None
    for feature_id, feature in features.items():
        new_slug = new_slugs[feature_id]
        if len(new_slug) == max_length:
            biggest_slug = new_slug
        row_id, _parent_id, url, path, name, slug, ishashed = feature
        ishashed_csv = 1 if ishashed else 0
        was_hashed_count += ishashed_csv
        diff_slug = 1 if (slug != new_slug) else 0
        changed_slug_count += diff_slug
        # NOTE(review): maxlen here is the longest new slug, not the
        # slugger's limit, so only slugs of that length can count as hashed.
        still_hashed = 1 if is_hashed(new_slug, max_length) else 0
        is_hashed_count += still_hashed
        writer.writerow((
            row_id, url, path, name, slug, ishashed_csv, new_slug,
            diff_slug, still_hashed))

    return OrderedDict((
        ('slug was hashed', was_hashed_count),
        ('slug still hashed', is_hashed_count),
        ('slug changed', changed_slug_count),
        ('biggest slug length', max_length),
        ('biggest slug', biggest_slug),
    ))


 # Change this to change slugger.  create_new_slugs() calls suggest_slug
 # once per feature; the other sluggers defined in this file are
 # suggest_slug_import_tool_50 and suggest_slug_no_limit.
 suggest_slug = suggest_slug_john


 if __name__ == "__main__":
    import sys

    # Both the features.json path and the output path are required.
    if len(sys.argv) < 3:
        # Fill the %s placeholder with the name the script was run as.
        print(
            "Create suggested slugs CSV.  Usage:\n"
            "\n"
            "%s [path/to/features.json] [path/to/output.csv]\n"
            % sys.argv[0])
        sys.exit(0)

    feature_json_path = sys.argv[1]
    output_path = sys.argv[2]
    ret = main(feature_json_path, output_path)
    sys.exit(ret)
	#!/usr/bin/env python
	"""Generate a CSV of new slugs."""

	from collections import namedtuple, OrderedDict
	from csv import writer as csv_writer
	from hashlib import md5 as hashlib_md5
	from json import load as json_load
	from string import ascii_lowercase, digits
	from urllib import unquote_plus
	from urlparse import urlparse


	def main(feature_json_path, output_path):
	    """Read features.json, and write a new slug CSV.

	    The statistics gathered while writing the CSV are printed one per
	    line, with the names right-aligned on the longest name.  Always
	    returns 0.
	    """
	    features = extract_features(feature_json_path)
	    new_slugs = create_new_slugs(features)
	    with open(output_path, 'wb') as csvfile:
	        stats = analyze_and_write_csv(csvfile, features, new_slugs)
	    max_name = max(len(name) for name in stats)
	    for name, stat in stats.items():
	        print("%*s: %s" % (max_name, name, stat))
	    return 0


	def slug_import_tool(
	        word, attempt=0, reducer=None, replacer='_', max_slug=50,
	        hash_len=5):
	    """Slugger from browsercompat project, tools/common.py

	    The lower-cased word is encoded as UTF-8 and every byte outside
	    a-z, 0-9, '_' and '-' is swapped for replacer.  The reducer
	    replacements are then applied in the dict's iteration order.  A slug
	    longer than max_slug is cut and ends with the first hash_len hex
	    digits of the MD5 of the word.  A non-zero attempt is appended as a
	    decimal suffix, trimming the slug so the result still fits.

	    Modified with more arguments:
	    * reducer: dict of strings to search/replace; None selects
	      {'__': '_'}, the reduction used in common.py
	    * replacer: string to replace bad content with, '_' in common.py
	    * max_slug: maximum slug size, 50 in common.py
	    * hash_len: length of hashed bit, 5 in common.py

	    Returns the slug as a string.
	    """
	    if reducer is None:
	        reducer = {'__': '_'}

	    # Compare byte values, so the loop behaves the same whether iterating
	    # the encoded word yields 1-character strings or integers.
	    acceptable = frozenset(
	        bytearray((ascii_lowercase + digits + '_-').encode('ascii')))
	    raw = bytearray(word.lower().encode('utf-8'))
	    slugged = ''.join(
	        chr(byte) if byte in acceptable else replacer for byte in raw)

	    for long_string, short_string in reducer.items():
	        if not long_string:
	            # An empty search string matches everywhere; skip it.
	            continue
	        if long_string in short_string:
	            # Repeating this replacement would never finish; do it once.
	            slugged = slugged.replace(long_string, short_string)
	            continue
	        while long_string in slugged:
	            slugged = slugged.replace(long_string, short_string)

	    suffix = str(attempt) if attempt else ""

	    if len(slugged) > max_slug:
	        md5 = hashlib_md5(word.encode('utf-8')).hexdigest()
	        slugged = slugged[:(max_slug - hash_len)] + md5[:hash_len]
	    return slugged[:max_slug - len(suffix)] + suffix


	# Every slug handed out so far by unique_slugify.
	known_slugs = set()


	def unique_slugify(feature, reducer, replacer, max_slug, hash_len):
	    """Keep slugging until a unique slug is derived.

	    Arguments:
	    * feature: object with path and slug attributes
	    * reducer: dict of strings to search/replace
	    * replacer: string that replaces unacceptable content
	    * max_slug: maximum slug size
	    * hash_len: length of the hash used on over-long slugs

	    When the feature has a path, the URL-unquoted path is slugged with
	    slug_import_tool, raising the attempt number until the slug is not in
	    known_slugs.  Otherwise the existing slug is reused, with '-' and '_'
	    replaced by replacer and each reducer replacement applied once.  The
	    returned slug is added to known_slugs.
	    """
	    raw_path = feature.path
	    if raw_path:
	        path = unquote_plus(raw_path)
	        attempt = 0
	        while True:
	            # replacer is forwarded so a non-default value takes effect.
	            slug = slug_import_tool(
	                word=path, attempt=attempt, reducer=reducer,
	                replacer=replacer, max_slug=max_slug, hash_len=hash_len)
	            if slug not in known_slugs:
	                break
	            attempt += 1
	    else:
	        slug = feature.slug.replace('-', replacer).replace('_', replacer)
	        for long_string, short_string in reducer.items():
	            slug = slug.replace(long_string, short_string)

	    known_slugs.add(slug)
	    return slug


	def suggest_slug_import_tool_50(feature):
	    """Emulates the current slugger for new pages.

	    Replaces with '_', collapses '__' to '_', and limits the slug to 50
	    characters with a 5 character hash.
	    """
	    reducer = {'__': '_'}
	    return unique_slugify(feature, reducer, '_', 50, 5)


	def suggest_slug_john(feature):
	    """John's proposed slugger.

	    Keeps the 50 character limit and 5 character hash, but first turns
	    '-' into '_' and shortens several long path prefixes, then collapses
	    '__'.  The replacements are applied in the order listed.
	    """
	    reducer = OrderedDict((
	        ('-', '_'),
	        ('aria_aria_techniques_', 'aria_tech_'),
	        ('web_accessibility_', 'web_a11y_'),
	        ('web_javascript_reference_global_objects_', 'web_js_ref_globals_'),
	        ('web_javascript_reference_', 'web_js_ref_'),
	        ('web_security_do_not_track_', 'web_sec_dnt_'),
	        ('__', '_'),
	    ))
	    return unique_slugify(feature, reducer, '_', 50, 5)


	def suggest_slug_no_limit(feature):
	    """Allow slugs to expand without hashing.

	    Collapses '__' to '_' like the current slugger, with the size limit
	    raised to 10000 characters.
	    """
	    reducer = {'__': '_'}
	    return unique_slugify(feature, reducer, '_', 10000, 5)


	def is_hashed(slug, maxlen=50, hash_len=5):
	    """Guess whether a slug ends in an MD5 hash fragment.

	    Arguments:
	    * slug: the slug to inspect
	    * maxlen: length a slug has when it was cut and hashed
	    * hash_len: number of trailing characters to treat as the hash

	    Returns True when the slug is exactly maxlen long and its last
	    hash_len characters are all lowercase hex digits with at least one
	    decimal digit.  Requiring hex digits throughout keeps endings such
	    as 'html5' from being counted as hashes.
	    """
	    if hash_len <= 0 or len(slug) != maxlen:
	        return False
	    tail = slug[-hash_len:]
	    hex_digits = digits + 'abcdef'
	    if any(c not in hex_digits for c in tail):
	        return False
	    return any(c in digits for c in tail)


	# One entry of features.json as built by extract_features: numeric id,
	# parent id (0 when there is none), English MDN URL and its path below
	# /en-US/docs/ (both None without an MDN URI), name, current slug, and
	# whether that slug looks hashed.
	Feature = namedtuple("Feature", "id parent url path name slug hashed")


	def extract_features(feature_json_path):
	    """Extract data from features.json

	    Reads the 'features' list of the JSON file and returns an OrderedDict
	    mapping each integer feature ID to a Feature tuple, in file order.

	    For each feature, the name is the 'en' entry when the name is a dict,
	    the parent ID is 0 when there is no parent link, and the URL and path
	    are None when there is no MDN URI.  Otherwise the path is the URL
	    path with the leading '/en-US/docs/' removed.

	    Raises ValueError if an MDN URL path does not start with
	    '/en-US/docs/'.
	    """
	    prefix = '/en-US/docs/'
	    # Close the file as soon as it is parsed.
	    with open(feature_json_path) as json_file:
	        feature_json = json_load(json_file)

	    features = OrderedDict()
	    for data in feature_json['features']:
	        feature_id = int(data['id'])
	        parent_id = int(data['links']['parent'] or 0)
	        slug = data['slug']
	        hashed = is_hashed(slug)
	        raw_name = data['name']
	        if isinstance(raw_name, dict):
	            name = raw_name['en']
	        else:
	            name = raw_name
	        raw_uri = data['mdn_uri']
	        if raw_uri is None:
	            url = None
	            path = None
	        else:
	            url = raw_uri['en']
	            parts = urlparse(url)
	            raw_path = parts.path
	            # An explicit check, because assert is skipped under -O.
	            if not raw_path.startswith(prefix):
	                raise ValueError(
	                    "MDN URL %r does not start with %s" % (url, prefix))
	            path = raw_path[len(prefix):]
	        features[feature_id] = Feature(
	            feature_id, parent_id, url, path, name, slug, hashed)
	    return features


	def create_new_slugs(features):
	    """For each feature, create a new slug.

	    Calls suggest_slug on every feature in the mapping's iteration order
	    and returns an OrderedDict of feature ID to new slug in that order.
	    """
	    new_slugs = OrderedDict()
	    for feature_id, feature in features.items():
	        new_slugs[feature_id] = suggest_slug(feature)
	    return new_slugs


	def analyze_and_write_csv(csvfile, features, new_slugs):
	    """Write results to a CSV and gather stats as we go.

	    Arguments:
	    * csvfile: open file object handed to the CSV writer
	    * features: mapping of feature ID to a 7-item Feature tuple
	    * new_slugs: mapping of feature ID to the proposed slug

	    A header row is always written, followed by one row per feature with
	    the columns ID, URL, Path, Name, Slug, IsHashed, NewSlug, IsDifferent
	    and NewIsHashed; the three flag columns hold 1 or 0.

	    Returns an OrderedDict of statistics: how many old slugs were hashed,
	    how many new slugs are still hashed, how many slugs changed, the
	    length of the longest new slug, and the last new slug of that
	    length.  With no new slugs the length is 0 and the biggest slug is
	    None.
	    """
	    writer = csv_writer(csvfile)
	    writer.writerow((
	        "ID", "URL", "Path", "Name", "Slug", "IsHashed", "NewSlug",
	        "IsDifferent", "NewIsHashed"))

	    # max() raises ValueError on an empty sequence, so guard that case.
	    slug_lengths = [len(slug) for slug in new_slugs.values()]
	    max_length = max(slug_lengths) if slug_lengths else 0

	    was_hashed_count = 0
	    is_hashed_count = 0
	    changed_slug_count = 0
	    biggest_slug = None
	    for feature_id, feature in features.items():
	        new_slug = new_slugs[feature_id]
	        if len(new_slug) == max_length:
	            biggest_slug = new_slug
	        row_id, _parent_id, url, path, name, slug, ishashed = feature
	        ishashed_csv = 1 if ishashed else 0
	        was_hashed_count += ishashed_csv
	        diff_slug = 1 if (slug != new_slug) else 0
	        changed_slug_count += diff_slug
	        still_hashed = 1 if is_hashed(new_slug, max_length) else 0
	        is_hashed_count += still_hashed
	        writer.writerow((
	            row_id, url, path, name, slug, ishashed_csv, new_slug,
	            diff_slug, still_hashed))

	    return OrderedDict((
	        ('slug was hashed', was_hashed_count),
	        ('slug still hashed', is_hashed_count),
	        ('slug changed', changed_slug_count),
	        ('biggest slug length', max_length),
	        ('biggest slug', biggest_slug),
	    ))


	# Change this to change slugger.  create_new_slugs() calls suggest_slug
	# once per feature; the other sluggers defined in this file are
	# suggest_slug_import_tool_50 and suggest_slug_no_limit.
	suggest_slug = suggest_slug_john


	if __name__ == "__main__":
	    import sys

	    # Both the features.json path and the output path are required.
	    if len(sys.argv) < 3:
	        # Fill the %s placeholder with the name the script was run as.
	        print(
	            "Create suggested slugs CSV.  Usage:\n"
	            "\n"
	            "%s [path/to/features.json] [path/to/output.csv]\n"
	            % sys.argv[0])
	        sys.exit(0)

	    feature_json_path = sys.argv[1]
	    output_path = sys.argv[2]
	    ret = main(feature_json_path, output_path)
	    sys.exit(ret)
No results found