Created
January 26, 2016 15:58
-
-
Save nickstenning/489729480ad8eca8d0a2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
import argparse | |
import elasticsearch | |
import json | |
def get_from_ids(host, port, index, type, ids):
    """Fetch documents by ID from Elasticsearch and print them.

    Args:
        host: Elasticsearch host name.
        port: Elasticsearch port number.
        index: name of the index to read from.
        type: mapping type (passed as ``doc_type``) to read.
        ids: iterable of document IDs; trailing CR/LF is stripped from
            each entry, so an open ID file may be passed directly.

    Each document is printed as "<id> <canonical JSON of _source>".
    """
    es = elasticsearch.Elasticsearch([{'host': host, 'port': port}])
    # `doc_id` rather than `id`: avoid shadowing the builtin.
    for doc_id in ids:
        doc = es.get(index=index, doc_type=type, id=doc_id.rstrip('\r\n'))
        # sort_keys gives a canonical, diff-friendly serialization.
        print("{} {}".format(doc['_id'], json.dumps(doc['_source'], sort_keys=True)))
def main():
    """Parse CLI arguments and print the documents listed in --id_file."""
    parser = argparse.ArgumentParser(
        description="Get a bunch of documents whose IDs are stored in a file")
    parser.add_argument("--host")
    parser.add_argument("--port", type=int)
    parser.add_argument("--index")
    parser.add_argument("--type")
    parser.add_argument("--id_file")
    args = parser.parse_args()
    # `with` closes the ID file deterministically instead of leaking the
    # handle until interpreter exit.
    with open(args.id_file) as id_file:
        get_from_ids(args.host, args.port, args.index, args.type, id_file)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
import argparse | |
import elasticsearch | |
import json | |
def get_random_docs(host, port, index, type, numdocs, only_id):
    """Print a random sample of documents from an index.

    Uses a function_score query with a fixed random_score seed, so the
    "random" sample is reproducible across runs.

    Args:
        host: Elasticsearch host name.
        port: Elasticsearch port number.
        index: name of the index to read from.
        type: mapping type (passed as ``doc_type``) to read.
        numdocs: number of documents to request.
        only_id: if True print only document IDs, otherwise print
            "<id> <canonical JSON of _source>" per document.
    """
    es = elasticsearch.Elasticsearch([{'host': host, 'port': port}])
    query = {
        "size": numdocs,
        "query": {
            "function_score": {
                "functions": [
                    {
                        # Fixed seed makes the sample deterministic.
                        "random_score": {
                            "seed": 11
                        }
                    }
                ],
                "score_mode": "sum",
            }
        }
    }
    results = es.search(index=index, doc_type=type, body=query)
    for doc in results['hits']['hits']:
        if only_id:
            # print() call (not the py2 print statement): consistent with
            # the rest of the script and Python 3 compatible.
            print(doc['_id'])
        else:
            print("{} {}".format(doc['_id'], json.dumps(doc['_source'], sort_keys=True)))
def main():
    """CLI entry point: print a random sample of documents from an index."""
    parser = argparse.ArgumentParser(
        # The old description was copy-pasted from the via-URL updater
        # script; this tool samples random documents.
        description="Get a random sample of documents from an index")
    parser.add_argument("--host")
    parser.add_argument("--port", type=int)
    parser.add_argument("--index")
    parser.add_argument("--type")
    parser.add_argument("--numdocs", type=int)
    parser.add_argument("--only_id", action='store_true')
    args = parser.parse_args()
    get_random_docs(args.host, args.port, args.index, args.type,
                    args.numdocs, args.only_id)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
from __future__ import print_function | |
import argparse | |
import elasticsearch | |
from elasticsearch import helpers | |
from via import fix_url | |
def fetch_all(client, index, doc_type):
    """Return a scan/scroll iterator over every document of *doc_type*."""
    match_all = {'query': {'match_all': {}}}
    return helpers.scan(
        client=client, index=index, doc_type=doc_type, query=match_all)
def get_document_actions(index, documents):
    """Yield bulk 'update' actions for documents whose URLs need fixing.

    Inspects each document's 'favicon' and 'link' fields; a document with
    nothing to change produces no action.
    """
    for doc in documents:
        print('document {}'.format(doc['_id']))
        source = doc['_source']
        changes = {}

        if 'favicon' in source:
            new_favicon = fix_url(source['favicon'])
            if new_favicon is not None:
                changes['favicon'] = new_favicon

        if 'link' in source:
            new_links, links_changed = _process_links(source['link'])
            if links_changed:
                changes['link'] = new_links

        if changes:
            yield {
                '_op_type': 'update',
                '_index': index,
                '_type': 'document',
                '_id': doc['_id'],
                'doc': changes,
            }
def get_annotation_actions(index, annotations):
    """Yield bulk 'update' actions for annotations whose URLs need fixing.

    Looks at the embedded 'document' (favicon + link list), every
    'target' item's 'source', and the top-level 'uri'; an annotation with
    nothing to change produces no action.

    Raises:
        RuntimeError: if an annotation's 'target' is not a list.
    """
    for ann in annotations:
        print('annotation {}'.format(ann['_id']))
        updates = {}

        if 'document' in ann['_source']:
            modified = False
            document = ann['_source']['document']
            if 'favicon' in document:
                fixed_favicon = fix_url(document['favicon'])
                if fixed_favicon is not None:
                    document['favicon'] = fixed_favicon
                    modified = True
            if 'link' in document:
                # Keep the fixed links and OR the flags together: the old
                # code discarded _process_links' fixed list and clobbered
                # the favicon's `modified` flag, dropping updates.
                fixed_links, links_modified = _process_links(document['link'])
                if links_modified:
                    document['link'] = fixed_links
                modified = modified or links_modified
            if modified:
                updates['document'] = document

        if 'target' in ann['_source']:
            modified = False
            target = ann['_source']['target']
            if not isinstance(target, list):
                # "{!r}": "{:r}" is an invalid format spec and would raise
                # ValueError instead of this intended error.
                raise RuntimeError(
                    "target is not a list: {!r}".format(target))
            for item in target:
                if 'source' not in item:
                    continue
                fixed_source = fix_url(item['source'])
                if fixed_source is not None:
                    item['source'] = fixed_source
                    modified = True
            if modified:
                updates['target'] = target

        if 'uri' in ann['_source'] and ann['_source']['uri'] is not None:
            uri = fix_url(ann['_source']['uri'])
            if uri is not None:
                updates['uri'] = uri

        if not updates:
            continue
        yield {'_op_type': 'update',
               '_index': index,
               '_type': 'annotation',
               '_id': ann['_id'],
               'doc': updates}
def _process_links(links):
    """Fix via URLs inside a document's 'link' property.

    Accepts either a bare URL string or a list of {'href': ...} dicts
    (other keys are left untouched).

    Returns:
        (links, modified): the (possibly new) list of link dicts and a
        flag saying whether any href was rewritten.

    Raises:
        RuntimeError: when the property has an unexpected shape.
    """
    modified = False
    # Deal with situations such as: {..., 'link': 'http://...', ...}
    # NOTE: `basestring` keeps py2 unicode handling; this file targets
    # python2 per its shebang.
    if isinstance(links, basestring):
        links = [{'href': links}]
    if not isinstance(links, list):
        # "{!r}" throughout: "{:r}" is an invalid format spec and would
        # raise ValueError instead of these intended errors.
        raise RuntimeError(
            "link prop wasn't string or list: {!r}".format(links))
    for link in links:
        if not isinstance(link, dict):
            raise RuntimeError(
                "link item wasn't a dict: {!r}".format(link))
        if 'href' not in link:
            continue
        href = link['href']
        if not isinstance(href, basestring):
            raise RuntimeError(
                "link['href'] wasn't a string: {!r}".format(href))
        fixed_href = fix_url(href)
        if fixed_href is not None:
            link['href'] = fixed_href
            modified = True
    return links, modified
def main():
    """CLI entry point: rewrite via.hypothes.is URLs stored in an index."""
    parser = argparse.ArgumentParser(
        description="Update documents containing via.hypothes.is URLs")
    parser.add_argument("host")
    # Coerce to int for consistency with the companion scripts.
    parser.add_argument("port", type=int)
    parser.add_argument("index")
    parser.add_argument("--skip_documents", action='store_true')
    parser.add_argument("--skip_annotations", action='store_true')
    args = parser.parse_args()

    es = elasticsearch.Elasticsearch([{'host': args.host, 'port': args.port}])

    if not args.skip_documents:
        all_documents = fetch_all(es, args.index, 'document')
        actions = get_document_actions(args.index, all_documents)
        helpers.bulk(es, actions)

    if not args.skip_annotations:
        all_annotations = fetch_all(es, args.index, 'annotation')
        actions = get_annotation_actions(args.index, all_annotations)
        helpers.bulk(es, actions)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Possible via prefixes to be stripped off. These should be given in decreasing
# order of specificity, to ensure that the longest possible via prefix is
# removed.
PREFIX_RE = '|'.join([
    r'static/__shared/viewer/web/viewer\.html\?file=/h/id_/',
    r'static/__shared/viewer/web/viewer\.html\?file=/id_/',
    r'static/__shared/viewer/web/viewer\.html\?file=',
    r'h/\d{14}oe_/',
    r'h/\d{14}/',
    r'h/oe_/',
    r'oe_/',
    r'h/',
    r'\d{14}oe_/',
    r'\d{14}/',
])

# Hostname dots are escaped: an unescaped '.' matches any character, so
# lookalike hosts (e.g. "viaXhypothesYis") would also have been rewritten.
VIA_URL_RE = re.compile(
    r'https?://via\.hypothes\.is/(?:' + PREFIX_RE + r')?(.+)',
    re.IGNORECASE)


def fix_url(url):
    """Strip via.hypothes.is proxy prefixes from *url* and repair the result.

    Returns:
        The fixed URL, or None when *url* is not via-proxied at all
        (signifying no change is needed).
    """
    match = VIA_URL_RE.match(url)
    # If this URL isn't prefixed by via at all, then we should return None,
    # signifying no change:
    if match is None:
        return None
    url = match.group(1)

    # Strip off any duplicate via prefixes
    while True:
        match = VIA_URL_RE.match(url)
        if match is None:
            break
        url = match.group(1)

    # '//example.com' -> 'http://example.com'
    if url.startswith('//'):
        return 'http:' + url

    # 'https:/example.com' -> 'https://example.com'
    match = re.match(r'(https?):/([^/].*)', url, re.IGNORECASE)
    if match is not None:
        return match.group(1) + '://' + match.group(2)

    # No recognizable scheme at all -> assume plain http.
    match = re.match(r'[a-z-]+://(.*)', url, re.IGNORECASE)
    if match is None:
        return 'http://' + url
    return url
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pytest | |
from fixurls.via import fix_url | |
# (input_url, expected_output) pairs for fix_url; expected None means
# "no change needed".
FIXTURES = [
    # Don't touch links to the via homepage
    ('https://via.hypothes.is', None),
    ('https://via.hypothes.is/', None),
    # Don't touch all kinds of other normal links
    ('https://example.com/foo/bar', None),
    ('android-app://com.google.android.youtube/...', None),
    ('http://example.com/foo.pdf', None),
    ('https://w3c-social.github.io/activitypump/', None),
    ('https://example.com/https://via.hypothes.is', None),
    # Normal via links
    ('https://via.hypothes.is/https://example.com',
     'https://example.com'),
    ('https://via.hypothes.is/https://example.com/foo/bar',
     'https://example.com/foo/bar'),
    ('https://via.hypothes.is/http://example.com/foo/bar',
     'http://example.com/foo/bar'),
    # Insecure via links
    ('http://via.hypothes.is/https://example.com',
     'https://example.com'),
    ('http://via.hypothes.is/https://example.com/foo/bar',
     'https://example.com/foo/bar'),
    ('http://via.hypothes.is/http://example.com/foo/bar',
     'http://example.com/foo/bar'),
    # Incorrect case via links
    ('http://VIA.Hypothes.IS/http://example.com/foo/bar',
     'http://example.com/foo/bar'),
    # Phone app links (yes, really)
    ('https://via.hypothes.is/android-app://com.google.android.youtube/...',
     'android-app://com.google.android.youtube/...'),
    # Links missing scheme
    ('https://via.hypothes.is/www.artbusiness.com',
     'http://www.artbusiness.com'),
    ('https://via.hypothes.is/devblog.avdi.org/2015/',
     'http://devblog.avdi.org/2015/'),
    ('https://via.hypothes.is/tapcore.com/...',
     'http://tapcore.com/...'),
    ('https://via.hypothes.is///codegeekz.com/wp-content/uploads/codegeekz-favicon.png',
     'http://codegeekz.com/wp-content/uploads/codegeekz-favicon.png'),
    # Strange prefixes
    ('https://via.hypothes.is/h/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/oe_/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/h/oe_/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/20150520202836/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/20150520202836oe_/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/h/20150520202836/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/h/20150520202836oe_/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/static/__shared/viewer/web/viewer.html?file=http://example.com/foo.pdf',
     'http://example.com/foo.pdf'),
    ('https://via.hypothes.is/static/__shared/viewer/web/viewer.html?file=/id_/http://example.com/foo.pdf',
     'http://example.com/foo.pdf'),
    ('https://via.hypothes.is/static/__shared/viewer/web/viewer.html?file=/h/id_/http://example.com/foo.pdf',
     'http://example.com/foo.pdf'),
    # Strange prefixes with missing scheme
    ('https://via.hypothes.is/h/www.example.com',
     'http://www.example.com'),
    ('https://via.hypothes.is/20150520202836/foo.com/bar/',
     'http://foo.com/bar/'),
    ('https://via.hypothes.is/oe_///www.example.com',
     'http://www.example.com'),
    # URLs with broken prefixes
    ('https://via.hypothes.is/https:/w3c-social.github.io/activitypump/',
     'https://w3c-social.github.io/activitypump/'),
    # Double prefixed
    ('https://via.hypothes.is/https://via.hypothes.is/http://www.nytimes.com/roomfordebate/',
     'http://www.nytimes.com/roomfordebate/'),
]
@pytest.mark.parametrize(('url', 'expected'), FIXTURES)
def test_fix_url(url, expected):
    """fix_url returns the cleaned URL, or None when no change is needed."""
    assert fix_url(url) == expected
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment