Created
January 26, 2016 15:58
-
-
Save nickstenning/489729480ad8eca8d0a2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
import argparse | |
import elasticsearch | |
import json | |
def get_from_ids(host, port, index, type, ids):
    """Fetch documents by ID from Elasticsearch and print them.

    Args:
        host: Elasticsearch host name.
        port: Elasticsearch port number.
        index: name of the index to read from.
        type: mapping type (passed as ``doc_type``) to read.
        ids: iterable of document IDs; trailing CR/LF is stripped from
            each entry, so an open ID file may be passed directly.

    Each document is printed as "<id> <canonical JSON of _source>".
    """
    es = elasticsearch.Elasticsearch([{'host': host, 'port': port}])
    # `doc_id` rather than `id`: avoid shadowing the builtin.
    for doc_id in ids:
        doc = es.get(index=index, doc_type=type, id=doc_id.rstrip('\r\n'))
        # sort_keys gives a canonical, diff-friendly serialization.
        print("{} {}".format(doc['_id'], json.dumps(doc['_source'], sort_keys=True)))
def main():
    """Parse CLI arguments and print the documents listed in --id_file."""
    parser = argparse.ArgumentParser(
        description="Get a bunch of documents whose IDs are stored in a file")
    parser.add_argument("--host")
    parser.add_argument("--port", type=int)
    parser.add_argument("--index")
    parser.add_argument("--type")
    parser.add_argument("--id_file")
    args = parser.parse_args()
    # `with` closes the ID file deterministically instead of leaking the
    # handle until interpreter exit.
    with open(args.id_file) as id_file:
        get_from_ids(args.host, args.port, args.index, args.type, id_file)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
import argparse | |
import elasticsearch | |
import json | |
def get_random_docs(host, port, index, type, numdocs, only_id):
    """Print a random sample of documents from an index.

    Uses a function_score query with a fixed random_score seed, so the
    "random" sample is reproducible across runs.

    Args:
        host: Elasticsearch host name.
        port: Elasticsearch port number.
        index: name of the index to read from.
        type: mapping type (passed as ``doc_type``) to read.
        numdocs: number of documents to request.
        only_id: if True print only document IDs, otherwise print
            "<id> <canonical JSON of _source>" per document.
    """
    es = elasticsearch.Elasticsearch([{'host': host, 'port': port}])
    query = {
        "size": numdocs,
        "query": {
            "function_score": {
                "functions": [
                    {
                        # Fixed seed makes the sample deterministic.
                        "random_score": {
                            "seed": 11
                        }
                    }
                ],
                "score_mode": "sum",
            }
        }
    }
    results = es.search(index=index, doc_type=type, body=query)
    for doc in results['hits']['hits']:
        if only_id:
            # print() call (not the py2 print statement): consistent with
            # the rest of the script and Python 3 compatible.
            print(doc['_id'])
        else:
            print("{} {}".format(doc['_id'], json.dumps(doc['_source'], sort_keys=True)))
def main():
    """CLI entry point: print a random sample of documents from an index."""
    parser = argparse.ArgumentParser(
        # The old description was copy-pasted from the via-URL updater
        # script; this tool samples random documents.
        description="Get a random sample of documents from an index")
    parser.add_argument("--host")
    parser.add_argument("--port", type=int)
    parser.add_argument("--index")
    parser.add_argument("--type")
    parser.add_argument("--numdocs", type=int)
    parser.add_argument("--only_id", action='store_true')
    args = parser.parse_args()
    get_random_docs(args.host, args.port, args.index, args.type,
                    args.numdocs, args.only_id)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
from __future__ import print_function | |
import argparse | |
import elasticsearch | |
from elasticsearch import helpers | |
from via import fix_url | |
def fetch_all(client, index, doc_type):
    """Return a scan/scroll iterator over every document of *doc_type*."""
    match_all = {'query': {'match_all': {}}}
    return helpers.scan(
        client=client, index=index, doc_type=doc_type, query=match_all)
def get_document_actions(index, documents):
    """Yield bulk 'update' actions for documents whose URLs need fixing.

    Inspects each document's 'favicon' and 'link' fields; a document with
    nothing to change produces no action.
    """
    for doc in documents:
        print('document {}'.format(doc['_id']))
        source = doc['_source']
        changes = {}

        if 'favicon' in source:
            new_favicon = fix_url(source['favicon'])
            if new_favicon is not None:
                changes['favicon'] = new_favicon

        if 'link' in source:
            new_links, links_changed = _process_links(source['link'])
            if links_changed:
                changes['link'] = new_links

        if changes:
            yield {
                '_op_type': 'update',
                '_index': index,
                '_type': 'document',
                '_id': doc['_id'],
                'doc': changes,
            }
def get_annotation_actions(index, annotations):
    """Yield bulk 'update' actions for annotations whose URLs need fixing.

    Looks at the embedded 'document' (favicon + link list), every
    'target' item's 'source', and the top-level 'uri'; an annotation with
    nothing to change produces no action.

    Raises:
        RuntimeError: if an annotation's 'target' is not a list.
    """
    for ann in annotations:
        print('annotation {}'.format(ann['_id']))
        updates = {}

        if 'document' in ann['_source']:
            modified = False
            document = ann['_source']['document']
            if 'favicon' in document:
                fixed_favicon = fix_url(document['favicon'])
                if fixed_favicon is not None:
                    document['favicon'] = fixed_favicon
                    modified = True
            if 'link' in document:
                # Keep the fixed links and OR the flags together: the old
                # code discarded _process_links' fixed list and clobbered
                # the favicon's `modified` flag, dropping updates.
                fixed_links, links_modified = _process_links(document['link'])
                if links_modified:
                    document['link'] = fixed_links
                modified = modified or links_modified
            if modified:
                updates['document'] = document

        if 'target' in ann['_source']:
            modified = False
            target = ann['_source']['target']
            if not isinstance(target, list):
                # "{!r}": "{:r}" is an invalid format spec and would raise
                # ValueError instead of this intended error.
                raise RuntimeError(
                    "target is not a list: {!r}".format(target))
            for item in target:
                if 'source' not in item:
                    continue
                fixed_source = fix_url(item['source'])
                if fixed_source is not None:
                    item['source'] = fixed_source
                    modified = True
            if modified:
                updates['target'] = target

        if 'uri' in ann['_source'] and ann['_source']['uri'] is not None:
            uri = fix_url(ann['_source']['uri'])
            if uri is not None:
                updates['uri'] = uri

        if not updates:
            continue
        yield {'_op_type': 'update',
               '_index': index,
               '_type': 'annotation',
               '_id': ann['_id'],
               'doc': updates}
def _process_links(links):
    """Fix via URLs inside a document's 'link' property.

    Accepts either a bare URL string or a list of {'href': ...} dicts
    (other keys are left untouched).

    Returns:
        (links, modified): the (possibly new) list of link dicts and a
        flag saying whether any href was rewritten.

    Raises:
        RuntimeError: when the property has an unexpected shape.
    """
    modified = False
    # Deal with situations such as: {..., 'link': 'http://...', ...}
    # NOTE: `basestring` keeps py2 unicode handling; this file targets
    # python2 per its shebang.
    if isinstance(links, basestring):
        links = [{'href': links}]
    if not isinstance(links, list):
        # "{!r}" throughout: "{:r}" is an invalid format spec and would
        # raise ValueError instead of these intended errors.
        raise RuntimeError(
            "link prop wasn't string or list: {!r}".format(links))
    for link in links:
        if not isinstance(link, dict):
            raise RuntimeError(
                "link item wasn't a dict: {!r}".format(link))
        if 'href' not in link:
            continue
        href = link['href']
        if not isinstance(href, basestring):
            raise RuntimeError(
                "link['href'] wasn't a string: {!r}".format(href))
        fixed_href = fix_url(href)
        if fixed_href is not None:
            link['href'] = fixed_href
            modified = True
    return links, modified
def main():
    """CLI entry point: rewrite via.hypothes.is URLs stored in an index."""
    parser = argparse.ArgumentParser(
        description="Update documents containing via.hypothes.is URLs")
    parser.add_argument("host")
    # Coerce to int for consistency with the companion scripts.
    parser.add_argument("port", type=int)
    parser.add_argument("index")
    parser.add_argument("--skip_documents", action='store_true')
    parser.add_argument("--skip_annotations", action='store_true')
    args = parser.parse_args()

    es = elasticsearch.Elasticsearch([{'host': args.host, 'port': args.port}])

    if not args.skip_documents:
        all_documents = fetch_all(es, args.index, 'document')
        actions = get_document_actions(args.index, all_documents)
        helpers.bulk(es, actions)

    if not args.skip_annotations:
        all_annotations = fetch_all(es, args.index, 'annotation')
        actions = get_annotation_actions(args.index, all_annotations)
        helpers.bulk(es, actions)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Possible via prefixes to be stripped off. These should be given in decreasing
# order of specificity, to ensure that the longest possible via prefix is
# removed.
PREFIX_RE = '|'.join([
    r'static/__shared/viewer/web/viewer\.html\?file=/h/id_/',
    r'static/__shared/viewer/web/viewer\.html\?file=/id_/',
    r'static/__shared/viewer/web/viewer\.html\?file=',
    r'h/\d{14}oe_/',
    r'h/\d{14}/',
    r'h/oe_/',
    r'oe_/',
    r'h/',
    r'\d{14}oe_/',
    r'\d{14}/',
])

# Hostname dots are escaped: an unescaped '.' matches any character, so
# lookalike hosts (e.g. "viaXhypothesYis") would also have been rewritten.
VIA_URL_RE = re.compile(
    r'https?://via\.hypothes\.is/(?:' + PREFIX_RE + r')?(.+)',
    re.IGNORECASE)


def fix_url(url):
    """Strip via.hypothes.is proxy prefixes from *url* and repair the result.

    Returns:
        The fixed URL, or None when *url* is not via-proxied at all
        (signifying no change is needed).
    """
    match = VIA_URL_RE.match(url)
    # If this URL isn't prefixed by via at all, then we should return None,
    # signifying no change:
    if match is None:
        return None
    url = match.group(1)

    # Strip off any duplicate via prefixes
    while True:
        match = VIA_URL_RE.match(url)
        if match is None:
            break
        url = match.group(1)

    # '//example.com' -> 'http://example.com'
    if url.startswith('//'):
        return 'http:' + url

    # 'https:/example.com' -> 'https://example.com'
    match = re.match(r'(https?):/([^/].*)', url, re.IGNORECASE)
    if match is not None:
        return match.group(1) + '://' + match.group(2)

    # No recognizable scheme at all -> assume plain http.
    match = re.match(r'[a-z-]+://(.*)', url, re.IGNORECASE)
    if match is None:
        return 'http://' + url
    return url
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pytest | |
from fixurls.via import fix_url | |
# (input_url, expected_output) pairs for fix_url; expected None means
# "no change needed".
FIXTURES = [
    # Don't touch links to the via homepage
    ('https://via.hypothes.is', None),
    ('https://via.hypothes.is/', None),
    # Don't touch all kinds of other normal links
    ('https://example.com/foo/bar', None),
    ('android-app://com.google.android.youtube/...', None),
    ('http://example.com/foo.pdf', None),
    ('https://w3c-social.github.io/activitypump/', None),
    ('https://example.com/https://via.hypothes.is', None),
    # Normal via links
    ('https://via.hypothes.is/https://example.com',
     'https://example.com'),
    ('https://via.hypothes.is/https://example.com/foo/bar',
     'https://example.com/foo/bar'),
    ('https://via.hypothes.is/http://example.com/foo/bar',
     'http://example.com/foo/bar'),
    # Insecure via links
    ('http://via.hypothes.is/https://example.com',
     'https://example.com'),
    ('http://via.hypothes.is/https://example.com/foo/bar',
     'https://example.com/foo/bar'),
    ('http://via.hypothes.is/http://example.com/foo/bar',
     'http://example.com/foo/bar'),
    # Incorrect case via links
    ('http://VIA.Hypothes.IS/http://example.com/foo/bar',
     'http://example.com/foo/bar'),
    # Phone app links (yes, really)
    ('https://via.hypothes.is/android-app://com.google.android.youtube/...',
     'android-app://com.google.android.youtube/...'),
    # Links missing scheme
    ('https://via.hypothes.is/www.artbusiness.com',
     'http://www.artbusiness.com'),
    ('https://via.hypothes.is/devblog.avdi.org/2015/',
     'http://devblog.avdi.org/2015/'),
    ('https://via.hypothes.is/tapcore.com/...',
     'http://tapcore.com/...'),
    ('https://via.hypothes.is///codegeekz.com/wp-content/uploads/codegeekz-favicon.png',
     'http://codegeekz.com/wp-content/uploads/codegeekz-favicon.png'),
    # Strange prefixes
    ('https://via.hypothes.is/h/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/oe_/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/h/oe_/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/20150520202836/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/20150520202836oe_/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/h/20150520202836/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/h/20150520202836oe_/http://example.com',
     'http://example.com'),
    ('https://via.hypothes.is/static/__shared/viewer/web/viewer.html?file=http://example.com/foo.pdf',
     'http://example.com/foo.pdf'),
    ('https://via.hypothes.is/static/__shared/viewer/web/viewer.html?file=/id_/http://example.com/foo.pdf',
     'http://example.com/foo.pdf'),
    ('https://via.hypothes.is/static/__shared/viewer/web/viewer.html?file=/h/id_/http://example.com/foo.pdf',
     'http://example.com/foo.pdf'),
    # Strange prefixes with missing scheme
    ('https://via.hypothes.is/h/www.example.com',
     'http://www.example.com'),
    ('https://via.hypothes.is/20150520202836/foo.com/bar/',
     'http://foo.com/bar/'),
    ('https://via.hypothes.is/oe_///www.example.com',
     'http://www.example.com'),
    # URLs with broken prefixes
    ('https://via.hypothes.is/https:/w3c-social.github.io/activitypump/',
     'https://w3c-social.github.io/activitypump/'),
    # Double prefixed
    ('https://via.hypothes.is/https://via.hypothes.is/http://www.nytimes.com/roomfordebate/',
     'http://www.nytimes.com/roomfordebate/'),
]
@pytest.mark.parametrize(('url', 'expected'), FIXTURES)
def test_fix_url(url, expected):
    """fix_url returns the cleaned URL, or None when no change is needed."""
    assert fix_url(url) == expected
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment