Skip to content

Instantly share code, notes, and snippets.

@rndmcnlly
Last active February 22, 2021 19:04
Show Gist options
  • Save rndmcnlly/0ccad86bf0f448e00ac713dfc3794d46 to your computer and use it in GitHub Desktop.
Save rndmcnlly/0ccad86bf0f448e00ac713dfc3794d46 to your computer and use it in GitHub Desktop.
A search engine in a gist!

This gist implements a self-contained search engine for the VisualGenome scene corpus. The user interface is built from ipywidgets and presented with Voilà. The corpus is downloaded in the postBuild script so that fresh Binder sessions don't need to wait for this each time. Ideally even more pre-processing would be done here so that the interface gets to a responsive state faster.

launch binder

set -ex
jupyter serverextension enable voila --sys-prefix
wget http://visualgenome.org/static/data/dataset/image_data.json.zip
wget http://visualgenome.org/static/data/dataset/relationships.json.zip
numpy
voila
ijson
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "english-reality",
"metadata": {},
"source": [
"# Simple Search Engine for Scenes\n",
"\n",
"A query like \"woman petting cat\" below will find scenes in the [VisualGenome dataset](http://visualgenome.org/) where there is a \"petting\" relationship with subject of type \"woman\" and object of type \"cat\" (query is split on spaces). Results are ranked by the fraction of query structure slots they match (words must match exactly to count). Because the matching is done at the level of relationships, a single scene may appear in the results multiple times when there are multiple relevant relationships in that scene."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "commercial-bangkok",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import zipfile\n",
"import ijson\n",
"import numpy as np\n",
"import ipywidgets.widgets as ipw"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "urban-group",
"metadata": {},
"outputs": [],
"source": [
"with zipfile.ZipFile(\"image_data.json.zip\") as zf:\n",
" with zf.open(\"image_data.json\") as f:\n",
" image_data_json = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "prime-flexibility",
"metadata": {},
"outputs": [],
"source": [
"triples = []\n",
"\n",
"with zipfile.ZipFile(\"relationships.json.zip\") as zf:\n",
" with zf.open(\"relationships.json\") as f:\n",
" for index, scene in enumerate(ijson.items(f, 'item')):\n",
" for rel in scene['relationships']:\n",
" \n",
" s = rel['subject']\n",
" s_name = s.get('name') or s['names'][0]\n",
" \n",
" p_name = rel['predicate']\n",
" \n",
" o = rel['object']\n",
" o_name = o.get('name') or o['names'][0]\n",
" \n",
" triples.append((index, s_name, p_name, o_name))\n",
" \n",
"#f'Extracted {len(triples)} triples from {len(image_data_json)} scenes.'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "voluntary-equality",
"metadata": {},
"outputs": [],
"source": [
"query_widget = ipw.Text(value=\"woman petting cat\")\n",
"search_widget = ipw.Button(description='Search')\n",
"result_widget = ipw.VBox()\n",
"\n",
"\n",
"def score(q, d):\n",
" return ((q[1]==d[1]) + (q[2]==d[2]) + (q[3]==d[3]))/3\n",
"\n",
"def clicked_search(_):\n",
" \n",
" search_widget.disabled = True\n",
" \n",
" s_name, p_name, o_name = query_widget.value.split(' ')\n",
" query = (-1,s_name, p_name, o_name)\n",
" \n",
" result_widget.children = ()\n",
" \n",
" triple_scores = []\n",
" for triple in triples:\n",
" triple_scores.append(score(query, triple))\n",
" \n",
" triple_scores = np.array(triple_scores)\n",
" \n",
" top_indexes = np.argsort(-triple_scores)[:10]\n",
" result_chunks = []\n",
" for top_index in top_indexes:\n",
" item_score = triple_scores[top_index]\n",
" item_scene_index = triples[top_index][0]\n",
" item_url = image_data_json[item_scene_index]['url']\n",
" item = ipw.HTML(value=f'<a href=\"{item_url}\"><img src=\"{item_url}\" width=\"100\"></a>')\n",
" result_chunks.append(item)\n",
" result_widget.children = tuple(result_chunks)\n",
" \n",
" search_widget.disabled = False\n",
" \n",
"\n",
"search_widget.on_click(clicked_search)\n",
"\n",
" \n",
"ipw.VBox([\n",
" ipw.HBox([query_widget,search_widget]),\n",
" result_widget])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment