Python scripts to compare Solr and Elasticsearch response times for the same set of InterPro queries: one script records Elasticsearch's `took` value per search, the other records Solr's `QTime`, and each prints a timing table when the run finishes.
The first script targets Elasticsearch, using terms aggregations (with cardinality for distinct counts and top_hits to emulate Solr's result grouping):
import unittest
import http.client
import json


class InterproElasticTest(unittest.TestCase):
    """Benchmarks the InterPro Elasticsearch index."""

    server = "hmmer-prod-db01"
    port = 9200
    response_times = []
    # Lucene-style query strings (already URL-encoded) mapped to report labels.
    queries = {
        "*:*": "Number of Docs per entry_db",
        "protein_db:s": "Number of Swissprot Docs per entry_db",
        "protein_db:t": "Number of Trembl Docs per entry_db",
        "protein_acc:a0a0a2t3z9": "Number of Docs with protein Acc per entry_db",
        "structure_acc:*": "Number of Docs with structure per entry_db",
        "structure_acc:3nyw": "Number of Docs with structure Acc per entry_db",
        "!entry_db:interpro%20AND%20!integrated:*": "Number of Unintegrated Docs per entry_db",
    }

    def setUp(self):
        self._connection = http.client.HTTPConnection(self.server, self.port)

    def tearDown(self):
        self._connection.close()

    @classmethod
    def tearDownClass(cls):
        # Print the timing table collected across all tests.
        print("\n" + ("*" * 80))
        print("-= ElasticSearch response times =-".center(80) + "\n")
        for k, v in cls.response_times:
            print("{:>68}: {:>8}".format(k, v))

    def test_elastic_index_exists(self):
        self._connection.request("GET", "/interpro")
        response = self._connection.getresponse()
        self.assertEqual(response.status, 200, "The response should be OK")

    def test_elastic_search_returns_json(self):
        self._connection.request("GET", "/interpro/relationship/_search")
        response = self._connection.getresponse()
        data = response.read().decode()
        try:
            json.loads(data)
        except json.JSONDecodeError:
            self.fail("The document is not JSON")

    def _elastic_json_query(self, q, query_obj):
        # Elasticsearch accepts a JSON body on GET _search requests.
        self._connection.request(
            "GET",
            "/interpro/relationship/_search?pretty&q=" + q,
            json.dumps(query_obj)
        )
        response = self._connection.getresponse()
        self.assertEqual(response.status, 200, "The response should be OK")
        data = response.read().decode()
        try:
            obj = json.loads(data)
            self.assertIn("took", obj)
            self.assertIn("hits", obj)
            self.assertIn("aggregations", obj)
            return obj
        except json.JSONDecodeError:
            self.fail("The document is not JSON")

    def test_number_of_docs_per_entry_db(self):
        facet = {
            "aggs": {
                "rscount": {
                    "terms": {"field": "entry_db"}
                }
            },
            "size": 0
        }
        for q, tag in self.queries.items():
            response = self._elastic_json_query(q, facet)
            self.response_times.append((tag, response["took"]))

    def test_number_of_unique_entries_per_entry_db(self):
        facet = {
            "aggs": {
                "rscount": {
                    "terms": {"field": "entry_db"},
                    # Count distinct entry accessions within each database bucket.
                    "aggs": {
                        "unique_entries": {
                            "cardinality": {"field": "entry_acc"}
                        }
                    }
                }
            },
            "size": 0
        }
        for q, tag in self.queries.items():
            response = self._elastic_json_query(q, facet)
            self.response_times.append((tag + " (unique entries)", response["took"]))

    def test_number_of_unique_proteins_per_entry_db(self):
        facet = {
            "aggs": {
                "rscount": {
                    "terms": {"field": "entry_db"},
                    "aggs": {
                        "unique_proteins": {
                            "cardinality": {"field": "protein_acc"}
                        }
                    }
                }
            },
            "size": 0
        }
        for q, tag in self.queries.items():
            response = self._elastic_json_query(q, facet)
            self.response_times.append((tag + " (unique proteins)", response["took"]))

    def test_grouping_entries(self):
        fq = "{}:*%20AND%20{}_acc:{}".format("entry_acc", "protein", "protein_64440985")
        for q, tag in self.queries.items():
            response = self._elastic_group_query(q, "entry_acc", 1, 0)
            self.response_times.append((tag + " (group)", response["took"]))
            response = self._elastic_group_query(q, "entry_acc", 1, 0, fq)
            self.response_times.append((tag + " (group+fq)", response["took"]))

    def _elastic_group_query(self, q, field, rows, start, fq=""):
        # Emulate Solr result grouping: a terms aggregation with a top_hits
        # sub-aggregation. `start` is kept for signature parity with the Solr
        # version; pagination is not applied here.
        query_obj = {
            "size": 0,
            "aggs": {
                "by_entry": {
                    "terms": {
                        "field": field,
                        "size": rows
                    },
                    "aggs": {
                        "tops": {
                            "top_hits": {"size": 1}
                        }
                    }
                }
            }
        }
        if fq != "":
            fq = "%20AND%20" + fq
        self._connection.request(
            "GET",
            "/interpro/relationship/_search?pretty&q=" + q + fq,
            json.dumps(query_obj)
        )
        response = self._connection.getresponse()
        self.assertEqual(response.status, 200, "The response should be OK")
        data = response.read().decode()
        try:
            obj = json.loads(data)
            self.assertIn("took", obj)
            self.assertIn("hits", obj)
            self.assertIn("aggregations", obj)
            return obj
        except json.JSONDecodeError:
            self.fail("The document is not JSON")


if __name__ == '__main__':
    unittest.main()
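For reference, the grouping emulation above (a terms aggregation wrapping a top_hits sub-aggregation) can be exercised on its own. This is a minimal sketch, not part of the original gist: the host is a placeholder (localhost here), and it assumes the same interpro index and fields as the test class.

import http.client
import json

# Standalone run of the terms + top_hits query built in _elastic_group_query.
# Host and index are assumptions; adjust to your cluster.
body = {
    "size": 0,
    "aggs": {
        "by_entry": {
            "terms": {"field": "entry_acc", "size": 1},
            "aggs": {"tops": {"top_hits": {"size": 1}}}
        }
    }
}
conn = http.client.HTTPConnection("localhost", 9200)
conn.request("GET", "/interpro/relationship/_search?q=*:*", json.dumps(body))
result = json.loads(conn.getresponse().read().decode())
print(result["took"])  # the same millisecond figure the tests record
for bucket in result["aggregations"]["by_entry"]["buckets"]:
    print(bucket["key"], bucket["doc_count"])
conn.close()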
The second script runs the equivalent queries against Solr, using the JSON Facet API and result grouping, and records each response's `QTime`:
import unittest
import http.client
import json
import urllib.parse


class InterproSolrTest(unittest.TestCase):
    """Benchmarks the InterPro Solr core with the same set of queries."""

    server = "hmmer-prod-db02"
    port = 8983
    response_times = {}
    # Solr query strings mapped to report labels.
    queries = {
        "*:*": "Number of Docs per entry_db",
        "protein_db:swissprot": "Number of Swissprot Docs per entry_db",
        "protein_db:trembl": "Number of Trembl Docs per entry_db",
        "protein_acc:protein_64440985": "Number of Docs with protein Acc per entry_db",
        "structure_acc:*": "Number of Docs with structure per entry_db",
        "structure_acc:protein_32860": "Number of Docs with structure Acc per entry_db",
        "!entry_db:interpro && !integrated:*": "Number of Unintegrated Docs per entry_db",
    }

    def setUp(self):
        self._connection = http.client.HTTPConnection(self.server, self.port)

    def tearDown(self):
        self._connection.close()

    @classmethod
    def tearDownClass(cls):
        # Print the timing table collected across all tests.
        print("\n*********\n-= Solr response times =-\n")
        for k, v in cls.response_times.items():
            print("{:>60}: {:>8}".format(k, v))

    def test_solr_core_exists(self):
        self._connection.request("GET", "/solr/generated/select")
        response = self._connection.getresponse()
        self.assertEqual(response.status, 200, "The response should be OK")

    def test_solr_core_returns_json(self):
        self._connection.request("GET", "/solr/generated/select?indent=on&wt=json")
        response = self._connection.getresponse()
        data = response.read().decode()
        try:
            json.loads(data)
        except json.JSONDecodeError:
            self.fail("The document is not JSON")

    def _solr_json_query(self, q, json_facet):
        params = {
            "indent": "on",
            "wt": "json",
            "q": q,
            "rows": "0",
            "facet": "on",
            # Serialize explicitly rather than relying on Solr's lenient
            # parser accepting Python's dict repr.
            "json.facet": json.dumps(json_facet)
        }
        self._connection.request("GET", "/solr/generated/select?" + urllib.parse.urlencode(params))
        response = self._connection.getresponse()
        self.assertEqual(response.status, 200, "The response should be OK")
        data = response.read().decode()
        try:
            obj = json.loads(data)
            self.assertIn("responseHeader", obj)
            self.assertIn("response", obj)
            self.assertIn("facets", obj)
            return obj
        except json.JSONDecodeError:
            self.fail("The document is not JSON")

    @unittest.skip("yes")
    def test_number_of_docs_per_entry_db(self):
        facet = {
            "databases": {
                "type": "terms",
                "field": "entry_db",
            }
        }
        for q, tag in self.queries.items():
            response = self._solr_json_query(q, facet)
            self.response_times[tag] = response["responseHeader"]["QTime"]

    @unittest.skip("yes")
    def test_number_of_unique_entries_per_entry_db(self):
        facet = {
            "databases": {
                "type": "terms",
                "field": "entry_db",
                # unique() is Solr's counterpart to the Elasticsearch
                # cardinality aggregation.
                "facet": {
                    "unique": "unique(entry_acc)"
                }
            }
        }
        for q, tag in self.queries.items():
            response = self._solr_json_query(q, facet)
            self.response_times[tag + " (unique entries)"] = response["responseHeader"]["QTime"]

    @unittest.skip("yes")
    def test_number_of_unique_proteins_per_entry_db(self):
        facet = {
            "databases": {
                "type": "terms",
                "field": "entry_db",
                "facet": {
                    "unique": "unique(protein_acc)"
                }
            }
        }
        for q, tag in self.queries.items():
            response = self._solr_json_query(q, facet)
            self.response_times[tag + " (unique proteins)"] = response["responseHeader"]["QTime"]

    def test_grouping_entries(self):
        fq = "{}:* && {}_acc:{}".format("entry_acc", "protein", "protein_64440985")
        for q, tag in self.queries.items():
            response = self._solr_group_query(q, "entry_acc", 1, 0)
            self.response_times[tag + " (group)"] = response["responseHeader"]["QTime"]
            response = self._solr_group_query(q, "entry_acc", 1, 0, fq)
            self.response_times[tag + " (group+fq)"] = response["responseHeader"]["QTime"]

    def _solr_group_query(self, q, field, rows, start, fq=None):
        # Native Solr result grouping; ngroups adds the total group count.
        params = {
            "indent": "on",
            "wt": "json",
            "group": "true",
            "group.field": field,
            "group.ngroups": "true",
            "rows": rows,
            "start": start,
            "q": q,
        }
        if fq is not None:
            params["fq"] = fq.lower()
        self._connection.request("GET", "/solr/generated/select?" + urllib.parse.urlencode(params))
        response = self._connection.getresponse()
        self.assertEqual(response.status, 200, "The response should be OK")
        data = response.read().decode()
        try:
            obj = json.loads(data)
            self.assertIn("responseHeader", obj)
            self.assertIn("grouped", obj)
            return obj
        except json.JSONDecodeError:
            self.fail("The document is not JSON")


if __name__ == '__main__':
    unittest.main()
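The Solr side can be exercised the same way. Again a minimal sketch rather than part of the gist, with host and core name as placeholder assumptions:

import http.client
import json
import urllib.parse

# Standalone run of the JSON facet used in the unique-entries test.
# Host and core name are assumptions; adjust to your deployment.
facet = {
    "databases": {
        "type": "terms",
        "field": "entry_db",
        "facet": {"unique": "unique(entry_acc)"}
    }
}
params = urllib.parse.urlencode({
    "wt": "json",
    "q": "*:*",
    "rows": "0",
    "json.facet": json.dumps(facet),
})
conn = http.client.HTTPConnection("localhost", 8983)
conn.request("GET", "/solr/generated/select?" + params)
result = json.loads(conn.getresponse().read().decode())
print(result["responseHeader"]["QTime"])  # Solr's own timing, in milliseconds
for bucket in result["facets"]["databases"]["buckets"]:
    print(bucket["val"], bucket["count"], bucket["unique"])
conn.close()

Either script can be run directly (it ends with unittest.main()); the timing table is printed once per run by tearDownClass.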