Skip to content

Instantly share code, notes, and snippets.

@adjam
Created September 1, 2016 00:48
Show Gist options
  • Save adjam/bf5af489de1a520e0dae45511a8353fb to your computer and use it in GitHub Desktop.
Save adjam/bf5af489de1a520e0dae45511a8353fb to your computer and use it in GitHub Desktop.
Example of how to iterate over Solr results as if they were just a list of documents
#!/usr/bin/python
from random import randint
class QueryResult(object):
    """Mock representation of a set of matching results to a query.

    The total hit count is picked at random once, at construction time, so
    every page requested from the same instance reports the same
    ``numFound``.
    """

    def __init__(self, query, max_results):
        """Store *query* and pick a random hit count in ``[1, max_results]``.

        Raises:
            ValueError: propagated from ``randint`` if ``max_results < 1``.
        """
        self.query = query
        self.count = randint(1, max_results)

    def get_results(self, start, page_size=10):
        """Return one page of results as a dict that looks like a Solr page.

        Args:
            start: zero-based offset of the first document wanted. Negative
                offsets are treated as 0 so no bogus document ids are made.
            page_size: maximum number of documents in the page; a value of
                zero or less yields an empty page.

        Returns:
            A dict with the keys ``query``, ``start``, ``numFound`` and
            ``docs``; ``docs`` is a list of ``{"id", "title"}`` dicts whose
            numbering is one-based (offset 0 is ``doc-1``).
        """
        first = max(start, 0)
        # never run past the total number of hits
        stop = min(first + page_size, self.count)
        docs = [
            {"id": "doc-{}".format(offset + 1),
             "title": "Title #{}".format(offset + 1)}
            for offset in range(first, stop)
        ]
        # this dict looks sort of like a page of results from Solr
        return {"query": self.query, "start": first,
                "numFound": self.count, "docs": docs}
class MockServer(object):
    """Mock Solr server.

    Remembers the doc count for each query it has seen and can return
    results in a given batch size.
    """

    def __init__(self, doc_count=5000):
        """Create a server whose queries match at most *doc_count* docs."""
        self.doc_count = doc_count
        # query string -> QueryResult, so repeated requests for the same
        # query keep reporting the same number of hits
        self.active_queries = {}

    def query(self, query, start=0, page_size=10):
        """Return one page of results for *query*.

        Args:
            query: the query string; also the key under which the result
                set is remembered.
            start: offset of the first document to return.
            page_size: maximum number of documents in the page.

        Returns:
            Whatever ``QueryResult.get_results(start, page_size)`` returns
            for the remembered (or newly created) result set.
        """
        result = self.active_queries.get(query)
        if result is None:
            # first time this query is seen: create and remember its results
            result = QueryResult(query, self.doc_count)
            self.active_queries[query] = result
        return result.get_results(start, page_size)
# everything above this line is a simulation of a thing that delivers
# results in chunks like a Solr server
def iterate_results(query, page_size):
    """Yield each document matching *query*, fetching pages behind the scenes.

    Wrapper over the mocks above that can be iterated over to retrieve each
    matching document in turn, so the caller never deals with paging.

    Args:
        query: the query string handed to the server.
        page_size: number of documents requested per round trip.

    Yields:
        The document dicts found in the ``docs`` list of each page, in order.
    """
    # with real Solr library, the next line would be where you establish
    # a connection to the server
    server = MockServer()
    # initial query; gives us number of hits and a batch of results
    results = server.query(query, 0, page_size)
    num_found = results['numFound']
    print("Initial query says {} hits for '{}'".format(num_found,query))
    while True:
        docs = results['docs']
        for doc in docs:
            yield doc
        last_index = results['start'] + len(docs)
        # stop once everything has been seen; also stop on an empty page,
        # which would otherwise make no progress and loop forever
        if not docs or last_index >= num_found:
            break
        results = server.query(query, last_index, page_size)
# and now ... we iterate!
if __name__ == '__main__':
    import sys

    # invocation: ./pager.py [query] [page size]
    # a missing or empty first argument falls back to the default query
    query = 'some_query'
    if len(sys.argv) > 1 and sys.argv[1]:
        query = sys.argv[1]

    page_size = 10
    if len(sys.argv) > 2:
        try:
            page_size = int(sys.argv[2])
        except ValueError:
            # best effort: keep the default, but say so instead of hiding it
            sys.stderr.write(
                "ignoring invalid page size {!r}; using {}\n".format(
                    sys.argv[2], page_size))

    result_modulus = 12
    print("printing every {}th result".format(result_modulus))
    last_doc = None
    for index, doc in enumerate(iterate_results(query, page_size)):
        if index % result_modulus == 0:
            print("\t{}".format(doc['title']))
        last_doc = doc
    # nothing to report if the query produced no documents at all
    if last_doc is not None:
        print("Last doc: {}".format(last_doc['title']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment