This is my little batched strategy for updating really large Haystack indexes via a Celery task. It's pretty simple: each task builds a piece of the index, and then puts a task on the queue for building the next batch. I also use my little @bounce_worker_after decorator to make sure the worker is shut down (and subsequently restarted by Celery) after each batch.
import logging

from celery.signals import task_postrun
from celery.task import task
from haystack import site


def shutdown_worker(**kwargs):
    # Assumed helper (not shown in the original snippet): raising SystemExit
    # from the task_postrun handler makes the worker process exit, and celeryd
    # then restarts it.
    raise SystemExit()


def bounce_worker_after(func):
    """
    A decorator which will ensure that the celery worker is shutdown (and subsequently restarted by celery)
    after this task is executed.
    """
    task_postrun.connect(shutdown_worker, sender=func)
    return func


@bounce_worker_after
@task
def update_large_index(ModelClass, batch_start=0, batch_size=25000, total_count=None, rebuild=False):
    index = site.get_index(ModelClass)
    qs = index.index_queryset()

    # The first batch sets things up: optionally clear the old index and
    # count the total rows so later batches know when to stop.
    if batch_start == 0:
        if rebuild:
            logging.info("Deleting and rebuilding index for %s. (Note that this will be done in batches.)" % (ModelClass,))
            index.clear()
        total_count = qs.count()
        logging.info("Found %s items. So this is gonna be about %s batches" % (total_count, total_count / batch_size + 1))

    logging.info("Beginning index of batch started at %s" % (batch_start,))
    batch_end = batch_start + batch_size
    index.backend.update(index, qs[batch_start:batch_end])

    new_start = batch_end
    if new_start >= total_count:
        logging.info("Index rebuild complete!")
    else:
        # Queue the next batch rather than looping, so each batch runs in a
        # freshly bounced worker process.
        update_large_index.delay(ModelClass, new_start, batch_size, total_count)
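
Kicking off a full rebuild looks something like this (a minimal sketch; Article and the module paths are hypothetical stand-ins for any model registered with a Haystack index):

# Hypothetical usage: Article is a model with a registered search index.
from myapp.models import Article
from myapp.tasks import update_large_index

# Queue the first batch; each batch re-enqueues the next until it's done.
update_large_index.delay(Article, rebuild=True)

Because each batch enqueues its successor, only one of these tasks is ever in flight at a time, and the worker gets bounced between batches.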