Skip to content

Instantly share code, notes, and snippets.

@mattes
Last active November 14, 2018 17:29
Show Gist options
  • Save mattes/9646173 to your computer and use it in GitHub Desktop.
Save mattes/9646173 to your computer and use it in GitHub Desktop.
class InitCSVHandler(webapp2.RequestHandler):
    """HTTP entry point that schedules a CSV export for a single day.

    The export itself runs in a push task posted to ``/create_csv`` on the
    ``csv`` queue; this handler only validates the request and enqueues it.
    """

    def get(self):
        """Enqueue a CSV export task for the day given by ``fetched``.

        Query parameters:
            fetched: the day to export, formatted ``YYYY-MM-DD``.

        Writes ``"no fetched date"`` when the parameter is missing or empty,
        ``"invalid fetched date"`` when it does not parse as ``YYYY-MM-DD``,
        and ``"creating <fetched>"`` once the task has been enqueued.
        """
        fetched = self.request.get('fetched')
        if not fetched:
            self.response.write("no fetched date")
            return

        # The worker parses this value with the same format string. Reject
        # anything it could not parse here, instead of enqueueing a task that
        # can only fail.
        try:
            datetime.strptime(fetched, "%Y-%m-%d")
        except ValueError:
            self.response.write("invalid fetched date")
            return

        taskqueue.add(
            url='/create_csv',
            queue_name='csv',
            params={'fetched': fetched},
        )
        self.response.write("creating " + fetched)
class CreateCSVHandler(webapp2.RequestHandler):
    """Task worker that dumps one day of ``Page`` entities to a CSV file.

    Every ``Page`` whose ``fetched`` timestamp falls within the requested day
    is written as one line to ``/csv123/<fetched>.csv`` through ``gcs``.
    Each line has three ``;``-separated fields:

    1. ``page.fetched`` formatted as ``YYYY-MM-DD HH:MM:SS``
    2. ``page.url``, percent-quoted with ``urllib.quote``
    3. ``page.html``, UTF-8 encoded, zlib-compressed (level 9), then base64
    """

    # Number of entities requested per ``fetch_page`` call.
    _BATCH_SIZE = 50

    @staticmethod
    def _csv_row(page):
        """Return the newline-terminated CSV line for *page*."""
        url = page.url
        # Python 2's urllib.quote raises KeyError on unicode text containing
        # non-ASCII characters, so hand it UTF-8 bytes (same encoding the
        # HTML gets below). Byte strings are passed through untouched.
        if not isinstance(url, str):
            url = url.encode('utf-8')
        fields = (
            page.fetched.strftime("%Y-%m-%d %H:%M:%S"),
            urllib.quote(url),
            base64.b64encode(zlib.compress(page.html.encode('utf-8'), 9)),
        )
        return ";".join(fields) + "\n"

    def post(self):
        """Export all pages fetched on the day named by the ``fetched`` param.

        Request parameters:
            fetched: the day to export, formatted ``YYYY-MM-DD``.

        A missing or malformed ``fetched`` is logged and the request returns
        normally without writing anything.
        """
        raw_fetched = self.request.get('fetched')
        try:
            day_start = datetime.strptime(raw_fetched, "%Y-%m-%d")
        except ValueError:
            # Letting this propagate would fail the request; App Engine
            # retries push tasks that do not return 2xx, and a bad parameter
            # can never succeed on retry. Log it and drop the task instead.
            logging.error("invalid fetched date: %r", raw_fetched)
            return

        ctx = ndb.get_context()
        # The policy function returns False for keys of kind 'Page'.
        # NOTE(review): presumably this keeps exported entities out of ndb's
        # in-context cache so memory stays flat; confirm.
        ctx.set_cache_policy(lambda key: key.kind() != 'Page')

        # Half-open range [day_start, day_start + 1 day).
        pages_query = Page.query()
        pages_query = pages_query.filter(Page.fetched >= day_start)
        pages_query = pages_query.filter(
            Page.fetched < day_start + timedelta(days=1))

        filename = "/csv123/" + raw_fetched + ".csv"
        logging.info("filename: %s", filename)

        # NOTE(review): intentionally not wrapped in try/finally. Closing
        # presumably finalizes the object, so closing after a mid-export
        # failure would publish a truncated CSV; confirm against the GCS
        # client library docs.
        gcs_file = gcs.open(filename, 'w', content_type="text/plain")

        cursor = None
        while True:
            pages, next_cursor, more = pages_query.fetch_page(
                self._BATCH_SIZE, start_cursor=cursor)

            for page in pages:
                gcs_file.write(self._csv_row(page))

            # Also stop when no cursor came back: continuing with
            # cursor=None would restart the query from the first page and
            # never terminate.
            if not (more and next_cursor):
                break
            cursor = next_cursor

        gcs_file.close()

        gc.collect()
        ctx.clear_cache()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment