Skip to content

Instantly share code, notes, and snippets.

@abevieiramota
Created March 28, 2019 22:19
Show Gist options
  • Save abevieiramota/adf28fc5a061fb685d1160517d41317a to your computer and use it in GitHub Desktop.
Save abevieiramota/adf28fc5a061fb685d1160517d41317a to your computer and use it in GitHub Desktop.
import math
import os
# Directory that receives one text file per bucket.
buckets_dir = './all_texts_buckets'
# Create the directory if it is missing. exist_ok=True makes this a single
# call with no window between an existence check and the creation.
os.makedirs(buckets_dir, exist_ok=True)
# Upper bound on the total number of characters collected into one bucket.
# Texts are never split, so a single text longer than this still ends up in
# a bucket of its own.
n_characters = 30000
# Running character count of the bucket currently being filled.
accumulated = 0
# Positional row offsets at which a new bucket begins; they are meant to be
# used as exclusive-end / inclusive-start bounds for positional slicing.
positions = []
for i, text_len in enumerate(all_texts['ltext'].str.len()):
    accumulated += text_len
    # Row i does not fit: close the current bucket right before it, so the
    # slice [start:i] holds exactly the rows counted so far and row i opens
    # the next bucket (its length seeds the new running total). The i > 0
    # guard keeps an oversized first row from producing an empty bucket.
    if accumulated > n_characters and i > 0:
        positions.append(i)
        accumulated = text_len
# Pair every bucket's first row offset with the offset where the following
# bucket begins; the last bucket is open-ended (None) so it runs to the end.
bucket_starts = [0] + positions
bucket_stops = positions + [None]
for i, (i_begin, i_end) in enumerate(zip(bucket_starts, bucket_stops)):
    bucket = all_texts.iloc[i_begin:i_end, :]
    # One file per bucket, numbered in order, holding only the 'ltext' column
    # and written without the index.
    bucket_path = os.path.join(buckets_dir, f'all_texts_bucket_{i}.txt')
    bucket['ltext'].to_csv(bucket_path, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment