Skip to content

Instantly share code, notes, and snippets.

View yashbonde's full-sized avatar
👽
Up there!

Yash Bonde yashbonde

👽
Up there!
View GitHub Profile
In this quick script we are trying to solve sharding problem:
often in very large datasets there is no way to tokenize everything and store
them. Considering the CLM datasets we have a fixed dataset where each row
has dynamic number of tokens. A dummy looks like follows:
j n sequence (w/o EOT = 42)
[0] [15] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
[1] [13] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
[2] [11] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
[3] [13] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
# @yashbonde
#
# In this quick script we are trying to solve sharding problem:
# often in very large datasets there is no way to tokenize everything and store
# them. Considering the CLM datasets we have a fixed dataset where each row
# has dynamic number of tokens. A dummy looks like follows:
#
# j n sequence (w/o EOT = 42)
# [0] [15] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
# [1] [13] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
@yashbonde
yashbonde / daily.py
Last active August 5, 2021 10:18
This file has many functions I use daily. As a gist so I can download any where!
# whole bunch of utility functions I use in day to day
# - @yashbonde / https://github.com/yashbonde
#
# Now this is a simple script and cannot be loaded like a package
# so you'll need to import it. This is how you can do it
"""
try:
from daily import *
except ImportError as e:
import requests
{"probabilities": [[1.2926712516090588e-20, 5.5255535698961467e-05, 0.0002742670476436615, 0.0006402312428690493, 0.0014720156323164701, 0.0012355875223875046, 0.001348935067653656, 0.0008120863349176943, 0.00014566878962796181, 0.00010050860146293417, 8.873591286828741e-05, 8.371012518182397e-05, 8.067757153185084e-05, 7.812779949745163e-05, 7.676590757910162e-05, 7.566258136648685e-05, 7.451995770679787e-05, 7.365801138803363e-05, 7.284935418283567e-05, 7.216513768071309e-05, 7.18772571417503e-05, 7.194354111561552e-05, 7.281054422492161e-05, 7.401074253721163e-05, 7.490790449082851e-05, 7.615067443111911e-05, 7.768635987304151e-05, 7.906524115242064e-05, 8.06819589342922e-05, 8.203649485949427e-05, 8.346716640517116e-05, 8.52410594234243e-05, 8.689863898325711e-05, 8.861553214956075e-05, 9.03504405869171e-05, 9.193049481837079e-05, 9.383008000440896e-05, 9.576052252668887e-05, 9.806567686609924e-05, 0.00010064752132166177, 0.00010472193389432505, 0.00010961966472677886, 0.00011670083767967299, 0.0001280230
{"probabilities": [[1.1141151645688296e-07, 7.109872240107507e-05, 0.00012866298493463546, 0.00016478959878440946, 0.00016841781325638294, 0.00010295069660060108, 7.951889710966498e-05, 8.092021744232625e-05, 4.021274435217492e-05, 3.11876974592451e-05, 2.906467125285417e-05, 2.7716798285837285e-05, 2.700846380321309e-05, 2.650410169735551e-05, 2.6154128136113286e-05, 2.5903316782205366e-05, 2.5689405447337776e-05, 2.53952184721129e-05, 2.510555714252405e-05, 2.4870221750461496e-05, 2.471556217642501e-05, 2.460943505866453e-05, 2.463997225277126e-05, 2.4738801585044712e-05, 2.4801340259728022e-05, 2.4910948923206888e-05, 2.5076300516957417e-05, 2.521012538636569e-05, 2.5363751774420962e-05, 2.556990693847183e-05, 2.572470657469239e-05, 2.5900726541294716e-05, 2.6104935386683792e-05, 2.624808985274285e-05, 2.651236172823701e-05, 2.6783925932249986e-05, 2.7069901989307255e-05, 2.736990609264467e-05, 2.7800717361969873e-05, 2.8264890715945512e-05, 2.8743679649778642e-05, 2.9462489692377858e-05, 3.030664629477542
{"probabilities": [[8.319987365723591e-09, 8.807858830550686e-05, 8.181072189472616e-05, 5.777460683020763e-05, 5.405166302807629e-05, 5.242229235591367e-05, 5.3901250794297084e-05, 0.00011154919775435701, 3.446875052759424e-05, 2.2755972167942673e-05, 2.00589402084006e-05, 1.883219192677643e-05, 1.8182163330493495e-05, 1.778568548616022e-05, 1.7511243640910834e-05, 1.730769145069644e-05, 1.7141845091828145e-05, 1.69992108567385e-05, 1.6905491065699607e-05, 1.6829179003252648e-05, 1.674113082117401e-05, 1.6692392819095403e-05, 1.6684442016412504e-05, 1.6706540918676183e-05, 1.672308826528024e-05, 1.6754280295572244e-05, 1.6772557501099072e-05, 1.676101965131238e-05, 1.6800377125036903e-05, 1.6810528904898092e-05, 1.6838268493302166e-05, 1.683078517089598e-05, 1.685601455392316e-05, 1.6878739188541658e-05, 1.6892756320885383e-05, 1.690590761427302e-05, 1.6895086446311325e-05, 1.6952975784079172e-05, 1.701845758361742e-05, 1.7069842215278186e-05, 1.711402364890091e-05, 1.7277752704103477e-05, 1.7525228031445295
{"probabilities": [[2.3264911419573764e-07, 5.301749297359493e-06, 2.6038098894787254e-06, 1.7164599057650776e-06, 1.0858479981834535e-06, 9.61552018452494e-07, 7.355832281064068e-07, 1.941584059750312e-06, 8.369137503905222e-06, 9.675602086645085e-06, 1.0033788385044318e-05, 1.0145812666451093e-05, 1.0224254765489604e-05, 1.033136959449621e-05, 1.0335026672692038e-05, 1.034099386743037e-05, 1.0352066965424456e-05, 1.0341838788008317e-05, 1.0354656296840403e-05, 1.0351989658374805e-05, 1.033322678267723e-05, 1.0312256563338451e-05, 1.0319741704734042e-05, 1.0330334589525592e-05, 1.0314281098544598e-05, 1.0298736924596597e-05, 1.0285924872732721e-05, 1.0259565897285938e-05, 1.025995152303949e-05, 1.025033270707354e-05, 1.0248631042486522e-05, 1.0259065675199963e-05, 1.0291999387845863e-05, 1.0291360922565218e-05, 1.0267344805470202e-05, 1.0224153811577708e-05, 1.020798026729608e-05, 1.0168583685299382e-05, 1.0163683327846229e-05, 1.0170007044507656e-05, 1.0129339898412582e-05, 1.0088480848935433e-05, 1.0064393
{"probabilities": [[4.840159939512034e-10, 1.661045098444447e-05, 0.0001381409529130906, 0.0004579269152600318, 0.00036898537655360997, 0.00036809389712288976, 0.00029120693216100335, 0.00014563219156116247, 3.4400494769215584e-05, 2.3562979549751617e-05, 2.1039057173766196e-05, 1.9975657778559253e-05, 1.9286471797386184e-05, 1.885205347207375e-05, 1.855322261690162e-05, 1.8282113160239533e-05, 1.8071341401082464e-05, 1.78573409357341e-05, 1.769024493114557e-05, 1.7547819879837334e-05, 1.7441077943658456e-05, 1.738855462463107e-05, 1.7468122678110376e-05, 1.7624051906750537e-05, 1.7679565644357353e-05, 1.7851274606073275e-05, 1.8063354218611494e-05, 1.8237777112517506e-05, 1.8433252989780158e-05, 1.858892574091442e-05, 1.8795646610669792e-05, 1.8996423023054376e-05, 1.918710768222809e-05, 1.937993147294037e-05, 1.960914596565999e-05, 1.984423397516366e-05, 2.0094759747735225e-05, 2.03862800844945e-05, 2.0699026208603755e-05, 2.108862463501282e-05, 2.167898310290184e-05, 2.2392268874682486e-05, 2.3377493562293
{"probabilities": [[1.0849407483526363e-10, 6.94548634783132e-06, 1.3681126802111976e-05, 1.95740612980444e-05, 1.8555307178758085e-05, 1.1816458936664276e-05, 1.7588121409062296e-05, 2.1660376660292968e-05, 4.139439988648519e-05, 4.466215614229441e-05, 4.503170202951878e-05, 4.5510394556913525e-05, 4.55682456959039e-05, 4.527786222752184e-05, 4.514204920269549e-05, 4.5166609197622165e-05, 4.503627133090049e-05, 4.475050445762463e-05, 4.44216129835695e-05, 4.423066275194287e-05, 4.4071344746043906e-05, 4.4062562665203586e-05, 4.4481392251327634e-05, 4.50205770903267e-05, 4.540235022432171e-05, 4.589887976180762e-05, 4.654252916225232e-05, 4.713853923021816e-05, 4.783949407283217e-05, 4.8390320444013923e-05, 4.898404949926771e-05, 4.9716421926859766e-05, 5.04347808600869e-05, 5.103492003399879e-05, 5.171795783098787e-05, 5.2446528570726514e-05, 5.318355033523403e-05, 5.406369382399134e-05, 5.513917494681664e-05, 5.616097041638568e-05, 5.7629484217613935e-05, 5.9401147154858336e-05, 6.145717634353787e-05, 6.468
{"probabilities": [[3.3102880963165204e-18, 9.984234202420339e-05, 9.484939801041037e-05, 0.00018421022105030715, 0.00020795002637896687, 0.0004247576871421188, 0.002076999982818961, 0.0023418839555233717, 8.673076808918267e-05, 3.570398621377535e-05, 2.7259036869509146e-05, 2.4256771212094463e-05, 2.253505590488203e-05, 2.1720759832533076e-05, 2.1129870219738223e-05, 2.0706023860839196e-05, 2.052182753686793e-05, 2.0418083295226097e-05, 2.0341436538728885e-05, 2.0335219232947566e-05, 2.0335233784862794e-05, 2.0322520867921412e-05, 2.0266750652808696e-05, 2.0184563254588284e-05, 2.0082203263882548e-05, 1.9988396161352284e-05, 1.9870823962264694e-05, 1.9783325114985928e-05, 1.9718472685781308e-05, 1.9645312931970693e-05, 1.956095547939185e-05, 1.9450988475000486e-05, 1.9284416339360178e-05, 1.9213652194594033e-05, 1.9178378352080472e-05, 1.9174874978489242e-05, 1.9134351532557048e-05, 1.901563700812403e-05, 1.8979766537086107e-05, 1.8924285541288555e-05, 1.8849794287234545e-05, 1.8833097783499397e-05, 1.899709