Skip to content

Instantly share code, notes, and snippets.

View eightysteele's full-sized avatar

Aaron Steele eightysteele

View GitHub Profile
# English stop words, kept in alphabetical order (119 entries).
STOP_WORDS = [
    'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am',
    'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been',
    'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either',
    'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have',
    'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into',
    'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me',
    'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of', 'off',
    'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say',
    'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the',
    'their', 'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to',
    'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when',
    'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
    'would', 'yet', 'you', 'your',
]
STOP_CONCEPTS = ['acceptednameusageid', 'accessrights', 'basisofrecord', 'collectionid', 'coordinateprecision', 'coordinateuncertaintyinmeters', 'datasetid', 'dateidentified', 'day', 'decimallatitude', 'decimallongitude', 'disposition', 'enddayofyear', 'eventdate', 'eventid', 'eventtime', 'fieldnotes', 'footprintspatialfit', 'footprintsrs', 'footprintwkt', 'geologicalcontextid', 'georeferenceremarks', 'georeferenceverificationstatus', 'highergeographyid', 'identificationid', 'individualcount', 'individualid', 'institutionid', 'language', 'locationid', 'maximumdepthinmeters', 'maximumdistanceabovesurfaceinmeters', 'maximumelevationinmeters', 'minimumdepthinmeters', 'minimumdistanceabovesurfaceinmeters', 'minimumelevationinmeters', 'modified', 'month', 'nameaccordingtoid', 'namepublishedinid', 'nomenclaturalcode', 'occurrencedetails', 'occurrenceid', 'originalnameusageid', 'parentnameusageid', 'pointradiusspatialfit', 'rights', 'rightsholder', 'scientificnameid', 'startdayofyear', 'taxonconceptid', 'taxonid', '
>>> sp = []
>>> for x in props.split('\n'):
...     if x.split()[0] == 'X':
...         sp.append(x.split()[1].lower())
...
>>> sp.sort()
>>> sp
def add_dynamic_properties(input_dict, instance, bulkload_state_copy):
    """Copies lower-cased values from input_dict onto instance, in place.

    Keys found in PROPERTIES_STOP_CONCEPTS are skipped. A value that cannot
    be lower-cased or stored is left out (best effort). Afterwards the
    'rechash' and 'reckey' entries are popped from instance.

    Arguments:
        input_dict - mapping of property name to value
        instance - mapping-like object updated in place; 'rechash' and
            'reckey' are popped without a default, so for a plain dict a
            missing key raises KeyError
        bulkload_state_copy - not used by this function
    """
    # .items() works on both Python 2 and Python 3 dicts, unlike .iteritems().
    for key, value in input_dict.items():
        if key in PROPERTIES_STOP_CONCEPTS:
            continue
        try:
            instance[key] = value.lower()
        except Exception:
            # Deliberate best effort: values without .lower() (or values the
            # instance rejects) are skipped. Catching Exception rather than
            # using a bare except lets KeyboardInterrupt and SystemExit
            # propagate.
            pass
    instance.pop('rechash')
    instance.pop('reckey')
    # NOTE(review): nothing is returned here. If a caller expects the updated
    # instance back, confirm whether a `return instance` is missing.
def get_corpus_list():
def wrapper(value, bulkload_state):
"""Returns list of unique words in the entire record.
Arguments:
value - the JSON encoded record
"""
d = bulkload_state.current_dictionary
recjson = simplejson.loads(value)
d.update(recjson)
# Stop words not to index
STOP_WORDS = [
'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am',
'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been',
'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either',
'else', 'ever', 'every', 'for', 'from', 'get', 'got', 'had', 'has', 'have',
'he', 'her', 'hers', 'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into',
'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may', 'me',
'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of', 'off',
'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say',
DWC_ALIAS = dict(
acceptedNameUsage='anu',
acceptedNameUsageID='anuid',
accessRights='ar',
associatedMedia='am',
associatedOccurrences='ao',
associatedReferences='ar',
associatedSequences='as',
associatedTaxa='at',
basisOfRecord='br',
$ fdisk -l
WARNING: GPT (GUID Partition Table) detected on '/dev/sda'! The util fdisk doesn't support GPT. Use GNU Parted.
Disk /dev/sda: 250.1 GB, 250059350016 bytes
255 heads, 63 sectors/track, 30401 cylinders
Units = cylinders of 16065 * 512 = 8225280 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
class Cell(model.Model):
"""Models a CouchDB cell document.
key_name - The cell key (e.g., 1-2).
"""
rev = model.StringProperty('r')
coords = model.StringProperty('c')
varvals = model.TextProperty('v')
def __eq__(self, other):
@classmethod
def create(cls, xmin, ymax, xmax, ymin):
    """Alternate constructor taking four coordinates instead of two Points.

    Arguments:
        xmin, ymax - coordinates passed to the first Point
        xmax, ymin - coordinates passed to the second Point

    Returns cls(Point(xmin, ymax), Point(xmax, ymin)).
    """
    first_corner = Point(xmin, ymax)
    second_corner = Point(xmax, ymin)
    return cls(first_corner, second_corner)