Skip to content

Instantly share code, notes, and snippets.

@mayrop
Last active July 31, 2019 13:24
Show Gist options
  • Save mayrop/a5dbe55e78fc355e1738d05ccb5b4cd4 to your computer and use it in GitHub Desktop.
Save mayrop/a5dbe55e78fc355e1738d05ccb5b4cd4 to your computer and use it in GitHub Desktop.
import numpy as np...
import numpy as np
import pandas as pd
import re
def get_patterns(ids, debug=False):
    """Return the runs of characters that every id shares at the same position.

    Each id is converted with ``str`` and left-padded with zeros to the
    length of the longest resulting string, so the ids can be compared
    column by column. A column is "constant" when all ids hold the same
    character there; adjacent constant columns are merged into a single
    pattern string.

    Args:
        ids: Iterable of values to compare (e.g. a NumPy array of numbers).
        debug: When true, print the intermediate column analysis.

    Returns:
        A list of pattern strings ordered left to right. The list is empty
        when ``ids`` is empty or when no column is constant.
    """
    strings = [str(value) for value in ids]
    if not strings:
        return []

    # Pad to the longest *string* rather than to the string of the largest
    # value: a smaller number can have a longer textual form (1.125 vs 10.5),
    # which would otherwise leave rows of unequal length.
    width = max(map(len, strings))
    padded = [text.zfill(width) for text in strings]

    # Transpose rows into columns and keep the columns holding exactly one
    # distinct character, remembering where each one sits.
    constant = [
        (position, column[0])
        for position, column in enumerate(zip(*padded))
        if len(set(column)) == 1
    ]

    # Merge constant columns that are directly adjacent into one run. Runs
    # are tracked by column position, so gaps of any width between them are
    # handled uniformly (including a run that starts at column 0).
    runs = []
    previous = None
    for position, char in constant:
        if previous is not None and position == previous + 1:
            runs[-1].append(char)
        else:
            runs.append([char])
        previous = position

    patterns = [''.join(run) for run in runs]

    if debug:
        print(constant)
        print(patterns)
        print("Number of sequences: ", len(patterns))

    return patterns
def get_suffixes(ids):
    """Return, for each shared pattern, the text that follows it in every id.

    The patterns come from ``get_patterns(ids)``. For each pattern the
    string form of every id is matched against ``^.*<pattern>(.*)$``; the
    leading ``.*`` is greedy, so the captured suffix is whatever follows the
    last occurrence of the pattern. Ids whose string does not contain the
    pattern are left out of that pattern's list.

    Args:
        ids: Iterable of values; each is converted with ``str``.

    Returns:
        A list with one entry per pattern, each entry being the list of
        captured suffixes.

    Side effects:
        Prints the search template for every pattern.
    """
    ids_string = [str(value) for value in ids]
    suffixes = []
    for pattern in get_patterns(ids):
        # Echo the template being searched; the pattern is shown raw so the
        # output stays readable.
        print("^.*{}(.*)$".format(pattern))
        # Patterns are literal text and often contain '.', which is a regex
        # wildcard; escape them so they only match themselves.
        regex = re.compile("^.*{}(.*)$".format(re.escape(pattern)))
        # Match each line once and reuse the result.
        matches = (regex.match(line) for line in ids_string)
        suffixes.append([found.group(1) for found in matches
                         if found is not None])
    return suffixes
# --- Example 1: six-digit ids stored as floats ---------------------------
ids = np.array([
    417118.0, 417119.0, 417213.0, 417166.0, 417167.0, 417168.0,
    417210.0, 517066.0, 517004.0, 517005.0, 517059.0,
])
# The return value is not printed here (it is only displayed in an
# interactive session).
get_patterns(ids)
# get_suffixes prints one search template per pattern:
#   ^.*17(.*)$
#   ^.*.0(.*)$
get_suffixes(ids)
# ...and returns one list of suffixes per pattern:
#   [['118.0', '119.0', '213.0', '166.0', '167.0', '168.0',
#     '210.0', '066.0', '004.0', '005.0', '059.0'],
#    ['', '', '', '', '', '', '', '', '', '', '']]

# --- Example 2: longer ids with a fractional part ------------------------
ids = np.array([
    41117118.10, 41117119.10, 41117213.10, 41117166.10,
    41117167.10, 41117168.10, 41117210.10, 51117066.10,
    51117004.10, 51117005.10, 51117059.10,
])
print(get_patterns(ids))
# ['1117', '.1']

# --- Example 3: a shared run at the very start of the ids ----------------
ids = np.array([
    12341117118.10, 12341117119.10, 12341117213.10, 12341117166.10,
    12341117167.10, 12341117168.10, 12341117210.10, 12351117066.10,
    12351117004.10, 12351117005.10, 12351117059.10,
])
print(get_patterns(ids))
# ['123', '1117', '.1']

# --- Example 4: no digit position is the same across these ids -----------
ids = np.array([123, 345, 678])
get_patterns(ids)
get_suffixes(ids)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment