Last active
July 31, 2019 13:24
-
-
Save mayrop/a5dbe55e78fc355e1738d05ccb5b4cd4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np... | |
import numpy as np | |
import pandas as pd | |
import re | |
def get_patterns(ids, debug=False):
    """Return the character runs shared by every id at the same position.

    Each id is converted with ``str`` and left-padded with zeros to the
    length of the longest string, so all ids line up as rows of a character
    grid. A column is "constant" when every row holds the same character in
    it. Adjacent constant columns are merged into one run, and the text of
    each run is returned in left-to-right order.

    Parameters
    ----------
    ids : iterable
        Values to compare (numbers, numpy scalars or strings); each one is
        turned into text with ``str``.
    debug : bool, optional
        When true, print the padded ids, the constant column indexes and
        the detected runs.

    Returns
    -------
    list of str
        One string per run of adjacent constant columns. The list is empty
        when ``ids`` is empty or no column is constant.
    """
    strings = [str(value) for value in ids]
    if not strings:
        return []

    # Pad to the longest *string*; the largest value does not always have
    # the longest text (e.g. 2.5 vs 1.25), and rows must be equally wide.
    width = max(len(text) for text in strings)
    rows = [text.zfill(width) for text in strings]
    reference = rows[0]

    # A column is constant when every row agrees with the first row there.
    constant_cols = [
        col for col in range(width)
        if all(row[col] == reference[col] for row in rows)
    ]

    # Group adjacent column indexes into runs, e.g. [1, 2, 6, 7] ->
    # [[1, 2], [6, 7]]. Working on the column indexes themselves avoids
    # confusing them with positions inside the filtered list.
    runs = []
    for col in constant_cols:
        if runs and col == runs[-1][-1] + 1:
            runs[-1].append(col)
        else:
            runs.append([col])

    if debug:
        print(rows)
        print(constant_cols)
        print(runs)
        print("Number of sequences: ", len(runs))

    return [''.join(reference[col] for col in run) for run in runs]
def get_suffixes(ids):
    """Return, for each shared pattern, the text following it in every id.

    The patterns come from ``get_patterns(ids)``. For each pattern a regex
    ``^.*<pattern>(.*)$`` is built, printed, and matched against the
    ``str`` form of every id; ids that do not match are skipped.

    Parameters
    ----------
    ids : iterable
        Values to inspect; each one is turned into text with ``str``.

    Returns
    -------
    list of list of str
        One inner list per pattern, holding the captured suffix of every
        id that matched that pattern.
    """
    ids_string = [str(value) for value in ids]
    suffixes = []
    for pattern in get_patterns(ids):
        # Escape the pattern: it is literal text and routinely contains
        # '.', which would otherwise match any character.
        expression = "^.*{}(.*)$".format(re.escape(pattern))
        print(expression)
        regex = re.compile(expression)
        # NOTE: the leading '.*' is greedy, so the capture starts after the
        # LAST occurrence of the pattern in each id.
        matches = (regex.match(line) for line in ids_string)
        suffixes.append([found.group(1)
                         for found in matches if found is not None])
    return suffixes
# --- Example 1: six-digit ids stored as floats -----------------------------
ids = np.array([
    417118.0, 417119.0, 417213.0, 417166.0, 417167.0, 417168.0,
    417210.0, 517066.0, 517004.0, 517005.0, 517059.0,
])
get_patterns(ids)
# get_suffixes prints one regex per shared pattern (here '17' and '.0')
# and returns one list of suffixes per pattern:
# [['118.0', '119.0', '213.0', '166.0', '167.0', '168.0',
#   '210.0', '066.0', '004.0', '005.0', '059.0'],
#  ['', '', '', '', '', '', '', '', '', '', '']]
get_suffixes(ids)

# --- Example 2: longer ids, first digit varies ------------------------------
ids = np.array([
    41117118.10, 41117119.10, 41117213.10, 41117166.10, 41117167.10,
    41117168.10, 41117210.10, 51117066.10, 51117004.10, 51117005.10,
    51117059.10,
])
# Expected output: ['1117', '.1']
print(get_patterns(ids))

# --- Example 3: shared prefix, shared middle run and shared tail ------------
ids = np.array([
    12341117118.10, 12341117119.10, 12341117213.10, 12341117166.10,
    12341117167.10, 12341117168.10, 12341117210.10, 12351117066.10,
    12351117004.10, 12351117005.10, 12351117059.10,
])
# Expected output: ['123', '1117', '.1']
print(get_patterns(ids))

# --- Example 4: integer ids with no shared column ---------------------------
ids = np.array([123, 345, 678])
get_patterns(ids)
get_suffixes(ids)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment