Last active
August 20, 2019 20:33
-
-
Save heetbeet/bdfb12856a6163f5d12bd4d8bcb18ab5 to your computer and use it in GitHub Desktop.
Some functions to help with analysing cpp files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
class ddict(dict):
    """A dict whose entries can also be read and written as attributes.

    The instance's ``__dict__`` is the mapping itself, so ``d.key`` and
    ``d['key']`` always refer to the same value.
    """

    def __init__(self, **kwds):
        # Fill the mapping from the keyword arguments first ...
        dict.__init__(self, **kwds)
        # ... then alias attribute storage to the mapping itself.
        self.__dict__ = self
def to_markers(lefts, rights):
    """Pair up opening and closing delimiters into marker records.

    Each record is a ``ddict`` with the fields ``lhs`` / ``rhs`` (the opening
    and closing delimiter) and ``lenl`` / ``lenr`` (their lengths). Pairing
    follows ``zip``, so it stops at the shorter of the two sequences.
    """
    markers = []
    for opener, closer in zip(lefts, rights):
        markers.append(ddict(lhs=opener,
                             lenl=len(opener),
                             rhs=closer,
                             lenr=len(closer)))
    return markers
def single_spacing(txt, also_ln=False):
    """Collapse horizontal whitespace in *txt* to single spaces.

    Every tab becomes a space and every run of spaces is reduced to exactly
    one space. When *also_ln* is true, newlines are treated as whitespace too
    (they are turned into spaces and take part in the collapsing); otherwise
    they are left untouched.

    Returns the normalised string.
    """
    # Normalise the other whitespace characters to spaces first, so a mixed
    # run such as "\t \t" collapses to a single space as well.
    txt = txt.replace('\t', ' ')
    if also_ln:
        txt = txt.replace('\n', ' ')
    # Replacing a pair of spaces by one space must be repeated: one pass
    # only halves a long run.
    while '  ' in txt:
        txt = txt.replace('  ', ' ')
    return txt
def remove_whitespace(txt, also_ln=False):
    """Return *txt* with all spaces and tabs removed.

    Newlines are removed as well when *also_ln* is true.
    """
    unwanted = (' ', '\t', '\n') if also_ln else (' ', '\t')
    for blank in unwanted:
        txt = txt.replace(blank, '')
    return txt
def scrub_nonvarchars(txt):
    """Replace every character that cannot be part of an identifier by a space.

    Kept as-is: ASCII letters, digits, the underscore and the newline.
    Everything else becomes a single space, so the result has the same
    length and line structure as *txt*.
    """
    keep = ('\n_0123456789'
            'abcdefghijklmnopqrstuvwxyz'
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    return ''.join(ch if ch in keep else ' ' for ch in txt)
def scrub_all_except_newline(txt):
    """Blank out *txt*: every character except the newline becomes a space."""
    blanked_lines = [' ' * len(segment) for segment in txt.split('\n')]
    return '\n'.join(blanked_lines)
def _blank_span(chars, start, stop):
    """Overwrite chars[start:stop] with spaces in place, keeping newlines."""
    for k in range(start, stop):
        if chars[k] != '\n':
            chars[k] = ' '


def _blank_escaped_quotes(chars):
    """Blank every backslash-escaped double quote in the list *chars*, in place.

    A quote is escaped when it is preceded by an odd number of backslashes
    (\\" or \\\\\\"); with an even number (\\\\") the backslashes escape each
    other and the quote is a real delimiter. Only the last backslash and the
    quote are blanked, so the length never changes.
    """
    size = len(chars)
    pos = 0
    while pos < size:
        if chars[pos] != '\\':
            pos += 1
            continue
        run_end = pos
        while run_end < size and chars[run_end] == '\\':
            run_end += 1
        is_odd_run = (run_end - pos) % 2 == 1
        if is_odd_run and run_end < size and chars[run_end] == '"':
            chars[run_end - 1] = ' '
            chars[run_end] = ' '
        # The character right after the run has been dealt with (or is not
        # a backslash), so resume behind it.
        pos = run_end + 1


def scrub_comments_and_strings(txt):
    r"""
    A pre-process to make scraping easier. This function turns all the
    comments and the contents of all strings into empty text (spaces):

    FROM: /* // */ cout << "hello \"world\"!" << R"(bla)"; //chingching
    TO  :          cout << "                " << R"(   )";

    Details:
      * ``/* ... */`` comments are blanked including their delimiters.
      * ``// ...`` comments are blanked up to (not including) the newline.
      * ``"..."`` and ``R"(...)"`` keep their delimiters; only the content
        in between is blanked.
      * Newlines are always kept and every other scrubbed character becomes
        exactly one space, so the result has the same length and the same
        line/column positions as *txt*.
      * An opener without a matching closer does not scrub anything after
        it; a stray ``/*`` itself is still blanked.

    NOTE(review): character literals are not recognised, so an unescaped
    double quote inside one (``'"'``) is treated as the start of a string.

    Returns the scrubbed text.
    """
    chars = list(txt)

    # Escaped quotes go first so they can never be mistaken for a string
    # delimiter by the scanner below.
    _blank_escaped_quotes(chars)

    # Closers are searched in an immutable snapshot. Blanking only ever
    # touches positions the scanner has already moved past, so the snapshot
    # and the edited list agree on everything that is still ahead.
    text = ''.join(chars)
    size = len(text)

    pos = 0
    while pos < size:
        if text.startswith('//', pos):
            # Line comment: nothing inside it may open a string or a block
            # comment, so skip straight to the end of the line.
            eol = text.find('\n', pos)
            if eol < 0:
                eol = size
            _blank_span(chars, pos, eol)
            pos = eol
        elif text.startswith('/*', pos):
            close = text.find('*/', pos + 2)
            if close < 0:
                # Unterminated: only the opener disappears.
                _blank_span(chars, pos, pos + 2)
                pos += 2
            else:
                _blank_span(chars, pos, close + 2)
                pos = close + 2
        elif text.startswith('R"(', pos):
            close = text.find(')"', pos + 3)
            if close < 0:
                pos += 3
            else:
                _blank_span(chars, pos + 3, close)
                pos = close + 2
        elif text[pos] == '"':
            close = text.find('"', pos + 1)
            if close < 0:
                pos += 1
            else:
                _blank_span(chars, pos + 1, close)
                pos = close + 1
        else:
            pos += 1

    return ''.join(chars)
def place_back_strings(txt_scrubbed,
                       txt_original):
    """
    This function places back the strings that were scrubbed away,
    so you end up with only the comments scrubbed.

    *txt_scrubbed* is scanned for the delimiter pairs ``/* */``, ``" "`` and
    ``R"( )"``. For every opener that has a matching closer, the text between
    them is replaced by the characters at the same positions in
    *txt_original*. Both texts are therefore expected to be aligned
    position by position.

    An empty literal (closer directly after the opener) is a valid pair and
    restores nothing; an opener without any closer is skipped.

    Returns the scrubbed text with the delimited regions restored.
    """
    pairs = (('/*', '*/'), ('"', '"'), ('R"(', ')"'))

    pieces = []
    emitted = 0  # txt_scrubbed[:emitted] is already represented in pieces
    size = len(txt_scrubbed)
    pos = 0
    while pos < size:
        match = None
        for opener, closer in pairs:
            if txt_scrubbed.startswith(opener, pos):
                match = (opener, closer)
                break
        if match is None:
            pos += 1
            continue

        opener, closer = match
        body_start = pos + len(opener)
        # Start looking AT body_start: for an empty literal the closer sits
        # directly behind the opener.
        body_end = txt_scrubbed.find(closer, body_start)
        if body_end < 0:
            pos = body_start
            continue

        pieces.append(txt_scrubbed[emitted:body_start])
        pieces.append(txt_original[body_start:body_end])
        emitted = body_end
        pos = body_end + len(closer)

    pieces.append(txt_scrubbed[emitted:])
    return ''.join(pieces)
def txt_views(txt):
    """Build the standard set of views of a source text.

    Returns a ``ddict`` with, in this order:
      orig       -- the text as given
      clean      -- result of scrub_comments_and_strings(txt)
      nocomments -- ``clean`` with the strings placed back from ``txt``
      vars       -- ``clean`` passed through scrub_nonvarchars
    """
    cleaned = scrub_comments_and_strings(txt)
    return ddict(orig=txt,
                 clean=cleaned,
                 nocomments=place_back_strings(cleaned, txt),
                 vars=scrub_nonvarchars(cleaned))
def view_lnsplit(v):
    """Return a ``ddict`` with the same keys as *v*, each value split on newlines."""
    lines_by_view = {name: text.split('\n') for name, text in v.items()}
    return ddict(**lines_by_view)
def isint(txt):
    """Return True when ``int(txt)`` succeeds, False otherwise.

    Only the errors ``int()`` raises for unconvertible input are treated as
    "not an int"; anything else (e.g. KeyboardInterrupt) propagates.
    """
    try:
        int(txt)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type such as None; ValueError: malformed
        # text; OverflowError: non-finite float such as inf.
        return False
    return True
def split(txt):
    """Split *txt* on spaces, tabs and newlines, remembering positions.

    Returns a list of ``ddict`` records, one per token, in order of
    appearance. Each record has:
      i   -- index of the token's first character in *txt*
      j   -- index one past the token's last character
      str -- the token itself, ``txt[i:j]``

    An empty or all-whitespace text yields an empty list.
    """
    whitespace = (' ', '\t', '\n')
    splits = []
    in_token = False
    for i, char in enumerate(txt):
        is_blank = char in whitespace
        if not in_token and not is_blank:
            in_token = True
            splits.append(ddict(i=i))
        elif in_token and is_blank:
            in_token = False
            splits[-1].j = i
    # A token that runs up to the end of the text is never followed by
    # whitespace, so it has to be closed here.
    if in_token:
        splits[-1].j = len(txt)
    for s in splits:
        s.str = txt[s.i:s.j]
    return splits
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment