heetbeet · August 20, 2019 20:33
diff --git a/cpp_code_helpers.py b/cpp_code_helpers.py
 import os

 class ddict(dict):
    """Dictionary whose entries can also be read and written as attributes."""

    def __init__(self, **kwds):
        # Fill the mapping from the keyword arguments ...
        super().__init__(**kwds)
        # ... then make the instance namespace *be* the mapping, so that
        # d.key and d['key'] always refer to the same storage.
        self.__dict__ = self
        
 def to_markers(lefts, rights):
    """
    Pair each opening delimiter in `lefts` with the delimiter at the same
    position in `rights`. Every pair becomes a ddict with the fields
    lhs / rhs (the delimiter texts) and lenl / lenr (their lengths).
    Pairing stops at the end of the shorter sequence.
    """
    markers = []
    for opener, closer in zip(lefts, rights):
        markers.append(ddict(lhs  = opener,
                             lenl = len(opener),
                             rhs  = closer,
                             lenr = len(closer)))
    return markers
    
 def single_spacing(txt, also_ln=False):
    """
    Turn tabs into spaces and squeeze every run of spaces down to one space.
    With also_ln=True newlines are turned into spaces as well (and squeezed).
    """
    # Normalise the individual characters first ...
    txt = txt.replace('\t', ' ')
    if also_ln:
        txt = txt.replace('\n', ' ')
    # ... then keep halving the runs until no double space is left.
    while '  ' in txt:
        txt = txt.replace('  ', ' ')
    return txt
    
 def remove_whitespace(txt, also_ln=False):
    """
    Delete every space and tab from `txt`. With also_ln=True the newlines
    are deleted too.
    """
    unwanted = ' \t\n' if also_ln else ' \t'
    return ''.join(ch for ch in txt if ch not in unwanted)

    
 def scrub_nonvarchars(txt):
    """
    Replace every character that is not a newline, an underscore, a digit
    or an ASCII letter with a space. The length of the text is unchanged.
    """
    keep = frozenset('\n_0123456789'
                     'abcdefghijklmnopqrstuvwxyz'
                     'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    return ''.join(ch if ch in keep else ' ' for ch in txt)
    
 def scrub_all_except_newline(txt):
    """Blank out every character of `txt` with a space, keeping the newlines in place."""
    # Each line becomes a run of spaces of the same length.
    return '\n'.join(' ' * len(line) for line in txt.split('\n'))


 def scrub_comments_and_strings(txt):
    """
    A pre-process to make scraping easier. This function turns all the comments and
    the contents of all string literals into blanks (spaces). Newlines and the total
    length are kept, so an offset in the result is the same offset in `txt`:
    FROM: /* // */ cout << "hello \"world\"!" << R"(bla)"; //chingching
    TO  :          cout << "                " << R"(   )";

    A /*, " or R"( that is never closed does not raise: the text after it is left
    unscrubbed (a leftover /* itself is still blanked at the end).
    """
    # (opening, closing) delimiter pairs whose interior gets blanked.
    pairs = (('/*', '*/'), ('"', '"'), ('R"(', ')"'))

    # Blank any escaped \" or \\\", but not \\" or \\\\" (odd vs. even number of
    # backslashes), so an escaped quote is never mistaken for a string delimiter.
    chars = list(txt)
    n = len(chars)
    i = 0
    while i < n:
        if chars[i] != '\\':
            i += 1
            continue
        j = i
        while j < n and chars[j] == '\\':
            j += 1
        if j < n and chars[j] == '"' and (j - i) % 2 == 1:
            chars[j-1] = chars[j] = ' '
        i = j + 1  # the character that ended the backslash run is consumed too
    txtout = ''.join(chars)

    # Match opening with closing delimiters and blank the text in between.
    i = 0
    while i < len(txtout):
        for lhs, rhs in pairs:
            if not txtout.startswith(lhs, i):
                continue
            # Is the last // on this line before us? Then we are inside a line
            # comment and the delimiter is not real: skip it.
            if txtout.rfind('//', 0, i+1) > txtout.rfind('\n', 0, i+1):
                break
            start = i + len(lhs)
            end = txtout.find(rhs, start)
            if end >= 0:
                blanks = ''.join(c if c == '\n' else ' ' for c in txtout[start:end])
                txtout = txtout[:start] + blanks + txtout[end:]
                i = end + len(rhs) - 1  # the += 1 below steps past the closer
            else:
                i = start - 1           # unterminated: just step over the opener
            break
        i += 1

    # Blank the // comments line by line; the strings are already empty, so any
    # // still present really starts a comment.
    lines = txtout.split('\n')
    for k, line in enumerate(lines):
        idx = line.find('//')
        if idx >= 0:
            lines[k] = line[:idx] + ' ' * (len(line) - idx)
    txtout = '\n'.join(lines)

    # Finally remove the leftover /* and */ signs (two blanks keep the length).
    return txtout.replace('/*', '  ').replace('*/', '  ')


 def place_back_strings(txt_scrubbed,
                        txt_original):
    """
    This function places back the strings that were scrubbed away,
    so you end up with only the comments scrubbed.

    `txt_scrubbed` is scanned for delimiter pairs and the text between each
    pair is copied over from the same offsets of `txt_original`, so both texts
    must line up character for character. An opening delimiter without a
    closing one is left as it is.
    """
    # (opening, closing) delimiter pairs whose interior gets restored.
    pairs = (('/*', '*/'), ('"', '"'), ('R"(', ')"'))

    # Match opening with closing delimiters and restore the text in between.
    txtout = txt_scrubbed
    i = 0
    while i < len(txtout):
        for lhs, rhs in pairs:
            if not txtout.startswith(lhs, i):
                continue
            start = i + len(lhs)
            # Search from `start` itself: an empty literal such as "" has its
            # closing delimiter right there, and missing it would pair this
            # opener with the opener of the next string.
            end = txtout.find(rhs, start)
            if end >= 0:
                txtout = txtout[:start] + txt_original[start:end] + txtout[end:]
                i = end + len(rhs) - 1  # the += 1 below steps past the closer
            else:
                i = start - 1           # unterminated: just step over the opener
            break
        i += 1

    return txtout

 def txt_views(txt):
    """
    Bundle several renderings of one piece of source text in a ddict:
      orig       - the text as given
      clean      - scrub_comments_and_strings(txt)
      nocomments - place_back_strings(clean, txt)
      vars       - scrub_nonvarchars(clean)
    """
    cleaned = scrub_comments_and_strings(txt)
    return ddict(orig       = txt,
                 clean      = cleaned,
                 nocomments = place_back_strings(cleaned, txt),
                 vars       = scrub_nonvarchars(cleaned))

 def view_lnsplit(v):
    """Return a ddict with the keys of `v`, each text value split into a list of lines."""
    per_line = {}
    for name, text in v.items():
        per_line[name] = text.split('\n')
    return ddict(**per_line)
    

 def isint(txt):
    """
    Tell whether int(txt) succeeds.

    Returns True when the conversion works and False when it fails with a
    conversion error. Only those errors are caught, so KeyboardInterrupt,
    SystemExit and unrelated failures are no longer silently swallowed.
    """
    try:
        int(txt)
    except (TypeError, ValueError, OverflowError):
        # TypeError: not a string/number (e.g. None); ValueError: not an
        # integer literal; OverflowError: an infinite float.
        return False
    return True
    
 def split(txt):
    """
    Split `txt` on spaces, tabs and newlines and report where each piece lies.

    Returns a list of ddicts, one per run of non-whitespace characters, with
      i   - index of the first character of the run
      j   - index one past its last character
      str - the run itself, i.e. txt[i:j]
    A run that reaches the end of the text is closed with j = len(txt);
    previously such a run had no j and building the result raised.
    """
    whitespace = (' ', '\t', '\n')
    splits = []
    start = None  # start index of the run being read, None while between runs
    for pos, char in enumerate(txt):
        if char in whitespace:
            if start is not None:
                splits.append(ddict(i=start, j=pos, str=txt[start:pos]))
                start = None
        elif start is None:
            start = pos

    # The text ended in the middle of a run: close it at the end of the text.
    if start is not None:
        splits.append(ddict(i=start, j=len(txt), str=txt[start:]))

    return splits
diff --git a/Untitled.ipynb b/Untitled.ipynb
	import os

	class ddict(dict):
	    """Dictionary whose entries can also be read and written as attributes."""

	    def __init__(self, **kwds):
	        self.update(kwds)
	        # Make the instance namespace *be* the mapping, so that d.key and
	        # d['key'] always refer to the same storage.
	        self.__dict__ = self

	def to_markers(lefts,
	               rights):
	    """
	    Pair each opening delimiter in `lefts` with the delimiter at the same
	    position in `rights`. Every pair becomes a ddict with the fields
	    lhs / rhs (the delimiter texts) and lenl / lenr (their lengths).
	    """
	    return [ddict(lhs  = i,
	                  lenl = len(i),
	                  rhs  = j,
	                  lenr = len(j)) for i, j in zip(lefts, rights)]

	def single_spacing(txt, also_ln=False):
	    """
	    Turn tabs into spaces and squeeze every run of spaces down to one space.
	    With also_ln=True newlines are turned into spaces as well (and squeezed).
	    """
	    txt = txt.replace('\t', ' ')
	    if also_ln:
	        txt = txt.replace('\n', ' ')
	    # The pattern must be TWO spaces: replacing one space by one space
	    # would never shorten a run.
	    while '  ' in txt:
	        txt = txt.replace('  ', ' ')
	    return txt

	def remove_whitespace(txt, also_ln=False):
	    """
	    Delete every space and tab from `txt`. With also_ln=True the newlines
	    are deleted too.
	    """
	    txt = txt.replace(' ', '')
	    txt = txt.replace('\t', '')
	    if also_ln:
	        txt = txt.replace('\n', '')
	    return txt


	def scrub_nonvarchars(txt):
	    """
	    Replace every character that is not a newline, an underscore, a digit
	    or an ASCII letter with a space. The length of the text is unchanged.
	    """
	    return ''.join([' ' if i not in '\n_0123456789'
	                                    'abcdefghijklmnopqrstuvwxyz'
	                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ' else i for i in txt])

	def scrub_all_except_newline(txt):
	    """Blank out every character of `txt` with a space, keeping the newlines in place."""
	    return ''.join([' ' if i != '\n' else i for i in txt])


	def scrub_comments_and_strings(txt):
	    """
	    A pre-process to make scraping easier. This function turns all the comments and
	    the contents of all string literals into blanks (spaces). Newlines and the total
	    length are kept, so an offset in the result is the same offset in `txt`:
	    FROM: /* // */ cout << "hello \"world\"!" << R"(bla)"; //chingching
	    TO  :          cout << "                " << R"(   )";

	    A /*, " or R"( that is never closed does not raise: the text after it is left
	    unscrubbed (a leftover /* itself is still blanked at the end).
	    """
	    # (opening, closing) delimiter pairs whose interior gets blanked.
	    pairs = (('/*', '*/'), ('"', '"'), ('R"(', ')"'))

	    # Blank any escaped \" or \\\", but not \\" or \\\\" (odd vs. even number of
	    # backslashes), so an escaped quote is never mistaken for a string delimiter.
	    # Two characters are overwritten by two blanks: the length must not change.
	    chars = list(txt)
	    n = len(chars)
	    i = 0
	    while i < n:
	        if chars[i] != '\\':
	            i += 1
	            continue
	        j = i
	        while j < n and chars[j] == '\\':
	            j += 1
	        if j < n and chars[j] == '"' and (j - i) % 2 == 1:
	            chars[j-1] = chars[j] = ' '
	        i = j + 1  # the character that ended the backslash run is consumed too
	    txtout = ''.join(chars)

	    # Match opening with closing delimiters and blank the text in between.
	    i = 0
	    while i < len(txtout):
	        for lhs, rhs in pairs:
	            if not txtout.startswith(lhs, i):
	                continue
	            # Is the last // on this line before us? Then we are inside a line
	            # comment and the delimiter is not real: skip it.
	            if txtout.rfind('//', 0, i+1) > txtout.rfind('\n', 0, i+1):
	                break
	            start = i + len(lhs)
	            end = txtout.find(rhs, start)
	            if end >= 0:
	                blanks = ''.join(c if c == '\n' else ' ' for c in txtout[start:end])
	                txtout = txtout[:start] + blanks + txtout[end:]
	                i = end + len(rhs) - 1  # the += 1 below steps past the closer
	            else:
	                i = start - 1           # unterminated: just step over the opener
	            break
	        i += 1

	    # Blank the // comments line by line; the strings are already empty, so any
	    # // still present really starts a comment.
	    lines = txtout.split('\n')
	    for k, line in enumerate(lines):
	        idx = line.find('//')
	        if idx >= 0:
	            lines[k] = line[:idx] + ' ' * (len(line) - idx)
	    txtout = '\n'.join(lines)

	    # Finally remove the leftover /* and */ signs (two blanks keep the length).
	    return txtout.replace('/*', '  ').replace('*/', '  ')


	def place_back_strings(txt_scrubbed,
	                       txt_original):
	    """
	    This function places back the strings that were scrubbed away,
	    so you end up with only the comments scrubbed.

	    `txt_scrubbed` is scanned for delimiter pairs and the text between each
	    pair is copied over from the same offsets of `txt_original`, so both texts
	    must line up character for character. An opening delimiter without a
	    closing one is left as it is.
	    """
	    # (opening, closing) delimiter pairs whose interior gets restored.
	    pairs = (('/*', '*/'), ('"', '"'), ('R"(', ')"'))

	    # Match opening with closing delimiters and restore the text in between.
	    txtout = txt_scrubbed
	    i = 0
	    while i < len(txtout):
	        for lhs, rhs in pairs:
	            if not txtout.startswith(lhs, i):
	                continue
	            start = i + len(lhs)
	            # Search from `start` itself: an empty literal such as "" has its
	            # closing delimiter right there, and missing it would pair this
	            # opener with the opener of the next string.
	            end = txtout.find(rhs, start)
	            if end >= 0:
	                txtout = txtout[:start] + txt_original[start:end] + txtout[end:]
	                i = end + len(rhs) - 1  # the += 1 below steps past the closer
	            else:
	                i = start - 1           # unterminated: just step over the opener
	            break
	        i += 1

	    return txtout

	def txt_views(txt):
	    """
	    Bundle several renderings of one piece of source text in a ddict:
	      orig       - the text as given
	      clean      - scrub_comments_and_strings(txt)
	      nocomments - place_back_strings(clean, txt)
	      vars       - scrub_nonvarchars(clean)
	    """
	    v = ddict()
	    v.orig = txt
	    v.clean = scrub_comments_and_strings(txt)
	    v.nocomments = place_back_strings(v.clean, txt)
	    v.vars = scrub_nonvarchars(v.clean)

	    return v

	def view_lnsplit(v):
	    """Return a ddict with the keys of `v`, each text value split into a list of lines."""
	    return ddict(**{name: text.split('\n') for name, text in v.items()})


	def isint(txt):
	    """
	    Tell whether int(txt) succeeds.

	    Returns True when the conversion works and False when it fails with a
	    conversion error. Only those errors are caught, so KeyboardInterrupt,
	    SystemExit and unrelated failures are not silently swallowed.
	    """
	    try:
	        int(txt)
	    except (TypeError, ValueError, OverflowError):
	        # TypeError: not a string/number (e.g. None); ValueError: not an
	        # integer literal; OverflowError: an infinite float.
	        return False
	    return True

	def split(txt):
	    """
	    Split `txt` on spaces, tabs and newlines and report where each piece lies.

	    Returns a list of ddicts, one per run of non-whitespace characters, with
	      i   - index of the first character of the run
	      j   - index one past its last character
	      str - the run itself, i.e. txt[i:j]
	    A run that reaches the end of the text is closed with j = len(txt).
	    """
	    whitespace = (' ', '\t', '\n')
	    splits = []
	    start = None  # start index of the run being read, None while between runs
	    for pos, char in enumerate(txt):
	        if char in whitespace:
	            if start is not None:
	                splits.append(ddict(i=start, j=pos, str=txt[start:pos]))
	                start = None
	        elif start is None:
	            start = pos

	    # The text ended in the middle of a run: close it at the end of the text.
	    if start is not None:
	        splits.append(ddict(i=start, j=len(txt), str=txt[start:]))

	    return splits