JayDoubleu · May 30, 2023 15:33
diff --git a/gistfile1.txt b/gistfile1.txt
 import fnmatch
 import os
 #import re2 as re
 import regex as re
 import subprocess
 from subprocess import PIPE, STDOUT
 import argparse
 import yaml
 import csv

 # Command-line interface. Without --levels, benchmark items for which no
 # "Level N" marker is found are still exported with the level 'None';
 # with --levels those items are skipped.
 parser = argparse.ArgumentParser()
 parser.add_argument(
    "--levels",
    required=False,
    action='store_true',
    help="skip benchmark items for which no 'Level N' marker was found",
 )
 args = parser.parse_args()


 class NoAliasDumper(yaml.dumper.SafeDumper):
    """SafeDumper variant that never emits YAML anchors/aliases.

    Implemented as a subclass so that the shared ``SafeDumper`` class is
    left untouched for any other code using PyYAML in the same process.
    """

    def ignore_aliases(self, data):
        return True


 # Dumper class handed to yaml.dump() when the generated playbooks are written.
 noalias_dumper = NoAliasDumper

 # Folder layout, relative to the current working directory.
 input_folder = "./input/"
 output_folder = "./output/"
 playbooks_folder = "./playbooks/"

 # Filled in by the discovery and loading steps further down.
 pdfs = []        # file names of the PDFs found in input_folder
 playbooks = []   # file names of the playbooks found in playbooks_folder
 plays = []       # entries loaded from all of the playbooks

 def growing(s):
    """Yield one 'cis_section' tag per leading dotted prefix of *s*.

    For "1.2.3" the generator produces 'cis_section1', 'cis_section1.2'
    and 'cis_section1.2.3', in that order.
    """
    seen = []
    for piece in s.split("."):
        seen.append(piece)
        yield 'cis_section' + ".".join(seen)

 # Collect the PDF files that sit directly inside the input folder.
 pdfs.extend(
    entry for entry in os.listdir(input_folder)
    if fnmatch.fnmatch(entry, '*.pdf')
 )

 # Collect the playbooks (either YAML extension) from the playbooks folder.
 playbooks.extend(
    entry for entry in os.listdir(playbooks_folder)
    if any(fnmatch.fnmatch(entry, pattern) for pattern in ('*.yml', '*.yaml'))
 )

 # Load every entry of every discovered playbook into the flat `plays` list.
 for playbook_name in playbooks:
    with open(playbooks_folder + playbook_name) as playbook_file:
        # safe_load instead of yaml.load(): calling load() without an explicit
        # Loader is deprecated (and rejected by PyYAML >= 6), and the safe
        # loader only builds plain Python data structures.
        loaded_plays = yaml.safe_load(playbook_file)
    # An empty playbook file parses to None; treat that as "no plays".
    plays.extend(loaded_plays or [])

 # Make sure each PDF has its own output directory, named after the PDF
 # with every '.pdf' occurrence removed from the file name.
 for pdf in pdfs:
    directory = output_folder + pdf.replace('.pdf', '')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

 # Table-of-contents entry: "<section number> <title> [dot leader] <page>".
 # Group 1 is the dotted section number, group 2 the title (which may wrap
 # over line breaks) and group 3 the two- or three-digit page number.
 # Raw strings keep the backslashes intact without invalid-escape warnings.
 regex_titles = r"^\s*((?:[0-9]+\.)+[0-9]+)\ ((?:[0-9a-zA-Z\-\/(),+\"_'\ ](?:(?:\.\ )|\.[a-zA-Z0-9])?[r'\r\n|\r|\n]{0,2})*)(?:(?:\ \.\ )|(?:\ \.*)|(?:\.{2,})|)\ ([0-9]{2,3}$)"
 # Markers removed from a title before it is compared with playbook names.
 regex_markers = r'(\(Scored\)|\(Not Scored\)|\(L1\)|\(L2\))'
 # Leading "<section number> -" prefix of an existing playbook entry name.
 regex_play_prefix = r'(^\s*((?:[0-9]+\.)+[0-9]+)\s*\-)'

 for pdf in pdfs:
    print('Processing ' + input_folder + pdf)
    benchmark = pdf.replace('.pdf', '')
    output = output_folder + benchmark + '/' + pdf.replace('.pdf', '.yml')

    # Extract the PDF text by letting pdfgrep match the regex "." on it.
    command = ["pdfgrep", ".", input_folder + pdf]
    pdf_contents = subprocess.check_output(command).decode('utf-8')

    for match in re.finditer(regex_titles, pdf_contents, re.MULTILINE):
        task_no = re.sub(r'\s{2,}', ' ', match.group(1)).strip()
        task_title = re.sub(r'\s{2,}', ' ', match.group(2)).strip()
        task_page = re.sub(r'\s{2,}', ' ', match.group(3)).strip()

        if '(Not Scored)' in task_title:
            scored = 'cis_notscored'
        elif '(Scored)' in task_title:
            scored = 'cis_scored'
        else:
            # Entries without a scoring marker are never exported, so skip
            # them before running the expensive level search below.
            continue

        # Find the section again in the document text and pick up an
        # optional "Level N" marker (group 4). The section number is escaped
        # so that its dots only match literal dots.
        # NOTE(review): `(?!\.d)` tests for a literal 'd'; `\.\d` may have
        # been intended, but the `\s` that follows already excludes longer
        # section numbers - confirm before changing.
        level_pattern = (
            r'^((' + re.escape(task_no) + r'(?!\.d)\s))[^\S\n]*'
            r'((?:[^L\d]*(?!Level\s+\d+|^[^\n\S]*\d+(?:\.\d+){2,3})\S+\s+)+)'
            r'(Level\ [0-9]+)?'
        )
        level_match = re.search(level_pattern, pdf_contents, re.MULTILINE)
        # No match at all is treated like "no level found" instead of crashing.
        level = level_match.group(4) if level_match else None
        if level is None:
            if args.levels:
                # --levels: only export items with a detected level.
                continue
            level = 'None'

        task_title = re.sub(regex_markers, '', task_title).strip()
        task_title = " ".join(task_title.split())
        current_play = task_no + ' - ' + task_title
        default_tags = [
            benchmark.lower(),
            'cis_page_' + task_page,
            'cis_' + level.lower().replace(' ', '_'),
            scored,
        ] + list(growing(task_no))

        # Reuse every loaded playbook entry whose name (ignoring a leading
        # section number) equals the title; the entry is renamed and
        # re-tagged in place.
        data = []
        for play in plays:
            this_play = re.sub(regex_play_prefix, '', play['name']).strip()
            if this_play == task_title:
                play['name'] = current_play
                play['tags'] = default_tags
                data.append(play)

        if not data:
            # No existing entry matched: emit a placeholder "ping" task.
            data = [{"name": current_play, "ping": None, "tags": default_tags}]

        with open(output, 'a') as outfile:
            yaml.dump(data, outfile, default_flow_style=False, width=1000, sort_keys=False, Dumper=noalias_dumper)
            outfile.write('\n')

        # newline='' is what the csv module requires of files it writes to;
        # without it rows gain an extra blank line on Windows.
        with open(output.replace('.yml', '.csv'), "a", newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',')
            writer.writerow([task_no, task_title, level, task_page, scored, benchmark.lower()])
	import fnmatch
	import os
	#import re2 as re
	import regex as re
	import subprocess
	from subprocess import PIPE, STDOUT
	import argparse
	import yaml
	import csv

	# Command-line interface. Without --levels, benchmark items for which no
	# "Level N" marker is found are still exported with the level 'None';
	# with --levels those items are skipped.
	parser = argparse.ArgumentParser()
	parser.add_argument(
	    "--levels",
	    required=False,
	    action='store_true',
	    help="skip benchmark items for which no 'Level N' marker was found",
	)
	args = parser.parse_args()


	class NoAliasDumper(yaml.dumper.SafeDumper):
	    """SafeDumper variant that never emits YAML anchors/aliases.

	    Implemented as a subclass so that the shared ``SafeDumper`` class is
	    left untouched for any other code using PyYAML in the same process.
	    """

	    def ignore_aliases(self, data):
	        return True


	# Dumper class handed to yaml.dump() when the generated playbooks are written.
	noalias_dumper = NoAliasDumper

	# Folder layout, relative to the current working directory.
	input_folder = "./input/"
	output_folder = "./output/"
	playbooks_folder = "./playbooks/"

	# Filled in by the discovery and loading steps further down.
	pdfs = []        # file names of the PDFs found in input_folder
	playbooks = []   # file names of the playbooks found in playbooks_folder
	plays = []       # entries loaded from all of the playbooks

	def growing(s):
	    """Yield one 'cis_section' tag per leading dotted prefix of *s*.

	    For "1.2.3" the generator produces 'cis_section1', 'cis_section1.2'
	    and 'cis_section1.2.3', in that order.
	    """
	    seen = []
	    for piece in s.split("."):
	        seen.append(piece)
	        yield 'cis_section' + ".".join(seen)

	# Collect the PDF files that sit directly inside the input folder.
	pdfs.extend(
	    entry for entry in os.listdir(input_folder)
	    if fnmatch.fnmatch(entry, '*.pdf')
	)

	# Collect the playbooks (either YAML extension) from the playbooks folder.
	# The patterns need their leading '*': a bare '.yml' would only match a
	# file whose entire name is ".yml".
	playbooks.extend(
	    entry for entry in os.listdir(playbooks_folder)
	    if any(fnmatch.fnmatch(entry, pattern) for pattern in ('*.yml', '*.yaml'))
	)

	# Load every entry of every discovered playbook into the flat `plays` list.
	for playbook_name in playbooks:
	    with open(playbooks_folder + playbook_name) as playbook_file:
	        # safe_load instead of yaml.load(): calling load() without an
	        # explicit Loader is deprecated (and rejected by PyYAML >= 6), and
	        # the safe loader only builds plain Python data structures.
	        loaded_plays = yaml.safe_load(playbook_file)
	    # An empty playbook file parses to None; treat that as "no plays".
	    plays.extend(loaded_plays or [])

	# Make sure each PDF has its own output directory, named after the PDF
	# with every '.pdf' occurrence removed from the file name.
	for pdf in pdfs:
	    directory = output_folder + pdf.replace('.pdf', '')
	    # exist_ok avoids the check-then-create race of a separate exists() test.
	    os.makedirs(directory, exist_ok=True)

	# Table-of-contents entry: "<section number> <title> [dot leader] <page>".
	# Group 1 is the dotted section number, group 2 the title (which may wrap
	# over line breaks) and group 3 the two- or three-digit page number.
	# Raw strings keep the backslashes intact without invalid-escape warnings.
	regex_titles = r"^\s*((?:[0-9]+\.)+[0-9]+)\ ((?:[0-9a-zA-Z\-\/(),+\"_'\ ](?:(?:\.\ )|\.[a-zA-Z0-9])?[r'\r\n|\r|\n]{0,2})*)(?:(?:\ \.\ )|(?:\ \.*)|(?:\.{2,})|)\ ([0-9]{2,3}$)"
	# Markers removed from a title before it is compared with playbook names.
	regex_markers = r'(\(Scored\)|\(Not Scored\)|\(L1\)|\(L2\))'
	# Leading "<section number> -" prefix of an existing playbook entry name.
	regex_play_prefix = r'(^\s*((?:[0-9]+\.)+[0-9]+)\s*\-)'

	for pdf in pdfs:
	    print('Processing ' + input_folder + pdf)
	    benchmark = pdf.replace('.pdf', '')
	    output = output_folder + benchmark + '/' + pdf.replace('.pdf', '.yml')

	    # Extract the PDF text by letting pdfgrep match the regex "." on it.
	    command = ["pdfgrep", ".", input_folder + pdf]
	    pdf_contents = subprocess.check_output(command).decode('utf-8')

	    for match in re.finditer(regex_titles, pdf_contents, re.MULTILINE):
	        task_no = re.sub(r'\s{2,}', ' ', match.group(1)).strip()
	        task_title = re.sub(r'\s{2,}', ' ', match.group(2)).strip()
	        task_page = re.sub(r'\s{2,}', ' ', match.group(3)).strip()

	        if '(Not Scored)' in task_title:
	            scored = 'cis_notscored'
	        elif '(Scored)' in task_title:
	            scored = 'cis_scored'
	        else:
	            # Entries without a scoring marker are never exported, so skip
	            # them before running the expensive level search below.
	            continue

	        # Find the section again in the document text and pick up an
	        # optional "Level N" marker (group 4). The section number is
	        # escaped so that its dots only match literal dots.
	        # NOTE(review): `(?!\.d)` tests for a literal 'd'; `\.\d` may have
	        # been intended, but the `\s` that follows already excludes longer
	        # section numbers - confirm before changing.
	        level_pattern = (
	            r'^((' + re.escape(task_no) + r'(?!\.d)\s))[^\S\n]*'
	            r'((?:[^L\d]*(?!Level\s+\d+|^[^\n\S]*\d+(?:\.\d+){2,3})\S+\s+)+)'
	            r'(Level\ [0-9]+)?'
	        )
	        level_match = re.search(level_pattern, pdf_contents, re.MULTILINE)
	        # No match at all is treated like "no level found" instead of crashing.
	        level = level_match.group(4) if level_match else None
	        if level is None:
	            if args.levels:
	                # --levels: only export items with a detected level.
	                continue
	            level = 'None'

	        task_title = re.sub(regex_markers, '', task_title).strip()
	        task_title = " ".join(task_title.split())
	        current_play = task_no + ' - ' + task_title
	        default_tags = [
	            benchmark.lower(),
	            'cis_page_' + task_page,
	            'cis_' + level.lower().replace(' ', '_'),
	            scored,
	        ] + list(growing(task_no))

	        # Reuse every loaded playbook entry whose name (ignoring a leading
	        # section number) equals the title; the entry is renamed and
	        # re-tagged in place.
	        data = []
	        for play in plays:
	            this_play = re.sub(regex_play_prefix, '', play['name']).strip()
	            if this_play == task_title:
	                play['name'] = current_play
	                play['tags'] = default_tags
	                data.append(play)

	        if not data:
	            # No existing entry matched: emit a placeholder "ping" task.
	            data = [{"name": current_play, "ping": None, "tags": default_tags}]

	        with open(output, 'a') as outfile:
	            yaml.dump(data, outfile, default_flow_style=False, width=1000, sort_keys=False, Dumper=noalias_dumper)
	            outfile.write('\n')

	        # newline='' is what the csv module requires of files it writes
	        # to; without it rows gain an extra blank line on Windows.
	        with open(output.replace('.yml', '.csv'), "a", newline='') as csv_file:
	            writer = csv.writer(csv_file, delimiter=',')
	            writer.writerow([task_no, task_title, level, task_page, scored, benchmark.lower()])
No results found