twobob · July 21, 2023 20:23
diff --git a/qlorsStack.py b/qlorsStack.py
 import argparse
 import os
 import sys
 import random
 import codecs
 import json
 from bs4 import BeautifulSoup
 from multiprocessing import Pool
 import colorama
 from colorama import Fore
 import logging

 # Initialise colorama once at import time, before any colored output is printed.
 colorama.init()

 # Configure logging for the whole script: DEBUG threshold, each record
 # rendered as "<timestamp> - <level name> - <message>".
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

 def extract_text_from_html(html_file, min_score):
    """Build "### Human / ### Assistant" text pairs from one HTML file.

    The file is read as UTF-8 and parsed with ``html.parser``. Every
    ``<div itemprop="text">`` whose parent element has a ``data-score``
    (treated as 0 when the attribute is absent) of at least ``min_score`` is
    kept, and its stripped text is collected in document order.

    Args:
        html_file: Path of the HTML file to read.
        min_score: Minimum integer score a div's parent must carry.

    Returns:
        The pairs joined with newlines; an empty string when fewer than two
        texts survive the filter. A trailing unpaired text is dropped.

    Raises:
        ValueError: If a ``data-score`` attribute is not an integer string.

    NOTE(review): pairs are taken from consecutive surviving divs, i.e.
    (0, 1), (2, 3), ... If a page holds one question followed by several
    answers, later answers would be emitted as "### Human" -- confirm this
    is the intended pairing.
    """
    with open(html_file, 'r', encoding='utf-8') as handle:
        document = BeautifulSoup(handle, 'html.parser')

    # Keep only the text blocks whose parent meets the score threshold.
    kept_texts = []
    for node in document.find_all('div', attrs={'itemprop': 'text'}):
        if int(node.parent.get('data-score', 0)) >= min_score:
            kept_texts.append(node.text.strip())

    # Even positions become the "Human" side, odd positions the "Assistant"
    # side; zip() stops at the shorter slice, dropping an unpaired tail.
    pairs = []
    for human, assistant in zip(kept_texts[0::2], kept_texts[1::2]):
        pairs.append('### Human: {}\n### Assistant: {}'.format(human, assistant))
    return '\n'.join(pairs)


 def process_file(args):
    """Extract Q&A text from one HTML file and append it to the folder's answers file.

    Args:
        args: A ``(root, file, subfolder_name, min_score)`` tuple. The values
            are packed into one argument so the function can be mapped over
            by a multiprocessing pool.

    Returns:
        ``(text, None)`` when the file was extracted (the text is returned
        even if appending it to the answers file failed),
        ``(None, error_message)`` when the HTML file could not be read or
        decoded, and ``(None, None)`` for ``.txt`` / ``.json`` files, which
        are skipped.
    """
    root, file, subfolder_name, min_score = args

    # Text and JSON files are not HTML inputs (the answers file written below
    # is itself a .txt in the same folder), so they are skipped.
    if file.endswith((".txt", ".json")):
        return None, None

    html_file = os.path.join(root, file)
    try:
        text = extract_text_from_html(html_file, min_score)
    except (UnicodeError, OSError) as e:
        # Reading a file as UTF-8 fails with UnicodeDecodeError, not
        # UnicodeEncodeError; catching the common base class (plus OSError for
        # unreadable files) reports the file instead of aborting the whole run.
        error_message = "Error processing file: {}. {}: {}".format(html_file, type(e).__name__, e)
        return None, error_message

    output_file_path = os.path.join(root, "Stack-{}-answers.txt".format(subfolder_name))
    try:
        with open(output_file_path, 'a', encoding='utf-8') as output_file:
            output_file.write(text)
            if text:
                # Terminate the record so the next file's first "### Human"
                # line does not run into this file's last answer.
                output_file.write("\n")
    except Exception as e:
        # Best effort: a failed append is reported but does not discard the text.
        print(f"Error writing to file {output_file_path}: {e}")

    return text, None



 def count_files_in_folder(questions_folder):
    """Collect every file under ``questions_folder`` that should be processed.

    Directories whose path contains the substring ``'tagged'`` contribute no
    files, and files ending in ``.txt`` or ``.json`` are ignored.

    Args:
        questions_folder: Root directory to walk.

    Returns:
        ``(count, file_info_list)`` where ``file_info_list`` holds one
        ``(root, file, subfolder_name)`` tuple per eligible file
        (``subfolder_name`` is the basename of ``root``) and ``count`` is its
        length.
    """
    ignored_suffixes = (".txt", ".json")
    file_info_list = []
    for root, _dirs, names in os.walk(questions_folder):
        if 'tagged' not in root:
            subfolder_name = os.path.basename(root)
            file_info_list.extend(
                (root, name, subfolder_name)
                for name in names
                if not name.endswith(ignored_suffixes)
            )
    return len(file_info_list), file_info_list

 def process_files_in_questions_folder(folder, min_score, release, split_ratio):
    """Process every eligible file under ``folder`` in parallel.

    Each file is handed to ``process_file`` through a multiprocessing pool
    while a progress bar is drawn on stdout. Error messages returned by the
    workers are written to ``unprocessed-questions.txt`` in the current
    directory.

    Args:
        folder: Root directory passed to ``count_files_in_folder``.
        min_score: Minimum score forwarded to every ``process_file`` call.
        release: When true, shuffle the extracted texts and write
            ``RELEASE/final-training.json`` and ``RELEASE/final-testing.json``.
        split_ratio: Fraction of the shuffled texts that goes to the training
            file; the remainder goes to the testing file.
    """
    total_files, file_info_list = count_files_in_folder(folder)

    print('{}Total files to process: {}'.format(Fore.GREEN, total_files))

    tasks = [(root, file, subfolder_name, min_score)
             for root, file, subfolder_name in file_info_list]

    error_messages = []
    qna_pairs = []
    # The context manager tears the workers down even when a worker exception
    # propagates out of the result loop, so no processes are left behind.
    with Pool() as pool:
        results = pool.imap_unordered(process_file, tasks)
        for processed_files, (result, error) in enumerate(results, start=1):
            percentage_completed = (processed_files / float(total_files)) * 100
            sys.stdout.write("\rProcessing [{}] {:.2f}% completed.".format('=' * int(percentage_completed / 2), percentage_completed))
            sys.stdout.flush()

            if result:
                qna_pairs.append(result)
            if error:
                error_messages.append(error)

        # Normal path: let the workers finish cleanly before the pool exits.
        pool.close()
        pool.join()

    sys.stdout.write("\nProcessing completed.\n")
    sys.stdout.flush()

    if release:
        random.shuffle(qna_pairs)
        train_size = int(len(qna_pairs) * split_ratio)
        train_set, test_set = qna_pairs[:train_size], qna_pairs[train_size:]

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('RELEASE', exist_ok=True)

        with open(os.path.join('RELEASE', 'final-training.json'), 'w', encoding='utf-8') as f:
            json.dump(train_set, f)

        with open(os.path.join('RELEASE', 'final-testing.json'), 'w', encoding='utf-8') as f:
            json.dump(test_set, f)

    # Error messages embed file paths, which may contain non-ASCII characters;
    # an explicit encoding keeps the write independent of the platform default.
    with open("unprocessed-questions.txt", "w", encoding='utf-8') as f:
        for error in error_messages:
            f.write(error + "\n")

 def parse_arguments():
    """Parse the script's command-line options from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with ``folder`` (str, default
        ``'questions'``), ``min_score`` (int, default 0), ``release`` (bool,
        default False) and ``split_ratio`` (float in [0, 1], default 0.8).

    Raises:
        SystemExit: Raised by argparse on invalid options, including a
            ``--split_ratio`` outside the range [0, 1].
    """
    def split_ratio_type(value):
        # A ratio outside [0, 1] would silently produce an empty or oddly
        # sliced train/test split, so it is rejected at parse time.
        try:
            ratio = float(value)
        except ValueError:
            raise argparse.ArgumentTypeError("invalid float value: {!r}".format(value))
        if not 0.0 <= ratio <= 1.0:
            raise argparse.ArgumentTypeError("split ratio must be between 0 and 1, got {!r}".format(value))
        return ratio

    parser = argparse.ArgumentParser(description='Process StackExchange data for QLoRA (Quantized Low-Rank Adaptation) fine-tuning.')
    parser.add_argument('--folder', default='questions', help='Folder to process containing HTML files.')
    parser.add_argument('--min_score', type=int, default=0, help='Minimum score to include an answer.')
    parser.add_argument('--release', action='store_true', help='Create final training and testing files.')
    parser.add_argument('--split_ratio', type=split_ratio_type, default=0.8, help='Split ratio for training and testing data.')
    return parser.parse_args()

 # Script entry point: parse the CLI options and run the whole pipeline.
 if __name__ == "__main__":
    args = parse_arguments()
    process_files_in_questions_folder(
        folder=args.folder,
        min_score=args.min_score,
        release=args.release,
        split_ratio=args.split_ratio,
    )
	import argparse
	import os
	import sys
	import random
	import codecs
	import json
	from bs4 import BeautifulSoup
	from multiprocessing import Pool
	import colorama
	from colorama import Fore
	import logging

	# Initialise colorama once at import time, before any colored output is printed.
	colorama.init()

	# Configure logging for the whole script: DEBUG threshold, each record
	# rendered as "<timestamp> - <level name> - <message>".
	logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

	def extract_text_from_html(html_file, min_score):
	    """Build "### Human / ### Assistant" text pairs from one HTML file.

	    The file is read as UTF-8 and parsed with ``html.parser``. Every
	    ``<div itemprop="text">`` whose parent element has a ``data-score``
	    (treated as 0 when the attribute is absent) of at least ``min_score`` is
	    kept, and its stripped text is collected in document order.

	    Args:
	        html_file: Path of the HTML file to read.
	        min_score: Minimum integer score a div's parent must carry.

	    Returns:
	        The pairs joined with newlines; an empty string when fewer than two
	        texts survive the filter. A trailing unpaired text is dropped.

	    Raises:
	        ValueError: If a ``data-score`` attribute is not an integer string.

	    NOTE(review): pairs are taken from consecutive surviving divs, i.e.
	    (0, 1), (2, 3), ... If a page holds one question followed by several
	    answers, later answers would be emitted as "### Human" -- confirm this
	    is the intended pairing.
	    """
	    with open(html_file, 'r', encoding='utf-8') as handle:
	        document = BeautifulSoup(handle, 'html.parser')

	    # Keep only the text blocks whose parent meets the score threshold.
	    kept_texts = []
	    for node in document.find_all('div', attrs={'itemprop': 'text'}):
	        if int(node.parent.get('data-score', 0)) >= min_score:
	            kept_texts.append(node.text.strip())

	    # Even positions become the "Human" side, odd positions the "Assistant"
	    # side; zip() stops at the shorter slice, dropping an unpaired tail.
	    pairs = []
	    for human, assistant in zip(kept_texts[0::2], kept_texts[1::2]):
	        pairs.append('### Human: {}\n### Assistant: {}'.format(human, assistant))
	    return '\n'.join(pairs)


	def process_file(args):
	    """Extract Q&A text from one HTML file and append it to the folder's answers file.

	    Args:
	        args: A ``(root, file, subfolder_name, min_score)`` tuple. The values
	            are packed into one argument so the function can be mapped over
	            by a multiprocessing pool.

	    Returns:
	        ``(text, None)`` when the file was extracted (the text is returned
	        even if appending it to the answers file failed),
	        ``(None, error_message)`` when the HTML file could not be read or
	        decoded, and ``(None, None)`` for ``.txt`` / ``.json`` files, which
	        are skipped.
	    """
	    root, file, subfolder_name, min_score = args

	    # Text and JSON files are not HTML inputs (the answers file written below
	    # is itself a .txt in the same folder), so they are skipped.
	    if file.endswith((".txt", ".json")):
	        return None, None

	    html_file = os.path.join(root, file)
	    try:
	        text = extract_text_from_html(html_file, min_score)
	    except (UnicodeError, OSError) as e:
	        # Reading a file as UTF-8 fails with UnicodeDecodeError, not
	        # UnicodeEncodeError; catching the common base class (plus OSError for
	        # unreadable files) reports the file instead of aborting the whole run.
	        error_message = "Error processing file: {}. {}: {}".format(html_file, type(e).__name__, e)
	        return None, error_message

	    output_file_path = os.path.join(root, "Stack-{}-answers.txt".format(subfolder_name))
	    try:
	        with open(output_file_path, 'a', encoding='utf-8') as output_file:
	            output_file.write(text)
	            if text:
	                # Terminate the record so the next file's first "### Human"
	                # line does not run into this file's last answer.
	                output_file.write("\n")
	    except Exception as e:
	        # Best effort: a failed append is reported but does not discard the text.
	        print(f"Error writing to file {output_file_path}: {e}")

	    return text, None



	def count_files_in_folder(questions_folder):
	    """Collect every file under ``questions_folder`` that should be processed.

	    Directories whose path contains the substring ``'tagged'`` contribute no
	    files, and files ending in ``.txt`` or ``.json`` are ignored.

	    Args:
	        questions_folder: Root directory to walk.

	    Returns:
	        ``(count, file_info_list)`` where ``file_info_list`` holds one
	        ``(root, file, subfolder_name)`` tuple per eligible file
	        (``subfolder_name`` is the basename of ``root``) and ``count`` is its
	        length.
	    """
	    ignored_suffixes = (".txt", ".json")
	    file_info_list = []
	    for root, _dirs, names in os.walk(questions_folder):
	        if 'tagged' not in root:
	            subfolder_name = os.path.basename(root)
	            file_info_list.extend(
	                (root, name, subfolder_name)
	                for name in names
	                if not name.endswith(ignored_suffixes)
	            )
	    return len(file_info_list), file_info_list

	def process_files_in_questions_folder(folder, min_score, release, split_ratio):
	    """Process every eligible file under ``folder`` in parallel.

	    Each file is handed to ``process_file`` through a multiprocessing pool
	    while a progress bar is drawn on stdout. Error messages returned by the
	    workers are written to ``unprocessed-questions.txt`` in the current
	    directory.

	    Args:
	        folder: Root directory passed to ``count_files_in_folder``.
	        min_score: Minimum score forwarded to every ``process_file`` call.
	        release: When true, shuffle the extracted texts and write
	            ``RELEASE/final-training.json`` and ``RELEASE/final-testing.json``.
	        split_ratio: Fraction of the shuffled texts that goes to the training
	            file; the remainder goes to the testing file.
	    """
	    total_files, file_info_list = count_files_in_folder(folder)

	    print('{}Total files to process: {}'.format(Fore.GREEN, total_files))

	    tasks = [(root, file, subfolder_name, min_score)
	             for root, file, subfolder_name in file_info_list]

	    error_messages = []
	    qna_pairs = []
	    # The context manager tears the workers down even when a worker exception
	    # propagates out of the result loop, so no processes are left behind.
	    with Pool() as pool:
	        results = pool.imap_unordered(process_file, tasks)
	        for processed_files, (result, error) in enumerate(results, start=1):
	            percentage_completed = (processed_files / float(total_files)) * 100
	            sys.stdout.write("\rProcessing [{}] {:.2f}% completed.".format('=' * int(percentage_completed / 2), percentage_completed))
	            sys.stdout.flush()

	            if result:
	                qna_pairs.append(result)
	            if error:
	                error_messages.append(error)

	        # Normal path: let the workers finish cleanly before the pool exits.
	        pool.close()
	        pool.join()

	    sys.stdout.write("\nProcessing completed.\n")
	    sys.stdout.flush()

	    if release:
	        random.shuffle(qna_pairs)
	        train_size = int(len(qna_pairs) * split_ratio)
	        train_set, test_set = qna_pairs[:train_size], qna_pairs[train_size:]

	        # exist_ok avoids the check-then-create race of exists() + mkdir().
	        os.makedirs('RELEASE', exist_ok=True)

	        with open(os.path.join('RELEASE', 'final-training.json'), 'w', encoding='utf-8') as f:
	            json.dump(train_set, f)

	        with open(os.path.join('RELEASE', 'final-testing.json'), 'w', encoding='utf-8') as f:
	            json.dump(test_set, f)

	    # Error messages embed file paths, which may contain non-ASCII characters;
	    # an explicit encoding keeps the write independent of the platform default.
	    with open("unprocessed-questions.txt", "w", encoding='utf-8') as f:
	        for error in error_messages:
	            f.write(error + "\n")

	def parse_arguments():
	    """Parse the script's command-line options from ``sys.argv``.

	    Returns:
	        An ``argparse.Namespace`` with ``folder`` (str, default
	        ``'questions'``), ``min_score`` (int, default 0), ``release`` (bool,
	        default False) and ``split_ratio`` (float in [0, 1], default 0.8).

	    Raises:
	        SystemExit: Raised by argparse on invalid options, including a
	            ``--split_ratio`` outside the range [0, 1].
	    """
	    def split_ratio_type(value):
	        # A ratio outside [0, 1] would silently produce an empty or oddly
	        # sliced train/test split, so it is rejected at parse time.
	        try:
	            ratio = float(value)
	        except ValueError:
	            raise argparse.ArgumentTypeError("invalid float value: {!r}".format(value))
	        if not 0.0 <= ratio <= 1.0:
	            raise argparse.ArgumentTypeError("split ratio must be between 0 and 1, got {!r}".format(value))
	        return ratio

	    parser = argparse.ArgumentParser(description='Process StackExchange data for QLoRA (Quantized Low-Rank Adaptation) fine-tuning.')
	    parser.add_argument('--folder', default='questions', help='Folder to process containing HTML files.')
	    parser.add_argument('--min_score', type=int, default=0, help='Minimum score to include an answer.')
	    parser.add_argument('--release', action='store_true', help='Create final training and testing files.')
	    parser.add_argument('--split_ratio', type=split_ratio_type, default=0.8, help='Split ratio for training and testing data.')
	    return parser.parse_args()

	# Script entry point: parse the CLI options and run the whole pipeline.
	if __name__ == "__main__":
	    args = parse_arguments()
	    process_files_in_questions_folder(
	        folder=args.folder,
	        min_score=args.min_score,
	        release=args.release,
	        split_ratio=args.split_ratio,
	    )