Skip to content

Instantly share code, notes, and snippets.

@twobob
Created July 21, 2023 20:23
Show Gist options
  • Save twobob/a869fbd5aa0c9f253c8a2bc453ab0743 to your computer and use it in GitHub Desktop.
Save twobob/a869fbd5aa0c9f253c8a2bc453ab0743 to your computer and use it in GitHub Desktop.
QLoRA from Stack Exchange ZIM exports
import argparse
import os
import sys
import random
import codecs
import json
from bs4 import BeautifulSoup
from multiprocessing import Pool
import colorama
from colorama import Fore
import logging
# Initialise colorama at import time, before any Fore.* colour code is printed
# (presumably so the escape sequences also render on Windows consoles --
# confirm against the colorama documentation).
colorama.init()
# Setup logging configuration: root logger at DEBUG level, each record
# formatted as "<timestamp> - <level> - <message>".
# NOTE(review): the functions visible in this file report through print() and
# sys.stdout rather than the logging module.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
def _post_score(div):
    """Return the integer ``data-score`` of *div*'s parent element.

    A missing, empty or non-numeric attribute counts as 0 so that a single
    malformed page cannot abort the whole run with a ValueError.
    """
    parent = div.parent
    raw_score = parent.get('data-score', 0) if parent is not None else 0
    try:
        return int(raw_score)
    except (TypeError, ValueError):
        return 0


def extract_text_from_html(html_file, min_score):
    """Extract Human/Assistant text pairs from one exported HTML page.

    Every ``<div itemprop="text">`` whose parent element has a ``data-score``
    of at least ``min_score`` is collected in document order.  The surviving
    texts are then paired off two at a time (1st with 2nd, 3rd with 4th, ...);
    a trailing unpaired text is dropped.

    Args:
        html_file: Path of the HTML file to parse; it is read as UTF-8.
        min_score: Minimum ``data-score`` a post needs in order to be kept.

    Returns:
        One string in which each pair is rendered as a ``### Human: <text>``
        line followed by a ``### Assistant: <text>`` line, pairs separated by
        newlines.  Empty when fewer than two texts survive the filter.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(html_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    divs = soup.find_all('div', attrs={'itemprop': 'text'})
    # Filter divs by score
    texts = [div.text.strip() for div in divs if _post_score(div) >= min_score]

    # NOTE(review): texts are paired positionally.  If the first div is the
    # question and the rest are answers, every pair after the first joins two
    # answers, and filtering out the question shifts the pairing -- confirm
    # this is the intended dataset layout.
    return '\n'.join(
        '### Human: {}\n### Assistant: {}'.format(human, assistant)
        for human, assistant in zip(texts[0::2], texts[1::2])
    )
def process_file(args):
    """Convert one HTML file and append the result to its folder's answers file.

    Designed as a ``multiprocessing`` worker, hence the single tuple argument.

    Args:
        args: Tuple ``(root, file, subfolder_name, min_score)`` where ``root``
            is the directory holding ``file``, ``subfolder_name`` names the
            output file (``Stack-<subfolder_name>-answers.txt`` inside
            ``root``) and ``min_score`` is forwarded to
            ``extract_text_from_html``.

    Returns:
        ``(text, None)`` on success, ``(None, error_message)`` when the file
        could not be read or decoded, and ``(None, None)`` for ``.txt`` /
        ``.json`` files, which are skipped (this includes the answers files
        this function itself produces).
    """
    root, file, subfolder_name, min_score = args
    if file.endswith((".txt", ".json")):
        return None, None

    html_file = os.path.join(root, file)
    output_file_path = os.path.join(root, "Stack-{}-answers.txt".format(subfolder_name))

    try:
        text = extract_text_from_html(html_file, min_score)
    except (UnicodeError, OSError) as e:
        # Reading raises UnicodeDecodeError (not UnicodeEncodeError) on bad
        # bytes; catch the common base class plus I/O failures so one
        # unreadable file is reported instead of aborting the whole pool.
        error_message = "Error processing file: {}. {}: {}".format(html_file, type(e).__name__, e)
        return None, error_message

    try:
        # NOTE: several workers may append to the same file concurrently;
        # each worker's content is written with a single write call.
        with open(output_file_path, 'a', encoding='utf-8') as output_file:
            # Terminate each file's contribution with a newline so the last
            # answer of one file is not fused with the next "### Human:" line.
            output_file.write(text + "\n" if text else text)
    except (OSError, UnicodeError) as e:
        # Best effort: the text is still returned to the caller.
        print(f"Error writing to file {output_file_path}: {e}")
    return text, None
def count_files_in_folder(questions_folder):
    """Collect every file below ``questions_folder`` that should be processed.

    Files ending in ``.txt`` or ``.json`` are ignored, and so is every
    subdirectory (with everything beneath it) whose name contains ``tagged``.
    Only directories *below* ``questions_folder`` are tested, so a base path
    that itself happens to contain "tagged" no longer hides all files.

    Args:
        questions_folder: Root directory to walk.

    Returns:
        ``(count, file_info_list)`` where ``file_info_list`` holds one
        ``(root, file, subfolder_name)`` tuple per file and ``count`` is its
        length.  ``subfolder_name`` is the last path component of ``root``.
    """
    file_info_list = []
    for root, dirs, files in os.walk(questions_folder):
        # Prune in place so os.walk never descends into "tagged" subtrees.
        dirs[:] = [name for name in dirs if 'tagged' not in name]
        # normpath strips a trailing separator, which would otherwise make
        # basename() return an empty string for the top-level folder.
        subfolder_name = os.path.basename(os.path.normpath(root))
        file_info_list.extend(
            (root, file, subfolder_name)
            for file in files
            if not file.endswith((".txt", ".json"))
        )
    return len(file_info_list), file_info_list
def process_files_in_questions_folder(folder, min_score, release, split_ratio):
    """Process every eligible file under ``folder`` in parallel.

    Each file is handed to ``process_file`` in a worker pool while a progress
    bar is drawn on stdout.  Error messages returned by the workers are
    written to ``unprocessed-questions.txt`` in the current directory.

    Args:
        folder: Root directory passed to ``count_files_in_folder``.
        min_score: Minimum score forwarded to every ``process_file`` call.
        release: When true, shuffle the collected texts and write
            ``RELEASE/final-training.json`` and ``RELEASE/final-testing.json``.
        split_ratio: Fraction of the collected texts that goes into the
            training file; the rest goes into the testing file.
    """
    total_files, file_info_list = count_files_in_folder(folder)
    processed_files = 0
    # Reset the colour afterwards so later terminal output is not left green.
    print('{}Total files to process: {}{}'.format(Fore.GREEN, total_files, Fore.RESET))

    error_messages = []
    qna_pairs = []
    tasks = (
        (root, file, subfolder_name, min_score)
        for root, file, subfolder_name in file_info_list
    )

    # The context manager terminates the workers if anything below raises;
    # on the normal path close()/join() lets them finish cleanly first.
    with Pool() as pool:
        for result, error in pool.imap_unordered(process_file, tasks):
            processed_files += 1
            percentage_completed = (processed_files / float(total_files)) * 100
            sys.stdout.write("\rProcessing [{}] {:.2f}% completed.".format('=' * int(percentage_completed / 2), percentage_completed))
            sys.stdout.flush()
            if result:
                qna_pairs.append(result)
            if error:
                error_messages.append(error)
        pool.close()
        pool.join()

    sys.stdout.write("\nProcessing completed.\n")
    sys.stdout.flush()

    if release:
        random.shuffle(qna_pairs)
        # Clamp so an out-of-range ratio cannot yield a negative slice index.
        train_size = min(max(int(len(qna_pairs) * split_ratio), 0), len(qna_pairs))
        train_set, test_set = qna_pairs[:train_size], qna_pairs[train_size:]
        os.makedirs('RELEASE', exist_ok=True)
        with open(os.path.join('RELEASE', 'final-training.json'), 'w', encoding='utf-8') as f:
            json.dump(train_set, f)
        with open(os.path.join('RELEASE', 'final-testing.json'), 'w', encoding='utf-8') as f:
            json.dump(test_set, f)

    # Error messages embed file paths, which may contain non-ASCII characters,
    # so do not rely on the platform's default encoding.
    with open("unprocessed-questions.txt", "w", encoding="utf-8") as f:
        f.writelines(error + "\n" for error in error_messages)
def _split_ratio(value):
    """argparse ``type=`` callable that accepts only floats within [0, 1]."""
    try:
        ratio = float(value)
    except ValueError:
        raise argparse.ArgumentTypeError("invalid float value: {!r}".format(value))
    if not 0.0 <= ratio <= 1.0:
        raise argparse.ArgumentTypeError("split ratio must be between 0 and 1, got {}".format(value))
    return ratio


def parse_arguments(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Optional list of argument strings.  ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.

    Returns:
        An ``argparse.Namespace`` with ``folder`` (str, default
        ``'questions'``), ``min_score`` (int, default 0), ``release`` (bool,
        default False) and ``split_ratio`` (float in [0, 1], default 0.8).
        Invalid options make argparse print a usage error and exit.
    """
    parser = argparse.ArgumentParser(description='Process StackExchange data into Human/Assistant text for QLoRA (Quantized Low-Rank Adaptation) fine-tuning.')
    parser.add_argument('--folder', default='questions', help='Folder to process containing HTML files.')
    parser.add_argument('--min_score', type=int, default=0, help='Minimum score to include an answer.')
    parser.add_argument('--release', action='store_true', help='Create final training and testing files.')
    parser.add_argument('--split_ratio', type=_split_ratio, default=0.8, help='Split ratio for training and testing data.')
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: read the command-line options, then run the whole
    # extraction over the requested folder.
    cli_options = parse_arguments()
    process_files_in_questions_folder(
        folder=cli_options.folder,
        min_score=cli_options.min_score,
        release=cli_options.release,
        split_ratio=cli_options.split_ratio,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment