Created
July 21, 2023 20:23
-
-
Save twobob/a869fbd5aa0c9f253c8a2bc453ab0743 to your computer and use it in GitHub Desktop.
QLoRA training data from Stack Exchange ZIM exports
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- Standard library ---
import argparse
import codecs
import json
import logging
import os
import random
import sys
from multiprocessing import Pool

# --- Third-party ---
import colorama
from bs4 import BeautifulSoup
from colorama import Fore

# Initialise colorama at import time, before any coloured output is printed.
colorama.init()

# Root logger: DEBUG level, lines shaped like "timestamp - LEVEL - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
def extract_text_from_html(html_file, min_score):
    """Build "### Human / ### Assistant" text from the posts of one HTML page.

    Every ``<div itemprop="text">`` whose parent element carries a
    ``data-score`` of at least *min_score* is kept; a missing, empty or
    non-numeric score counts as 0.  The surviving texts are paired in
    document order (1st with 2nd, 3rd with 4th, ...) and a trailing unpaired
    text is dropped.

    Args:
        html_file: Path of the HTML file to read (decoded as UTF-8).
        min_score: Minimum ``data-score`` a post needs in order to be kept.

    Returns:
        The formatted pairs joined by newlines, or ``''`` when fewer than two
        texts survive the filter.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(html_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    def score_of(div):
        # A blank or malformed data-score must not abort the whole file, so
        # it is treated exactly like a missing attribute (score 0).
        parent = div.parent
        raw = parent.get('data-score', 0) if parent is not None else 0
        try:
            return int(raw)
        except (TypeError, ValueError):
            return 0

    texts = [
        div.text.strip()
        for div in soup.find_all('div', attrs={'itemprop': 'text'})
        if score_of(div) >= min_score
    ]

    # NOTE(review): pairing is purely positional after filtering; presumably
    # the first text is the question and the second an answer -- confirm
    # against the page layout, since filtering can shift the pairs.
    return '\n'.join(
        '### Human: {}\n### Assistant: {}'.format(human, assistant)
        for human, assistant in zip(texts[::2], texts[1::2])
    )
def process_file(args):
    """Convert one HTML file to Q&A text and append it to its folder's output.

    Takes a single tuple so it can be used directly as a
    ``multiprocessing.Pool`` worker.

    Args:
        args: ``(root, file, subfolder_name, min_score)`` -- the directory
            holding the file, the file name, the label used in the output
            file name (``Stack-<label>-answers.txt`` inside *root*), and the
            minimum score forwarded to ``extract_text_from_html``.

    Returns:
        A ``(text, error_message)`` tuple:
        ``(None, None)`` for skipped ``.txt`` / ``.json`` files,
        ``(None, message)`` when the file could not be read or parsed,
        ``(text, None)`` otherwise (a failed write is only reported on
        stdout and does not change the return value).
    """
    root, file, subfolder_name, min_score = args

    # Previously generated outputs and JSON files are never inputs.
    if file.endswith((".txt", ".json")):
        return None, None

    html_file = os.path.join(root, file)
    try:
        text = extract_text_from_html(html_file, min_score)
    except (OSError, ValueError) as e:
        # ValueError also covers UnicodeDecodeError, which is what reading a
        # badly encoded file actually raises (not UnicodeEncodeError).
        error_message = "Error processing file: {}. {}: {}".format(
            html_file, type(e).__name__, e
        )
        return None, error_message

    if text:
        output_file_path = os.path.join(root, "Stack-{}-answers.txt".format(subfolder_name))
        try:
            with open(output_file_path, 'a', encoding='utf-8') as output_file:
                # Terminate each record so text appended for the next file
                # does not run into this one.
                output_file.write(text + "\n")
        except (OSError, ValueError) as e:
            print(f"Error writing to file {output_file_path}: {e}")
    return text, None
def count_files_in_folder(questions_folder):
    """Collect every candidate input file beneath *questions_folder*.

    Directories whose path contains the substring ``tagged`` are ignored, and
    so are files whose name ends in ``.txt`` or ``.json``.

    Args:
        questions_folder: Root directory to walk.

    Returns:
        ``(count, entries)`` where *entries* is a list of
        ``(directory, file_name, directory_base_name)`` tuples in walk order
        and *count* is its length.
    """
    skipped_suffixes = (".txt", ".json")
    entries = []
    for directory, _subdirs, names in os.walk(questions_folder):
        if 'tagged' not in directory:
            label = os.path.basename(directory)
            entries.extend(
                (directory, name, label)
                for name in names
                if not name.endswith(skipped_suffixes)
            )
    return len(entries), entries
def process_files_in_questions_folder(folder, min_score, release, split_ratio):
    """Process every input file under *folder* in parallel and save the results.

    Files are discovered with ``count_files_in_folder`` and handed to
    ``process_file`` through a process pool while a progress bar is drawn on
    stdout.  Error messages returned by the workers are written to
    ``unprocessed-questions.txt`` in the current directory (the file is
    rewritten on every run, even when there are no errors).

    Args:
        folder: Root directory containing the HTML files.
        min_score: Minimum score forwarded to each worker.
        release: When true, shuffle the collected texts and write
            ``RELEASE/final-training.json`` and ``RELEASE/final-testing.json``.
        split_ratio: Fraction of the collected texts that goes into the
            training file; the remainder goes into the testing file.
    """
    total_files, file_info_list = count_files_in_folder(folder)
    print('{}Total files to process: {}'.format(Fore.GREEN, total_files))

    jobs = [
        (root, file, subfolder_name, min_score)
        for root, file, subfolder_name in file_info_list
    ]
    error_messages = []
    qna_pairs = []

    # The context manager terminates the workers if anything below raises,
    # so a failure in the loop no longer leaves child processes behind.
    with Pool() as pool:
        results = pool.imap_unordered(process_file, jobs)
        for processed_files, (result, error) in enumerate(results, start=1):
            # total_files >= 1 here: the loop body only runs when there are jobs.
            percentage_completed = processed_files / total_files * 100
            sys.stdout.write("\rProcessing [{}] {:.2f}% completed.".format(
                '=' * int(percentage_completed / 2), percentage_completed))
            sys.stdout.flush()
            if result:
                qna_pairs.append(result)
            if error:
                error_messages.append(error)
        # Normal path: let the workers finish and exit cleanly.
        pool.close()
        pool.join()

    sys.stdout.write("\nProcessing completed.\n")
    sys.stdout.flush()

    if release:
        random.shuffle(qna_pairs)
        train_size = int(len(qna_pairs) * split_ratio)
        train_set, test_set = qna_pairs[:train_size], qna_pairs[train_size:]
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('RELEASE', exist_ok=True)
        with open(os.path.join('RELEASE', 'final-training.json'), 'w', encoding='utf-8') as f:
            json.dump(train_set, f)
        with open(os.path.join('RELEASE', 'final-testing.json'), 'w', encoding='utf-8') as f:
            json.dump(test_set, f)

    # Explicit UTF-8: the messages embed file paths and exception text, which
    # may contain characters the platform default encoding cannot represent.
    with open("unprocessed-questions.txt", "w", encoding='utf-8') as f:
        f.writelines(error + "\n" for error in error_messages)
def parse_arguments(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Optional list of argument strings.  ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.

    Returns:
        An ``argparse.Namespace`` with ``folder`` (str), ``min_score`` (int),
        ``release`` (bool) and ``split_ratio`` (float in [0, 1]).

    Raises:
        SystemExit: Raised by argparse on invalid options, including a
            ``--split_ratio`` that is not a number between 0 and 1.
    """
    def ratio(value):
        # Reject values that would silently give a nonsensical split
        # (a negative ratio slices from the end, > 1 leaves no test data).
        try:
            number = float(value)
        except ValueError:
            raise argparse.ArgumentTypeError(
                "invalid ratio: {!r} is not a number".format(value))
        if not 0.0 <= number <= 1.0:
            raise argparse.ArgumentTypeError(
                "invalid ratio: {!r} is not between 0 and 1".format(value))
        return number

    parser = argparse.ArgumentParser(
        description='Process StackExchange data for QLoRA '
                    '(Quantized Low-Rank Adaptation) fine-tuning.')
    parser.add_argument('--folder', default='questions',
                        help='Folder to process containing HTML files.')
    parser.add_argument('--min_score', type=int, default=0,
                        help='Minimum score to include an answer.')
    parser.add_argument('--release', action='store_true',
                        help='Create final training and testing files.')
    parser.add_argument('--split_ratio', type=ratio, default=0.8,
                        help='Split ratio for training and testing data.')
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: read the CLI options and run the pipeline with them.
    cli_options = parse_arguments()
    process_files_in_questions_folder(
        folder=cli_options.folder,
        min_score=cli_options.min_score,
        release=cli_options.release,
        split_ratio=cli_options.split_ratio,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment