M49ICKPIxi3 · October 10, 2021 20:40
diff --git a/fine_tuned_general_QA.py b/fine_tuned_general_QA.py
 import requests
 from bs4 import BeautifulSoup

 from helpers.simple_mongo import SimpleMongo
 from stackapi import StackAPI

 import jsonlines
 import openai
 import os

 """
 {
    'tags': ['formatting', 'scientific-publishing', 'synopsis'], 
    'owner': {
        'reputation': 11, 
 del     'user_id': 51501, 
        'user_type': 'registered', 
 del     'profile_image': 'https://www.gravatar.com/avatar/e84010a735486fd0d367e8160fa41c6c?s=128&d=identicon&r=PG&f=1', 
 del     'display_name': 'Arpita ', 
 chg     'link': 'https://writing.stackexchange.com/users/51501/arpita'  // _id
    }, 
    'is_answered': False, 
    'view_count': 10, 
    'answer_count': 1, 
    'score': 0, 
    'last_activity_date': 1633766298, /// arrow.get('')
    'creation_date': 1633758860, /// arrow.get('')
    'question_id': 59252, /// _id
    'content_license': 'CC BY-SA 4.0', /// don't need lol (of course you do for prod apps, this ain't prod)
    'link': 'https://writing.stackexchange.com/questions/59252/how-do-i-write-a-synopsis-for-a-scientific-article-publication', 
    'title': 'How do I write a synopsis for a scientific article? publication?'
 }

 """

 """ the response for the fetch "questions" has these fields:
 'backoff' = {int} 0
 'has_more' = {bool} True
 'page' = {int} 1
 'quota_max' = {int} 300
 'quota_remaining' = {int} 298
 'total' = {int} 0
 """

 def process_question(question):
    """Scrape one Stack Exchange question page and return its Q&A text.

    Args:
        question: Question record taken from the ``items`` list of a
            StackAPI ``questions`` fetch. Only its ``link`` key is read here.

    Returns:
        A shallow copy of ``question`` with two extra keys: ``question``
        (text of the question body) and ``answers`` (list of answer body
        texts, in the order they appear on the page).

    Raises:
        requests.RequestException: If the page cannot be fetched, times out,
            or comes back with an HTTP error status.
        ValueError: If the page contains no recognisable question body.
    """
    question_link = question['link']
    print(f'Processing {question_link} ...')

    # A timeout keeps one stalled connection from hanging the whole crawl.
    se_page_response = requests.get(question_link, timeout=30)
    # Fail loudly on error pages instead of parsing them as if they were
    # the question page.
    se_page_response.raise_for_status()
    soup = BeautifulSoup(se_page_response.text, 'html.parser')

    # Question and answer bodies share the same container class.
    post_body_attrs = {'class': 's-prose js-post-body'}

    question_ele = soup.find('div', attrs={'class': 'question'})
    question_content_ele = None
    if question_ele is not None:
        question_content_ele = question_ele.find('div', attrs=post_body_attrs)
    if question_content_ele is None:
        raise ValueError(f'No question body found at {question_link}')

    se_page_data = dict(question)
    se_page_data['question'] = question_content_ele.text

    answers = []
    for answer in soup.find_all('div', attrs={'class': 'answer'}):
        answer_content = answer.find('div', attrs=post_body_attrs)
        # An answer container without a body is skipped rather than
        # discarding the whole page.
        if answer_content is not None:
            answers.append(answer_content.text)

    se_page_data['answers'] = answers

    return se_page_data

 def gather_answered_questions(output_collection):
    """Crawl three Stack Exchange sites and store every answered question.

    Walks the ``questions`` listing of the writing, puzzling and philosophy
    sites in rounds of up to 5 pages x 100 questions per site. Each question
    with at least one answer is scraped with ``process_question`` and
    inserted into ``output_collection``; a failure on a single question is
    printed and skipped. Every round starts where the previous one stopped,
    and a site is dropped once its response reports ``has_more`` as false,
    so the function returns when all listings are exhausted.

    Args:
        output_collection: Object exposing ``insert_one(document=...)``
            (presumably a MongoDB collection -- confirm against SimpleMongo).
    """
    pages_per_round = 5

    site_apis = []
    for site_name in ('writing', 'puzzling', 'philosophy'):
        site_api = StackAPI(site_name)
        site_api.page_size = 100
        site_api.max_pages = pages_per_round
        site_apis.append(site_api)

    first_page = 1
    count_just_cuz = 0
    while site_apis:

        all_qs = []
        sites_with_more = []
        for site_api in site_apis:
            # Pass the start page explicitly: without it every round
            # restarts at page 1 and the same questions are scraped and
            # inserted again.
            response = site_api.fetch('questions', page=first_page)
            all_qs.extend(response['items'])
            if response.get('has_more'):
                sites_with_more.append(site_api)
        site_apis = sites_with_more
        first_page += pages_per_round

        for question in all_qs:
            if question['answer_count'] > 0:
                try:
                    qa_data = process_question(question)
                    output_collection.insert_one(document=qa_data)
                except Exception as e:
                    # Best effort: one bad page must not stop the crawl.
                    print(e.args)

        print(f'Progress {count_just_cuz}!')
        count_just_cuz += 1

 def main():
    """Build a JSONL fine-tuning file from stored Q&A pairs and start a fine-tune.

    Reads every document of the ``writing_qa`` collection, turns each one
    into a prompt/completion record (title + question body -> first answer),
    writes at most ``guess_max`` records to ``OUT_DIR``, uploads that file to
    OpenAI and creates a fine-tune job from it. The API key is read from the
    ``OPENAI_API_KEY`` environment variable.
    """
    openai.api_key = os.getenv('OPENAI_API_KEY')
    OUT_DIR = '.../output/'     # Change this

    filename = 'generalized_question_answerer.jsonl'
    filepath_out = f'{OUT_DIR}{filename}'

    _database = 'question_answering'
    _writing_qa_col_name = 'writing_qa'
    _puzzles_qa_col_name = 'puzzles_qa'

    simple_mongo = SimpleMongo(_database, [_writing_qa_col_name, _puzzles_qa_col_name])
    writing_qa_col = simple_mongo.collections[_writing_qa_col_name]

    # Of course, uncomment this when you want to run it. It can take a long time...
    # gather_answered_questions(writing_qa_col)

    json_records = []
    for answered_question in writing_qa_col.find({}):
        answers = answered_question['answers']
        if not answers:
            # Nothing to use as a completion; skip instead of raising
            # IndexError on answers[0].
            continue

        title = answered_question['title']
        question = answered_question['question'].replace('\n', '')
        prompt = f'{title}\n\n{question}'

        answer_top = answers[0].replace('\n', '')
        record = {
            'rank': answered_question['score'],
            'prompt': f'{prompt}\n\n###\n\n',
            'completion': answer_top
        }
        json_records.append(record)

    # Sort in place: a bare sorted() call returns a new list and leaves
    # json_records untouched.
    # NOTE(review): highest score first is assumed to be the intent, so that
    # the guess_max cap below drops the lowest-ranked records -- confirm.
    json_records.sort(key=lambda x: x['rank'], reverse=True)

    # 'rank' is only a sort key; it must not end up in the training file.
    for json_record in json_records:
        del json_record['rank']

    guess_max = 4500

    with jsonlines.open(filepath_out, mode='w') as writer:
        # Slice up front so that no more than guess_max records are written
        # (checking the index after the write lets one extra record through).
        for i, json_record in enumerate(json_records[:guess_max]):
            try:
                writer.write(json_record)
            except Exception as e:
                print(e)
            if i > 0 and i % 100 == 0:
                print(f'Processed {i} so far...')

    # Close the upload handle as soon as the request has finished.
    with open(filepath_out) as training_file:
        response = openai.File.create(
            file=training_file,
            purpose='fine-tune'
        )

    print(response)

    response = openai.FineTune.create(
        training_file=response['id'],
        n_epochs=4,
        learning_rate_multiplier=0.07,  #  0.01-0.4
        batch_size=40,
        use_packing=True,
        prompt_loss_weight=1.00
    )

    # Print the created job so it can be looked up later.
    print(response)

    


 # Run the pipeline only when this file is executed as a script, not on import.
 if __name__ == '__main__':
    main()
	import requests
	from bs4 import BeautifulSoup

	from helpers.simple_mongo import SimpleMongo
	from stackapi import StackAPI

	import jsonlines
	import openai
	import os

	"""
	{
	'tags': ['formatting', 'scientific-publishing', 'synopsis'],
	'owner': {
	'reputation': 11,
	del 'user_id': 51501,
	'user_type': 'registered',
	del 'profile_image': 'https://www.gravatar.com/avatar/e84010a735486fd0d367e8160fa41c6c?s=128&d=identicon&r=PG&f=1',
	del 'display_name': 'Arpita ',
	chg 'link': 'https://writing.stackexchange.com/users/51501/arpita' // _id
	},
	'is_answered': False,
	'view_count': 10,
	'answer_count': 1,
	'score': 0,
	'last_activity_date': 1633766298, /// arrow.get('')
	'creation_date': 1633758860, /// arrow.get('')
	'question_id': 59252, /// _id
	'content_license': 'CC BY-SA 4.0', /// don't need lol (of course you do for prod apps, this ain't prod)
	'link': 'https://writing.stackexchange.com/questions/59252/how-do-i-write-a-synopsis-for-a-scientific-article-publication',
	'title': 'How do I write a synopsis for a scientific article? publication?'
	}

	"""

	""" the response for the fetch "questions" has these fields:
	'backoff' = {int} 0
	'has_more' = {bool} True
	'page' = {int} 1
	'quota_max' = {int} 300
	'quota_remaining' = {int} 298
	'total' = {int} 0
	"""

	def process_question(question):
	    """Scrape one Stack Exchange question page and return its Q&A text.

	    Args:
	        question: Question record taken from the ``items`` list of a
	            StackAPI ``questions`` fetch. Only its ``link`` key is read here.

	    Returns:
	        A shallow copy of ``question`` with two extra keys: ``question``
	        (text of the question body) and ``answers`` (list of answer body
	        texts, in the order they appear on the page).

	    Raises:
	        requests.RequestException: If the page cannot be fetched, times out,
	            or comes back with an HTTP error status.
	        ValueError: If the page contains no recognisable question body.
	    """
	    question_link = question['link']
	    print(f'Processing {question_link} ...')

	    # A timeout keeps one stalled connection from hanging the whole crawl.
	    se_page_response = requests.get(question_link, timeout=30)
	    # Fail loudly on error pages instead of parsing them as if they were
	    # the question page.
	    se_page_response.raise_for_status()
	    soup = BeautifulSoup(se_page_response.text, 'html.parser')

	    # Question and answer bodies share the same container class.
	    post_body_attrs = {'class': 's-prose js-post-body'}

	    question_ele = soup.find('div', attrs={'class': 'question'})
	    question_content_ele = None
	    if question_ele is not None:
	        question_content_ele = question_ele.find('div', attrs=post_body_attrs)
	    if question_content_ele is None:
	        raise ValueError(f'No question body found at {question_link}')

	    se_page_data = dict(question)
	    se_page_data['question'] = question_content_ele.text

	    answers = []
	    for answer in soup.find_all('div', attrs={'class': 'answer'}):
	        answer_content = answer.find('div', attrs=post_body_attrs)
	        # An answer container without a body is skipped rather than
	        # discarding the whole page.
	        if answer_content is not None:
	            answers.append(answer_content.text)

	    se_page_data['answers'] = answers

	    return se_page_data

	def gather_answered_questions(output_collection):
	    """Crawl three Stack Exchange sites and store every answered question.

	    Walks the ``questions`` listing of the writing, puzzling and philosophy
	    sites in rounds of up to 5 pages x 100 questions per site. Each question
	    with at least one answer is scraped with ``process_question`` and
	    inserted into ``output_collection``; a failure on a single question is
	    printed and skipped. Every round starts where the previous one stopped,
	    and a site is dropped once its response reports ``has_more`` as false,
	    so the function returns when all listings are exhausted.

	    Args:
	        output_collection: Object exposing ``insert_one(document=...)``
	            (presumably a MongoDB collection -- confirm against SimpleMongo).
	    """
	    pages_per_round = 5

	    site_apis = []
	    for site_name in ('writing', 'puzzling', 'philosophy'):
	        site_api = StackAPI(site_name)
	        site_api.page_size = 100
	        site_api.max_pages = pages_per_round
	        site_apis.append(site_api)

	    first_page = 1
	    count_just_cuz = 0
	    while site_apis:

	        all_qs = []
	        sites_with_more = []
	        for site_api in site_apis:
	            # Pass the start page explicitly: without it every round
	            # restarts at page 1 and the same questions are scraped and
	            # inserted again.
	            response = site_api.fetch('questions', page=first_page)
	            all_qs.extend(response['items'])
	            if response.get('has_more'):
	                sites_with_more.append(site_api)
	        site_apis = sites_with_more
	        first_page += pages_per_round

	        for question in all_qs:
	            if question['answer_count'] > 0:
	                try:
	                    qa_data = process_question(question)
	                    output_collection.insert_one(document=qa_data)
	                except Exception as e:
	                    # Best effort: one bad page must not stop the crawl.
	                    print(e.args)

	        print(f'Progress {count_just_cuz}!')
	        count_just_cuz += 1

	def main():
	    """Build a JSONL fine-tuning file from stored Q&A pairs and start a fine-tune.

	    Reads every document of the ``writing_qa`` collection, turns each one
	    into a prompt/completion record (title + question body -> first answer),
	    writes at most ``guess_max`` records to ``OUT_DIR``, uploads that file to
	    OpenAI and creates a fine-tune job from it. The API key is read from the
	    ``OPENAI_API_KEY`` environment variable.
	    """
	    openai.api_key = os.getenv('OPENAI_API_KEY')
	    OUT_DIR = '.../output/' # Change this

	    filename = 'generalized_question_answerer.jsonl'
	    filepath_out = f'{OUT_DIR}{filename}'

	    _database = 'question_answering'
	    _writing_qa_col_name = 'writing_qa'
	    _puzzles_qa_col_name = 'puzzles_qa'

	    simple_mongo = SimpleMongo(_database, [_writing_qa_col_name, _puzzles_qa_col_name])
	    writing_qa_col = simple_mongo.collections[_writing_qa_col_name]

	    # Of course, uncomment this when you want to run it. It can take a long time...
	    # gather_answered_questions(writing_qa_col)

	    json_records = []
	    for answered_question in writing_qa_col.find({}):
	        answers = answered_question['answers']
	        if not answers:
	            # Nothing to use as a completion; skip instead of raising
	            # IndexError on answers[0].
	            continue

	        title = answered_question['title']
	        question = answered_question['question'].replace('\n', '')
	        prompt = f'{title}\n\n{question}'

	        answer_top = answers[0].replace('\n','')
	        record = {
	            'rank': answered_question['score'],
	            'prompt': f'{prompt}\n\n###\n\n',
	            'completion': answer_top
	        }
	        json_records.append(record)

	    # Sort in place: a bare sorted() call returns a new list and leaves
	    # json_records untouched.
	    # NOTE(review): highest score first is assumed to be the intent, so that
	    # the guess_max cap below drops the lowest-ranked records -- confirm.
	    json_records.sort(key=lambda x: x['rank'], reverse=True)

	    # 'rank' is only a sort key; it must not end up in the training file.
	    for json_record in json_records:
	        del json_record['rank']

	    guess_max = 4500

	    with jsonlines.open(filepath_out, mode='w') as writer:
	        # Slice up front so that no more than guess_max records are written
	        # (checking the index after the write lets one extra record through).
	        for i, json_record in enumerate(json_records[:guess_max]):
	            try:
	                writer.write(json_record)
	            except Exception as e:
	                print(e)
	            if i > 0 and i % 100 == 0:
	                print(f'Processed {i} so far...')

	    # Close the upload handle as soon as the request has finished.
	    with open(filepath_out) as training_file:
	        response = openai.File.create(
	            file=training_file,
	            purpose='fine-tune'
	        )

	    print(response)

	    response = openai.FineTune.create(
	        training_file=response['id'],
	        n_epochs=4,
	        learning_rate_multiplier=0.07, # 0.01-0.4
	        batch_size=40,
	        use_packing=True,
	        prompt_loss_weight=1.00
	    )

	    # Print the created job so it can be looked up later.
	    print(response)




	# Run the pipeline only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()
No results found