Skip to content

Instantly share code, notes, and snippets.

@M49ICKPIxi3
Created October 10, 2021 20:40
Show Gist options
  • Select an option

  • Save M49ICKPIxi3/9b6134c0d7a12e0b7ee886f490244006 to your computer and use it in GitHub Desktop.

Select an option

Save M49ICKPIxi3/9b6134c0d7a12e0b7ee886f490244006 to your computer and use it in GitHub Desktop.
Easy-to-use script for pulling answered questions from the Stack Exchange API
import requests
from bs4 import BeautifulSoup
from helpers.simple_mongo import SimpleMongo
from stackapi import StackAPI
import jsonlines
import openai
import os
"""
{
'tags': ['formatting', 'scientific-publishing', 'synopsis'],
'owner': {
'reputation': 11,
del 'user_id': 51501,
'user_type': 'registered',
del 'profile_image': 'https://www.gravatar.com/avatar/e84010a735486fd0d367e8160fa41c6c?s=128&d=identicon&r=PG&f=1',
del 'display_name': 'Arpita ',
chg 'link': 'https://writing.stackexchange.com/users/51501/arpita' // _id
},
'is_answered': False,
'view_count': 10,
'answer_count': 1,
'score': 0,
'last_activity_date': 1633766298, /// arrow.get('')
'creation_date': 1633758860, /// arrow.get('')
'question_id': 59252, /// _id
'content_license': 'CC BY-SA 4.0', /// don't need lol (of course you do for prod apps, this ain't prod)
'link': 'https://writing.stackexchange.com/questions/59252/how-do-i-write-a-synopsis-for-a-scientific-article-publication',
'title': 'How do I write a synopsis for a scientific article? publication?'
}
"""
""" the response for the fetch "questions" has these fields:
'backoff' = {int} 0
'has_more' = {bool} True
'page' = {int} 1
'quota_max' = {int} 300
'quota_remaining' = {int} 298
'total' = {int} 0
"""
def process_question(question):
    """Scrape the question text and answer texts from a Stack Exchange page.

    Args:
        question: A question item from the Stack Exchange API. Only the
            'link' key (URL of the question page) is read here; every key
            is copied into the result.

    Returns:
        A shallow copy of ``question`` with two extra keys: 'question'
        (text of the question body) and 'answers' (list of answer body
        texts, in the order they appear on the page).

    Raises:
        requests.RequestException: If the page cannot be fetched, the
            request times out, or the server answers with a 4xx/5xx status.
        ValueError: If the page contains no recognizable question body.
    """
    question_link = question['link']
    print(f'Processing {question_link} ...')

    # A timeout keeps one stalled connection from hanging the whole crawl.
    se_page_response = requests.get(question_link, timeout=30)
    # Fail on error statuses instead of parsing an error page as a question.
    se_page_response.raise_for_status()
    soup = BeautifulSoup(se_page_response.text, 'html.parser')

    # Question and answer bodies are looked up with the same class string.
    body_attrs = {'class': 's-prose js-post-body'}

    question_ele = soup.find('div', attrs={'class': 'question'})
    question_content_ele = None
    if question_ele is not None:
        question_content_ele = question_ele.find('div', attrs=body_attrs)
    if question_content_ele is None:
        # Explicit error instead of an AttributeError on None.
        raise ValueError(f'No question body found at {question_link}')

    answers = []
    for answer in soup.find_all('div', attrs={'class': 'answer'}):
        answer_content = answer.find('div', attrs=body_attrs)
        if answer_content is None:
            # Skip answer containers that carry no body element.
            continue
        answers.append(answer_content.text)

    se_page_data = dict(question)
    se_page_data['question'] = question_content_ele.text
    se_page_data['answers'] = answers
    return se_page_data
def gather_answered_questions(output_collection):
    """Crawl three Stack Exchange sites and store every answered question.

    Questions are fetched from the 'writing', 'puzzling' and 'philosophy'
    sites in batches. Each question with at least one answer is scraped with
    ``process_question`` and inserted into ``output_collection``.

    Successive batches advance through the result pages, and the function
    returns once every site reports that it has no more results. The
    previous version re-requested the first pages on every pass, which
    re-inserted the same questions indefinitely.

    Args:
        output_collection: Object exposing ``insert_one(document=...)``;
            ``main`` passes a collection taken from ``SimpleMongo``.
    """
    site_names = ('writing', 'puzzling', 'philosophy')

    # One API client per site, all configured identically.
    apis = {}
    for site_name in site_names:
        api = StackAPI(site_name)
        api.page_size = 100
        api.max_pages = 5
        apis[site_name] = api

    # First page to request on the next fetch, per site.
    next_page = {site_name: 1 for site_name in site_names}

    count_just_cuz = 0
    while apis:
        all_qs = []
        # Iterate over a snapshot because exhausted sites are removed below.
        for site_name in list(apis):
            api = apis[site_name]
            response = api.fetch('questions', page=next_page[site_name])
            all_qs.extend(response['items'])
            if response.get('has_more'):
                # Assumes one fetch consumes up to max_pages consecutive
                # pages starting at `page` -- TODO confirm against the
                # installed StackAPI version.
                next_page[site_name] += api.max_pages
            else:
                del apis[site_name]

        for question in all_qs:
            if question['answer_count'] > 0:
                try:
                    qa_data = process_question(question)
                    output_collection.insert_one(document=qa_data)
                except Exception as e:
                    # Best effort: one bad page must not stop the crawl.
                    print(e.args)
                print(f'Progress {count_just_cuz}!')
                count_just_cuz += 1
def main():
    """Export stored Q&A documents to JSONL and start an OpenAI fine-tune.

    Steps:
        1. Read every document from the 'writing_qa' collection.
        2. Turn each one into a prompt/completion record. The prompt is the
           title plus the question text; the completion is the first stored
           answer. Newlines are stripped from question and answer text.
        3. Order the records by question score, highest first, and write at
           most ``guess_max`` of them to the JSONL output file.
        4. Upload the file and create a fine-tune job from it, printing both
           API responses.

    The API key is read from the OPENAI_API_KEY environment variable.
    """
    openai.api_key = os.getenv('OPENAI_API_KEY')

    OUT_DIR = '.../output/'  # Change this
    filename = 'generalized_question_answerer.jsonl'
    filepath_out = f'{OUT_DIR}{filename}'

    _database = 'question_answering'
    _writing_qa_col_name = 'writing_qa'
    _puzzles_qa_col_name = 'puzzles_qa'

    simple_mongo = SimpleMongo(_database, [_writing_qa_col_name, _puzzles_qa_col_name])
    writing_qa_col = simple_mongo.collections[_writing_qa_col_name]

    # Of course, uncomment this when you want to run it. It won't exit anytime soon...
    # gather_answered_questions(writing_qa_col)

    json_records = []
    for answered_question in writing_qa_col.find({}):
        answers = answered_question['answers']
        if not answers:
            # Without an answer there is no completion to train on.
            continue
        title = answered_question['title']
        question = answered_question['question'].replace('\n', '')
        prompt = f'{title}\n\n{question}'
        answer_top = answers[0].replace('\n', '')
        record = {
            'rank': answered_question['score'],
            'prompt': f'{prompt}\n\n###\n\n',
            'completion': answer_top,
        }
        json_records.append(record)

    # Sort in place: the original called sorted() and discarded the result,
    # so no ordering ever happened. Descending order is assumed to be the
    # intent, so that the guess_max cap keeps the highest-scored questions.
    json_records.sort(key=lambda x: x['rank'], reverse=True)
    for json_record in json_records:
        # 'rank' was only needed for ordering; it is not training data.
        del json_record['rank']

    guess_max = 4500
    with jsonlines.open(filepath_out, mode='w') as writer:
        # Slicing caps the output at exactly guess_max records.
        for i, json_record in enumerate(json_records[:guess_max]):
            try:
                writer.write(json_record)
            except Exception as e:
                # Best effort: report the bad record and keep writing.
                print(e)
            if i > 0 and i % 100 == 0:
                print(f'Processed {i} so far...')

    # The context manager closes the upload handle once the call returns.
    with open(filepath_out, 'rb') as training_file:
        response = openai.File.create(
            file=training_file,
            purpose='fine-tune'
        )
    print(response)

    response = openai.FineTune.create(
        training_file=response['id'],
        n_epochs=4,
        learning_rate_multiplier=0.07,  # 0.01-0.4
        batch_size=40,
        use_packing=True,
        prompt_loss_weight=1.00
    )
    # Show the created job so its id is not lost.
    print(response)
# Run the export / fine-tune pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment