Created
October 10, 2021 20:40
-
-
Save M49ICKPIxi3/9b6134c0d7a12e0b7ee886f490244006 to your computer and use it in GitHub Desktop.
Easy-to-use script for pulling answered questions from the Stack Exchange API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from bs4 import BeautifulSoup | |
| from helpers.simple_mongo import SimpleMongo | |
| from stackapi import StackAPI | |
| import jsonlines | |
| import openai | |
| import os | |
| """ | |
| { | |
| 'tags': ['formatting', 'scientific-publishing', 'synopsis'], | |
| 'owner': { | |
| 'reputation': 11, | |
| del 'user_id': 51501, | |
| 'user_type': 'registered', | |
| del 'profile_image': 'https://www.gravatar.com/avatar/e84010a735486fd0d367e8160fa41c6c?s=128&d=identicon&r=PG&f=1', | |
| del 'display_name': 'Arpita ', | |
| chg 'link': 'https://writing.stackexchange.com/users/51501/arpita' // _id | |
| }, | |
| 'is_answered': False, | |
| 'view_count': 10, | |
| 'answer_count': 1, | |
| 'score': 0, | |
| 'last_activity_date': 1633766298, /// arrow.get('') | |
| 'creation_date': 1633758860, /// arrow.get('') | |
| 'question_id': 59252, /// _id | |
| 'content_license': 'CC BY-SA 4.0', /// don't need lol (of course you do for prod apps, this ain't prod) | |
| 'link': 'https://writing.stackexchange.com/questions/59252/how-do-i-write-a-synopsis-for-a-scientific-article-publication', | |
| 'title': 'How do I write a synopsis for a scientific article? publication?' | |
| } | |
| """ | |
| """ the response for the fetch "questions" has these fields: | |
| 'backoff' = {int} 0 | |
| 'has_more' = {bool} True | |
| 'page' = {int} 1 | |
| 'quota_max' = {int} 300 | |
| 'quota_remaining' = {int} 298 | |
| 'total' = {int} 0 | |
| """ | |
def process_question(question, timeout=30):
    """Scrape the question text and answer texts from a question's web page.

    Args:
        question: Mapping describing one question. Only its ``'link'`` key is
            read here; every key is copied into the returned dict.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without it a stalled connection would block the caller forever.

    Returns:
        A new dict holding all keys of ``question`` plus ``'question'`` (text
        of the question body) and ``'answers'`` (list of answer body texts, in
        the order ``find_all`` yields them).

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
        ValueError: If the page contains no question body element.
    """
    question_link = question['link']
    print(f'Processing {question_link} ...')

    se_page_response = requests.get(question_link, timeout=timeout)
    # Fail loudly on error pages instead of trying to parse them.
    se_page_response.raise_for_status()
    soup = BeautifulSoup(se_page_response.text, 'html.parser')

    # Both question and answer bodies are looked up with this class string.
    post_body_attrs = {'class': 's-prose js-post-body'}

    question_ele = soup.find('div', attrs={'class': 'question'})
    question_content_ele = None
    if question_ele is not None:
        question_content_ele = question_ele.find('div', attrs=post_body_attrs)
    if question_content_ele is None:
        # Explicit error rather than an opaque AttributeError on None.
        raise ValueError(f'No question body found at {question_link}')

    answers = []
    for answer in soup.find_all('div', attrs={'class': 'answer'}):
        answer_content = answer.find('div', attrs=post_body_attrs)
        # Skip answer containers without a body instead of crashing the page.
        if answer_content is not None:
            answers.append(answer_content.text)

    se_page_data = dict(question)
    se_page_data['question'] = question_content_ele.text
    se_page_data['answers'] = answers
    return se_page_data
def gather_answered_questions(output_collection):
    """Fetch questions from several Stack Exchange sites and store answered ones.

    Questions are pulled from the 'writing', 'puzzling' and 'philosophy' sites
    in batches. Each question with at least one answer is scraped with
    ``process_question`` and inserted into ``output_collection``.

    Each pass requests the *next* batch of pages per site. Previously every
    pass called ``fetch`` with its default start page, so the same questions
    were downloaded and inserted again and again without end. The loop now
    finishes once no site reports more results.

    Args:
        output_collection: Object exposing ``insert_one(document=...)``;
            receives one document per successfully scraped question.
    """
    site_apis = []
    for site_name in ('writing', 'puzzling', 'philosophy'):
        site_api = StackAPI(site_name)
        site_api.page_size = 100
        site_api.max_pages = 5
        site_apis.append(site_api)

    # (client, first page of the next batch) for every site with results left.
    pending = [(site_api, 1) for site_api in site_apis]

    count_just_cuz = 0
    while pending:
        all_qs = []
        still_pending = []
        for site_api, start_page in pending:
            response = site_api.fetch('questions', page=start_page)
            all_qs.extend(response['items'])
            if response.get('has_more'):
                # NOTE(review): assumes one fetch() consumes at most
                # `max_pages` pages starting at `page` — confirm against the
                # installed StackAPI version.
                still_pending.append((site_api, start_page + site_api.max_pages))
        pending = still_pending

        for question in all_qs:
            if question['answer_count'] > 0:
                try:
                    qa_data = process_question(question)
                    output_collection.insert_one(document=qa_data)
                except Exception as e:
                    # Deliberate best effort: one bad page must not abort the
                    # whole crawl, so report it and carry on.
                    print(e.args)
            print(f'Progress {count_just_cuz}!')
            count_just_cuz += 1
def main():
    """Export stored Q&A documents to a JSONL file and start a fine-tune job.

    Steps:
      1. Read every document from the 'writing_qa' collection.
      2. Build one prompt/completion record per document from its title,
         question text and first stored answer.
      3. Sort the records by the question's score and write at most
         ``guess_max`` of them to the output JSONL file.
      4. Upload that file via ``openai.File.create`` and create a fine-tune
         from it via ``openai.FineTune.create``.
    """
    openai.api_key = os.getenv('OPENAI_API_KEY')

    OUT_DIR = '.../output/'  # Change this
    filename = 'generalized_question_answerer.jsonl'
    filepath_out = f'{OUT_DIR}{filename}'

    _database = 'question_answering'
    _writing_qa_col_name = 'writing_qa'
    _puzzles_qa_col_name = 'puzzles_qa'

    simple_mongo = SimpleMongo(_database, [_writing_qa_col_name, _puzzles_qa_col_name])
    writing_qa_col = simple_mongo.collections[_writing_qa_col_name]

    # Of course, uncomment this when you want to run it. It won't exit anytime soon...
    # gather_answered_questions(writing_qa_col)

    ranked_records = []
    for answered_question in writing_qa_col.find({}):
        answers = answered_question['answers']
        if not answers:
            # A document without any scraped answer has no completion text.
            continue
        title = answered_question['title']
        question = answered_question['question'].replace('\n', '')
        prompt = f'{title}\n\n{question}'
        answer_top = answers[0].replace('\n', '')
        record = {
            'prompt': f'{prompt}\n\n###\n\n',
            'completion': answer_top,
        }
        ranked_records.append((answered_question['score'], record))

    # The original called sorted() and threw the result away, so nothing was
    # ever sorted. Sort in place, ascending by score as originally written.
    # NOTE(review): with the guess_max cap below, ascending order drops the
    # highest-scored records first — confirm this is the intended direction.
    ranked_records.sort(key=lambda pair: pair[0])
    json_records = [record for _, record in ranked_records]

    guess_max = 4500
    with jsonlines.open(filepath_out, mode='w') as writer:
        # Slice so that at most guess_max records are written (the previous
        # `i >= guess_max` check let one extra record through).
        for i, json_record in enumerate(json_records[:guess_max]):
            try:
                writer.write(json_record)
            except Exception as e:
                print(e)
            if i > 0 and i % 100 == 0:
                print(f'Processed {i} so far...')

    # Upload only after the writer is closed so the file is fully flushed, and
    # close the read handle once the upload call returns.
    with open(filepath_out) as training_file:
        response = openai.File.create(
            file=training_file,
            purpose='fine-tune'
        )
    print(response)

    response = openai.FineTune.create(
        training_file=response['id'],
        n_epochs=4,
        learning_rate_multiplier=0.07,  # 0.01-0.4
        batch_size=40,
        use_packing=True,
        prompt_loss_weight=1.00
    )


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment