bojanbabic · August 20, 2023 22:55
diff --git a/gpt_scraper.py b/gpt_scraper.py
 import openai
 import os
 from bs4 import BeautifulSoup
 from bs4.element import Comment
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 import re

 import time  # stdlib; needed for the explicit render pause below

 # The API key is read from the environment; os.getenv() yields None when
 # OPENAI_API_KEY is unset.
 openai.api_key = os.getenv('OPENAI_API_KEY')


 # Page to scrape and the questions the model is asked about it.
 url = "https://www.greatschools.org/california/san-jose/5442-Joaquin-Miller-Middle-School"
 questions = [
    "where is the location of school? ",
    "how many students this school has?",
    "what percentage of the students are asian?"
 ]

 tags = ["p_div"]  # NOTE(review): not referenced in the visible script

 # The questions are sent to the model as a single space-joined string.
 question_str = " ".join(questions)
 # Placeholder: replace with the path of the local driver executable.
 chrome_driver_path = "<path_to_chromium>"


 options = webdriver.ChromeOptions()
 options.add_argument('--headless')

 driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
 try:
    driver.get(url)
    # implicitly_wait() only sets the element-lookup timeout and returns
    # immediately, so it never gave scripts time to run; pause explicitly.
    time.sleep(5)
    html = driver.page_source
 finally:
    # quit() ends the session and the driver process even when loading
    # fails; close() would only close the current window.
    driver.quit()

 soup = BeautifulSoup(html, "html.parser")  # parsed once at module level

 def tag_visible(element):
    """Tell whether a parsed text node counts as visible page text.

    Returns False when the node's parent is one of the non-rendered
    containers listed below, or when the node is an HTML comment;
    returns True otherwise.
    """
    non_rendered = ('style', 'script', 'head', 'title', 'meta', '[document]')
    if element.parent.name in non_rendered:
        return False
    # Anything left is visible unless it is a comment node.
    return not isinstance(element, Comment)

 def text_from_html(html):
    """Return the visible text of an HTML document as one string.

    Args:
        html: HTML markup to extract text from.

    Returns:
        Every text node accepted by ``tag_visible``, stripped of
        surrounding whitespace and joined with single spaces.
    """
    # Parse the argument itself; the previous version ignored ``html`` and
    # silently read the module-level ``soup`` instead.
    parsed = BeautifulSoup(html, "html.parser")
    # find_all(string=True) is the current spelling of findAll(text=True).
    text_nodes = parsed.find_all(string=True)
    return " ".join(node.strip() for node in text_nodes if tag_visible(node))

 def get_completion(prompt, model="gpt-3.5-turbo"):
    """Run a one-turn chat: send ``prompt`` as a user message, return the reply.

    Args:
        prompt: Text sent as the only message of the conversation.
        model: Chat model name passed through to the API.

    Returns:
        The ``content`` of the first choice in the API response.
    """
    # NOTE(review): ChatCompletion is the pre-1.0 openai SDK interface.
    chat = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # 0 keeps the output as deterministic as the API allows
    )
    first_choice = chat.choices[0]
    return first_choice.message["content"]

 # Extract the visible page text and ask the model all questions in one call.
 target_text = text_from_html(html)
 # The instruction must name the delimiter actually used below (###); the
 # earlier wording referred to triple quotes and to a regex step that the
 # script never performs.
 prompt = f"""
 The output from the web scrape is delimited by triple hashes (###).
 Answer the following questions: {question_str}
 ### {target_text} ###.

 """
 response = get_completion(prompt)
 print(response)
	import openai
	import os
	from bs4 import BeautifulSoup
	from bs4.element import Comment
	from selenium import webdriver
	from selenium.webdriver.chrome.service import Service
	import re

	import time  # stdlib; needed for the explicit render pause below

	# The API key is read from the environment; os.getenv() yields None when
	# OPENAI_API_KEY is unset.
	openai.api_key = os.getenv('OPENAI_API_KEY')


	# Page to scrape and the questions the model is asked about it.
	url = "https://www.greatschools.org/california/san-jose/5442-Joaquin-Miller-Middle-School"
	questions = [
	    "where is the location of school? ",
	    "how many students this school has?",
	    "what percentage of the students are asian?"
	]

	tags = ["p_div"]  # NOTE(review): not referenced in the visible script

	# The questions are sent to the model as a single space-joined string.
	question_str = " ".join(questions)
	# Placeholder: replace with the path of the local driver executable.
	chrome_driver_path = "<path_to_chromium>"


	options = webdriver.ChromeOptions()
	options.add_argument('--headless')

	driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
	try:
	    driver.get(url)
	    # implicitly_wait() only sets the element-lookup timeout and returns
	    # immediately, so it never gave scripts time to run; pause explicitly.
	    time.sleep(5)
	    html = driver.page_source
	finally:
	    # quit() ends the session and the driver process even when loading
	    # fails; close() would only close the current window.
	    driver.quit()

	soup = BeautifulSoup(html, "html.parser")  # parsed once at module level

	def tag_visible(element):
	    """Tell whether a parsed text node counts as visible page text.

	    Returns False when the node's parent is one of the non-rendered
	    containers listed below, or when the node is an HTML comment;
	    returns True otherwise.
	    """
	    non_rendered = ('style', 'script', 'head', 'title', 'meta', '[document]')
	    if element.parent.name in non_rendered:
	        return False
	    # Anything left is visible unless it is a comment node.
	    return not isinstance(element, Comment)

	def text_from_html(html):
	    """Return the visible text of an HTML document as one string.

	    Args:
	        html: HTML markup to extract text from.

	    Returns:
	        Every text node accepted by ``tag_visible``, stripped of
	        surrounding whitespace and joined with single spaces.
	    """
	    # Parse the argument itself; the previous version ignored ``html`` and
	    # silently read the module-level ``soup`` instead.
	    parsed = BeautifulSoup(html, "html.parser")
	    # find_all(string=True) is the current spelling of findAll(text=True).
	    text_nodes = parsed.find_all(string=True)
	    return " ".join(node.strip() for node in text_nodes if tag_visible(node))

	def get_completion(prompt, model="gpt-3.5-turbo"):
	    """Run a one-turn chat: send ``prompt`` as a user message, return the reply.

	    Args:
	        prompt: Text sent as the only message of the conversation.
	        model: Chat model name passed through to the API.

	    Returns:
	        The ``content`` of the first choice in the API response.
	    """
	    # NOTE(review): ChatCompletion is the pre-1.0 openai SDK interface.
	    messages = [{"role": "user", "content": prompt}]
	    response = openai.ChatCompletion.create(
	        model=model,
	        messages=messages,
	        temperature=0,  # 0 keeps the output as deterministic as the API allows
	    )
	    return response.choices[0].message["content"]

	# Extract the visible page text and ask the model all questions in one call.
	target_text = text_from_html(html)
	# The instruction must name the delimiter actually used below (###); the
	# earlier wording referred to triple quotes and to a regex step that the
	# script never performs.
	prompt = f"""
	The output from the web scrape is delimited by triple hashes (###).
	Answer the following questions: {question_str}
	### {target_text} ###.

	"""
	response = get_completion(prompt)
	print(response)