conceptofmind · June 19, 2023 18:00
diff --git a/web_search.py b/web_search.py
 import re
 from my_secret_keys import api_key, cse_id
 from googleapiclient.discovery import build
 from fuzzywuzzy import fuzz
 from playwright.sync_api import sync_playwright

 # get the google search api result

 def google_search(input_query: str, api_key: str, cse_id: str, num_results: int = 3):
    """Run a Google Custom Search query and return its result items.

    Args:
        input_query: The search query text.
        api_key: Developer key, passed through to ``custom_search``.
        cse_id: Custom search engine id, passed through to ``custom_search``.
        num_results: How many results to request; forwarded as the ``num``
            argument. Defaults to 3, the value that used to be hard-coded.

    Returns:
        Whatever ``custom_search`` returns when that value is truthy,
        otherwise ``None``.
    """
    results = custom_search(
        input_query, num=num_results, api_key=api_key, cse_id=cse_id
    )
    # Collapse an empty result into None so callers can use a plain truth test.
    return results or None

 def custom_search(query, api_key, cse_id, **kwargs):
    """Query the Google Custom Search API and return the result items.

    Args:
        query: Search text, sent as ``q``.
        api_key: Developer key used to build the ``customsearch`` v1 client.
        cse_id: Search engine id, sent as ``cx``.
        **kwargs: Extra arguments forwarded unchanged to ``cse().list``
            (for example ``num``).

    Returns:
        The ``"items"`` entry of the response, or an empty list when the
        response has no such entry.
    """
    service = build("customsearch", "v1", developerKey=api_key)
    response = service.cse().list(q=query, cx=cse_id, **kwargs).execute()
    # A response for a query with no hits carries no "items" key; indexing it
    # directly raised KeyError instead of reporting "no results".
    return response.get("items", [])

 # scrape the text from the webpage

 def scrape_text(url):
    """Load ``url`` in headless Chromium and return a snippet of its text.

    The return value is whatever ``extract_snippet`` produces for the loaded
    page. Once the page object exists, the browser context and the browser
    are closed even if navigation or extraction raises.
    """
    with sync_playwright() as pw:
        chromium = pw.chromium.launch(headless=True)
        ctx = chromium.new_context()
        tab = ctx.new_page()
        try:
            tab.goto(url)
            return extract_snippet(tab)
        finally:
            # Tear down in reverse order of creation: context, then browser.
            ctx.close()
            chromium.close()

 def extract_snippet(page):
    """Return the first 500 characters of the first long paragraph on ``page``.

    The ``<p>`` elements are examined in the order ``query_selector_all``
    returns them; the first one whose inner text is at least 500 characters
    long wins. Returns ``None`` when no paragraph is that long.
    """
    # Generators keep the scan lazy: inner_text() is only called on
    # paragraphs up to and including the first match.
    paragraph_texts = (node.inner_text() for node in page.query_selector_all("p"))
    long_enough = (body for body in paragraph_texts if len(body) >= 500)
    first_match = next(long_enough, None)
    if first_match is None:
        return None
    return first_match[:500]

 def truncate_document(document, snippet, match_ratio_threshold=0.75, before=100, max_length=500):
    """Cut a window out of ``document`` around the place ``snippet`` occurs.

    Args:
        document: Page text to cut from; may be ``None`` or empty.
        snippet: Reference text to look for in ``document``.
        match_ratio_threshold: Minimum similarity between ``document`` and
            ``snippet`` for the document to be used. A value up to 1 is read
            as a fraction (0.75 means 75%); a value above 1 is read as
            already being on the 0-100 scale.
        before: Number of characters of context kept ahead of the snippet.
        max_length: Maximum length of the returned window.

    Returns:
        ``snippet`` unchanged when there is no document or the similarity is
        below the threshold; otherwise a slice of ``document`` that is at
        most ``max_length`` characters long.
    """
    if not document:
        # Nothing to cut from; answer the same way as for a rejected document.
        return snippet

    # fuzz.token_set_ratio scores on a 0-100 scale, so comparing it with the
    # fractional default (0.75) only ever rejected a score of exactly 0.
    threshold = match_ratio_threshold
    if threshold <= 1:
        threshold *= 100
    if fuzz.token_set_ratio(document, snippet) < threshold:
        return snippet

    position = document.find(snippet)
    # find() gives -1 when the snippet is not a verbatim substring; in that
    # case the window starts at the beginning of the document.
    start = max(0, position - before) if position != -1 else 0
    return document[start:start + max_length]

 def get_search_result(query, api_key, cse_id):
    """Search for ``query`` and return page text from the first usable hit.

    Each search result is tried in order: its linked page is scraped and the
    scraped text is passed through ``truncate_document`` together with the
    result's snippet. The first truncated text that differs from the snippet
    is returned.

    Args:
        query: The search query text.
        api_key: Developer key, passed through to ``google_search``.
        cse_id: Custom search engine id, passed through to ``google_search``.

    Returns:
        The truncated page text, or the string ``"No search results found."``
        when the search yields nothing or no result yields suitable text.
    """
    search_results = google_search(query, api_key, cse_id)
    for result in search_results or []:
        url = result.get('link')
        snippet = result.get('snippet')
        if not url or not snippet:
            # An incomplete result cannot be scraped or compared; skip it
            # instead of raising KeyError and losing the remaining results.
            continue

        page_text = scrape_text(url)
        if not page_text:
            # The scrape produced no text for this page; try the next hit
            # rather than handing None to truncate_document.
            continue

        truncated_result = truncate_document(page_text, snippet)
        # truncate_document hands the snippet back when it rejects the page.
        if truncated_result != snippet:
            return truncated_result
    return "No search results found."
	import re
	from my_secret_keys import api_key, cse_id
	from googleapiclient.discovery import build
	from fuzzywuzzy import fuzz
	from playwright.sync_api import sync_playwright

	# get the google search api result

	def google_search(input_query: str, api_key: str, cse_id: str, num_results: int = 3):
	    """Run a Google Custom Search query and return its result items.

	    Args:
	        input_query: The search query text.
	        api_key: Developer key, passed through to ``custom_search``.
	        cse_id: Custom search engine id, passed through to ``custom_search``.
	        num_results: How many results to request; forwarded as the ``num``
	            argument. Defaults to 3, the value that used to be hard-coded.

	    Returns:
	        Whatever ``custom_search`` returns when that value is truthy,
	        otherwise ``None``.
	    """
	    results = custom_search(
	        input_query, num=num_results, api_key=api_key, cse_id=cse_id
	    )
	    # Collapse an empty result into None so callers can use a plain truth test.
	    return results or None

	def custom_search(query, api_key, cse_id, **kwargs):
	    """Query the Google Custom Search API and return the result items.

	    Args:
	        query: Search text, sent as ``q``.
	        api_key: Developer key used to build the ``customsearch`` v1 client.
	        cse_id: Search engine id, sent as ``cx``.
	        **kwargs: Extra arguments forwarded unchanged to ``cse().list``
	            (for example ``num``).

	    Returns:
	        The ``"items"`` entry of the response, or an empty list when the
	        response has no such entry.
	    """
	    service = build("customsearch", "v1", developerKey=api_key)
	    response = service.cse().list(q=query, cx=cse_id, **kwargs).execute()
	    # A response for a query with no hits carries no "items" key; indexing it
	    # directly raised KeyError instead of reporting "no results".
	    return response.get("items", [])

	# scrape the text from the webpage

	def scrape_text(url):
	    """Load ``url`` in headless Chromium and return a snippet of its text.

	    The return value is whatever ``extract_snippet`` produces for the loaded
	    page. Once the page object exists, the browser context and the browser
	    are closed even if navigation or extraction raises.
	    """
	    with sync_playwright() as playwright:
	        browser = playwright.chromium.launch(headless=True)
	        context = browser.new_context()
	        page = context.new_page()

	        try:
	            page.goto(url)
	            return extract_snippet(page)
	        finally:
	            # Tear down in reverse order of creation: context, then browser.
	            context.close()
	            browser.close()

	def extract_snippet(page):
	    """Return the first 500 characters of the first long paragraph on ``page``.

	    The ``<p>`` elements are examined in the order ``query_selector_all``
	    returns them; the first one whose inner text is at least 500 characters
	    long wins. Returns ``None`` when no paragraph is that long.
	    """
	    for paragraph in page.query_selector_all("p"):
	        text = paragraph.inner_text()
	        if len(text) >= 500:
	            return text[:500]
	    return None

	def truncate_document(document, snippet, match_ratio_threshold=0.75, before=100, max_length=500):
	    """Cut a window out of ``document`` around the place ``snippet`` occurs.

	    Args:
	        document: Page text to cut from; may be ``None`` or empty.
	        snippet: Reference text to look for in ``document``.
	        match_ratio_threshold: Minimum similarity between ``document`` and
	            ``snippet`` for the document to be used. A value up to 1 is read
	            as a fraction (0.75 means 75%); a value above 1 is read as
	            already being on the 0-100 scale.
	        before: Number of characters of context kept ahead of the snippet.
	        max_length: Maximum length of the returned window.

	    Returns:
	        ``snippet`` unchanged when there is no document or the similarity is
	        below the threshold; otherwise a slice of ``document`` that is at
	        most ``max_length`` characters long.
	    """
	    if not document:
	        # Nothing to cut from; answer the same way as for a rejected document.
	        return snippet

	    # fuzz.token_set_ratio scores on a 0-100 scale, so comparing it with the
	    # fractional default (0.75) only ever rejected a score of exactly 0.
	    threshold = match_ratio_threshold
	    if threshold <= 1:
	        threshold *= 100
	    if fuzz.token_set_ratio(document, snippet) < threshold:
	        return snippet

	    position = document.find(snippet)
	    # find() gives -1 when the snippet is not a verbatim substring; in that
	    # case the window starts at the beginning of the document.
	    start = max(0, position - before) if position != -1 else 0
	    return document[start:start + max_length]

	def get_search_result(query, api_key, cse_id):
	    """Search for ``query`` and return page text from the first usable hit.

	    Each search result is tried in order: its linked page is scraped and the
	    scraped text is passed through ``truncate_document`` together with the
	    result's snippet. The first truncated text that differs from the snippet
	    is returned.

	    Args:
	        query: The search query text.
	        api_key: Developer key, passed through to ``google_search``.
	        cse_id: Custom search engine id, passed through to ``google_search``.

	    Returns:
	        The truncated page text, or the string ``"No search results found."``
	        when the search yields nothing or no result yields suitable text.
	    """
	    search_results = google_search(query, api_key, cse_id)
	    for result in search_results or []:
	        url = result.get('link')
	        snippet = result.get('snippet')
	        if not url or not snippet:
	            # An incomplete result cannot be scraped or compared; skip it
	            # instead of raising KeyError and losing the remaining results.
	            continue

	        page_text = scrape_text(url)
	        if not page_text:
	            # The scrape produced no text for this page; try the next hit
	            # rather than handing None to truncate_document.
	            continue

	        truncated_result = truncate_document(page_text, snippet)
	        # truncate_document hands the snippet back when it rejects the page.
	        if truncated_result != snippet:
	            return truncated_result
	    return "No search results found."