Last active
June 19, 2023 18:00
-
-
Save conceptofmind/26c76c2d2732ac5e931ed52390089a0f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from my_secret_keys import api_key, cse_id | |
from googleapiclient.discovery import build | |
from fuzzywuzzy import fuzz | |
from playwright.sync_api import sync_playwright | |
# Get the Google Custom Search API results.
def google_search(input_query: str, api_key: str, cse_id: str):
    """Fetch up to three Custom Search hits for *input_query*.

    Args:
        input_query: Text to search for.
        api_key: Developer key forwarded to ``custom_search``.
        cse_id: Search engine identifier forwarded to ``custom_search``.

    Returns:
        Whatever ``custom_search`` returned when that value is truthy,
        otherwise ``None``.
    """
    hits = custom_search(input_query, num=3, api_key=api_key, cse_id=cse_id)
    # Collapse any falsy outcome (e.g. an empty list) into None for callers.
    return hits or None
def custom_search(query, api_key, cse_id, **kwargs):
    """Run a query against the Google Custom Search JSON API.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        api_key: Developer key used to build the ``customsearch`` v1 client.
        cse_id: Search engine identifier, sent as the ``cx`` parameter.
        **kwargs: Extra parameters forwarded unchanged to ``cse().list``
            (for example ``num``).

    Returns:
        The list stored under ``"items"`` in the response, or an empty list
        when the response carries no ``"items"`` key.
    """
    service = build("customsearch", "v1", developerKey=api_key)
    response = service.cse().list(q=query, cx=cse_id, **kwargs).execute()
    # A response for a query without hits has no "items" key; indexing it
    # directly would raise KeyError instead of reporting "no results".
    return response.get("items", [])
# Scrape the text from the webpage.
def scrape_text(url):
    """Load *url* in headless Chromium and return the snippet taken from it.

    Args:
        url: Address handed to ``page.goto``.

    Returns:
        Whatever ``extract_snippet`` returns for the loaded page.
    """
    with sync_playwright() as pw:
        chromium = pw.chromium.launch(headless=True)
        ctx = chromium.new_context()
        tab = ctx.new_page()
        try:
            tab.goto(url)
            return extract_snippet(tab)
        finally:
            # Close the context first, then the browser, even when
            # navigation or extraction raised.
            ctx.close()
            chromium.close()
def extract_snippet(page, length=500):
    """Return the opening of the first sufficiently long paragraph on a page.

    Args:
        page: Object exposing ``query_selector_all``; every element it
            returns for the ``"p"`` selector must expose ``inner_text()``.
        length: Minimum paragraph length to accept, and also the number of
            leading characters returned. Defaults to 500, the value that was
            previously hard-coded.

    Returns:
        The first ``length`` characters of the first ``<p>`` element whose
        text is at least ``length`` characters long, or ``None`` when no
        paragraph qualifies.
    """
    for paragraph in page.query_selector_all("p"):
        text = paragraph.inner_text()
        if len(text) >= length:
            return text[:length]
    return None
def truncate_document(document, snippet, match_ratio_threshold=0.75, before=100, max_length=500):
    """Cut a window out of *document* around the place where *snippet* occurs.

    Args:
        document: Scraped page text. May be ``None`` or empty when nothing
            usable was scraped.
        snippet: Search-result snippet to compare with and locate in
            *document*.
        match_ratio_threshold: Minimum similarity required to use the
            document. Values up to 1 are treated as fractions and scaled to
            the 0-100 score range of ``fuzz.token_set_ratio``; larger values
            are used as given.
        before: Number of characters to keep ahead of the snippet position.
        max_length: Maximum length of the returned window.

    Returns:
        *snippet* unchanged when *document* is empty/``None`` or its
        similarity score is below the threshold; otherwise a slice of
        *document* of at most *max_length* characters.
    """
    # Nothing was scraped: fall back to the snippet without scoring.
    if not document:
        return snippet

    # token_set_ratio scores on a 0-100 scale, so a fractional threshold such
    # as the default 0.75 has to be scaled before the comparison means anything.
    if match_ratio_threshold <= 1:
        threshold = match_ratio_threshold * 100
    else:
        threshold = match_ratio_threshold
    if fuzz.token_set_ratio(document, snippet) < threshold:
        return snippet

    # str.find returns -1 when the snippet is not present verbatim; anchor the
    # window at the start of the document in that case.
    snippet_position = max(document.find(snippet), 0)
    start = max(0, snippet_position - before)
    end = min(len(document), start + max_length)
    return document[start:end]
def get_search_result(query, api_key, cse_id):
    """Return page text for the first search hit that yields usable content.

    Args:
        query: Text to search for.
        api_key: Developer key forwarded to ``google_search``.
        cse_id: Search engine identifier forwarded to ``google_search``.

    Returns:
        The truncated page text of the first hit for which
        ``truncate_document`` returned something other than the hit's own
        snippet, or the string ``"No search results found."`` when there are
        no hits or none qualifies.
    """
    # A falsy search outcome is iterated as an empty sequence.
    for hit in google_search(query, api_key, cse_id) or ():
        link, summary = hit['link'], hit['snippet']
        excerpt = truncate_document(scrape_text(link), summary)
        # truncate_document hands the snippet back when the page was not a
        # good match, so anything else counts as suitable text.
        if excerpt != summary:
            return excerpt
    return "No search results found."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment