groups = scraper.get_result_similar(url, grouped=True)
Since groups is a dictionary, you can get the names of the rules by calling:
groups.keys()
You can then key into the dictionary using a particular review.
| from autoscraper import AutoScraper | |
| # replace with desired url | |
| url = 'https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing' | |
| # make sure that autoscraper can exactly match the items in your wanted_list | |
| wanted_list = ['A review'] # replace with item(s) of interest | |
| # build the scraper | |
| scraper = AutoScraper() | |
| result = scraper.build(url, wanted_list) |
groups = scraper.get_result_similar(url, grouped=True)
Since groups is a dictionary, you can get the names of the rules by calling:
groups.keys()
You can then key into the dictionary using a particular review.
| #0 | |
| from autoscraper import AutoScraper | |
| #1 | |
| url = "https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing?osq=bubble%20tea&sort_by=date_desc" | |
| #2 | |
| wanted_list = ['Tried their Brown Sugar milk tea and it was not bad compare to Tiger Sugar. I prefer this over Tiger Sugar due to the L size option and sweetness content. It was my to go bubble tea spot for the last two days straight. Will visit again!'] | |
| #3 |
| from bs4 import BeautifulSoup | |
| url = "https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing?osq=bubble%20tea&sort_by=date_desc" | |
| browser.get(url) | |
| response = BeautifulSoup(browser.page_source, 'html.parser') | |
| published = response.find_all('span', class_='lemon--span__373c0__3997G raw__373c0__3rKqk') |
| from selenium import webdriver | |
| def setup_browser(driver_path=None, headless=False): | |
| op = webdriver.ChromeOptions() | |
| if headless: | |
| op.add_argument('headless') | |
| if driver_path: | |
| return webdriver.Chrome(driver_path, options=op) | |
| # this depends on where you install chromedriver |
| # 0: returns the detected language | |
| detected.lang | |
| # 1: returns the confidence in the predicted language | |
| detected.confidence |
| # 0: returns the detected source language | |
| translated.src | |
| # 1: returns the language the text has been translated to | |
| translated.dest | |
| # 2: returns the original text | |
| translated.origin | |
| # 3: returns the translated text |
| #0 import libraries | |
| from googletrans import Translator | |
| import multiprocessing | |
| import workers | |
| #1 create Translator object | |
| translator = Translator() | |
| #2 create multiprocessing pool | |
| pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) |
| from googletrans import Translator | |
| translator = Translator() | |
| def google_translate(text, dest='en'): | |
| return translator.translate(text, dest=dest) |
| if isinstance(text, list): | |
| result = [] | |
| for item in text: | |
| translated = self.translate(item, dest=dest, src=src, **kwargs) | |
| result.append(translated) | |
| return result |