Skip to content

Instantly share code, notes, and snippets.

@fitwist
Created November 12, 2023 18:28
Show Gist options
  • Save fitwist/734f6c0f49b58bed2d624d28bebfcd37 to your computer and use it in GitHub Desktop.
Save fitwist/734f6c0f49b58bed2d624d28bebfcd37 to your computer and use it in GitHub Desktop.
import asyncio
import os
import re

import pandas as pd
import telegram
from google.cloud import bigquery
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
# Scrape the reviews listed at URL with Selenium and write them to
# 'scrapedReviews.csv' with columns: id, text, when, url.

# Telegram settings (placeholder values). The bot object is constructed below,
# but this script does not send any message with it.
TELEGRAM_API_KEY = "API Key"
TELEGRAM_CHAT_ID = -1001200247335
TELEGRAM_BOT_TOKEN = "Bot Token"

# Timeout, in seconds, for each wait on review elements appearing in the page.
DELAY = 5
URL = "https://otzivisotrudnikov.ru/company/moskva/lamoda_ru_internet_magazin/"
# Upper bound on how many times the "load more" control is clicked.
MAX_LOAD_MORE_CLICKS = 5
# NOTE(review): the original script never clicked anything, so every pass of
# its loop re-read the same DOM and appended duplicate rows. The locator of the
# page's "load more" control is not known from this script; set it here to
# enable paging. While it is None the page is scraped exactly once.
LOAD_MORE_XPATH = None

REVIEW_XPATH = "//*[@class='col-xs-10']"
REVIEW_URL_XPATH = "//*[@class='read-more-serm']/a"
REVIEW_DATE_XPATH = "//*[@class='divh1 red']"

bot = telegram.Bot(token=TELEGRAM_BOT_TOKEN)
driver = webdriver.Chrome()

reviews_lst = []
urls_lst = []
dates_lst = []
reviews = []
count = 0  # number of "load more" clicks performed so far

try:
    driver.get(URL)
    while True:
        # Block until the page shows more review elements than were read on
        # the previous pass (more than zero on the first pass).
        seen = len(reviews_lst)
        try:
            WebDriverWait(driver, DELAY).until(
                lambda d: len(d.find_elements(By.XPATH, REVIEW_XPATH)) > seen
            )
        except TimeoutException:
            # Nothing new appeared in time: keep what was already collected.
            break

        # Rebuild the lists from the whole DOM on every pass instead of
        # appending, so rows from earlier passes are never duplicated.
        # Cut the "read the full review and comments" link caption out of
        # the review text.
        reviews_lst = [
            {'text': element.text.replace('Читать полностью отзыв и комментарии', '')}
            for element in driver.find_elements(By.XPATH, REVIEW_XPATH)
        ]
        # Extract the links to the individual reviews.
        urls_lst = [
            element.get_attribute("href")
            for element in driver.find_elements(By.XPATH, REVIEW_URL_XPATH)
        ]
        # Keep only the part of the header that precedes " |".
        dates_lst = [
            re.sub(r' \|(.*)', '', element.text)
            for element in driver.find_elements(By.XPATH, REVIEW_DATE_XPATH)
        ]

        if not (len(reviews_lst) == len(dates_lst) == len(urls_lst)):
            # The three XPath queries are matched up by position; a length
            # mismatch means some rows may be misaligned or dropped.
            print(
                "Warning: element counts differ "
                f"(texts={len(reviews_lst)}, dates={len(dates_lst)}, "
                f"urls={len(urls_lst)}); extra items are ignored."
            )

        # zip() stops at the shortest list, so a mismatch cannot raise
        # IndexError the way positional indexing did.
        reviews = [
            {'text': item['text'], 'when': when, 'url': url}
            for item, when, url in zip(reviews_lst, dates_lst, urls_lst)
        ]

        if LOAD_MORE_XPATH is None or count >= MAX_LOAD_MORE_CLICKS:
            break
        load_more = driver.find_elements(By.XPATH, LOAD_MORE_XPATH)
        if not load_more:
            break
        load_more[0].click()
        count += 1
finally:
    # Always close the browser, even if scraping fails part-way.
    driver.quit()

scrapedReviews = pd.DataFrame(reviews)
# Prepend a sequential 0-based id column.
scrapedReviews.insert(0, 'id', range(len(scrapedReviews)))
scrapedReviews.to_csv('scrapedReviews.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment