Skip to content

Instantly share code, notes, and snippets.

@tkaczenko
Created January 18, 2017 20:10
Show Gist options
  • Save tkaczenko/c1f4f3c61e3f6117545d3ec36d46905c to your computer and use it in GitHub Desktop.
Save tkaczenko/c1f4f3c61e3f6117545d3ec36d46905c to your computer and use it in GitHub Desktop.
RozetkaScraper is written in Java using Selenium. Not fully working yet.
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;
import product.Product;
import product.category.Category;
import java.util.ArrayList;
import java.util.List;
/**
 * Selenium-based scraper for the sports-nutrition section of rozetka.com.ua.
 *
 * <p>Work in progress: {@link #parseCategory()} collects category listing URLs,
 * {@link #parseProducts()} visits each of them and prints the image sources it
 * finds, and {@link #parseProduct()} is still a stub.
 *
 * <p>All instances share a single Firefox session that is started when the class
 * is loaded and closed when the JVM shuts down.
 *
 * <p>Created by tkaczenko on 25.12.16.
 */
public class RozetkaScraper {
    /** Entry page whose category links are crawled. */
    private static final String ROZETKA =
            "http://rozetka.com.ua/sportivnoe-pitanie/c273292/?gclid=CIS4zIDJkdECFUktGQodo7kCdA";

    /**
     * Number of listing pages visited per category. Integer division, so this is 3.
     * NOTE(review): presumably "items wanted / items per page" - confirm the intent.
     */
    private static final int NUM_OF_PAGES = 100 / 31;

    /** Browser session shared by every scraper instance. */
    private static final WebDriver driver;

    static {
        System.setProperty("webdriver.gecko.driver", "/home/tkaczenko/Desktop/geckodriver");
        driver = new FirefoxDriver();
        // The driver is private and static, so nothing else can close it: release the
        // browser and the geckodriver process when the JVM exits.
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                driver.quit();
            }
        });
    }

    // Data
    private final List<Category> categories = new ArrayList<>();
    private final List<Product> products = new ArrayList<>();
    /** URLs queued for the next parsing stage; replaced by {@link #parseProducts()}. */
    private List<String> urls = new ArrayList<>();

    /**
     * Loads the entry page, records every category link found on it, queues the
     * first {@link #NUM_OF_PAGES} listing pages of each category and then runs
     * {@link #parseProducts()} on the queued URLs.
     */
    public void parseCategory() {
        driver.get(ROZETKA);
        List<WebElement> titles = driver.findElements(By.cssSelector("a.pab-h3-link"));
        for (WebElement element : titles) {
            Category category = new Category(element.getText(), "");
            categories.add(category);
            System.out.println("\t - " + category.getName());

            String url = element.getAttribute("href");
            if (url == null || url.isEmpty()) {
                // Nothing to navigate to for a link without a target.
                continue;
            }
            urls.add(url);
            for (int i = 2; i <= NUM_OF_PAGES; i++) {
                urls.add(pageUrl(url, i));
            }
        }
        parseProducts();
    }

    /**
     * Builds the URL of the given listing page of a category, avoiding a doubled
     * slash when the category URL already ends with one.
     */
    private static String pageUrl(String categoryUrl, int page) {
        String base = categoryUrl.endsWith("/")
                ? categoryUrl.substring(0, categoryUrl.length() - 1)
                : categoryUrl;
        return base + "/page=" + page + "/";
    }

    /**
     * Visits every queued URL and prints the {@code src} of each {@code img} element
     * on the page. Afterwards the queue is replaced by the list of product URLs,
     * which stays empty until the matching logic below is finished.
     */
    public void parseProducts() {
        List<String> temp = new ArrayList<>();
        for (String url : urls) {
            driver.get(url);
            ((JavascriptExecutor) driver).executeScript("scroll(0, 300)");
            // names and prices are only consumed by the unfinished matching logic below.
            List<WebElement> names = driver.findElements(By.xpath("//div[contains(@class, " +
                    "'g-i-tile-i-title clearfix')]"));
            List<WebElement> prices = driver.findElements(By.xpath("//div[contains(@class, " +
                    "'g-price-uah')]"));
            /*WebDriverWait wait = new WebDriverWait(driver, 20);
            wait.until(ExpectedConditions.visibilityOfAllElements(driver.findElements(By.tagName("img"))));*/
            List<WebElement> images = driver.findElements(By.tagName("img"));
            for (int i = 0; i < images.size(); i++) {
                System.out.println(i + " " + images.get(i).getAttribute("src"));
            }
            /*for (int i = 0; i < names.size(); i++) {
                System.out.println(i + "\t" + names.get(i).getText() + prices.get(i).getText());
                Iterator<WebElement> iterator = images.iterator();
                while (iterator.hasNext()) {
                    WebElement element = iterator.next();
                    String title = element.getAttribute("title");
                    if (title.contains(names.get(i).getText())) {
                        System.out.println(element.getAttribute("src"));
                        break;
                    }
                }
                temp.add(names.get(i).getAttribute("href"));
            }*/
        }
        // Hand the collected product URLs over to parseProduct().
        urls = temp;
    }

    /**
     * Opens every queued product URL. Extraction of the product details is not
     * implemented yet.
     */
    public void parseProduct() {
        for (String url : urls) {
            driver.get(url);
            //// TODO: 26.12.16 Implement
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment