Created
January 18, 2017 20:10
-
-
Save tkaczenko/c1f4f3c61e3f6117545d3ec36d46905c to your computer and use it in GitHub Desktop.
RozetkaScraper is a scraper written in Java using Selenium. It is not fully working yet.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.openqa.selenium.By; | |
import org.openqa.selenium.JavascriptExecutor; | |
import org.openqa.selenium.WebDriver; | |
import org.openqa.selenium.WebElement; | |
import org.openqa.selenium.firefox.FirefoxDriver; | |
import product.Product; | |
import product.category.Category; | |
import java.util.ArrayList; | |
import java.util.List; | |
/** | |
* Created by tkaczenko on 25.12.16. | |
*/ | |
public class RozetkaScraper { | |
private static final String ROZETKA = | |
"http://rozetka.com.ua/sportivnoe-pitanie/c273292/?gclid=CIS4zIDJkdECFUktGQodo7kCdA"; | |
private static int NUM_OF_PAGES = 100 / 31; | |
private static WebDriver driver; | |
static { | |
System.setProperty("webdriver.gecko.driver", "/home/tkaczenko/Desktop/geckodriver"); | |
driver = new FirefoxDriver(); | |
} | |
// Data | |
private List<Category> categories = new ArrayList<>(); | |
private List<Product> products = new ArrayList<>(); | |
private List<String> urls = new ArrayList<>(); | |
public void parseCategory() { | |
driver.get(ROZETKA); | |
List<WebElement> titles = driver.findElements(By.cssSelector("a.pab-h3-link")); | |
for (WebElement element : titles) { | |
Category category = new Category(element.getText(), ""); | |
System.out.println("\t - " + category.getName()); | |
String url = element.getAttribute("href"); | |
urls.add(url); | |
for (int i = 2; i <= NUM_OF_PAGES; i++) { | |
urls.add(url + "/page=" + i + "/"); | |
} | |
} | |
parseProducts(); | |
} | |
public void parseProducts() { | |
List<String> temp = new ArrayList<>(); | |
for (String url : urls) { | |
driver.get(url); | |
((JavascriptExecutor) driver).executeScript("scroll(0, 300)"); | |
List<WebElement> names = driver.findElements(By.xpath("//div[contains(@class, " + | |
"'g-i-tile-i-title clearfix')]")); | |
List<WebElement> prices = driver.findElements(By.xpath("//div[contains(@class, " + | |
"'g-price-uah')]")); | |
/*WebDriverWait wait = new WebDriverWait(driver, 20); | |
wait.until(ExpectedConditions.visibilityOfAllElements(driver.findElements(By.tagName("img"))));*/ | |
List<WebElement> images = driver.findElements(By.tagName("img")); | |
for (int i = 0; i < images.size(); i++) { | |
System.out.println(i + " " + images.get(i).getAttribute("src")); | |
} | |
/*for (int i = 0; i < names.size(); i++) { | |
System.out.println(i + "\t" + names.get(i).getText() + prices.get(i).getText()); | |
Iterator<WebElement> iterator = images.iterator(); | |
while (iterator.hasNext()) { | |
WebElement element = iterator.next(); | |
String title = element.getAttribute("title"); | |
if (title.contains(names.get(i).getText())) { | |
System.out.println(element.getAttribute("src")); | |
break; | |
} | |
} | |
temp.add(names.get(i).getAttribute("href")); | |
}*/ | |
} | |
urls.clear(); | |
urls = temp; | |
} | |
public void parseProduct() { | |
for (String url : urls) { | |
driver.get(url); | |
//// TODO: 26.12.16 Implement | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment