Last active
August 29, 2015 14:26
-
-
Save rhzs/417ef552587aa5dff6b0 to your computer and use it in GitHub Desktop.
Simple Java/Groovy example for data and web scraping via the import.io API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Author: Rheza Satria (2015), Indonesia | |
// Purpose: | |
// Simple Groovy example for data and web crawling/scraping via the import.io API.
// Note: | |
// Lazada is an Indonesian e-commerce site. Feel free to substitute another e-commerce site.
// Run: | |
// groovy lazada_crawl.groovy | |
import java.io.BufferedReader; | |
import java.io.InputStreamReader; | |
import java.io.OutputStreamWriter; | |
import java.net.HttpURLConnection; | |
import java.net.URL; | |
@Grab('mysql:mysql-connector-java:5.1.25') | |
@GrabConfig(systemClassLoader = true) | |
import groovy.sql.Sql | |
// One-time schema setup. A MySQL database called 'lazada' must already exist;
// adjust the JDBC URL, user name and password below to match your environment.
def sql = Sql.newInstance("jdbc:mysql://localhost:3306/lazada", "root", "password", "com.mysql.jdbc.Driver")
try {
    // Simple JSON dump table: each row stores one raw API response (json), the
    // page URL that was requested (url) and a status marker. The JSON can be
    // consumed later (e.g. by a scheduler) to extract prices and product information.
    sql.execute('create table IF NOT EXISTS jsondata(id INT NOT NULL AUTO_INCREMENT, json MEDIUMTEXT NOT NULL, url TEXT, status VARCHAR(255), PRIMARY KEY(id))')
} finally {
    // Release the connection even when the DDL statement fails.
    sql.close()
}
/**
 * Requests one Lazada listing page through the import.io "_magic" endpoint and
 * stores the raw response body in the `jsondata` table of the local MySQL
 * database, together with the requested page URL and the status 'ACTIVE'.
 *
 * @param lazadaUrl    listing URL without the scheme, e.g. "www.lazada.co.id/beli-smartphone/"
 * @param page         page number, sent as the `page` query parameter
 * @param otherOptions extra query string appended verbatim after the page parameter,
 *                     e.g. "&itemperpage=120"; optional, defaults to an empty string
 * @throws IOException if the endpoint cannot be reached or answers with an HTTP error status
 */
def getDataFromLazada(String lazadaUrl, int page, String otherOptions = "") {
    String url = "https://api.import.io/store/data/_magic"
    String reqUrl = "http://" + lazadaUrl + "?page=" + page + (otherOptions ?: "")

    // Serialise with JsonOutput instead of string concatenation so that quotes or
    // backslashes inside the URL cannot produce a malformed request body.
    String payload = groovy.json.JsonOutput.toJson([url: reqUrl, apiVersionGuid: null, cookies: null])

    HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection()
    StringBuilder body = new StringBuilder()
    try {
        conn.setReadTimeout(30000)
        conn.addRequestProperty("Accept-Language", "en-US,en;q=0.8")
        conn.addRequestProperty("User-Agent", "Mozilla")
        // conn.addRequestProperty("Referer", "google.com"); // forgery huh!?
        conn.setDoOutput(true) // the request carries a body

        // withWriter closes the stream even if the write fails.
        conn.getOutputStream().withWriter("UTF-8") { out -> out.write(payload) }

        // Logs the API endpoint; the Lazada page itself travels inside the JSON payload.
        println "Requested URL: " + url

        int status = conn.getResponseCode()
        if (status >= 400) {
            // Fail with the status and target page instead of the opaque error
            // getInputStream() would raise for an error response.
            throw new IOException("import.io returned HTTP " + status + " for " + reqUrl)
        }

        // Decode explicitly as UTF-8 (the request is sent as UTF-8 too) rather than
        // relying on the platform default charset. Lines are concatenated without
        // separators, exactly as before.
        conn.getInputStream().withReader("UTF-8") { reader ->
            reader.eachLine { line -> body.append(line) }
        }
    } finally {
        conn.disconnect()
    }

    // Define your MySQL user name and password here to connect.
    def sql = Sql.newInstance("jdbc:mysql://localhost:3306/lazada", "root", "password", "com.mysql.jdbc.Driver")
    try {
        def params = [body.toString(), reqUrl, 'ACTIVE']
        sql.execute 'INSERT INTO jsondata(json, url, status) VALUES (?, ?, ?)', params
    } finally {
        // Always release the connection, even when the insert fails.
        sql.close()
    }
}
// Uncomment the lines below to make it work!
// 248.times { // as of 22 June 2015 - www.lazada.co.id/beli-smartphone/ | |
// if (it > 0) | |
// getDataFromLazada("www.lazada.co.id/beli-smartphone/", it); | |
// } | |
// 596 pages as of 22 June 2015 - www.lazada.co.id/fashion-wanita/ | |
// 597.times { | |
// if (it > 0) | |
// getDataFromLazada("www.lazada.co.id/fashion-wanita/", it, "&itemperpage=120"); | |
// } | |
// 210 pages as of 22 June 2015 - http://www.lazada.co.id/fashion-pria/ | |
// 211.times { | |
// if (it > 0) | |
// getDataFromLazada("www.lazada.co.id/fashion-pria/", it, "&itemperpage=120"); | |
// } | |
// 33.times { | |
// if (it > 0) | |
// getDataFromLazada("www.lazada.co.id/fashion-anak-perempuan/", it, "&itemperpage=120"); | |
// } | |
// 19.times { | |
// if (it > 0) | |
// getDataFromLazada("www.lazada.co.id/fashion-anak-laki-laki/", it, "&itemperpage=120"); | |
// } | |
// 21.times { | |
// if (it > 0) | |
// getDataFromLazada("www.lazada.co.id/beli-kacamata-pria/", it, "&itemperpage=120"); | |
// } | |
// 23.times { | |
// if (it > 0) | |
// getDataFromLazada("www.lazada.co.id/beli-kacamata-wanita/", it, "&itemperpage=120"); | |
// } | |
// 17.times { | |
// if (it > 0) | |
// getDataFromLazada("www.lazada.co.id/koper/", it, "&itemperpage=120"); | |
// } | |
// 20.times { | |
// if (it > 0) | |
// getDataFromLazada("www.lazada.co.id/aksesoris-travel/", it, "&itemperpage=120"); | |
// } | |
// 124.times { | |
// if (it > 0) | |
// getDataFromLazada("www.lazada.co.id/tas-dan-tas-ransel/", it, "&itemperpage=120"); | |
// } | |
// 5.times { | |
// if (it > 0) | |
// getDataFromLazada("www.lazada.co.id/penawaran-khusus-tas-koper/", it, "&itemperpage=120"); | |
// } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment