Created
January 24, 2012 23:03
-
-
Save thbkrkr/1673324 to your computer and use it in GitHub Desktop.
LeBonCoin to csv Groovy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/groovy | |
import org.htmlcleaner.* | |
import static C.* | |
/**
 * Search configuration. The script pulls these static properties into scope
 * through `import static C.*`.
 */
class C {
    // Keywords sent as the `q` query parameter, one search per entry.
    // The empty string runs a search without any keyword.
    static trucks = [
        'vito',
        'transporter',
        ''
    ]

    // Site categories to crawl; each one becomes a path segment of the search URL.
    // ('voitures' is deliberately left out.)
    static cats = [
        'utilitaires',
        'caravaning'
    ]

    // Upper mileage bound, sent as the `me` query parameter.
    static kmMax = "200000"

    // An ad whose title contains any of these words is discarded.
    static words = [
        'change', 'Fiat', 'Renault', 'Citroen', 'Romeo', 'Porche', '4x4',
        'Bmw', 'Megane', 'break', 'Espace', 'Opel', 'Xsara'
    ]

    // Two-digit department numbers considered to be inside the wanted area.
    static depts = [
        "09", "11", "12", "15", "31",
        "32", "33", "34", "46", "47",
        "48", "65", "66", "81", "82"
    ]

    // Per category: [min, max] values sent as the `ps` / `pe` query parameters.
    static prices = [
        utilitaires: [5, 9],
        caravaning : [9, 13],
        voitures   : [10, 18]
    ]
}
// Script entry point: run one search for every (keyword, category) pair.
// Matching ads are printed to standard output as CSV lines by push().
trucks.each { keyword ->
    cats.each { category -> search(category, keyword) }
}

// Reaching this line means every search completed without an exception, so
// report success. (The previous exit status of 1 signalled failure to the
// calling shell even after a clean run.)
System.exit(0)
/**
 * Builds the search URL for one category / keyword pair and crawls every
 * result page starting from it.
 *
 * @param category site category; used as a URL path segment and as the key
 *                 into the `prices` table
 * @param type     search keyword, sent as the `q` query parameter
 * @return whatever parseEveryPage returns. NOTE(review): that function ends
 *         with an `if` statement, so in practice this appears to always be
 *         null rather than a list of trucks - confirm before relying on it.
 */
List<Truck> search(category, type) {
    // The values are whatever `prices` holds for the category (e.g. 5 and 9).
    // NOTE(review): the original notes read "4 000" / "8 000", so these are
    // presumably site-specific price-bracket codes, not euro amounts - confirm.
    String priceMin = prices.get(category)[0]
    String priceMax = prices.get(category)[1]

    // Encode the keyword so spaces or accented characters cannot corrupt the
    // query string; plain ASCII keywords pass through unchanged.
    String keyword = URLEncoder.encode(type.toString(), "UTF-8")

    // Declared locally: an undeclared assignment would leak into the script binding.
    def url = "http://www.leboncoin.fr/" + category + "/offres/midi_pyrenees/occasions/?f=a&th=1&ps=" + priceMin + "&pe=" + priceMax + "&q=" + keyword + "&me=" + kmMax
    parseEveryPage(category, type, url, 0)
}
/**
 * Parses one result page, hands every listed ad to parseTruck, then follows
 * the "suivante" (next) link recursively until there is none.
 *
 * @param category site category, passed through to parseTruck
 * @param type     search keyword, passed through to parseTruck
 * @param address  URL of the result page to fetch
 * @param i        number of pages visited so far; only carried along the
 *                 recursion, never read
 */
def parseEveryPage(category, type, address, i) {
    i++
    def page = getXml(address)

    // One table row per ad in the result list.
    // NOTE(review): the link expression was mangled to "[email protected]()" by
    // e-mail obfuscation in the copy this file was restored from. It has been
    // reconstructed as the href attribute of the row's anchor - verify the
    // exact element path against the real page markup.
    page.body.div.div.div.table.tbody.tr.td.table.tbody.tr.each { tr ->
        def adUrl = tr.td.a.@href.text()
        // Rows without a link would otherwise make toURL() fail on an empty address.
        if (adUrl) {
            parseTruck(category, type, adUrl)
        }
    }

    // Find the pagination span whose text contains "suivante"; if several
    // match, the last one wins.
    // NOTE(review): same reconstruction caveat as above for the href lookup.
    def nextPage = ""
    page.body.div.div.div.div.div.span.findAll { it.text() =~ '.*suivante.*' }.each { sp ->
        nextPage = sp.a.@href.text()
    }

    if (!nextPage.equals("")) parseEveryPage(category, type, nextPage, i)
}
/**
 * Fetches one ad page, extracts its title and its "label : value" attributes,
 * builds a Truck from them and passes it to push for filtering and printing.
 *
 * @param category site category the ad was found in
 * @param type     search keyword that produced the ad
 * @param address  URL of the ad page; the ad id is taken from it
 */
def parseTruck(category, type, address) {
    def page = getXml(address)
    def title = page.body.div.div.div.div.span.h1.text()

    // Collect the attribute spans into label -> value, with all whitespace
    // removed from the value.
    def map = [:]
    page.body.div.div.div.div.div.div.span.each { sp ->
        // Local declaration: an undeclared variable would leak into the script binding.
        def parts = sp.text().split(" : ")
        // Spans without a " : " separator or without a value carry no
        // attribute; indexing parts[1] on them would throw.
        if (parts.length > 1) {
            map.put(parts[0], parts[1].replaceAll("\\s+", ""))
        }
    }

    // The id is the part of the URL running from its first digit up to ".htm".
    def id = (address =~ /([0-9].*).htm/)[0][1]
    def t = new Truck(id, title, category, type, address, map)
    push(t)
}
/**
 * Prints the truck as a CSV line when it passes every filter: mileage within
 * the configured maximum, located in one of the wanted departments, and a
 * title free of excluded words.
 *
 * @param truck the parsed ad to filter and print
 */
def push(truck) {
    // Reject titles containing any excluded word. Each word is used as a
    // case-sensitive regular expression, as before.
    boolean titleValid = !words.any { w -> truck.title =~ /$w/ }

    // Take the limit from the shared configuration instead of repeating the
    // number here. An unknown mileage is stored as -1 and therefore passes.
    int maxKm = Integer.parseInt(kmMax.toString())

    if (truck.km <= maxKm && truck.isGeoZoneOk && titleValid) {
        println truck
    }
}
/**
 * Downloads the page at the given address, tidies its HTML with HtmlCleaner
 * and returns the result parsed by XmlSlurper for GPath navigation.
 *
 * @param address page URL as a string
 * @return the parsed document root
 */
def getXml(address) {
    def htmlCleaner = new HtmlCleaner()
    def root = htmlCleaner.clean(address.toURL())

    // Serialise the cleaned tree to XML text using the cleaner's own properties.
    def markup = new SimpleXmlSerializer(htmlCleaner.getProperties()).getXmlAsString(root)

    // Non-validating, namespace-unaware parse.
    return new XmlSlurper(false, false).parseText(markup)
}
/**
 * One classified ad, built from the "label -> value" attribute map scraped
 * from the ad page. toString() renders it as a single CSV line.
 *
 * Numeric fields that cannot be read from the map are set to -1 instead of
 * aborting construction, so one incomplete ad cannot stop the whole crawl.
 */
class Truck {
    String title
    int price            // -1 when the ad has no readable price
    int km               // -1 when the ad has no readable mileage
    String year
    String gas
    int cp               // first run of digits in the location text; -1 when absent
    String city          // "?" when no name can be extracted
    String type
    String category
    String id
    String url
    boolean isGeoZoneOk  // true when the location's first two digits are listed in `depts`
    Map<String, String> map
    //String photos

    /**
     * @param pid       ad identifier
     * @param ptitle    ad title; commas are replaced by dots so the title
     *                  cannot break the CSV output
     * @param pcategory site category the ad was found in
     * @param ptype     search keyword that produced the ad
     * @param paddress  URL of the ad page
     * @param pmap      attribute labels mapped to their values
     */
    public Truck(pid, ptitle, pcategory, ptype, paddress, pmap) {
        id = pid
        title = ptitle.replaceAll(",", ".")
        type = ptype
        category = pcategory
        url = paddress
        map = pmap

        year = map.get("Année-modèle")
        price = firstInt(map.get("Prix"), /[0-9]+/)
        gas = map.get("Carburant")
        km = firstInt(map.get("Kilométrage"), /[0-9]+\s*[0-9]*/)

        // Location text: the "Ville" entry, or "Code postal" when there is no "Ville".
        def location = map.get("Ville")
        if (location == null) {
            location = map.get("Code postal")
        }

        cp = firstInt(location, /[0-9]+/)

        // The department is the first pair of consecutive digits in the location.
        def deptMatch = location == null ? null : (location =~ /[0-9][0-9]/)
        def dept = (deptMatch != null && deptMatch.find()) ? deptMatch.group() : null

        // The city name is the first run of letters, whitespace, hyphens and apostrophes.
        def nameMatch = location == null ? null : (location =~ /[A-Za-z\s-éèàâ\']+/)
        city = (nameMatch != null && nameMatch.find()) ? nameMatch.group() : "?"

        isGeoZoneOk = dept != null && depts.contains(dept)
    }

    /** Returns the first match of pattern in text as an int, ignoring embedded whitespace; -1 when text is null or nothing matches. */
    private static int firstInt(String text, String pattern) {
        if (text == null) {
            return -1
        }
        def m = (text =~ pattern)
        return m.find() ? Integer.parseInt(m.group().replaceAll("\\s+", "")) : -1
    }

    /** CSV line: title, price, km, year, gas, cp, city, type, category, url. */
    String toString() {
        "$title,$price,$km,$year,$gas,$cp,$city,$type,$category,$url"
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment